diff --git a/docs/llvm-cov/html/coverage/home/runner/work/MathCAT/MathCAT/src/bin/mathml2text.rs.html b/docs/llvm-cov/html/coverage/home/runner/work/MathCAT/MathCAT/src/bin/mathml2text.rs.html index 88a1a196..c5f32ffd 100644 --- a/docs/llvm-cov/html/coverage/home/runner/work/MathCAT/MathCAT/src/bin/mathml2text.rs.html +++ b/docs/llvm-cov/html/coverage/home/runner/work/MathCAT/MathCAT/src/bin/mathml2text.rs.html @@ -1 +1 @@ -
Line | Count | Source |
1 | // *** MathCAT doesn't normally want to build a binary *** | |
2 | // *** This file is here because it is useful for trying out things *** | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use libmathcat::{errors::*, interface::*}; | |
6 | use log::*; | |
7 | use std::path::PathBuf; | |
8 | use clap::{Parser, ValueEnum}; | |
9 | ||
10 | // Maybe also have this speak to test the TTS generation. | |
11 | // There is a rust winapi crate that mirrors the WinPAI and has "Speak(...)" in it | |
12 | ||
13 | // env RUST_LOG=DEBUG cargo run --features "include-zip" | |
14 | 0 | fn get_rules_dir() -> String { |
15 | // for testing with zipped rules dir | |
16 | // let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../../MathCATForPython/addon/globalPlugins/MathCAT/Rules"); | |
17 | 0 | let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../Rules"); |
18 | 0 | return rules_path.as_os_str().to_str().unwrap().to_string(); |
19 | 0 | } |
20 | ||
21 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] | |
22 | enum OutputType { | |
23 | Text, | |
24 | Braille, | |
25 | #[cfg(feature="tts")] | |
26 | Speech, | |
27 | } | |
28 | ||
29 | #[derive(Parser)] | |
30 | #[command(version, about)] | |
31 | struct Options { | |
32 | #[arg(short, long)] | |
33 | rules_dir: Option<PathBuf>, | |
34 | ||
35 | input_file: Option<PathBuf>, | |
36 | ||
37 | #[arg(short, long, default_value="en")] | |
38 | language: String, | |
39 | ||
40 | #[arg(value_enum, long, default_value="text")] | |
41 | output: OutputType, | |
42 | } | |
43 | ||
44 | ||
45 | 0 | fn main() -> Result<()> { |
46 | 0 | env_logger::builder() |
47 | 0 | .format_timestamp(None) |
48 | 0 | .format_module_path(false) |
49 | 0 | .format_indent(Some(2)) |
50 | 0 | .format_level(false) |
51 | 0 | .init(); |
52 | ||
53 | 0 | let cli = Options::parse(); |
54 | ||
55 | 0 | let expr = if let Some(f) = cli.input_file { |
56 | 0 | std::fs::read_to_string(&f).with_context(|| format!("unable to open {}", f.to_str().unwrap_or_default()))? |
57 | } else { | |
58 | 0 | r#" |
59 | 0 | <math xmlns="http://www.w3.org/1998/Math/MathML"><mo>(</mo><mn>1</mn><mo>)</mo></math> |
60 | 0 | "#.to_string() |
61 | }; | |
62 | ||
63 | 0 | if let Err(e) = set_rules_dir(get_rules_dir()) { |
64 | 0 | panic!("Error: exiting -- {}", errors_to_string(&e)); |
65 | 0 | } |
66 | 0 | debug!("Languages: {}", libmathcat::interface::get_supported_languages()?.join(", ")); |
67 | ||
68 | #[cfg(feature = "include-zip")] | |
69 | info!("***********include-zip is present**********"); | |
70 | 0 | info!("Version = '{}' using Rules dir {}", get_version(), get_rules_dir()); |
71 | 0 | set_preference("Language", cli.language)?; |
72 | ||
73 | 0 | set_preference("DecimalSeparator", "Auto").unwrap(); |
74 | 0 | set_preference("BrailleCode", "Nemeth").unwrap(); |
75 | 0 | set_preference("TTS", "None").unwrap(); |
76 | 0 | set_preference("Verbosity", "Verbose").unwrap(); |
77 | 0 | set_preference("NavVerbosity", "Verbose").unwrap(); |
78 | 0 | set_preference("NavMode", "Enhanced").unwrap(); |
79 | 0 | set_preference("Impairment", "Blindness").unwrap(); |
80 | 0 | set_preference("SpeechOverrides_CapitalLetters", "").unwrap(); |
81 | 0 | set_preference("MathRate", "80").unwrap(); |
82 | 0 | set_preference("CapitalLetters_Beep", "true").unwrap(); |
83 | 0 | set_preference("IntentErrorRecovery", "Error").unwrap(); |
84 | ||
85 | 0 | set_preference("Bookmark", "false").unwrap(); |
86 | 0 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
87 | 0 | info!("Languages: {}", libmathcat::interface::get_supported_languages()?.join(", ")); |
88 | 0 | info!("Speech styles: {}", libmathcat::interface::get_supported_speech_styles("ClearSpeak")?.join(", ")); |
89 | 0 | info!("BrailleCodes: {}", libmathcat::interface::get_supported_braille_codes()?.join(", ")); |
90 | ||
91 | 0 | debug!("Speech language is {}", get_preference("Language").unwrap()); |
92 | 0 | debug!("DecimalSeparator: {:?}", get_preference("DecimalSeparator").unwrap()); |
93 | 0 | debug!("DecimalSeparators: {:?}, BlockSeparators: {:?}", get_preference("DecimalSeparators").unwrap(), get_preference("BlockSeparators").unwrap()); |
94 | 0 | debug!("SpeechStyle: {:?}", get_preference("SpeechStyle").unwrap()); |
95 | 0 | debug!("Verbosity: {:?}", get_preference("Verbosity").unwrap()); |
96 | ||
97 | 0 | match set_mathml(&expr) { |
98 | 0 | Err(e) => { |
99 | 0 | panic!("Error: exiting -- {}", errors_to_string(&e)); |
100 | }, | |
101 | 0 | Ok(fmt) => { |
102 | 0 | info!("formatted input mathml into {fmt}"); |
103 | } | |
104 | } | |
105 | ||
106 | 0 | match cli.output { |
107 | OutputType::Text => { | |
108 | 0 | match get_spoken_text() { |
109 | 0 | Ok(speech) => println!("{speech}"), |
110 | 0 | Err(e) => panic!("{}", errors_to_string(&e)), |
111 | } | |
112 | }, | |
113 | OutputType::Braille => { | |
114 | 0 | debug!("...using BrailleCode: {:?}", get_preference("BrailleCode").unwrap()); |
115 | 0 | match get_braille("") { |
116 | 0 | Ok(braille) => println!("{braille}"), |
117 | 0 | Err(e) => panic!("{}", errors_to_string(&e)), |
118 | } | |
119 | }, | |
120 | #[cfg(feature="tts")] | |
121 | OutputType::Speech => { | |
122 | // Create the NaturalTts struct using the builder pattern. | |
123 | let mut natural = natural_tts::NaturalTtsBuilder::default() | |
124 | .gtts_model(natural_tts::models::gtts::GttsModel::default()) | |
125 | .default_model(natural_tts::Model::Gtts) | |
126 | .build().expect("failed to generate natural tts gtts model"); | |
127 | ||
128 | ||
129 | // Start producing an output using the default_model. | |
130 | let _ = natural.start(get_spoken_text().unwrap(), &PathBuf::from("output.wav")); | |
131 | ||
132 | // Play the audio until it finishes | |
133 | natural.sleep_until_end(); | |
134 | } | |
135 | } | |
136 | ||
137 | 0 | Ok(()) |
138 | 0 | } |
Line | Count | Source |
1 | // *** MathCAT doesn't normally want to build a binary *** | |
2 | // *** This file is here because it is useful for trying out things *** | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use libmathcat::{errors::*, interface::*}; | |
6 | use log::*; | |
7 | use std::path::PathBuf; | |
8 | use clap::{Parser, ValueEnum}; | |
9 | ||
10 | // Maybe also have this speak to test the TTS generation. | |
11 | // There is a rust winapi crate that mirrors the WinPAI and has "Speak(...)" in it | |
12 | ||
13 | // env RUST_LOG=DEBUG cargo run --features "include-zip" | |
14 | 0 | fn get_rules_dir() -> String { |
15 | // for testing with zipped rules dir | |
16 | // let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../../MathCATForPython/addon/globalPlugins/MathCAT/Rules"); | |
17 | 0 | let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../Rules"); |
18 | 0 | return rules_path.as_os_str().to_str().unwrap().to_string(); |
19 | 0 | } |
20 | ||
21 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] | |
22 | enum OutputType { | |
23 | Text, | |
24 | Braille, | |
25 | #[cfg(feature="tts")] | |
26 | Speech, | |
27 | } | |
28 | ||
29 | #[derive(Parser)] | |
30 | #[command(version, about)] | |
31 | struct Options { | |
32 | #[arg(short, long)] | |
33 | rules_dir: Option<PathBuf>, | |
34 | ||
35 | input_file: Option<PathBuf>, | |
36 | ||
37 | #[arg(short, long, default_value="en")] | |
38 | language: String, | |
39 | ||
40 | #[arg(value_enum, long, default_value="text")] | |
41 | output: OutputType, | |
42 | } | |
43 | ||
44 | ||
45 | 0 | fn main() -> Result<()> { |
46 | 0 | env_logger::builder() |
47 | 0 | .format_timestamp(None) |
48 | 0 | .format_module_path(false) |
49 | 0 | .format_indent(Some(2)) |
50 | 0 | .format_level(false) |
51 | 0 | .init(); |
52 | ||
53 | 0 | let cli = Options::parse(); |
54 | ||
55 | 0 | let expr = if let Some(f) = cli.input_file { |
56 | 0 | std::fs::read_to_string(&f).with_context(|| format!("unable to open {}", f.to_str().unwrap_or_default()))? |
57 | } else { | |
58 | 0 | r#" |
59 | 0 | <math xmlns="http://www.w3.org/1998/Math/MathML"><mo>(</mo><mn>1</mn><mo>)</mo></math> |
60 | 0 | "#.to_string() |
61 | }; | |
62 | ||
63 | 0 | if let Err(e) = set_rules_dir(get_rules_dir()) { |
64 | 0 | panic!("Error: exiting -- {}", errors_to_string(&e)); |
65 | 0 | } |
66 | 0 | debug!("Languages: {}", libmathcat::interface::get_supported_languages()?.join(", ")); |
67 | ||
68 | #[cfg(feature = "include-zip")] | |
69 | info!("***********include-zip is present**********"); | |
70 | 0 | info!("Version = '{}' using Rules dir {}", get_version(), get_rules_dir()); |
71 | 0 | set_preference("Language", cli.language)?; |
72 | ||
73 | 0 | set_preference("DecimalSeparator", "Auto").unwrap(); |
74 | 0 | set_preference("BrailleCode", "Nemeth").unwrap(); |
75 | 0 | set_preference("TTS", "None").unwrap(); |
76 | 0 | set_preference("Verbosity", "Verbose").unwrap(); |
77 | 0 | set_preference("NavVerbosity", "Verbose").unwrap(); |
78 | 0 | set_preference("NavMode", "Enhanced").unwrap(); |
79 | 0 | set_preference("Impairment", "Blindness").unwrap(); |
80 | 0 | set_preference("SpeechOverrides_CapitalLetters", "").unwrap(); |
81 | 0 | set_preference("MathRate", "80").unwrap(); |
82 | 0 | set_preference("CapitalLetters_Beep", "true").unwrap(); |
83 | 0 | set_preference("IntentErrorRecovery", "Error").unwrap(); |
84 | ||
85 | 0 | set_preference("Bookmark", "false").unwrap(); |
86 | 0 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
87 | 0 | info!("Languages: {}", libmathcat::interface::get_supported_languages()?.join(", ")); |
88 | 0 | info!("Speech styles: {}", libmathcat::interface::get_supported_speech_styles("ClearSpeak")?.join(", ")); |
89 | 0 | info!("BrailleCodes: {}", libmathcat::interface::get_supported_braille_codes()?.join(", ")); |
90 | ||
91 | 0 | debug!("Speech language is {}", get_preference("Language").unwrap()); |
92 | 0 | debug!("DecimalSeparator: {:?}", get_preference("DecimalSeparator").unwrap()); |
93 | 0 | debug!("DecimalSeparators: {:?}, BlockSeparators: {:?}", get_preference("DecimalSeparators").unwrap(), get_preference("BlockSeparators").unwrap()); |
94 | 0 | debug!("SpeechStyle: {:?}", get_preference("SpeechStyle").unwrap()); |
95 | 0 | debug!("Verbosity: {:?}", get_preference("Verbosity").unwrap()); |
96 | ||
97 | 0 | match set_mathml(&expr) { |
98 | 0 | Err(e) => { |
99 | 0 | panic!("Error: exiting -- {}", errors_to_string(&e)); |
100 | }, | |
101 | 0 | Ok(fmt) => { |
102 | 0 | info!("formatted input mathml into {fmt}"); |
103 | } | |
104 | } | |
105 | ||
106 | 0 | match cli.output { |
107 | OutputType::Text => { | |
108 | 0 | match get_spoken_text() { |
109 | 0 | Ok(speech) => println!("{speech}"), |
110 | 0 | Err(e) => panic!("{}", errors_to_string(&e)), |
111 | } | |
112 | }, | |
113 | OutputType::Braille => { | |
114 | 0 | debug!("...using BrailleCode: {:?}", get_preference("BrailleCode").unwrap()); |
115 | 0 | match get_braille("") { |
116 | 0 | Ok(braille) => println!("{braille}"), |
117 | 0 | Err(e) => panic!("{}", errors_to_string(&e)), |
118 | } | |
119 | }, | |
120 | #[cfg(feature="tts")] | |
121 | OutputType::Speech => { | |
122 | // Create the NaturalTts struct using the builder pattern. | |
123 | let mut natural = natural_tts::NaturalTtsBuilder::default() | |
124 | .gtts_model(natural_tts::models::gtts::GttsModel::default()) | |
125 | .default_model(natural_tts::Model::Gtts) | |
126 | .build().expect("failed to generate natural tts gtts model"); | |
127 | ||
128 | ||
129 | // Start producing an output using the default_model. | |
130 | let _ = natural.start(get_spoken_text().unwrap(), &PathBuf::from("output.wav")); | |
131 | ||
132 | // Play the audio until it finishes | |
133 | natural.sleep_until_end(); | |
134 | } | |
135 | } | |
136 | ||
137 | 0 | Ok(()) |
138 | 0 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | use strum_macros::Display; | |
3 | use sxd_document::dom::{Element, ChildOfElement}; | |
4 | use sxd_document::Package; | |
5 | use crate::definitions::SPEECH_DEFINITIONS; | |
6 | use crate::errors::*; | |
7 | use crate::pretty_print::mml_to_string; | |
8 | use crate::prefs::PreferenceManager; | |
9 | use std::cell::Ref; | |
10 | use regex::{Captures, Regex, RegexSet}; | |
11 | use phf::{phf_map, phf_set}; | |
12 | use crate::speech::{BRAILLE_RULES, SpeechRulesWithContext, braille_replace_chars, make_quoted_string}; | |
13 | use crate::canonicalize::get_parent; | |
14 | use std::borrow::Cow; | |
15 | use std::ops::Range; | |
16 | use std::sync::LazyLock; | |
17 | use log::{debug, error}; | |
18 | ||
19 | 320 | fn is_ueb_prefix(ch: char) -> bool { |
20 | 320 |
|
21 | 320 | } |
22 | ||
23 | /// Returns the braille *char* at the given position in the braille string. | |
24 | 971 | fn braille_at(braille: &str, index: usize) -> char { |
25 | // braille is always 3 bytes per char | |
26 | 971 | return braille[index..index+3].chars().next().unwrap(); |
27 | ||
28 | 971 | } |
29 | ||
30 | /// braille the MathML | |
31 | /// If 'nav_node_id' is not an empty string, then the element with that id will have dots 7 & 8 turned on as per the pref | |
32 | /// Returns the braille string (highlighted) along with the *character* start/end of the highlight (whole string if no highlight) | |
33 | 1.82k | pub fn braille_mathml(mathml: Element, nav_node_id: &str) -> Result<(String, usize, usize)> { |
34 | 1.82k | return BRAILLE_RULES.with(|rules| { |
35 | 1.82k | rules.borrow_mut().read_files() |
36 | 1.82k | let rules = rules.borrow(); |
37 | 1.82k | let new_package = Package::new(); |
38 | 1.82k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id, 0); |
39 | 1.82k | let braille_string = rules_with_context.match_pattern::<String>(mathml) |
40 | 1.82k | .context("Pattern match/replacement failure!") |
41 | // debug!("braille_mathml: braille string: {}", &braille_string); | |
42 | 1.82k | let braille_string = braille_string.replace(' ', ""); |
43 | 1.82k | let pref_manager = rules_with_context.get_rules().pref_manager.borrow(); |
44 | 1.82k | let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight"); |
45 | 1.82k | let braille_code = pref_manager.pref_to_string("BrailleCode"); |
46 | 1.82k | let braille = match braille_code.as_str() { |
47 | 1.82k | "Nemeth" => |
48 | 941 | "UEB" => |
49 | 575 | "Vietnam" => |
50 | 463 | "CMU" => |
51 | 91 | "Finnish" => |
52 | 91 | "Swedish" => |
53 | 91 | "LaTeX" => |
54 | 41 | "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string), |
55 | 0 | "ASCIIMath-fi" => ASCIIMath_cleanup(pref_manager, braille_string), |
56 | 0 | _ => braille_string.trim_matches('⠀').to_string(), // probably needs cleanup if someone has another code, but this will have to get added by hand |
57 | }; | |
58 | ||
59 | return Ok( | |
60 | 1.82k | if highlight_style != "Off" { |
61 | 520 | highlight_braille_chars(braille, &braille_code, highlight_style == "All") |
62 | } else { | |
63 | 1.30k | let end = braille.len()/3; |
64 | 1.30k | (braille, 0, end) |
65 | } | |
66 | ); | |
67 | 1.82k | }); |
68 | ||
69 | /// highlight with dots 7 & 8 based on the highlight style | |
70 | /// both the start and stop points will be extended to deal with indicators such as capitalization | |
71 | /// if 'fill_range' is true, the interior will be highlighted | |
72 | /// Returns the braille string (highlighted) along with the [start, end) *character* of the highlight (whole string if no highlight) | |
73 | 520 | fn highlight_braille_chars(braille: String, braille_code: &str, fill_range: bool) -> (String, usize, usize) { |
74 | 520 | let mut braille = braille; |
75 | // some special (non-braille) chars weren't converted to having dots 7 & 8 to indicate navigation position | |
76 | // they need to be added to the start | |
77 | ||
78 | // find start and end (byte) indexes of the highlighted region (braille chars have length=3 bytes) | |
79 | 520 | let start = braille.find(is_highlighted); |
80 | 520 | let end = braille.rfind(is_highlighted); |
81 | 520 | if start.is_none() { |
82 | 57 | assert!(end.is_none()); |
83 | 57 | let end = braille.len(); |
84 | 57 | return (braille, 0, end/3); |
85 | 463 | }; |
86 | ||
87 | 463 | let start = start.unwrap(); |
88 | 463 | let mut end = end.unwrap() + 3; // always exists if start exists ('end' is exclusive) |
89 | // debug!("braille highlight: start/end={}/{}; braille={}", start/3, end/3, braille); | |
90 | 463 | let mut start = highlight_first_indicator(&mut braille, braille_code, start, end); |
91 | 463 | if let Some( |
92 | 45 | (start, end) = new_range |
93 | 418 | } |
94 | ||
95 | 463 | if start == end { |
96 | 0 | return (braille, start/3, end/3); |
97 | 463 | } |
98 | ||
99 | 463 | if !fill_range { |
100 | 459 | return (braille, start/3, end/3); |
101 | 4 | } |
102 | ||
103 | 4 | let mut result = String::with_capacity(braille.len()); |
104 | 4 | result.push_str(&braille[..start]); |
105 | 4 | let highlight_region =&mut braille[start..end]; |
106 | 8 | for ch in |
107 | 8 | result.push( highlight(ch) ); |
108 | 8 | }; |
109 | 4 | result.push_str(&braille[end..]); |
110 | 4 | return (result, start/3, end/3); |
111 | ||
112 | /// Return the byte index of the first place to highlight | |
113 | 463 | fn highlight_first_indicator(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> usize { |
114 | // chars in the braille block range use 3 bytes -- we can use that to optimize the code some | |
115 | 463 | let first_ch = unhighlight(braille_at(braille, start_index)); |
116 | ||
117 | // need to highlight (optional) capital/number, language, and style (max 2 chars) also in that (rev) order | |
118 | 463 | let mut prefix_ch_index = std::cmp::max(0, start_index as isize - 5*3) as usize; |
119 | 463 | if prefix_ch_index == 0 && |
120 | // don't count the word or passage mode as part of a indicator | |
121 | 46 | if braille.starts_with("⠰⠰⠰") { |
122 | 42 | prefix_ch_index = 9; |
123 | 42 | } else if |
124 | 0 | prefix_ch_index = 6; |
125 | 4 | } |
126 | 417 | } |
127 | 463 | let indicators = &braille[prefix_ch_index..start_index]; // chars to be examined |
128 | 463 | let i_byte_start = start_index - 3 * match braille_code { |
129 | 463 | "Nemeth" => |
130 | 334 | _ => i_start_ueb(indicators), // treat all the other like UEB because they probably have similar number and letter prefixes |
131 | }; | |
132 | 463 | if i_byte_start < start_index { |
133 | // remove old highlight as long as we don't wipe out the end highlight | |
134 | 59 | if start_index < end_index { |
135 | 59 | let old_first_char_bytes = start_index..start_index+3; |
136 | 59 | let replacement_str = unhighlight(braille_at(braille, start_index)).to_string(); |
137 | 59 | braille.replace_range(old_first_char_bytes, &replacement_str); |
138 | 59 |
|
139 | ||
140 | // add new highlight | |
141 | 59 | let new_first_char_bytes = i_byte_start..i_byte_start+3; |
142 | 59 | let replacement_str = highlight(braille_at(braille, i_byte_start)).to_string(); |
143 | 59 | braille.replace_range(new_first_char_bytes, &replacement_str); |
144 | 404 | } |
145 | ||
146 | 463 | return i_byte_start; |
147 | 463 | } |
148 | ||
149 | /// Return the byte indexes of the first and last place to highlight | |
150 | /// Currently, this only does something for CMU braille | |
151 | 463 | fn expand_highlight(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> Option<(usize, usize)> { |
152 | // For CMU, we want to expand mrows to include the opening and closing grouping indicators if they exist | |
153 | 463 | if start_index == 0 || |
154 | 358 | return None; |
155 | 105 | } |
156 | ||
157 | 105 | let first_ch = unhighlight(braille_at(braille, start_index)); |
158 | 105 | let last_ch = unhighlight(braille_at(braille, end_index-3)); |
159 | // We need to be careful not to expand the selection if we are already on a grouping indicator | |
160 | 105 | if first_ch == '⠢' && |
161 | 0 | return None; |
162 | 105 | } |
163 | 105 | let preceding_ch = braille_at(braille, start_index-3); |
164 | 105 | if preceding_ch != '⠢' { |
165 | 43 | return None; |
166 | 62 | } |
167 | ||
168 | 62 | let following_ch = braille_at(braille, end_index); |
169 | 62 | if following_ch != '⠔' { |
170 | 17 | return None; |
171 | 45 | } |
172 | ||
173 | 45 | let preceding_ch = highlight(preceding_ch); |
174 | 45 | braille.replace_range(start_index-3..start_index+3, format!("{preceding_ch}{first_ch}").as_str()); |
175 | 45 | let following_ch = highlight(following_ch); |
176 | 45 | braille.replace_range(end_index-3..end_index+3, format!("{last_ch}{following_ch}").as_str()); |
177 | 45 | return Some( (start_index-3, end_index + 3) ); |
178 | 463 | } |
179 | 520 | } |
180 | ||
181 | /// Given a position in a Nemeth string, what is the position character that starts it (e.g, the prev char for capital letter) | |
182 | 129 | fn i_start_nemeth(braille_prefix: &str, first_ch: char) -> usize { |
183 | 0 | fn is_nemeth_number(ch: char) -> bool { |
184 | 0 | matches!(ch, '⠂' | '⠆' | '⠒' | '⠲' | '⠢' | '⠖' | '⠶' | '⠦' | '⠔' | '⠴' | '⠨') |
185 | 0 | } |
186 | 129 | let mut n_chars = 0; |
187 | 129 | let prefix = &mut braille_prefix.chars().rev().peekable(); |
188 | 129 | if prefix.peek() == Some(&'⠠') || // cap indicator |
189 | 129 | (prefix.peek() == Some(&'⠼') && |
190 | 129 | [Some(&'⠸'), Some(&'⠈'), Some(&'⠨')].contains(&prefix.peek()) { // bold, script/blackboard, italic indicator |
191 | 1 | n_chars += 1; |
192 | 1 | prefix.next(); |
193 | 128 | } |
194 | ||
195 | 129 | if [Some(&'⠰'), Some(&'⠸'), Some(&'⠨')].contains(&prefix.peek()) { // English, German, Greek |
196 | 0 | n_chars += 1; |
197 | 129 | } else if prefix.peek() == Some(&'⠈') { |
198 | 0 | let ch = prefix.next(); // Russian/Greek Variant |
199 | 0 | if ch == Some('⠈') || ch == Some('⠨') { |
200 | 0 | n_chars += 2; |
201 | 0 | } |
202 | 129 | } else if prefix.peek() == Some(&'⠠') { // Hebrew |
203 | 0 | let ch = prefix.next(); // Russian/Greek Variant |
204 | 0 | if ch == Some('⠠') { |
205 | 0 | n_chars += 2; |
206 | 0 | } |
207 | 129 | }; |
208 | 129 | return n_chars; |
209 | 129 | } |
210 | ||
211 | /// Given a position in a UEB string, what is the position character that starts it (e.g, the prev char for capital letter) | |
212 | 334 | fn i_start_ueb(braille_prefix: &str) -> usize { |
213 | 334 | let prefix = &mut braille_prefix.chars().rev().peekable(); |
214 | 334 | let mut n_chars = 0; |
215 | 392 | while let Some( |
216 | 320 | if is_ueb_prefix(ch) { |
217 | 58 | n_chars += 1; |
218 | 262 | } else if ch == '⠆' { |
219 | 0 | let n_typeform_chars = check_for_typeform(prefix); |
220 | 0 | if n_typeform_chars > 0 { |
221 | 0 | n_chars += n_typeform_chars; |
222 | 0 | } else { |
223 | 0 | break; |
224 | } | |
225 | } else { | |
226 | 262 | break; |
227 | } | |
228 | } | |
229 | 334 | return n_chars; |
230 | 334 | } |
231 | ||
232 | ||
233 | 0 | fn check_for_typeform(prefix: &mut dyn std::iter::Iterator<Item=char>) -> usize { |
234 | 0 | fn is_ueb_typeform_prefix(ch: char) -> bool { |
235 | 0 | matches!(ch, '⠈' | '⠘' | '⠸' | '⠨') |
236 | 0 | } |
237 | ||
238 | 0 | if let Some(typeform_indicator) = prefix.next() { |
239 | 0 | if is_ueb_typeform_prefix(typeform_indicator) { |
240 | 0 | return 2; |
241 | 0 | } else if typeform_indicator == '⠼' && |
242 | 0 | let Some(user_defined_typeform_indicator) = prefix.next() && |
243 | 0 | (is_ueb_typeform_prefix(user_defined_typeform_indicator) || user_defined_typeform_indicator == '⠐') { |
244 | 0 | return 3; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | return 0; |
248 | 0 | } |
249 | 1.82k | } |
250 | ||
251 | // FIX: if 8-dot braille is needed, perhaps the highlights can be shifted to a "highlighted" 256 char block in private space | |
252 | // they would need to be unshifted for the external world | |
253 | 11.0k | fn is_highlighted(ch: char) -> bool { |
254 | 11.0k | let ch_as_u32 = ch as u32; |
255 | 11.0k | return (0x28C0..0x28FF).contains(&ch_as_u32) || |
256 | 11.0k | } |
257 | ||
258 | 159 | fn highlight(ch: char) -> char { |
259 | // safe because we have checked the range | |
260 | 159 | return unsafe{char::from_u32_unchecked(ch as u32 | 0xC0)}; // 0x28C0..0x28FF all have dots 7 & 8 on |
261 | 159 | } |
262 | ||
263 | 3.12k | fn unhighlight(ch: char) -> char { |
264 | 3.12k | let ch_as_u32 = ch as u32; |
265 | 3.12k | if (0x28C0..0x28FF).contains(&ch_as_u32) { // 0x28C0..0x28FF all have dots 7 & 8 on |
266 | 903 | return unsafe{char::from_u32_unchecked(ch_as_u32 & 0x283F)}; // safe because we have checked the range |
267 | } else { | |
268 | 2.22k | return ch; |
269 | } | |
270 | 3.12k | } |
271 | ||
272 | use std::cell::RefCell; | |
273 | thread_local!{ | |
274 | /// Count number of probes -- get a sense of how well algorithm is working (for debugging) | |
275 | static N_PROBES: RefCell<usize> = const { RefCell::new(0) }; | |
276 | } | |
277 | ||
278 | ||
279 | /// Given a 0-based braille position, return the id of the smallest MathML node enclosing it. | |
280 | /// This node might be a leaf with an offset. | |
281 | 91 | pub fn get_navigation_node_from_braille_position(mathml: Element, position: usize) -> Result<(String, usize)> { |
282 | // This works via a "smart" binary search (the trees aren't binary or balanced, we estimate the child to look in): | |
283 | // braille the mathml with a nav node and see where 'position' is in relation to the start/end of the nav node | |
284 | // Each call to find_navigation_node() returns a search state that tell us where to look next if not found | |
285 | #[derive(Debug, Display)] | |
286 | enum SearchStatus { | |
287 | LookInParent, // look up a level for exact match | |
288 | LookLeft, // went too far, backup | |
289 | LookRight, // continue searching right | |
290 | Found, | |
291 | } | |
292 | ||
293 | struct SearchState<'e> { | |
294 | status: SearchStatus, | |
295 | node: Element<'e>, | |
296 | highlight_start: usize, // if status is Found, then this is the offset within a leaf node | |
297 | highlight_end: usize, // if status is Found, this is ignored | |
298 | } | |
299 | ||
300 | // save the current highlight state, set the state to be the end points so we can find the braille, then restore the state | |
301 | // FIX: this can fail if there is 8-dot braille | |
302 | use crate::interface::{get_preference, set_preference}; | |
303 | 91 | let saved_highlight_style = get_preference("BrailleNavHighlight").unwrap(); |
304 | 91 | set_preference("BrailleNavHighlight", "EndPoints").unwrap(); |
305 | ||
306 | 91 | N_PROBES.with(|n| {*n.borrow_mut() = 0}); |
307 | // dive into the child of the <math> element (should only be one) | |
308 | 91 | let search_state = find_navigation_node(mathml, as_element(mathml.children()[0]), position) |
309 | 91 | set_preference("BrailleNavHighlight", saved_highlight_style.as_str()).unwrap(); |
310 | ||
311 | // we know the attr value exists because it was found internally | |
312 | // FIX: what should be done if we never did the search? | |
313 | 91 | match search_state.status { |
314 | SearchStatus::Found | SearchStatus::LookInParent => { | |
315 | 86 | return Ok( (search_state.node.attribute_value("id").unwrap().to_string(), search_state.highlight_start) ) |
316 | }, | |
317 | _ => { | |
318 | // weird state -- return the entire expr | |
319 | 5 | match mathml.attribute_value("id") { |
320 | 0 | None => bail!("'id' is not present on mathml: {}", mml_to_string(mathml)), |
321 | 5 | Some(id) => return Ok( (id.to_string(), 0) ), |
322 | } | |
323 | } | |
324 | } | |
325 | ||
326 | /// find the navigation node that most tightly encapsulates the target position (0-based) | |
327 | /// 'node' is the current node we are on inside of 'mathml' | |
328 | 465 | fn find_navigation_node<'e>(mathml: Element<'e>, node: Element<'e>, target_position: usize) -> Result<SearchState<'e>> { |
329 | 465 | let node_id = match node.attribute_value("id") { |
330 | 465 | Some(id) => id, |
331 | 0 | None => bail!("'id' is not present on mathml: {}", mml_to_string(node)), |
332 | }; | |
333 | 465 | N_PROBES.with(|n| {*n.borrow_mut() += 1}); |
334 | 465 | let (braille, char_start, char_end) = braille_mathml(mathml, node_id) |
335 | 465 | let mut status = None; |
336 | // debug!("find_navigation_node ({}, id={}): highlight=[{}, {}); target={}", name(node), node_id, char_start, char_end, target_position); | |
337 | 465 | if is_leaf(node) { |
338 | 100 | if char_start == 0 && |
339 | 6 | // nothing highlighted -- probably invisible char not represented in braille -- continue looking to the right |
340 | 6 | // debug!(" return due invisible char (?)' "); |
341 | 6 | status = Some(SearchStatus::LookRight); |
342 | 94 | } else if char_start <= target_position && |
343 | // FIX: need to handle multi-char leaves and set the offset (char_start) appropriately | |
344 | // debug!(" return due to target_position inside leaf: {} <= {} < {}", char_start, target_position, char_end); | |
345 | 58 | return Ok( SearchState { |
346 | 58 | status: SearchStatus::Found, |
347 | 58 | node, |
348 | 58 | highlight_start: target_position - char_start, |
349 | 58 | highlight_end: 0, |
350 | 58 | }); |
351 | 36 | } else if name(node) == "mo" { |
352 | // if there is whitespace before or after the operator, consider the operator to be a match | |
353 | 18 | if (char_start > 0 && target_position == char_start - 1 && |
354 | 2 | braille_at(&braille, 3*(char_start - 1)) == '⠀' && is_operator_that_adds_whitespace(node)) || |
355 | 16 | (3*char_end < braille.len() && target_position == char_end && |
356 | 11 | braille_at(&braille, 3*char_end) == '⠀' && |
357 | 4 | return Ok( SearchState { |
358 | 4 | status: SearchStatus::Found, |
359 | 4 | node, |
360 | 4 | highlight_start: 0, |
361 | 4 | highlight_end: 0, |
362 | 4 | } ); |
363 | 14 | } |
364 | 18 | } |
365 | 365 | } |
366 | 403 | if status.is_none() { |
367 | 397 | if target_position < char_start { |
368 | 23 | // debug!(" return due to target_position {} < start {}", target_position, char_start); |
369 | 23 | status = Some(SearchStatus::LookLeft); |
370 | 374 | } else if target_position >= char_end { |
371 | 49 | // debug!(" return due to target_position {} >= end {}", target_position, char_end); |
372 | 49 | status = Some(SearchStatus::LookRight); |
373 | 325 | } |
374 | 6 | } |
375 | 403 | if let Some( |
376 | 78 | return Ok( SearchState { |
377 | 78 | status, |
378 | 78 | node, |
379 | 78 | highlight_start: char_start, |
380 | 78 | highlight_end: char_end, |
381 | 78 | } ); |
382 | 325 | } |
383 | ||
384 | 325 | let children = node.children(); |
385 | 325 | let mut i_left_child = 0; // inclusive |
386 | 325 | let mut i_right_child = children.len(); // exclusive |
387 | 325 | let mut call_start = char_start; |
388 | 325 | let mut guess_fn: Box<dyn Fn(usize, usize, usize, usize) -> usize> = Box::new(|i_left, i_right, start, target: usize| guess_child_node_ltr(&children, i_left, i_right, start, target)); |
389 | 398 | while i_left_child < i_right_child { |
390 | 374 | let i_guess_child = guess_fn(i_left_child, i_right_child, call_start, target_position); |
391 | 374 | let status = find_navigation_node(mathml, as_element(children[i_guess_child]), target_position) |
392 | // debug!(" in {} loop: status: {}, child: left/guess/right {}/({},{})/{}; highlight=[{}, {})", | |
393 | // name(node), status.status, | |
394 | // i_left_child, i_guess_child, name(as_element(children[i_guess_child])),i_right_child, | |
395 | // status.highlight_start, status.highlight_end); | |
396 | 374 | match status.status { |
397 | SearchStatus::Found => { | |
398 | 301 | return Ok(status); |
399 | }, | |
400 | SearchStatus::LookInParent => { | |
401 | 0 | let (_, start, end) = braille_mathml(mathml, node_id)?; |
402 | // debug!(" parent ({}) braille: start/end={}/{}; target_position={}", name(node), start, end, target_position); | |
403 | 0 | if start <= target_position && target_position < end { |
404 | // debug!(" ..found: id={}", node_id); | |
405 | 0 | return Ok( SearchState{ |
406 | 0 | status: SearchStatus::Found, |
407 | 0 | node, |
408 | 0 | highlight_start: 0, |
409 | 0 | highlight_end: 0, |
410 | 0 | } ); // done or look up another level |
411 | 0 | } |
412 | 0 | return Ok(status); // look up a level |
413 | }, | |
414 | SearchStatus::LookLeft => { | |
415 | 20 | i_right_child = if i_guess_child == 0 { |
416 | 20 | call_start = status.highlight_start-1; |
417 | 20 | guess_fn = Box::new(|i_left, i_right, start, target| |
418 | }, | |
419 | SearchStatus::LookRight => { | |
420 | 53 | i_left_child = i_guess_child+1; |
421 | 53 | call_start = status.highlight_end+1; |
422 | 53 | guess_fn = Box::new(|i_left, i_right, start, target| |
423 | }, | |
424 | } | |
425 | } | |
426 | // debug!("Didn't child in node {}: left/right={}/{}; target_position={}", name(node), i_left_child, i_right_child, target_position); | |
427 | ||
428 | // if we get here, we didn't find it in the children | |
429 | // debug!("..end of loop: look in parent of {} has start/end={}/{}", name(node), char_start, char_end); | |
430 | return Ok( SearchState{ | |
431 | 24 | status: if char_start <= target_position && target_position <= char_end {SearchStatus::Found} else { |
432 | 24 | node, |
433 | highlight_start: 0, | |
434 | highlight_end: 0, | |
435 | } ); | |
436 | 465 | } |
437 | ||
438 | 4 | fn is_operator_that_adds_whitespace(node: Element) -> bool { |
439 | use crate::definitions::BRAILLE_DEFINITIONS; | |
440 | 4 | if PreferenceManager::get().borrow().pref_to_string("UseSpacesAroundAllOperators") == "true" { |
441 | 0 | return true; |
442 | 4 | } |
443 | ||
444 | 4 | return BRAILLE_DEFINITIONS.with(|definitions| { |
445 | 4 | let definitions = definitions.borrow(); |
446 | 4 | let comparison_operators = definitions.get_hashset("ComparisonOperators").unwrap(); |
447 | 4 | return comparison_operators.contains(as_text(node)); |
448 | 4 | }); |
449 | 4 | } |
450 | ||
451 | /// look in children[i_left..i_right] for a count that exceeds target | |
452 | 367 | fn guess_child_node_ltr(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize { |
453 | 367 | let mut estimated_position = start; |
454 | // number of chars to add for number indicators | |
455 | 367 | let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" { |
456 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
457 | 666 | for i in |
458 | 666 | estimated_position += estimate_braille_chars(children[i], n_number_indicator); |
459 | 666 | if estimated_position >= target { |
460 | 344 | return i; |
461 | 322 | } |
462 | } | |
463 | 23 | return i_right-1; // estimate was too large, return the last child as a guess |
464 | 367 | } |
465 | ||
466 | /// look in children[i_left..i_right].rev for a count that is less than target | |
467 | 7 | fn guess_child_node_rtl(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize { |
468 | 7 | let mut estimated_position = start; |
469 | 7 | let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" { |
470 | 7 | for i in (i_left..i_right).rev() { |
471 | 7 | estimated_position -= estimate_braille_chars(children[i], n_number_indicator); |
472 | 7 | if estimated_position <= target { |
473 | 7 | return i; |
474 | 0 | } |
475 | } | |
476 | 0 | return i_left; // estimate was too small, return the first child as a guess |
477 | 7 | } |
478 | ||
479 | 4.58k | fn estimate_braille_chars(child: ChildOfElement, n_number_indicator: usize) -> usize { |
480 | 4.58k | let node = as_element(child); |
481 | 4.58k | let leaf_name = name(node); |
482 | 4.58k | if is_leaf(node) { |
483 | 3.13k | let text = as_text(node); |
484 | // len() is close since mn's probably have ASCII digits and lower case vars are common (count as) and other chars need extra braille chars | |
485 | // don't want to count invisible chars since they don't display and would give a length = 3 | |
486 | 3.13k | if text == "\u{2061}" || text == "\u{2062}" { // invisible function apply/times (most common by far) |
487 | 597 | return 0; |
488 | 2.53k | } |
489 | // FIX: this assumption is bad for 8-dot braille | |
490 | 2.53k | return match leaf_name { |
491 | 2.53k | "mn" => |
492 | 1.90k | "mo" => |
493 | 1.16k | _ => text.len(), |
494 | } | |
495 | 1.45k | } |
496 | 1.45k | let mut estimate = if leaf_name == "mrow" { |
497 | 1.45k | if leaf_name == "msup" || |
498 | 260 | estimate -= 1; // opening superscript/subscript indicator not needed |
499 | 1.19k | } |
500 | 3.91k | for child in |
501 | 3.91k | estimate += estimate_braille_chars(child, n_number_indicator); |
502 | 3.91k | } |
503 | // debug!("estimate_braille_chars for {}: {}", crate::canonicalize::element_summary(as_element(child)), estimate); | |
504 | 1.45k | return estimate; |
505 | 4.58k | } |
506 | 91 | } |
507 | ||
508 | 888 | fn nemeth_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
509 | // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman | |
510 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
511 | // Indicators: C: capital, N: number, P: punctuation, M: multipurpose | |
512 | // Others: | |
513 | // W -- whitespace that should be kept (e.g, in a numeral) | |
514 | // 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
515 | // SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet) | |
516 | // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version | |
517 | static NEMETH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
518 | "S" => "⠠⠨", // sans-serif | |
519 | "B" => "⠸", // bold | |
520 | "𝔹" => "⠨", // blackboard | |
521 | "T" => "⠈", // script | |
522 | "I" => "⠨", // italic (mapped to be the same a blackboard) | |
523 | "R" => "", // roman | |
524 | "E" => "⠰", // English | |
525 | "D" => "⠸", // German (Deutsche) | |
526 | "G" => "⠨", // Greek | |
527 | "V" => "⠨⠈", // Greek Variants | |
528 | "H" => "⠠⠠", // Hebrew | |
529 | "U" => "⠈⠈", // Russian | |
530 | "C" => "⠠", // capital | |
531 | "P" => "⠸", // punctuation | |
532 | "𝐏" => "⠸", // hack for punctuation after a roman numeral -- never removed | |
533 | "L" => "", // letter | |
534 | "l" => "", // letter inside enclosed list | |
535 | "M" => "", // multipurpose indicator | |
536 | "m" => "⠐", // required multipurpose indicator | |
537 | "N" => "", // potential number indicator before digit | |
538 | "n" => "⠼", // required number indicator before digit | |
539 | "𝑁" => "", // hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
540 | "W" => "⠀", // whitespace | |
541 | "w" => "⠀", // whitespace from comparison operator | |
542 | "," => "⠠⠀", // comma | |
543 | "b" => "⠐", // baseline | |
544 | "𝑏" => "⣐", // highlight baseline (it's a hack) | |
545 | "↑" => "⠘", // superscript | |
546 | "↓" => "⠰", // subscript | |
547 | }; | |
548 | ||
549 | // Add an English Letter indicator. This involves finding "single letters". | |
550 | // The green book has a complicated set of cases, but the Nemeth UEB Rule book (May 2020), 4.10 has a much shorter explanation: | |
551 | // punctuation or whitespace on the left and right ignoring open/close chars | |
552 | // https://nfb.org/sites/www.nfb.org/files/files-pdf/braille-certification/lesson-4--provisional-5-9-20.pdf | |
553 | 2 | static ADD_ENGLISH_LETTER_INDICATOR: LazyLock<Regex> = LazyLock::new(|| { |
554 | 2 | Regex::new(r"(?P<start>^|W|P.[\u2800-\u28FF]?|,)(?P<open>[\u2800-\u28FF]?⠷)?(?P<letter>C?L.)(?P<close>[\u2800-\u28FF]?⠾)?(?P<end>W|P|,|$)").unwrap() |
555 | 2 | }); |
556 | ||
557 | // Trim braille spaces before and after braille indicators | |
558 | // In order: fraction, /, cancellation, letter, baseline | |
559 | // Note: fraction over is not listed due to example 42(4) which shows a space before the "/" | |
560 | static REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: LazyLock<Regex> = | |
561 | 2 | LazyLock::new(|| Regex::new(r"(⠄⠄⠄|⠤⠤⠤⠤)[Ww]+([⠼⠸⠪])").unwrap()); |
562 | static REMOVE_SPACE_AFTER_BRAILLE_INDICATORS: LazyLock<Regex> = | |
563 | 2 | LazyLock::new(|| Regex::new(r"([⠹⠻Llb])[Ww]+(⠄⠄⠄|⠤⠤⠤⠤)").unwrap()); |
564 | ||
565 | // Hack to convert non-numeric '.' to numeric '.' | |
566 | // The problem is that the numbers are hidden inside of mover -- this might be more general than rule 99_2. | |
567 | 2 | static DOTS_99_A_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"𝑁⠨mN").unwrap()); |
568 | ||
569 | // Punctuation is one or two chars. There are (currently) only 3 2-char punct chars (—‘’) -- we explicitly list them below | |
570 | static REMOVE_SPACE_BEFORE_PUNCTUATION_151: LazyLock<Regex> = | |
571 | 2 | LazyLock::new(|| Regex::new(r"w(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠾)").unwrap()); |
572 | static REMOVE_SPACE_AFTER_PUNCTUATION_151: LazyLock<Regex> = | |
573 | 2 | LazyLock::new(|| Regex::new(r"(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠷)w").unwrap()); |
574 | ||
575 | // Multipurpose indicator insertion | |
576 | // 149 -- consecutive comparison operators have no space -- instead a multipurpose indicator is used (doesn't require a regex) | |
577 | ||
578 | // 177.2 -- add after a letter and before a digit (or decimal pt) -- digits will start with N | |
579 | 2 | static MULTI_177_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([Ll].)[N𝑁]").unwrap()); |
580 | ||
581 | // keep between numeric subscript and digit ('M' added by subscript rule) | |
582 | 2 | static MULTI_177_3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([N𝑁].)M([N𝑁].)").unwrap()); |
583 | ||
584 | // Add after decimal pt for non-digits except for comma and punctuation | |
585 | // Note: since "." can be in the middle of a number, there is not necessarily a "N" | |
586 | // Although not mentioned in 177_5, don't add an 'M' before an 'm' | |
587 | static MULTI_177_5: LazyLock<Regex> = | |
588 | 2 | LazyLock::new(|| Regex::new(r"([N𝑁]⠨)([^⠂⠆⠒⠲⠢⠖⠶⠦⠔N𝑁,Pm])").unwrap()); |
589 | ||
590 | // Pattern for rule II.9a (add numeric indicator at start of line or after a space) | |
591 | // 1. start of line | |
592 | // 2. optional minus sign (⠤) | |
593 | // 3. optional typeface indicator | |
594 | // 4. number (N) | |
595 | 2 | static NUM_IND_9A: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<start>^|[,Ww])(?P<minus>⠤?)N").unwrap()); |
596 | ||
597 | // Needed after section mark(§), paragraph mark(¶), #, or * | |
598 | 2 | static NUM_IND_9C: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(⠤?)(⠠⠷|⠠⠳|⠠⠈⠷)N").unwrap()); |
599 | ||
600 | // Needed after section mark(§), paragraph mark(¶), #, or * | |
601 | 2 | static NUM_IND_9D: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(⠈⠠⠎|⠈⠠⠏|⠨⠼|⠈⠼)N").unwrap()); |
602 | ||
603 | // Needed after a typeface change or interior shape modifier indicator | |
604 | 2 | static NUM_IND_9E: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<face>[SB𝔹TIR]+?)N").unwrap()); |
605 | 2 | static NUM_IND_9E_SHAPE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<mod>⠸⠫)N").unwrap()); |
606 | ||
607 | // Needed after hyphen that follows a word, abbreviation, or punctuation (caution about rule 11d) | |
608 | // Note -- hyphen might encode as either "P⠤" or "⠤" depending on the tag used | |
609 | 2 | static NUM_IND_9F: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([Ll].[Ll].|P.)(P?⠤)N").unwrap()); |
610 | ||
611 | // Enclosed list exception | |
612 | // Normally we don't add numeric indicators in enclosed lists (done in get_braille_nemeth_chars). | |
613 | // The green book says "at the start" of an item, don't add the numeric indicator. | |
614 | // The NFB list exceptions after function abbreviations and angles, but what this really means is "after a space" | |
615 | 2 | static NUM_IND_ENCLOSED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"w([⠂⠆⠒⠲⠢⠖⠶⠦⠔⠴])").unwrap()); |
616 | ||
617 | // Punctuation chars (Rule 38.6 says don't use before ",", "hyphen", "-", "…") | |
618 | // Never use punctuation indicator before these (38-6) | |
619 | // "…": "⠀⠄⠄⠄" | |
620 | // "-": "⠸⠤" (hyphen and dash) | |
621 | // ",": "⠠⠀" -- spacing already added | |
622 | // Rule II.9b (add numeric indicator after punctuation [optional minus[optional .][digit] | |
623 | // because this is run after the above rule, some cases are already caught, so don't | |
624 | // match if there is already a numeric indicator | |
625 | 2 | static NUM_IND_9B: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<punct>P..?)(?P<minus>⠤?)N").unwrap()); |
626 | ||
627 | // Before 79b (punctuation) | |
628 | 2 | static REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?:[↑↓]+[b𝑏]?|[b𝑏])([Ww,P]|$)").unwrap()); |
629 | ||
630 | // Most commas have a space after them, but not when followed by a close quote (others?) | |
631 | 2 | static NO_SPACE_AFTER_COMMA: LazyLock<Regex> = LazyLock::new(|| Regex::new(r",P⠴").unwrap()); // captures both single and double close quote |
632 | 2 | static REMOVE_LEVEL_IND_BEFORE_BASELINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?:[↑↓mb𝑏]+)([b𝑏])").unwrap()); |
633 | ||
634 | // Except for the four chars above, the unicode rules always include a punctuation indicator. | |
635 | // The cases to remove them (that seem relevant to MathML) are: | |
636 | // Beginning of line or after a space (V 38.1) | |
637 | // After a word (38.4) | |
638 | // 2nd or subsequent punctuation (includes, "-", etc) (38.7) | |
639 | 2 | static REMOVE_AFTER_PUNCT_IND: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(^|[Ww]|[Ll].[Ll].)P(.)").unwrap()); |
640 | 2 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([SB𝔹TIREDGVHUP𝐏CLlMmb𝑏↑↓Nn𝑁Ww,])").unwrap()); |
641 | 2 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"⠀⠀+").unwrap()); |
642 | ||
643 | // debug!("Before: \"{}\"", raw_braille); | |
644 | // replacements might overlap at boundaries (e.g., whitespace) -- need to repeat | |
645 | 888 | let mut start = 0; |
646 | 888 | let mut result = String::with_capacity(raw_braille.len()+ raw_braille.len()/4); // likely upper bound |
647 | 923 | while let Some( |
648 | 35 | result.push_str(&raw_braille[start..matched.start()]); |
649 | 35 | let replacement = ADD_ENGLISH_LETTER_INDICATOR.replace( |
650 | 35 | &raw_braille[matched.start()..matched.end()], "${start}${open}E${letter}${close}"); |
651 | 35 | // debug!("matched='{}', start/end={}/{}; replacement: {}", &raw_braille[matched.start()..matched.end()], matched.start(), matched.end(), replacement); |
652 | 35 | result.push_str(&replacement); |
653 | 35 | // put $end back on because needed for next match (e.g., whitespace at end and then start of next match) |
654 | 35 | // but it could also match because it was at the end, in which case "-1" is wrong -- tested after loop for that |
655 | 35 | start = matched.end() - 1; |
656 | 35 | } |
657 | 888 | if !raw_braille.is_empty() && ( start < raw_braille.len()-1 || |
658 | 882 | result.push_str(&raw_braille[start..]); |
659 | 882 |
|
660 | // debug!("ELIs: \"{}\"", result); | |
661 | ||
662 | 888 | let result = NUM_IND_ENCLOSED_LIST.replace_all(&result, "wn${1}"); |
663 | ||
664 | // Remove blanks before and after braille indicators | |
665 | 888 | let result = REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS.replace_all(&result, "$1$2"); |
666 | 888 | let result = REMOVE_SPACE_AFTER_BRAILLE_INDICATORS.replace_all(&result, "$1$2"); |
667 | ||
668 | 888 | let result = REMOVE_SPACE_BEFORE_PUNCTUATION_151.replace_all(&result, "$1"); |
669 | 888 | let result = REMOVE_SPACE_AFTER_PUNCTUATION_151.replace_all(&result, "$1"); |
670 | // debug!("spaces: \"{}\"", result); | |
671 | ||
672 | 888 | let result = DOTS_99_A_2.replace_all(&result, "N⠨mN"); |
673 | ||
674 | // Multipurpose indicator | |
675 | 888 | let result = result.replace("ww", "m"); // 149 |
676 | 888 | let result = MULTI_177_2.replace_all(&result, "${1}m${2}"); |
677 | 888 | let result = MULTI_177_3.replace_all(&result, "${1}m$2"); |
678 | 888 | let result = MULTI_177_5.replace_all(&result, "${1}m$2"); |
679 | // debug!("MULTI: \"{}\"", result); | |
680 | ||
681 | 888 | let result = NUM_IND_9A.replace_all(&result, "${start}${minus}n"); |
682 | // debug!("IND_9A: \"{}\"", result); | |
683 | 888 | let result = NUM_IND_9C.replace_all(&result, "${1}${2}n"); |
684 | 888 | let result = NUM_IND_9D.replace_all(&result, "${1}n"); |
685 | 888 | let result = NUM_IND_9E.replace_all(&result, "${face}n"); |
686 | 888 | let result = NUM_IND_9E_SHAPE.replace_all(&result, "${mod}n"); |
687 | 888 | let result = NUM_IND_9F.replace_all(&result, "${1}${2}n"); |
688 | ||
689 | // debug!("IND_9F: \"{}\"", result); | |
690 | ||
691 | // 9b: insert after punctuation (optional minus sign) | |
692 | // common punctuation adds a space, so 9a handled it. Here we deal with other "punctuation" | |
693 | // FIX other punctuation and reference symbols (9d) | |
694 | 888 | let result = NUM_IND_9B.replace_all(&result, "$punct${minus}n"); |
695 | // debug!("A PUNCT: \"{}\"", &result); | |
696 | ||
697 | // strip level indicators | |
698 | // check first to remove level indicators before baseline, then potentially remove the baseline | |
699 | 888 | let mut result = REMOVE_LEVEL_IND_BEFORE_BASELINE.replace_all(&result, "$1"); |
700 | // debug!("Punct : \"{}\"", &result); | |
701 | // checks for punctuation char, so needs to before punctuation is stripped. | |
702 | // if '𝑏' is removed, then the highlight needs to be shifted to the left in some cases | |
703 | 888 | let result = remove_baseline_before_space_or_punctuation(&mut result); |
704 | // debug!("Removed: \"{}\"", &result); | |
705 | ||
706 | 888 | let result = NO_SPACE_AFTER_COMMA.replace_all(&result, "⠠P⠴"); |
707 | ||
708 | 888 | let result = REMOVE_AFTER_PUNCT_IND.replace_all(&result, "$1$2"); |
709 | // debug!("Punct38: \"{}\"", &result); | |
710 | ||
711 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
712 | 888 | let sans_serif = pref_manager.pref_to_string("Nemeth_SansSerif"); |
713 | 888 | let bold = pref_manager.pref_to_string("Nemeth_Bold"); |
714 | 888 | let double_struck = pref_manager.pref_to_string("Nemeth_DoubleStruck"); |
715 | 888 | let script = pref_manager.pref_to_string("Nemeth_Script"); |
716 | 888 | let italic = pref_manager.pref_to_string("Nemeth_Italic"); |
717 | ||
718 | 7.57k | let |
719 | 7.57k | let matched_char = &cap[0]; |
720 | 7.57k | match matched_char { |
721 | 7.57k | "S" => |
722 | 7.57k | "B" => |
723 | 7.52k | "𝔹" => |
724 | 7.49k | "T" => |
725 | 7.49k | "I" => |
726 | 7.48k | _ => match NEMETH_INDICATOR_REPLACEMENTS.get(&cap[0]) { |
727 | 0 | None => {error!("REPLACE_INDICATORS and NEMETH_INDICATOR_REPLACEMENTS are not in sync"); ""}, |
728 | 7.48k | Some(&ch) => ch, |
729 | } | |
730 | } | |
731 | 7.57k | }); |
732 | ||
733 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
734 | 888 | let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); |
735 | 888 | let result = COLLAPSE_SPACES.replace_all(result, "⠀"); |
736 | ||
737 | 888 | return result.to_string(); |
738 | ||
739 | 888 | fn remove_baseline_before_space_or_punctuation<'a>(braille: &'a mut Cow<'a, str>) -> Cow<'a, str> { |
740 | // If the baseline highlight is at the end of the string and it is going to be deleted by the regex, | |
741 | // then we need to shift the highlight to the left if what is to it's left is not whitespace (which should never be a highlight end) | |
742 | // This only happens when BrailleNavHighlight == "EndPoints". | |
743 | 888 | let highlight_style = PreferenceManager::get().borrow().pref_to_string("BrailleNavHighlight"); |
744 | 888 | if highlight_style == "EndPoints" && |
745 | 132 | let Some( |
746 | 129 | braille[last_highlighted..].starts_with('𝑏') { |
747 | 7 | let i_after_baseline = last_highlighted + '𝑏'.len_utf8(); |
748 | 7 | if i_after_baseline == braille.len() || |
749 | // shift the highlight to the left after doing just the replacement (if any) that the regex below does | |
750 | // the shift runs until a non blank braille char is found | |
751 | 2 | let mut bytes_deleted = 0; |
752 | 2 | let mut char_to_highlight = "".to_string(); // illegal value |
753 | 2 | for ch in braille[..last_highlighted].chars().rev() { |
754 | 2 | bytes_deleted += ch.len_utf8(); |
755 | 2 | if (0x2801..0x28FF).contains(&(ch as u32)) { |
756 | 2 | char_to_highlight = highlight(ch).to_string(); |
757 | 2 | break; |
758 | 0 | } |
759 | } | |
760 | 2 | braille.to_mut().replace_range(last_highlighted-bytes_deleted..last_highlighted+'𝑏'.len_utf8(), |
761 | 2 | &char_to_highlight); |
762 | 5 | } |
763 | 881 | } |
764 | 888 | return REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT.replace_all(braille, "$1"); |
765 | ||
766 | 888 | } |
767 | 888 | } |
768 | ||
769 | // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman | |
770 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
771 | // Indicators: C: capital, N: number, P: punctuation, M: multipurpose | |
772 | // Others: | |
773 | // W -- whitespace that should be kept (e.g, in a numeral) | |
774 | // 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
775 | // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version | |
776 | static UEB_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
777 | "S" => "XXX", // sans-serif -- from prefs | |
778 | "B" => "⠘", // bold | |
779 | "𝔹" => "XXX", // blackboard -- from prefs | |
780 | "T" => "⠈", // script | |
781 | "I" => "⠨", // italic | |
782 | "R" => "", // roman | |
783 | // "E" => "⠰", // English | |
784 | "1" => "⠰", // Grade 1 symbol | |
785 | "𝟙" => "⠰⠰", // Grade 1 word | |
786 | "L" => "", // Letter left in to assist in locating letters | |
787 | "D" => "XXX", // German (Deutsche) -- from prefs | |
788 | "G" => "⠨", // Greek | |
789 | "V" => "⠨⠈", // Greek Variants | |
790 | // "H" => "⠠⠠", // Hebrew | |
791 | // "U" => "⠈⠈", // Russian | |
792 | "C" => "⠠", // capital | |
793 | "𝐶" => "⠠", // capital that never should get word indicator (from chemical element) | |
794 | "N" => "⠼", // number indicator | |
795 | "t" => "⠱", // shape terminator | |
796 | "W" => "⠀", // whitespace | |
797 | "𝐖"=> "⠀", // whitespace (hard break -- basically, it separates exprs) | |
798 | "s" => "⠆", // typeface single char indicator | |
799 | "w" => "⠂", // typeface word indicator | |
800 | "e" => "⠄", // typeface & capital terminator | |
801 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
802 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
803 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
804 | "," => "⠂", // comma | |
805 | "." => "⠲", // period | |
806 | "-" => "-", // hyphen | |
807 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
808 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
809 | "#" => "", // signals end of script | |
810 | // '(', '{', '[', '"', '\'', '“', '‘', '«', // opening chars | |
811 | // ')', '}', ']', '\"', '\'', '”', '’', '»', // closing chars | |
812 | // ',', ';', ':', '.', '…', '!', '?' // punctuation | |
813 | ||
814 | }; | |
815 | ||
816 | // static LETTERS: phf::Set<char> = phf_set! { | |
817 | // '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', | |
818 | // '⠝', '⠕', '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', | |
819 | // }; | |
820 | ||
821 | 2.39k | fn is_letter_number(ch: char) -> bool { |
822 | 2.39k |
|
823 | 2.39k | } |
824 | ||
825 | static SHORT_FORMS: phf::Set<&str> = phf_set! { | |
826 | "L⠁L⠃", "L⠁L⠃L⠧", "L⠁L⠉", "L⠁L⠉L⠗", "L⠁L⠋", | |
827 | "L⠁L⠋L⠝", "L⠁L⠋L⠺", "L⠁L⠛", "L⠁L⠛L⠌", "L⠁L⠇", | |
828 | "L⠁L⠇L⠍", "L⠁L⠇L⠗", "L⠁L⠇L⠞", "L⠁L⠇L⠹", "L⠁L⠇L⠺", | |
829 | "L⠃L⠇", "L⠃L⠗L⠇", "L⠉L⠙", "L⠙L⠉L⠇", "L⠙L⠉L⠇L⠛", | |
830 | "L⠙L⠉L⠧", "L⠙L⠉L⠧L⠛", "L⠑L⠊", "L⠋L⠗", "L⠋L⠌", "L⠛L⠙", | |
831 | "L⠛L⠗L⠞", "L⠓L⠍", "L⠓L⠍L⠋", "L⠓L⠻L⠋", "L⠊L⠍L⠍", "L⠇L⠇", "L⠇L⠗", | |
832 | "L⠍L⠽L⠋", "L⠍L⠡", "L⠍L⠌", "L⠝L⠑L⠉", "L⠝L⠑L⠊", "L⠏L⠙", | |
833 | "L⠏L⠻L⠉L⠧", "L⠏L⠻L⠉L⠧L⠛", "L⠏L⠻L⠓", "L⠟L⠅", "L⠗L⠉L⠧", | |
834 | "L⠗L⠉L⠧L⠛", "L⠗L⠚L⠉", "L⠗L⠚L⠉L⠛", "L⠎L⠙", "L⠎L⠡", "L⠞L⠙", | |
835 | "L⠞L⠛L⠗", "L⠞L⠍", "L⠞L⠝", "L⠭L⠋", "L⠭L⠎", "L⠽L⠗", "L⠽L⠗L⠋", | |
836 | "L⠽L⠗L⠧L⠎", "L⠮L⠍L⠧L⠎", "L⠡L⠝", "L⠩L⠙", "L⠹L⠽L⠋", "L⠳L⠗L⠧L⠎", | |
837 | "L⠺L⠙", "L⠆L⠉", "L⠆L⠋", "L⠆L⠓", "L⠆L⠇", "L⠆L⠝", "L⠆L⠎", "L⠆L⠞", | |
838 | "L⠆L⠽", "L⠒L⠉L⠧", "L⠒L⠉L⠧L⠛", "L⠐L⠕L⠋" | |
839 | }; | |
840 | ||
841 | 1.75k | fn is_letter_prefix(ch: char) -> bool { |
842 | 1.75k |
|
843 | 1.75k | } |
844 | ||
845 | // Trim braille spaces before and after braille indicators | |
846 | // In order: fraction, /, cancellation, letter, baseline | |
847 | // Note: fraction over is not listed due to example 42(4) which shows a space before the "/" | |
848 | // static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = | |
849 | // Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap(); | |
850 | 2 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb])").unwrap()); |
851 | 2 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"⠀⠀+").unwrap()); |
852 | ||
853 | 35 | fn is_short_form(chars: &[char]) -> bool { |
854 | 204 | let |
855 | 35 | return SHORT_FORMS.contains(&chars_as_string); |
856 | 35 | } |
857 | ||
858 | 366 | fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
859 | // debug!("ueb_cleanup: start={}", raw_braille); | |
860 | 366 | let result = typeface_to_word_mode(&raw_braille); |
861 | 366 | let result = capitals_to_word_mode(&result); |
862 | ||
863 | 366 | let use_only_grade1 = pref_manager.pref_to_string("UEB_START_MODE").as_str() == "Grade1"; |
864 | ||
865 | // '𝐖' is a hard break -- basically, it separates exprs | |
866 | 366 | let mut result = result.split('𝐖') |
867 | 370 | . |
868 | 366 | .collect::<String>(); |
869 | 366 | result.pop(); // we added a 'W' at the end that needs to be removed. |
870 | ||
871 | 366 | let result = result.replace("tW", "W"); |
872 | ||
873 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
874 | 366 | let double_struck = pref_manager.pref_to_string("UEB_DoubleStruck"); |
875 | 366 | let sans_serif = pref_manager.pref_to_string("UEB_SansSerif"); |
876 | 366 | let fraktur = pref_manager.pref_to_string("UEB_Fraktur"); |
877 | 366 | let greek_variant = pref_manager.pref_to_string("UEB_GreekVariant"); |
878 | ||
879 | 3.77k | let |
880 | 3.77k | let matched_char = &cap[0]; |
881 | 3.77k | match matched_char { |
882 | 3.77k | "𝔹" => |
883 | 3.77k | "S" => |
884 | 3.77k | "D" => |
885 | 3.77k | "V" => |
886 | 3.77k | _ => match UEB_INDICATOR_REPLACEMENTS.get(matched_char) { |
887 | 0 | None => {error!("REPLACE_INDICATORS and UEB_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
888 | 3.77k | Some(&ch) => ch, |
889 | }, | |
890 | } | |
891 | 3.77k | }); |
892 | ||
893 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
894 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
895 | 366 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
896 | ||
897 | 366 | return result.to_string(); |
898 | ||
899 | 370 | fn pick_start_mode(raw_braille: &str, use_only_grade1: bool) -> String { |
900 | // Need to decide what the start mode should be | |
901 | // From http://www.brailleauthority.org/ueb/ueb_math_guidance/final_for_posting_ueb_math_guidance_may_2019_102419.pdf | |
902 | // Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells | |
903 | // or before a single letter standing alone anywhere in the expression, | |
904 | // begin the expression with a grade 1 word indicator (or a passage indicator if the expression includes spaces) | |
905 | // Apparently "only a grade 1 symbol..." means at most one grade 1 symbol based on some examples (GTM 6.4, example 4) | |
906 | // debug!("before determining mode: '{}'", raw_braille); | |
907 | ||
908 | // a bit ugly because we need to store the string if we have cap passage mode | |
909 | 370 | let raw_braille_string = if is_cap_passage_mode_good(raw_braille) { |
910 | 370 | let raw_braille = if raw_braille_string.is_empty() { |
911 | 370 | if use_only_grade1 { |
912 | 1 | return remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage); |
913 | 369 | } |
914 | 369 | let grade2 = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade2, UEB_Duration::Symbol); |
915 | 369 | debug!("Symbol mode: '{}'", grade2); |
916 | ||
917 | 369 | if is_grade2_string_ok(&grade2) { |
918 | 143 | return grade2; |
919 | } else { | |
920 | // BANA says use g1 word mode if spaces are present, but that's not what their examples do | |
921 | // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks) | |
922 | // The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7) | |
923 | 226 | let grade1_word = try_grade1_word_mode(raw_braille); |
924 | 226 | debug!("Word mode: '{}'", grade1_word); |
925 | 226 | if !grade1_word.is_empty() { |
926 | 36 | return grade1_word; |
927 | } else { | |
928 | 190 | let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage); |
929 | 190 | return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄"; |
930 | } | |
931 | } | |
932 | ||
933 | /// Return true if at least five (= # of cap passage indicators) cap indicators and no lower case letters | |
934 | 370 | fn is_cap_passage_mode_good(braille: &str) -> bool { |
935 | 370 | let mut n_caps = 0; |
936 | 370 | let mut is_cap_mode = false; |
937 | 370 | let mut cap_mode = UEB_Duration::Symbol; // real value set when is_cap_mode is set to true |
938 | 370 | let mut chars = braille.chars(); |
939 | ||
940 | // look CL or CCL for caps (CC runs until we get whitespace) | |
941 | // if we find an L not in caps mode, we return false | |
942 | // Note: caps can be C𝐶, whitespace can be W𝐖 | |
943 | 2.03k | while let Some( |
944 | 1.96k | if ch == 'L' { |
945 | 401 | if !is_cap_mode { |
946 | 288 | return false; |
947 | 113 | } |
948 | 113 | chars.next(); // skip letter |
949 | 113 | if cap_mode == UEB_Duration::Symbol { |
950 | 79 | is_cap_mode = false; |
951 | 79 |
|
952 | 1.55k | } else if ch == 'C' || |
953 | 107 | if is_cap_mode { |
954 | 16 | if cap_mode == UEB_Duration::Symbol { |
955 | 12 | cap_mode = UEB_Duration::Word; |
956 | 12 |
|
957 | 91 | } else { |
958 | 91 | is_cap_mode = true; |
959 | 91 | cap_mode = UEB_Duration::Symbol; |
960 | 91 | } |
961 | 107 | n_caps += 1; |
962 | 1.45k | } else if ch == 'W' || |
963 | 119 | if is_cap_mode { |
964 | 2 | assert!(cap_mode == UEB_Duration::Word); |
965 | 117 | } |
966 | 119 | is_cap_mode = false; |
967 | 1.33k | } else if ch == '1' && |
968 | 3 | break; |
969 | 1.33k | } |
970 | } | |
971 | 82 | return n_caps > 4; |
972 | 370 | } |
973 | ||
974 | 3 | fn convert_to_cap_passage_mode(braille: &str) -> String { |
975 | 3 | return "⠠⠠⠠".to_string() + &braille.replace(['C', '𝐶'], "") + "⠠⠄"; |
976 | 3 | } |
977 | ||
978 | /// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2 | |
979 | 369 | fn is_grade2_string_ok(grade2_braille: &str) -> bool { |
980 | // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone. | |
981 | // The exact quote from their guidance: | |
982 | // Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells | |
983 | // or before a single letter standing alone anywhere in the expression, | |
984 | // begin the expression with a grade 1 word indicator | |
985 | // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems | |
986 | // like it is a reasonable thing to do. | |
987 | // Another modification is allow a single G1 indicator to occur after whitespace later on | |
988 | // because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator | |
989 | ||
990 | // Because of the 'L's which go away, we have to put a little more work into finding the first three chars | |
991 | 369 | let chars = grade2_braille.chars().collect::<Vec<char>>(); |
992 | 369 | let mut n_real_chars = 0; // actually number of chars |
993 | 369 | let mut found_g1 = false; |
994 | 369 | let mut i = 0; |
995 | 1.75k | while i < chars.len() { |
996 | 1.75k | let ch = chars[i]; |
997 | 1.75k | if ch == '1' && |
998 | 269 | if found_g1 { |
999 | 19 | return false; |
1000 | 250 | } |
1001 | 250 | found_g1 = true; |
1002 | 1.48k | } else if !"𝐶CLobc".contains(ch) { |
1003 | 1.07k | if n_real_chars == 2 { |
1004 | 347 | i += 1; |
1005 | 347 | break; // this is the third real char |
1006 | 730 | }; |
1007 | 730 | n_real_chars += 1; |
1008 | 407 | } |
1009 | 1.38k | i += 1 |
1010 | } | |
1011 | ||
1012 | // if we find *another* g1 that isn't forced and isn't standing alone, we are done | |
1013 | // I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule | |
1014 | // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it | |
1015 | // Same for GTM 1_7_3_1 (passage mode is mentioned also) | |
1016 | 350 | let mut is_standing_alone_already_encountered = false; |
1017 | 350 | let mut is_after_whitespace = false; |
1018 | 2.43k | while i < chars.len() { |
1019 | 2.29k | let ch = chars[i]; |
1020 | 2.29k | if ch == 'W' { |
1021 | 355 | is_after_whitespace = true; |
1022 | 1.93k | } else if ch == '1' && |
1023 | 235 | if is_standing_alone_already_encountered || |
1024 | 226 | ((found_g1 || |
1025 | 207 | return false; |
1026 | 28 | } |
1027 | 28 | found_g1 = true; |
1028 | 28 | is_standing_alone_already_encountered = true; |
1029 | 1.70k | } |
1030 | 2.08k | i += 1; |
1031 | } | |
1032 | 143 | return true; |
1033 | 369 | } |
1034 | ||
1035 | /// Return true if the sequence of chars forces a '1' at the `i`th position | |
1036 | /// Note: `chars[i]` should be '1' | |
1037 | 930 | fn is_forced_grade1(chars: &[char], i: usize) -> bool { |
1038 | // A '1' is forced if 'a-j' follows a digit | |
1039 | 930 | assert_eq!(chars[i], '1', "'is_forced_grade1' didn't start with '1'"); |
1040 | // check that a-j follows the '1' -- we have '1Lx' where 'x' is the letter to check | |
1041 | 930 | if i+2 < chars.len() && |
1042 | // check for a number before the '1' | |
1043 | // this will be 'N' followed by LETTER_NUMBERS or the number ".", ",", or " " | |
1044 | 25 | for j in ( |
1045 | 25 | let ch = chars[j]; |
1046 | 25 | if !(is_letter_number(unhighlight(ch)) || |
1047 | 12 | return ch == 'N' |
1048 | 13 | } |
1049 | } | |
1050 | 918 | } |
1051 | 918 | return false; |
1052 | 930 | } |
1053 | ||
1054 | 203 | fn is_single_letter_on_right(chars: &[char], i: usize) -> bool { |
1055 | 205 | fn is_skip_char(ch: char) -> bool { |
1056 | 205 |
|
1057 | 205 | } |
1058 | ||
1059 | // find the first char (if any) | |
1060 | 203 | let mut count = 0; // how many letters |
1061 | 203 | let mut i = i+1; |
1062 | 209 | while i < chars.len() { |
1063 | 205 | let ch = chars[i]; |
1064 | 205 | if !is_skip_char(ch) { |
1065 | 204 | if ch == 'L' { |
1066 | 5 | if count == 1 { |
1067 | 0 | return false; // found a second letter in the sequence |
1068 | 5 | } |
1069 | 5 | count += 1; |
1070 | } else { | |
1071 | 199 | return count==1; |
1072 | } | |
1073 | 5 | i += 2; // eat 'L' and actual letter |
1074 | 1 | } else { |
1075 | 1 | i += 1; |
1076 | 1 | } |
1077 | } | |
1078 | 4 | return true; |
1079 | 203 | } |
1080 | ||
1081 | 226 | fn try_grade1_word_mode(raw_braille: &str) -> String { |
1082 | // this isn't quite right, but pretty close -- try splitting at 'W' (words) | |
1083 | // only one of the parts can be in word mode and none of the others can have '1' unless forced | |
1084 | 226 | let mut g1_words = Vec::default(); |
1085 | 226 | let mut found_word_mode = false; |
1086 | 622 | for raw_word in |
1087 | 622 | let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol); |
1088 | // debug!("try_grade1_word_mode: word='{}'", word); | |
1089 | 622 | let word_chars = word.chars().collect::<Vec<char>>(); |
1090 | 622 | let needs_word_mode = word_chars.iter().enumerate() |
1091 | 1.12k | . |
1092 | 622 | if needs_word_mode { |
1093 | 416 | if found_word_mode { |
1094 | 190 | return "".to_string(); |
1095 | 226 | } |
1096 | 226 | found_word_mode = true; |
1097 | 226 | g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word) |
1098 | ); | |
1099 | 206 | } else { |
1100 | 206 | g1_words.push(word); |
1101 | 206 | } |
1102 | } | |
1103 | 36 | return if found_word_mode {g1_words.join("W")} else { |
1104 | 226 | } |
1105 | 370 | } |
1106 | 366 | } |
1107 | ||
1108 | 478 | fn typeface_to_word_mode(braille: &str) -> String { |
1109 | 2 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new("[BI𝔹STD]").unwrap()); |
1110 | // debug!("before typeface fix: '{}'", braille); | |
1111 | ||
1112 | 478 | let mut result = "".to_string(); |
1113 | 478 | let chars = braille.chars().collect::<Vec<char>>(); |
1114 | 478 | let mut word_mode = Vec::with_capacity(5); |
1115 | 478 | let mut word_mode_end = Vec::with_capacity(5); |
1116 | 478 | let mut i = 0; |
1117 | 11.5k | while i < chars.len() { |
1118 | 11.0k | let ch = chars[i]; |
1119 | 11.0k | if HAS_TYPEFACE.is_match(ch.to_string().as_str()) { |
1120 | 8 | let i_next_char_target = find_next_char(&chars[i+1..], ch); |
1121 | 8 | if word_mode.contains(&ch) { |
1122 | 3 | if i_next_char_target.is_none() { |
1123 | 2 | word_mode.retain(|&item| item!=ch); // drop the char since word mode is done |
1124 | 2 | word_mode_end.push(ch); // add the char to signal to add end sequence |
1125 | 1 | } |
1126 | } else { | |
1127 | 5 | result.push(ch); |
1128 | 5 | if i_next_char_target.is_some() { |
1129 | 2 | result.push('w'); // typeface word indicator |
1130 | 2 | word_mode.push(ch); // starting word mode for this char |
1131 | 3 | } else { |
1132 | 3 | result.push('s'); // typeface single char indicator |
1133 | 3 | } |
1134 | } | |
1135 | 8 | i += 1; // eat "B", etc |
1136 | 11.0k | } else if ch == 'L' || |
1137 | 3.70k | result.push(chars[i]); |
1138 | 3.70k | result.push(chars[i+1]); |
1139 | 3.70k | if !word_mode_end.is_empty() && |
1140 | // add terminator unless word sequence is terminated by end of string or whitespace | |
1141 | 1 | for &ch in &word_mode_end { |
1142 | 1 | result.push(ch); |
1143 | 1 | result.push('e'); |
1144 | 1 | }; |
1145 | 1 | word_mode_end.clear(); |
1146 | 3.70k | } |
1147 | 3.70k | i += 2; // eat Ll/Nd |
1148 | 7.30k | } else { |
1149 | 7.30k | result.push(ch); |
1150 | 7.30k | i += 1; |
1151 | 7.30k | } |
1152 | } | |
1153 | 478 | return result; |
1154 | ||
1155 | 478 | } |
1156 | ||
1157 | 478 | fn capitals_to_word_mode(braille: &str) -> String { |
1158 | use std::iter::FromIterator; | |
1159 | // debug!("before capitals fix: '{}'", braille); | |
1160 | ||
1161 | 478 | let mut result = "".to_string(); |
1162 | 478 | let chars = braille.chars().collect::<Vec<char>>(); |
1163 | 478 | let mut is_word_mode = false; |
1164 | 478 | let mut i = 0; |
1165 | // look for a sequence of CLxCLy... and create CCLxLy... | |
1166 | 12.6k | while i < chars.len() { |
1167 | 12.1k | let ch = chars[i]; |
1168 | 12.1k | if ch == 'C' { |
1169 | // '𝑐' should only occur after a 'C', so we don't have top-level check for it | |
1170 | 256 | let mut next_non_cap = i+1; |
1171 | 257 | while let Some( |
1172 | 1 | next_non_cap += i_next + 1; // C/𝑐, L, letter |
1173 | 1 | } |
1174 | 256 | if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..." |
1175 | 63 | if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) { |
1176 | // to me this is tricky -- section 12 modifiers apply to the previous item | |
1177 | // the last clause of the "item" def is the previous indivisible symbol" which ICEB 2.1 say is: | |
1178 | // braille sign: one or more consecutive braille characters comprising a unit, | |
1179 | // consisting of a root on its own or a root preceded by one or more | |
1180 | // prefixes (also referred to as braille symbol) | |
1181 | // this means the capital indicator needs to be stated and can't be part of a word or passage | |
1182 | 1 | is_word_mode = false; |
1183 | 1 | result.push_str(String::from_iter(&chars[i..next_non_cap]).as_str()); |
1184 | 1 | i = next_non_cap; |
1185 | 1 | continue; |
1186 | 62 | } |
1187 | 62 | if is_word_mode { |
1188 | 12 | i += 1; // skip the 'C' |
1189 | 50 | } else { |
1190 | 50 | // start word mode -- need an extra 'C' |
1191 | 50 | result.push('C'); |
1192 | 50 | is_word_mode = true; |
1193 | 50 | } |
1194 | 193 | } else if is_word_mode { |
1195 | 50 | i += 1; // skip the 'C' |
1196 | 143 | } |
1197 | 255 | if chars[next_non_cap] == 'G' { |
1198 | 8 | // Greek letters are a bit exceptional in that the pattern is "CGLx" -- bump 'i' |
1199 | 8 | next_non_cap += 1; |
1200 | 247 | } |
1201 | 255 | if chars[next_non_cap] != 'L' { |
1202 | 0 | error!("capitals_to_word_mode: internal error: didn't find L after C in '{}'.", |
1203 | 0 | chars[i..next_non_cap+2].iter().collect::<String>().as_str()); |
1204 | 255 | } |
1205 | 255 | let i_braille_char = next_non_cap + 2; |
1206 | 255 | result.push_str(String::from_iter(&chars[i..i_braille_char]).as_str()); |
1207 | 255 | i = i_braille_char; |
1208 | 11.9k | } else if ch == 'L' { // must be lowercase -- uppercase consumed above |
1209 | // assert!(LETTERS.contains(&unhighlight(chars[i+1]))); not true for other alphabets | |
1210 | 2.03k | if is_word_mode { |
1211 | 2 | result.push('e'); // terminate Word mode (letter after caps) |
1212 | 2 | is_word_mode = false; |
1213 | 2.03k | } |
1214 | 2.03k | result.push('L'); |
1215 | 2.03k | result.push(chars[i+1]); |
1216 | 2.03k | i += 2; // eat L, letter |
1217 | 9.88k | } else { |
1218 | 9.88k | is_word_mode = false; // non-letters terminate cap word mode |
1219 | 9.88k | result.push(ch); |
1220 | 9.88k | i += 1; |
1221 | 9.88k | } |
1222 | } | |
1223 | 478 | return result; |
1224 | ||
1225 | 63 | fn is_next_char_start_of_section_12_modifier(chars: &[char]) -> bool { |
1226 | // first find the L and eat the char so that we are at the potential start of where the target lies | |
1227 | 63 | let chars_len = chars.len(); |
1228 | 63 | let mut i_cap = 0; |
1229 | 126 | while chars[i_cap] != 'C' { // we know 'C' is in the string, so no need to check for exceeding chars_len |
1230 | 63 | i_cap += 1; |
1231 | 63 | } |
1232 | 73 | for i_end in |
1233 | 73 | if chars[i_end] == 'L' { |
1234 | // skip the next char to get to the real start, and then look for the modifier string or next L/N | |
1235 | // debug!(" after L '{}'", chars[i_end+2..].iter().collect::<String>()); | |
1236 | 65 | for i in |
1237 | 65 | let ch = chars[i]; |
1238 | 65 | if ch == '1' { |
1239 | // Fix: there's probably a much better way to check if we have a match against one of "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱ ", "⠨⠸⠱" | |
1240 | 5 | if chars[i+1] == '⠱' { |
1241 | 0 | return true; |
1242 | 5 | } else if i+2 < chars_len { |
1243 | 5 | let mut str = chars[i+1].to_string(); |
1244 | 5 | str.push(chars[i+2]); |
1245 | 5 | if str == "⠘⠱" || str == "⠘⠲" || str == "⠸⠱" || str == "⠐⠱" { |
1246 | 1 | return true; |
1247 | 4 | } else if i+3 < chars_len { |
1248 | 4 | str.push(chars[i+3]); |
1249 | 4 | return str == "⠨⠸⠱"; |
1250 | 0 | } |
1251 | 0 | return false; |
1252 | 0 | } |
1253 | 60 | } |
1254 | 60 | if ch == 'L' || |
1255 | 48 | return false; |
1256 | 12 | } |
1257 | } | |
1258 | 10 | } |
1259 | } | |
1260 | 10 | return false; |
1261 | 63 | } |
1262 | 478 | } |
1263 | ||
1264 | 521 | fn find_next_char(chars: &[char], target: char) -> Option<usize> { |
1265 | // first find the L or N and eat the char so that we are at the potential start of where the target lies | |
1266 | // debug!("Looking for '{}' in '{}'", target, chars.iter().collect::<String>()); | |
1267 | 610 | for i_end in |
1268 | 610 | if chars[i_end] == 'L' || |
1269 | // skip the next char to get to the real start, and then look for the target | |
1270 | // stop when L/N signals past potential target or we hit some non L/N char (actual braille) | |
1271 | // debug!(" after L/N '{}'", chars[i_end+2..].iter().collect::<String>()); | |
1272 | 521 | for ( |
1273 | 515 | if ch == 'L' || |
1274 | 383 | return None; |
1275 | 132 | } else if ch == target { |
1276 | // debug!(" found target"); | |
1277 | 67 | return Some(i); |
1278 | 65 | } |
1279 | } | |
1280 | 89 | } |
1281 | } | |
1282 | 71 | return None; |
1283 | 521 | } |
1284 | ||
1285 | #[allow(non_camel_case_types)] | |
1286 | #[derive(Debug, PartialEq, Copy, Clone)] | |
1287 | enum UEB_Mode { | |
1288 | Numeric, // also includes Grade1 | |
1289 | Grade1, | |
1290 | Grade2, | |
1291 | } | |
1292 | ||
1293 | #[allow(non_camel_case_types)] | |
1294 | #[derive(Debug, PartialEq, Copy, Clone)] | |
1295 | enum UEB_Duration { | |
1296 | // Standing alone: A braille symbol that is standing alone may have a contracted (grade 2) meaning. | |
1297 | // A letter or unbroken sequence of letters is “standing alone” if the symbols before and after the letter or | |
1298 | // sequence are spaces, hyphens, dashes or any combination thereof, including some common punctuation. | |
1299 | // Item: An “item” is defined as the next symbol or one of seven groupings listed in Rules of Unified English Braille, §11.4.1. | |
1300 | Symbol, | |
1301 | ||
1302 | // The grade 1 word indicator sets grade 1 mode for the next word or symbol sequence. | |
1303 | // A symbol sequence in UEB is defined as an unbroken string of braille signs, | |
1304 | // whether alphabetic or non-alphabetic, preceded and followed by a space. | |
1305 | Word, | |
1306 | Passage, | |
1307 | } | |
1308 | ||
1309 | // used to determine standing alone (on left side) | |
1310 | 4.53k | fn is_left_intervening_char(ch: char) -> bool { |
1311 | 4.53k |
|
1312 | 4.53k | } |
1313 | ||
1314 | /// Return value for use_g1_word_mode() | |
1315 | #[derive(Debug, PartialEq)] | |
1316 | enum Grade1WordIndicator { | |
1317 | NotInWord, // no '𝟙' in the current/next word | |
1318 | InWord, // '𝟙' in the current/next word | |
1319 | NotInChars, // no '𝟙' in the entire string (optimization for common case) | |
1320 | } | |
1321 | ||
1322 | 1.89k | fn remove_unneeded_mode_changes(raw_braille: &str, start_mode: UEB_Mode, start_duration: UEB_Duration) -> String { |
1323 | // FIX: need to be smarter about moving on wrt to typeforms/typefaces, caps, bold/italic. [maybe just let them loop through the default?] | |
1324 | 1.89k | let mut mode = start_mode; |
1325 | 1.89k | let mut duration = start_duration; |
1326 | 1.89k | let mut start_g2_letter = None; // used for start of contraction checks |
1327 | 1.89k | let mut i_g2_start = None; // set to 'i' when entering G2 mode; None in other modes. '1' indicator goes here if standing alone |
1328 | 1.89k | let mut cap_word_mode = false; // only set to true in G2 to prevent contractions |
1329 | 1.89k | let mut result = String::default(); |
1330 | 1.89k | let chars = raw_braille.chars().collect::<Vec<char>>(); |
1331 | 1.89k | let mut g1_word_indicator = Grade1WordIndicator::NotInChars; // almost always true (and often irrelevant) |
1332 | 1.89k | if mode == UEB_Mode::Grade2 || |
1333 | 991 | g1_word_indicator = use_g1_word_mode(&chars); |
1334 | 991 | if g1_word_indicator == Grade1WordIndicator::InWord { |
1335 | 1 | mode = UEB_Mode::Grade1; |
1336 | 1 | if duration == UEB_Duration::Symbol { |
1337 | 1 | duration = UEB_Duration::Word; // if Passage mode, leave as is |
1338 | 1 | result.push('𝟙') |
1339 | 0 | } |
1340 | 990 | } |
1341 | 901 | } |
1342 | 1.89k | let mut i = 0; |
1343 | 37.0k | while i < chars.len() { |
1344 | 35.1k | let ch = chars[i]; |
1345 | 35.1k | match mode { |
1346 | UEB_Mode::Numeric => { | |
1347 | // Numeric Mode: (from https://uebmath.aphtech.org/lesson1.0 and lesson4.0) | |
1348 | // Symbols that can appear within numeric mode include the ten digits, comma, period, simple fraction line, | |
1349 | // line continuation indicator, and numeric space digit symbols. | |
1350 | // A space or any other symbol not listed here terminates numeric mode. | |
1351 | // Numeric mode is also terminated by the "!" -- used after a script | |
1352 | // | |
1353 | // The numeric indicator also turns on grade 1 mode. | |
1354 | // When grade 1 mode is set by the numeric indicator, | |
1355 | // grade 1 indicators are not used unless a single lower-case letter a-j immediately follows a digit. | |
1356 | // Grade 1 mode when set by the numeric indicator is terminated by a space, hyphen, dash, or a grade 1 indicator. | |
1357 | 3.31k | i_g2_start = None; |
1358 | // debug!("Numeric: ch={}, duration: {:?}", ch, duration); | |
1359 | 3.31k | match ch { |
1360 | 'L' => { | |
1361 | // terminate numeric mode -- duration doesn't change | |
1362 | // let the default case handle pushing on the chars for the letter | |
1363 | 1.42k | if is_letter_number(unhighlight(chars[i+1])) { |
1364 | 1.37k | result.push('1'); // need to distinguish a-j from a digit |
1365 | 1.37k |
|
1366 | 1.42k | result.push(ch); |
1367 | 1.42k | i += 1; |
1368 | 1.42k | mode = UEB_Mode::Grade1; |
1369 | // duration remains Word | |
1370 | }, | |
1371 | '1' | '𝟙' => { | |
1372 | // numeric mode implies grade 1, so don't output indicator; | |
1373 | 107 | i += 1; |
1374 | 107 | mode = UEB_Mode::Grade1; |
1375 | 107 | if start_duration == UEB_Duration::Passage { |
1376 | 15 | duration = UEB_Duration::Passage; // otherwise it remains at Word |
1377 | 92 | } |
1378 | }, | |
1379 | '#' => { | |
1380 | // terminate numeric mode -- duration doesn't change | |
1381 | 738 | i += 1; |
1382 | 738 | if i+1 < chars.len() && |
1383 | 9 | // special case where the script was numeric and a letter follows, so need to put out G1 indicator |
1384 | 9 | result.push('1'); |
1385 | 9 | // the G1 case should work with 'L' now |
1386 | 729 | } |
1387 | 738 | mode = UEB_Mode::Grade1; |
1388 | }, | |
1389 | 521 | 'N' => { |
1390 | 521 | // stay in the same mode (includes numeric "," and "." space) -- don't let default get these chars |
1391 | 521 | result.push(chars[i+1]); |
1392 | 521 | i += 2; |
1393 | 521 | }, |
1394 | _ => { | |
1395 | // moving out of numeric mode | |
1396 | 524 | result.push(ch); |
1397 | 524 | i += 1; |
1398 | 524 | if "W𝐖-—―".contains(ch) { |
1399 | 94 | mode = start_mode; |
1400 | 94 | if mode == UEB_Mode::Grade2 { |
1401 | 47 | start_g2_letter = None; // will be set to real letter |
1402 | 47 | } |
1403 | 94 | if start_duration != UEB_Duration::Passage { |
1404 | 47 | duration = UEB_Duration::Symbol; |
1405 | 47 | } |
1406 | } else { | |
1407 | 430 | mode = UEB_Mode::Grade1 |
1408 | } | |
1409 | }, | |
1410 | } | |
1411 | }, | |
1412 | UEB_Mode::Grade1 => { | |
1413 | // Grade 1 Mode: | |
1414 | // The numeric indicator also sets grade 1 mode. | |
1415 | // Grade 1 mode, when initiated by the numeric indicator, is terminated by a space, hyphen, dash or grade 1 terminator. | |
1416 | // Grade 1 mode is also set by grade 1 indicators. | |
1417 | 25.0k | i_g2_start = None; |
1418 | // debug!("Grade 1: ch={}, duration: {:?}", ch, duration); | |
1419 | 25.0k | match ch { |
1420 | 3.34k | 'L' => { |
1421 | 3.34k | // note: be aware of '#' case for Numeric because '1' might already be generated |
1422 | 3.34k | // let prev_ch = if i > 1 {chars[i-1]} else {'1'}; // '1' -- anything beside ',' or '.' |
1423 | 3.34k | // if duration == UEB_Duration::Symbol || |
1424 | 3.34k | // ( ",. ".contains(prev_ch) && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) ) { |
1425 | 3.34k | // result.push('1'); // need to retain grade 1 indicator (RUEB 6.5.2) |
1426 | 3.34k | // } |
1427 | 3.34k | // let the default case handle pushing on the chars for the letter |
1428 | 3.34k | result.push(ch); |
1429 | 3.34k | i += 1; |
1430 | 3.34k | }, |
1431 | '1' | '𝟙' => { | |
1432 | 2.35k | assert!(ch == '1' || |
1433 | // nothing to do -- let the default case handle the following chars | |
1434 | 2.35k | i += 1; |
1435 | }, | |
1436 | 2.36k | 'N' => { |
1437 | 2.36k | result.push(ch); |
1438 | 2.36k | result.push(chars[i+1]); |
1439 | 2.36k | i += 2; |
1440 | 2.36k | mode = UEB_Mode::Numeric; |
1441 | 2.36k | duration = UEB_Duration::Word; |
1442 | 2.36k | }, |
1443 | 'W' | '𝐖' => { | |
1444 | // this terminates a word mode if there was one | |
1445 | 711 | result.push(ch); |
1446 | 711 | i += 1; |
1447 | 711 | if start_duration != UEB_Duration::Passage { |
1448 | 224 | duration = UEB_Duration::Symbol; |
1449 | 224 | mode = UEB_Mode::Grade2; |
1450 | 487 | } |
1451 | }, | |
1452 | _ => { | |
1453 | 16.3k | result.push(ch); |
1454 | 16.3k | i += 1; |
1455 | 16.3k | if duration == UEB_Duration::Symbol && |
1456 | 1.34k | mode = start_mode; |
1457 | 14.9k | } |
1458 | } | |
1459 | } | |
1460 | 25.0k | if mode == UEB_Mode::Grade2 { |
1461 | 1.56k | start_g2_letter = None; // will be set to real letter |
1462 | 23.5k | } |
1463 | ||
1464 | }, | |
1465 | UEB_Mode::Grade2 => { | |
1466 | // note: if we ended up using a '1', it only extends to the next char, which is also dealt with, so mode doesn't change | |
1467 | 6.79k | if i_g2_start.is_none() { |
1468 | 2.58k | i_g2_start = Some(i); |
1469 | 2.58k | cap_word_mode = false; |
1470 | 4.21k | } |
1471 | // debug!("Grade 2: ch={}, duration: {:?}", ch, duration); | |
1472 | 6.79k | match ch { |
1473 | 'L' => { | |
1474 | 1.44k | if start_g2_letter.is_none() { |
1475 | 1.34k | start_g2_letter = Some(i); |
1476 | 1.34k |
|
1477 | 1.44k | let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, i); |
1478 | // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab") | |
1479 | 1.44k | if is_alone && ( |
1480 | 373 | // debug!(" is_alone -- pushing '1'"); |
1481 | 373 | result.push('1'); |
1482 | 373 | mode = UEB_Mode::Grade1; |
1483 | 1.07k | } |
1484 | // debug!(" pushing {:?}", right_matched_chars); | |
1485 | 3.13k |
|
1486 | 1.44k | i += right_matched_chars.len(); |
1487 | }, | |
1488 | 'C' => { | |
1489 | // Want 'C' before 'L'; Could be CC for word cap -- if so, eat it and move on | |
1490 | // Note: guaranteed that there is a char after the 'C', so chars[i+1] is safe | |
1491 | 99 | if chars[i+1] == 'C' { |
1492 | 14 | cap_word_mode = true; |
1493 | 14 | i += 1; |
1494 | 14 | } else { |
1495 | 85 | let is_greek = chars[i+1] == 'G'; |
1496 | 85 | let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, if is_greek { |
1497 | // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab") | |
1498 | 85 | if is_alone && ( |
1499 | 16 | // debug!(" is_alone -- pushing '1'"); |
1500 | 16 | result.push('1'); |
1501 | 16 | mode = UEB_Mode::Grade1; |
1502 | 69 | } |
1503 | 85 | if cap_word_mode { |
1504 | 14 | result.push('C'); // first 'C' if cap word |
1505 | 71 | } |
1506 | 85 | result.push('C'); |
1507 | 85 | if is_greek { |
1508 | 2 | result.push('G'); |
1509 | 2 | i += 1; |
1510 | 83 | } |
1511 | 85 | start_g2_letter = Some(i); |
1512 | // debug!(" pushing 'C' + {:?}", right_matched_chars); | |
1513 | 256 |
|
1514 | 85 | i += 1 + right_matched_chars.len(); |
1515 | } | |
1516 | }, | |
1517 | 1.34k | '1' => { |
1518 | 1.34k | result.push(ch); |
1519 | 1.34k | i += 1; |
1520 | 1.34k | mode = UEB_Mode::Grade1; |
1521 | 1.34k | duration = UEB_Duration::Symbol; |
1522 | 1.34k | }, |
1523 | '𝟙' => { | |
1524 | // '𝟙' should have forced G1 Word mode | |
1525 | 0 | error!("Internal error: '𝟙' found in G2 mode: index={i} in '{raw_braille}'"); |
1526 | 0 | i += 1; |
1527 | } | |
1528 | 582 | 'N' => { |
1529 | 582 | result.push(ch); |
1530 | 582 | result.push(chars[i+1]); |
1531 | 582 | i += 2; |
1532 | 582 | mode = UEB_Mode::Numeric; |
1533 | 582 | duration = UEB_Duration::Word; |
1534 | 582 | }, |
1535 | _ => { | |
1536 | 3.32k | if let Some( |
1537 | 505 | if !cap_word_mode { |
1538 | 504 | result = handle_contractions(&chars[start..i], result); |
1539 | 504 |
|
1540 | 505 | cap_word_mode = false; |
1541 | 505 | start_g2_letter = None; // not start of char sequence |
1542 | 2.81k | } |
1543 | 3.32k | result.push(ch); |
1544 | 3.32k | i += 1; |
1545 | 3.32k | if !is_left_intervening_char(ch) { |
1546 | 3.29k | cap_word_mode = false; |
1547 | 3.29k | i_g2_start = Some(i); |
1548 | 3.29k |
|
1549 | ||
1550 | } | |
1551 | } | |
1552 | 6.79k | if mode != UEB_Mode::Grade2 && |
1553 | 2.30k | let Some( |
1554 | 883 | result = handle_contractions(&chars[start..i], result); |
1555 | 883 | start_g2_letter = None; // not start of char sequence |
1556 | 5.91k | } |
1557 | }, | |
1558 | } | |
1559 | ||
1560 | 35.1k | if (ch == 'W' || |
1561 | 602 | (mode == UEB_Mode::Grade2 || |
1562 | 602 | g1_word_indicator = use_g1_word_mode(&chars[i..]); |
1563 | 602 | if g1_word_indicator == Grade1WordIndicator::InWord { |
1564 | 1 | mode = UEB_Mode::Grade1; |
1565 | 1 | if duration == UEB_Duration::Symbol { |
1566 | 1 | duration = UEB_Duration::Word; // if Passage mode, leave as is |
1567 | 1 | result.push('𝟙') |
1568 | 0 | } |
1569 | 601 | } |
1570 | 34.5k | } |
1571 | } | |
1572 | 1.89k | if mode == UEB_Mode::Grade2 && |
1573 | 289 | let Some( |
1574 | 31 | result = handle_contractions(&chars[start..i], result); |
1575 | 1.86k | } |
1576 | ||
1577 | 1.89k | return result; |
1578 | ||
1579 | ||
1580 | 1.59k | fn use_g1_word_mode(chars: &[char]) -> Grade1WordIndicator { |
1581 | // debug!("use_g1_word_mode: chars='{:?}'", chars); | |
1582 | 19.5k | for &ch in |
1583 | 19.5k | if ch == 'W' || |
1584 | 601 | return Grade1WordIndicator::NotInWord; // reached a word boundary |
1585 | 18.9k | } |
1586 | 18.9k | if ch == '𝟙' { |
1587 | 2 | return Grade1WordIndicator::InWord; // need word mode in this "word" |
1588 | 18.9k | } |
1589 | } | |
1590 | 990 | return Grade1WordIndicator::NotInChars; // |
1591 | 1.59k | } |
1592 | 1.89k | } |
1593 | ||
1594 | /// Returns a tuple: | |
1595 | /// true if the ith char "stands alone" (UEB 2.6) | |
1596 | /// the chars on the right that are part of the standing alone sequence | |
1597 | /// the number of letters in that sequence | |
1598 | /// This basically means a letter sequence surrounded by white space with some potentially intervening chars | |
1599 | /// The intervening chars can be typeform/cap indicators, along with various forms of punctuation | |
1600 | /// The ith char should be an "L" | |
1601 | /// This assumes that there is whitespace before and after the character string | |
1602 | 1.52k | fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) { |
1603 | // scan backward and check the conditions for "standing-alone" | |
1604 | // we scan forward and check the conditions for "standing-alone" | |
1605 | 1.52k | assert_eq!(chars[i], 'L', "'stands_alone' starts with non 'L'"); |
1606 | // debug!("stands_alone: i={}, chars: {:?}", i, chars); | |
1607 | 1.52k | if !left_side_stands_alone(&chars[0..i]) { |
1608 | 977 | return (false, &chars[i..i+2], 0); |
1609 | 552 | } |
1610 | ||
1611 | 552 | let (mut is_alone, n_letters, n_right_matched) = right_side_stands_alone(&chars[i+2..]); |
1612 | // debug!("left is alone, right is alone: {}, : n_letters={}, n_right_matched={}", is_alone, n_letters, n_right_matched); | |
1613 | ||
1614 | 552 | if is_alone && |
1615 | 390 | let ch = chars[i+1]; |
1616 | 390 | if ch=='⠁' || |
1617 | 3 | is_alone = false; |
1618 | 387 | } |
1619 | 162 | } |
1620 | 552 | return (is_alone, &chars[i..i+2+n_right_matched], n_letters); |
1621 | ||
1622 | /// chars before 'L' | |
1623 | 1.52k | fn left_side_stands_alone(chars: &[char]) -> bool { |
1624 | // scan backwards to skip letters and intervening chars | |
1625 | // once we hit an intervening char, only intervening chars are allowed if standing alone | |
1626 | 1.52k | let mut intervening_chars_mode = false; // true when we are on the final stretch |
1627 | 1.52k | let mut i = chars.len(); |
1628 | 1.86k | while i > 0 { |
1629 | 1.38k | i -= 1; |
1630 | 1.38k | let ch = chars[i]; |
1631 | 1.38k | let prev_ch = if i > 0 { |
1632 | // debug!(" left alone: prev/ch {}/{}", prev_ch, ch); | |
1633 | 1.38k | if (!intervening_chars_mode && |
1634 | 1.30k | (prev_ch == 'o' || |
1635 | 174 | intervening_chars_mode = true; |
1636 | 174 | i -= 1; // ignore 'Lx' and also ignore 'ox' |
1637 | 1.21k | } else if is_left_intervening_char(ch) { |
1638 | 161 | intervening_chars_mode = true; |
1639 | 161 | } else { |
1640 | 1.05k | return "W𝐖-—―".contains(ch); |
1641 | } | |
1642 | } | |
1643 | ||
1644 | 475 | return true; |
1645 | 1.52k | } |
1646 | ||
1647 | // chars after character we are testing | |
1648 | 552 | fn right_side_stands_alone(chars: &[char]) -> (bool, usize, usize) { |
1649 | // see RUEB 2.6.3 | |
1650 | 355 | fn is_right_intervening_char(ch: char) -> bool { |
1651 | 355 |
|
1652 | 355 | } |
1653 | // scan forward to skip letters and intervening chars | |
1654 | // once we hit an intervening char, only intervening chars are allowed if standing alone ('c' and 'b' are part of them) | |
1655 | 552 | let mut intervening_chars_mode = false; // true when we are on the final stretch |
1656 | 552 | let mut i = 0; |
1657 | 552 | let mut n_letters = 1; // we have skipped the first letter |
1658 | 725 | while i < chars.len() { |
1659 | 515 | let ch = chars[i]; |
1660 | // debug!(" right alone: ch/next {}/{}", ch, if i+1<chars.len() {chars[i+1]} else {' '}); | |
1661 | 515 | if !intervening_chars_mode && |
1662 | 140 | n_letters += 1; |
1663 | 140 | i += 1; // ignore 'Lx' and also ignore 'ox' |
1664 | 375 | } else if ch == 'c' || |
1665 | 20 | i += 1; // ignore 'Lx' and also ignore 'ox' |
1666 | 355 | } else if is_right_intervening_char(ch) { |
1667 | 13 | intervening_chars_mode = true; |
1668 | 13 | } else { |
1669 | 342 | return if "W𝐖-—―".contains(ch) { |
1670 | } | |
1671 | 173 | i += 1; |
1672 | } | |
1673 | ||
1674 | 210 | return (true, n_letters, chars.len()); |
1675 | 552 | } |
1676 | 1.52k | } |
1677 | ||
1678 | ||
1679 | /// Return a modified result if chars can be contracted. | |
1680 | /// Otherwise, the original string is returned | |
1681 | 1.41k | fn handle_contractions(chars: &[char], mut result: String) -> String { |
1682 | struct Replacement { | |
1683 | pattern: String, | |
1684 | replacement: &'static str | |
1685 | } | |
1686 | ||
1687 | const ASCII_TO_UNICODE: &[char] = &[ | |
1688 | '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌', | |
1689 | '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹', | |
1690 | '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕', | |
1691 | '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸', | |
1692 | ]; | |
1693 | ||
1694 | 36 | fn to_unicode_braille(ascii: &str) -> String { |
1695 | 36 | let mut unicode = String::with_capacity(4*ascii.len()); // 'L' + 3 bytes for braille char |
1696 | 82 | for ch in |
1697 | 82 | unicode.push('L'); |
1698 | 82 | unicode.push(ASCII_TO_UNICODE[(ch.to_ascii_uppercase() - 32) as usize]) |
1699 | } | |
1700 | 36 | return unicode; |
1701 | 36 | } |
1702 | ||
1703 | // It would be much better from an extensibility point of view to read the table in from a file | |
1704 | 2 | static CONTRACTIONS: LazyLock<Vec<Replacement>> = LazyLock::new(|| { vec![ |
1705 | // 10.3: Strong contractions | |
1706 | 2 | Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"}, |
1707 | 2 | Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"}, |
1708 | 2 | Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"}, |
1709 | 2 | Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"}, |
1710 | 2 | Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"}, |
1711 | ||
1712 | // 10.8: final-letter group signs (this need to precede 'en' and any other shorter contraction) | |
1713 | 2 | Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment |
1714 | 2 | Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion |
1715 | ||
1716 | // 10.4: Strong group signs | |
1717 | 2 | Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"}, |
1718 | 2 | Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"}, |
1719 | 2 | Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"}, |
1720 | 2 | Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"}, |
1721 | 2 | Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"}, |
1722 | 2 | Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"}, |
1723 | 2 | Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"}, |
1724 | 2 | Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"}, |
1725 | 2 | Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"}, |
1726 | 2 | Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"}, |
1727 | 2 | Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" }, // 'ing', not at start |
1728 | 2 | Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"}, |
1729 | ||
1730 | // 10.6.5: Lower group signs preceded and followed by letters | |
1731 | // FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package | |
1732 | // Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words | |
1733 | // Add it back after implementing a lookup dictionary of exceptions | |
1734 | 2 | Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" }, // ea |
1735 | 2 | Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" }, // bb |
1736 | // Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" }, // cc | |
1737 | 2 | Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" }, // ff |
1738 | 2 | Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" }, // gg |
1739 | ||
1740 | // 10.6.8: Lower group signs ("in" also 10.5.4 lower word signs) | |
1741 | // FIX: these need restrictions about only applying when upper dots are present | |
1742 | 2 | Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"}, |
1743 | 2 | Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"}, |
1744 | ||
1745 | ] | |
1746 | 2 | }); |
1747 | ||
1748 | 2 | static CONTRACTION_PATTERNS: LazyLock<RegexSet> = LazyLock::new(|| init_patterns(&CONTRACTIONS)); |
1749 | 1 | static CONTRACTION_REGEX: LazyLock<Vec<Regex>> = LazyLock::new(|| init_regex(&CONTRACTIONS)); |
1750 | ||
1751 | 1.41k | let mut chars_as_str = chars.iter().collect::<String>(); |
1752 | // debug!(" handle_contractions: examine '{}'", &chars_as_str); | |
1753 | 1.41k | let matches = CONTRACTION_PATTERNS.matches(&chars_as_str); |
1754 | 1.41k | for |
1755 | 35 | let element = &CONTRACTIONS[i]; |
1756 | 35 | // debug!(" replacing '{}' with '{}' in '{}'", element.pattern, element.replacement, &chars_as_str); |
1757 | 35 | result.truncate(result.len() - chars_as_str.len()); |
1758 | 35 | chars_as_str = CONTRACTION_REGEX[i].replace_all(&chars_as_str, element.replacement).to_string(); |
1759 | 35 | result.push_str(&chars_as_str); |
1760 | 35 | // debug!(" result after replace '{}'", result); |
1761 | 35 | } |
1762 | 1.41k | return result; |
1763 | ||
1764 | ||
1765 | ||
1766 | 2 | fn init_patterns(contractions: &[Replacement]) -> RegexSet { |
1767 | 2 | let mut vec: Vec<&str> = Vec::with_capacity(contractions.len()); |
1768 | 50 | for contraction in |
1769 | 50 | vec.push(&contraction.pattern); |
1770 | 50 | } |
1771 | 2 | return RegexSet::new(&vec).unwrap(); |
1772 | 2 | } |
1773 | ||
1774 | 1 | fn init_regex(contractions: &[Replacement]) -> Vec<Regex> { |
1775 | 1 | let mut vec = Vec::with_capacity(contractions.len()); |
1776 | 25 | for contraction in |
1777 | 25 | vec.push(Regex::new(&contraction.pattern).unwrap()); |
1778 | 25 | } |
1779 | 1 | return vec; |
1780 | 1 | } |
1781 | 1.41k | } |
1782 | ||
1783 | ||
1784 | ||
1785 | ||
1786 | static VIETNAM_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1787 | "S" => "XXX", // sans-serif -- from prefs | |
1788 | "B" => "⠘", // bold | |
1789 | "𝔹" => "XXX", // blackboard -- from prefs | |
1790 | "T" => "⠈", // script | |
1791 | "I" => "⠨", // italic | |
1792 | "R" => "", // roman | |
1793 | // "E" => "⠰", // English | |
1794 | "1" => "⠠", // Grade 1 symbol | |
1795 | "L" => "", // Letter left in to assist in locating letters | |
1796 | "D" => "XXX", // German (Deutsche) -- from prefs | |
1797 | "G" => "⠰", // Greek | |
1798 | "V" => "XXX", // Greek Variants | |
1799 | // "H" => "⠠⠠", // Hebrew | |
1800 | // "U" => "⠈⠈", // Russian | |
1801 | "C" => "⠨", // capital | |
1802 | "𝑐" => "", // second or latter braille cell of a capital letter | |
1803 | "𝐶" => "⠨", // capital that never should get word indicator (from chemical element) | |
1804 | "N" => "⠼", // number indicator | |
1805 | "t" => "⠱", // shape terminator | |
1806 | "W" => "⠀", // whitespace" | |
1807 | "𝐖"=> "⠀", // whitespace | |
1808 | "s" => "⠆", // typeface single char indicator | |
1809 | "w" => "", // typeface word indicator | |
1810 | "e" => "", // typeface & capital terminator | |
1811 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
1812 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
1813 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
1814 | "," => "⠂", // comma | |
1815 | "." => "⠲", // period | |
1816 | "-" => "-", // hyphen | |
1817 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
1818 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
1819 | "#" => "", // signals end of script | |
1820 | "!" => "", // Hack used to prevent some regular expression matches | |
1821 | }; | |
1822 | ||
1823 | 112 | fn vietnam_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
1824 | // Deal with Vietnamese "rhymes" -- moving accents around | |
1825 | // See "Vietnamese Uncontracted Braille Update in MathCAT" or maybe https://icanreadvietnamese.com/blog/14-rule-of-tone-mark-placement | |
1826 | // Note: I don't know how to write (for example) I_E_RULE so that it excludes "qu" and "gi", so I use two rules | |
1827 | // The first rule rewrites the patterns with "qu" and "gi" to add "!" to prevent a match of the second rule -- "!" is dropped later | |
1828 | 1 | static QU_GI_RULE_EXCEPTION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(L⠟L⠥|L⠛L⠊)").unwrap()); |
1829 | 1 | static IUOY_E_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠊|⠥|⠕|⠽)(L[⠔⠰⠢⠤⠠])L(⠑|⠣)").unwrap()); // ie, ue, oe, and ye rule |
1830 | 1 | static UO_A_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠥|⠕)(L[⠔⠰⠢⠤⠠])L(⠁|⠡|⠜)").unwrap()); // ua, oa rule |
1831 | 1 | static UU_O_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠥|⠳)(L[⠔⠰⠢⠤⠠])L(⠪|⠹)").unwrap()); // uo, ưo rule |
1832 | 1 | static UYE_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽L⠣").unwrap()); // uo, ưo rule |
1833 | 1 | static UY_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽").unwrap()); // uo, ưo rule |
1834 | 1 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb!])").unwrap()); |
1835 | // debug!("vietnam_cleanup: start={}", raw_braille); | |
1836 | 112 | let result = typeface_to_word_mode(&raw_braille); |
1837 | 112 | let result = capitals_to_word_mode(&result); |
1838 | ||
1839 | 112 | let result = result.replace("tW", "W"); |
1840 | 112 | let result = result.replace("CG", "⠸"); // capital Greek letters are problematic in Vietnam braille |
1841 | 112 | let result = result.replace("CC", "⠸"); // capital word more is the same as capital Greek letters |
1842 | // debug!(" after typeface/caps={}", &result); | |
1843 | ||
1844 | // deal with "rhymes" | |
1845 | 112 | let result = QU_GI_RULE_EXCEPTION.replace_all(&result, "${1}!"); |
1846 | // debug!(" after except={}", &result); | |
1847 | 112 | let result = IUOY_E_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1848 | // debug!(" after IUOY_E={}", &result); | |
1849 | 112 | let result = UO_A_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1850 | // debug!(" after UO_A={}", &result); | |
1851 | 112 | let result = UU_O_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1852 | // debug!(" after UO_O={}", &result); | |
1853 | 112 | let result = UYE_RULE.replace_all(&result, "${1}L⠥L⠽L⠣"); // longer match first |
1854 | // debug!(" after UYE={}", &result); | |
1855 | 112 | let result = UY_RULE.replace_all(&result, "${1}L⠥L⠽"); |
1856 | // debug!(" after UY={}", &result); | |
1857 | ||
1858 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
1859 | 112 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
1860 | 112 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
1861 | 112 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
1862 | 112 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
1863 | ||
1864 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
1865 | 112 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
1866 | ||
1867 | ||
1868 | 1.23k | let |
1869 | 1.23k | let matched_char = &cap[0]; |
1870 | 1.23k | match matched_char { |
1871 | 1.23k | "𝔹" => |
1872 | 1.23k | "S" => |
1873 | 1.23k | "D" => |
1874 | 1.23k | "V" => |
1875 | 1.23k | _ => match VIETNAM_INDICATOR_REPLACEMENTS.get(matched_char) { |
1876 | 0 | None => {error!("REPLACE_INDICATORS and VIETNAM_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
1877 | 1.23k | Some(&ch) => ch, |
1878 | }, | |
1879 | } | |
1880 | 1.23k | }); |
1881 | ||
1882 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
1883 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
1884 | 112 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
1885 | ||
1886 | 112 | return result.to_string(); |
1887 | 112 | } |
1888 | ||
1889 | ||
1890 | static CMU_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1891 | // "S" => "XXX", // sans-serif -- from prefs | |
1892 | "B" => "⠔", // bold | |
1893 | "𝔹" => "⠬", // blackboard -- from prefs | |
1894 | // "T" => "⠈", // script | |
1895 | "I" => "⠔", // italic -- same as bold | |
1896 | // "R" => "", // roman | |
1897 | // "E" => "⠰", // English | |
1898 | "1" => "⠐", // Grade 1 symbol -- used here for a-j after number | |
1899 | "L" => "", // Letter left in to assist in locating letters | |
1900 | "D" => "⠠", // German (Gothic) | |
1901 | "G" => "⠈", // Greek | |
1902 | "V" => "⠈⠬", // Greek Variants | |
1903 | // "H" => "⠠⠠", // Hebrew | |
1904 | // "U" => "⠈⠈", // Russian | |
1905 | "C" => "⠨", // capital | |
1906 | "𝐶" => "⠨", // capital that never should get word indicator (from chemical element) | |
1907 | "N" => "⠼", // number indicator | |
1908 | "𝑁" => "", // continue number | |
1909 | // "t" => "⠱", // shape terminator | |
1910 | "W" => "⠀", // whitespace" | |
1911 | "𝐖"=> "⠀", // whitespace | |
1912 | // "𝘄" => "⠀", // add whitespace if char to the left has dots 1, 2, or 3 -- special rule handled separately, so commented out | |
1913 | "s" => "", // typeface single char indicator | |
1914 | // "w" => "⠂", // typeface word indicator | |
1915 | // "e" => "⠄", // typeface & capital terminator | |
1916 | // "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
1917 | // "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
1918 | // "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
1919 | "," => "⠂", // comma | |
1920 | "." => "⠄", // period | |
1921 | "-" => "⠤", // hyphen | |
1922 | "—" => "⠤⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
1923 | // "―" => "⠐⠤⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
1924 | "#" => "⠼", // signals to end/restart of numeric mode (mixed fractions) | |
1925 | }; | |
1926 | ||
1927 | ||
1928 | 372 | fn cmu_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
1929 | 2 | static ADD_WHITE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"𝘄(.)|𝘄$").unwrap()); |
1930 | ||
1931 | // debug!("cmu_cleanup: start={}", raw_braille); | |
1932 | // let result = typeface_to_word_mode(&raw_braille); | |
1933 | ||
1934 | // let result = result.replace("tW", "W"); | |
1935 | 372 | let result = raw_braille.replace("CG", "⠘") |
1936 | 372 | .replace("𝔹C", "⠩") |
1937 | 372 | .replace("DC", "⠰"); |
1938 | // let result = result.replace("CC", "⠸"); | |
1939 | ||
1940 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
1941 | // let double_struck = pref_manager.pref_to_string("CMU_DoubleStruck"); | |
1942 | // let sans_serif = pref_manager.pref_to_string("CMU_SansSerif"); | |
1943 | // let fraktur = pref_manager.pref_to_string("CMU_Fraktur"); | |
1944 | ||
1945 | // debug!("Before remove mode changes: '{}'", &result); | |
1946 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
1947 | 372 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
1948 | 372 | let result = result.replace("𝑁N", ""); |
1949 | // debug!(" After remove mode changes: '{}'", &result); | |
1950 | ||
1951 | 2.58k | let |
1952 | 2.58k | match CMU_INDICATOR_REPLACEMENTS.get(&cap[0]) { |
1953 | 0 | None => {error!("REPLACE_INDICATORS and CMU_INDICATOR_REPLACEMENTS are not in sync"); ""}, |
1954 | 2.58k | Some(&ch) => ch, |
1955 | } | |
1956 | 2.58k | }); |
1957 | 372 | let result = ADD_WHITE_SPACE.replace_all(&result, |cap: &Captures| |
1958 | 12 | if cap.get(1).is_none() { |
1959 | 2 | return "⠀".to_string(); |
1960 | } else { | |
1961 | // debug!("ADD_WHITE_SPACE match='{}', has left dots = {}", &cap[1], has_left_dots(cap[1].chars().next().unwrap())); | |
1962 | 10 | let mut next_chars = cap[1].chars(); |
1963 | 10 | let next_char = next_chars.next().unwrap(); |
1964 | 10 | assert!(next_chars.next().is_none()); |
1965 | 10 | return (if has_left_dots(next_char) { |
1966 | } | |
1967 | 12 | }); |
1968 | ||
1969 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
1970 | 372 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
1971 | 372 | let result = result.trim_start_matches('⠀'); // don't trip end (e.g., see once::vector_11_2_5) |
1972 | 372 | return result.to_string(); |
1973 | ||
1974 | 10 | fn has_left_dots(ch: char) -> bool { |
1975 | // Unicode braille is set up so dot 1 is 2^0, dot 2 is 2^1, etc | |
1976 | 10 | return ( (ch as u32 - 0x2800) >> 4 ) > 0; |
1977 | 10 | } |
1978 | 372 | } |
1979 | ||
1980 | ||
1981 | ||
1982 | static SWEDISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1983 | // FIX: this needs cleaning up -- not all of these are used | |
1984 | "S" => "XXX", // sans-serif -- from prefs | |
1985 | "B" => "⠨", // bold | |
1986 | "𝔹" => "XXX", // blackboard -- from prefs | |
1987 | "T" => "⠈", // script | |
1988 | "I" => "⠨", // italic | |
1989 | "R" => "", // roman | |
1990 | "1" => "⠱", // Grade 1 symbol (used for number followed by a letter) | |
1991 | "L" => "", // Letter left in to assist in locating letters | |
1992 | "D" => "XXX", // German (Deutsche) -- from prefs | |
1993 | "G" => "⠰", // Greek | |
1994 | "V" => "XXX", // Greek Variants | |
1995 | // "H" => "⠠⠠", // Hebrew | |
1996 | // "U" => "⠈⠈", // Russian | |
1997 | "C" => "⠠", // capital | |
1998 | "𝑐" => "", // second or latter braille cell of a capital letter | |
1999 | "𝐶" => "⠠", // capital that never should get word indicator (from chemical element) | |
2000 | "N" => "⠼", // number indicator | |
2001 | "t" => "⠱", // shape terminator | |
2002 | "W" => "⠀", // whitespace" | |
2003 | "𝐖"=> "⠀", // whitespace | |
2004 | "w" => "⠀", // whitespace after function name | |
2005 | "s" => "", // typeface single char indicator | |
2006 | "e" => "", // typeface & capital terminator | |
2007 | "E" => "⠱", // empty base -- see index of radical | |
2008 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
2009 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
2010 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
2011 | "," => "⠂", // comma | |
2012 | "." => "⠲", // period | |
2013 | "-" => "-", // hyphen | |
2014 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
2015 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
2016 | "#" => "", // signals end of script | |
2017 | ||
2018 | }; | |
2019 | ||
2020 | ||
2021 | static FINNISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
2022 | // FIX: this needs cleaning up -- not all of these are used | |
2023 | "S" => "XXX", // sans-serif -- from prefs | |
2024 | "B" => "⠨", // bold | |
2025 | "𝔹" => "XXX", // blackboard -- from prefs | |
2026 | "T" => "⠈", // script | |
2027 | "I" => "⠨", // italic | |
2028 | "R" => "", // roman | |
2029 | "E" => "⠰", // English | |
2030 | "1" => "⠀", // Grade 1 symbol (used for number followed by a letter) | |
2031 | "L" => "", // Letter left in to assist in locating letters | |
2032 | "D" => "XXX", // German (Deutsche) -- from prefs | |
2033 | "G" => "⠨", // Greek | |
2034 | "V" => "XXX", // Greek Variants | |
2035 | // "H" => "⠠⠠", // Hebrew | |
2036 | // "U" => "⠈⠈", // Russian | |
2037 | "C" => "⠠", // capital | |
2038 | "𝑐" => "", // second or latter braille cell of a capital letter | |
2039 | "𝐶" => "⠠", // capital that never should get whitespace in front (from chemical element) | |
2040 | "N" => "⠼", // number indicator | |
2041 | "n" => "⠼", // number indicator for drop numbers (special case with close parens) | |
2042 | "t" => "⠱", // shape terminator | |
2043 | "W" => "⠀", // whitespace" | |
2044 | "𝐖"=> "⠀", // whitespace | |
2045 | "s" => "⠆", // typeface single char indicator | |
2046 | "w" => "", // typeface word indicator | |
2047 | "e" => "", // typeface & capital terminator | |
2048 | "," => "⠂", // comma | |
2049 | "." => "⠲", // period | |
2050 | "-" => "-", // hyphen | |
2051 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
2052 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
2053 | "(" => "⠦", // Not really needed, but done for consistency with ")" | |
2054 | ")" => "⠴", // Needed for rules with drop numbers to avoid mistaking for dropped 0 | |
2055 | "↑" => "⠬", // superscript | |
2056 | "↓" => "⠡", // subscript | |
2057 | "#" => "", // signals end of script | |
2058 | "Z" => "⠐", // signals end of index of root, integrand/lim from function ("zone change") | |
2059 | ||
2060 | }; | |
2061 | ||
2062 | 0 | fn finnish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2063 | 0 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([SB𝔹TIREDGVHUP𝐏C𝐶LlMmb↑↓Nn𝑁WwZ,()])").unwrap()); |
2064 | // Numbers need to end with a space, but sometimes there is one there for other reasons | |
2065 | 0 | static DROP_NUMBER_SEPARATOR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(n.)\)").unwrap()); |
2066 | 0 | static NUMBER_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"((N.)+[^WN𝐶#↑↓Z])").unwrap()); |
2067 | ||
2068 | // debug!("finnish_cleanup: start={}", raw_braille); | |
2069 | 0 | let result = DROP_NUMBER_SEPARATOR.replace_all(&raw_braille, |cap: &Captures| { |
2070 | // match includes the char after the number -- insert the whitespace before it | |
2071 | // debug!("DROP_NUMBER_SEPARATOR match='{}'", &cap[1]); | |
2072 | 0 | return cap[1].to_string() + "𝐶)"; // hack to use "𝐶" instead of dot 6 directly, but works for NUMBER_MATCH |
2073 | 0 | }); |
2074 | 0 | let result = result.replace('n', "N"); // avoids having to modify remove_unneeded_mode_changes() |
2075 | 0 | let result = NUMBER_MATCH.replace_all(&result, |cap: &Captures| { |
2076 | // match includes the char after the number -- insert the whitespace before it | |
2077 | // debug!("NUMBER_MATCH match='{}'", &cap[1]); | |
2078 | 0 | let mut chars = cap[0].chars(); |
2079 | 0 | let last_char = chars.next_back().unwrap(); // unwrap safe since several chars were matched |
2080 | 0 | return chars.as_str().to_string() + "W" + &last_char.to_string(); |
2081 | 0 | }); |
2082 | ||
2083 | // FIX: need to implement this -- this is just a copy of the Vietnam code | |
2084 | 0 | let result = result.replace("CG", "⠘") |
2085 | 0 | .replace("𝔹C", "⠩") |
2086 | 0 | .replace("DC", "⠰"); |
2087 | ||
2088 | // debug!(" after typeface/caps={}", &result); | |
2089 | ||
2090 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
2091 | 0 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
2092 | 0 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
2093 | 0 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
2094 | 0 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
2095 | ||
2096 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
2097 | 0 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
2098 | // debug!(" remove_unneeded_mode_changes={}", &result); | |
2099 | ||
2100 | ||
2101 | 0 | let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| { |
2102 | 0 | let matched_char = &cap[0]; |
2103 | 0 | match matched_char { |
2104 | 0 | "𝔹" => &double_struck, |
2105 | 0 | "S" => &sans_serif, |
2106 | 0 | "D" => &fraktur, |
2107 | 0 | "V" => &greek_variant, |
2108 | 0 | _ => match FINNISH_INDICATOR_REPLACEMENTS.get(matched_char) { |
2109 | 0 | None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
2110 | 0 | Some(&ch) => ch, |
2111 | }, | |
2112 | } | |
2113 | 0 | }); |
2114 | ||
2115 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
2116 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
2117 | 0 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
2118 | ||
2119 | 0 | return result.to_string(); |
2120 | 0 | } |
2121 | ||
2122 | ||
2123 | 0 | fn swedish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2124 | // FIX: need to implement this -- this is just a copy of the Vietnam code | |
2125 | // Empty bases are ok if they follow whitespace | |
2126 | 0 | static EMPTY_BASE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(^|[W𝐖w])E").unwrap()); |
2127 | // debug!("swedish_cleanup: start={}", raw_braille); | |
2128 | 0 | let result = typeface_to_word_mode(&raw_braille); |
2129 | 0 | let result = capitals_to_word_mode(&result); |
2130 | ||
2131 | 0 | let result = result.replace("CG", "⠘") |
2132 | 0 | .replace("𝔹C", "⠩") |
2133 | 0 | .replace("DC", "⠰"); |
2134 | ||
2135 | // debug!(" after typeface/caps={}", &result); | |
2136 | ||
2137 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
2138 | 0 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
2139 | 0 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
2140 | 0 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
2141 | 0 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
2142 | ||
2143 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
2144 | 0 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
2145 | // debug!(" after removing mode changes={}", &result); | |
2146 | ||
2147 | ||
2148 | 0 | let result = EMPTY_BASE.replace_all(&result, "$1"); |
2149 | 0 | let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| { |
2150 | 0 | let matched_char = &cap[0]; |
2151 | 0 | match matched_char { |
2152 | 0 | "𝔹" => &double_struck, |
2153 | 0 | "S" => &sans_serif, |
2154 | 0 | "D" => &fraktur, |
2155 | 0 | "V" => &greek_variant, |
2156 | 0 | _ => match SWEDISH_INDICATOR_REPLACEMENTS.get(matched_char) { |
2157 | 0 | None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
2158 | 0 | Some(&ch) => ch, |
2159 | }, | |
2160 | } | |
2161 | 0 | }); |
2162 | ||
2163 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
2164 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
2165 | 0 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
2166 | ||
2167 | 0 | return result.to_string(); |
2168 | 0 | } |
2169 | ||
2170 | #[allow(non_snake_case)] | |
2171 | 50 | fn LaTeX_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2172 | 1 | static REMOVE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" ([\^_,;)\]}])").unwrap()); // '^', '_', ',', ';', ')', ']', '}' |
2173 | 1 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap()); |
2174 | // debug!("LaTeX_cleanup: start={}", raw_braille); | |
2175 | 50 | let result = raw_braille.replace('𝐖', " "); |
2176 | // let result = COLLAPSE_SPACES.replace_all(&raw_braille, "⠀"); | |
2177 | 50 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2178 | // debug!("After collapse: {}", &result); | |
2179 | 50 | let result = REMOVE_SPACE.replace_all(&result, "$1"); |
2180 | // debug!("After remove: {}", &result); | |
2181 | // let result = result.trim_matches('⠀'); | |
2182 | 50 | let result = result.trim_matches(' '); |
2183 | ||
2184 | 50 | return result.to_string(); |
2185 | 50 | } |
2186 | ||
2187 | #[allow(non_snake_case)] | |
2188 | 41 | fn ASCIIMath_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2189 | 1 | static REMOVE_SPACE_BEFORE_OP: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"([\w\d]) +([^\w\d"]|[\^_,;)\]}])"#).unwrap()); |
2190 | 1 | static REMOVE_SPACE_AFTER_OP: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"([^\^_,;)\]}\w\d"]) +([\w\d])"#).unwrap()); |
2191 | 1 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap()); |
2192 | // debug!("ASCIIMath_cleanup: start={}", raw_braille); | |
2193 | 41 | let result = raw_braille.replace("|𝐖__|", "|𝐰__|"); // protect the whitespace to prevent misinterpretation as lfloor |
2194 | 41 | let result = result.replace('𝐖', " "); |
2195 | 41 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2196 | // debug!("After collapse: {}", &result); | |
2197 | 41 | let result = REMOVE_SPACE_BEFORE_OP.replace_all(&result, "$1$2"); |
2198 | 41 | let result = REMOVE_SPACE_AFTER_OP.replace_all(&result, "$1$2"); |
2199 | 41 | let result = result.replace('𝐰', " "); // spaces around relational operators |
2200 | 41 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2201 | // debug!("After remove: {}", &result); | |
2202 | // let result = result.trim_matches('⠀'); | |
2203 | 41 | let result = result.trim_matches(' '); |
2204 | ||
2205 | 41 | return result.to_string(); |
2206 | 41 | } |
2207 | ||
2208 | ||
2209 | /************** Braille xpath functionality ***************/ | |
2210 | use crate::canonicalize::{as_element, as_text, name}; | |
2211 | use crate::xpath_functions::{is_leaf, validate_one_node, IsBracketed}; | |
2212 | use std::result::Result as StdResult; | |
2213 | use sxd_document::dom::ParentOfChild; | |
2214 | use sxd_xpath::function::Error as XPathError; | |
2215 | use sxd_xpath::function::{Args, Function}; | |
2216 | use sxd_xpath::{context, nodeset::*, Value}; | |
2217 | ||
2218 | pub struct NemethNestingChars; | |
2219 | const NEMETH_FRAC_LEVEL: &str = "data-nemeth-frac-level"; // name of attr where value is cached | |
2220 | const FIRST_CHILD_ONLY: &[&str] = &["mroot", "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"]; | |
2221 | impl NemethNestingChars { | |
2222 | // returns a 'repeat_char' corresponding to the Nemeth rules for nesting | |
2223 | // note: this value is likely one char too long because the starting fraction is counted | |
2224 | 537 | fn nemeth_frac_value(node: Element, repeat_char: &str) -> String { |
2225 | 537 | let children = node.children(); |
2226 | 537 | let name = name(node); |
2227 | 537 | if is_leaf(node) { |
2228 | 244 | return "".to_string(); |
2229 | 293 | } else if name == "mfrac" { |
2230 | // have we already computed the value? | |
2231 | 221 | if let Some( |
2232 | 152 | return value.to_string(); |
2233 | 69 | } |
2234 | ||
2235 | 69 | let num_value = NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char); |
2236 | 69 | let denom_value = NemethNestingChars::nemeth_frac_value(as_element(children[1]), repeat_char); |
2237 | 69 | let mut max_value = if num_value.len() > denom_value.len() { |
2238 | 69 | max_value += repeat_char; |
2239 | 69 | node.set_attribute_value(NEMETH_FRAC_LEVEL, &max_value); |
2240 | 69 | return max_value; |
2241 | 72 | } else if FIRST_CHILD_ONLY.contains(&name) { |
2242 | // only look at the base -- ignore scripts/index | |
2243 | 10 | return NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char); |
2244 | } else { | |
2245 | 62 | let mut result = "".to_string(); |
2246 | 197 | for child in |
2247 | 197 | let value = NemethNestingChars::nemeth_frac_value(as_element(child), repeat_char); |
2248 | 197 | if value.len() > result.len() { |
2249 | 19 | result = value; |
2250 | 178 | } |
2251 | } | |
2252 | 62 | return result; |
2253 | } | |
2254 | 537 | } |
2255 | ||
2256 | 0 | fn nemeth_root_value(node: Element, repeat_char: &str) -> StdResult<String, XPathError> { |
2257 | // returns the correct number of repeat_chars to use | |
2258 | // note: because the highest count is toward the leaves and | |
2259 | // because this is a loop and not recursive, caching doesn't work without a lot of overhead | |
2260 | 0 | let parent = node.parent().unwrap(); |
2261 | 0 | if let ParentOfChild::Element(e) = parent { |
2262 | 0 | let mut parent = e; |
2263 | 0 | let mut result = "".to_string(); |
2264 | loop { | |
2265 | 0 | let name = name(parent); |
2266 | 0 | if name == "math" { |
2267 | 0 | return Ok( result ); |
2268 | 0 | } |
2269 | 0 | if name == "msqrt" || name == "mroot" { |
2270 | 0 | result += repeat_char; |
2271 | 0 | } |
2272 | 0 | let parent_of_child = parent.parent().unwrap(); |
2273 | 0 | if let ParentOfChild::Element(e) = parent_of_child { |
2274 | 0 | parent = e; |
2275 | 0 | } else { |
2276 | 0 | return Err( sxd_xpath::function::Error::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) ); |
2277 | } | |
2278 | } | |
2279 | 0 | } |
2280 | 0 | return Err( XPathError::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) ); |
2281 | 0 | } |
2282 | } | |
2283 | ||
2284 | impl Function for NemethNestingChars { | |
2285 | /** | |
2286 | * Returns a string with the correct number of nesting chars (could be an empty string) | |
2287 | * @param(node) -- current node | |
2288 | * @param(char) -- char (string) that should be repeated | |
2289 | * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast | |
2290 | */ | |
2291 | 192 | fn evaluate<'d>(&self, |
2292 | 192 | _context: &context::Evaluation<'_, 'd>, |
2293 | 192 | args: Vec<Value<'d>>) |
2294 | 192 | -> StdResult<Value<'d>, XPathError> |
2295 | { | |
2296 | 192 | let mut args = Args(args); |
2297 | 192 | args.exactly(2) |
2298 | 192 | let repeat_char = args.pop_string() |
2299 | 192 | let node = crate::xpath_functions::validate_one_node(args.pop_nodeset() |
2300 | 192 | if let Node::Element(el) = node { |
2301 | 192 | let name = name(el); |
2302 | // it is likely a bug to call this one a non mfrac | |
2303 | 192 | if name == "mfrac" { |
2304 | // because it is called on itself, the fraction is counted one too many times -- chop one off | |
2305 | // this is slightly messy because we are chopping off a char, not a byte | |
2306 | const BRAILLE_BYTE_LEN: usize = "⠹".len(); // all Unicode braille symbols have the same number of bytes | |
2307 | 192 | return Ok( Value::String( NemethNestingChars::nemeth_frac_value(el, &repeat_char)[BRAILLE_BYTE_LEN..].to_string() ) ); |
2308 | 0 | } else if name == "msqrt" || name == "mroot" { |
2309 | 0 | return Ok( Value::String( NemethNestingChars::nemeth_root_value(el, &repeat_char)? ) ); |
2310 | } else { | |
2311 | 0 | return Err(XPathError::Other(format!("NestingChars chars should be used only on 'mfrac'. '{}' was passed in", name))); |
2312 | } | |
2313 | } else { | |
2314 | // not an element, so nothing to do | |
2315 | 0 | return Ok( Value::String("".to_string()) ); |
2316 | } | |
2317 | 192 | } |
2318 | } | |
2319 | ||
2320 | pub struct BrailleChars; | |
2321 | impl BrailleChars { | |
2322 | // returns a string for the chars in the *leaf* node. | |
2323 | // this string follows the Nemeth rules typefaces and deals with mathvariant | |
2324 | // which has partially turned chars to the alphanumeric block | |
2325 | 12.5k | fn get_braille_chars(node: Element, code: &str, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> { |
2326 | 12.5k | let result = match code { |
2327 | 12.5k | "Nemeth" => |
2328 | 6.52k | "UEB" => |
2329 | 4.24k | "CMU" => |
2330 | 536 | "Vietnam" => BrailleChars:: get_braille_vietnam_chars(node, text_range), |
2331 | 0 | "Swedish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement |
2332 | 0 | "Finnish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement |
2333 | 0 | _ => return Err(sxd_xpath::function::Error::Other(format!("get_braille_chars: unknown braille code '{code}'"))) |
2334 | }; | |
2335 | 12.5k | return match result { |
2336 | 12.5k | Ok(string) => Ok(make_quoted_string(string)), |
2337 | 0 | Err(err) => return Err(sxd_xpath::function::Error::Other(err.to_string())), |
2338 | } | |
2339 | 12.5k | } |
2340 | ||
2341 | 5.99k | fn get_braille_nemeth_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2342 | // To greatly simplify typeface/language generation, the chars have unique ASCII chars for them: | |
2343 | // Typeface: S: sans-serif, B: bold, 𝔹: blackboard, T: script, I: italic, R: Roman | |
2344 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
2345 | // Indicators: C: capital, L: letter, N: number, P: punctuation, M: multipurpose | |
2346 | 2 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2347 | 2 | Regex::new(r"(?P<face>[SB𝔹TIR]*)(?P<lang>[EDGVHU]?)(?P<cap>C?)(?P<letter>L?)(?P<num>[N]?)(?P<char>.)").unwrap() |
2348 | 2 | }); |
2349 | 5.99k | let math_variant = node.attribute_value("mathvariant"); |
2350 | // FIX: cover all the options -- use phf::Map | |
2351 | 5.99k | let attr_typeface = match math_variant { |
2352 | 5.76k | None => "R", |
2353 | 233 | Some(variant) => match variant { |
2354 | 233 | "bold" => |
2355 | 191 | "italic" => |
2356 | 189 | "double-struck" => |
2357 | 162 | "script" => |
2358 | 157 | "fraktur" => |
2359 | 157 | "sans-serif" => |
2360 | 156 | _ => "R", // normal and unknown |
2361 | }, | |
2362 | }; | |
2363 | 5.99k | let text = BrailleChars::substring(as_text(node), &text_range); |
2364 | 5.99k | let braille_chars = braille_replace_chars(&text, node) |
2365 | // debug!("Nemeth chars: text='{}', braille_chars='{}'", &text, &braille_chars); | |
2366 | ||
2367 | // we want to pull the prefix (typeface, language) out to the front until a change happens | |
2368 | // the same is true for number indicator | |
2369 | // also true (sort of) for capitalization -- if all caps, use double cap in front (assume abbr or Roman Numeral) | |
2370 | ||
2371 | // we only care about this for numbers and identifiers/text, so we filter for only those | |
2372 | 5.99k | let node_name = name(node); |
2373 | 5.99k | let is_in_enclosed_list = node_name != "mo" && |
2374 | 5.99k | let is_mn_in_enclosed_list = is_in_enclosed_list && |
2375 | 5.99k | let mut typeface = "R".to_string(); // assumption is "R" and if attr or letter is different, something happens |
2376 | 5.99k | let mut is_all_caps = true; |
2377 | 5.99k | let mut is_all_caps_valid = false; // all_caps only valid if we did a replacement |
2378 | 7.87k | let |
2379 | // debug!(" face: {:?}, lang: {:?}, num {:?}, letter: {:?}, cap: {:?}, char: {:?}", | |
2380 | // &caps["face"], &caps["lang"], &caps["num"], &caps["letter"], &caps["cap"], &caps["char"]); | |
2381 | 7.87k | let mut nemeth_chars = "".to_string(); |
2382 | 7.87k | let char_face = if caps["face"].is_empty() { |
2383 | 7.87k | let typeface_changed = typeface != char_face; |
2384 | 7.87k | if typeface_changed { |
2385 | 86 | typeface = char_face.to_string(); // needs to outlast this instance of the loop |
2386 | 86 | nemeth_chars += &typeface; |
2387 | 86 | nemeth_chars += &caps["lang"]; |
2388 | 7.78k | } else { |
2389 | 7.78k | nemeth_chars += &caps["lang"]; |
2390 | 7.78k | } |
2391 | // debug!(" typeface changed: {}, is_in_list: {}; num: {}", typeface_changed, is_in_enclosed_list, !caps["num"].is_empty()); | |
2392 | 7.87k | if !caps["num"].is_empty() && ( |
2393 | 2.58k | nemeth_chars += "N"; |
2394 | 5.28k | } |
2395 | 7.87k | is_all_caps_valid = true; |
2396 | 7.87k | is_all_caps &= !&caps["cap"].is_empty(); |
2397 | 7.87k | nemeth_chars += &caps["cap"]; // will be stripped later if all caps |
2398 | 7.87k | if is_in_enclosed_list { |
2399 | 228 | nemeth_chars += &caps["letter"].replace('L', "l"); |
2400 | 7.64k | } else { |
2401 | 7.64k | nemeth_chars += &caps["letter"]; |
2402 | 7.64k | } |
2403 | 7.87k | nemeth_chars += &caps["char"]; |
2404 | 7.87k | return nemeth_chars; |
2405 | 7.87k | }); |
2406 | // debug!(" result: {}", &result); | |
2407 | 5.99k | let mut text_chars = text.chars(); // see if more than one char |
2408 | 5.99k | if is_all_caps_valid && |
2409 | 7 | return Ok( "CC".to_string() + &result.replace('C', "")); |
2410 | } else { | |
2411 | 5.98k | return Ok( result.to_string() ); |
2412 | } | |
2413 | 5.99k | } |
2414 | ||
2415 | 2.82k | fn get_braille_ueb_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2416 | // Because in UEB typeforms and caps may extend for multiple tokens, | |
2417 | // this routine merely deals with the mathvariant attr. | |
2418 | // Canonicalize has already transformed all chars it can to math alphanumerics, but not all have bold/italic | |
2419 | // The typeform/caps transforms to (potentially) word mode are handled later. | |
2420 | 1 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap()); |
2421 | 1 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2422 | 1 | Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap() |
2423 | 1 | }); |
2424 | ||
2425 | 2.82k | let math_variant = node.attribute_value("mathvariant"); |
2426 | 2.82k | let text = BrailleChars::substring(as_text(node), &text_range); |
2427 | 2.82k | let mut braille_chars = braille_replace_chars(&text, node) |
2428 | ||
2429 | // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars); | |
2430 | 2.82k | if math_variant.is_none() { // nothing we need to do |
2431 | 2.71k | return Ok(braille_chars); |
2432 | 108 | } |
2433 | // mathvariant could be "sans-serif-bold-italic" -- get the parts | |
2434 | 108 | let math_variant = math_variant.unwrap(); |
2435 | 108 | let italic = math_variant.contains("italic"); |
2436 | 108 | if italic & !braille_chars.contains('I') { |
2437 | 0 | braille_chars = "I".to_string() + &braille_chars; |
2438 | 108 | } |
2439 | 108 | let bold = math_variant.contains("bold"); |
2440 | 108 | if bold & !braille_chars.contains('B') { |
2441 | 0 | braille_chars = "B".to_string() + &braille_chars; |
2442 | 108 | } |
2443 | 108 | let typeface = match HAS_TYPEFACE.find(math_variant) { |
2444 | 107 | None => "", |
2445 | 1 | Some(m) => match m.as_str() { |
2446 | 1 | "double-struck" => |
2447 | 1 | "script" => |
2448 | 1 | "fraktur" => "D", |
2449 | 0 | "sans-serif" => "S", |
2450 | // don't consider monospace as a typeform | |
2451 | 0 | _ => "", |
2452 | }, | |
2453 | }; | |
2454 | 116 | let |
2455 | // debug!("captures: {:?}", caps); | |
2456 | // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}", | |
2457 | // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]); | |
2458 | 116 | if bold || |
2459 | 116 | + if italic || !caps["italic"].is_empty() { |
2460 | 116 | + if !&caps["face"].is_empty() { |
2461 | 116 | + &caps["cap"] |
2462 | 116 | + &caps["greek"] |
2463 | 116 | + &caps["char"] |
2464 | 116 | }); |
2465 | // debug!("get_braille_ueb_chars: '{}'", &result); | |
2466 | 108 | return Ok(result.to_string()) |
2467 | 2.82k | } |
2468 | ||
2469 | 3.70k | fn get_braille_cmu_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2470 | // In CMU, we need to replace spaces used for number blocks with "." | |
2471 | // For other numbers, we need to add "." to create digit blocks | |
2472 | ||
2473 | 1 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap()); |
2474 | 1 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2475 | 1 | Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap() |
2476 | 1 | }); |
2477 | ||
2478 | 3.70k | let math_variant = node.attribute_value("mathvariant"); |
2479 | 3.70k | let text = BrailleChars::substring(as_text(node), &text_range); |
2480 | 3.70k | let text = add_separator(text); |
2481 | ||
2482 | 3.70k | let braille_chars = braille_replace_chars(&text, node) |
2483 | ||
2484 | // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars); | |
2485 | 3.70k | if math_variant.is_none() { // nothing we need to do |
2486 | 3.70k | return Ok(braille_chars); |
2487 | 4 | } |
2488 | // mathvariant could be "sans-serif-bold-italic" -- get the parts | |
2489 | 4 | let math_variant = math_variant.unwrap(); |
2490 | 4 | let bold = math_variant.contains("bold"); |
2491 | 4 | let italic = math_variant.contains("italic"); |
2492 | 4 | let typeface = match HAS_TYPEFACE.find(math_variant) { |
2493 | 4 | None => "", |
2494 | 0 | Some(m) => match m.as_str() { |
2495 | 0 | "double-struck" => "𝔹", |
2496 | 0 | "script" => "T", |
2497 | 0 | "fraktur" => "D", |
2498 | 0 | "sans-serif" => "S", |
2499 | // don't consider monospace as a typeform | |
2500 | 0 | _ => "", |
2501 | }, | |
2502 | }; | |
2503 | 4 | let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| { |
2504 | // debug!("captures: {:?}", caps); | |
2505 | // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}", | |
2506 | // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]); | |
2507 | 4 | if bold || !caps["bold"].is_empty() { |
2508 | 4 | + if italic || !caps["italic"].is_empty() { |
2509 | 4 | + if !&caps["face"].is_empty() { |
2510 | 4 | + &caps["cap"] |
2511 | 4 | + &caps["greek"] |
2512 | 4 | + &caps["char"] |
2513 | 4 | }); |
2514 | 4 | return Ok(result.to_string()); |
2515 | ||
2516 | 3.70k | fn add_separator(text: String) -> String { |
2517 | use crate::definitions::BRAILLE_DEFINITIONS; | |
2518 | 3.70k | if let Some( |
2519 | // "." after arc (7.5.3) | |
2520 | 0 | let is_function_name = BRAILLE_DEFINITIONS.with(|definitions| { |
2521 | 0 | let definitions = definitions.borrow(); |
2522 | 0 | let set = definitions.get_hashset("CMUFunctionNames").unwrap(); |
2523 | 0 | return set.contains(&text); |
2524 | 0 | }); |
2525 | 0 | if is_function_name { |
2526 | 0 | return "arc.".to_string() + text_without_arc; |
2527 | 0 | } |
2528 | 3.70k | } |
2529 | 3.70k | return text; |
2530 | 3.70k | } |
2531 | 3.70k | } |
2532 | ||
2533 | 536 | fn get_braille_vietnam_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2534 | // this is basically the same as for ueb except: | |
2535 | // 1. we deal with switching '.' and ',' if in English style for numbers | |
2536 | // 2. if it is identified as a Roman Numeral, we make all but the first char lower case because they shouldn't get a cap indicator | |
2537 | // 3. double letter chemical elements should NOT be part of a cap word sequence | |
2538 | 536 | if name(node) == "mn" { |
2539 | 248 | // text of element is modified by these if needed |
2540 | 248 | lower_case_roman_numerals(node); |
2541 | 248 | switch_if_english_style_number(node); |
2542 | 288 | } |
2543 | 536 | let result = BrailleChars::get_braille_ueb_chars(node, text_range) |
2544 | 536 | return Ok(result); |
2545 | ||
2546 | 248 | fn lower_case_roman_numerals(mn_node: Element) { |
2547 | 248 | if mn_node.attribute("data-roman-numeral").is_some() { |
2548 | 2 | // if a roman numeral, all ASCII so we can optimize |
2549 | 2 | let text = as_text(mn_node); |
2550 | 2 | let mut new_text = String::from(&text[..1]); |
2551 | 2 | new_text.push_str(text[1..].to_ascii_lowercase().as_str()); // works for single char too |
2552 | 2 | mn_node.set_text(&new_text); |
2553 | 246 | } |
2554 | 248 | } |
2555 | 248 | fn switch_if_english_style_number(mn_node: Element) { |
2556 | 248 | let text = as_text(mn_node); |
2557 | 248 | let dot = text.find('.'); |
2558 | 248 | let comma = text.find(','); |
2559 | 248 | match (dot, comma) { |
2560 | 218 | (None, None) => (), |
2561 | 4 | (Some(dot), Some(comma)) => { |
2562 | 4 | if comma < dot { |
2563 | 2 | // switch dot/comma -- using "\x01" as a temp when switching the two chars |
2564 | 2 | let switched = text.replace('.', "\x01").replace(',', ".").replace('\x01', ","); |
2565 | 2 | mn_node.set_text(&switched); |
2566 | 2 | } |
2567 | }, | |
2568 | 17 | (Some(dot), None) => { |
2569 | // If it starts with a '.', a leading 0, or if there is only one '.' and not three chars after it | |
2570 | 17 | if dot==0 || |
2571 | 15 | (dot==1 && |
2572 | 13 | (text[dot+1..].find('.').is_none() && |
2573 | 5 | mn_node.set_text(&text.replace('.', ",")); |
2574 | 12 | } |
2575 | }, | |
2576 | 9 | (None, Some(comma)) => { |
2577 | // if there is more than one ",", than it can't be a decimal separator | |
2578 | 9 | if text[comma+1..].find(',').is_some() { |
2579 | 1 | mn_node.set_text(&text.replace(',', ".")); |
2580 | 8 | } |
2581 | }, | |
2582 | } | |
2583 | 248 | } |
2584 | ||
2585 | 536 | } |
2586 | ||
2587 | ||
2588 | 3.45k | fn is_in_enclosed_list(node: Element) -> bool { |
2589 | // Nemeth Rule 10 defines an enclosed list: | |
2590 | // 1: begins and ends with fence | |
2591 | // 2: FIX: not implemented -- must contain no word, abbreviation, ordinal or plural ending | |
2592 | // 3: function names or signs of shape and the signs which follow them are a single item (not a word) | |
2593 | // 4: an item of the list may be an ellipsis or any sign used for omission | |
2594 | // 5: no relational operator may appear within the list | |
2595 | // 6: the list must have at least 2 items. | |
2596 | // Items are separated by commas, can not have other punctuation (except ellipsis and dash) | |
2597 | 3.45k | let mut parent = get_parent(node); // safe since 'math' is always at root |
2598 | 7.37k | while name(parent) == "mrow" { |
2599 | 4.04k | if IsBracketed::is_bracketed(parent, "", "", true, false) { |
2600 | 388 | for child in |
2601 | 388 | if !child_meets_conditions(as_element(child)) { |
2602 | 14 | return false; |
2603 | 374 | } |
2604 | } | |
2605 | 120 | return true; |
2606 | 3.91k | } |
2607 | 3.91k | parent = get_parent(parent); |
2608 | } | |
2609 | 3.32k | return false; |
2610 | ||
2611 | 1.55k | fn child_meets_conditions(node: Element) -> bool { |
2612 | 1.55k | let name = name(node); |
2613 | 1.55k | return match name { |
2614 | 1.55k | "mi" | |
2615 | 1.07k | "mo" => |
2616 | 412 | "mtext" => { |
2617 | 9 | let text = as_text(node).trim(); |
2618 | 9 | return text=="?" || text=="-?-" || text.is_empty(); // various forms of "fill in missing content" (see also Nemeth_RULEs.yaml, "omissions") |
2619 | }, | |
2620 | 403 | "mrow" => { |
2621 | 385 | if IsBracketed::is_bracketed(node, "", "", false, false) { |
2622 | 125 | return child_meets_conditions(as_element(node.children()[1])); |
2623 | } else { | |
2624 | 1.00k | for child in |
2625 | 1.00k | if !child_meets_conditions(as_element(child)) { |
2626 | 28 | return false; |
2627 | 975 | } |
2628 | } | |
2629 | } | |
2630 | 232 | true |
2631 | }, | |
2632 | 18 | "menclose" => { |
2633 | 0 | if let Some(notation) = node.attribute_value("notation") { |
2634 | 0 | if notation != "bottom" || notation != "box" { |
2635 | 0 | return false; |
2636 | 0 | } |
2637 | 0 | let child = as_element(node.children()[0]); // menclose has exactly one child |
2638 | 0 | return is_leaf(child) && as_text(child) == "?"; |
2639 | 0 | } |
2640 | 0 | return false; |
2641 | }, | |
2642 | _ => { | |
2643 | 36 | for child in |
2644 | 36 | if !child_meets_conditions(as_element(child)) { |
2645 | 0 | return false; |
2646 | 36 | } |
2647 | } | |
2648 | 18 | true |
2649 | }, | |
2650 | } | |
2651 | 1.55k | } |
2652 | 3.45k | } |
2653 | ||
2654 | /// Extract the `char`s from `str` within `range` (these are chars, not byte offsets) | |
2655 | 12.5k | fn substring(str: &str, text_range: &Option<Range<usize>>) -> String { |
2656 | 12.5k | return match text_range { |
2657 | 9.99k | None => str.to_string(), |
2658 | 2.52k | Some(range) => str.chars().skip(range.start).take(range.end - range.start).collect(), |
2659 | } | |
2660 | 12.5k | } |
2661 | } | |
2662 | ||
2663 | impl Function for BrailleChars { | |
2664 | /** | |
2665 | * Returns a string with the correct number of nesting chars (could be an empty string) | |
2666 | * @param(node) -- current node or string | |
2667 | * @param(char) -- char (string) that should be repeated | |
2668 | * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast | |
2669 | */ | |
2670 | 12.5k | fn evaluate<'d>(&self, |
2671 | 12.5k | context: &context::Evaluation<'_, 'd>, |
2672 | 12.5k | args: Vec<Value<'d>>) |
2673 | 12.5k | -> StdResult<Value<'d>, XPathError> |
2674 | { | |
2675 | use crate::canonicalize::create_mathml_element; | |
2676 | 12.5k | let mut args = Args(args); |
2677 | 12.5k | if let Err( |
2678 | 0 | return Err( XPathError::Other(format!("BrailleChars requires 2 or 4 args: {e}"))); |
2679 | 12.5k | }; |
2680 | ||
2681 | 12.5k | let range = if args.len() == 4 { |
2682 | 2.52k | let end = args.pop_number() |
2683 | 2.52k | let start = args.pop_number() |
2684 | 2.52k | Some(start..end) |
2685 | } else { | |
2686 | 9.99k | None |
2687 | }; | |
2688 | 12.5k | let braille_code = args.pop_string() |
2689 | 12.5k | let v: Value<'_> = args.0.pop().ok_or(XPathError::ArgumentMissing) |
2690 | 12.5k | let node = match v { |
2691 | 11.8k | Value::Nodeset(nodes) => { |
2692 | 11.8k | validate_one_node(nodes, "BrailleChars") |
2693 | }, | |
2694 | 2 | Value::Number(n) => { |
2695 | 2 | let new_node = create_mathml_element(&context.node.document(), "mn"); |
2696 | 2 | new_node.set_text(&n.to_string()); |
2697 | 2 | new_node |
2698 | }, | |
2699 | 681 | Value::String(s) => { |
2700 | 681 | let new_node = create_mathml_element(&context.node.document(), "mi"); // FIX: try to guess mi vs mo??? |
2701 | 681 | new_node.set_text(&s); |
2702 | 681 | new_node |
2703 | }, | |
2704 | _ => { | |
2705 | 0 | return Ok( Value::String("".to_string()) ) // not an element, so nothing to do |
2706 | }, | |
2707 | }; | |
2708 | ||
2709 | 12.5k | if !is_leaf(node) { |
2710 | 0 | return Err( XPathError::Other(format!("BrailleChars called on non-leaf element '{}'", mml_to_string(node))) ); |
2711 | 12.5k | } |
2712 | 12.5k | return Ok( Value::String( BrailleChars::get_braille_chars(node, &braille_code, range) |
2713 | 12.5k | } |
2714 | } | |
2715 | ||
2716 | pub struct NeedsToBeGrouped; | |
2717 | impl NeedsToBeGrouped { | |
2718 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
2719 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
2720 | 805 | fn needs_grouping_for_cmu(element: Element, _is_base: bool) -> bool { |
2721 | 805 | let node_name = name(element); |
2722 | 805 | let children = element.children(); |
2723 | 805 | if node_name == "mrow" { |
2724 | // check for bracketed exprs | |
2725 | 544 | if IsBracketed::is_bracketed(element, "", "", false, true) { |
2726 | 0 | return false; |
2727 | 544 | } |
2728 | ||
2729 | // check for prefix and postfix ops at start or end (=> len()==2, prefix is first op, postfix is last op) | |
2730 | 544 | if children.len() == 2 && |
2731 | 9 | (name(as_element(children[0])) == "mo" || |
2732 | 7 | return false; |
2733 | 537 | } |
2734 | ||
2735 | 537 | if children.len() != 3 { // ==3, need to check if it a linear fraction |
2736 | 4 | return true; |
2737 | 533 | } |
2738 | 533 | let operator = as_element(children[1]); |
2739 | 533 | if name(operator) != "mo" || as_text(operator) != "/" { |
2740 | 532 | return true; |
2741 | 1 | } |
2742 | 261 | } |
2743 | ||
2744 | 262 | if !(node_name == "mrow" || |
2745 | 258 | return false; |
2746 | 4 | } |
2747 | // check for numeric fractions (regular fractions need brackets, not numeric fractions), either as an mfrac or with "/" | |
2748 | // if the fraction starts with a "-", it is still a numeric fraction that doesn't need parens | |
2749 | 4 | let mut numerator = as_element(children[0]); |
2750 | 4 | let denominator = as_element(children[children.len()-1]); |
2751 | 4 | let decimal_separator = crate::interface::get_preference("DecimalSeparators").unwrap() |
2752 | 4 | .chars().next().unwrap_or('.'); |
2753 | 4 | if is_integer(denominator, decimal_separator) { |
2754 | // check numerator being either an integer "- integer" | |
2755 | 2 | if name(numerator) == "mrow" { |
2756 | 1 | let numerator_children = numerator.children(); |
2757 | 1 | if !(numerator_children.len() == 2 && |
2758 | 1 | name(as_element(numerator_children[0])) == "mo" && |
2759 | 1 | as_text(as_element(numerator_children[0])) == "-") { |
2760 | 0 | return true; |
2761 | 1 | } |
2762 | 1 | numerator = as_element(numerator_children[1]); |
2763 | 1 | } |
2764 | 2 | return !is_integer(numerator, decimal_separator); |
2765 | 2 | } |
2766 | 2 | return true; |
2767 | ||
2768 | 6 | fn is_integer(mathml: Element, decimal_separator: char) -> bool { |
2769 | 6 | return name(mathml) == "mn" && |
2770 | 6 | } |
2771 | 805 | } |
2772 | ||
2773 | /// FIX: what needs to be implemented? | |
2774 | 0 | fn needs_grouping_for_finnish(mathml: Element, is_base: bool) -> bool { |
2775 | use crate::xpath_functions::IsInDefinition; | |
2776 | 0 | let mut node_name = name(mathml); |
2777 | 0 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2778 | 0 | node_name = "mi"; // roman numerals don't follow number rules |
2779 | 0 | } |
2780 | ||
2781 | // FIX: the leaf rules are from UEB -- check the Swedish rules | |
2782 | 0 | match node_name { |
2783 | 0 | "mn" => { |
2784 | 0 | if !is_base { |
2785 | 0 | return false; |
2786 | 0 | } // clause 1 |
2787 | // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204) | |
2788 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2789 | 0 | let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)}; |
2790 | 0 | if name(grandparent) != "mrow" { |
2791 | 0 | return false; |
2792 | 0 | } |
2793 | 0 | let preceding = parent.preceding_siblings(); |
2794 | 0 | if preceding.len() < 2 { |
2795 | 0 | return false; |
2796 | 0 | } |
2797 | // any 'mn' would be separated from this node by invisible times | |
2798 | 0 | let previous_child = as_element(preceding[preceding.len()-1]); |
2799 | 0 | if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" { |
2800 | 0 | let previous_child = as_element(preceding[preceding.len()-2]); |
2801 | 0 | return name(previous_child) == "mn" |
2802 | } else { | |
2803 | 0 | return false; |
2804 | } | |
2805 | }, | |
2806 | 0 | "mi" | "mo" | "mtext" => { |
2807 | 0 | let text = as_text(mathml); |
2808 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2809 | 0 | let parent_name = name(parent); // there is always a "math" node |
2810 | 0 | if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) { |
2811 | 0 | return false; |
2812 | 0 | } |
2813 | 0 | let mut chars = text.chars(); |
2814 | 0 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2815 | 0 | let is_one_char = chars.next().is_none(); |
2816 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2817 | 0 | return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8 |
2818 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2819 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2820 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2821 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2822 | }, | |
2823 | 0 | "mrow" => { |
2824 | // check for bracketed exprs | |
2825 | 0 | if IsBracketed::is_bracketed(mathml, "", "", false, true) { |
2826 | 0 | return false; |
2827 | 0 | } |
2828 | ||
2829 | 0 | let parent = get_parent(mathml); // safe since 'math' is always at root |
2830 | 0 | if name(parent) == "mfrac" { |
2831 | 0 | let children = mathml.children(); |
2832 | 0 | if mathml.preceding_siblings().is_empty() { |
2833 | // numerator: check for multiplication -- doesn't need grouping in numerator | |
2834 | 0 | if children.len() >= 3 { |
2835 | 0 | let operator = as_element(children[1]); |
2836 | 0 | if name(operator) == "mo" { |
2837 | 0 | let ch = as_text(operator); |
2838 | 0 | if ch == "\u{2062}" || ch == "⋅" || ch == "×" { |
2839 | 0 | return false; |
2840 | 0 | } |
2841 | 0 | } |
2842 | 0 | } |
2843 | 0 | return true; |
2844 | } else { | |
2845 | // denominator | |
2846 | 0 | return true; |
2847 | } | |
2848 | ||
2849 | 0 | } |
2850 | // check for prefix at start | |
2851 | // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops | |
2852 | 0 | let children = mathml.children(); |
2853 | 0 | if children.len() == 2 && |
2854 | 0 | (name(as_element(children[0])) == "mo") { |
2855 | 0 | return false; |
2856 | 0 | } |
2857 | 0 | return true; |
2858 | }, | |
2859 | 0 | _ => return false, |
2860 | } | |
2861 | 0 | } |
2862 | ||
2863 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
2864 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
2865 | 0 | fn needs_grouping_for_swedish(mathml: Element, is_base: bool) -> bool { |
2866 | use crate::xpath_functions::IsInDefinition; | |
2867 | 0 | let mut node_name = name(mathml); |
2868 | 0 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2869 | 0 | node_name = "mi"; // roman numerals don't follow number rules |
2870 | 0 | } |
2871 | ||
2872 | 0 | match node_name { |
2873 | 0 | "mn" => return false, |
2874 | 0 | "mi" | "mo" | "mtext" => { |
2875 | 0 | let text = as_text(mathml); |
2876 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2877 | 0 | let parent_name = name(parent); // there is always a "math" node |
2878 | 0 | if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) { |
2879 | 0 | return false; |
2880 | 0 | } |
2881 | 0 | let mut chars = text.chars(); |
2882 | 0 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2883 | 0 | let is_one_char = chars.next().is_none(); |
2884 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2885 | 0 | return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8 |
2886 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2887 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2888 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2889 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2890 | }, | |
2891 | 0 | "mrow" => { |
2892 | // check for bracketed exprs | |
2893 | 0 | if IsBracketed::is_bracketed(mathml, "", "", false, true) { |
2894 | 0 | return false; |
2895 | 0 | } |
2896 | ||
2897 | // check for prefix at start | |
2898 | // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops | |
2899 | 0 | let children = mathml.children(); |
2900 | 0 | if children.len() == 2 && |
2901 | 0 | (name(as_element(children[0])) == "mo") { |
2902 | 0 | return false; |
2903 | 0 | } |
2904 | 0 | return true; |
2905 | }, | |
2906 | 0 | "mfrac" => { |
2907 | // exclude simple fractions -- they are not bracketed with start/end marks | |
2908 | 0 | let children = mathml.children(); |
2909 | 0 | return !(NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true) || |
2910 | 0 | NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true)); |
2911 | }, | |
2912 | // At least for msup (Ex 7.7, and 7.32 and maybe more), spec seems to feel grouping is not needed. | |
2913 | // "msub" | "msup" | "msubsup" | "munder" | "mover" | "munderover" => return true, | |
2914 | 0 | "mtable" => return true, // Fix: should check for trivial cases that don't need grouping |
2915 | 0 | _ => return false, |
2916 | } | |
2917 | 0 | } |
2918 | ||
2919 | /// Returns true if the element needs grouping symbols | |
2920 | /// Bases need extra attention because if they are a number and the item to the left is one, that needs distinguishing | |
2921 | 538 | fn needs_grouping_for_ueb(mathml: Element, is_base: bool) -> bool { |
2922 | // From GTM 7.1 | |
2923 | // 1. An entire number, i.e. the initiating numeric symbol and all succeeding symbols within the numeric mode thus | |
2924 | // established (which would include any interior decimal points, commas, separator spaces, or simple numeric fraction lines). | |
2925 | // 2. An entire general fraction, enclosed in fraction indicators. | |
2926 | // 3. An entire radical expression, enclosed in radical indicators. | |
2927 | // 4. An arrow. | |
2928 | // 5. An arbitrary shape. | |
2929 | // 6. Any expression enclosed in matching pairs of round parentheses, square brackets or curly braces. | |
2930 | // 7. Any expression enclosed in the braille grouping indicators. [Note: not possible here] | |
2931 | // 8. If none of the foregoing apply, the item is simply the [this element's] individual symbol. | |
2932 | ||
2933 | use crate::xpath_functions::IsInDefinition; | |
2934 | 538 | let mut node_name = name(mathml); |
2935 | 538 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2936 | 1 | node_name = "mi"; // roman numerals don't follow number rules |
2937 | 537 | } |
2938 | 538 | match node_name { |
2939 | 538 | "mn" => { |
2940 | 250 | if !is_base { |
2941 | 233 | return false; |
2942 | 17 | } // clause 1 |
2943 | // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204) | |
2944 | 17 | let parent = get_parent(mathml); // there is always a "math" node |
2945 | 17 | let grandparent = if name(parent) == "math" { |
2946 | 17 | if name(grandparent) != "mrow" { |
2947 | 2 | return false; |
2948 | 15 | } |
2949 | 15 | let preceding = parent.preceding_siblings(); |
2950 | 15 | if preceding.len() < 2 { |
2951 | 6 | return false; |
2952 | 9 | } |
2953 | // any 'mn' would be separated from this node by invisible times | |
2954 | 9 | let previous_child = as_element(preceding[preceding.len()-1]); |
2955 | 9 | if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" { |
2956 | 6 | let previous_child = as_element(preceding[preceding.len()-2]); |
2957 | 6 | return name(previous_child) == "mn" |
2958 | } else { | |
2959 | 3 | return false; |
2960 | } | |
2961 | }, | |
2962 | 288 | "mi" | |
2963 | 258 | let text = as_text(mathml); |
2964 | 258 | let parent = get_parent(mathml); // there is always a "math" node |
2965 | 258 | let parent_name = name(parent); // there is always a "math" node |
2966 | 258 | if is_base && ( |
2967 | 224 | return false; |
2968 | 34 | } |
2969 | 34 | let mut chars = text.chars(); |
2970 | 34 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2971 | 34 | let is_one_char = chars.next().is_none(); |
2972 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2973 | 34 | return !((is_one_char && |
2974 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2975 | 4 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2976 | 3 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2977 | 3 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2978 | }, | |
2979 | 30 | "mfrac" => return |
2980 | 28 | "msqrt" | "mroot" => return |
2981 | // clause 6 only mentions three grouping chars, I'm a little suspicious of that, but that's what it says | |
2982 | 28 | "mrow" => return !( |
2983 | 16 | IsBracketed::is_bracketed(mathml, "[", "]", false, false) || |
2984 | 15 | IsBracketed::is_bracketed(mathml, "{", "}", false, false) ), |
2985 | 6 | "msub" | |
2986 | // I'm a little dubious about the false value, but see GTM 7.7(2) | |
2987 | 5 | if !is_base { |
2988 | 3 | return true; |
2989 | 2 | } |
2990 | // need to group nested scripts in base -- see GTM 12.2(2) | |
2991 | 2 | let parent = get_parent(mathml); // there is always a "math" node |
2992 | 2 | let parent_name = name(parent); // there is always a "math" node |
2993 | 2 | return parent_name == "munder" || parent_name == "mover" || |
2994 | }, | |
2995 | 1 | _ => return true, |
2996 | } | |
2997 | ||
2998 | 538 | } |
2999 | } | |
3000 | ||
3001 | impl Function for NeedsToBeGrouped { | |
3002 | // convert a node to an ordinal number | |
3003 | 1.34k | fn evaluate<'d>(&self, |
3004 | 1.34k | _context: &context::Evaluation<'_, 'd>, |
3005 | 1.34k | args: Vec<Value<'d>>) |
3006 | 1.34k | -> StdResult<Value<'d>, XPathError> |
3007 | { | |
3008 | 1.34k | let mut args = Args(args); |
3009 | 1.34k | args.exactly(3) |
3010 | 1.34k | let is_base = args.pop_boolean() |
3011 | 1.34k | let braille_code = args.pop_string() |
3012 | 1.34k | let node = validate_one_node(args.pop_nodeset() |
3013 | 1.34k | if let Node::Element(e) = node { |
3014 | 1.34k | let answer = match braille_code.as_str() { |
3015 | 1.34k | "CMU" => |
3016 | 538 | "UEB" => NeedsToBeGrouped::needs_grouping_for_ueb(e, is_base), |
3017 | 0 | "Finnish" => NeedsToBeGrouped::needs_grouping_for_finnish(e, is_base), |
3018 | 0 | "Swedish" => NeedsToBeGrouped::needs_grouping_for_swedish(e, is_base), |
3019 | 0 | _ => return Err(XPathError::Other(format!("NeedsToBeGrouped: braille code arg '{braille_code:?}' is not a known code ('UEB', 'CMU', or 'Swedish')"))), |
3020 | }; | |
3021 | 1.34k | return Ok( Value::Boolean( answer ) ); |
3022 | 0 | } |
3023 | ||
3024 | 0 | return Err(XPathError::Other(format!("NeedsToBeGrouped: first arg '{node:?}' is not a node"))); |
3025 | 1.34k | } |
3026 | } | |
3027 | ||
3028 | ||
3029 | ||
3030 | #[cfg(test)] | |
3031 | mod tests { | |
3032 | use super::*; | |
3033 | #[allow(unused_imports)] | |
3034 | use crate::init_logger; | |
3035 | use crate::interface::*; | |
3036 | use log::debug; | |
3037 | ||
3038 | #[test] | |
3039 | 1 | fn ueb_highlight_24() -> Result<()> { // issue 24 |
3040 | 1 | let mathml_str = "<math display='block' id='id-0'> |
3041 | 1 | <mrow id='id-1'> |
3042 | 1 | <mn id='id-2'>4</mn> |
3043 | 1 | <mo id='id-3'>⁢</mo> |
3044 | 1 | <mi id='id-4'>a</mi> |
3045 | 1 | <mo id='id-5'>⁢</mo> |
3046 | 1 | <mi id='id-6'>c</mi> |
3047 | 1 | </mrow> |
3048 | 1 | </math>"; |
3049 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3050 | 1 | set_mathml(mathml_str).unwrap(); |
3051 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3052 | 1 | set_preference("BrailleNavHighlight", "All").unwrap(); |
3053 | 1 | let braille = get_braille("id-2") |
3054 | 1 | assert_eq!("⣼⣙⠰⠁⠉", braille); |
3055 | 1 | set_navigation_node("id-2", 0) |
3056 | 1 | assert_eq!( get_braille_position() |
3057 | ||
3058 | 1 | let braille = get_braille("id-4") |
3059 | 1 | assert_eq!("⠼⠙⣰⣁⠉", braille); |
3060 | 1 | set_navigation_node("id-4", 0) |
3061 | 1 | assert_eq!( get_braille_position() |
3062 | 1 | return Ok( () ); |
3063 | 1 | } |
3064 | ||
3065 | #[test] | |
3066 | // This test probably should be repeated for each braille code and be taken out of here | |
3067 | 1 | fn find_mathml_from_braille() -> Result<()> { |
3068 | use std::time::Instant; | |
3069 | 1 | let mathml_str = "<math id='id-0'> |
3070 | 1 | <mrow data-changed='added' id='id-1'> |
3071 | 1 | <mi id='id-2'>x</mi> |
3072 | 1 | <mo id='id-3'>=</mo> |
3073 | 1 | <mfrac id='id-4'> |
3074 | 1 | <mrow id='id-5'> |
3075 | 1 | <mrow data-changed='added' id='id-6'> |
3076 | 1 | <mo id='id-7'>-</mo> |
3077 | 1 | <mi id='id-8'>b</mi> |
3078 | 1 | </mrow> |
3079 | 1 | <mo id='id-9'>±</mo> |
3080 | 1 | <msqrt id='id-10'> |
3081 | 1 | <mrow data-changed='added' id='id-11'> |
3082 | 1 | <msup id='id-12'> |
3083 | 1 | <mi id='id-13'>b</mi> |
3084 | 1 | <mn id='id-14'>2</mn> |
3085 | 1 | </msup> |
3086 | 1 | <mo id='id-15'>-</mo> |
3087 | 1 | <mrow data-changed='added' id='id-16'> |
3088 | 1 | <mn id='id-17'>4</mn> |
3089 | 1 | <mo data-changed='added' id='id-18'>⁢</mo> |
3090 | 1 | <mi id='id-19'>a</mi> |
3091 | 1 | <mo data-changed='added' id='id-20'>⁢</mo> |
3092 | 1 | <mi id='id-21'>c</mi> |
3093 | 1 | </mrow> |
3094 | 1 | </mrow> |
3095 | 1 | </msqrt> |
3096 | 1 | </mrow> |
3097 | 1 | <mrow id='id-22'> |
3098 | 1 | <mn id='id-23'>2</mn> |
3099 | 1 | <mo data-changed='added' id='id-24'>⁢</mo> |
3100 | 1 | <mi id='id-25'>a</mi> |
3101 | 1 | </mrow> |
3102 | 1 | </mfrac> |
3103 | 1 | </mrow> |
3104 | 1 | </math>"; |
3105 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3106 | 1 | set_mathml(mathml_str).unwrap(); |
3107 | 1 | set_preference("BrailleNavHighlight", "Off").unwrap(); |
3108 | ||
3109 | 1 | set_preference("BrailleCode", "Nemeth").unwrap(); |
3110 | 1 | let _braille = get_braille("") |
3111 | 1 | let answers= &[2, 3, 3, 3, 3, 4, 7, 8, 9, 9, 10, 13, 12, 14, 12, 15, 17, 19, 21, 10, 4, 23, 25, 4]; |
3112 | 24 | let |
3113 | 1 | debug!("\n*** Testing Nemeth ***"); |
3114 | 24 | for (i, answer) in |
3115 | 24 | debug!("\n=== i={} ===", i); |
3116 | 24 | let instant = Instant::now(); |
3117 | 24 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3118 | 24 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3119 | 24 | debug!("Time taken: {}ms", |
3120 | 24 | assert_eq!(*answer, id, "\nNemeth test ith position={}", i); |
3121 | } | |
3122 | ||
3123 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3124 | 1 | let _braille = get_braille("") |
3125 | 1 | let answers= &[0, 0, 0, 2, 3, 3, 3, 3, 4, 7, 7, 8, 9, 9, 10, 13, 12, 14, 14, 15, 15, 17, 17, 19, 19, 21, 10, 4, 4, 23, 23, 25, 25, 4, 0, 0]; |
3126 | 36 | let |
3127 | 1 | debug!("\n\n*** Testing UEB ***"); |
3128 | 36 | for (i, answer) in |
3129 | 36 | debug!("\n=== i={} ===", i); |
3130 | 36 | let instant = Instant::now(); |
3131 | 36 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3132 | 36 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3133 | 36 | debug!("Time taken: {}ms", |
3134 | 36 | assert_eq!(*answer, id, "\nUEB test ith position={}", i); |
3135 | } | |
3136 | 1 | set_preference("BrailleCode", "CMU").unwrap(); |
3137 | 1 | let braille = get_braille("") |
3138 | 1 | let answers= &[2, 3, 5, 7, 8, 9, 9, 9, 10, 10, 11, 13, 12, 14, 14, 15, 17, 17, 19, 19, 21, 11, 5, 4, 22, 23, 23, 25, 25, 22,]; |
3139 | 30 | let |
3140 | 1 | debug!("\n\n*** Testing CMU ***"); |
3141 | 1 | debug!("Braille: {}", braille); |
3142 | 30 | for (i, answer) in |
3143 | 30 | debug!("\n=== i={} ===", i); |
3144 | 30 | let instant = Instant::now(); |
3145 | 30 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3146 | 30 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3147 | 30 | debug!("Time taken: {}ms", |
3148 | 30 | assert_eq!(*answer, id, "\nCMU test ith position={}", i); |
3149 | } | |
3150 | 1 | return Ok( () ); |
3151 | 1 | } |
3152 | ||
3153 | #[test] | |
3154 | #[allow(non_snake_case)] | |
3155 | 1 | fn test_UEB_start_mode() -> Result<()> { |
3156 | 1 | let mathml_str = "<math><msup><mi>x</mi><mi>n</mi></msup></math>"; |
3157 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3158 | 1 | set_mathml(mathml_str).unwrap(); |
3159 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3160 | 1 | set_preference("UEB_START_MODE", "Grade2").unwrap(); |
3161 | 1 | let braille = get_braille("") |
3162 | 1 | assert_eq!("⠭⠰⠔⠝", braille, "Grade2"); |
3163 | 1 | set_preference("UEB_START_MODE", "Grade1").unwrap(); |
3164 | 1 | let braille = get_braille("") |
3165 | 1 | assert_eq!("⠭⠔⠝", braille, "Grade1"); |
3166 | 1 | return Ok( () ); |
3167 | 1 | } |
3168 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | use strum_macros::Display; | |
3 | use sxd_document::dom::{Element, ChildOfElement}; | |
4 | use sxd_document::Package; | |
5 | use crate::definitions::SPEECH_DEFINITIONS; | |
6 | use crate::errors::*; | |
7 | use crate::pretty_print::mml_to_string; | |
8 | use crate::prefs::PreferenceManager; | |
9 | use std::cell::Ref; | |
10 | use regex::{Captures, Regex, RegexSet}; | |
11 | use phf::{phf_map, phf_set}; | |
12 | use crate::speech::{BRAILLE_RULES, SpeechRulesWithContext, braille_replace_chars, make_quoted_string}; | |
13 | use crate::canonicalize::get_parent; | |
14 | use std::borrow::Cow; | |
15 | use std::ops::Range; | |
16 | use std::sync::LazyLock; | |
17 | use log::{debug, error}; | |
18 | ||
19 | 320 | fn is_ueb_prefix(ch: char) -> bool { |
20 | 320 |
|
21 | 320 | } |
22 | ||
23 | /// Returns the braille *char* at the given position in the braille string. | |
24 | 971 | fn braille_at(braille: &str, index: usize) -> char { |
25 | // braille is always 3 bytes per char | |
26 | 971 | return braille[index..index+3].chars().next().unwrap(); |
27 | ||
28 | 971 | } |
29 | ||
30 | /// braille the MathML | |
31 | /// If 'nav_node_id' is not an empty string, then the element with that id will have dots 7 & 8 turned on as per the pref | |
32 | /// Returns the braille string (highlighted) along with the *character* start/end of the highlight (whole string if no highlight) | |
33 | 1.82k | pub fn braille_mathml(mathml: Element, nav_node_id: &str) -> Result<(String, usize, usize)> { |
34 | 1.82k | return BRAILLE_RULES.with(|rules| { |
35 | 1.82k | rules.borrow_mut().read_files() |
36 | 1.82k | let rules = rules.borrow(); |
37 | 1.82k | let new_package = Package::new(); |
38 | 1.82k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id, 0); |
39 | 1.82k | let braille_string = rules_with_context.match_pattern::<String>(mathml) |
40 | 1.82k | .context("Pattern match/replacement failure!") |
41 | // debug!("braille_mathml: braille string: {}", &braille_string); | |
42 | 1.82k | let braille_string = braille_string.replace(' ', ""); |
43 | 1.82k | let pref_manager = rules_with_context.get_rules().pref_manager.borrow(); |
44 | 1.82k | let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight"); |
45 | 1.82k | let braille_code = pref_manager.pref_to_string("BrailleCode"); |
46 | 1.82k | let braille = match braille_code.as_str() { |
47 | 1.82k | "Nemeth" => |
48 | 941 | "UEB" => |
49 | 575 | "Vietnam" => |
50 | 463 | "CMU" => |
51 | 91 | "Finnish" => |
52 | 91 | "Swedish" => |
53 | 91 | "LaTeX" => |
54 | 41 | "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string), |
55 | 0 | "ASCIIMath-fi" => ASCIIMath_cleanup(pref_manager, braille_string), |
56 | 0 | _ => braille_string.trim_matches('⠀').to_string(), // probably needs cleanup if someone has another code, but this will have to get added by hand |
57 | }; | |
58 | ||
59 | return Ok( | |
60 | 1.82k | if highlight_style != "Off" { |
61 | 520 | highlight_braille_chars(braille, &braille_code, highlight_style == "All") |
62 | } else { | |
63 | 1.30k | let end = braille.len()/3; |
64 | 1.30k | (braille, 0, end) |
65 | } | |
66 | ); | |
67 | 1.82k | }); |
68 | ||
69 | /// highlight with dots 7 & 8 based on the highlight style | |
70 | /// both the start and stop points will be extended to deal with indicators such as capitalization | |
71 | /// if 'fill_range' is true, the interior will be highlighted | |
72 | /// Returns the braille string (highlighted) along with the [start, end) *character* of the highlight (whole string if no highlight) | |
73 | 520 | fn highlight_braille_chars(braille: String, braille_code: &str, fill_range: bool) -> (String, usize, usize) { |
74 | 520 | let mut braille = braille; |
75 | // some special (non-braille) chars weren't converted to having dots 7 & 8 to indicate navigation position | |
76 | // they need to be added to the start | |
77 | ||
78 | // find start and end (byte) indexes of the highlighted region (braille chars have length=3 bytes) | |
79 | 520 | let start = braille.find(is_highlighted); |
80 | 520 | let end = braille.rfind(is_highlighted); |
81 | 520 | if start.is_none() { |
82 | 57 | assert!(end.is_none()); |
83 | 57 | let end = braille.len(); |
84 | 57 | return (braille, 0, end/3); |
85 | 463 | }; |
86 | ||
87 | 463 | let start = start.unwrap(); |
88 | 463 | let mut end = end.unwrap() + 3; // always exists if start exists ('end' is exclusive) |
89 | // debug!("braille highlight: start/end={}/{}; braille={}", start/3, end/3, braille); | |
90 | 463 | let mut start = highlight_first_indicator(&mut braille, braille_code, start, end); |
91 | 463 | if let Some( |
92 | 45 | (start, end) = new_range |
93 | 418 | } |
94 | ||
95 | 463 | if start == end { |
96 | 0 | return (braille, start/3, end/3); |
97 | 463 | } |
98 | ||
99 | 463 | if !fill_range { |
100 | 459 | return (braille, start/3, end/3); |
101 | 4 | } |
102 | ||
103 | 4 | let mut result = String::with_capacity(braille.len()); |
104 | 4 | result.push_str(&braille[..start]); |
105 | 4 | let highlight_region =&mut braille[start..end]; |
106 | 8 | for ch in |
107 | 8 | result.push( highlight(ch) ); |
108 | 8 | }; |
109 | 4 | result.push_str(&braille[end..]); |
110 | 4 | return (result, start/3, end/3); |
111 | ||
112 | /// Return the byte index of the first place to highlight | |
113 | 463 | fn highlight_first_indicator(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> usize { |
114 | // chars in the braille block range use 3 bytes -- we can use that to optimize the code some | |
115 | 463 | let first_ch = unhighlight(braille_at(braille, start_index)); |
116 | ||
117 | // need to highlight (optional) capital/number, language, and style (max 2 chars) also in that (rev) order | |
118 | 463 | let mut prefix_ch_index = std::cmp::max(0, start_index as isize - 5*3) as usize; |
119 | 463 | if prefix_ch_index == 0 && |
120 | // don't count the word or passage mode as part of a indicator | |
121 | 46 | if braille.starts_with("⠰⠰⠰") { |
122 | 42 | prefix_ch_index = 9; |
123 | 42 | } else if |
124 | 0 | prefix_ch_index = 6; |
125 | 4 | } |
126 | 417 | } |
127 | 463 | let indicators = &braille[prefix_ch_index..start_index]; // chars to be examined |
128 | 463 | let i_byte_start = start_index - 3 * match braille_code { |
129 | 463 | "Nemeth" => |
130 | 334 | _ => i_start_ueb(indicators), // treat all the other like UEB because they probably have similar number and letter prefixes |
131 | }; | |
132 | 463 | if i_byte_start < start_index { |
133 | // remove old highlight as long as we don't wipe out the end highlight | |
134 | 59 | if start_index < end_index { |
135 | 59 | let old_first_char_bytes = start_index..start_index+3; |
136 | 59 | let replacement_str = unhighlight(braille_at(braille, start_index)).to_string(); |
137 | 59 | braille.replace_range(old_first_char_bytes, &replacement_str); |
138 | 59 |
|
139 | ||
140 | // add new highlight | |
141 | 59 | let new_first_char_bytes = i_byte_start..i_byte_start+3; |
142 | 59 | let replacement_str = highlight(braille_at(braille, i_byte_start)).to_string(); |
143 | 59 | braille.replace_range(new_first_char_bytes, &replacement_str); |
144 | 404 | } |
145 | ||
146 | 463 | return i_byte_start; |
147 | 463 | } |
148 | ||
149 | /// Return the byte indexes of the first and last place to highlight | |
150 | /// Currently, this only does something for CMU braille | |
151 | 463 | fn expand_highlight(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> Option<(usize, usize)> { |
152 | // For CMU, we want to expand mrows to include the opening and closing grouping indicators if they exist | |
153 | 463 | if start_index == 0 || |
154 | 358 | return None; |
155 | 105 | } |
156 | ||
157 | 105 | let first_ch = unhighlight(braille_at(braille, start_index)); |
158 | 105 | let last_ch = unhighlight(braille_at(braille, end_index-3)); |
159 | // We need to be careful not to expand the selection if we are already on a grouping indicator | |
160 | 105 | if first_ch == '⠢' && |
161 | 0 | return None; |
162 | 105 | } |
163 | 105 | let preceding_ch = braille_at(braille, start_index-3); |
164 | 105 | if preceding_ch != '⠢' { |
165 | 43 | return None; |
166 | 62 | } |
167 | ||
168 | 62 | let following_ch = braille_at(braille, end_index); |
169 | 62 | if following_ch != '⠔' { |
170 | 17 | return None; |
171 | 45 | } |
172 | ||
173 | 45 | let preceding_ch = highlight(preceding_ch); |
174 | 45 | braille.replace_range(start_index-3..start_index+3, format!("{preceding_ch}{first_ch}").as_str()); |
175 | 45 | let following_ch = highlight(following_ch); |
176 | 45 | braille.replace_range(end_index-3..end_index+3, format!("{last_ch}{following_ch}").as_str()); |
177 | 45 | return Some( (start_index-3, end_index + 3) ); |
178 | 463 | } |
179 | 520 | } |
180 | ||
181 | /// Given a position in a Nemeth string, what is the position character that starts it (e.g, the prev char for capital letter) | |
182 | 129 | fn i_start_nemeth(braille_prefix: &str, first_ch: char) -> usize { |
183 | 0 | fn is_nemeth_number(ch: char) -> bool { |
184 | 0 | matches!(ch, '⠂' | '⠆' | '⠒' | '⠲' | '⠢' | '⠖' | '⠶' | '⠦' | '⠔' | '⠴' | '⠨') |
185 | 0 | } |
186 | 129 | let mut n_chars = 0; |
187 | 129 | let prefix = &mut braille_prefix.chars().rev().peekable(); |
188 | 129 | if prefix.peek() == Some(&'⠠') || // cap indicator |
189 | 129 | (prefix.peek() == Some(&'⠼') && |
190 | 129 | [Some(&'⠸'), Some(&'⠈'), Some(&'⠨')].contains(&prefix.peek()) { // bold, script/blackboard, italic indicator |
191 | 1 | n_chars += 1; |
192 | 1 | prefix.next(); |
193 | 128 | } |
194 | ||
195 | 129 | if [Some(&'⠰'), Some(&'⠸'), Some(&'⠨')].contains(&prefix.peek()) { // English, German, Greek |
196 | 0 | n_chars += 1; |
197 | 129 | } else if prefix.peek() == Some(&'⠈') { |
198 | 0 | let ch = prefix.next(); // Russian/Greek Variant |
199 | 0 | if ch == Some('⠈') || ch == Some('⠨') { |
200 | 0 | n_chars += 2; |
201 | 0 | } |
202 | 129 | } else if prefix.peek() == Some(&'⠠') { // Hebrew |
203 | 0 | let ch = prefix.next(); // Russian/Greek Variant |
204 | 0 | if ch == Some('⠠') { |
205 | 0 | n_chars += 2; |
206 | 0 | } |
207 | 129 | }; |
208 | 129 | return n_chars; |
209 | 129 | } |
210 | ||
211 | /// Given a position in a UEB string, what is the position character that starts it (e.g, the prev char for capital letter) | |
212 | 334 | fn i_start_ueb(braille_prefix: &str) -> usize { |
213 | 334 | let prefix = &mut braille_prefix.chars().rev().peekable(); |
214 | 334 | let mut n_chars = 0; |
215 | 392 | while let Some( |
216 | 320 | if is_ueb_prefix(ch) { |
217 | 58 | n_chars += 1; |
218 | 262 | } else if ch == '⠆' { |
219 | 0 | let n_typeform_chars = check_for_typeform(prefix); |
220 | 0 | if n_typeform_chars > 0 { |
221 | 0 | n_chars += n_typeform_chars; |
222 | 0 | } else { |
223 | 0 | break; |
224 | } | |
225 | } else { | |
226 | 262 | break; |
227 | } | |
228 | } | |
229 | 334 | return n_chars; |
230 | 334 | } |
231 | ||
232 | ||
233 | 0 | fn check_for_typeform(prefix: &mut dyn std::iter::Iterator<Item=char>) -> usize { |
234 | 0 | fn is_ueb_typeform_prefix(ch: char) -> bool { |
235 | 0 | matches!(ch, '⠈' | '⠘' | '⠸' | '⠨') |
236 | 0 | } |
237 | ||
238 | 0 | if let Some(typeform_indicator) = prefix.next() { |
239 | 0 | if is_ueb_typeform_prefix(typeform_indicator) { |
240 | 0 | return 2; |
241 | 0 | } else if typeform_indicator == '⠼' && |
242 | 0 | let Some(user_defined_typeform_indicator) = prefix.next() && |
243 | 0 | (is_ueb_typeform_prefix(user_defined_typeform_indicator) || user_defined_typeform_indicator == '⠐') { |
244 | 0 | return 3; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | return 0; |
248 | 0 | } |
249 | 1.82k | } |
250 | ||
251 | // FIX: if 8-dot braille is needed, perhaps the highlights can be shifted to a "highlighted" 256 char block in private space | |
252 | // they would need to be unshifted for the external world | |
253 | 11.0k | fn is_highlighted(ch: char) -> bool { |
254 | 11.0k | let ch_as_u32 = ch as u32; |
255 | 11.0k | return (0x28C0..0x28FF).contains(&ch_as_u32) || |
256 | 11.0k | } |
257 | ||
258 | 159 | fn highlight(ch: char) -> char { |
259 | // safe because we have checked the range | |
260 | 159 | return unsafe{char::from_u32_unchecked(ch as u32 | 0xC0)}; // 0x28C0..0x28FF all have dots 7 & 8 on |
261 | 159 | } |
262 | ||
263 | 3.12k | fn unhighlight(ch: char) -> char { |
264 | 3.12k | let ch_as_u32 = ch as u32; |
265 | 3.12k | if (0x28C0..0x28FF).contains(&ch_as_u32) { // 0x28C0..0x28FF all have dots 7 & 8 on |
266 | 903 | return unsafe{char::from_u32_unchecked(ch_as_u32 & 0x283F)}; // safe because we have checked the range |
267 | } else { | |
268 | 2.22k | return ch; |
269 | } | |
270 | 3.12k | } |
271 | ||
272 | use std::cell::RefCell; | |
273 | thread_local!{ | |
274 | /// Count number of probes -- get a sense of how well algorithm is working (for debugging) | |
275 | static N_PROBES: RefCell<usize> = const { RefCell::new(0) }; | |
276 | } | |
277 | ||
278 | ||
279 | /// Given a 0-based braille position, return the id of the smallest MathML node enclosing it. | |
280 | /// This node might be a leaf with an offset. | |
281 | 91 | pub fn get_navigation_node_from_braille_position(mathml: Element, position: usize) -> Result<(String, usize)> { |
282 | // This works via a "smart" binary search (the trees aren't binary or balanced, we estimate the child to look in): | |
283 | // braille the mathml with a nav node and see where 'position' is in relation to the start/end of the nav node | |
284 | // Each call to find_navigation_node() returns a search state that tell us where to look next if not found | |
285 | #[derive(Debug, Display)] | |
286 | enum SearchStatus { | |
287 | LookInParent, // look up a level for exact match | |
288 | LookLeft, // went too far, backup | |
289 | LookRight, // continue searching right | |
290 | Found, | |
291 | } | |
292 | ||
293 | struct SearchState<'e> { | |
294 | status: SearchStatus, | |
295 | node: Element<'e>, | |
296 | highlight_start: usize, // if status is Found, then this is the offset within a leaf node | |
297 | highlight_end: usize, // if status is Found, this is ignored | |
298 | } | |
299 | ||
300 | // save the current highlight state, set the state to be the end points so we can find the braille, then restore the state | |
301 | // FIX: this can fail if there is 8-dot braille | |
302 | use crate::interface::{get_preference, set_preference}; | |
303 | 91 | let saved_highlight_style = get_preference("BrailleNavHighlight").unwrap(); |
304 | 91 | set_preference("BrailleNavHighlight", "EndPoints").unwrap(); |
305 | ||
306 | 91 | N_PROBES.with(|n| {*n.borrow_mut() = 0}); |
307 | // dive into the child of the <math> element (should only be one) | |
308 | 91 | let search_state = find_navigation_node(mathml, as_element(mathml.children()[0]), position) |
309 | 91 | set_preference("BrailleNavHighlight", saved_highlight_style.as_str()).unwrap(); |
310 | ||
311 | // we know the attr value exists because it was found internally | |
312 | // FIX: what should be done if we never did the search? | |
313 | 91 | match search_state.status { |
314 | SearchStatus::Found | SearchStatus::LookInParent => { | |
315 | 86 | return Ok( (search_state.node.attribute_value("id").unwrap().to_string(), search_state.highlight_start) ) |
316 | }, | |
317 | _ => { | |
318 | // weird state -- return the entire expr | |
319 | 5 | match mathml.attribute_value("id") { |
320 | 0 | None => bail!("'id' is not present on mathml: {}", mml_to_string(mathml)), |
321 | 5 | Some(id) => return Ok( (id.to_string(), 0) ), |
322 | } | |
323 | } | |
324 | } | |
325 | ||
326 | /// find the navigation node that most tightly encapsulates the target position (0-based) | |
327 | /// 'node' is the current node we are on inside of 'mathml' | |
328 | 465 | fn find_navigation_node<'e>(mathml: Element<'e>, node: Element<'e>, target_position: usize) -> Result<SearchState<'e>> { |
329 | 465 | let node_id = match node.attribute_value("id") { |
330 | 465 | Some(id) => id, |
331 | 0 | None => bail!("'id' is not present on mathml: {}", mml_to_string(node)), |
332 | }; | |
333 | 465 | N_PROBES.with(|n| {*n.borrow_mut() += 1}); |
334 | 465 | let (braille, char_start, char_end) = braille_mathml(mathml, node_id) |
335 | 465 | let mut status = None; |
336 | // debug!("find_navigation_node ({}, id={}): highlight=[{}, {}); target={}", name(node), node_id, char_start, char_end, target_position); | |
337 | 465 | if is_leaf(node) { |
338 | 100 | if char_start == 0 && |
339 | 6 | // nothing highlighted -- probably invisible char not represented in braille -- continue looking to the right |
340 | 6 | // debug!(" return due invisible char (?)' "); |
341 | 6 | status = Some(SearchStatus::LookRight); |
342 | 94 | } else if char_start <= target_position && |
343 | // FIX: need to handle multi-char leaves and set the offset (char_start) appropriately | |
344 | // debug!(" return due to target_position inside leaf: {} <= {} < {}", char_start, target_position, char_end); | |
345 | 58 | return Ok( SearchState { |
346 | 58 | status: SearchStatus::Found, |
347 | 58 | node, |
348 | 58 | highlight_start: target_position - char_start, |
349 | 58 | highlight_end: 0, |
350 | 58 | }); |
351 | 36 | } else if name(node) == "mo" { |
352 | // if there is whitespace before or after the operator, consider the operator to be a match | |
353 | 18 | if (char_start > 0 && target_position == char_start - 1 && |
354 | 2 | braille_at(&braille, 3*(char_start - 1)) == '⠀' && is_operator_that_adds_whitespace(node)) || |
355 | 16 | (3*char_end < braille.len() && target_position == char_end && |
356 | 11 | braille_at(&braille, 3*char_end) == '⠀' && |
357 | 4 | return Ok( SearchState { |
358 | 4 | status: SearchStatus::Found, |
359 | 4 | node, |
360 | 4 | highlight_start: 0, |
361 | 4 | highlight_end: 0, |
362 | 4 | } ); |
363 | 14 | } |
364 | 18 | } |
365 | 365 | } |
366 | 403 | if status.is_none() { |
367 | 397 | if target_position < char_start { |
368 | 23 | // debug!(" return due to target_position {} < start {}", target_position, char_start); |
369 | 23 | status = Some(SearchStatus::LookLeft); |
370 | 374 | } else if target_position >= char_end { |
371 | 49 | // debug!(" return due to target_position {} >= end {}", target_position, char_end); |
372 | 49 | status = Some(SearchStatus::LookRight); |
373 | 325 | } |
374 | 6 | } |
375 | 403 | if let Some( |
376 | 78 | return Ok( SearchState { |
377 | 78 | status, |
378 | 78 | node, |
379 | 78 | highlight_start: char_start, |
380 | 78 | highlight_end: char_end, |
381 | 78 | } ); |
382 | 325 | } |
383 | ||
384 | 325 | let children = node.children(); |
385 | 325 | let mut i_left_child = 0; // inclusive |
386 | 325 | let mut i_right_child = children.len(); // exclusive |
387 | 325 | let mut call_start = char_start; |
388 | 325 | let mut guess_fn: Box<dyn Fn(usize, usize, usize, usize) -> usize> = Box::new(|i_left, i_right, start, target: usize| guess_child_node_ltr(&children, i_left, i_right, start, target)); |
389 | 398 | while i_left_child < i_right_child { |
390 | 374 | let i_guess_child = guess_fn(i_left_child, i_right_child, call_start, target_position); |
391 | 374 | let status = find_navigation_node(mathml, as_element(children[i_guess_child]), target_position) |
392 | // debug!(" in {} loop: status: {}, child: left/guess/right {}/({},{})/{}; highlight=[{}, {})", | |
393 | // name(node), status.status, | |
394 | // i_left_child, i_guess_child, name(as_element(children[i_guess_child])),i_right_child, | |
395 | // status.highlight_start, status.highlight_end); | |
396 | 374 | match status.status { |
397 | SearchStatus::Found => { | |
398 | 301 | return Ok(status); |
399 | }, | |
400 | SearchStatus::LookInParent => { | |
401 | 0 | let (_, start, end) = braille_mathml(mathml, node_id)?; |
402 | // debug!(" parent ({}) braille: start/end={}/{}; target_position={}", name(node), start, end, target_position); | |
403 | 0 | if start <= target_position && target_position < end { |
404 | // debug!(" ..found: id={}", node_id); | |
405 | 0 | return Ok( SearchState{ |
406 | 0 | status: SearchStatus::Found, |
407 | 0 | node, |
408 | 0 | highlight_start: 0, |
409 | 0 | highlight_end: 0, |
410 | 0 | } ); // done or look up another level |
411 | 0 | } |
412 | 0 | return Ok(status); // look up a level |
413 | }, | |
414 | SearchStatus::LookLeft => { | |
415 | 20 | i_right_child = if i_guess_child == 0 { |
416 | 20 | call_start = status.highlight_start-1; |
417 | 20 | guess_fn = Box::new(|i_left, i_right, start, target| |
418 | }, | |
419 | SearchStatus::LookRight => { | |
420 | 53 | i_left_child = i_guess_child+1; |
421 | 53 | call_start = status.highlight_end+1; |
422 | 53 | guess_fn = Box::new(|i_left, i_right, start, target| |
423 | }, | |
424 | } | |
425 | } | |
426 | // debug!("Didn't child in node {}: left/right={}/{}; target_position={}", name(node), i_left_child, i_right_child, target_position); | |
427 | ||
428 | // if we get here, we didn't find it in the children | |
429 | // debug!("..end of loop: look in parent of {} has start/end={}/{}", name(node), char_start, char_end); | |
430 | return Ok( SearchState{ | |
431 | 24 | status: if char_start <= target_position && target_position <= char_end {SearchStatus::Found} else { |
432 | 24 | node, |
433 | highlight_start: 0, | |
434 | highlight_end: 0, | |
435 | } ); | |
436 | 465 | } |
437 | ||
438 | 4 | fn is_operator_that_adds_whitespace(node: Element) -> bool { |
439 | use crate::definitions::BRAILLE_DEFINITIONS; | |
440 | 4 | if PreferenceManager::get().borrow().pref_to_string("UseSpacesAroundAllOperators") == "true" { |
441 | 0 | return true; |
442 | 4 | } |
443 | ||
444 | 4 | return BRAILLE_DEFINITIONS.with(|definitions| { |
445 | 4 | let definitions = definitions.borrow(); |
446 | 4 | let comparison_operators = definitions.get_hashset("ComparisonOperators").unwrap(); |
447 | 4 | return comparison_operators.contains(as_text(node)); |
448 | 4 | }); |
449 | 4 | } |
450 | ||
451 | /// look in children[i_left..i_right] for a count that exceeds target | |
452 | 367 | fn guess_child_node_ltr(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize { |
453 | 367 | let mut estimated_position = start; |
454 | // number of chars to add for number indicators | |
455 | 367 | let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" { |
456 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
457 | 666 | for i in |
458 | 666 | estimated_position += estimate_braille_chars(children[i], n_number_indicator); |
459 | 666 | if estimated_position >= target { |
460 | 344 | return i; |
461 | 322 | } |
462 | } | |
463 | 23 | return i_right-1; // estimate was too large, return the last child as a guess |
464 | 367 | } |
465 | ||
466 | /// look in children[i_left..i_right].rev for a count that is less than target | |
467 | 7 | fn guess_child_node_rtl(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize { |
468 | 7 | let mut estimated_position = start; |
469 | 7 | let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" { |
470 | 7 | for i in (i_left..i_right).rev() { |
471 | 7 | estimated_position -= estimate_braille_chars(children[i], n_number_indicator); |
472 | 7 | if estimated_position <= target { |
473 | 7 | return i; |
474 | 0 | } |
475 | } | |
476 | 0 | return i_left; // estimate was too small, return the first child as a guess |
477 | 7 | } |
478 | ||
479 | 4.58k | fn estimate_braille_chars(child: ChildOfElement, n_number_indicator: usize) -> usize { |
480 | 4.58k | let node = as_element(child); |
481 | 4.58k | let leaf_name = name(node); |
482 | 4.58k | if is_leaf(node) { |
483 | 3.13k | let text = as_text(node); |
484 | // len() is close since mn's probably have ASCII digits and lower case vars are common (count as) and other chars need extra braille chars | |
485 | // don't want to count invisible chars since they don't display and would give a length = 3 | |
486 | 3.13k | if text == "\u{2061}" || text == "\u{2062}" { // invisible function apply/times (most common by far) |
487 | 597 | return 0; |
488 | 2.53k | } |
489 | // FIX: this assumption is bad for 8-dot braille | |
490 | 2.53k | return match leaf_name { |
491 | 2.53k | "mn" => |
492 | 1.90k | "mo" => |
493 | 1.16k | _ => text.len(), |
494 | } | |
495 | 1.45k | } |
496 | 1.45k | let mut estimate = if leaf_name == "mrow" { |
497 | 1.45k | if leaf_name == "msup" || |
498 | 260 | estimate -= 1; // opening superscript/subscript indicator not needed |
499 | 1.19k | } |
500 | 3.91k | for child in |
501 | 3.91k | estimate += estimate_braille_chars(child, n_number_indicator); |
502 | 3.91k | } |
503 | // debug!("estimate_braille_chars for {}: {}", crate::canonicalize::element_summary(as_element(child)), estimate); | |
504 | 1.45k | return estimate; |
505 | 4.58k | } |
506 | 91 | } |
507 | ||
508 | 888 | fn nemeth_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
509 | // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman | |
510 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
511 | // Indicators: C: capital, N: number, P: punctuation, M: multipurpose | |
512 | // Others: | |
513 | // W -- whitespace that should be kept (e.g, in a numeral) | |
514 | // 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
515 | // SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet) | |
516 | // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version | |
517 | static NEMETH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
518 | "S" => "⠠⠨", // sans-serif | |
519 | "B" => "⠸", // bold | |
520 | "𝔹" => "⠨", // blackboard | |
521 | "T" => "⠈", // script | |
522 | "I" => "⠨", // italic (mapped to be the same a blackboard) | |
523 | "R" => "", // roman | |
524 | "E" => "⠰", // English | |
525 | "D" => "⠸", // German (Deutsche) | |
526 | "G" => "⠨", // Greek | |
527 | "V" => "⠨⠈", // Greek Variants | |
528 | "H" => "⠠⠠", // Hebrew | |
529 | "U" => "⠈⠈", // Russian | |
530 | "C" => "⠠", // capital | |
531 | "P" => "⠸", // punctuation | |
532 | "𝐏" => "⠸", // hack for punctuation after a roman numeral -- never removed | |
533 | "L" => "", // letter | |
534 | "l" => "", // letter inside enclosed list | |
535 | "M" => "", // multipurpose indicator | |
536 | "m" => "⠐", // required multipurpose indicator | |
537 | "N" => "", // potential number indicator before digit | |
538 | "n" => "⠼", // required number indicator before digit | |
539 | "𝑁" => "", // hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
540 | "W" => "⠀", // whitespace | |
541 | "w" => "⠀", // whitespace from comparison operator | |
542 | "," => "⠠⠀", // comma | |
543 | "b" => "⠐", // baseline | |
544 | "𝑏" => "⣐", // highlight baseline (it's a hack) | |
545 | "↑" => "⠘", // superscript | |
546 | "↓" => "⠰", // subscript | |
547 | }; | |
548 | ||
549 | // Add an English Letter indicator. This involves finding "single letters". | |
550 | // The green book has a complicated set of cases, but the Nemeth UEB Rule book (May 2020), 4.10 has a much shorter explanation: | |
551 | // punctuation or whitespace on the left and right ignoring open/close chars | |
552 | // https://nfb.org/sites/www.nfb.org/files/files-pdf/braille-certification/lesson-4--provisional-5-9-20.pdf | |
553 | 2 | static ADD_ENGLISH_LETTER_INDICATOR: LazyLock<Regex> = LazyLock::new(|| { |
554 | 2 | Regex::new(r"(?P<start>^|W|P.[\u2800-\u28FF]?|,)(?P<open>[\u2800-\u28FF]?⠷)?(?P<letter>C?L.)(?P<close>[\u2800-\u28FF]?⠾)?(?P<end>W|P|,|$)").unwrap() |
555 | 2 | }); |
556 | ||
557 | // Trim braille spaces before and after braille indicators | |
558 | // In order: fraction, /, cancellation, letter, baseline | |
559 | // Note: fraction over is not listed due to example 42(4) which shows a space before the "/" | |
560 | static REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: LazyLock<Regex> = | |
561 | 2 | LazyLock::new(|| Regex::new(r"(⠄⠄⠄|⠤⠤⠤⠤)[Ww]+([⠼⠸⠪])").unwrap()); |
562 | static REMOVE_SPACE_AFTER_BRAILLE_INDICATORS: LazyLock<Regex> = | |
563 | 2 | LazyLock::new(|| Regex::new(r"([⠹⠻Llb])[Ww]+(⠄⠄⠄|⠤⠤⠤⠤)").unwrap()); |
564 | ||
565 | // Hack to convert non-numeric '.' to numeric '.' | |
566 | // The problem is that the numbers are hidden inside of mover -- this might be more general than rule 99_2. | |
567 | 2 | static DOTS_99_A_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"𝑁⠨mN").unwrap()); |
568 | ||
569 | // Punctuation is one or two chars. There are (currently) only 3 2-char punct chars (—‘’) -- we explicitly list them below | |
570 | static REMOVE_SPACE_BEFORE_PUNCTUATION_151: LazyLock<Regex> = | |
571 | 2 | LazyLock::new(|| Regex::new(r"w(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠾)").unwrap()); |
572 | static REMOVE_SPACE_AFTER_PUNCTUATION_151: LazyLock<Regex> = | |
573 | 2 | LazyLock::new(|| Regex::new(r"(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠷)w").unwrap()); |
574 | ||
575 | // Multipurpose indicator insertion | |
576 | // 149 -- consecutive comparison operators have no space -- instead a multipurpose indicator is used (doesn't require a regex) | |
577 | ||
578 | // 177.2 -- add after a letter and before a digit (or decimal pt) -- digits will start with N | |
579 | 2 | static MULTI_177_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([Ll].)[N𝑁]").unwrap()); |
580 | ||
581 | // keep between numeric subscript and digit ('M' added by subscript rule) | |
582 | 2 | static MULTI_177_3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([N𝑁].)M([N𝑁].)").unwrap()); |
583 | ||
584 | // Add after decimal pt for non-digits except for comma and punctuation | |
585 | // Note: since "." can be in the middle of a number, there is not necessarily a "N" | |
586 | // Although not mentioned in 177_5, don't add an 'M' before an 'm' | |
587 | static MULTI_177_5: LazyLock<Regex> = | |
588 | 2 | LazyLock::new(|| Regex::new(r"([N𝑁]⠨)([^⠂⠆⠒⠲⠢⠖⠶⠦⠔N𝑁,Pm])").unwrap()); |
589 | ||
590 | // Pattern for rule II.9a (add numeric indicator at start of line or after a space) | |
591 | // 1. start of line | |
592 | // 2. optional minus sign (⠤) | |
593 | // 3. optional typeface indicator | |
594 | // 4. number (N) | |
595 | 2 | static NUM_IND_9A: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<start>^|[,Ww])(?P<minus>⠤?)N").unwrap()); |
596 | ||
597 | // Needed after section mark(§), paragraph mark(¶), #, or * | |
598 | 2 | static NUM_IND_9C: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(⠤?)(⠠⠷|⠠⠳|⠠⠈⠷)N").unwrap()); |
599 | ||
600 | // Needed after section mark(§), paragraph mark(¶), #, or * | |
601 | 2 | static NUM_IND_9D: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(⠈⠠⠎|⠈⠠⠏|⠨⠼|⠈⠼)N").unwrap()); |
602 | ||
603 | // Needed after a typeface change or interior shape modifier indicator | |
604 | 2 | static NUM_IND_9E: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<face>[SB𝔹TIR]+?)N").unwrap()); |
605 | 2 | static NUM_IND_9E_SHAPE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<mod>⠸⠫)N").unwrap()); |
606 | ||
607 | // Needed after hyphen that follows a word, abbreviation, or punctuation (caution about rule 11d) | |
608 | // Note -- hyphen might encode as either "P⠤" or "⠤" depending on the tag used | |
609 | 2 | static NUM_IND_9F: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([Ll].[Ll].|P.)(P?⠤)N").unwrap()); |
610 | ||
611 | // Enclosed list exception | |
612 | // Normally we don't add numeric indicators in enclosed lists (done in get_braille_nemeth_chars). | |
613 | // The green book says "at the start" of an item, don't add the numeric indicator. | |
614 | // The NFB list exceptions after function abbreviations and angles, but what this really means is "after a space" | |
615 | 2 | static NUM_IND_ENCLOSED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"w([⠂⠆⠒⠲⠢⠖⠶⠦⠔⠴])").unwrap()); |
616 | ||
617 | // Punctuation chars (Rule 38.6 says don't use before ",", "hyphen", "-", "…") | |
618 | // Never use punctuation indicator before these (38-6) | |
619 | // "…": "⠀⠄⠄⠄" | |
620 | // "-": "⠸⠤" (hyphen and dash) | |
621 | // ",": "⠠⠀" -- spacing already added | |
622 | // Rule II.9b (add numeric indicator after punctuation [optional minus[optional .][digit] | |
623 | // because this is run after the above rule, some cases are already caught, so don't | |
624 | // match if there is already a numeric indicator | |
625 | 2 | static NUM_IND_9B: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?P<punct>P..?)(?P<minus>⠤?)N").unwrap()); |
626 | ||
627 | // Before 79b (punctuation) | |
628 | 2 | static REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?:[↑↓]+[b𝑏]?|[b𝑏])([Ww,P]|$)").unwrap()); |
629 | ||
630 | // Most commas have a space after them, but not when followed by a close quote (others?) | |
631 | 2 | static NO_SPACE_AFTER_COMMA: LazyLock<Regex> = LazyLock::new(|| Regex::new(r",P⠴").unwrap()); // captures both single and double close quote |
632 | 2 | static REMOVE_LEVEL_IND_BEFORE_BASELINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?:[↑↓mb𝑏]+)([b𝑏])").unwrap()); |
633 | ||
634 | // Except for the four chars above, the unicode rules always include a punctuation indicator. | |
635 | // The cases to remove them (that seem relevant to MathML) are: | |
636 | // Beginning of line or after a space (V 38.1) | |
637 | // After a word (38.4) | |
638 | // 2nd or subsequent punctuation (includes, "-", etc) (38.7) | |
639 | 2 | static REMOVE_AFTER_PUNCT_IND: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(^|[Ww]|[Ll].[Ll].)P(.)").unwrap()); |
640 | 2 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([SB𝔹TIREDGVHUP𝐏CLlMmb𝑏↑↓Nn𝑁Ww,])").unwrap()); |
641 | 2 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"⠀⠀+").unwrap()); |
642 | ||
643 | // debug!("Before: \"{}\"", raw_braille); | |
644 | // replacements might overlap at boundaries (e.g., whitespace) -- need to repeat | |
645 | 888 | let mut start = 0; |
646 | 888 | let mut result = String::with_capacity(raw_braille.len()+ raw_braille.len()/4); // likely upper bound |
647 | 923 | while let Some( |
648 | 35 | result.push_str(&raw_braille[start..matched.start()]); |
649 | 35 | let replacement = ADD_ENGLISH_LETTER_INDICATOR.replace( |
650 | 35 | &raw_braille[matched.start()..matched.end()], "${start}${open}E${letter}${close}"); |
651 | 35 | // debug!("matched='{}', start/end={}/{}; replacement: {}", &raw_braille[matched.start()..matched.end()], matched.start(), matched.end(), replacement); |
652 | 35 | result.push_str(&replacement); |
653 | 35 | // put $end back on because needed for next match (e.g., whitespace at end and then start of next match) |
654 | 35 | // but it could also match because it was at the end, in which case "-1" is wrong -- tested after loop for that |
655 | 35 | start = matched.end() - 1; |
656 | 35 | } |
657 | 888 | if !raw_braille.is_empty() && ( start < raw_braille.len()-1 || |
658 | 882 | result.push_str(&raw_braille[start..]); |
659 | 882 |
|
660 | // debug!("ELIs: \"{}\"", result); | |
661 | ||
662 | 888 | let result = NUM_IND_ENCLOSED_LIST.replace_all(&result, "wn${1}"); |
663 | ||
664 | // Remove blanks before and after braille indicators | |
665 | 888 | let result = REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS.replace_all(&result, "$1$2"); |
666 | 888 | let result = REMOVE_SPACE_AFTER_BRAILLE_INDICATORS.replace_all(&result, "$1$2"); |
667 | ||
668 | 888 | let result = REMOVE_SPACE_BEFORE_PUNCTUATION_151.replace_all(&result, "$1"); |
669 | 888 | let result = REMOVE_SPACE_AFTER_PUNCTUATION_151.replace_all(&result, "$1"); |
670 | // debug!("spaces: \"{}\"", result); | |
671 | ||
672 | 888 | let result = DOTS_99_A_2.replace_all(&result, "N⠨mN"); |
673 | ||
674 | // Multipurpose indicator | |
675 | 888 | let result = result.replace("ww", "m"); // 149 |
676 | 888 | let result = MULTI_177_2.replace_all(&result, "${1}m${2}"); |
677 | 888 | let result = MULTI_177_3.replace_all(&result, "${1}m$2"); |
678 | 888 | let result = MULTI_177_5.replace_all(&result, "${1}m$2"); |
679 | // debug!("MULTI: \"{}\"", result); | |
680 | ||
681 | 888 | let result = NUM_IND_9A.replace_all(&result, "${start}${minus}n"); |
682 | // debug!("IND_9A: \"{}\"", result); | |
683 | 888 | let result = NUM_IND_9C.replace_all(&result, "${1}${2}n"); |
684 | 888 | let result = NUM_IND_9D.replace_all(&result, "${1}n"); |
685 | 888 | let result = NUM_IND_9E.replace_all(&result, "${face}n"); |
686 | 888 | let result = NUM_IND_9E_SHAPE.replace_all(&result, "${mod}n"); |
687 | 888 | let result = NUM_IND_9F.replace_all(&result, "${1}${2}n"); |
688 | ||
689 | // debug!("IND_9F: \"{}\"", result); | |
690 | ||
691 | // 9b: insert after punctuation (optional minus sign) | |
692 | // common punctuation adds a space, so 9a handled it. Here we deal with other "punctuation" | |
693 | // FIX other punctuation and reference symbols (9d) | |
694 | 888 | let result = NUM_IND_9B.replace_all(&result, "$punct${minus}n"); |
695 | // debug!("A PUNCT: \"{}\"", &result); | |
696 | ||
697 | // strip level indicators | |
698 | // check first to remove level indicators before baseline, then potentially remove the baseline | |
699 | 888 | let mut result = REMOVE_LEVEL_IND_BEFORE_BASELINE.replace_all(&result, "$1"); |
700 | // debug!("Punct : \"{}\"", &result); | |
701 | // checks for punctuation char, so needs to before punctuation is stripped. | |
702 | // if '𝑏' is removed, then the highlight needs to be shifted to the left in some cases | |
703 | 888 | let result = remove_baseline_before_space_or_punctuation(&mut result); |
704 | // debug!("Removed: \"{}\"", &result); | |
705 | ||
706 | 888 | let result = NO_SPACE_AFTER_COMMA.replace_all(&result, "⠠P⠴"); |
707 | ||
708 | 888 | let result = REMOVE_AFTER_PUNCT_IND.replace_all(&result, "$1$2"); |
709 | // debug!("Punct38: \"{}\"", &result); | |
710 | ||
711 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
712 | 888 | let sans_serif = pref_manager.pref_to_string("Nemeth_SansSerif"); |
713 | 888 | let bold = pref_manager.pref_to_string("Nemeth_Bold"); |
714 | 888 | let double_struck = pref_manager.pref_to_string("Nemeth_DoubleStruck"); |
715 | 888 | let script = pref_manager.pref_to_string("Nemeth_Script"); |
716 | 888 | let italic = pref_manager.pref_to_string("Nemeth_Italic"); |
717 | ||
718 | 7.57k | let |
719 | 7.57k | let matched_char = &cap[0]; |
720 | 7.57k | match matched_char { |
721 | 7.57k | "S" => |
722 | 7.57k | "B" => |
723 | 7.52k | "𝔹" => |
724 | 7.49k | "T" => |
725 | 7.49k | "I" => |
726 | 7.48k | _ => match NEMETH_INDICATOR_REPLACEMENTS.get(&cap[0]) { |
727 | 0 | None => {error!("REPLACE_INDICATORS and NEMETH_INDICATOR_REPLACEMENTS are not in sync"); ""}, |
728 | 7.48k | Some(&ch) => ch, |
729 | } | |
730 | } | |
731 | 7.57k | }); |
732 | ||
733 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
734 | 888 | let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); |
735 | 888 | let result = COLLAPSE_SPACES.replace_all(result, "⠀"); |
736 | ||
737 | 888 | return result.to_string(); |
738 | ||
739 | 888 | fn remove_baseline_before_space_or_punctuation<'a>(braille: &'a mut Cow<'a, str>) -> Cow<'a, str> { |
740 | // If the baseline highlight is at the end of the string and it is going to be deleted by the regex, | |
741 | // then we need to shift the highlight to the left if what is to it's left is not whitespace (which should never be a highlight end) | |
742 | // This only happens when BrailleNavHighlight == "EndPoints". | |
743 | 888 | let highlight_style = PreferenceManager::get().borrow().pref_to_string("BrailleNavHighlight"); |
744 | 888 | if highlight_style == "EndPoints" && |
745 | 132 | let Some( |
746 | 129 | braille[last_highlighted..].starts_with('𝑏') { |
747 | 7 | let i_after_baseline = last_highlighted + '𝑏'.len_utf8(); |
748 | 7 | if i_after_baseline == braille.len() || |
749 | // shift the highlight to the left after doing just the replacement (if any) that the regex below does | |
750 | // the shift runs until a non blank braille char is found | |
751 | 2 | let mut bytes_deleted = 0; |
752 | 2 | let mut char_to_highlight = "".to_string(); // illegal value |
753 | 2 | for ch in braille[..last_highlighted].chars().rev() { |
754 | 2 | bytes_deleted += ch.len_utf8(); |
755 | 2 | if (0x2801..0x28FF).contains(&(ch as u32)) { |
756 | 2 | char_to_highlight = highlight(ch).to_string(); |
757 | 2 | break; |
758 | 0 | } |
759 | } | |
760 | 2 | braille.to_mut().replace_range(last_highlighted-bytes_deleted..last_highlighted+'𝑏'.len_utf8(), |
761 | 2 | &char_to_highlight); |
762 | 5 | } |
763 | 881 | } |
764 | 888 | return REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT.replace_all(braille, "$1"); |
765 | ||
766 | 888 | } |
767 | 888 | } |
768 | ||
769 | // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman | |
770 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
771 | // Indicators: C: capital, N: number, P: punctuation, M: multipurpose | |
772 | // Others: | |
773 | // W -- whitespace that should be kept (e.g, in a numeral) | |
774 | // 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly | |
775 | // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version | |
776 | static UEB_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
777 | "S" => "XXX", // sans-serif -- from prefs | |
778 | "B" => "⠘", // bold | |
779 | "𝔹" => "XXX", // blackboard -- from prefs | |
780 | "T" => "⠈", // script | |
781 | "I" => "⠨", // italic | |
782 | "R" => "", // roman | |
783 | // "E" => "⠰", // English | |
784 | "1" => "⠰", // Grade 1 symbol | |
785 | "𝟙" => "⠰⠰", // Grade 1 word | |
786 | "L" => "", // Letter left in to assist in locating letters | |
787 | "D" => "XXX", // German (Deutsche) -- from prefs | |
788 | "G" => "⠨", // Greek | |
789 | "V" => "⠨⠈", // Greek Variants | |
790 | // "H" => "⠠⠠", // Hebrew | |
791 | // "U" => "⠈⠈", // Russian | |
792 | "C" => "⠠", // capital | |
793 | "𝐶" => "⠠", // capital that never should get word indicator (from chemical element) | |
794 | "N" => "⠼", // number indicator | |
795 | "t" => "⠱", // shape terminator | |
796 | "W" => "⠀", // whitespace | |
797 | "𝐖"=> "⠀", // whitespace (hard break -- basically, it separates exprs) | |
798 | "s" => "⠆", // typeface single char indicator | |
799 | "w" => "⠂", // typeface word indicator | |
800 | "e" => "⠄", // typeface & capital terminator | |
801 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
802 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
803 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
804 | "," => "⠂", // comma | |
805 | "." => "⠲", // period | |
806 | "-" => "-", // hyphen | |
807 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
808 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
809 | "#" => "", // signals end of script | |
810 | // '(', '{', '[', '"', '\'', '“', '‘', '«', // opening chars | |
811 | // ')', '}', ']', '\"', '\'', '”', '’', '»', // closing chars | |
812 | // ',', ';', ':', '.', '…', '!', '?' // punctuation | |
813 | ||
814 | }; | |
815 | ||
816 | // static LETTERS: phf::Set<char> = phf_set! { | |
817 | // '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', | |
818 | // '⠝', '⠕', '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', | |
819 | // }; | |
820 | ||
821 | 2.39k | fn is_letter_number(ch: char) -> bool { |
822 | 2.39k |
|
823 | 2.39k | } |
824 | ||
825 | static SHORT_FORMS: phf::Set<&str> = phf_set! { | |
826 | "L⠁L⠃", "L⠁L⠃L⠧", "L⠁L⠉", "L⠁L⠉L⠗", "L⠁L⠋", | |
827 | "L⠁L⠋L⠝", "L⠁L⠋L⠺", "L⠁L⠛", "L⠁L⠛L⠌", "L⠁L⠇", | |
828 | "L⠁L⠇L⠍", "L⠁L⠇L⠗", "L⠁L⠇L⠞", "L⠁L⠇L⠹", "L⠁L⠇L⠺", | |
829 | "L⠃L⠇", "L⠃L⠗L⠇", "L⠉L⠙", "L⠙L⠉L⠇", "L⠙L⠉L⠇L⠛", | |
830 | "L⠙L⠉L⠧", "L⠙L⠉L⠧L⠛", "L⠑L⠊", "L⠋L⠗", "L⠋L⠌", "L⠛L⠙", | |
831 | "L⠛L⠗L⠞", "L⠓L⠍", "L⠓L⠍L⠋", "L⠓L⠻L⠋", "L⠊L⠍L⠍", "L⠇L⠇", "L⠇L⠗", | |
832 | "L⠍L⠽L⠋", "L⠍L⠡", "L⠍L⠌", "L⠝L⠑L⠉", "L⠝L⠑L⠊", "L⠏L⠙", | |
833 | "L⠏L⠻L⠉L⠧", "L⠏L⠻L⠉L⠧L⠛", "L⠏L⠻L⠓", "L⠟L⠅", "L⠗L⠉L⠧", | |
834 | "L⠗L⠉L⠧L⠛", "L⠗L⠚L⠉", "L⠗L⠚L⠉L⠛", "L⠎L⠙", "L⠎L⠡", "L⠞L⠙", | |
835 | "L⠞L⠛L⠗", "L⠞L⠍", "L⠞L⠝", "L⠭L⠋", "L⠭L⠎", "L⠽L⠗", "L⠽L⠗L⠋", | |
836 | "L⠽L⠗L⠧L⠎", "L⠮L⠍L⠧L⠎", "L⠡L⠝", "L⠩L⠙", "L⠹L⠽L⠋", "L⠳L⠗L⠧L⠎", | |
837 | "L⠺L⠙", "L⠆L⠉", "L⠆L⠋", "L⠆L⠓", "L⠆L⠇", "L⠆L⠝", "L⠆L⠎", "L⠆L⠞", | |
838 | "L⠆L⠽", "L⠒L⠉L⠧", "L⠒L⠉L⠧L⠛", "L⠐L⠕L⠋" | |
839 | }; | |
840 | ||
841 | 1.75k | fn is_letter_prefix(ch: char) -> bool { |
842 | 1.75k |
|
843 | 1.75k | } |
844 | ||
845 | // Trim braille spaces before and after braille indicators | |
846 | // In order: fraction, /, cancellation, letter, baseline | |
847 | // Note: fraction over is not listed due to example 42(4) which shows a space before the "/" | |
848 | // static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = | |
849 | // Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap(); | |
850 | 2 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb])").unwrap()); |
851 | 2 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"⠀⠀+").unwrap()); |
852 | ||
853 | 35 | fn is_short_form(chars: &[char]) -> bool { |
854 | 204 | let |
855 | 35 | return SHORT_FORMS.contains(&chars_as_string); |
856 | 35 | } |
857 | ||
858 | 366 | fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
859 | // debug!("ueb_cleanup: start={}", raw_braille); | |
860 | 366 | let result = typeface_to_word_mode(&raw_braille); |
861 | 366 | let result = capitals_to_word_mode(&result); |
862 | ||
863 | 366 | let use_only_grade1 = pref_manager.pref_to_string("UEB_START_MODE").as_str() == "Grade1"; |
864 | ||
865 | // '𝐖' is a hard break -- basically, it separates exprs | |
866 | 366 | let mut result = result.split('𝐖') |
867 | 370 | . |
868 | 366 | .collect::<String>(); |
869 | 366 | result.pop(); // we added a 'W' at the end that needs to be removed. |
870 | ||
871 | 366 | let result = result.replace("tW", "W"); |
872 | ||
873 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
874 | 366 | let double_struck = pref_manager.pref_to_string("UEB_DoubleStruck"); |
875 | 366 | let sans_serif = pref_manager.pref_to_string("UEB_SansSerif"); |
876 | 366 | let fraktur = pref_manager.pref_to_string("UEB_Fraktur"); |
877 | 366 | let greek_variant = pref_manager.pref_to_string("UEB_GreekVariant"); |
878 | ||
879 | 3.77k | let |
880 | 3.77k | let matched_char = &cap[0]; |
881 | 3.77k | match matched_char { |
882 | 3.77k | "𝔹" => |
883 | 3.77k | "S" => |
884 | 3.77k | "D" => |
885 | 3.77k | "V" => |
886 | 3.77k | _ => match UEB_INDICATOR_REPLACEMENTS.get(matched_char) { |
887 | 0 | None => {error!("REPLACE_INDICATORS and UEB_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
888 | 3.77k | Some(&ch) => ch, |
889 | }, | |
890 | } | |
891 | 3.77k | }); |
892 | ||
893 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
894 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
895 | 366 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
896 | ||
897 | 366 | return result.to_string(); |
898 | ||
899 | 370 | fn pick_start_mode(raw_braille: &str, use_only_grade1: bool) -> String { |
900 | // Need to decide what the start mode should be | |
901 | // From http://www.brailleauthority.org/ueb/ueb_math_guidance/final_for_posting_ueb_math_guidance_may_2019_102419.pdf | |
902 | // Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells | |
903 | // or before a single letter standing alone anywhere in the expression, | |
904 | // begin the expression with a grade 1 word indicator (or a passage indicator if the expression includes spaces) | |
905 | // Apparently "only a grade 1 symbol..." means at most one grade 1 symbol based on some examples (GTM 6.4, example 4) | |
906 | // debug!("before determining mode: '{}'", raw_braille); | |
907 | ||
908 | // a bit ugly because we need to store the string if we have cap passage mode | |
909 | 370 | let raw_braille_string = if is_cap_passage_mode_good(raw_braille) { |
910 | 370 | let raw_braille = if raw_braille_string.is_empty() { |
911 | 370 | if use_only_grade1 { |
912 | 1 | return remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage); |
913 | 369 | } |
914 | 369 | let grade2 = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade2, UEB_Duration::Symbol); |
915 | 369 | debug!("Symbol mode: '{}'", grade2); |
916 | ||
917 | 369 | if is_grade2_string_ok(&grade2) { |
918 | 143 | return grade2; |
919 | } else { | |
920 | // BANA says use g1 word mode if spaces are present, but that's not what their examples do | |
921 | // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks) | |
922 | // The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7) | |
923 | 226 | let grade1_word = try_grade1_word_mode(raw_braille); |
924 | 226 | debug!("Word mode: '{}'", grade1_word); |
925 | 226 | if !grade1_word.is_empty() { |
926 | 36 | return grade1_word; |
927 | } else { | |
928 | 190 | let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage); |
929 | 190 | return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄"; |
930 | } | |
931 | } | |
932 | ||
933 | /// Return true if at least five (= # of cap passage indicators) cap indicators and no lower case letters | |
934 | 370 | fn is_cap_passage_mode_good(braille: &str) -> bool { |
935 | 370 | let mut n_caps = 0; |
936 | 370 | let mut is_cap_mode = false; |
937 | 370 | let mut cap_mode = UEB_Duration::Symbol; // real value set when is_cap_mode is set to true |
938 | 370 | let mut chars = braille.chars(); |
939 | ||
940 | // look CL or CCL for caps (CC runs until we get whitespace) | |
941 | // if we find an L not in caps mode, we return false | |
942 | // Note: caps can be C𝐶, whitespace can be W𝐖 | |
943 | 2.03k | while let Some( |
944 | 1.96k | if ch == 'L' { |
945 | 401 | if !is_cap_mode { |
946 | 288 | return false; |
947 | 113 | } |
948 | 113 | chars.next(); // skip letter |
949 | 113 | if cap_mode == UEB_Duration::Symbol { |
950 | 79 | is_cap_mode = false; |
951 | 79 |
|
952 | 1.55k | } else if ch == 'C' || |
953 | 107 | if is_cap_mode { |
954 | 16 | if cap_mode == UEB_Duration::Symbol { |
955 | 12 | cap_mode = UEB_Duration::Word; |
956 | 12 |
|
957 | 91 | } else { |
958 | 91 | is_cap_mode = true; |
959 | 91 | cap_mode = UEB_Duration::Symbol; |
960 | 91 | } |
961 | 107 | n_caps += 1; |
962 | 1.45k | } else if ch == 'W' || |
963 | 119 | if is_cap_mode { |
964 | 2 | assert!(cap_mode == UEB_Duration::Word); |
965 | 117 | } |
966 | 119 | is_cap_mode = false; |
967 | 1.33k | } else if ch == '1' && |
968 | 3 | break; |
969 | 1.33k | } |
970 | } | |
971 | 82 | return n_caps > 4; |
972 | 370 | } |
973 | ||
974 | 3 | fn convert_to_cap_passage_mode(braille: &str) -> String { |
975 | 3 | return "⠠⠠⠠".to_string() + &braille.replace(['C', '𝐶'], "") + "⠠⠄"; |
976 | 3 | } |
977 | ||
978 | /// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2 | |
979 | 369 | fn is_grade2_string_ok(grade2_braille: &str) -> bool { |
980 | // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone. | |
981 | // The exact quote from their guidance: | |
982 | // Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells | |
983 | // or before a single letter standing alone anywhere in the expression, | |
984 | // begin the expression with a grade 1 word indicator | |
985 | // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems | |
986 | // like it is a reasonable thing to do. | |
987 | // Another modification is allow a single G1 indicator to occur after whitespace later on | |
988 | // because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator | |
989 | ||
990 | // Because of the 'L's which go away, we have to put a little more work into finding the first three chars | |
991 | 369 | let chars = grade2_braille.chars().collect::<Vec<char>>(); |
992 | 369 | let mut n_real_chars = 0; // actually number of chars |
993 | 369 | let mut found_g1 = false; |
994 | 369 | let mut i = 0; |
995 | 1.75k | while i < chars.len() { |
996 | 1.75k | let ch = chars[i]; |
997 | 1.75k | if ch == '1' && |
998 | 269 | if found_g1 { |
999 | 19 | return false; |
1000 | 250 | } |
1001 | 250 | found_g1 = true; |
1002 | 1.48k | } else if !"𝐶CLobc".contains(ch) { |
1003 | 1.07k | if n_real_chars == 2 { |
1004 | 347 | i += 1; |
1005 | 347 | break; // this is the third real char |
1006 | 730 | }; |
1007 | 730 | n_real_chars += 1; |
1008 | 407 | } |
1009 | 1.38k | i += 1 |
1010 | } | |
1011 | ||
1012 | // if we find *another* g1 that isn't forced and isn't standing alone, we are done | |
1013 | // I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule | |
1014 | // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it | |
1015 | // Same for GTM 1_7_3_1 (passage mode is mentioned also) | |
1016 | 350 | let mut is_standing_alone_already_encountered = false; |
1017 | 350 | let mut is_after_whitespace = false; |
1018 | 2.43k | while i < chars.len() { |
1019 | 2.29k | let ch = chars[i]; |
1020 | 2.29k | if ch == 'W' { |
1021 | 355 | is_after_whitespace = true; |
1022 | 1.93k | } else if ch == '1' && |
1023 | 235 | if is_standing_alone_already_encountered || |
1024 | 226 | ((found_g1 || |
1025 | 207 | return false; |
1026 | 28 | } |
1027 | 28 | found_g1 = true; |
1028 | 28 | is_standing_alone_already_encountered = true; |
1029 | 1.70k | } |
1030 | 2.08k | i += 1; |
1031 | } | |
1032 | 143 | return true; |
1033 | 369 | } |
1034 | ||
1035 | /// Return true if the sequence of chars forces a '1' at the `i`th position | |
1036 | /// Note: `chars[i]` should be '1' | |
1037 | 930 | fn is_forced_grade1(chars: &[char], i: usize) -> bool { |
1038 | // A '1' is forced if 'a-j' follows a digit | |
1039 | 930 | assert_eq!(chars[i], '1', "'is_forced_grade1' didn't start with '1'"); |
1040 | // check that a-j follows the '1' -- we have '1Lx' where 'x' is the letter to check | |
1041 | 930 | if i+2 < chars.len() && |
1042 | // check for a number before the '1' | |
1043 | // this will be 'N' followed by LETTER_NUMBERS or the number ".", ",", or " " | |
1044 | 25 | for j in ( |
1045 | 25 | let ch = chars[j]; |
1046 | 25 | if !(is_letter_number(unhighlight(ch)) || |
1047 | 12 | return ch == 'N' |
1048 | 13 | } |
1049 | } | |
1050 | 918 | } |
1051 | 918 | return false; |
1052 | 930 | } |
1053 | ||
1054 | 203 | fn is_single_letter_on_right(chars: &[char], i: usize) -> bool { |
1055 | 205 | fn is_skip_char(ch: char) -> bool { |
1056 | 205 |
|
1057 | 205 | } |
1058 | ||
1059 | // find the first char (if any) | |
1060 | 203 | let mut count = 0; // how many letters |
1061 | 203 | let mut i = i+1; |
1062 | 209 | while i < chars.len() { |
1063 | 205 | let ch = chars[i]; |
1064 | 205 | if !is_skip_char(ch) { |
1065 | 204 | if ch == 'L' { |
1066 | 5 | if count == 1 { |
1067 | 0 | return false; // found a second letter in the sequence |
1068 | 5 | } |
1069 | 5 | count += 1; |
1070 | } else { | |
1071 | 199 | return count==1; |
1072 | } | |
1073 | 5 | i += 2; // eat 'L' and actual letter |
1074 | 1 | } else { |
1075 | 1 | i += 1; |
1076 | 1 | } |
1077 | } | |
1078 | 4 | return true; |
1079 | 203 | } |
1080 | ||
1081 | 226 | fn try_grade1_word_mode(raw_braille: &str) -> String { |
1082 | // this isn't quite right, but pretty close -- try splitting at 'W' (words) | |
1083 | // only one of the parts can be in word mode and none of the others can have '1' unless forced | |
1084 | 226 | let mut g1_words = Vec::default(); |
1085 | 226 | let mut found_word_mode = false; |
1086 | 622 | for raw_word in |
1087 | 622 | let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol); |
1088 | // debug!("try_grade1_word_mode: word='{}'", word); | |
1089 | 622 | let word_chars = word.chars().collect::<Vec<char>>(); |
1090 | 622 | let needs_word_mode = word_chars.iter().enumerate() |
1091 | 1.12k | . |
1092 | 622 | if needs_word_mode { |
1093 | 416 | if found_word_mode { |
1094 | 190 | return "".to_string(); |
1095 | 226 | } |
1096 | 226 | found_word_mode = true; |
1097 | 226 | g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word) |
1098 | ); | |
1099 | 206 | } else { |
1100 | 206 | g1_words.push(word); |
1101 | 206 | } |
1102 | } | |
1103 | 36 | return if found_word_mode {g1_words.join("W")} else { |
1104 | 226 | } |
1105 | 370 | } |
1106 | 366 | } |
1107 | ||
1108 | 478 | fn typeface_to_word_mode(braille: &str) -> String { |
1109 | 2 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new("[BI𝔹STD]").unwrap()); |
1110 | // debug!("before typeface fix: '{}'", braille); | |
1111 | ||
1112 | 478 | let mut result = "".to_string(); |
1113 | 478 | let chars = braille.chars().collect::<Vec<char>>(); |
1114 | 478 | let mut word_mode = Vec::with_capacity(5); |
1115 | 478 | let mut word_mode_end = Vec::with_capacity(5); |
1116 | 478 | let mut i = 0; |
1117 | 11.5k | while i < chars.len() { |
1118 | 11.0k | let ch = chars[i]; |
1119 | 11.0k | if HAS_TYPEFACE.is_match(ch.to_string().as_str()) { |
1120 | 8 | let i_next_char_target = find_next_char(&chars[i+1..], ch); |
1121 | 8 | if word_mode.contains(&ch) { |
1122 | 3 | if i_next_char_target.is_none() { |
1123 | 2 | word_mode.retain(|&item| item!=ch); // drop the char since word mode is done |
1124 | 2 | word_mode_end.push(ch); // add the char to signal to add end sequence |
1125 | 1 | } |
1126 | } else { | |
1127 | 5 | result.push(ch); |
1128 | 5 | if i_next_char_target.is_some() { |
1129 | 2 | result.push('w'); // typeface word indicator |
1130 | 2 | word_mode.push(ch); // starting word mode for this char |
1131 | 3 | } else { |
1132 | 3 | result.push('s'); // typeface single char indicator |
1133 | 3 | } |
1134 | } | |
1135 | 8 | i += 1; // eat "B", etc |
1136 | 11.0k | } else if ch == 'L' || |
1137 | 3.70k | result.push(chars[i]); |
1138 | 3.70k | result.push(chars[i+1]); |
1139 | 3.70k | if !word_mode_end.is_empty() && |
1140 | // add terminator unless word sequence is terminated by end of string or whitespace | |
1141 | 1 | for &ch in &word_mode_end { |
1142 | 1 | result.push(ch); |
1143 | 1 | result.push('e'); |
1144 | 1 | }; |
1145 | 1 | word_mode_end.clear(); |
1146 | 3.70k | } |
1147 | 3.70k | i += 2; // eat Ll/Nd |
1148 | 7.30k | } else { |
1149 | 7.30k | result.push(ch); |
1150 | 7.30k | i += 1; |
1151 | 7.30k | } |
1152 | } | |
1153 | 478 | return result; |
1154 | ||
1155 | 478 | } |
1156 | ||
1157 | 478 | fn capitals_to_word_mode(braille: &str) -> String { |
1158 | use std::iter::FromIterator; | |
1159 | // debug!("before capitals fix: '{}'", braille); | |
1160 | ||
1161 | 478 | let mut result = "".to_string(); |
1162 | 478 | let chars = braille.chars().collect::<Vec<char>>(); |
1163 | 478 | let mut is_word_mode = false; |
1164 | 478 | let mut i = 0; |
1165 | // look for a sequence of CLxCLy... and create CCLxLy... | |
1166 | 12.6k | while i < chars.len() { |
1167 | 12.1k | let ch = chars[i]; |
1168 | 12.1k | if ch == 'C' { |
1169 | // '𝑐' should only occur after a 'C', so we don't have top-level check for it | |
1170 | 256 | let mut next_non_cap = i+1; |
1171 | 257 | while let Some( |
1172 | 1 | next_non_cap += i_next + 1; // C/𝑐, L, letter |
1173 | 1 | } |
1174 | 256 | if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..." |
1175 | 63 | if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) { |
1176 | // to me this is tricky -- section 12 modifiers apply to the previous item | |
1177 | // the last clause of the "item" def is the previous indivisible symbol" which ICEB 2.1 say is: | |
1178 | // braille sign: one or more consecutive braille characters comprising a unit, | |
1179 | // consisting of a root on its own or a root preceded by one or more | |
1180 | // prefixes (also referred to as braille symbol) | |
1181 | // this means the capital indicator needs to be stated and can't be part of a word or passage | |
1182 | 1 | is_word_mode = false; |
1183 | 1 | result.push_str(String::from_iter(&chars[i..next_non_cap]).as_str()); |
1184 | 1 | i = next_non_cap; |
1185 | 1 | continue; |
1186 | 62 | } |
1187 | 62 | if is_word_mode { |
1188 | 12 | i += 1; // skip the 'C' |
1189 | 50 | } else { |
1190 | 50 | // start word mode -- need an extra 'C' |
1191 | 50 | result.push('C'); |
1192 | 50 | is_word_mode = true; |
1193 | 50 | } |
1194 | 193 | } else if is_word_mode { |
1195 | 50 | i += 1; // skip the 'C' |
1196 | 143 | } |
1197 | 255 | if chars[next_non_cap] == 'G' { |
1198 | 8 | // Greek letters are a bit exceptional in that the pattern is "CGLx" -- bump 'i' |
1199 | 8 | next_non_cap += 1; |
1200 | 247 | } |
1201 | 255 | if chars[next_non_cap] != 'L' { |
1202 | 0 | error!("capitals_to_word_mode: internal error: didn't find L after C in '{}'.", |
1203 | 0 | chars[i..next_non_cap+2].iter().collect::<String>().as_str()); |
1204 | 255 | } |
1205 | 255 | let i_braille_char = next_non_cap + 2; |
1206 | 255 | result.push_str(String::from_iter(&chars[i..i_braille_char]).as_str()); |
1207 | 255 | i = i_braille_char; |
1208 | 11.9k | } else if ch == 'L' { // must be lowercase -- uppercase consumed above |
1209 | // assert!(LETTERS.contains(&unhighlight(chars[i+1]))); not true for other alphabets | |
1210 | 2.03k | if is_word_mode { |
1211 | 2 | result.push('e'); // terminate Word mode (letter after caps) |
1212 | 2 | is_word_mode = false; |
1213 | 2.03k | } |
1214 | 2.03k | result.push('L'); |
1215 | 2.03k | result.push(chars[i+1]); |
1216 | 2.03k | i += 2; // eat L, letter |
1217 | 9.88k | } else { |
1218 | 9.88k | is_word_mode = false; // non-letters terminate cap word mode |
1219 | 9.88k | result.push(ch); |
1220 | 9.88k | i += 1; |
1221 | 9.88k | } |
1222 | } | |
1223 | 478 | return result; |
1224 | ||
1225 | 63 | fn is_next_char_start_of_section_12_modifier(chars: &[char]) -> bool { |
1226 | // first find the L and eat the char so that we are at the potential start of where the target lies | |
1227 | 63 | let chars_len = chars.len(); |
1228 | 63 | let mut i_cap = 0; |
1229 | 126 | while chars[i_cap] != 'C' { // we know 'C' is in the string, so no need to check for exceeding chars_len |
1230 | 63 | i_cap += 1; |
1231 | 63 | } |
1232 | 73 | for i_end in |
1233 | 73 | if chars[i_end] == 'L' { |
1234 | // skip the next char to get to the real start, and then look for the modifier string or next L/N | |
1235 | // debug!(" after L '{}'", chars[i_end+2..].iter().collect::<String>()); | |
1236 | 65 | for i in |
1237 | 65 | let ch = chars[i]; |
1238 | 65 | if ch == '1' { |
1239 | // Fix: there's probably a much better way to check if we have a match against one of "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱ ", "⠨⠸⠱" | |
1240 | 5 | if chars[i+1] == '⠱' { |
1241 | 0 | return true; |
1242 | 5 | } else if i+2 < chars_len { |
1243 | 5 | let mut str = chars[i+1].to_string(); |
1244 | 5 | str.push(chars[i+2]); |
1245 | 5 | if str == "⠘⠱" || str == "⠘⠲" || str == "⠸⠱" || str == "⠐⠱" { |
1246 | 1 | return true; |
1247 | 4 | } else if i+3 < chars_len { |
1248 | 4 | str.push(chars[i+3]); |
1249 | 4 | return str == "⠨⠸⠱"; |
1250 | 0 | } |
1251 | 0 | return false; |
1252 | 0 | } |
1253 | 60 | } |
1254 | 60 | if ch == 'L' || |
1255 | 48 | return false; |
1256 | 12 | } |
1257 | } | |
1258 | 10 | } |
1259 | } | |
1260 | 10 | return false; |
1261 | 63 | } |
1262 | 478 | } |
1263 | ||
1264 | 521 | fn find_next_char(chars: &[char], target: char) -> Option<usize> { |
1265 | // first find the L or N and eat the char so that we are at the potential start of where the target lies | |
1266 | // debug!("Looking for '{}' in '{}'", target, chars.iter().collect::<String>()); | |
1267 | 610 | for i_end in |
1268 | 610 | if chars[i_end] == 'L' || |
1269 | // skip the next char to get to the real start, and then look for the target | |
1270 | // stop when L/N signals past potential target or we hit some non L/N char (actual braille) | |
1271 | // debug!(" after L/N '{}'", chars[i_end+2..].iter().collect::<String>()); | |
1272 | 521 | for ( |
1273 | 515 | if ch == 'L' || |
1274 | 383 | return None; |
1275 | 132 | } else if ch == target { |
1276 | // debug!(" found target"); | |
1277 | 67 | return Some(i); |
1278 | 65 | } |
1279 | } | |
1280 | 89 | } |
1281 | } | |
1282 | 71 | return None; |
1283 | 521 | } |
1284 | ||
1285 | #[allow(non_camel_case_types)] | |
1286 | #[derive(Debug, PartialEq, Copy, Clone)] | |
1287 | enum UEB_Mode { | |
1288 | Numeric, // also includes Grade1 | |
1289 | Grade1, | |
1290 | Grade2, | |
1291 | } | |
1292 | ||
1293 | #[allow(non_camel_case_types)] | |
1294 | #[derive(Debug, PartialEq, Copy, Clone)] | |
1295 | enum UEB_Duration { | |
1296 | // Standing alone: A braille symbol that is standing alone may have a contracted (grade 2) meaning. | |
1297 | // A letter or unbroken sequence of letters is “standing alone” if the symbols before and after the letter or | |
1298 | // sequence are spaces, hyphens, dashes or any combination thereof, including some common punctuation. | |
1299 | // Item: An “item” is defined as the next symbol or one of seven groupings listed in Rules of Unified English Braille, §11.4.1. | |
1300 | Symbol, | |
1301 | ||
1302 | // The grade 1 word indicator sets grade 1 mode for the next word or symbol sequence. | |
1303 | // A symbol sequence in UEB is defined as an unbroken string of braille signs, | |
1304 | // whether alphabetic or non-alphabetic, preceded and followed by a space. | |
1305 | Word, | |
1306 | Passage, | |
1307 | } | |
1308 | ||
1309 | // used to determine standing alone (on left side) | |
1310 | 4.53k | fn is_left_intervening_char(ch: char) -> bool { |
1311 | 4.53k |
|
1312 | 4.53k | } |
1313 | ||
1314 | /// Return value for use_g1_word_mode() | |
1315 | #[derive(Debug, PartialEq)] | |
1316 | enum Grade1WordIndicator { | |
1317 | NotInWord, // no '𝟙' in the current/next word | |
1318 | InWord, // '𝟙' in the current/next word | |
1319 | NotInChars, // no '𝟙' in the entire string (optimization for common case) | |
1320 | } | |
1321 | ||
1322 | 1.89k | fn remove_unneeded_mode_changes(raw_braille: &str, start_mode: UEB_Mode, start_duration: UEB_Duration) -> String { |
1323 | // FIX: need to be smarter about moving on wrt to typeforms/typefaces, caps, bold/italic. [maybe just let them loop through the default?] | |
1324 | 1.89k | let mut mode = start_mode; |
1325 | 1.89k | let mut duration = start_duration; |
1326 | 1.89k | let mut start_g2_letter = None; // used for start of contraction checks |
1327 | 1.89k | let mut i_g2_start = None; // set to 'i' when entering G2 mode; None in other modes. '1' indicator goes here if standing alone |
1328 | 1.89k | let mut cap_word_mode = false; // only set to true in G2 to prevent contractions |
1329 | 1.89k | let mut result = String::default(); |
1330 | 1.89k | let chars = raw_braille.chars().collect::<Vec<char>>(); |
1331 | 1.89k | let mut g1_word_indicator = Grade1WordIndicator::NotInChars; // almost always true (and often irrelevant) |
1332 | 1.89k | if mode == UEB_Mode::Grade2 || |
1333 | 991 | g1_word_indicator = use_g1_word_mode(&chars); |
1334 | 991 | if g1_word_indicator == Grade1WordIndicator::InWord { |
1335 | 1 | mode = UEB_Mode::Grade1; |
1336 | 1 | if duration == UEB_Duration::Symbol { |
1337 | 1 | duration = UEB_Duration::Word; // if Passage mode, leave as is |
1338 | 1 | result.push('𝟙') |
1339 | 0 | } |
1340 | 990 | } |
1341 | 901 | } |
1342 | 1.89k | let mut i = 0; |
1343 | 37.0k | while i < chars.len() { |
1344 | 35.1k | let ch = chars[i]; |
1345 | 35.1k | match mode { |
1346 | UEB_Mode::Numeric => { | |
1347 | // Numeric Mode: (from https://uebmath.aphtech.org/lesson1.0 and lesson4.0) | |
1348 | // Symbols that can appear within numeric mode include the ten digits, comma, period, simple fraction line, | |
1349 | // line continuation indicator, and numeric space digit symbols. | |
1350 | // A space or any other symbol not listed here terminates numeric mode. | |
1351 | // Numeric mode is also terminated by the "!" -- used after a script | |
1352 | // | |
1353 | // The numeric indicator also turns on grade 1 mode. | |
1354 | // When grade 1 mode is set by the numeric indicator, | |
1355 | // grade 1 indicators are not used unless a single lower-case letter a-j immediately follows a digit. | |
1356 | // Grade 1 mode when set by the numeric indicator is terminated by a space, hyphen, dash, or a grade 1 indicator. | |
1357 | 3.31k | i_g2_start = None; |
1358 | // debug!("Numeric: ch={}, duration: {:?}", ch, duration); | |
1359 | 3.31k | match ch { |
1360 | 'L' => { | |
1361 | // terminate numeric mode -- duration doesn't change | |
1362 | // let the default case handle pushing on the chars for the letter | |
1363 | 1.42k | if is_letter_number(unhighlight(chars[i+1])) { |
1364 | 1.37k | result.push('1'); // need to distinguish a-j from a digit |
1365 | 1.37k |
|
1366 | 1.42k | result.push(ch); |
1367 | 1.42k | i += 1; |
1368 | 1.42k | mode = UEB_Mode::Grade1; |
1369 | // duration remains Word | |
1370 | }, | |
1371 | '1' | '𝟙' => { | |
1372 | // numeric mode implies grade 1, so don't output indicator; | |
1373 | 107 | i += 1; |
1374 | 107 | mode = UEB_Mode::Grade1; |
1375 | 107 | if start_duration == UEB_Duration::Passage { |
1376 | 15 | duration = UEB_Duration::Passage; // otherwise it remains at Word |
1377 | 92 | } |
1378 | }, | |
1379 | '#' => { | |
1380 | // terminate numeric mode -- duration doesn't change | |
1381 | 738 | i += 1; |
1382 | 738 | if i+1 < chars.len() && |
1383 | 9 | // special case where the script was numeric and a letter follows, so need to put out G1 indicator |
1384 | 9 | result.push('1'); |
1385 | 9 | // the G1 case should work with 'L' now |
1386 | 729 | } |
1387 | 738 | mode = UEB_Mode::Grade1; |
1388 | }, | |
1389 | 521 | 'N' => { |
1390 | 521 | // stay in the same mode (includes numeric "," and "." space) -- don't let default get these chars |
1391 | 521 | result.push(chars[i+1]); |
1392 | 521 | i += 2; |
1393 | 521 | }, |
1394 | _ => { | |
1395 | // moving out of numeric mode | |
1396 | 524 | result.push(ch); |
1397 | 524 | i += 1; |
1398 | 524 | if "W𝐖-—―".contains(ch) { |
1399 | 94 | mode = start_mode; |
1400 | 94 | if mode == UEB_Mode::Grade2 { |
1401 | 47 | start_g2_letter = None; // will be set to real letter |
1402 | 47 | } |
1403 | 94 | if start_duration != UEB_Duration::Passage { |
1404 | 47 | duration = UEB_Duration::Symbol; |
1405 | 47 | } |
1406 | } else { | |
1407 | 430 | mode = UEB_Mode::Grade1 |
1408 | } | |
1409 | }, | |
1410 | } | |
1411 | }, | |
1412 | UEB_Mode::Grade1 => { | |
1413 | // Grade 1 Mode: | |
1414 | // The numeric indicator also sets grade 1 mode. | |
1415 | // Grade 1 mode, when initiated by the numeric indicator, is terminated by a space, hyphen, dash or grade 1 terminator. | |
1416 | // Grade 1 mode is also set by grade 1 indicators. | |
1417 | 25.0k | i_g2_start = None; |
1418 | // debug!("Grade 1: ch={}, duration: {:?}", ch, duration); | |
1419 | 25.0k | match ch { |
1420 | 3.34k | 'L' => { |
1421 | 3.34k | // note: be aware of '#' case for Numeric because '1' might already be generated |
1422 | 3.34k | // let prev_ch = if i > 1 {chars[i-1]} else {'1'}; // '1' -- anything beside ',' or '.' |
1423 | 3.34k | // if duration == UEB_Duration::Symbol || |
1424 | 3.34k | // ( ",. ".contains(prev_ch) && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) ) { |
1425 | 3.34k | // result.push('1'); // need to retain grade 1 indicator (RUEB 6.5.2) |
1426 | 3.34k | // } |
1427 | 3.34k | // let the default case handle pushing on the chars for the letter |
1428 | 3.34k | result.push(ch); |
1429 | 3.34k | i += 1; |
1430 | 3.34k | }, |
1431 | '1' | '𝟙' => { | |
1432 | 2.35k | assert!(ch == '1' || |
1433 | // nothing to do -- let the default case handle the following chars | |
1434 | 2.35k | i += 1; |
1435 | }, | |
1436 | 2.36k | 'N' => { |
1437 | 2.36k | result.push(ch); |
1438 | 2.36k | result.push(chars[i+1]); |
1439 | 2.36k | i += 2; |
1440 | 2.36k | mode = UEB_Mode::Numeric; |
1441 | 2.36k | duration = UEB_Duration::Word; |
1442 | 2.36k | }, |
1443 | 'W' | '𝐖' => { | |
1444 | // this terminates a word mode if there was one | |
1445 | 711 | result.push(ch); |
1446 | 711 | i += 1; |
1447 | 711 | if start_duration != UEB_Duration::Passage { |
1448 | 224 | duration = UEB_Duration::Symbol; |
1449 | 224 | mode = UEB_Mode::Grade2; |
1450 | 487 | } |
1451 | }, | |
1452 | _ => { | |
1453 | 16.3k | result.push(ch); |
1454 | 16.3k | i += 1; |
1455 | 16.3k | if duration == UEB_Duration::Symbol && |
1456 | 1.34k | mode = start_mode; |
1457 | 14.9k | } |
1458 | } | |
1459 | } | |
1460 | 25.0k | if mode == UEB_Mode::Grade2 { |
1461 | 1.56k | start_g2_letter = None; // will be set to real letter |
1462 | 23.5k | } |
1463 | ||
1464 | }, | |
1465 | UEB_Mode::Grade2 => { | |
1466 | // note: if we ended up using a '1', it only extends to the next char, which is also dealt with, so mode doesn't change | |
1467 | 6.79k | if i_g2_start.is_none() { |
1468 | 2.58k | i_g2_start = Some(i); |
1469 | 2.58k | cap_word_mode = false; |
1470 | 4.21k | } |
1471 | // debug!("Grade 2: ch={}, duration: {:?}", ch, duration); | |
1472 | 6.79k | match ch { |
1473 | 'L' => { | |
1474 | 1.44k | if start_g2_letter.is_none() { |
1475 | 1.34k | start_g2_letter = Some(i); |
1476 | 1.34k |
|
1477 | 1.44k | let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, i); |
1478 | // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab") | |
1479 | 1.44k | if is_alone && ( |
1480 | 373 | // debug!(" is_alone -- pushing '1'"); |
1481 | 373 | result.push('1'); |
1482 | 373 | mode = UEB_Mode::Grade1; |
1483 | 1.07k | } |
1484 | // debug!(" pushing {:?}", right_matched_chars); | |
1485 | 3.13k |
|
1486 | 1.44k | i += right_matched_chars.len(); |
1487 | }, | |
1488 | 'C' => { | |
1489 | // Want 'C' before 'L'; Could be CC for word cap -- if so, eat it and move on | |
1490 | // Note: guaranteed that there is a char after the 'C', so chars[i+1] is safe | |
1491 | 99 | if chars[i+1] == 'C' { |
1492 | 14 | cap_word_mode = true; |
1493 | 14 | i += 1; |
1494 | 14 | } else { |
1495 | 85 | let is_greek = chars[i+1] == 'G'; |
1496 | 85 | let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, if is_greek { |
1497 | // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab") | |
1498 | 85 | if is_alone && ( |
1499 | 16 | // debug!(" is_alone -- pushing '1'"); |
1500 | 16 | result.push('1'); |
1501 | 16 | mode = UEB_Mode::Grade1; |
1502 | 69 | } |
1503 | 85 | if cap_word_mode { |
1504 | 14 | result.push('C'); // first 'C' if cap word |
1505 | 71 | } |
1506 | 85 | result.push('C'); |
1507 | 85 | if is_greek { |
1508 | 2 | result.push('G'); |
1509 | 2 | i += 1; |
1510 | 83 | } |
1511 | 85 | start_g2_letter = Some(i); |
1512 | // debug!(" pushing 'C' + {:?}", right_matched_chars); | |
1513 | 256 |
|
1514 | 85 | i += 1 + right_matched_chars.len(); |
1515 | } | |
1516 | }, | |
1517 | 1.34k | '1' => { |
1518 | 1.34k | result.push(ch); |
1519 | 1.34k | i += 1; |
1520 | 1.34k | mode = UEB_Mode::Grade1; |
1521 | 1.34k | duration = UEB_Duration::Symbol; |
1522 | 1.34k | }, |
1523 | '𝟙' => { | |
1524 | // '𝟙' should have forced G1 Word mode | |
1525 | 0 | error!("Internal error: '𝟙' found in G2 mode: index={i} in '{raw_braille}'"); |
1526 | 0 | i += 1; |
1527 | } | |
1528 | 582 | 'N' => { |
1529 | 582 | result.push(ch); |
1530 | 582 | result.push(chars[i+1]); |
1531 | 582 | i += 2; |
1532 | 582 | mode = UEB_Mode::Numeric; |
1533 | 582 | duration = UEB_Duration::Word; |
1534 | 582 | }, |
1535 | _ => { | |
1536 | 3.32k | if let Some( |
1537 | 505 | if !cap_word_mode { |
1538 | 504 | result = handle_contractions(&chars[start..i], result); |
1539 | 504 |
|
1540 | 505 | cap_word_mode = false; |
1541 | 505 | start_g2_letter = None; // not start of char sequence |
1542 | 2.81k | } |
1543 | 3.32k | result.push(ch); |
1544 | 3.32k | i += 1; |
1545 | 3.32k | if !is_left_intervening_char(ch) { |
1546 | 3.29k | cap_word_mode = false; |
1547 | 3.29k | i_g2_start = Some(i); |
1548 | 3.29k |
|
1549 | ||
1550 | } | |
1551 | } | |
1552 | 6.79k | if mode != UEB_Mode::Grade2 && |
1553 | 2.30k | let Some( |
1554 | 883 | result = handle_contractions(&chars[start..i], result); |
1555 | 883 | start_g2_letter = None; // not start of char sequence |
1556 | 5.91k | } |
1557 | }, | |
1558 | } | |
1559 | ||
1560 | 35.1k | if (ch == 'W' || |
1561 | 602 | (mode == UEB_Mode::Grade2 || |
1562 | 602 | g1_word_indicator = use_g1_word_mode(&chars[i..]); |
1563 | 602 | if g1_word_indicator == Grade1WordIndicator::InWord { |
1564 | 1 | mode = UEB_Mode::Grade1; |
1565 | 1 | if duration == UEB_Duration::Symbol { |
1566 | 1 | duration = UEB_Duration::Word; // if Passage mode, leave as is |
1567 | 1 | result.push('𝟙') |
1568 | 0 | } |
1569 | 601 | } |
1570 | 34.5k | } |
1571 | } | |
1572 | 1.89k | if mode == UEB_Mode::Grade2 && |
1573 | 289 | let Some( |
1574 | 31 | result = handle_contractions(&chars[start..i], result); |
1575 | 1.86k | } |
1576 | ||
1577 | 1.89k | return result; |
1578 | ||
1579 | ||
1580 | 1.59k | fn use_g1_word_mode(chars: &[char]) -> Grade1WordIndicator { |
1581 | // debug!("use_g1_word_mode: chars='{:?}'", chars); | |
1582 | 19.5k | for &ch in |
1583 | 19.5k | if ch == 'W' || |
1584 | 601 | return Grade1WordIndicator::NotInWord; // reached a word boundary |
1585 | 18.9k | } |
1586 | 18.9k | if ch == '𝟙' { |
1587 | 2 | return Grade1WordIndicator::InWord; // need word mode in this "word" |
1588 | 18.9k | } |
1589 | } | |
1590 | 990 | return Grade1WordIndicator::NotInChars; // |
1591 | 1.59k | } |
1592 | 1.89k | } |
1593 | ||
1594 | /// Returns a tuple: | |
1595 | /// true if the ith char "stands alone" (UEB 2.6) | |
1596 | /// the chars on the right that are part of the standing alone sequence | |
1597 | /// the number of letters in that sequence | |
1598 | /// This basically means a letter sequence surrounded by white space with some potentially intervening chars | |
1599 | /// The intervening chars can be typeform/cap indicators, along with various forms of punctuation | |
1600 | /// The ith char should be an "L" | |
1601 | /// This assumes that there is whitespace before and after the character string | |
1602 | 1.52k | fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) { |
1603 | // scan backward and check the conditions for "standing-alone" | |
1604 | // we scan forward and check the conditions for "standing-alone" | |
1605 | 1.52k | assert_eq!(chars[i], 'L', "'stands_alone' starts with non 'L'"); |
1606 | // debug!("stands_alone: i={}, chars: {:?}", i, chars); | |
1607 | 1.52k | if !left_side_stands_alone(&chars[0..i]) { |
1608 | 977 | return (false, &chars[i..i+2], 0); |
1609 | 552 | } |
1610 | ||
1611 | 552 | let (mut is_alone, n_letters, n_right_matched) = right_side_stands_alone(&chars[i+2..]); |
1612 | // debug!("left is alone, right is alone: {}, : n_letters={}, n_right_matched={}", is_alone, n_letters, n_right_matched); | |
1613 | ||
1614 | 552 | if is_alone && |
1615 | 390 | let ch = chars[i+1]; |
1616 | 390 | if ch=='⠁' || |
1617 | 3 | is_alone = false; |
1618 | 387 | } |
1619 | 162 | } |
1620 | 552 | return (is_alone, &chars[i..i+2+n_right_matched], n_letters); |
1621 | ||
1622 | /// chars before 'L' | |
1623 | 1.52k | fn left_side_stands_alone(chars: &[char]) -> bool { |
1624 | // scan backwards to skip letters and intervening chars | |
1625 | // once we hit an intervening char, only intervening chars are allowed if standing alone | |
1626 | 1.52k | let mut intervening_chars_mode = false; // true when we are on the final stretch |
1627 | 1.52k | let mut i = chars.len(); |
1628 | 1.86k | while i > 0 { |
1629 | 1.38k | i -= 1; |
1630 | 1.38k | let ch = chars[i]; |
1631 | 1.38k | let prev_ch = if i > 0 { |
1632 | // debug!(" left alone: prev/ch {}/{}", prev_ch, ch); | |
1633 | 1.38k | if (!intervening_chars_mode && |
1634 | 1.30k | (prev_ch == 'o' || |
1635 | 174 | intervening_chars_mode = true; |
1636 | 174 | i -= 1; // ignore 'Lx' and also ignore 'ox' |
1637 | 1.21k | } else if is_left_intervening_char(ch) { |
1638 | 161 | intervening_chars_mode = true; |
1639 | 161 | } else { |
1640 | 1.05k | return "W𝐖-—―".contains(ch); |
1641 | } | |
1642 | } | |
1643 | ||
1644 | 475 | return true; |
1645 | 1.52k | } |
1646 | ||
1647 | // chars after character we are testing | |
1648 | 552 | fn right_side_stands_alone(chars: &[char]) -> (bool, usize, usize) { |
1649 | // see RUEB 2.6.3 | |
1650 | 355 | fn is_right_intervening_char(ch: char) -> bool { |
1651 | 355 |
|
1652 | 355 | } |
1653 | // scan forward to skip letters and intervening chars | |
1654 | // once we hit an intervening char, only intervening chars are allowed if standing alone ('c' and 'b' are part of them) | |
1655 | 552 | let mut intervening_chars_mode = false; // true when we are on the final stretch |
1656 | 552 | let mut i = 0; |
1657 | 552 | let mut n_letters = 1; // we have skipped the first letter |
1658 | 725 | while i < chars.len() { |
1659 | 515 | let ch = chars[i]; |
1660 | // debug!(" right alone: ch/next {}/{}", ch, if i+1<chars.len() {chars[i+1]} else {' '}); | |
1661 | 515 | if !intervening_chars_mode && |
1662 | 140 | n_letters += 1; |
1663 | 140 | i += 1; // ignore 'Lx' and also ignore 'ox' |
1664 | 375 | } else if ch == 'c' || |
1665 | 20 | i += 1; // ignore 'Lx' and also ignore 'ox' |
1666 | 355 | } else if is_right_intervening_char(ch) { |
1667 | 13 | intervening_chars_mode = true; |
1668 | 13 | } else { |
1669 | 342 | return if "W𝐖-—―".contains(ch) { |
1670 | } | |
1671 | 173 | i += 1; |
1672 | } | |
1673 | ||
1674 | 210 | return (true, n_letters, chars.len()); |
1675 | 552 | } |
1676 | 1.52k | } |
1677 | ||
1678 | ||
1679 | /// Return a modified result if chars can be contracted. | |
1680 | /// Otherwise, the original string is returned | |
1681 | 1.41k | fn handle_contractions(chars: &[char], mut result: String) -> String { |
1682 | struct Replacement { | |
1683 | pattern: String, | |
1684 | replacement: &'static str | |
1685 | } | |
1686 | ||
1687 | const ASCII_TO_UNICODE: &[char] = &[ | |
1688 | '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌', | |
1689 | '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹', | |
1690 | '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕', | |
1691 | '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸', | |
1692 | ]; | |
1693 | ||
1694 | 36 | fn to_unicode_braille(ascii: &str) -> String { |
1695 | 36 | let mut unicode = String::with_capacity(4*ascii.len()); // 'L' + 3 bytes for braille char |
1696 | 82 | for ch in |
1697 | 82 | unicode.push('L'); |
1698 | 82 | unicode.push(ASCII_TO_UNICODE[(ch.to_ascii_uppercase() - 32) as usize]) |
1699 | } | |
1700 | 36 | return unicode; |
1701 | 36 | } |
1702 | ||
1703 | // It would be much better from an extensibility point of view to read the table in from a file | |
1704 | 2 | static CONTRACTIONS: LazyLock<Vec<Replacement>> = LazyLock::new(|| { vec![ |
1705 | // 10.3: Strong contractions | |
1706 | 2 | Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"}, |
1707 | 2 | Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"}, |
1708 | 2 | Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"}, |
1709 | 2 | Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"}, |
1710 | 2 | Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"}, |
1711 | ||
1712 | // 10.8: final-letter group signs (this need to precede 'en' and any other shorter contraction) | |
1713 | 2 | Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment |
1714 | 2 | Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion |
1715 | ||
1716 | // 10.4: Strong group signs | |
1717 | 2 | Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"}, |
1718 | 2 | Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"}, |
1719 | 2 | Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"}, |
1720 | 2 | Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"}, |
1721 | 2 | Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"}, |
1722 | 2 | Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"}, |
1723 | 2 | Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"}, |
1724 | 2 | Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"}, |
1725 | 2 | Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"}, |
1726 | 2 | Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"}, |
1727 | 2 | Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" }, // 'ing', not at start |
1728 | 2 | Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"}, |
1729 | ||
1730 | // 10.6.5: Lower group signs preceded and followed by letters | |
1731 | // FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package | |
1732 | // Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words | |
1733 | // Add it back after implementing a lookup dictionary of exceptions | |
1734 | 2 | Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" }, // ea |
1735 | 2 | Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" }, // bb |
1736 | // Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" }, // cc | |
1737 | 2 | Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" }, // ff |
1738 | 2 | Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" }, // gg |
1739 | ||
1740 | // 10.6.8: Lower group signs ("in" also 10.5.4 lower word signs) | |
1741 | // FIX: these need restrictions about only applying when upper dots are present | |
1742 | 2 | Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"}, |
1743 | 2 | Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"}, |
1744 | ||
1745 | ] | |
1746 | 2 | }); |
1747 | ||
1748 | 2 | static CONTRACTION_PATTERNS: LazyLock<RegexSet> = LazyLock::new(|| init_patterns(&CONTRACTIONS)); |
1749 | 1 | static CONTRACTION_REGEX: LazyLock<Vec<Regex>> = LazyLock::new(|| init_regex(&CONTRACTIONS)); |
1750 | ||
1751 | 1.41k | let mut chars_as_str = chars.iter().collect::<String>(); |
1752 | // debug!(" handle_contractions: examine '{}'", &chars_as_str); | |
1753 | 1.41k | let matches = CONTRACTION_PATTERNS.matches(&chars_as_str); |
1754 | 1.41k | for |
1755 | 35 | let element = &CONTRACTIONS[i]; |
1756 | 35 | // debug!(" replacing '{}' with '{}' in '{}'", element.pattern, element.replacement, &chars_as_str); |
1757 | 35 | result.truncate(result.len() - chars_as_str.len()); |
1758 | 35 | chars_as_str = CONTRACTION_REGEX[i].replace_all(&chars_as_str, element.replacement).to_string(); |
1759 | 35 | result.push_str(&chars_as_str); |
1760 | 35 | // debug!(" result after replace '{}'", result); |
1761 | 35 | } |
1762 | 1.41k | return result; |
1763 | ||
1764 | ||
1765 | ||
1766 | 2 | fn init_patterns(contractions: &[Replacement]) -> RegexSet { |
1767 | 2 | let mut vec: Vec<&str> = Vec::with_capacity(contractions.len()); |
1768 | 50 | for contraction in |
1769 | 50 | vec.push(&contraction.pattern); |
1770 | 50 | } |
1771 | 2 | return RegexSet::new(&vec).unwrap(); |
1772 | 2 | } |
1773 | ||
1774 | 1 | fn init_regex(contractions: &[Replacement]) -> Vec<Regex> { |
1775 | 1 | let mut vec = Vec::with_capacity(contractions.len()); |
1776 | 25 | for contraction in |
1777 | 25 | vec.push(Regex::new(&contraction.pattern).unwrap()); |
1778 | 25 | } |
1779 | 1 | return vec; |
1780 | 1 | } |
1781 | 1.41k | } |
1782 | ||
1783 | ||
1784 | ||
1785 | ||
1786 | static VIETNAM_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1787 | "S" => "XXX", // sans-serif -- from prefs | |
1788 | "B" => "⠘", // bold | |
1789 | "𝔹" => "XXX", // blackboard -- from prefs | |
1790 | "T" => "⠈", // script | |
1791 | "I" => "⠨", // italic | |
1792 | "R" => "", // roman | |
1793 | // "E" => "⠰", // English | |
1794 | "1" => "⠠", // Grade 1 symbol | |
1795 | "L" => "", // Letter left in to assist in locating letters | |
1796 | "D" => "XXX", // German (Deutsche) -- from prefs | |
1797 | "G" => "⠰", // Greek | |
1798 | "V" => "XXX", // Greek Variants | |
1799 | // "H" => "⠠⠠", // Hebrew | |
1800 | // "U" => "⠈⠈", // Russian | |
1801 | "C" => "⠨", // capital | |
1802 | "𝑐" => "", // second or latter braille cell of a capital letter | |
1803 | "𝐶" => "⠨", // capital that never should get word indicator (from chemical element) | |
1804 | "N" => "⠼", // number indicator | |
1805 | "t" => "⠱", // shape terminator | |
1806 | "W" => "⠀", // whitespace" | |
1807 | "𝐖"=> "⠀", // whitespace | |
1808 | "s" => "⠆", // typeface single char indicator | |
1809 | "w" => "", // typeface word indicator | |
1810 | "e" => "", // typeface & capital terminator | |
1811 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
1812 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
1813 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
1814 | "," => "⠂", // comma | |
1815 | "." => "⠲", // period | |
1816 | "-" => "-", // hyphen | |
1817 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
1818 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
1819 | "#" => "", // signals end of script | |
1820 | "!" => "", // Hack used to prevent some regular expression matches | |
1821 | }; | |
1822 | ||
1823 | 112 | fn vietnam_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
1824 | // Deal with Vietnamese "rhymes" -- moving accents around | |
1825 | // See "Vietnamese Uncontracted Braille Update in MathCAT" or maybe https://icanreadvietnamese.com/blog/14-rule-of-tone-mark-placement | |
1826 | // Note: I don't know how to write (for example) I_E_RULE so that it excludes "qu" and "gi", so I use two rules | |
1827 | // The first rule rewrites the patterns with "qu" and "gi" to add "!" to prevent a match of the second rule -- "!" is dropped later | |
1828 | 1 | static QU_GI_RULE_EXCEPTION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(L⠟L⠥|L⠛L⠊)").unwrap()); |
1829 | 1 | static IUOY_E_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠊|⠥|⠕|⠽)(L[⠔⠰⠢⠤⠠])L(⠑|⠣)").unwrap()); // ie, ue, oe, and ye rule |
1830 | 1 | static UO_A_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠥|⠕)(L[⠔⠰⠢⠤⠠])L(⠁|⠡|⠜)").unwrap()); // ua, oa rule |
1831 | 1 | static UU_O_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L(⠥|⠳)(L[⠔⠰⠢⠤⠠])L(⠪|⠹)").unwrap()); // uo, ưo rule |
1832 | 1 | static UYE_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽L⠣").unwrap()); // uo, ưo rule |
1833 | 1 | static UY_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽").unwrap()); // uo, ưo rule |
1834 | 1 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb!])").unwrap()); |
1835 | // debug!("vietnam_cleanup: start={}", raw_braille); | |
1836 | 112 | let result = typeface_to_word_mode(&raw_braille); |
1837 | 112 | let result = capitals_to_word_mode(&result); |
1838 | ||
1839 | 112 | let result = result.replace("tW", "W"); |
1840 | 112 | let result = result.replace("CG", "⠸"); // capital Greek letters are problematic in Vietnam braille |
1841 | 112 | let result = result.replace("CC", "⠸"); // capital word more is the same as capital Greek letters |
1842 | // debug!(" after typeface/caps={}", &result); | |
1843 | ||
1844 | // deal with "rhymes" | |
1845 | 112 | let result = QU_GI_RULE_EXCEPTION.replace_all(&result, "${1}!"); |
1846 | // debug!(" after except={}", &result); | |
1847 | 112 | let result = IUOY_E_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1848 | // debug!(" after IUOY_E={}", &result); | |
1849 | 112 | let result = UO_A_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1850 | // debug!(" after UO_A={}", &result); | |
1851 | 112 | let result = UU_O_RULE.replace_all(&result, "${2}L${1}L${3}"); |
1852 | // debug!(" after UO_O={}", &result); | |
1853 | 112 | let result = UYE_RULE.replace_all(&result, "${1}L⠥L⠽L⠣"); // longer match first |
1854 | // debug!(" after UYE={}", &result); | |
1855 | 112 | let result = UY_RULE.replace_all(&result, "${1}L⠥L⠽"); |
1856 | // debug!(" after UY={}", &result); | |
1857 | ||
1858 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
1859 | 112 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
1860 | 112 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
1861 | 112 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
1862 | 112 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
1863 | ||
1864 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
1865 | 112 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
1866 | ||
1867 | ||
1868 | 1.23k | let |
1869 | 1.23k | let matched_char = &cap[0]; |
1870 | 1.23k | match matched_char { |
1871 | 1.23k | "𝔹" => |
1872 | 1.23k | "S" => |
1873 | 1.23k | "D" => |
1874 | 1.23k | "V" => |
1875 | 1.23k | _ => match VIETNAM_INDICATOR_REPLACEMENTS.get(matched_char) { |
1876 | 0 | None => {error!("REPLACE_INDICATORS and VIETNAM_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
1877 | 1.23k | Some(&ch) => ch, |
1878 | }, | |
1879 | } | |
1880 | 1.23k | }); |
1881 | ||
1882 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
1883 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
1884 | 112 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
1885 | ||
1886 | 112 | return result.to_string(); |
1887 | 112 | } |
1888 | ||
1889 | ||
1890 | static CMU_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1891 | // "S" => "XXX", // sans-serif -- from prefs | |
1892 | "B" => "⠔", // bold | |
1893 | "𝔹" => "⠬", // blackboard -- from prefs | |
1894 | // "T" => "⠈", // script | |
1895 | "I" => "⠔", // italic -- same as bold | |
1896 | // "R" => "", // roman | |
1897 | // "E" => "⠰", // English | |
1898 | "1" => "⠐", // Grade 1 symbol -- used here for a-j after number | |
1899 | "L" => "", // Letter left in to assist in locating letters | |
1900 | "D" => "⠠", // German (Gothic) | |
1901 | "G" => "⠈", // Greek | |
1902 | "V" => "⠈⠬", // Greek Variants | |
1903 | // "H" => "⠠⠠", // Hebrew | |
1904 | // "U" => "⠈⠈", // Russian | |
1905 | "C" => "⠨", // capital | |
1906 | "𝐶" => "⠨", // capital that never should get word indicator (from chemical element) | |
1907 | "N" => "⠼", // number indicator | |
1908 | "𝑁" => "", // continue number | |
1909 | // "t" => "⠱", // shape terminator | |
1910 | "W" => "⠀", // whitespace" | |
1911 | "𝐖"=> "⠀", // whitespace | |
1912 | // "𝘄" => "⠀", // add whitespace if char to the left has dots 1, 2, or 3 -- special rule handled separately, so commented out | |
1913 | "s" => "", // typeface single char indicator | |
1914 | // "w" => "⠂", // typeface word indicator | |
1915 | // "e" => "⠄", // typeface & capital terminator | |
1916 | // "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
1917 | // "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
1918 | // "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
1919 | "," => "⠂", // comma | |
1920 | "." => "⠄", // period | |
1921 | "-" => "⠤", // hyphen | |
1922 | "—" => "⠤⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
1923 | // "―" => "⠐⠤⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
1924 | "#" => "⠼", // signals to end/restart of numeric mode (mixed fractions) | |
1925 | }; | |
1926 | ||
1927 | ||
1928 | 372 | fn cmu_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
1929 | 2 | static ADD_WHITE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"𝘄(.)|𝘄$").unwrap()); |
1930 | ||
1931 | // debug!("cmu_cleanup: start={}", raw_braille); | |
1932 | // let result = typeface_to_word_mode(&raw_braille); | |
1933 | ||
1934 | // let result = result.replace("tW", "W"); | |
1935 | 372 | let result = raw_braille.replace("CG", "⠘") |
1936 | 372 | .replace("𝔹C", "⠩") |
1937 | 372 | .replace("DC", "⠰"); |
1938 | // let result = result.replace("CC", "⠸"); | |
1939 | ||
1940 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
1941 | // let double_struck = pref_manager.pref_to_string("CMU_DoubleStruck"); | |
1942 | // let sans_serif = pref_manager.pref_to_string("CMU_SansSerif"); | |
1943 | // let fraktur = pref_manager.pref_to_string("CMU_Fraktur"); | |
1944 | ||
1945 | // debug!("Before remove mode changes: '{}'", &result); | |
1946 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
1947 | 372 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
1948 | 372 | let result = result.replace("𝑁N", ""); |
1949 | // debug!(" After remove mode changes: '{}'", &result); | |
1950 | ||
1951 | 2.58k | let |
1952 | 2.58k | match CMU_INDICATOR_REPLACEMENTS.get(&cap[0]) { |
1953 | 0 | None => {error!("REPLACE_INDICATORS and CMU_INDICATOR_REPLACEMENTS are not in sync"); ""}, |
1954 | 2.58k | Some(&ch) => ch, |
1955 | } | |
1956 | 2.58k | }); |
1957 | 372 | let result = ADD_WHITE_SPACE.replace_all(&result, |cap: &Captures| |
1958 | 12 | if cap.get(1).is_none() { |
1959 | 2 | return "⠀".to_string(); |
1960 | } else { | |
1961 | // debug!("ADD_WHITE_SPACE match='{}', has left dots = {}", &cap[1], has_left_dots(cap[1].chars().next().unwrap())); | |
1962 | 10 | let mut next_chars = cap[1].chars(); |
1963 | 10 | let next_char = next_chars.next().unwrap(); |
1964 | 10 | assert!(next_chars.next().is_none()); |
1965 | 10 | return (if has_left_dots(next_char) { |
1966 | } | |
1967 | 12 | }); |
1968 | ||
1969 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
1970 | 372 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
1971 | 372 | let result = result.trim_start_matches('⠀'); // don't trip end (e.g., see once::vector_11_2_5) |
1972 | 372 | return result.to_string(); |
1973 | ||
1974 | 10 | fn has_left_dots(ch: char) -> bool { |
1975 | // Unicode braille is set up so dot 1 is 2^0, dot 2 is 2^1, etc | |
1976 | 10 | return ( (ch as u32 - 0x2800) >> 4 ) > 0; |
1977 | 10 | } |
1978 | 372 | } |
1979 | ||
1980 | ||
1981 | ||
1982 | static SWEDISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
1983 | // FIX: this needs cleaning up -- not all of these are used | |
1984 | "S" => "XXX", // sans-serif -- from prefs | |
1985 | "B" => "⠨", // bold | |
1986 | "𝔹" => "XXX", // blackboard -- from prefs | |
1987 | "T" => "⠈", // script | |
1988 | "I" => "⠨", // italic | |
1989 | "R" => "", // roman | |
1990 | "1" => "⠱", // Grade 1 symbol (used for number followed by a letter) | |
1991 | "L" => "", // Letter left in to assist in locating letters | |
1992 | "D" => "XXX", // German (Deutsche) -- from prefs | |
1993 | "G" => "⠰", // Greek | |
1994 | "V" => "XXX", // Greek Variants | |
1995 | // "H" => "⠠⠠", // Hebrew | |
1996 | // "U" => "⠈⠈", // Russian | |
1997 | "C" => "⠠", // capital | |
1998 | "𝑐" => "", // second or latter braille cell of a capital letter | |
1999 | "𝐶" => "⠠", // capital that never should get word indicator (from chemical element) | |
2000 | "N" => "⠼", // number indicator | |
2001 | "t" => "⠱", // shape terminator | |
2002 | "W" => "⠀", // whitespace" | |
2003 | "𝐖"=> "⠀", // whitespace | |
2004 | "w" => "⠀", // whitespace after function name | |
2005 | "s" => "", // typeface single char indicator | |
2006 | "e" => "", // typeface & capital terminator | |
2007 | "E" => "⠱", // empty base -- see index of radical | |
2008 | "o" => "", // flag that what follows is an open indicator (used for standing alone rule) | |
2009 | "c" => "", // flag that what follows is an close indicator (used for standing alone rule) | |
2010 | "b" => "", // flag that what follows is an open or close indicator (used for standing alone rule) | |
2011 | "," => "⠂", // comma | |
2012 | "." => "⠲", // period | |
2013 | "-" => "-", // hyphen | |
2014 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
2015 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
2016 | "#" => "", // signals end of script | |
2017 | ||
2018 | }; | |
2019 | ||
2020 | ||
2021 | static FINNISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! { | |
2022 | // FIX: this needs cleaning up -- not all of these are used | |
2023 | "S" => "XXX", // sans-serif -- from prefs | |
2024 | "B" => "⠨", // bold | |
2025 | "𝔹" => "XXX", // blackboard -- from prefs | |
2026 | "T" => "⠈", // script | |
2027 | "I" => "⠨", // italic | |
2028 | "R" => "", // roman | |
2029 | "E" => "⠰", // English | |
2030 | "1" => "⠀", // Grade 1 symbol (used for number followed by a letter) | |
2031 | "L" => "", // Letter left in to assist in locating letters | |
2032 | "D" => "XXX", // German (Deutsche) -- from prefs | |
2033 | "G" => "⠨", // Greek | |
2034 | "V" => "XXX", // Greek Variants | |
2035 | // "H" => "⠠⠠", // Hebrew | |
2036 | // "U" => "⠈⠈", // Russian | |
2037 | "C" => "⠠", // capital | |
2038 | "𝑐" => "", // second or latter braille cell of a capital letter | |
2039 | "𝐶" => "⠠", // capital that never should get whitespace in front (from chemical element) | |
2040 | "N" => "⠼", // number indicator | |
2041 | "n" => "⠼", // number indicator for drop numbers (special case with close parens) | |
2042 | "t" => "⠱", // shape terminator | |
2043 | "W" => "⠀", // whitespace" | |
2044 | "𝐖"=> "⠀", // whitespace | |
2045 | "s" => "⠆", // typeface single char indicator | |
2046 | "w" => "", // typeface word indicator | |
2047 | "e" => "", // typeface & capital terminator | |
2048 | "," => "⠂", // comma | |
2049 | "." => "⠲", // period | |
2050 | "-" => "-", // hyphen | |
2051 | "—" => "⠠⠤", // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3] | |
2052 | "―" => "⠐⠠⠤", // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3] | |
2053 | "(" => "⠦", // Not really needed, but done for consistency with ")" | |
2054 | ")" => "⠴", // Needed for rules with drop numbers to avoid mistaking for dropped 0 | |
2055 | "↑" => "⠬", // superscript | |
2056 | "↓" => "⠡", // subscript | |
2057 | "#" => "", // signals end of script | |
2058 | "Z" => "⠐", // signals end of index of root, integrand/lim from function ("zone change") | |
2059 | ||
2060 | }; | |
2061 | ||
2062 | 0 | fn finnish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2063 | 0 | static REPLACE_INDICATORS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([SB𝔹TIREDGVHUP𝐏C𝐶LlMmb↑↓Nn𝑁WwZ,()])").unwrap()); |
2064 | // Numbers need to end with a space, but sometimes there is one there for other reasons | |
2065 | 0 | static DROP_NUMBER_SEPARATOR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(n.)\)").unwrap()); |
2066 | 0 | static NUMBER_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"((N.)+[^WN𝐶#↑↓Z])").unwrap()); |
2067 | ||
2068 | // debug!("finnish_cleanup: start={}", raw_braille); | |
2069 | 0 | let result = DROP_NUMBER_SEPARATOR.replace_all(&raw_braille, |cap: &Captures| { |
2070 | // match includes the char after the number -- insert the whitespace before it | |
2071 | // debug!("DROP_NUMBER_SEPARATOR match='{}'", &cap[1]); | |
2072 | 0 | return cap[1].to_string() + "𝐶)"; // hack to use "𝐶" instead of dot 6 directly, but works for NUMBER_MATCH |
2073 | 0 | }); |
2074 | 0 | let result = result.replace('n', "N"); // avoids having to modify remove_unneeded_mode_changes() |
2075 | 0 | let result = NUMBER_MATCH.replace_all(&result, |cap: &Captures| { |
2076 | // match includes the char after the number -- insert the whitespace before it | |
2077 | // debug!("NUMBER_MATCH match='{}'", &cap[1]); | |
2078 | 0 | let mut chars = cap[0].chars(); |
2079 | 0 | let last_char = chars.next_back().unwrap(); // unwrap safe since several chars were matched |
2080 | 0 | return chars.as_str().to_string() + "W" + &last_char.to_string(); |
2081 | 0 | }); |
2082 | ||
2083 | // FIX: need to implement this -- this is just a copy of the Vietnam code | |
2084 | 0 | let result = result.replace("CG", "⠘") |
2085 | 0 | .replace("𝔹C", "⠩") |
2086 | 0 | .replace("DC", "⠰"); |
2087 | ||
2088 | // debug!(" after typeface/caps={}", &result); | |
2089 | ||
2090 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
2091 | 0 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
2092 | 0 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
2093 | 0 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
2094 | 0 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
2095 | ||
2096 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
2097 | 0 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
2098 | // debug!(" remove_unneeded_mode_changes={}", &result); | |
2099 | ||
2100 | ||
2101 | 0 | let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| { |
2102 | 0 | let matched_char = &cap[0]; |
2103 | 0 | match matched_char { |
2104 | 0 | "𝔹" => &double_struck, |
2105 | 0 | "S" => &sans_serif, |
2106 | 0 | "D" => &fraktur, |
2107 | 0 | "V" => &greek_variant, |
2108 | 0 | _ => match FINNISH_INDICATOR_REPLACEMENTS.get(matched_char) { |
2109 | 0 | None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
2110 | 0 | Some(&ch) => ch, |
2111 | }, | |
2112 | } | |
2113 | 0 | }); |
2114 | ||
2115 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
2116 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
2117 | 0 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
2118 | ||
2119 | 0 | return result.to_string(); |
2120 | 0 | } |
2121 | ||
2122 | ||
2123 | 0 | fn swedish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2124 | // FIX: need to implement this -- this is just a copy of the Vietnam code | |
2125 | // Empty bases are ok if they follow whitespace | |
2126 | 0 | static EMPTY_BASE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(^|[W𝐖w])E").unwrap()); |
2127 | // debug!("swedish_cleanup: start={}", raw_braille); | |
2128 | 0 | let result = typeface_to_word_mode(&raw_braille); |
2129 | 0 | let result = capitals_to_word_mode(&result); |
2130 | ||
2131 | 0 | let result = result.replace("CG", "⠘") |
2132 | 0 | .replace("𝔹C", "⠩") |
2133 | 0 | .replace("DC", "⠰"); |
2134 | ||
2135 | // debug!(" after typeface/caps={}", &result); | |
2136 | ||
2137 | // these typeforms need to get pulled from user-prefs as they are transcriber-defined | |
2138 | 0 | let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck"); |
2139 | 0 | let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif"); |
2140 | 0 | let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur"); |
2141 | 0 | let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant"); |
2142 | ||
2143 | // This reuses the code just for getting rid of unnecessary "L"s and "N"s | |
2144 | 0 | let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage); |
2145 | // debug!(" after removing mode changes={}", &result); | |
2146 | ||
2147 | ||
2148 | 0 | let result = EMPTY_BASE.replace_all(&result, "$1"); |
2149 | 0 | let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| { |
2150 | 0 | let matched_char = &cap[0]; |
2151 | 0 | match matched_char { |
2152 | 0 | "𝔹" => &double_struck, |
2153 | 0 | "S" => &sans_serif, |
2154 | 0 | "D" => &fraktur, |
2155 | 0 | "V" => &greek_variant, |
2156 | 0 | _ => match SWEDISH_INDICATOR_REPLACEMENTS.get(matched_char) { |
2157 | 0 | None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""}, |
2158 | 0 | Some(&ch) => ch, |
2159 | }, | |
2160 | } | |
2161 | 0 | }); |
2162 | ||
2163 | // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces | |
2164 | // let result = result.trim_start_matches('⠀').trim_end_matches('⠀'); | |
2165 | 0 | let result = COLLAPSE_SPACES.replace_all(&result, "⠀"); |
2166 | ||
2167 | 0 | return result.to_string(); |
2168 | 0 | } |
2169 | ||
2170 | #[allow(non_snake_case)] | |
2171 | 50 | fn LaTeX_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2172 | 1 | static REMOVE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" ([\^_,;)\]}])").unwrap()); // '^', '_', ',', ';', ')', ']', '}' |
2173 | 1 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap()); |
2174 | // debug!("LaTeX_cleanup: start={}", raw_braille); | |
2175 | 50 | let result = raw_braille.replace('𝐖', " "); |
2176 | // let result = COLLAPSE_SPACES.replace_all(&raw_braille, "⠀"); | |
2177 | 50 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2178 | // debug!("After collapse: {}", &result); | |
2179 | 50 | let result = REMOVE_SPACE.replace_all(&result, "$1"); |
2180 | // debug!("After remove: {}", &result); | |
2181 | // let result = result.trim_matches('⠀'); | |
2182 | 50 | let result = result.trim_matches(' '); |
2183 | ||
2184 | 50 | return result.to_string(); |
2185 | 50 | } |
2186 | ||
2187 | #[allow(non_snake_case)] | |
2188 | 41 | fn ASCIIMath_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String { |
2189 | 1 | static REMOVE_SPACE_BEFORE_OP: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"([\w\d]) +([^\w\d"]|[\^_,;)\]}])"#).unwrap()); |
2190 | 1 | static REMOVE_SPACE_AFTER_OP: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"([^\^_,;)\]}\w\d"]) +([\w\d])"#).unwrap()); |
2191 | 1 | static COLLAPSE_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap()); |
2192 | // debug!("ASCIIMath_cleanup: start={}", raw_braille); | |
2193 | 41 | let result = raw_braille.replace("|𝐖__|", "|𝐰__|"); // protect the whitespace to prevent misinterpretation as lfloor |
2194 | 41 | let result = result.replace('𝐖', " "); |
2195 | 41 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2196 | // debug!("After collapse: {}", &result); | |
2197 | 41 | let result = REMOVE_SPACE_BEFORE_OP.replace_all(&result, "$1$2"); |
2198 | 41 | let result = REMOVE_SPACE_AFTER_OP.replace_all(&result, "$1$2"); |
2199 | 41 | let result = result.replace('𝐰', " "); // spaces around relational operators |
2200 | 41 | let result = COLLAPSE_SPACES.replace_all(&result, " "); |
2201 | // debug!("After remove: {}", &result); | |
2202 | // let result = result.trim_matches('⠀'); | |
2203 | 41 | let result = result.trim_matches(' '); |
2204 | ||
2205 | 41 | return result.to_string(); |
2206 | 41 | } |
2207 | ||
2208 | ||
2209 | /************** Braille xpath functionality ***************/ | |
2210 | use crate::canonicalize::{as_element, as_text, name}; | |
2211 | use crate::xpath_functions::{is_leaf, validate_one_node, IsBracketed}; | |
2212 | use std::result::Result as StdResult; | |
2213 | use sxd_document::dom::ParentOfChild; | |
2214 | use sxd_xpath::function::Error as XPathError; | |
2215 | use sxd_xpath::function::{Args, Function}; | |
2216 | use sxd_xpath::{context, nodeset::*, Value}; | |
2217 | ||
2218 | pub struct NemethNestingChars; | |
2219 | const NEMETH_FRAC_LEVEL: &str = "data-nemeth-frac-level"; // name of attr where value is cached | |
2220 | const FIRST_CHILD_ONLY: &[&str] = &["mroot", "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"]; | |
2221 | impl NemethNestingChars { | |
2222 | // returns a 'repeat_char' corresponding to the Nemeth rules for nesting | |
2223 | // note: this value is likely one char too long because the starting fraction is counted | |
2224 | 537 | fn nemeth_frac_value(node: Element, repeat_char: &str) -> String { |
2225 | 537 | let children = node.children(); |
2226 | 537 | let name = name(node); |
2227 | 537 | if is_leaf(node) { |
2228 | 244 | return "".to_string(); |
2229 | 293 | } else if name == "mfrac" { |
2230 | // have we already computed the value? | |
2231 | 221 | if let Some( |
2232 | 152 | return value.to_string(); |
2233 | 69 | } |
2234 | ||
2235 | 69 | let num_value = NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char); |
2236 | 69 | let denom_value = NemethNestingChars::nemeth_frac_value(as_element(children[1]), repeat_char); |
2237 | 69 | let mut max_value = if num_value.len() > denom_value.len() { |
2238 | 69 | max_value += repeat_char; |
2239 | 69 | node.set_attribute_value(NEMETH_FRAC_LEVEL, &max_value); |
2240 | 69 | return max_value; |
2241 | 72 | } else if FIRST_CHILD_ONLY.contains(&name) { |
2242 | // only look at the base -- ignore scripts/index | |
2243 | 10 | return NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char); |
2244 | } else { | |
2245 | 62 | let mut result = "".to_string(); |
2246 | 197 | for child in |
2247 | 197 | let value = NemethNestingChars::nemeth_frac_value(as_element(child), repeat_char); |
2248 | 197 | if value.len() > result.len() { |
2249 | 19 | result = value; |
2250 | 178 | } |
2251 | } | |
2252 | 62 | return result; |
2253 | } | |
2254 | 537 | } |
2255 | ||
2256 | 0 | fn nemeth_root_value(node: Element, repeat_char: &str) -> StdResult<String, XPathError> { |
2257 | // returns the correct number of repeat_chars to use | |
2258 | // note: because the highest count is toward the leaves and | |
2259 | // because this is a loop and not recursive, caching doesn't work without a lot of overhead | |
2260 | 0 | let parent = node.parent().unwrap(); |
2261 | 0 | if let ParentOfChild::Element(e) = parent { |
2262 | 0 | let mut parent = e; |
2263 | 0 | let mut result = "".to_string(); |
2264 | loop { | |
2265 | 0 | let name = name(parent); |
2266 | 0 | if name == "math" { |
2267 | 0 | return Ok( result ); |
2268 | 0 | } |
2269 | 0 | if name == "msqrt" || name == "mroot" { |
2270 | 0 | result += repeat_char; |
2271 | 0 | } |
2272 | 0 | let parent_of_child = parent.parent().unwrap(); |
2273 | 0 | if let ParentOfChild::Element(e) = parent_of_child { |
2274 | 0 | parent = e; |
2275 | 0 | } else { |
2276 | 0 | return Err( sxd_xpath::function::Error::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) ); |
2277 | } | |
2278 | } | |
2279 | 0 | } |
2280 | 0 | return Err( XPathError::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) ); |
2281 | 0 | } |
2282 | } | |
2283 | ||
2284 | impl Function for NemethNestingChars { | |
2285 | /** | |
2286 | * Returns a string with the correct number of nesting chars (could be an empty string) | |
2287 | * @param(node) -- current node | |
2288 | * @param(char) -- char (string) that should be repeated | |
2289 | * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast | |
2290 | */ | |
2291 | 192 | fn evaluate<'d>(&self, |
2292 | 192 | _context: &context::Evaluation<'_, 'd>, |
2293 | 192 | args: Vec<Value<'d>>) |
2294 | 192 | -> StdResult<Value<'d>, XPathError> |
2295 | { | |
2296 | 192 | let mut args = Args(args); |
2297 | 192 | args.exactly(2) |
2298 | 192 | let repeat_char = args.pop_string() |
2299 | 192 | let node = crate::xpath_functions::validate_one_node(args.pop_nodeset() |
2300 | 192 | if let Node::Element(el) = node { |
2301 | 192 | let name = name(el); |
2302 | // it is likely a bug to call this one a non mfrac | |
2303 | 192 | if name == "mfrac" { |
2304 | // because it is called on itself, the fraction is counted one too many times -- chop one off | |
2305 | // this is slightly messy because we are chopping off a char, not a byte | |
2306 | const BRAILLE_BYTE_LEN: usize = "⠹".len(); // all Unicode braille symbols have the same number of bytes | |
2307 | 192 | return Ok( Value::String( NemethNestingChars::nemeth_frac_value(el, &repeat_char)[BRAILLE_BYTE_LEN..].to_string() ) ); |
2308 | 0 | } else if name == "msqrt" || name == "mroot" { |
2309 | 0 | return Ok( Value::String( NemethNestingChars::nemeth_root_value(el, &repeat_char)? ) ); |
2310 | } else { | |
2311 | 0 | return Err(XPathError::Other(format!("NestingChars chars should be used only on 'mfrac'. '{}' was passed in", name))); |
2312 | } | |
2313 | } else { | |
2314 | // not an element, so nothing to do | |
2315 | 0 | return Ok( Value::String("".to_string()) ); |
2316 | } | |
2317 | 192 | } |
2318 | } | |
2319 | ||
2320 | pub struct BrailleChars; | |
2321 | impl BrailleChars { | |
2322 | // returns a string for the chars in the *leaf* node. | |
2323 | // this string follows the Nemeth rules typefaces and deals with mathvariant | |
2324 | // which has partially turned chars to the alphanumeric block | |
2325 | 12.5k | fn get_braille_chars(node: Element, code: &str, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> { |
2326 | 12.5k | let result = match code { |
2327 | 12.5k | "Nemeth" => |
2328 | 6.52k | "UEB" => |
2329 | 4.24k | "CMU" => |
2330 | 536 | "Vietnam" => BrailleChars:: get_braille_vietnam_chars(node, text_range), |
2331 | 0 | "Swedish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement |
2332 | 0 | "Finnish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement |
2333 | 0 | _ => return Err(sxd_xpath::function::Error::Other(format!("get_braille_chars: unknown braille code '{code}'"))) |
2334 | }; | |
2335 | 12.5k | return match result { |
2336 | 12.5k | Ok(string) => Ok(make_quoted_string(string)), |
2337 | 0 | Err(err) => return Err(sxd_xpath::function::Error::Other(err.to_string())), |
2338 | } | |
2339 | 12.5k | } |
2340 | ||
2341 | 5.99k | fn get_braille_nemeth_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2342 | // To greatly simplify typeface/language generation, the chars have unique ASCII chars for them: | |
2343 | // Typeface: S: sans-serif, B: bold, 𝔹: blackboard, T: script, I: italic, R: Roman | |
2344 | // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian | |
2345 | // Indicators: C: capital, L: letter, N: number, P: punctuation, M: multipurpose | |
2346 | 2 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2347 | 2 | Regex::new(r"(?P<face>[SB𝔹TIR]*)(?P<lang>[EDGVHU]?)(?P<cap>C?)(?P<letter>L?)(?P<num>[N]?)(?P<char>.)").unwrap() |
2348 | 2 | }); |
2349 | 5.99k | let math_variant = node.attribute_value("mathvariant"); |
2350 | // FIX: cover all the options -- use phf::Map | |
2351 | 5.99k | let attr_typeface = match math_variant { |
2352 | 5.76k | None => "R", |
2353 | 233 | Some(variant) => match variant { |
2354 | 233 | "bold" => |
2355 | 191 | "italic" => |
2356 | 189 | "double-struck" => |
2357 | 162 | "script" => |
2358 | 157 | "fraktur" => |
2359 | 157 | "sans-serif" => |
2360 | 156 | _ => "R", // normal and unknown |
2361 | }, | |
2362 | }; | |
2363 | 5.99k | let text = BrailleChars::substring(as_text(node), &text_range); |
2364 | 5.99k | let braille_chars = braille_replace_chars(&text, node) |
2365 | // debug!("Nemeth chars: text='{}', braille_chars='{}'", &text, &braille_chars); | |
2366 | ||
2367 | // we want to pull the prefix (typeface, language) out to the front until a change happens | |
2368 | // the same is true for number indicator | |
2369 | // also true (sort of) for capitalization -- if all caps, use double cap in front (assume abbr or Roman Numeral) | |
2370 | ||
2371 | // we only care about this for numbers and identifiers/text, so we filter for only those | |
2372 | 5.99k | let node_name = name(node); |
2373 | 5.99k | let is_in_enclosed_list = node_name != "mo" && |
2374 | 5.99k | let is_mn_in_enclosed_list = is_in_enclosed_list && |
2375 | 5.99k | let mut typeface = "R".to_string(); // assumption is "R" and if attr or letter is different, something happens |
2376 | 5.99k | let mut is_all_caps = true; |
2377 | 5.99k | let mut is_all_caps_valid = false; // all_caps only valid if we did a replacement |
2378 | 7.87k | let |
2379 | // debug!(" face: {:?}, lang: {:?}, num {:?}, letter: {:?}, cap: {:?}, char: {:?}", | |
2380 | // &caps["face"], &caps["lang"], &caps["num"], &caps["letter"], &caps["cap"], &caps["char"]); | |
2381 | 7.87k | let mut nemeth_chars = "".to_string(); |
2382 | 7.87k | let char_face = if caps["face"].is_empty() { |
2383 | 7.87k | let typeface_changed = typeface != char_face; |
2384 | 7.87k | if typeface_changed { |
2385 | 86 | typeface = char_face.to_string(); // needs to outlast this instance of the loop |
2386 | 86 | nemeth_chars += &typeface; |
2387 | 86 | nemeth_chars += &caps["lang"]; |
2388 | 7.78k | } else { |
2389 | 7.78k | nemeth_chars += &caps["lang"]; |
2390 | 7.78k | } |
2391 | // debug!(" typeface changed: {}, is_in_list: {}; num: {}", typeface_changed, is_in_enclosed_list, !caps["num"].is_empty()); | |
2392 | 7.87k | if !caps["num"].is_empty() && ( |
2393 | 2.58k | nemeth_chars += "N"; |
2394 | 5.28k | } |
2395 | 7.87k | is_all_caps_valid = true; |
2396 | 7.87k | is_all_caps &= !&caps["cap"].is_empty(); |
2397 | 7.87k | nemeth_chars += &caps["cap"]; // will be stripped later if all caps |
2398 | 7.87k | if is_in_enclosed_list { |
2399 | 228 | nemeth_chars += &caps["letter"].replace('L', "l"); |
2400 | 7.64k | } else { |
2401 | 7.64k | nemeth_chars += &caps["letter"]; |
2402 | 7.64k | } |
2403 | 7.87k | nemeth_chars += &caps["char"]; |
2404 | 7.87k | return nemeth_chars; |
2405 | 7.87k | }); |
2406 | // debug!(" result: {}", &result); | |
2407 | 5.99k | let mut text_chars = text.chars(); // see if more than one char |
2408 | 5.99k | if is_all_caps_valid && |
2409 | 7 | return Ok( "CC".to_string() + &result.replace('C', "")); |
2410 | } else { | |
2411 | 5.98k | return Ok( result.to_string() ); |
2412 | } | |
2413 | 5.99k | } |
2414 | ||
2415 | 2.82k | fn get_braille_ueb_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2416 | // Because in UEB typeforms and caps may extend for multiple tokens, | |
2417 | // this routine merely deals with the mathvariant attr. | |
2418 | // Canonicalize has already transformed all chars it can to math alphanumerics, but not all have bold/italic | |
2419 | // The typeform/caps transforms to (potentially) word mode are handled later. | |
2420 | 1 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap()); |
2421 | 1 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2422 | 1 | Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap() |
2423 | 1 | }); |
2424 | ||
2425 | 2.82k | let math_variant = node.attribute_value("mathvariant"); |
2426 | 2.82k | let text = BrailleChars::substring(as_text(node), &text_range); |
2427 | 2.82k | let mut braille_chars = braille_replace_chars(&text, node) |
2428 | ||
2429 | // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars); | |
2430 | 2.82k | if math_variant.is_none() { // nothing we need to do |
2431 | 2.71k | return Ok(braille_chars); |
2432 | 108 | } |
2433 | // mathvariant could be "sans-serif-bold-italic" -- get the parts | |
2434 | 108 | let math_variant = math_variant.unwrap(); |
2435 | 108 | let italic = math_variant.contains("italic"); |
2436 | 108 | if italic & !braille_chars.contains('I') { |
2437 | 0 | braille_chars = "I".to_string() + &braille_chars; |
2438 | 108 | } |
2439 | 108 | let bold = math_variant.contains("bold"); |
2440 | 108 | if bold & !braille_chars.contains('B') { |
2441 | 0 | braille_chars = "B".to_string() + &braille_chars; |
2442 | 108 | } |
2443 | 108 | let typeface = match HAS_TYPEFACE.find(math_variant) { |
2444 | 107 | None => "", |
2445 | 1 | Some(m) => match m.as_str() { |
2446 | 1 | "double-struck" => |
2447 | 1 | "script" => |
2448 | 1 | "fraktur" => "D", |
2449 | 0 | "sans-serif" => "S", |
2450 | // don't consider monospace as a typeform | |
2451 | 0 | _ => "", |
2452 | }, | |
2453 | }; | |
2454 | 116 | let |
2455 | // debug!("captures: {:?}", caps); | |
2456 | // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}", | |
2457 | // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]); | |
2458 | 116 | if bold || |
2459 | 116 | + if italic || !caps["italic"].is_empty() { |
2460 | 116 | + if !&caps["face"].is_empty() { |
2461 | 116 | + &caps["cap"] |
2462 | 116 | + &caps["greek"] |
2463 | 116 | + &caps["char"] |
2464 | 116 | }); |
2465 | // debug!("get_braille_ueb_chars: '{}'", &result); | |
2466 | 108 | return Ok(result.to_string()) |
2467 | 2.82k | } |
2468 | ||
2469 | 3.70k | fn get_braille_cmu_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2470 | // In CMU, we need to replace spaces used for number blocks with "." | |
2471 | // For other numbers, we need to add "." to create digit blocks | |
2472 | ||
2473 | 1 | static HAS_TYPEFACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap()); |
2474 | 1 | static PICK_APART_CHAR: LazyLock<Regex> = LazyLock::new(|| { |
2475 | 1 | Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap() |
2476 | 1 | }); |
2477 | ||
2478 | 3.70k | let math_variant = node.attribute_value("mathvariant"); |
2479 | 3.70k | let text = BrailleChars::substring(as_text(node), &text_range); |
2480 | 3.70k | let text = add_separator(text); |
2481 | ||
2482 | 3.70k | let braille_chars = braille_replace_chars(&text, node) |
2483 | ||
2484 | // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars); | |
2485 | 3.70k | if math_variant.is_none() { // nothing we need to do |
2486 | 3.70k | return Ok(braille_chars); |
2487 | 4 | } |
2488 | // mathvariant could be "sans-serif-bold-italic" -- get the parts | |
2489 | 4 | let math_variant = math_variant.unwrap(); |
2490 | 4 | let bold = math_variant.contains("bold"); |
2491 | 4 | let italic = math_variant.contains("italic"); |
2492 | 4 | let typeface = match HAS_TYPEFACE.find(math_variant) { |
2493 | 4 | None => "", |
2494 | 0 | Some(m) => match m.as_str() { |
2495 | 0 | "double-struck" => "𝔹", |
2496 | 0 | "script" => "T", |
2497 | 0 | "fraktur" => "D", |
2498 | 0 | "sans-serif" => "S", |
2499 | // don't consider monospace as a typeform | |
2500 | 0 | _ => "", |
2501 | }, | |
2502 | }; | |
2503 | 4 | let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| { |
2504 | // debug!("captures: {:?}", caps); | |
2505 | // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}", | |
2506 | // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]); | |
2507 | 4 | if bold || !caps["bold"].is_empty() { |
2508 | 4 | + if italic || !caps["italic"].is_empty() { |
2509 | 4 | + if !&caps["face"].is_empty() { |
2510 | 4 | + &caps["cap"] |
2511 | 4 | + &caps["greek"] |
2512 | 4 | + &caps["char"] |
2513 | 4 | }); |
2514 | 4 | return Ok(result.to_string()); |
2515 | ||
2516 | 3.70k | fn add_separator(text: String) -> String { |
2517 | use crate::definitions::BRAILLE_DEFINITIONS; | |
2518 | 3.70k | if let Some( |
2519 | // "." after arc (7.5.3) | |
2520 | 0 | let is_function_name = BRAILLE_DEFINITIONS.with(|definitions| { |
2521 | 0 | let definitions = definitions.borrow(); |
2522 | 0 | let set = definitions.get_hashset("CMUFunctionNames").unwrap(); |
2523 | 0 | return set.contains(&text); |
2524 | 0 | }); |
2525 | 0 | if is_function_name { |
2526 | 0 | return "arc.".to_string() + text_without_arc; |
2527 | 0 | } |
2528 | 3.70k | } |
2529 | 3.70k | return text; |
2530 | 3.70k | } |
2531 | 3.70k | } |
2532 | ||
2533 | 536 | fn get_braille_vietnam_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> { |
2534 | // this is basically the same as for ueb except: | |
2535 | // 1. we deal with switching '.' and ',' if in English style for numbers | |
2536 | // 2. if it is identified as a Roman Numeral, we make all but the first char lower case because they shouldn't get a cap indicator | |
2537 | // 3. double letter chemical elements should NOT be part of a cap word sequence | |
2538 | 536 | if name(node) == "mn" { |
2539 | 248 | // text of element is modified by these if needed |
2540 | 248 | lower_case_roman_numerals(node); |
2541 | 248 | switch_if_english_style_number(node); |
2542 | 288 | } |
2543 | 536 | let result = BrailleChars::get_braille_ueb_chars(node, text_range) |
2544 | 536 | return Ok(result); |
2545 | ||
2546 | 248 | fn lower_case_roman_numerals(mn_node: Element) { |
2547 | 248 | if mn_node.attribute("data-roman-numeral").is_some() { |
2548 | 2 | // if a roman numeral, all ASCII so we can optimize |
2549 | 2 | let text = as_text(mn_node); |
2550 | 2 | let mut new_text = String::from(&text[..1]); |
2551 | 2 | new_text.push_str(text[1..].to_ascii_lowercase().as_str()); // works for single char too |
2552 | 2 | mn_node.set_text(&new_text); |
2553 | 246 | } |
2554 | 248 | } |
2555 | 248 | fn switch_if_english_style_number(mn_node: Element) { |
2556 | 248 | let text = as_text(mn_node); |
2557 | 248 | let dot = text.find('.'); |
2558 | 248 | let comma = text.find(','); |
2559 | 248 | match (dot, comma) { |
2560 | 218 | (None, None) => (), |
2561 | 4 | (Some(dot), Some(comma)) => { |
2562 | 4 | if comma < dot { |
2563 | 2 | // switch dot/comma -- using "\x01" as a temp when switching the two chars |
2564 | 2 | let switched = text.replace('.', "\x01").replace(',', ".").replace('\x01', ","); |
2565 | 2 | mn_node.set_text(&switched); |
2566 | 2 | } |
2567 | }, | |
2568 | 17 | (Some(dot), None) => { |
2569 | // If it starts with a '.', a leading 0, or if there is only one '.' and not three chars after it | |
2570 | 17 | if dot==0 || |
2571 | 15 | (dot==1 && |
2572 | 13 | (text[dot+1..].find('.').is_none() && |
2573 | 5 | mn_node.set_text(&text.replace('.', ",")); |
2574 | 12 | } |
2575 | }, | |
2576 | 9 | (None, Some(comma)) => { |
2577 | // if there is more than one ",", than it can't be a decimal separator | |
2578 | 9 | if text[comma+1..].find(',').is_some() { |
2579 | 1 | mn_node.set_text(&text.replace(',', ".")); |
2580 | 8 | } |
2581 | }, | |
2582 | } | |
2583 | 248 | } |
2584 | ||
2585 | 536 | } |
2586 | ||
2587 | ||
2588 | 3.45k | fn is_in_enclosed_list(node: Element) -> bool { |
2589 | // Nemeth Rule 10 defines an enclosed list: | |
2590 | // 1: begins and ends with fence | |
2591 | // 2: FIX: not implemented -- must contain no word, abbreviation, ordinal or plural ending | |
2592 | // 3: function names or signs of shape and the signs which follow them are a single item (not a word) | |
2593 | // 4: an item of the list may be an ellipsis or any sign used for omission | |
2594 | // 5: no relational operator may appear within the list | |
2595 | // 6: the list must have at least 2 items. | |
2596 | // Items are separated by commas, can not have other punctuation (except ellipsis and dash) | |
2597 | 3.45k | let mut parent = get_parent(node); // safe since 'math' is always at root |
2598 | 7.37k | while name(parent) == "mrow" { |
2599 | 4.04k | if IsBracketed::is_bracketed(parent, "", "", true, false) { |
2600 | 388 | for child in |
2601 | 388 | if !child_meets_conditions(as_element(child)) { |
2602 | 14 | return false; |
2603 | 374 | } |
2604 | } | |
2605 | 120 | return true; |
2606 | 3.91k | } |
2607 | 3.91k | parent = get_parent(parent); |
2608 | } | |
2609 | 3.32k | return false; |
2610 | ||
2611 | 1.55k | fn child_meets_conditions(node: Element) -> bool { |
2612 | 1.55k | let name = name(node); |
2613 | 1.55k | return match name { |
2614 | 1.55k | "mi" | |
2615 | 1.07k | "mo" => |
2616 | 412 | "mtext" => { |
2617 | 9 | let text = as_text(node).trim(); |
2618 | 9 | return text=="?" || text=="-?-" || text.is_empty(); // various forms of "fill in missing content" (see also Nemeth_RULEs.yaml, "omissions") |
2619 | }, | |
2620 | 403 | "mrow" => { |
2621 | 385 | if IsBracketed::is_bracketed(node, "", "", false, false) { |
2622 | 125 | return child_meets_conditions(as_element(node.children()[1])); |
2623 | } else { | |
2624 | 1.00k | for child in |
2625 | 1.00k | if !child_meets_conditions(as_element(child)) { |
2626 | 28 | return false; |
2627 | 975 | } |
2628 | } | |
2629 | } | |
2630 | 232 | true |
2631 | }, | |
2632 | 18 | "menclose" => { |
2633 | 0 | if let Some(notation) = node.attribute_value("notation") { |
2634 | 0 | if notation != "bottom" || notation != "box" { |
2635 | 0 | return false; |
2636 | 0 | } |
2637 | 0 | let child = as_element(node.children()[0]); // menclose has exactly one child |
2638 | 0 | return is_leaf(child) && as_text(child) == "?"; |
2639 | 0 | } |
2640 | 0 | return false; |
2641 | }, | |
2642 | _ => { | |
2643 | 36 | for child in |
2644 | 36 | if !child_meets_conditions(as_element(child)) { |
2645 | 0 | return false; |
2646 | 36 | } |
2647 | } | |
2648 | 18 | true |
2649 | }, | |
2650 | } | |
2651 | 1.55k | } |
2652 | 3.45k | } |
2653 | ||
2654 | /// Extract the `char`s from `str` within `range` (these are chars, not byte offsets) | |
2655 | 12.5k | fn substring(str: &str, text_range: &Option<Range<usize>>) -> String { |
2656 | 12.5k | return match text_range { |
2657 | 9.99k | None => str.to_string(), |
2658 | 2.52k | Some(range) => str.chars().skip(range.start).take(range.end - range.start).collect(), |
2659 | } | |
2660 | 12.5k | } |
2661 | } | |
2662 | ||
2663 | impl Function for BrailleChars { | |
2664 | /** | |
2665 | * Returns a string with the correct number of nesting chars (could be an empty string) | |
2666 | * @param(node) -- current node or string | |
2667 | * @param(char) -- char (string) that should be repeated | |
2668 | * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast | |
2669 | */ | |
2670 | 12.5k | fn evaluate<'d>(&self, |
2671 | 12.5k | context: &context::Evaluation<'_, 'd>, |
2672 | 12.5k | args: Vec<Value<'d>>) |
2673 | 12.5k | -> StdResult<Value<'d>, XPathError> |
2674 | { | |
2675 | use crate::canonicalize::create_mathml_element; | |
2676 | 12.5k | let mut args = Args(args); |
2677 | 12.5k | if let Err( |
2678 | 0 | return Err( XPathError::Other(format!("BrailleChars requires 2 or 4 args: {e}"))); |
2679 | 12.5k | }; |
2680 | ||
2681 | 12.5k | let range = if args.len() == 4 { |
2682 | 2.52k | let end = args.pop_number() |
2683 | 2.52k | let start = args.pop_number() |
2684 | 2.52k | Some(start..end) |
2685 | } else { | |
2686 | 9.99k | None |
2687 | }; | |
2688 | 12.5k | let braille_code = args.pop_string() |
2689 | 12.5k | let v: Value<'_> = args.0.pop().ok_or(XPathError::ArgumentMissing) |
2690 | 12.5k | let node = match v { |
2691 | 11.8k | Value::Nodeset(nodes) => { |
2692 | 11.8k | validate_one_node(nodes, "BrailleChars") |
2693 | }, | |
2694 | 2 | Value::Number(n) => { |
2695 | 2 | let new_node = create_mathml_element(&context.node.document(), "mn"); |
2696 | 2 | new_node.set_text(&n.to_string()); |
2697 | 2 | new_node |
2698 | }, | |
2699 | 681 | Value::String(s) => { |
2700 | 681 | let new_node = create_mathml_element(&context.node.document(), "mi"); // FIX: try to guess mi vs mo??? |
2701 | 681 | new_node.set_text(&s); |
2702 | 681 | new_node |
2703 | }, | |
2704 | _ => { | |
2705 | 0 | return Ok( Value::String("".to_string()) ) // not an element, so nothing to do |
2706 | }, | |
2707 | }; | |
2708 | ||
2709 | 12.5k | if !is_leaf(node) { |
2710 | 0 | return Err( XPathError::Other(format!("BrailleChars called on non-leaf element '{}'", mml_to_string(node))) ); |
2711 | 12.5k | } |
2712 | 12.5k | return Ok( Value::String( BrailleChars::get_braille_chars(node, &braille_code, range) |
2713 | 12.5k | } |
2714 | } | |
2715 | ||
2716 | pub struct NeedsToBeGrouped; | |
2717 | impl NeedsToBeGrouped { | |
2718 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
2719 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
2720 | 805 | fn needs_grouping_for_cmu(element: Element, _is_base: bool) -> bool { |
2721 | 805 | let node_name = name(element); |
2722 | 805 | let children = element.children(); |
2723 | 805 | if node_name == "mrow" { |
2724 | // check for bracketed exprs | |
2725 | 544 | if IsBracketed::is_bracketed(element, "", "", false, true) { |
2726 | 0 | return false; |
2727 | 544 | } |
2728 | ||
2729 | // check for prefix and postfix ops at start or end (=> len()==2, prefix is first op, postfix is last op) | |
2730 | 544 | if children.len() == 2 && |
2731 | 9 | (name(as_element(children[0])) == "mo" || |
2732 | 7 | return false; |
2733 | 537 | } |
2734 | ||
2735 | 537 | if children.len() != 3 { // ==3, need to check if it a linear fraction |
2736 | 4 | return true; |
2737 | 533 | } |
2738 | 533 | let operator = as_element(children[1]); |
2739 | 533 | if name(operator) != "mo" || as_text(operator) != "/" { |
2740 | 532 | return true; |
2741 | 1 | } |
2742 | 261 | } |
2743 | ||
2744 | 262 | if !(node_name == "mrow" || |
2745 | 258 | return false; |
2746 | 4 | } |
2747 | // check for numeric fractions (regular fractions need brackets, not numeric fractions), either as an mfrac or with "/" | |
2748 | // if the fraction starts with a "-", it is still a numeric fraction that doesn't need parens | |
2749 | 4 | let mut numerator = as_element(children[0]); |
2750 | 4 | let denominator = as_element(children[children.len()-1]); |
2751 | 4 | let decimal_separator = crate::interface::get_preference("DecimalSeparators").unwrap() |
2752 | 4 | .chars().next().unwrap_or('.'); |
2753 | 4 | if is_integer(denominator, decimal_separator) { |
2754 | // check numerator being either an integer "- integer" | |
2755 | 2 | if name(numerator) == "mrow" { |
2756 | 1 | let numerator_children = numerator.children(); |
2757 | 1 | if !(numerator_children.len() == 2 && |
2758 | 1 | name(as_element(numerator_children[0])) == "mo" && |
2759 | 1 | as_text(as_element(numerator_children[0])) == "-") { |
2760 | 0 | return true; |
2761 | 1 | } |
2762 | 1 | numerator = as_element(numerator_children[1]); |
2763 | 1 | } |
2764 | 2 | return !is_integer(numerator, decimal_separator); |
2765 | 2 | } |
2766 | 2 | return true; |
2767 | ||
2768 | 6 | fn is_integer(mathml: Element, decimal_separator: char) -> bool { |
2769 | 6 | return name(mathml) == "mn" && |
2770 | 6 | } |
2771 | 805 | } |
2772 | ||
2773 | /// FIX: what needs to be implemented? | |
2774 | 0 | fn needs_grouping_for_finnish(mathml: Element, is_base: bool) -> bool { |
2775 | use crate::xpath_functions::IsInDefinition; | |
2776 | 0 | let mut node_name = name(mathml); |
2777 | 0 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2778 | 0 | node_name = "mi"; // roman numerals don't follow number rules |
2779 | 0 | } |
2780 | ||
2781 | // FIX: the leaf rules are from UEB -- check the Swedish rules | |
2782 | 0 | match node_name { |
2783 | 0 | "mn" => { |
2784 | 0 | if !is_base { |
2785 | 0 | return false; |
2786 | 0 | } // clause 1 |
2787 | // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204) | |
2788 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2789 | 0 | let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)}; |
2790 | 0 | if name(grandparent) != "mrow" { |
2791 | 0 | return false; |
2792 | 0 | } |
2793 | 0 | let preceding = parent.preceding_siblings(); |
2794 | 0 | if preceding.len() < 2 { |
2795 | 0 | return false; |
2796 | 0 | } |
2797 | // any 'mn' would be separated from this node by invisible times | |
2798 | 0 | let previous_child = as_element(preceding[preceding.len()-1]); |
2799 | 0 | if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" { |
2800 | 0 | let previous_child = as_element(preceding[preceding.len()-2]); |
2801 | 0 | return name(previous_child) == "mn" |
2802 | } else { | |
2803 | 0 | return false; |
2804 | } | |
2805 | }, | |
2806 | 0 | "mi" | "mo" | "mtext" => { |
2807 | 0 | let text = as_text(mathml); |
2808 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2809 | 0 | let parent_name = name(parent); // there is always a "math" node |
2810 | 0 | if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) { |
2811 | 0 | return false; |
2812 | 0 | } |
2813 | 0 | let mut chars = text.chars(); |
2814 | 0 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2815 | 0 | let is_one_char = chars.next().is_none(); |
2816 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2817 | 0 | return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8 |
2818 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2819 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2820 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2821 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2822 | }, | |
2823 | 0 | "mrow" => { |
2824 | // check for bracketed exprs | |
2825 | 0 | if IsBracketed::is_bracketed(mathml, "", "", false, true) { |
2826 | 0 | return false; |
2827 | 0 | } |
2828 | ||
2829 | 0 | let parent = get_parent(mathml); // safe since 'math' is always at root |
2830 | 0 | if name(parent) == "mfrac" { |
2831 | 0 | let children = mathml.children(); |
2832 | 0 | if mathml.preceding_siblings().is_empty() { |
2833 | // numerator: check for multiplication -- doesn't need grouping in numerator | |
2834 | 0 | if children.len() >= 3 { |
2835 | 0 | let operator = as_element(children[1]); |
2836 | 0 | if name(operator) == "mo" { |
2837 | 0 | let ch = as_text(operator); |
2838 | 0 | if ch == "\u{2062}" || ch == "⋅" || ch == "×" { |
2839 | 0 | return false; |
2840 | 0 | } |
2841 | 0 | } |
2842 | 0 | } |
2843 | 0 | return true; |
2844 | } else { | |
2845 | // denominator | |
2846 | 0 | return true; |
2847 | } | |
2848 | ||
2849 | 0 | } |
2850 | // check for prefix at start | |
2851 | // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops | |
2852 | 0 | let children = mathml.children(); |
2853 | 0 | if children.len() == 2 && |
2854 | 0 | (name(as_element(children[0])) == "mo") { |
2855 | 0 | return false; |
2856 | 0 | } |
2857 | 0 | return true; |
2858 | }, | |
2859 | 0 | _ => return false, |
2860 | } | |
2861 | 0 | } |
2862 | ||
2863 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
2864 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
2865 | 0 | fn needs_grouping_for_swedish(mathml: Element, is_base: bool) -> bool { |
2866 | use crate::xpath_functions::IsInDefinition; | |
2867 | 0 | let mut node_name = name(mathml); |
2868 | 0 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2869 | 0 | node_name = "mi"; // roman numerals don't follow number rules |
2870 | 0 | } |
2871 | ||
2872 | 0 | match node_name { |
2873 | 0 | "mn" => return false, |
2874 | 0 | "mi" | "mo" | "mtext" => { |
2875 | 0 | let text = as_text(mathml); |
2876 | 0 | let parent = get_parent(mathml); // there is always a "math" node |
2877 | 0 | let parent_name = name(parent); // there is always a "math" node |
2878 | 0 | if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) { |
2879 | 0 | return false; |
2880 | 0 | } |
2881 | 0 | let mut chars = text.chars(); |
2882 | 0 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2883 | 0 | let is_one_char = chars.next().is_none(); |
2884 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2885 | 0 | return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8 |
2886 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2887 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2888 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2889 | 0 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2890 | }, | |
2891 | 0 | "mrow" => { |
2892 | // check for bracketed exprs | |
2893 | 0 | if IsBracketed::is_bracketed(mathml, "", "", false, true) { |
2894 | 0 | return false; |
2895 | 0 | } |
2896 | ||
2897 | // check for prefix at start | |
2898 | // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops | |
2899 | 0 | let children = mathml.children(); |
2900 | 0 | if children.len() == 2 && |
2901 | 0 | (name(as_element(children[0])) == "mo") { |
2902 | 0 | return false; |
2903 | 0 | } |
2904 | 0 | return true; |
2905 | }, | |
2906 | 0 | "mfrac" => { |
2907 | // exclude simple fractions -- they are not bracketed with start/end marks | |
2908 | 0 | let children = mathml.children(); |
2909 | 0 | return !(NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true) || |
2910 | 0 | NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true)); |
2911 | }, | |
2912 | // At least for msup (Ex 7.7, and 7.32 and maybe more), spec seems to feel grouping is not needed. | |
2913 | // "msub" | "msup" | "msubsup" | "munder" | "mover" | "munderover" => return true, | |
2914 | 0 | "mtable" => return true, // Fix: should check for trivial cases that don't need grouping |
2915 | 0 | _ => return false, |
2916 | } | |
2917 | 0 | } |
2918 | ||
2919 | /// Returns true if the element needs grouping symbols | |
2920 | /// Bases need extra attention because if they are a number and the item to the left is one, that needs distinguishing | |
2921 | 538 | fn needs_grouping_for_ueb(mathml: Element, is_base: bool) -> bool { |
2922 | // From GTM 7.1 | |
2923 | // 1. An entire number, i.e. the initiating numeric symbol and all succeeding symbols within the numeric mode thus | |
2924 | // established (which would include any interior decimal points, commas, separator spaces, or simple numeric fraction lines). | |
2925 | // 2. An entire general fraction, enclosed in fraction indicators. | |
2926 | // 3. An entire radical expression, enclosed in radical indicators. | |
2927 | // 4. An arrow. | |
2928 | // 5. An arbitrary shape. | |
2929 | // 6. Any expression enclosed in matching pairs of round parentheses, square brackets or curly braces. | |
2930 | // 7. Any expression enclosed in the braille grouping indicators. [Note: not possible here] | |
2931 | // 8. If none of the foregoing apply, the item is simply the [this element's] individual symbol. | |
2932 | ||
2933 | use crate::xpath_functions::IsInDefinition; | |
2934 | 538 | let mut node_name = name(mathml); |
2935 | 538 | if mathml.attribute_value("data-roman-numeral").is_some() { |
2936 | 1 | node_name = "mi"; // roman numerals don't follow number rules |
2937 | 537 | } |
2938 | 538 | match node_name { |
2939 | 538 | "mn" => { |
2940 | 250 | if !is_base { |
2941 | 233 | return false; |
2942 | 17 | } // clause 1 |
2943 | // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204) | |
2944 | 17 | let parent = get_parent(mathml); // there is always a "math" node |
2945 | 17 | let grandparent = if name(parent) == "math" { |
2946 | 17 | if name(grandparent) != "mrow" { |
2947 | 2 | return false; |
2948 | 15 | } |
2949 | 15 | let preceding = parent.preceding_siblings(); |
2950 | 15 | if preceding.len() < 2 { |
2951 | 6 | return false; |
2952 | 9 | } |
2953 | // any 'mn' would be separated from this node by invisible times | |
2954 | 9 | let previous_child = as_element(preceding[preceding.len()-1]); |
2955 | 9 | if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" { |
2956 | 6 | let previous_child = as_element(preceding[preceding.len()-2]); |
2957 | 6 | return name(previous_child) == "mn" |
2958 | } else { | |
2959 | 3 | return false; |
2960 | } | |
2961 | }, | |
2962 | 288 | "mi" | |
2963 | 258 | let text = as_text(mathml); |
2964 | 258 | let parent = get_parent(mathml); // there is always a "math" node |
2965 | 258 | let parent_name = name(parent); // there is always a "math" node |
2966 | 258 | if is_base && ( |
2967 | 224 | return false; |
2968 | 34 | } |
2969 | 34 | let mut chars = text.chars(); |
2970 | 34 | let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty; |
2971 | 34 | let is_one_char = chars.next().is_none(); |
2972 | // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about | |
2973 | 34 | return !((is_one_char && |
2974 | // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it) | |
2975 | 4 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() || |
2976 | 3 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4 |
2977 | 3 | IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5 |
2978 | }, | |
2979 | 30 | "mfrac" => return |
2980 | 28 | "msqrt" | "mroot" => return |
2981 | // clause 6 only mentions three grouping chars, I'm a little suspicious of that, but that's what it says | |
2982 | 28 | "mrow" => return !( |
2983 | 16 | IsBracketed::is_bracketed(mathml, "[", "]", false, false) || |
2984 | 15 | IsBracketed::is_bracketed(mathml, "{", "}", false, false) ), |
2985 | 6 | "msub" | |
2986 | // I'm a little dubious about the false value, but see GTM 7.7(2) | |
2987 | 5 | if !is_base { |
2988 | 3 | return true; |
2989 | 2 | } |
2990 | // need to group nested scripts in base -- see GTM 12.2(2) | |
2991 | 2 | let parent = get_parent(mathml); // there is always a "math" node |
2992 | 2 | let parent_name = name(parent); // there is always a "math" node |
2993 | 2 | return parent_name == "munder" || parent_name == "mover" || |
2994 | }, | |
2995 | 1 | _ => return true, |
2996 | } | |
2997 | ||
2998 | 538 | } |
2999 | } | |
3000 | ||
3001 | impl Function for NeedsToBeGrouped { | |
3002 | // convert a node to an ordinal number | |
3003 | 1.34k | fn evaluate<'d>(&self, |
3004 | 1.34k | _context: &context::Evaluation<'_, 'd>, |
3005 | 1.34k | args: Vec<Value<'d>>) |
3006 | 1.34k | -> StdResult<Value<'d>, XPathError> |
3007 | { | |
3008 | 1.34k | let mut args = Args(args); |
3009 | 1.34k | args.exactly(3) |
3010 | 1.34k | let is_base = args.pop_boolean() |
3011 | 1.34k | let braille_code = args.pop_string() |
3012 | 1.34k | let node = validate_one_node(args.pop_nodeset() |
3013 | 1.34k | if let Node::Element(e) = node { |
3014 | 1.34k | let answer = match braille_code.as_str() { |
3015 | 1.34k | "CMU" => |
3016 | 538 | "UEB" => NeedsToBeGrouped::needs_grouping_for_ueb(e, is_base), |
3017 | 0 | "Finnish" => NeedsToBeGrouped::needs_grouping_for_finnish(e, is_base), |
3018 | 0 | "Swedish" => NeedsToBeGrouped::needs_grouping_for_swedish(e, is_base), |
3019 | 0 | _ => return Err(XPathError::Other(format!("NeedsToBeGrouped: braille code arg '{braille_code:?}' is not a known code ('UEB', 'CMU', or 'Swedish')"))), |
3020 | }; | |
3021 | 1.34k | return Ok( Value::Boolean( answer ) ); |
3022 | 0 | } |
3023 | ||
3024 | 0 | return Err(XPathError::Other(format!("NeedsToBeGrouped: first arg '{node:?}' is not a node"))); |
3025 | 1.34k | } |
3026 | } | |
3027 | ||
3028 | ||
3029 | ||
3030 | #[cfg(test)] | |
3031 | mod tests { | |
3032 | use super::*; | |
3033 | #[allow(unused_imports)] | |
3034 | use crate::init_logger; | |
3035 | use crate::interface::*; | |
3036 | use log::debug; | |
3037 | ||
3038 | #[test] | |
3039 | 1 | fn ueb_highlight_24() -> Result<()> { // issue 24 |
3040 | 1 | let mathml_str = "<math display='block' id='id-0'> |
3041 | 1 | <mrow id='id-1'> |
3042 | 1 | <mn id='id-2'>4</mn> |
3043 | 1 | <mo id='id-3'>⁢</mo> |
3044 | 1 | <mi id='id-4'>a</mi> |
3045 | 1 | <mo id='id-5'>⁢</mo> |
3046 | 1 | <mi id='id-6'>c</mi> |
3047 | 1 | </mrow> |
3048 | 1 | </math>"; |
3049 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3050 | 1 | set_mathml(mathml_str).unwrap(); |
3051 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3052 | 1 | set_preference("BrailleNavHighlight", "All").unwrap(); |
3053 | 1 | let braille = get_braille("id-2") |
3054 | 1 | assert_eq!("⣼⣙⠰⠁⠉", braille); |
3055 | 1 | set_navigation_node("id-2", 0) |
3056 | 1 | assert_eq!( get_braille_position() |
3057 | ||
3058 | 1 | let braille = get_braille("id-4") |
3059 | 1 | assert_eq!("⠼⠙⣰⣁⠉", braille); |
3060 | 1 | set_navigation_node("id-4", 0) |
3061 | 1 | assert_eq!( get_braille_position() |
3062 | 1 | return Ok( () ); |
3063 | 1 | } |
3064 | ||
3065 | #[test] | |
3066 | // This test probably should be repeated for each braille code and be taken out of here | |
3067 | 1 | fn find_mathml_from_braille() -> Result<()> { |
3068 | use std::time::Instant; | |
3069 | 1 | let mathml_str = "<math id='id-0'> |
3070 | 1 | <mrow data-changed='added' id='id-1'> |
3071 | 1 | <mi id='id-2'>x</mi> |
3072 | 1 | <mo id='id-3'>=</mo> |
3073 | 1 | <mfrac id='id-4'> |
3074 | 1 | <mrow id='id-5'> |
3075 | 1 | <mrow data-changed='added' id='id-6'> |
3076 | 1 | <mo id='id-7'>-</mo> |
3077 | 1 | <mi id='id-8'>b</mi> |
3078 | 1 | </mrow> |
3079 | 1 | <mo id='id-9'>±</mo> |
3080 | 1 | <msqrt id='id-10'> |
3081 | 1 | <mrow data-changed='added' id='id-11'> |
3082 | 1 | <msup id='id-12'> |
3083 | 1 | <mi id='id-13'>b</mi> |
3084 | 1 | <mn id='id-14'>2</mn> |
3085 | 1 | </msup> |
3086 | 1 | <mo id='id-15'>-</mo> |
3087 | 1 | <mrow data-changed='added' id='id-16'> |
3088 | 1 | <mn id='id-17'>4</mn> |
3089 | 1 | <mo data-changed='added' id='id-18'>⁢</mo> |
3090 | 1 | <mi id='id-19'>a</mi> |
3091 | 1 | <mo data-changed='added' id='id-20'>⁢</mo> |
3092 | 1 | <mi id='id-21'>c</mi> |
3093 | 1 | </mrow> |
3094 | 1 | </mrow> |
3095 | 1 | </msqrt> |
3096 | 1 | </mrow> |
3097 | 1 | <mrow id='id-22'> |
3098 | 1 | <mn id='id-23'>2</mn> |
3099 | 1 | <mo data-changed='added' id='id-24'>⁢</mo> |
3100 | 1 | <mi id='id-25'>a</mi> |
3101 | 1 | </mrow> |
3102 | 1 | </mfrac> |
3103 | 1 | </mrow> |
3104 | 1 | </math>"; |
3105 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3106 | 1 | set_mathml(mathml_str).unwrap(); |
3107 | 1 | set_preference("BrailleNavHighlight", "Off").unwrap(); |
3108 | ||
3109 | 1 | set_preference("BrailleCode", "Nemeth").unwrap(); |
3110 | 1 | let _braille = get_braille("") |
3111 | 1 | let answers= &[2, 3, 3, 3, 3, 4, 7, 8, 9, 9, 10, 13, 12, 14, 12, 15, 17, 19, 21, 10, 4, 23, 25, 4]; |
3112 | 24 | let |
3113 | 1 | debug!("\n*** Testing Nemeth ***"); |
3114 | 24 | for (i, answer) in |
3115 | 24 | debug!("\n=== i={} ===", i); |
3116 | 24 | let instant = Instant::now(); |
3117 | 24 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3118 | 24 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3119 | 24 | debug!("Time taken: {}ms", |
3120 | 24 | assert_eq!(*answer, id, "\nNemeth test ith position={}", i); |
3121 | } | |
3122 | ||
3123 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3124 | 1 | let _braille = get_braille("") |
3125 | 1 | let answers= &[0, 0, 0, 2, 3, 3, 3, 3, 4, 7, 7, 8, 9, 9, 10, 13, 12, 14, 14, 15, 15, 17, 17, 19, 19, 21, 10, 4, 4, 23, 23, 25, 25, 4, 0, 0]; |
3126 | 36 | let |
3127 | 1 | debug!("\n\n*** Testing UEB ***"); |
3128 | 36 | for (i, answer) in |
3129 | 36 | debug!("\n=== i={} ===", i); |
3130 | 36 | let instant = Instant::now(); |
3131 | 36 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3132 | 36 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3133 | 36 | debug!("Time taken: {}ms", |
3134 | 36 | assert_eq!(*answer, id, "\nUEB test ith position={}", i); |
3135 | } | |
3136 | 1 | set_preference("BrailleCode", "CMU").unwrap(); |
3137 | 1 | let braille = get_braille("") |
3138 | 1 | let answers= &[2, 3, 5, 7, 8, 9, 9, 9, 10, 10, 11, 13, 12, 14, 14, 15, 17, 17, 19, 19, 21, 11, 5, 4, 22, 23, 23, 25, 25, 22,]; |
3139 | 30 | let |
3140 | 1 | debug!("\n\n*** Testing CMU ***"); |
3141 | 1 | debug!("Braille: {}", braille); |
3142 | 30 | for (i, answer) in |
3143 | 30 | debug!("\n=== i={} ===", i); |
3144 | 30 | let instant = Instant::now(); |
3145 | 30 | let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i) |
3146 | 30 | N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, |
3147 | 30 | debug!("Time taken: {}ms", |
3148 | 30 | assert_eq!(*answer, id, "\nCMU test ith position={}", i); |
3149 | } | |
3150 | 1 | return Ok( () ); |
3151 | 1 | } |
3152 | ||
3153 | #[test] | |
3154 | #[allow(non_snake_case)] | |
3155 | 1 | fn test_UEB_start_mode() -> Result<()> { |
3156 | 1 | let mathml_str = "<math><msup><mi>x</mi><mi>n</mi></msup></math>"; |
3157 | 1 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
3158 | 1 | set_mathml(mathml_str).unwrap(); |
3159 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
3160 | 1 | set_preference("UEB_START_MODE", "Grade2").unwrap(); |
3161 | 1 | let braille = get_braille("") |
3162 | 1 | assert_eq!("⠭⠰⠔⠝", braille, "Grade2"); |
3163 | 1 | set_preference("UEB_START_MODE", "Grade1").unwrap(); |
3164 | 1 | let braille = get_braille("") |
3165 | 1 | assert_eq!("⠭⠔⠝", braille, "Grade1"); |
3166 | 1 | return Ok( () ); |
3167 | 1 | } |
3168 | } |
Line | Count | Source |
1 | //! Converts the MathML to some sort of canonical MathML. | |
2 | //! | |
3 | //! Some changes made: | |
4 | //! * &extra whitespace at the start/end of tokens is trimmed. | |
5 | //! * "equivalent" characters are converted to a chosen character. | |
6 | //! * known "bad" MathML is cleaned up (this will likely be an ongoing effort) | |
7 | //! * mrows are added based on operator priorities from the MathML Operator Dictionary | |
8 | #![allow(clippy::needless_return)] | |
9 | use crate::errors::*; | |
10 | use std::rc::Rc; | |
11 | use std::cell::RefCell; | |
12 | use sxd_document::dom::{Element, Document, ChildOfElement, Attribute}; | |
13 | use sxd_document::QName; | |
14 | use phf::{phf_map, phf_set}; | |
15 | use crate::xpath_functions::{IsBracketed, is_leaf, IsNode}; | |
16 | use std::ptr::eq as ptr_eq; | |
17 | use crate::pretty_print::*; | |
18 | use regex::Regex; | |
19 | use std::fmt; | |
20 | use crate::chemistry::*; | |
21 | use unicode_script::Script; | |
22 | use roman_numerals_rs::RomanNumeral; | |
23 | use std::sync::LazyLock; | |
24 | use log::{debug}; | |
25 | use bitflags::bitflags; | |
26 | ||
27 | // FIX: DECIMAL_SEPARATOR should be set by env, or maybe language | |
28 | const DECIMAL_SEPARATOR: &str = "."; | |
29 | pub const CHANGED_ATTR: &str = "data-changed"; | |
30 | pub const ADDED_ATTR_VALUE: &str = "added"; | |
31 | pub const INTENT_ATTR: &str = "intent"; | |
32 | pub const MATHML_FROM_NAME_ATTR: &str = "data-from-mathml"; | |
33 | const MFENCED_ATTR_VALUE: &str = "from_mfenced"; | |
34 | const EMPTY_IN_2D: &str = "data-empty-in-2D"; | |
35 | const SPACE_AFTER: &str = "data-space-after"; | |
36 | const ACT_AS_OPERATOR: &str = "data-acts_as_operator"; | |
37 | // character to use instead of the text content for priority, etc. | |
38 | pub const CHEMICAL_BOND: &str ="data-chemical-bond"; | |
39 | ||
40 | ||
41 | /// Used when mhchem is detected and we should favor postscripts rather than prescripts in constructing an mmultiscripts | |
42 | const MHCHEM_MMULTISCRIPTS_HACK: &str = "MHCHEM_SCRIPT_HACK"; | |
43 | ||
44 | // (perfect) hash of operators built from MathML's operator dictionary | |
45 | static OPERATORS: phf::Map<&str, OperatorInfo> = include!("operator-info.in"); | |
46 | ||
47 | ||
48 | // The set of fence operators that can being either a left or right fence (or infix). For example: "|". | |
49 | static AMBIGUOUS_OPERATORS: phf::Set<&str> = phf_set! { | |
50 | "|", "∥", "\u{2016}" | |
51 | }; | |
52 | ||
53 | // static vars used when canonicalizing | |
54 | // lowest priority operator so it is never popped off the stack | |
55 | static LEFT_FENCEPOST: OperatorInfo = OperatorInfo{ op_type: OperatorTypes::LEFT_FENCE, priority: 0, next: &None }; | |
56 | ||
57 | 3 | static INVISIBLE_FUNCTION_APPLICATION: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2061}").unwrap()); |
58 | 3 | static IMPLIED_TIMES: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2062}").unwrap()); |
59 | 2 | static IMPLIED_INVISIBLE_COMMA: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2063}").unwrap()); |
60 | 3 | static IMPLIED_INVISIBLE_PLUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2064}").unwrap()); |
61 | ||
62 | // FIX: any other operators that should act the same (e.g, plus-minus and minus-plus)? | |
63 | 3 | static PLUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("+").unwrap()); |
64 | 3 | static MINUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("-").unwrap()); |
65 | 3 | static PREFIX_MINUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| MINUS.next.as_ref().unwrap()); |
66 | ||
67 | 3 | static TIMES_SIGN: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("×").unwrap()); |
68 | ||
69 | // IMPLIED_TIMES_HIGH_PRIORITY -- used in trig functions for things like sin 2x cos 2x where want > function app priority | |
70 | static IMPLIED_TIMES_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ | |
71 | op_type: OperatorTypes::INFIX, priority: 851, next: &None | |
72 | }; | |
73 | // IMPLIED_SEPARATOR_HIGH_PRIORITY -- used for Geometry points like ABC | |
74 | static IMPLIED_SEPARATOR_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ | |
75 | op_type: OperatorTypes::INFIX, priority: 901, next: &None | |
76 | }; | |
77 | // IMPLIED_CHEMICAL_BOND -- used for implicit and explicit bonds | |
78 | static IMPLIED_CHEMICAL_BOND: OperatorInfo = OperatorInfo{ | |
79 | op_type: OperatorTypes::INFIX, priority: 905, next: &None | |
80 | }; | |
81 | static IMPLIED_PLUS_SLASH_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ // (linear) mixed fraction 2 3/4 | |
82 | op_type: OperatorTypes::INFIX, priority: 881, next: &None | |
83 | }; | |
84 | ||
85 | // Useful static defaults to have available if there is no character match | |
86 | static DEFAULT_OPERATOR_INFO_PREFIX: OperatorInfo = OperatorInfo{ | |
87 | op_type: OperatorTypes::PREFIX, priority: 260, next: &None | |
88 | }; | |
89 | static DEFAULT_OPERATOR_INFO_INFIX: OperatorInfo = OperatorInfo{ | |
90 | op_type: OperatorTypes::INFIX, priority: 260, next:& None | |
91 | }; | |
92 | static DEFAULT_OPERATOR_INFO_POSTFIX: OperatorInfo = OperatorInfo{ | |
93 | op_type: OperatorTypes::POSTFIX, priority: 260, next: &None | |
94 | }; | |
95 | ||
96 | // avoids having to use Option<OperatorInfo> in some cases | |
97 | static ILLEGAL_OPERATOR_INFO: OperatorInfo = OperatorInfo{ | |
98 | op_type: OperatorTypes::INFIX, priority: 999, next: &None | |
99 | }; | |
100 | ||
101 | // used to tell if an operator is a relational operator | |
102 | 1 | static EQUAL_PRIORITY: LazyLock<usize> = LazyLock::new(|| OPERATORS.get("=").unwrap().priority); |
103 | ||
104 | // useful for detecting whitespace | |
105 | 3 | static IS_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s+$").unwrap()); // only Unicode whitespace |
106 | ||
107 | // Operators are either PREFIX, INFIX, or POSTFIX, but can also have other properties such as LEFT_FENCE | |
108 | bitflags! { | |
109 | #[derive(Clone, Debug, Copy, PartialEq, Eq)] | |
110 | struct OperatorTypes: u32 { | |
111 | const NONE = 0x0; | |
112 | const PREFIX = 0x1; | |
113 | const INFIX = 0x2; | |
114 | const POSTFIX = 0x4; | |
115 | const FENCE = 0x8; | |
116 | const LEFT_FENCE= 0x9; | |
117 | const RIGHT_FENCE=0xc; | |
118 | const UNSPECIFIED=0xf; // 'and-ing will match anything | |
119 | } | |
120 | } | |
121 | // OperatorInfo is a key structure for parsing. | |
122 | // They OperatorInfo is this program's representation of MathML's Operator Dictionary. | |
123 | // The OperatorTypes say how the operator can group (can be overridden with @form="..." on an element). | |
124 | // Basically, it says the operator can be at the start, middle, or end of an mrow. | |
125 | // The priority field gives the relationships between operators so that lower priority operators are towards the root of the tree. | |
126 | // E.g., '=' is lower priority than (infix) '+', which in turn is lower priority than multiplication. | |
127 | // The operator info is a linked list because some operators (not many) have alternatives (e.g, '+' is both prefix and infix) | |
128 | // All OperatorInfo is static info, with some special static defaults to capture when it is not listed in the operator dictionary. | |
129 | #[derive(Clone, Debug)] | |
130 | struct OperatorInfo { | |
131 | op_type: OperatorTypes, // can be set on <mo> | |
132 | priority: usize, // not settable on an element | |
133 | next: &'static Option<OperatorInfo>, // can be both prefix & infix (etc) -- chain of options | |
134 | } | |
135 | ||
136 | // The character is separated out from the OperatorInfo as this allows the OperatorInfo to be static (can use default values) | |
137 | #[derive(Clone, Debug)] | |
138 | struct OperatorPair<'op> { | |
139 | ch: &'op str, | |
140 | op: &'static OperatorInfo | |
141 | } | |
142 | ||
143 | impl<'op> OperatorPair<'op> { | |
144 | 57.3k | fn new() -> OperatorPair<'op> { |
145 | 57.3k | return OperatorPair{ |
146 | 57.3k | ch: "illegal", // value 'illegal' used only in debugging, if then |
147 | 57.3k | op: &ILLEGAL_OPERATOR_INFO, // ILLEGAL_OPERATOR_INFO avoids using <Option> |
148 | 57.3k | }; |
149 | 57.3k | } |
150 | } | |
151 | ||
152 | // OperatorVersions is a convenient data structure when looking to see whether the operator should be prefix, infix, or postfix. | |
153 | // It is only used in one place in the code, so this could maybe be eliminated and the code localized to where it is used. | |
154 | #[derive(Debug)] | |
155 | struct OperatorVersions { | |
156 | prefix: Option<&'static OperatorInfo>, | |
157 | infix: Option<&'static OperatorInfo>, | |
158 | postfix: Option<&'static OperatorInfo>, | |
159 | } | |
160 | ||
161 | impl OperatorVersions { | |
162 | 401 | fn new(op: &'static OperatorInfo) -> OperatorVersions { |
163 | 401 | let mut op = op; |
164 | 401 | let mut prefix = None; |
165 | 401 | let mut infix = None; |
166 | 401 | let mut postfix = None; |
167 | loop { | |
168 | 1.10k | if op.is_prefix() { |
169 | 360 | prefix = Some( op ); |
170 | 745 | } else if op.is_infix() { |
171 | 385 | infix = Some( op ) |
172 | 360 | } else if op.is_postfix() { |
173 | 360 | postfix = Some( op ); |
174 | 360 | } else { |
175 | 0 | panic!("OperatorVersions::new: operator is not prefix, infix, or postfix") |
176 | } | |
177 | //let another_op = op.next; | |
178 | 1.10k | match &op.next { |
179 | 401 | None => break, |
180 | 704 | Some(alt_op) => op = alt_op, |
181 | } | |
182 | } | |
183 | 401 | return OperatorVersions{prefix, infix, postfix}; |
184 | 401 | } |
185 | } | |
186 | ||
187 | ||
188 | impl OperatorInfo { | |
189 | 13.1k | fn is_prefix(&self) -> bool { |
190 | 13.1k | return (self.op_type & OperatorTypes::PREFIX) != OperatorTypes::NONE; |
191 | 13.1k | } |
192 | ||
193 | 805 | fn is_infix(&self) -> bool { |
194 | 805 | return (self.op_type & OperatorTypes::INFIX) != OperatorTypes::NONE; |
195 | 805 | } |
196 | ||
197 | 14.2k | fn is_postfix(&self) -> bool { |
198 | 14.2k | return (self.op_type & OperatorTypes::POSTFIX) != OperatorTypes::NONE; |
199 | 14.2k | } |
200 | ||
201 | 13.9k | fn is_left_fence(&self) -> bool { |
202 | 13.9k | return self.op_type & OperatorTypes::LEFT_FENCE == OperatorTypes::LEFT_FENCE; |
203 | 13.9k | } |
204 | ||
205 | 12.9k | fn is_right_fence(&self) -> bool { |
206 | 12.9k | return self.op_type & OperatorTypes::RIGHT_FENCE ==OperatorTypes::RIGHT_FENCE; |
207 | 12.9k | } |
208 | ||
209 | 4.84k | fn is_fence(&self) -> bool { |
210 | 4.84k | return (self.op_type & (OperatorTypes::LEFT_FENCE | OperatorTypes::RIGHT_FENCE)) != OperatorTypes::NONE; |
211 | 4.84k | } |
212 | ||
213 | 21.3k | fn is_operator_type(&self, op_type: OperatorTypes) -> bool { |
214 | 21.3k | return self.op_type & op_type != OperatorTypes::NONE; |
215 | 21.3k | } |
216 | ||
217 | 13.5k | fn is_plus_or_minus(&self) -> bool { |
218 | 13.5k | return ptr_eq(self, *PLUS) || |
219 | 13.5k | } |
220 | ||
221 | 13.2k | fn is_times(&self) -> bool { |
222 | 13.2k | return ptr_eq(self, *IMPLIED_TIMES) || |
223 | 13.2k | } |
224 | ||
225 | 17.7k | fn is_nary(&self, previous_op: &OperatorInfo) -> bool { |
226 | 17.7k | return ptr_eq(previous_op,self) || |
227 | 13.0k | (previous_op.is_plus_or_minus() && |
228 | 13.0k | (previous_op.is_times() && |
229 | 17.7k | } |
230 | } | |
231 | ||
232 | // StackInfo contains all the needed information for deciding shift/reduce during parsing. | |
233 | // The stack itself is just a Vec of StackInfo (since we only push, pop, and look at the top) | |
234 | // There are a number of useful functions defined on StackInfo. | |
235 | struct StackInfo<'a, 'op>{ | |
236 | mrow: Element<'a>, // mrow being built | |
237 | op_pair: OperatorPair<'op>, // last operator placed on stack | |
238 | is_operand: bool, // true if child at end of mrow is an operand (as opposed to an operator) | |
239 | } | |
240 | ||
241 | impl fmt::Display for StackInfo<'_, '_> { | |
242 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
243 | 0 | write!(f, "StackInfo(op={}/{}, is_operand={}, mrow({}", |
244 | 0 | show_invisible_op_char(self.op_pair.ch), self.op_pair.op.priority, self.is_operand, |
245 | 0 | if self.mrow.children().is_empty() {")"} else {""})?; |
246 | 0 | for child in self.mrow.children() { |
247 | 0 | let child = as_element(child); |
248 | 0 | write!(f, "{}{}", name(child), if child.following_siblings().is_empty() {")"} else {","})?; |
249 | } | |
250 | 0 | return Ok( () ); |
251 | 0 | } |
252 | } | |
253 | ||
254 | impl<'a, 'op:'a> StackInfo<'a, 'op> { | |
255 | 10.6k | fn new(doc: Document<'a>) -> StackInfo<'a, 'op> { |
256 | // debug!(" new empty StackInfo"); | |
257 | 10.6k | let mrow = create_mathml_element(&doc, "mrow") ; |
258 | 10.6k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
259 | 10.6k | return StackInfo{ |
260 | 10.6k | mrow, |
261 | 10.6k | op_pair: OperatorPair{ ch: "\u{E000}", op: &LEFT_FENCEPOST }, |
262 | 10.6k | is_operand: false, |
263 | 10.6k | } |
264 | 10.6k | } |
265 | ||
266 | 10.9k | fn with_op<'d>(doc: &'d Document<'a>, node: Element<'a>, op_pair: OperatorPair<'op>) -> StackInfo<'a, 'op> { |
267 | // debug!(" new StackInfo with '{}' and operator {}/{}", name(node), show_invisible_op_char(op_pair.ch), op_pair.op.priority); | |
268 | 10.9k | let mrow = create_mathml_element(doc, "mrow"); |
269 | 10.9k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
270 | 10.9k | mrow.append_child(node); |
271 | 10.9k | return StackInfo { |
272 | 10.9k | mrow, |
273 | 10.9k | op_pair, |
274 | 10.9k | is_operand: false, |
275 | 10.9k | } |
276 | 10.9k | } |
277 | ||
278 | 37.2k | fn priority(&self) -> usize { |
279 | 37.2k | return self.op_pair.op.priority; |
280 | 37.2k | } |
281 | ||
282 | 37.1k | fn last_child_in_mrow(&self) -> Option<Element<'a>> { |
283 | 37.1k | let children = self.mrow.children(); |
284 | 37.1k | for & |
285 | 29.5k | let child = as_element(child); |
286 | 29.5k | if let Some( |
287 | 687 | && value == "empty_content" { |
288 | 0 | continue; |
289 | 29.5k | } |
290 | 29.5k | return Some(child); |
291 | } | |
292 | 7.58k | return None; |
293 | 37.1k | } |
294 | ||
295 | 57.6k | fn add_child_to_mrow(&mut self, child: Element<'a>, child_op: OperatorPair<'op>) { |
296 | // debug!(" adding '{}' to mrow[{}], operator '{}/{}'", | |
297 | // element_summary(child), self.mrow.children().len(), show_invisible_op_char(child_op.ch), child_op.op.priority); | |
298 | 57.6k | self.mrow.append_child(child); |
299 | 57.6k | if ptr_eq(child_op.op, &ILLEGAL_OPERATOR_INFO) { |
300 | 36.8k | assert!(!self.is_operand); // should not have two operands in a row (ok to add whitespace) |
301 | 36.8k | self.is_operand = true; |
302 | 20.7k | } else { |
303 | 20.7k | self.op_pair = child_op; |
304 | 20.7k | self.is_operand = false; |
305 | 20.7k | } |
306 | 57.6k | } |
307 | ||
308 | 18.4k | fn remove_last_operand_from_mrow(&mut self) -> Element<'a> { |
309 | 18.4k | let children = self.mrow.children(); |
310 | 18.4k | assert!( !children.is_empty() ); |
311 | 18.4k | assert!( self.is_operand || |
312 | 18.4k | self.is_operand = false; |
313 | 18.4k | let last_operand = as_element(children[children.len()-1]); |
314 | // debug!(" Removing last element '{}' from mrow[{}]",element_summary(last_operand), children.len()); | |
315 | 18.4k | last_operand.remove_from_parent(); |
316 | 18.4k | return last_operand; |
317 | 18.4k | } |
318 | ||
319 | } | |
320 | ||
321 | ||
322 | 117k | pub fn create_mathml_element<'a>(doc: &Document<'a>, name: &str) -> Element<'a> { |
323 | 117k | return doc.create_element(sxd_document::QName::with_namespace_uri( |
324 | 117k | Some("http://www.w3.org/1998/Math/MathML"), |
325 | 117k | name)); |
326 | 117k | } |
327 | ||
328 | 4.84k | pub fn is_fence(mo: Element) -> bool { |
329 | 4.84k | return CanonicalizeContext::find_operator(None, mo, None, None, None).is_fence(); |
330 | 4.84k | } |
331 | ||
332 | 664 | pub fn is_relational_op(mo: Element) -> bool { |
333 | 664 | return CanonicalizeContext::find_operator(None, mo, None, None, None).priority == *EQUAL_PRIORITY; |
334 | 664 | } |
335 | ||
336 | 113k | pub fn set_mathml_name(element: Element, new_name: &str) { |
337 | 113k | element.set_name(QName::with_namespace_uri(Some("http://www.w3.org/1998/Math/MathML"), new_name)); |
338 | 113k | } |
339 | ||
340 | /// Replace 'mathml' in the parent (must exist since this only happens for leaves) with the 'replacements' (new children). | |
341 | /// This handles adding mrows if needed. | |
342 | /// | |
343 | /// Returns first replacement | |
344 | 2.47k | pub fn replace_children<'a>(mathml: Element<'a>, replacements: Vec<Element<'a>>) -> Element<'a> { |
345 | 2.47k | let parent = get_parent(mathml); |
346 | 2.47k | let parent_name = name(parent); |
347 | // debug!("\nreplace_children: mathml\n{}", mml_to_string(mathml)); | |
348 | // debug!("replace_children: parent before replace\n{}", mml_to_string(parent)); | |
349 | // debug!("{} replacements:\n{}", replacements.len(), replacements.iter().map(|e| mml_to_string(e)).collect::<Vec<String>>().join("\n")); | |
350 | 2.47k | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(parent_name) || |
351 | 2.44k | parent_name == "mmultiscripts" { // each child acts like the parent has a fixed number of children |
352 | // gather up the preceding/following siblings before mucking with the tree structure (mrow.append_children below) | |
353 | 32 | let mut new_children = mathml.preceding_siblings(); |
354 | 32 | let mut following_siblings = mathml.following_siblings(); |
355 | ||
356 | // debug!("\nreplace_children: mathml\n{}", mml_to_string(mathml)); | |
357 | // debug!("replace_children: parent before replace\n{}", mml_to_string(parent)); | |
358 | // wrap an mrow around the replacements and then replace 'mathml' with that | |
359 | 32 | let mrow = create_mathml_element(&mathml.document(), "mrow"); |
360 | 32 | add_attrs(mrow, &replacements[0].attributes()); |
361 | 32 | mrow.append_children(replacements); |
362 | 32 | new_children.push(ChildOfElement::Element(mrow)); |
363 | 32 | new_children.append(&mut following_siblings); |
364 | 32 | parent.replace_children(new_children); |
365 | // debug!("replace_children parent after: parent\n{}", mml_to_string(parent)); | |
366 | // debug!("replace_children: returned mrow\n{}", mml_to_string(mrow)); | |
367 | 32 | return mrow; |
368 | } else { | |
369 | // replace the children of the parent with 'replacements' inserted in place of 'mathml' | |
370 | 2.44k | let mut new_children = mathml.preceding_siblings(); |
371 | 2.44k | let i_first_new_child = new_children.len(); |
372 | 6.54k | let |
373 | 2.44k | new_children.append(&mut replacements); |
374 | 2.44k | new_children.append(&mut mathml.following_siblings()); |
375 | 2.44k | parent.replace_children(new_children); |
376 | // debug!("replace_children: (will return child[{}]) parent after replace\n{}", i_first_new_child, mml_to_string(parent)); | |
377 | 2.44k | return as_element(parent.children()[i_first_new_child]); |
378 | } | |
379 | 2.47k | } |
380 | ||
381 | // returns the presentation element of a "semantics" element | |
382 | 22 | pub fn get_presentation_element(element: Element) -> (usize, Element) { |
383 | 22 | assert_eq!(name(element), "semantics"); |
384 | 22 | let children = element.children(); |
385 | 22 | if let Some( ( |
386 | 48 | if let Some( |
387 | 46 | encoding == "MathML-Presentation" |
388 | } else { | |
389 | 2 | false |
390 | 48 | }) |
391 | { | |
392 | 20 | let presentation_annotation = as_element(*child); |
393 | // debug!("get_presentation_element:\n{}", mml_to_string(presentation_annotation)); | |
394 | 20 | assert_eq!(presentation_annotation.children().len(), 1); |
395 | 20 | return (i, as_element(presentation_annotation.children()[0])); |
396 | } else { | |
397 | 2 | return (0, as_element(children[0])); |
398 | } | |
399 | 22 | } |
400 | ||
401 | /// Canonicalize does several things: | |
402 | /// 1. cleans up the tree so all extra white space is removed (should only have element and text nodes) | |
403 | /// 2. normalize the characters | |
404 | /// 3. clean up "bad" MathML based on known output from some converters (TODO: still a work in progress) | |
405 | /// 4. the tree is "parsed" based on the mo (priority)/mi/mn's in an mrow | |
406 | /// * this adds mrows and some invisible operators (implied times, function app, ...) | |
407 | /// * extra mrows are removed | |
408 | /// * implicit mrows are turned into explicit mrows (e.g, there will be a single child of 'math') | |
409 | /// | |
410 | /// Canonicalize is pretty conservative in adding new mrows and won't do it if: | |
411 | /// * there is an intent attr | |
412 | /// * if the mrow starts and ends with a fence (e.g, French open interval "]0,1[") | |
413 | /// | |
414 | /// An mrow is never deleted unless it is redundant. | |
415 | /// | |
416 | /// Whitespace handling: | |
417 | /// Whitespace complicates parsing and also pattern matching (e.g., is it a mixed number which tests for a number preceding a fraction) | |
418 | /// The first attempt which mostly worked was to shove whitespace into adjacent mi/mn/mtext. That has a problem with distinguish different uses for whitespace | |
419 | /// The second attempt was to leave it in the parse and make it an mo when appropriate, but there were some cases where it should be prefix and wasn't caught | |
420 | /// The third attempt (and the current one) is to make it an attribute on adjacent elements. | |
421 | /// This preserves the data-width attr (with new name) added in the second attempt that helps resolve whether something is tweaking, a real space, or an omission. | |
422 | /// It adds data-previous-space-width/data-following-space-width with values to indicate with the space was on the left or right (typically it placed on the previous token because that's easier) | |
423 | 5.06k | pub fn canonicalize(mathml: Element) -> Result<Element> { |
424 | 5.06k | let context = CanonicalizeContext::new(); |
425 | 5.06k | return context.canonicalize(mathml); |
426 | 5.06k | } |
427 | ||
428 | #[derive(Debug, PartialEq)] | |
429 | enum FunctionNameCertainty { | |
430 | True, | |
431 | Maybe, | |
432 | False | |
433 | } | |
434 | ||
435 | ||
436 | static ELEMENTS_WITH_ONE_CHILD: phf::Set<&str> = phf_set! { | |
437 | "math", "msqrt", "merror", "mpadded", "mphantom", "menclose", "mtd", "mscarry" | |
438 | }; | |
439 | ||
440 | static ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN: phf::Set<&str> = phf_set! { | |
441 | "mfrac", "mroot", "msub", "msup", "msubsup","munder", "mover", "munderover" | |
442 | }; | |
443 | ||
444 | static EMPTY_ELEMENTS: phf::Set<&str> = phf_set! { | |
445 | "mspace", "none", "mprescripts", "mglyph", "malignmark", "maligngroup", "msline", | |
446 | }; | |
447 | ||
448 | // turns out Roman Numerals tests aren't needed, but we do want to block VII from being a chemical match | |
449 | // two cases because we don't want to have a match for 'Cl', etc. | |
450 | 3 | static UPPER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s*$").unwrap()); |
451 | 3 | static LOWER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s*$").unwrap()); |
452 | ||
453 | ||
454 | struct CanonicalizeContextPatterns { | |
455 | decimal_separator: Regex, | |
456 | block_separator: Regex, | |
457 | digit_only_decimal_number: Regex, | |
458 | block_3digit_pattern: Regex, | |
459 | block_3_5digit_pattern: Regex, | |
460 | block_4digit_hex_pattern: Regex, | |
461 | block_1digit_pattern: Regex, // used when generator puts each digit into a single mn | |
462 | } | |
463 | ||
464 | impl CanonicalizeContextPatterns { | |
465 | 4.10k | fn new(block_separator_pref: &str, decimal_separator_pref: &str) -> CanonicalizeContextPatterns { |
466 | 4.10k | let block_separator = Regex::new(&format!("[{}]", regex::escape(block_separator_pref))).unwrap(); |
467 | 4.10k | let decimal_separator = Regex::new(&format!("[{}]", regex::escape(decimal_separator_pref))).unwrap(); |
468 | // allows just "." and also matches an empty string, but those are ruled out elsewhere | |
469 | 4.10k | let digit_only_decimal_number = Regex::new(&format!(r"^\d*{}?\d*$", regex::escape(decimal_separator_pref))).unwrap(); |
470 | 4.10k | let block_3digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 3); |
471 | 4.10k | let block_3_5digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 5); |
472 | // Note: on en.wikipedia.org/wiki/Decimal_separator, show '3.14159 26535 89793 23846' | |
473 | 4.10k | let block_4digit_hex_pattern = Regex::new(r"^[0-9a-fA-F]{4}([ \u00A0\u202F][0-9a-fA-F]{4})*$").unwrap(); |
474 | 4.10k | let block_1digit_pattern = Regex::new(r"^((\d(\uFFFF\d)?)(\d([, \u00A0\u202F]\d){2})*)?([\.](\d(\uFFFF\d)*)?)?$").unwrap(); |
475 | ||
476 | 4.10k | return CanonicalizeContextPatterns { |
477 | 4.10k | block_separator, |
478 | 4.10k | decimal_separator, |
479 | 4.10k | digit_only_decimal_number, |
480 | 4.10k | block_3digit_pattern, |
481 | 4.10k | block_3_5digit_pattern, |
482 | 4.10k | block_4digit_hex_pattern, |
483 | 4.10k | block_1digit_pattern |
484 | 4.10k | }; |
485 | ||
486 | ||
487 | 8.21k | fn get_number_pattern_regex(block_separator: &str, decimal_separator: &str, n_sep_before: usize, n_sep_after: usize) -> Regex { |
488 | // the following is a generalization of a regex like ^(\d*|\d{1,3}([, ]?\d{3})*)(\.(\d*|(\d{3}[, ])*\d{1,3}))?$ | |
489 | // that matches something like '1 234.567 8' and '1,234.', but not '1,234.12,34 | |
490 | 8.21k | return Regex::new(&format!(r"^(\d*|\d{{1,{}}}([{}]?\d{{{}}})*)([{}](\d*|(\d{{{}}}[{}])*\d{{1,{}}}))?$", |
491 | 8.21k | n_sep_before, regex::escape(block_separator), n_sep_before, regex::escape(decimal_separator), |
492 | 8.21k | n_sep_after, regex::escape(block_separator), n_sep_after) ).unwrap(); |
493 | 8.21k | } |
494 | 4.10k | } |
495 | } | |
496 | ||
497 | /// Profiling showed that creating new contexts was very time consuming because creating the RegExs is very expensive | |
498 | /// Profiling set_mathml (which does the canonicalization) spends 65% of the time in Regex::new, of which half of it is spent in this initialization. | |
499 | struct CanonicalizeContextPatternsCache { | |
500 | block_separator_pref: String, | |
501 | decimal_separator_pref: String, | |
502 | patterns: Rc<CanonicalizeContextPatterns>, | |
503 | } | |
504 | ||
505 | thread_local!{ | |
506 | static PATTERN_CACHE: RefCell<CanonicalizeContextPatternsCache> = RefCell::new(CanonicalizeContextPatternsCache::new()); | |
507 | } | |
508 | ||
509 | impl CanonicalizeContextPatternsCache { | |
510 | 4.10k | fn new() -> CanonicalizeContextPatternsCache { |
511 | 4.10k | let pref_manager = crate::prefs::PreferenceManager::get(); |
512 | 4.10k | let pref_manager = pref_manager.borrow(); |
513 | 4.10k | let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); |
514 | 4.10k | let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); |
515 | 4.10k | return CanonicalizeContextPatternsCache { |
516 | 4.10k | patterns: Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ), |
517 | 4.10k | block_separator_pref, |
518 | 4.10k | decimal_separator_pref |
519 | 4.10k | } |
520 | 4.10k | } |
521 | ||
522 | 5.06k | fn get() -> Rc<CanonicalizeContextPatterns> { |
523 | 5.06k | return PATTERN_CACHE.with( |cache| { |
524 | 5.06k | let pref_manager_rc = crate::prefs::PreferenceManager::get(); |
525 | 5.06k | let pref_manager = pref_manager_rc.borrow(); |
526 | 5.06k | let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); |
527 | 5.06k | let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); |
528 | ||
529 | 5.06k | let mut cache = cache.borrow_mut(); |
530 | 5.06k | if block_separator_pref != cache.block_separator_pref || decimal_separator_pref != cache.decimal_separator_pref { |
531 | 0 | // update the cache |
532 | 0 | cache.patterns = Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ); |
533 | 0 | cache.block_separator_pref = block_separator_pref; |
534 | 0 | cache.decimal_separator_pref = decimal_separator_pref; |
535 | 5.06k | } |
536 | 5.06k | return cache.patterns.clone(); |
537 | 5.06k | }) |
538 | 5.06k | } |
539 | } | |
540 | ||
541 | struct CanonicalizeContext { | |
542 | patterns: Rc<CanonicalizeContextPatterns>, | |
543 | } | |
544 | ||
545 | ||
546 | impl CanonicalizeContext { | |
547 | 5.06k | fn new() -> CanonicalizeContext { |
548 | 5.06k | return CanonicalizeContext { |
549 | 5.06k | patterns: CanonicalizeContextPatternsCache::get(), |
550 | 5.06k | }; |
551 | 5.06k | } |
552 | ||
553 | 5.06k | fn canonicalize<'a>(&self, mut mathml: Element<'a>) -> Result<Element<'a>> { |
554 | // debug!("MathML before canonicalize:\n{}", mml_to_string(mathml)); | |
555 | ||
556 | 5.06k | if name(mathml) != "math" { |
557 | 0 | // debug!("Didn't start with <math> element -- attempting repair"); |
558 | 0 | let math_element = create_mathml_element(&mathml.document(), "math"); |
559 | 0 | math_element.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
560 | 0 | math_element.append_child(mathml); |
561 | 0 | let root = math_element.document().root(); |
562 | 0 | root.clear_children(); |
563 | 0 | root.append_child(math_element); |
564 | 0 | mathml = root.children()[0].element().unwrap(); |
565 | 5.06k | } |
566 | 5.06k | CanonicalizeContext::assure_mathml(mathml, 0) |
567 | 5.05k | let mathml = self.clean_mathml(mathml).unwrap(); // 'math' is never removed |
568 | 5.05k | self.assure_nary_tag_has_one_child(mathml); |
569 | // debug!("Not chemistry -- retry:\n{}", mml_to_string(mathml)); | |
570 | 5.05k | let mut converted_mathml = self.canonicalize_mrows(mathml) |
571 | 5.05k | .with_context(|| |
572 | // debug!("canonicalize before canonicalize_mrows:\n{}", mml_to_string(converted_mathml)); | |
573 | 5.05k | if !crate::chemistry::scan_and_mark_chemistry(converted_mathml) { |
574 | 869 | self.assure_nary_tag_has_one_child(converted_mathml); |
575 | 869 | converted_mathml = self.canonicalize_mrows(mathml) |
576 | 869 | .with_context(|| |
577 | 4.18k | } |
578 | 5.05k | debug!("\nMathML after canonicalize:\n{}", |
579 | 5.05k | return Ok(converted_mathml); |
580 | 5.06k | } |
581 | ||
582 | /// Make sure there is exactly one child | |
583 | 19.1k | fn assure_nary_tag_has_one_child(&self, mathml: Element) { |
584 | 19.1k | let children = mathml.children(); |
585 | 19.1k | if !ELEMENTS_WITH_ONE_CHILD.contains(name(mathml)) { |
586 | 6.43k | return; |
587 | 12.7k | } |
588 | ||
589 | 12.7k | if children.is_empty() { |
590 | 3 | // make sure there is content |
591 | 3 | let child = CanonicalizeContext::create_empty_element(&mathml.document()); |
592 | 3 | mathml.append_child(child); |
593 | 12.7k | } else if children.len() > 1 { |
594 | 2.34k | // wrap the children in an mrow |
595 | 2.34k | let mrow = create_mathml_element(&mathml.document(), "mrow"); |
596 | 2.34k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
597 | 2.34k | mrow.append_children(children); |
598 | 2.34k | mathml.replace_children(vec![ChildOfElement::Element(mrow)]); |
599 | 10.3k | } |
600 | 19.1k | } |
601 | ||
602 | /// Return an error if some element is not MathML (only look at first child of <semantics>) or if it has the wrong number of children | |
603 | 52.9k | fn assure_mathml(mathml: Element, depth: usize) -> Result<()> { |
604 | 52.9k | if depth > crate::interface::MAX_DEPTH { |
605 | 1 | bail!("MathML is too deeply nested to process"); |
606 | 52.9k | } |
607 | 52.9k | let n_children = mathml.children().len(); |
608 | 52.9k | let element_name = name(mathml); |
609 | 52.9k | if is_leaf(mathml) { |
610 | 33.1k | if EMPTY_ELEMENTS.contains(element_name) { |
611 | 464 | if n_children != 0 { |
612 | 0 | bail!("{} should only have one child:\n{}", element_name, mml_to_string(mathml)); |
613 | 464 | } |
614 | 32.7k | } else if element_name == "annotation" { |
615 | 0 | bail!("'annotation' element is not child of 'semantics' element"); |
616 | 32.7k | } else if (n_children == 1 && |
617 | 32.7k | return Ok( () ); |
618 | } else { | |
619 | 0 | bail!("Not a valid MathML leaf element:\n{}", mml_to_string(mathml)); |
620 | }; | |
621 | 19.7k | } |
622 | ||
623 | 20.2k | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(element_name) { |
624 | 3.90k | match element_name { |
625 | 3.90k | "munderover" | |
626 | 0 | bail!("{} should have 3 children:\n{}", element_name, mml_to_string(mathml)); |
627 | 277 | }, |
628 | 3.62k | _ => if n_children != 2 { |
629 | 0 | bail!("{} should have 2 children:\n{}", element_name, mml_to_string(mathml)); |
630 | 3.62k | }, |
631 | } | |
632 | 16.3k | } else if |
633 | 2.18k | let parent_name = name(get_parent(mathml)); |
634 | 2.18k | if (element_name == "mtr" || |
635 | 0 | bail!("Illegal MathML: {} is not a child of mtable. Parent is {}", element_name, mml_to_string(get_parent(mathml))); |
636 | 2.18k | } else if element_name == "mtd" && !( |
637 | 1 | bail!("Illegal MathML: mtd is not a child of {}. Parent is {}", parent_name, mml_to_string(get_parent(mathml))); |
638 | 2.17k | } |
639 | } | |
640 | 14.1k | else if element_name == "mmultiscripts" { |
641 | 182 | let has_prescripts = mathml.children().iter() |
642 | 649 | . |
643 | 182 | if has_prescripts ^ (n_children.is_multiple_of(2)) { |
644 | 1 | bail!("{} has the wrong number of children:\n{}", element_name, mml_to_string(mathml)); |
645 | 181 | } |
646 | 13.9k | } else if element_name == "mlongdiv" { |
647 | 0 | if n_children < 3 { |
648 | 0 | bail!("{} should have at least 3 children:\n{}", element_name, mml_to_string(mathml)); |
649 | 0 | } |
650 | 13.9k | } else if element_name == "semantics" { |
651 | 11 | let children = mathml.children(); |
652 | 11 | if children.is_empty() { |
653 | 0 | return Ok( () ); |
654 | } else { | |
655 | 11 | let (i_presentation, presentation_element) = get_presentation_element(mathml); |
656 | // make sure only 'annotation' and 'annotation-xml' elements are children of the non-presentation element | |
657 | 24 | for (i, child) in |
658 | 24 | if i != i_presentation { |
659 | 13 | let child = as_element(*child); |
660 | 13 | if name(child)!="annotation" && |
661 | 0 | bail!("Illegal MathML: {} is child of 'semantic'", name(child)); |
662 | 13 | } |
663 | 11 | } |
664 | } | |
665 | 11 | return CanonicalizeContext::assure_mathml(presentation_element, depth + 1); |
666 | } | |
667 | 13.9k | } else if !IsNode::is_mathml(mathml) { |
668 | 1 | if element_name == "annotation-xml" { |
669 | 0 | bail!("'annotation-xml' element is not child of 'semantics' element"); |
670 | } else { | |
671 | 1 | bail!("'{}' is not a valid MathML element", element_name); |
672 | } | |
673 | 13.9k | } |
674 | ||
675 | // valid MathML element and not a leaf -- check the children | |
676 | 47.8k | for child in |
677 | 47.8k | CanonicalizeContext::assure_mathml( as_element(child), depth + 1) |
678 | } | |
679 | 19.6k | return Ok( () ); |
680 | 52.9k | } |
681 | ||
682 | 283 | fn make_empty_element(mathml: Element) -> Element { |
683 | 283 | set_mathml_name(mathml, "mtext"); |
684 | 283 | mathml.clear_children(); |
685 | 283 | mathml.set_text("\u{00A0}"); |
686 | 283 | mathml.set_attribute_value("data-changed", "empty_content"); |
687 | 283 | mathml.set_attribute_value("data-width", "0"); |
688 | 283 | return mathml; |
689 | 283 | } |
690 | ||
691 | 24 | fn create_empty_element<'a>(doc: &Document<'a>) -> Element<'a> { |
692 | 24 | let mtext = create_mathml_element(doc, "mtext"); |
693 | 24 | mtext.set_text("\u{00A0}"); |
694 | 24 | mtext.set_attribute_value("data-added", "missing-content"); |
695 | 24 | mtext.set_attribute_value("data-width", "0"); |
696 | 24 | return mtext; |
697 | 24 | } |
698 | ||
699 | 11.5k | fn is_empty_element(el: Element) -> bool { |
700 | 11.5k | return (is_leaf(el) && |
701 | 11.0k | (name(el) == "mrow" && |
702 | 11.5k | } |
703 | ||
704 | ||
705 | // this should only be called for 2D elements | |
706 | 4.48k | fn mark_empty_content(two_d_element: Element) { |
707 | 7.32k | for child in |
708 | 7.32k | let child = as_element(child); |
709 | 7.32k | if CanonicalizeContext::is_empty_element(child) { |
710 | 20 | child.set_attribute_value(EMPTY_IN_2D, "true"); |
711 | 7.30k | } |
712 | } | |
713 | 4.48k | } |
714 | ||
715 | /// Turn leaf into an 'mn' and set attributes appropriately | |
716 | 34 | fn make_roman_numeral(leaf: Element) { |
717 | 34 | assert!(is_leaf(leaf)); |
718 | 34 | set_mathml_name(leaf, "mn"); |
719 | 34 | leaf.set_attribute_value("data-roman-numeral", "true"); // mark for easy detection |
720 | 34 | let as_number = match as_text(leaf).parse::<RomanNumeral>() { |
721 | 34 | Ok(roman) => roman.as_u16().to_string(), |
722 | 0 | Err(_) => as_text(leaf).to_string(), |
723 | }; | |
724 | 34 | leaf.set_attribute_value("data-number", &as_number); |
725 | 34 | } |
726 | ||
727 | /// most of the time it is ok to merge the mrow with its singleton child, but there are some exceptions: | |
728 | /// mrow has 'intent' -- this might reference the child and you aren't allowed to self reference | |
729 | 2.82k | fn is_ok_to_merge_mrow_child(mrow: Element) -> bool { |
730 | 2.82k | assert_eq!(name(mrow), "mrow"); |
731 | 2.82k | assert!(mrow.children().len() == 1); |
732 | 2.82k | return mrow.attribute(INTENT_ATTR).is_none(); // could check if child is referenced, but that's a chunk of code |
733 | 2.82k | } |
734 | ||
735 | /// This function does some cleanup of MathML (mostly fixing bad MathML) | |
736 | /// Unlike the main canonicalization routine, significant tree changes happen here | |
737 | /// Changes to "good" MathML: | |
738 | /// 1. mfenced -> mrow, a => mrow | |
739 | /// 2. mspace and mtext with only whitespace are canonicalized to a non-breaking space and merged in with | |
740 | /// an adjacent non-mo element unless in a required element position (need to keep for braille) | |
741 | /// | |
742 | /// Note: mspace that is potentially part of a number that was split apart is merged into a number as a single space char | |
743 | /// | |
744 | /// mstyle, mpadded, and mphantom, malignmark, maligngroup are removed (but children might be kept) | |
745 | /// | |
746 | /// Significant changes are made cleaning up empty bases of scripts, looking for chemistry, merging numbers with commas, | |
747 | /// "arg trig" functions, pseudo scripts, and others | |
748 | /// | |
749 | /// Returns 'None' if the element should not be in the tree. | |
750 | 52.3k | fn clean_mathml<'a>(&self, mathml: Element<'a>) -> Option<Element<'a>> { |
751 | // Note: this works bottom-up (clean the children first, then this element) | |
752 | 3 | static IS_PRIME: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"['′″‴⁗]").unwrap()); |
753 | ||
754 | // Note: including intervening spaces in what is likely a symbol of omission preserves any notion of separate digits (e.g., "_ _ _") | |
755 | 3 | static IS_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[_\u{00A0}]+$").unwrap()); |
756 | ||
757 | ||
758 | 23.5k | fn is_currency_symbol(ch: char) -> bool { |
759 | 23.5k |
|
760 | 23.5k | } |
761 | ||
762 | 20.0k | fn contains_currency(s: &str) -> bool { |
763 | 20.0k | s.chars().any(is_currency_symbol) |
764 | 20.0k | } |
765 | ||
766 | // begin by cleaning up empty elements | |
767 | // debug!("clean_mathml\n{}", mml_to_string(mathml)); | |
768 | 52.3k | let element_name = name(mathml); |
769 | 52.3k | let parent_name = if element_name == "math" { |
770 | 5.09k | "math".to_string() |
771 | } else { | |
772 | 47.2k | let parent = get_parent(mathml); |
773 | 47.2k | name(parent).to_string() |
774 | }; | |
775 | 52.3k | let parent_requires_child = ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(&parent_name) || |
776 | 44.0k |
|
777 | 41.8k | parent_name == "mmultiscripts"; |
778 | ||
779 | // handle empty leaves -- leaving it empty causes problems with the speech rules | |
780 | 52.3k | if is_leaf(mathml) && |
781 | 32 | return if parent_requires_child { |
782 | 52.3k | }; |
783 | ||
784 | 52.3k | if mathml.children().is_empty() && |
785 | 158 | if element_name == "mrow" && |
786 | // if it is an empty mrow that doesn't need to be there, get rid of it. Otherwise, replace it with an mtext | |
787 | 142 | if parent_name == "mmultiscripts" && |
788 | // MathML Core dropped "none" in favor of <mrow/>, but MathCAT is written with <none/> | |
789 | // Do substitutions for the scripts, not the base | |
790 | 4 | set_mathml_name(mathml, "none"); |
791 | 4 | return Some(mathml); |
792 | 138 | } |
793 | 138 | if parent_requires_child { |
794 | 14 | return Some( CanonicalizeContext::make_empty_element(mathml) ); |
795 | } else { | |
796 | 124 | return None; |
797 | } | |
798 | 16 | } else { |
799 | 16 | // create some content so that speech rules don't require special cases |
800 | 16 | let mtext = CanonicalizeContext::create_empty_element(&mathml.document()); |
801 | 16 | mathml.append_child(mtext); |
802 | 16 | // return Some(mathml); |
803 | 16 | } |
804 | 52.1k | }; |
805 | ||
806 | 52.1k | match element_name { |
807 | 52.1k | "mn" => { |
808 | 9.08k | let text = as_text(mathml); |
809 | 9.08k | let mut chars = text.chars(); |
810 | 9.08k | let first_char = chars.next().unwrap(); // we have already made sure it is non-empty |
811 | 9.08k | if !text.trim().is_empty() && is_roman_number_match(text) { |
812 | 2 | // people tend to set them in a non-italic font and software makes that 'mtext' |
813 | 2 | CanonicalizeContext::make_roman_numeral(mathml); |
814 | 9.08k | } else if |
815 | 5 | let doc = mathml.document(); |
816 | 5 | let mo = create_mathml_element(&doc, "mo"); |
817 | 5 | let mn = create_mathml_element(&doc, "mn"); |
818 | 5 | mo.set_text("-"); |
819 | 5 | mn.set_text(&text[first_char.len_utf8()..]); |
820 | 5 | set_mathml_name(mathml, "mrow"); |
821 | 5 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
822 | 5 | mathml.replace_children([mo,mn]); |
823 | 9.08k | } else if contains_currency(text) && let Some( |
824 | 3 | return Some(result); |
825 | 9.07k | } |
826 | 9.08k | if let Some((idx, last_char)) = text.char_indices().next_back() { |
827 | // look for something like 12° | |
828 | 9.08k | if is_pseudo_script_char(last_char) { |
829 | 1 | let doc = mathml.document(); |
830 | 1 | let mn = create_mathml_element(&doc, "mn"); |
831 | 1 | let mo = create_mathml_element(&doc, "mo"); |
832 | 1 | mn.set_text(&text[..idx]); |
833 | 1 | mo.set_text(last_char.to_string().as_str()); |
834 | 1 | set_mathml_name(mathml, "msup"); |
835 | 1 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
836 | 1 | mathml.replace_children([mn, mo]); |
837 | 9.08k | } |
838 | 0 | } |
839 | 9.08k | return Some(mathml); |
840 | }, | |
841 | 43.0k | "ms" | |
842 | 3 | return Some(mathml); |
843 | }, | |
844 | 43.0k | "mi" => { |
845 | 11.6k | let text = as_text(mathml); |
846 | 11.6k | if !text.trim().is_empty() && is_roman_number_match(text) && |
847 | // people tend to set them in a non-italic font and software makes that 'mtext' | |
848 | 28 | CanonicalizeContext::make_roman_numeral(mathml); |
849 | 28 | return Some(mathml); |
850 | 11.5k | } |
851 | 11.5k | if let Some( |
852 | 1 | mathml.set_text(dash); |
853 | 1 | return Some(mathml); |
854 | 11.5k | } else if text.contains('_') { |
855 | // if left or right are an mo, leave as is. Otherwise convert to an mo. | |
856 | 6 | let preceding_siblings = mathml.preceding_siblings(); |
857 | 6 | let following_siblings = mathml.following_siblings(); |
858 | 6 | if preceding_siblings.is_empty() || following_siblings.is_empty() { |
859 | 4 | return Some(mathml); |
860 | 2 | } |
861 | 2 | if name(as_element(preceding_siblings[preceding_siblings.len()-1])) != "mo" && |
862 | 2 | name(as_element(following_siblings[0])) != "mo" { |
863 | 2 | set_mathml_name(mathml, "mo"); |
864 | 2 |
|
865 | 2 | return Some(mathml); |
866 | 11.5k | } else if OPERATORS.get(text).is_some() { |
867 | 118 | if let Some( |
868 | // if it is a unit, it might be seconds, minutes, feet, ... not an operator | |
869 | 88 | if intent_value.contains(":unit") { |
870 | 88 | return Some(mathml); |
871 | 0 | } |
872 | 30 | } |
873 | 30 | set_mathml_name(mathml, "mo"); |
874 | ||
875 | // For at least pandoc, ∇ is an 'mi' and it sometimes adds an invisible times -- remove them | |
876 | 30 | let op = OPERATORS.get(text).unwrap(); |
877 | 30 | let preceding_siblings = mathml.preceding_siblings(); |
878 | 30 | if (op.is_infix() || |
879 | 16 | !preceding_siblings.is_empty() && |
880 | 0 | as_element(preceding_siblings[0]).remove_from_parent(); |
881 | 30 | } |
882 | 30 | let following_siblings = mathml.following_siblings(); |
883 | 30 | if (op.is_infix() || |
884 | 27 | !following_siblings.is_empty() && CanonicalizeContext::is_invisible_char_element(as_element(following_siblings[0])) { |
885 | 0 | as_element(following_siblings[0]).remove_from_parent(); |
886 | 30 | } |
887 | 30 | return Some(mathml); |
888 | 11.4k | } else if let Some( |
889 | 1 | return Some(result); |
890 | 11.4k | } else if let Some( |
891 | 0 | return Some(result); |
892 | 11.4k | } else if IS_PRIME.is_match(text) { |
893 | 0 | let new_text = merge_prime_text(text); |
894 | 0 | mathml.set_text(&new_text); |
895 | 0 | return Some(mathml); |
896 | 11.4k | } else if text == "..." { |
897 | 1 | mathml.set_text("…"); |
898 | 1 | return Some(mathml); |
899 | 11.4k | } else if let Some( |
900 | 27 | return Some(result); |
901 | 11.4k | } else if let Some( |
902 | 11 | return Some(result); |
903 | } else { | |
904 | 11.4k | return Some(mathml); |
905 | }; | |
906 | }, | |
907 | 31.4k | "mtext" => { |
908 | // debug!("before merge_arc_trig: {}", mml_to_string(mathml)); | |
909 | ||
910 | 401 | if let Some( |
911 | 2 | return Some(result); |
912 | 399 | } else if let Some( |
913 | 11 | return Some(result); |
914 | 388 | } |
915 | ||
916 | 388 | let text = as_text(mathml); |
917 | 388 | if !text.trim().is_empty() && |
918 | // people tend to set them in a non-italic font and software makes that 'mtext' | |
919 | 4 | CanonicalizeContext::make_roman_numeral(mathml); |
920 | 4 | return Some(mathml); |
921 | 449 | } else if |
922 | 58 |
|
923 | 1 | mathml.set_name("mn"); |
924 | 1 | return Some(mathml); |
925 | 383 | } else if contains_currency(text) && let Some( |
926 | 0 | return Some(result); |
927 | 383 | } |
928 | // common bug: trig functions, lim, etc., should be mi | |
929 | 383 | if ["…", "⋯", "∞"].contains(&text) || |
930 | 383 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| |
931 | 383 | if let Some( |
932 | 382 | hashset.contains(text) |
933 | } else { | |
934 | 1 | false |
935 | 383 | } |
936 | ) { | |
937 | 6 | set_mathml_name(mathml, "mi"); |
938 | 6 | return Some(mathml); |
939 | 377 | } |
940 | ||
941 | // allow non-breaking whitespace to stay -- needed by braille | |
942 | 377 | if IS_WHITESPACE.is_match(text) { |
943 | // normalize to just a single non-breaking space | |
944 | 71 | mathml.set_attribute_value("data-width", &format!("{:.3}", white_space_em_width(text))); |
945 | 71 | mathml.set_text("\u{00A0}"); |
946 | 71 | return Some(mathml); |
947 | 306 | } else if let Some( |
948 | 2 | mathml.set_text(dash); |
949 | 304 | } else if OPERATORS.get(text).is_some() { |
950 | 11 | set_mathml_name(mathml, "mo"); |
951 | 11 | return Some(mathml); |
952 | 293 | } |
953 | 295 | return if parent_requires_child || |
954 | }, | |
955 | 31.0k | "mo" => { |
956 | // WIRIS editor puts non-breaking whitespace as standalone in 'mo' | |
957 | 11.2k | let text = as_text(mathml); |
958 | 11.2k | if !text.is_empty() && IS_WHITESPACE.is_match(text) { |
959 | // can't throw it out because it is needed by braille -- change to what it really is | |
960 | 78 | set_mathml_name(mathml, "mtext"); |
961 | 78 | mathml.set_attribute_value("data-width", &format!("{:.3}", white_space_em_width(text))); |
962 | 78 | mathml.set_text("\u{00A0}"); |
963 | 78 | mathml.set_attribute_value(CHANGED_ATTR, "data-was-mo"); |
964 | 78 | return Some(mathml); |
965 | } else { | |
966 | 11.1k | match text { |
967 | 11.1k | "arc" | "arc " | "arc " /* non-breaking space */ => { |
968 | 0 | if let Some(result) = merge_arc_trig(mathml) { |
969 | 0 | return Some(result); |
970 | 0 | } |
971 | }, | |
972 | 11.1k | "..." => |
973 | 11.1k | ":" => { |
974 | 94 | if is_ratio(mathml) { |
975 | 8 | mathml.set_text("∶"); // ratio U+2236 |
976 | 86 | } |
977 | 94 | return Some(mathml); |
978 | }, | |
979 | 11.0k | "::" => |
980 | 11.0k | "│" => |
981 | 11.0k | "|" | |
982 | 6 | return Some(result); |
983 | } else { | |
984 | 299 | return Some(mathml); |
985 | }, | |
986 | 10.7k | _ => (), |
987 | } | |
988 | } | |
989 | ||
990 | // common bug: trig functions, lim, etc., should be mi | |
991 | // same for ellipsis ("…") | |
992 | 10.7k | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
993 | 10.7k | if ["…", "⋯", "∞"].contains(&text) || |
994 | 10.7k | definitions.borrow().get_hashset("FunctionNames").unwrap().contains(text) || |
995 | 10.6k | definitions.borrow().get_hashset("GeometryShapes").unwrap().contains(text) { |
996 | 83 | set_mathml_name(mathml, "mi"); |
997 | 83 | return Some(mathml); |
998 | 10.6k | } |
999 | 10.6k | if IS_PRIME.is_match(text) { |
1000 | 66 | let new_text = merge_prime_text(text); |
1001 | 66 | mathml.set_text(&new_text); |
1002 | 66 | return Some(mathml); |
1003 | 10.5k | } |
1004 | 10.5k | if contains_currency(text) && let Some( |
1005 | 9 | return Some(result); |
1006 | 10.5k | } |
1007 | 10.5k | return Some(mathml); |
1008 | 10.7k | }); |
1009 | // note: chemistry test is done later as part of another phase of chemistry cleanup | |
1010 | }, | |
1011 | 19.8k | "mfenced" => {return |
1012 | 19.8k | "a" => { |
1013 | // convert 'a' into 'mrow' | |
1014 | 2 | set_mathml_name(mathml, "mrow"); |
1015 | 2 | return self.clean_mathml(mathml); |
1016 | } | |
1017 | 19.8k | "mstyle" | |
1018 | // Throw out mstyle and mpadded -- to do this, we need to avoid mstyle being the arg of clean_mathml | |
1019 | // FIX: should probably push the attrs down to the children (set in 'self') | |
1020 | 714 | merge_adjacent_similar_mstyles(mathml); |
1021 | 714 | let children = mathml.children(); |
1022 | 714 | if children.is_empty() { |
1023 | 0 | return if parent_requires_child {Some( CanonicalizeContext::make_empty_element(mathml) )} else {None}; |
1024 | 714 | } else if children.len() == 1 { |
1025 | 678 | let is_from_mhchem = element_name == "mpadded" && |
1026 | 678 | if let Some( |
1027 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1028 | 269 | mathml.replace_children(new_mathml.children()); |
1029 | 269 | set_mathml_name(mathml, name(new_mathml)); |
1030 | 269 | add_attrs(mathml, &new_mathml.attributes()); |
1031 | 269 | return Some(mathml); |
1032 | 409 | } else if parent_requires_child { |
1033 | // need a placeholder -- make it empty mtext | |
1034 | 31 | let empty = CanonicalizeContext::make_empty_element(mathml); |
1035 | 31 | if is_from_mhchem { |
1036 | 27 | empty.set_attribute_value(MHCHEM_MMULTISCRIPTS_HACK, "true"); |
1037 | 27 |
|
1038 | 31 | return Some(empty); |
1039 | } else { | |
1040 | 378 | return None; |
1041 | } | |
1042 | } else { | |
1043 | // wrap the children in an mrow, but maintain tree siblings by changing mpadded/mstyle to mrow | |
1044 | 36 | set_mathml_name(mathml, "mrow"); |
1045 | 36 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1046 | 36 | return self.clean_mathml(mathml); // now it's an mrow so a different path next time |
1047 | } | |
1048 | }, | |
1049 | 19.0k | "mphantom" | |
1050 | 364 | return if parent_requires_child { |
1051 | }, | |
1052 | 18.7k | "mspace" => { |
1053 | // need to hold onto space for braille | |
1054 | 224 | set_mathml_name(mathml, "mtext"); |
1055 | 224 | mathml.set_text("\u{00A0}"); |
1056 | 224 | mathml.set_attribute_value(CHANGED_ATTR, "was-mspace"); |
1057 | ||
1058 | // normalize width ems | |
1059 | 224 | let width = mathml.attribute_value("width").unwrap_or("0em"); |
1060 | 224 | let normalized_width = crate::xpath_functions::FontSizeGuess::em_from_value(width); |
1061 | 224 | mathml.set_attribute_value("data-width", &normalized_width.to_string()); |
1062 | 224 | return Some(mathml); |
1063 | }, | |
1064 | 18.5k | "semantics" => { |
1065 | // The semantics tag, like the style tag, can mess with pattern matching. | |
1066 | // However, it may be the case that having the annotations could aid in determining intent, so we want to keep them. | |
1067 | // The compromise is to move the annotations into an attr named data-annotation[-xml]-<encoding-name> | |
1068 | // The attribute is put on presentation element root | |
1069 | 11 | let presentation = get_presentation_element(mathml).1; |
1070 | 11 | let new_presentation = if let Some(presentation) = self.clean_mathml(presentation) { |
1071 | 11 | presentation |
1072 | } else { | |
1073 | // probably shouldn't happen, but just in case | |
1074 | 0 | CanonicalizeContext::create_empty_element(&mathml.document()) |
1075 | }; | |
1076 | 11 | set_annotation_attrs(new_presentation, mathml); |
1077 | 11 | return Some(new_presentation); |
1078 | }, | |
1079 | _ => { | |
1080 | 18.5k | let children = mathml.children(); |
1081 | 18.5k | if element_name == "mrow" { |
1082 | // handle special cases of empty mrows and mrows which just one element | |
1083 | 6.04k | if children.is_empty() && |
1084 | 0 | return if parent_requires_child {Some(mathml)} else {None}; |
1085 | 6.04k | } else if children.len() == 1 && |
1086 | 2.56k | let is_from_mhchem = is_from_mhchem_hack(mathml); |
1087 | 2.56k | if let Some( |
1088 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1089 | 1.95k | mathml.replace_children(new_mathml.children()); |
1090 | 1.95k | set_mathml_name(mathml, name(new_mathml)); |
1091 | 1.95k | add_attrs(mathml, &new_mathml.attributes()); |
1092 | 1.95k | return Some(mathml); |
1093 | 607 | } else if parent_requires_child { |
1094 | 234 | let empty = CanonicalizeContext::make_empty_element(mathml); |
1095 | 234 | if is_from_mhchem { |
1096 | 142 | empty.set_attribute_value(MHCHEM_MMULTISCRIPTS_HACK, "true"); |
1097 | 142 |
|
1098 | 234 | return Some(empty); |
1099 | } else { | |
1100 | 373 | return None; |
1101 | } | |
1102 | 3.48k | } |
1103 | 12.4k | } |
1104 | ||
1105 | // FIX: this should be setting children, not mathml | |
1106 | 15.9k | let mathml = if element_name == "mrow" || |
1107 | 12.4k | (children.len() > 1 && |
1108 | 5.90k | let merged = merge_dots(mathml); // FIX -- switch to passing in children |
1109 | 5.90k | let merged = merge_primes(merged); |
1110 | 5.90k | let merged = merge_degrees_C_F(merged); |
1111 | 5.90k | let merged = merge_chars(merged, &IS_UNDERSCORE); |
1112 | 5.90k | handle_pseudo_scripts(merged) |
1113 | } else { | |
1114 | 10.0k | mathml |
1115 | }; | |
1116 | ||
1117 | // cleaning children can add or delete subsequent children, so we need to constantly update the children (and mathml) | |
1118 | 15.9k | let mut children = mathml.children(); |
1119 | 15.9k | let mut i = 0; |
1120 | ||
1121 | 59.7k | while i < children.len() { |
1122 | 43.9k | if let Some(child) = children[i].element() { |
1123 | 43.9k | match self.clean_mathml(child) { |
1124 | 299 | None => { |
1125 | 299 | mathml.remove_child(child); |
1126 | 299 | // don't increment 'i' because there is one less child now and so everything shifted left |
1127 | 299 | }, |
1128 | 43.6k | Some(new_child) => { |
1129 | // debug!("new_child (i={})\n{}", i, mml_to_string(new_child)); | |
1130 | 43.6k | let new_child_name = name(new_child); |
1131 | 43.6k | children = mathml.children(); // clean_mathml(child) may have changed following siblings |
1132 | 43.6k | children[i] = ChildOfElement::Element(new_child); |
1133 | 43.6k | mathml.replace_children(children); |
1134 | 43.6k | if new_child_name == "mi" || |
1135 | 12.5k | // can't do this above in 'match' because this changes the tree and |
1136 | 12.5k | // lifting single element mrows messes with structure in a conflicting way |
1137 | 12.5k | // Note: if clean_chemistry_leaf() made changes, they don't need cleaning because they will be "ok" mi's |
1138 | 12.5k | clean_chemistry_leaf(as_element(mathml.children()[i])); |
1139 | 12.5k | } else { |
1140 | // If the attach call does something, children are inserted *before* child (i.e., into parent) | |
1141 | // We return the new start at the expense of re-cleaning the script | |
1142 | // This is needed because anything before the returned element will be lost | |
1143 | 31.0k | let start_of_change = attach_scripts_to_split_element(new_child); |
1144 | 31.0k | if name(start_of_change) == "mrow" { |
1145 | 3.43k | start_of_change.remove_attribute(MAYBE_CHEMISTRY); // was lifted, and not set -- remove and it will be computed later |
1146 | 27.6k | } |
1147 | // crate::canonicalize::assure_mathml(get_parent(start_of_change)).unwrap(); // FIX: find a recovery -- we're in deep trouble if this isn't true | |
1148 | 31.0k | if start_of_change != child { |
1149 | // debug!("clean_mathml: start_of_change != mathml -- mathml={}", mml_to_string(mathml)); | |
1150 | 49 | return self.clean_mathml(mathml); // restart cleaning |
1151 | 30.9k | } |
1152 | } | |
1153 | 43.5k | i += 1; |
1154 | } | |
1155 | } | |
1156 | 43.8k | children = mathml.children(); // 'children' moved above, so need new values |
1157 | 0 | } else { |
1158 | 0 | // bad mathml such as '<annotation-xml> </annotation-xml>' -- don't add to new_children |
1159 | 0 | i += 1; |
1160 | 0 | } |
1161 | } | |
1162 | ||
1163 | // could have deleted children so only one child remains -- need to lift it | |
1164 | 15.8k | if element_name == "mrow" && |
1165 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1166 | 108 | let child = as_element(children[0]); |
1167 | 108 | mathml.replace_children(child.children()); |
1168 | 108 | set_mathml_name(mathml, name(child)); |
1169 | 108 | add_attrs(mathml, &child.attributes()); |
1170 | 108 | return Some(mathml); // child has already been cleaned, so we can return |
1171 | 15.7k | } |
1172 | ||
1173 | 15.7k | if element_name == "mrow" || |
1174 | 10.1k | merge_number_blocks(self, mathml, &mut children); |
1175 | 10.1k | merge_whitespace(&mut children); |
1176 | 10.1k | merge_cross_or_dot_product_elements(&mut children); |
1177 | 10.1k | handle_convert_to_mmultiscripts(&mut children); |
1178 | 10.1k | } else if |
1179 | 3.48k | element_name == "msubsup" || |
1180 | 2.52k | if element_name != "mmultiscripts" { |
1181 | // mhchem emits some cases that boil down to a completely empty script -- see test mhchem_beta_decay | |
1182 | 2.33k | let mut is_empty_script = CanonicalizeContext::is_empty_element(as_element(children[0])) && |
1183 | 181 | CanonicalizeContext::is_empty_element(as_element(children[1])); |
1184 | 2.33k | if element_name == "msubsup" && |
1185 | 51 | is_empty_script = CanonicalizeContext::is_empty_element(as_element(children[2])); |
1186 | 2.28k | } |
1187 | 2.33k | if is_empty_script { |
1188 | 48 | if parent_requires_child { |
1189 | // need a placeholder -- make it empty mtext | |
1190 | 0 | return Some( as_element(children[0]) ); // pick one of the empty elements |
1191 | } else { | |
1192 | 48 | return None; |
1193 | } | |
1194 | 2.29k | } |
1195 | 185 | } |
1196 | 2.47k | let mathml = if element_name == "mmultiscripts" { |
1197 | 2.47k | if !is_chemistry_off(mathml) { |
1198 | 2.47k | let likely_chemistry = likely_adorned_chem_formula(mathml); |
1199 | // debug!("likely_chemistry={}, {}", likely_chemistry, mml_to_string(mathml)); | |
1200 | 2.47k | if likely_chemistry >= 0 { |
1201 | 553 | mathml.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
1202 | 1.92k | } |
1203 | 0 | } |
1204 | ||
1205 | 2.47k | if element_name == "msubsup" { |
1206 | 180 | return Some( clean_msubsup(mathml) ); |
1207 | } else { | |
1208 | 2.29k | return Some(mathml); |
1209 | } | |
1210 | 3.06k | } |
1211 | ||
1212 | 13.2k | mathml.replace_children(children); |
1213 | // debug!("clean_mathml: after loop\n{}", mml_to_string(mathml)); | |
1214 | 13.2k | if element_name == "mrow" || |
1215 | 10.1k | clean_chemistry_mrow(mathml); |
1216 | 10.1k |
|
1217 | 13.2k | self.assure_nary_tag_has_one_child(mathml); |
1218 | 13.2k | if crate::xpath_functions::IsNode::is_2D(mathml) { |
1219 | 4.48k | CanonicalizeContext::mark_empty_content(mathml); |
1220 | 8.77k | } |
1221 | ||
1222 | 13.2k | return Some(mathml); |
1223 | } | |
1224 | } | |
1225 | ||
1226 | /// Returns substitute text if hyphen sequence should be a short or long dash | |
1227 | 11.8k | fn canonicalize_dash(text: &str) -> Option<&str> { |
1228 | 11.8k | if text == "--" { |
1229 | 1 | return Some("—"); // U+2014 (em dash) |
1230 | 11.8k | } else if text == "---" || |
1231 | 2 | return Some("―"); // U+2015 (Horizontal bar) |
1232 | } else { | |
1233 | 11.8k | return None; |
1234 | } | |
1235 | 11.8k | } |
1236 | ||
1237 | 11 | fn set_annotation_attrs(new_presentation: Element, semantics: Element) { |
1238 | 24 | for child in |
1239 | 24 | let child = as_element(child); |
1240 | 24 | let child_name = name(child); |
1241 | 24 | if child == new_presentation { |
1242 | 1 | continue; |
1243 | 23 | } |
1244 | 23 | let attr_name = match child.attribute_value("encoding") { |
1245 | 23 | Some(encoding_name) => format!("data-{}-{}", child_name, encoding_name.replace('/', "_slash_")), |
1246 | 0 | None => format!("data-{child_name}"), // probably shouldn't happen |
1247 | }; | |
1248 | 23 | let attr_name = attr_name.as_str(); |
1249 | 23 | if child_name == "annotation" { |
1250 | 12 | new_presentation.set_attribute_value(attr_name, as_text(child)); |
1251 | 12 | } else { |
1252 | 11 | new_presentation.set_attribute_value(attr_name, &mml_to_string(child)); |
1253 | 11 | } |
1254 | } | |
1255 | ||
1256 | 11 | } |
1257 | ||
1258 | /// Hack to try and guess if a colon should be a ratio -- this affects parsing because of different precedences | |
1259 | /// It also guesses on the spacing after the colon and adds a space attr if it looks like set building or function mapping notation. | |
1260 | /// These conditions are really not well thought out and are just a first cut -- they do cause the braille tests to pass | |
1261 | /// If 'intent' is given, it must be intent='ratio' | |
1262 | /// 2. It must be infix and there is a proportion (∷) mo as a sibling, or | |
1263 | /// 3. It is the only mo and has numbers on each side | |
1264 | /// | |
1265 | /// Need to rule out field extensions "[K:F]" and trilinear coordinates "a:b:c" (Nemeth doesn't consider these to be ratios) | |
1266 | 94 | fn is_ratio(mathml: Element) -> bool { |
1267 | 94 | assert_eq!(name(mathml), "mo"); |
1268 | 94 | let parent = get_parent(mathml); // must exist |
1269 | 94 | if name(parent) != "mrow" && |
1270 | 0 | return false; |
1271 | 94 | } |
1272 | ||
1273 | 94 | if let Some( |
1274 | 1 | && (intent_value != "ratio" || |
1275 | 1 | return false; |
1276 | 93 | } |
1277 | ||
1278 | 93 | if let Some( |
1279 | 0 | && value == "PUNCT" { |
1280 | 0 | mathml.remove_attribute("data-mjx-texclass"); |
1281 | 0 | mathml.set_attribute_value(SPACE_AFTER, "true"); // signal to at least Nemeth rules that this is punctuation |
1282 | 93 | } |
1283 | ||
1284 | 93 | let preceding = mathml.preceding_siblings(); |
1285 | 93 | let following = mathml.following_siblings(); |
1286 | 93 | if preceding.is_empty() || |
1287 | 2 | return false; |
1288 | 91 | } |
1289 | 91 | let preceding_child = as_element( preceding[preceding.len()-1] ); |
1290 | 91 | let following_child = as_element(following[0]); |
1291 | 91 | if preceding.len() == 1 && |
1292 | 8 | following.len() == 1 && |
1293 | 2 | return true; |
1294 | 89 | } |
1295 | // only want one "∷" | |
1296 | 89 | let is_before = is_proportional_before_colon(preceding.iter().rev()); |
1297 | 89 | if let Some( |
1298 | 3 | && !is_before { |
1299 | 0 | return false; |
1300 | 89 | } |
1301 | 89 | let is_before = is_before.is_some(); // move this to true/false (found/not found) |
1302 | 89 | let is_after = is_proportional_before_colon(following.iter()); |
1303 | 89 | if let Some( |
1304 | 3 | && !is_after { |
1305 | 0 | return false; |
1306 | 89 | } |
1307 | 89 | let is_after = is_after.is_some(); // move this to true/false (found/not found) |
1308 | 89 | return is_before ^ is_after; |
1309 | ||
1310 | 178 | fn is_proportional_before_colon<'a>(siblings: impl Iterator<Item = &'a ChildOfElement<'a>>) -> Option<bool> { |
1311 | // unparsed, so we look at relative priorities to make sure the proportional operator is really the next operator | |
1312 | 3 | static PROPORTIONAL_PRIORITY: LazyLock<usize> = LazyLock::new(|| OPERATORS.get("∷").unwrap().priority); |
1313 | 461 | for sibling in |
1314 | 461 | let child = as_element(*sibling); |
1315 | 461 | if name(child) == "mo" { |
1316 | 203 | let text = as_text(child); |
1317 | 203 | match text { |
1318 | 203 | "∷" | |
1319 | 197 | "∶" => return |
1320 | _ => { | |
1321 | 197 | if let Some( |
1322 | 191 | && op.priority < *PROPORTIONAL_PRIORITY { |
1323 | 109 | return None; // no "∷" |
1324 | 88 | } |
1325 | }, | |
1326 | } | |
1327 | 258 | } |
1328 | } | |
1329 | 63 | return None; |
1330 | 178 | } |
1331 | 94 | } |
1332 | ||
1333 | ||
1334 | /// Returns true if it detects that this is likely coming from mhchem: | |
1335 | /// v3: msub/msup/msubsup with mpadded width=0/mphantom/mi=A) | |
1336 | /// v4: msub/msup/msubsup with mrow/mrow/mpadded width=0/mphantom/mi=A) | |
1337 | /// This should be called with 'mrow' being the outer mrow | |
1338 | 3.15k | fn is_from_mhchem_hack(mathml: Element) -> bool { |
1339 | 3.15k | assert!(name(mathml) == "mrow" || |
1340 | 3.15k | assert_eq!(mathml.children().len(), 1); |
1341 | 3.15k | let parent = get_parent(mathml); |
1342 | 3.15k | let parent_name = name(parent); |
1343 | 3.15k | if !(parent_name == "msub" || |
1344 | 2.56k | return false; |
1345 | 594 | } |
1346 | ||
1347 | 594 | let |
1348 | 545 | let mrow = as_element(mathml.children()[0]); |
1349 | 545 | if !(name(mrow) == "mrow" && |
1350 | 255 | return false; |
1351 | 290 | } |
1352 | 290 | let child = as_element(mrow.children()[0]); |
1353 | 290 | if name(child) != "mpadded" { |
1354 | 24 | return false; |
1355 | 266 | } |
1356 | 266 | child |
1357 | } else { | |
1358 | 49 | mathml |
1359 | }; | |
1360 | 315 | if let Some( |
1361 | 169 | if width != "0" { |
1362 | 0 | return false; |
1363 | 169 | } |
1364 | } else { | |
1365 | 146 | return false; |
1366 | } | |
1367 | ||
1368 | 169 | let mphantom = as_element(mpadded.children()[0]); |
1369 | 169 | if !(name(mphantom) == "mphantom" && mphantom.children().len() == 1) { |
1370 | 0 | return false; |
1371 | 169 | } |
1372 | ||
1373 | 169 | let child = as_element(mphantom.children()[0]); |
1374 | 169 | return name(child) == "mi" && as_text(child) == "A"; |
1375 | 3.15k | } |
1376 | ||
1377 | /// 'text' is potentially one of the many Unicode whitespace chars. Estimate the width in ems | |
1378 | 149 | fn white_space_em_width(text: &str) -> f64 { |
1379 | 149 | assert!(IS_WHITESPACE.is_match(text)); |
1380 | 149 | let mut width = 0.0; |
1381 | 163 | for ch in |
1382 | 163 | width += match ch { |
1383 | 137 | ' ' | '\u{00A0}' | '\u{1680}' | ' ' => 0.7, // space, non-breaking space, Ogham space mark, figure space |
1384 | 0 | ' ' | ' ' => 0.5, // en quad, en space |
1385 | 0 | ' ' | ' ' => 1.0, // em quad, em space |
1386 | 0 | ' ' => 1.0/3.0, // three per em space |
1387 | 0 | ' ' | ' ' => 0.25, // four per em space, punctuation space (wild guess) |
1388 | 22 | ' ' | ' ' => 3.0/18.0, // six per em space, thin space |
1389 | 0 | ' ' => 1.0/18.0, // hair space |
1390 | 0 | ' ' => 0.3, // narrow no-break space (half a regular space?) |
1391 | 4 | ' ' => 4.0/18.0, // medium math space |
1392 | 0 | ' ' => 1.5, // Ideographic Space |
1393 | 0 | _ => 0.7, // shouldn't happen |
1394 | } | |
1395 | } | |
1396 | 149 | return width; |
1397 | 149 | } |
1398 | ||
1399 | /// Splits the leaf element into chemical elements if needed | |
1400 | 12.5k | fn clean_chemistry_leaf(mathml: Element) -> Element { |
1401 | 12.5k | if !(is_chemistry_off(mathml) || mathml.attribute(MAYBE_CHEMISTRY).is_some()) { |
1402 | 12.3k | assert!(name(mathml)=="mi" || |
1403 | // this is a hack -- VII is more likely to be roman numeral than the molecule V I I so prevent that from happening | |
1404 | // FIX: come up with a less hacky way to prevent chem element misinterpretation | |
1405 | 12.3k | let text = as_text(mathml); |
1406 | 12.3k | if text.len() > 2 && |
1407 | 0 | return mathml; |
1408 | 12.3k | } |
1409 | 12.3k | if let Some( |
1410 | // children are already marked as chemical elements | |
1411 | 135 | let answer = replace_children(mathml, elements); |
1412 | 135 | if name(answer) == "mrow" { |
1413 | 29 | answer.remove_attribute(MAYBE_CHEMISTRY); // was lifted, and not set -- remove and it will be computed later |
1414 | 106 | } |
1415 | 135 | return answer; |
1416 | } else { | |
1417 | 12.1k | let likely_chemistry = likely_chem_element(mathml); |
1418 | 12.1k | if likely_chemistry >= 0 { |
1419 | 2.59k | mathml.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
1420 | 9.57k | } |
1421 | }; | |
1422 | 259 | } |
1423 | 12.4k | return mathml; |
1424 | 12.5k | } |
1425 | ||
1426 | ||
1427 | /// looks for pairs of (letter, pseudo-script) such as x' or p'q' all inside of a single token element | |
1428 | 11.4k | fn split_apart_pseudo_scripts<'a>(mi: Element<'a>) -> Option<Element<'a>> { |
1429 | 2 | static IS_DEGREES_C_OR_F: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[°º][CF]").unwrap()); |
1430 | ||
1431 | 11.4k | let text = as_text(mi); |
1432 | // debug!("split_apart_pseudo_scripts: start text=\"{text}\""); | |
1433 | 11.4k | if !text.chars().any(is_pseudo_script_char) || |
1434 | 11.4k | return None; |
1435 | 1 | } |
1436 | ||
1437 | 1 | let document = mi.document(); |
1438 | // create pairs of text | |
1439 | 1 | let chars = text.chars(); |
1440 | 1 | let next_chars = text.chars().skip(1); |
1441 | 1 | let result = chars.zip(next_chars).map(|(a, b)| |
1442 | 1 | if a.is_alphabetic() && is_pseudo_script_char(b) { |
1443 | // create msup | |
1444 | 1 | let base = create_mathml_element(&document, "mi"); |
1445 | 1 | base.set_text(&a.to_string()); |
1446 | 1 | let script = create_mathml_element(&document, "mo"); |
1447 | 1 | script.set_text(&b.to_string()); |
1448 | 1 | let msup = create_mathml_element(&document, "msup"); |
1449 | 1 | msup.append_child(base); |
1450 | 1 | msup.append_child(script); |
1451 | 1 | msup |
1452 | } else { | |
1453 | // create an mi "ab" | |
1454 | 0 | let new_mi = create_mathml_element(&document, "mi"); |
1455 | 0 | let mut new_mi_text = String::with_capacity(6); // likely will fit almost all cases |
1456 | 0 | new_mi_text.push(a); |
1457 | 0 | new_mi_text.push(b); |
1458 | 0 | new_mi.set_text(&new_mi_text); |
1459 | 0 | new_mi |
1460 | 1 | } ) |
1461 | 1 | .collect::<Vec<Element>>(); |
1462 | 1 | if result.len() == 1 { |
1463 | 1 | return Some( result[0] ); |
1464 | } else { | |
1465 | 0 | return Some( replace_children(mi, result) ); |
1466 | } | |
1467 | 11.4k | } |
1468 | ||
1469 | ||
1470 | /// If 'mathml' is a scripted element and has an mrow for a base, | |
1471 | /// attach any prescripts to the first element in mrow | |
1472 | /// attach any postscript to the last element in mrow | |
1473 | /// Return the modified element (which might now be an mrow) | |
1474 | 31.0k | fn attach_scripts_to_split_element(mathml: Element) -> Element { |
1475 | 31.0k | if !IsNode::is_scripted(mathml) { |
1476 | 28.5k | return mathml; |
1477 | 2.48k | } |
1478 | 2.48k | let base = as_element(mathml.children()[0]); |
1479 | 2.48k | if name(base) != "mrow" { |
1480 | 2.30k | return mathml; |
1481 | 185 | } |
1482 | 185 | let base_children = base.children(); |
1483 | 185 | let i_last_base = base_children.len()-1; |
1484 | 185 | let last_child = as_element(base_children[i_last_base]); |
1485 | 185 | if last_child.attribute(SPLIT_TOKEN).is_none() { |
1486 | 156 | return mathml; |
1487 | 29 | } |
1488 | // debug!("attach_scripts_to_split_element -- start: \n{}", mml_to_string(mathml)); | |
1489 | 29 | let mut mathml_replacement = Vec::with_capacity(base_children.len()); |
1490 | 29 | if name(mathml) == "mmultiscripts" { |
1491 | // pull any prescript (should be at most one prefix pair) into the first child | |
1492 | 1 | let multiscripts_children = mathml.children(); |
1493 | 1 | let n_multiscripts_children = multiscripts_children.len(); |
1494 | 1 | let potential_mprescripts_element = as_element(multiscripts_children[n_multiscripts_children-3]); |
1495 | 1 | if name(potential_mprescripts_element) == "mprescripts" { // we have potential chem prescripts |
1496 | // create a new mmultiscripts elements with first child as its base mathml's prescripts as the new element's prescripts | |
1497 | 1 | let mut new_mmultiscripts_children = Vec::with_capacity(4); |
1498 | 1 | new_mmultiscripts_children.push(base_children[0]); |
1499 | 1 | base.remove_child(as_element(base_children[0])); |
1500 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-3]); |
1501 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-2]); |
1502 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-1]); |
1503 | ||
1504 | 1 | let new_mmultiscripts = create_mathml_element(&base.document(), "mmultiscripts"); |
1505 | 1 | new_mmultiscripts.append_children(new_mmultiscripts_children); |
1506 | 1 | let likely = likely_adorned_chem_formula(new_mmultiscripts); |
1507 | 1 | new_mmultiscripts.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
1508 | // debug!("attach_scripts_to_split_element -- new_mmultiscripts: \n{}", mml_to_string(new_mmultiscripts)); | |
1509 | 1 | if n_multiscripts_children == 4 { |
1510 | // we stripped all the children so only the (modified) base exists | |
1511 | // create mrow(new_mmultiscripts, mathml[0]) | |
1512 | 0 | let children = vec![new_mmultiscripts, base]; |
1513 | 0 | return replace_children(mathml, children); |
1514 | 1 | } |
1515 | 1 | mathml_replacement.push(new_mmultiscripts); |
1516 | 0 | } |
1517 | 28 | } |
1518 | ||
1519 | // Add all the middle children of the base to the mrow | |
1520 | 34 |
|
1521 | ||
1522 | // create a new script element with last child as its base | |
1523 | 29 | let mut new_mathml_children = mathml.children(); |
1524 | 29 | new_mathml_children[0] = ChildOfElement::Element(base); |
1525 | 29 | mathml.replace_children(new_mathml_children); |
1526 | 29 | mathml_replacement.push(mathml); |
1527 | // debug!("attach_scripts_to_split_element -- after adjusting ({} replacement children): \n{}", mathml_replacement.len(), mml_to_string(mathml)); | |
1528 | 29 | return replace_children(mathml, mathml_replacement); |
1529 | 31.0k | } |
1530 | ||
1531 | /// makes sure the structure is correct and also eliminates <none/> pairs | |
1532 | /// MathML core changed <none/> to <mrow/>. For now (since MathCAT has lots of "none" tests), <mrow/> => <mtext> => <none/> | |
1533 | /// (used https://chem.libretexts.org/Courses/Saint_Francis_University/CHEM_113%3A_Human_Chemistry_I_(Muino)/13%3A_Nuclear_Chemistry12/13.04%3A_Nuclear_Decay) | |
1534 | /// | |
1535 | /// This does some dubious repairs when the structure is bad, but not sure what else to do | |
1536 | 185 | fn clean_mmultiscripts(mathml: Element) -> Option<Element> { |
1537 | 185 | let mut mathml = mathml; |
1538 | 185 | let children = mathml.children(); |
1539 | 185 | let n = children.len(); |
1540 | 185 | let i_mprescripts = |
1541 | 185 | if let Some(( |
1542 | 659 | . |
1543 | 185 | let has_misplaced_mprescripts = i_mprescripts & 1 == 0; // should be first, third, ... child |
1544 | 185 | let mut has_proper_number_of_children = if i_mprescripts == n { |
1545 | 185 | if has_misplaced_mprescripts || !has_proper_number_of_children || |
1546 | // need to reset the children | |
1547 | 185 | let mut new_children = Vec::with_capacity(n+2); // adjusting position of mprescripts might add two children |
1548 | 185 | new_children.push(children[0]); |
1549 | // drop none, none script pairs | |
1550 | 185 | let mut i = 1; |
1551 | 604 | while i < n { |
1552 | 419 | let child = as_element(children[i]); |
1553 | 419 | let child_name = name(child); |
1554 | 419 | if child_name == "mprescripts" { |
1555 | 108 | if has_misplaced_mprescripts { |
1556 | 0 | let mtext = CanonicalizeContext::create_empty_element(&mathml.document()); |
1557 | 0 | new_children.push(ChildOfElement::Element(mtext)); |
1558 | 0 | has_proper_number_of_children = !has_proper_number_of_children; |
1559 | 108 | } |
1560 | 108 | new_children.push(children[i]); |
1561 | 108 | i += 1; |
1562 | 311 | } else if i+1 < n && child_name == "none" && |
1563 | 2 | i += 2; // found none, none pair |
1564 | 309 | } else { |
1565 | 309 | // copy pair |
1566 | 309 | new_children.push(children[i]); |
1567 | 309 | new_children.push(children[i+1]); |
1568 | 309 | i += 2; |
1569 | 309 | } |
1570 | } | |
1571 | 185 | if new_children.len() <= 2 { // base only, or base and </mprescripts> |
1572 | 1 | mathml = as_element(new_children[0]); |
1573 | 184 | } else { |
1574 | 184 | mathml.replace_children(new_children); |
1575 | 184 | } |
1576 | 0 | } |
1577 | ||
1578 | 185 | return Some(mathml); |
1579 | ||
1580 | 0 | fn has_none_none_script_pair(children: &[ChildOfElement]) -> bool { |
1581 | 0 | let mut i = 1; |
1582 | 0 | let n = children.len(); |
1583 | 0 | while i < n { |
1584 | 0 | let child = as_element(children[i]); |
1585 | 0 | let child_name = name(child); |
1586 | 0 | if child_name == "mprescripts" { |
1587 | 0 | i += 1; |
1588 | 0 | } else if i+1 < n && child_name == "none" && name(as_element(children[i+1])) == "none" { |
1589 | 0 | return true; // found none, none pair |
1590 | 0 | } else { |
1591 | 0 | i += 2; |
1592 | 0 | } |
1593 | } | |
1594 | 0 | return false; |
1595 | 0 | } |
1596 | 185 | } |
1597 | ||
1598 | /// converts element if there is an empty subscript or superscript | |
1599 | 180 | fn clean_msubsup(mathml: Element) -> Element { |
1600 | 180 | let children = mathml.children(); |
1601 | 180 | let subscript = as_element(children[1]); |
1602 | 180 | let has_subscript = !(name(subscript) == "mtext" && |
1603 | 180 | let superscript = as_element(children[2]); |
1604 | 180 | let has_superscript = !(name(superscript) == "mtext" && |
1605 | 180 | if has_subscript && |
1606 | 171 | return mathml; |
1607 | 9 | } else if has_subscript { |
1608 | 6 | set_mathml_name(mathml, "msub"); |
1609 | 6 | let children = vec!(children[0], children[1]); |
1610 | 6 | mathml.replace_children(children); |
1611 | 6 | return mathml; |
1612 | 3 | } else if has_superscript { |
1613 | 3 | set_mathml_name(mathml, "msup"); |
1614 | 3 | let children = vec!(children[0], children[2]); |
1615 | 3 | mathml.replace_children(children); |
1616 | 3 | return mathml; |
1617 | } else { | |
1618 | 0 | return as_element(children[0]); // no scripts |
1619 | } | |
1620 | 180 | } |
1621 | ||
1622 | /// Split off the currency symbol from the rest of the text and return an mrow with the result | |
1623 | /// Assumes it has already checked and that we have a leaf | |
1624 | 12 | fn split_currency_symbol(leaf: Element) -> Option<Element> { |
1625 | 12 | assert!(is_leaf(leaf)); |
1626 | 12 | let text = as_text(leaf); |
1627 | 12 | assert!(contains_currency(text)); |
1628 | 12 | let mut iter = text.chars(); |
1629 | 12 | match (iter.next(), iter.next()) { |
1630 | 0 | (None, _) => return None, |
1631 | (Some(_), None) => { // 1 char | |
1632 | 9 | leaf.set_name("mi"); |
1633 | 9 | return Some(leaf); } |
1634 | (Some(_), Some(_)) => { // 2 or more chars | |
1635 | // WARNING: don't use 'leaf' in the mrow -- that detaches it from its parent and could shrink the number of children causing problems | |
1636 | 4 | if |
1637 | 3 | leaf.set_name("mn"); // make sure we create an mn (might be one already) |
1638 | 3 |
|
1639 | 3 | let first_ch = text.char_indices().next().map(|(i, ch)| &text[i..i + ch.len_utf8()]).unwrap(); |
1640 | 3 | if is_currency_symbol(first_ch.chars().next().unwrap()) { |
1641 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1642 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1643 | 1 | let currency_symbol = create_mathml_element(&leaf.document(), "mi"); |
1644 | 1 | currency_symbol.set_text(first_ch); |
1645 | 1 | mrow.append_child(currency_symbol); |
1646 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1647 | 1 | mrow.append_child(implied_times); |
1648 | 1 | let currency_amount = create_mathml_element(&leaf.document(), name(leaf)); |
1649 | 1 | currency_amount.set_text(&text[first_ch.len()..]); |
1650 | 1 | mrow.append_child(currency_amount); |
1651 | 1 | return Some(mrow); |
1652 | 2 | } |
1653 | 2 | let last_ch = text.char_indices().last().map(|(i, _)| &text[i..]).unwrap(); |
1654 | 2 | if is_currency_symbol(last_ch.chars().next().unwrap()) { |
1655 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1656 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1657 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1658 | 1 | mrow.append_child(implied_times); |
1659 | 1 | let currency_amount = create_mathml_element(&leaf.document(), name(leaf)); |
1660 | 1 | currency_amount.set_text(&text[..text.len()-last_ch.len()]); |
1661 | 1 | mrow.append_child(currency_amount); |
1662 | 1 | let currency_symbol = create_mathml_element(&leaf.document(), "mi"); |
1663 | 1 | currency_symbol.set_text(last_ch); |
1664 | 1 | mrow.append_child(currency_symbol); |
1665 | 1 | return Some(mrow); |
1666 | 1 | } |
1667 | // try to find it in the middle | |
1668 | 2 | for (byte_idx, ch) in |
1669 | 2 | if contains_currency(&text[byte_idx .. byte_idx + ch.len_utf8()]) { |
1670 | // get all the substrings | |
1671 | 1 | let first_part = &text[..byte_idx]; |
1672 | 1 | let currency_symbol = &text[byte_idx .. byte_idx + ch.len_utf8()]; |
1673 | 1 | let second_part = &text[byte_idx + ch.len_utf8() ..]; |
1674 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1675 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1676 | 1 | let first_part_element = create_mathml_element(&leaf.document(), name(leaf)); |
1677 | 1 | first_part_element.set_text(first_part); |
1678 | 1 | mrow.append_child(first_part_element); |
1679 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1680 | 1 | mrow.append_child(implied_times); |
1681 | 1 | let currency_symbol_element = create_mathml_element(&leaf.document(), "mi"); |
1682 | 1 | currency_symbol_element.set_text(currency_symbol); |
1683 | 1 | mrow.append_child(currency_symbol_element); |
1684 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1685 | 1 | mrow.append_child(implied_times); |
1686 | 1 | let second_part_element = create_mathml_element(&leaf.document(), name(leaf)); |
1687 | 1 | second_part_element.set_text(second_part); |
1688 | 1 | mrow.append_child(second_part_element); |
1689 | 1 | return Some(mrow); |
1690 | 1 | } |
1691 | } | |
1692 | 0 | return None |
1693 | } | |
1694 | } | |
1695 | 12 | } |
1696 | ||
1697 | /// If arg is "arc" (with optional space), merge the following element in if a trig function (sibling is deleted) | |
1698 | 11.8k | fn merge_arc_trig(leaf: Element) -> Option<Element> { |
1699 | 11.8k | assert!(is_leaf(leaf)); |
1700 | 11.8k | let leaf_text = as_text(leaf); |
1701 | 11.8k | if !(leaf_text == "arc" || |
1702 | 11.8k | return None; |
1703 | 2 | } |
1704 | ||
1705 | 2 | let following_siblings = leaf.following_siblings(); |
1706 | 2 | if following_siblings.is_empty() { |
1707 | 0 | return None; |
1708 | 2 | } |
1709 | ||
1710 | 2 | let following_sibling = as_element(following_siblings[0]); |
1711 | 2 | let following_sibling_name = name(following_sibling); |
1712 | 2 | if !(following_sibling_name == "mi" || |
1713 | 0 | return None; |
1714 | 2 | } |
1715 | ||
1716 | 2 | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1717 | // change "arc" "cos" to "arccos" -- we look forward because calling loop stores previous node | |
1718 | 2 | let following_text = as_text(following_sibling); |
1719 | 2 | if definitions.borrow().get_hashset("TrigFunctionNames").unwrap().contains(following_text) { |
1720 | 2 | let new_text = "arc".to_string() + following_text; |
1721 | 2 | set_mathml_name(leaf, "mi"); |
1722 | 2 | leaf.set_text(&new_text); |
1723 | 2 | following_sibling.remove_from_parent(); |
1724 | 2 | return Some(leaf); |
1725 | 0 | } |
1726 | 0 | return None; |
1727 | 2 | }) |
1728 | 11.8k | } |
1729 | ||
1730 | /// Convert "||" to "‖", if in single element or in repeated 'mo's (but not "|x||y|" or "{x ||x|>0}") | |
1731 | 305 | fn merge_vertical_bars(leaf: Element) -> Option<Element> { |
1732 | 305 | assert!(is_leaf(leaf)); |
1733 | 305 | let leaf_text = as_text(leaf); |
1734 | 305 | if leaf_text == "||" { |
1735 | 4 | leaf.set_text("‖"); // U+2016 |
1736 | 4 | return Some(leaf); |
1737 | 301 | } else if leaf_text != "|" { |
1738 | 0 | return None; |
1739 | 301 | } |
1740 | 301 | let following_siblings = leaf.following_siblings(); |
1741 | 301 | if following_siblings.is_empty() { |
1742 | 96 | return None; |
1743 | 205 | } |
1744 | ||
1745 | 205 | let following_sibling = as_element(following_siblings[0]); |
1746 | 205 | if name(following_sibling) != "mo" || |
1747 | 201 | return None |
1748 | 4 | } |
1749 | ||
1750 | // have "||" -- if there a single "|" on left, rule out merge | |
1751 | 4 | let preceding_siblings = leaf.preceding_siblings(); |
1752 | 5 | if |
1753 | 5 | let child = as_element(child); |
1754 | 5 | return name(child) == "mo" && |
1755 | 5 | }) { |
1756 | 1 | return None; // found "|" on left |
1757 | 3 | } |
1758 | ||
1759 | 3 | if following_siblings.len() > 1 { |
1760 | 2 | let following_siblings = &following_siblings[1..]; |
1761 | // if there are an odd number of "|"s to the right, rule out the merge | |
1762 | 8 | if !( |
1763 | 8 | let child = as_element(child); |
1764 | 8 | return name(child) == "mo" && |
1765 | 8 | }). |
1766 | 1 | return None; |
1767 | 1 | } |
1768 | 1 | } |
1769 | ||
1770 | // didn't find any | |
1771 | 2 | leaf.set_text("‖"); // U+2016 |
1772 | 2 | following_sibling.remove_from_parent(); |
1773 | 2 | return Some(leaf); |
1774 | 305 | } |
1775 | ||
1776 | /// merge a following mstyle that has the same attrs | |
1777 | 714 | fn merge_adjacent_similar_mstyles(mathml: Element) { |
1778 | 714 | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(name(get_parent(mathml))) { |
1779 | // FIX: look to see if all of the children (might be more than just the adjacent one) have the same attr and then pull them up to the parent | |
1780 | 65 | return; // can't remove subsequent child |
1781 | 649 | } |
1782 | 649 | let following_siblings = mathml.following_siblings(); |
1783 | 649 | if following_siblings.is_empty() { |
1784 | 579 | return; |
1785 | 70 | } |
1786 | 70 | let following_element = as_element(following_siblings[0]); |
1787 | 70 | if name(following_element) != "mstyle" { |
1788 | 66 | return; |
1789 | 4 | } |
1790 | 4 | let are_same = mathml.attributes().iter() |
1791 | 4 | .zip( following_element.attributes() ) |
1792 | 5 | . |
1793 | 4 | if are_same { |
1794 | 4 | mathml.append_children(following_element.children()); |
1795 | 4 | following_element.remove_from_parent(); |
1796 | 4 |
|
1797 | 714 | } |
1798 | ||
1799 | 40 | fn convert_mfenced_to_mrow(mfenced: Element) -> Element { |
1800 | // The '<'/'>' replacements are because WIRIS uses them out instead of the correct chars in its template | |
1801 | 40 | let open = mfenced.attribute_value("open").unwrap_or("(").replace('<', "⟨"); |
1802 | 40 | let close = mfenced.attribute_value("close").unwrap_or(")").replace('>', "⟩"); |
1803 | // debug!("open={}, close={}", open, close); | |
1804 | 40 | let mut separators= mfenced.attribute_value("separators").unwrap_or(",").chars(); |
1805 | 40 | set_mathml_name(mfenced, "mrow"); |
1806 | 40 | mfenced.remove_attribute("open"); |
1807 | 40 | mfenced.remove_attribute("close"); |
1808 | 40 | mfenced.remove_attribute("separators"); |
1809 | 40 | let children = mfenced.children(); |
1810 | 40 | let mut new_children = Vec::with_capacity(2*children.len() + 1); |
1811 | 40 | if !open.is_empty() { |
1812 | 40 | new_children.push(ChildOfElement::Element( create_mo(mfenced.document(), &open, MFENCED_ATTR_VALUE)) ); |
1813 | 40 |
|
1814 | 40 | if !children.is_empty() { |
1815 | 40 | new_children.push(children[0]); |
1816 | 40 | for |
1817 | 3 | let sep = separators.next().unwrap_or(',').to_string(); |
1818 | 3 | new_children.push( ChildOfElement::Element( create_mo(mfenced.document(), &sep, MFENCED_ATTR_VALUE)) ); |
1819 | 3 | new_children.push(*child); |
1820 | 3 | } |
1821 | 0 | } |
1822 | 40 | if !close.is_empty() { |
1823 | 38 | new_children.push(ChildOfElement::Element( create_mo(mfenced.document(), &close, MFENCED_ATTR_VALUE)) ); |
1824 | 38 |
|
1825 | 40 | mfenced.replace_children(new_children); |
1826 | 40 | return mfenced; |
1827 | 40 | } |
1828 | ||
1829 | 30.4k | fn is_roman_number_match(text: &str) -> bool { |
1830 | 30.4k | return UPPER_ROMAN_NUMERAL.is_match(text) || |
1831 | 30.4k | } |
1832 | ||
1833 | /// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and | |
1834 | /// if its length is < 3 chars, then there is another roman numeral near it (separated by an operator). | |
1835 | /// We want to rule out something like 'm' or 'cm' being a roman numeral. | |
1836 | /// Note: this function assumes 'mathml' is a Roman Numeral, and optimizes operations based on that. | |
1837 | /// Note: Nemeth has some rules about roman numerals (capitalization and punctuation after) | |
1838 | 3.35k | fn is_roman_numeral_number_context(mathml: Element) -> bool { |
1839 | 3.35k | assert!(name(mathml)=="mtext" || |
1840 | 3.35k | let mut parent = mathml; |
1841 | loop { | |
1842 | 5.41k | parent = get_parent(parent); |
1843 | 5.41k | let current_name = name(parent); |
1844 | 5.41k | if current_name == "math" { |
1845 | 1.57k | break; |
1846 | 3.84k | } else if current_name == "msup" || |
1847 | // could be a oxidation state in a Chemical formula | |
1848 | 559 | let children = parent.children(); |
1849 | // make sure that there is only one script and that 'mathml' is a superscript | |
1850 | 559 | if current_name == "mmultiscripts" && ( |
1851 | 122 | return false; |
1852 | 437 | } |
1853 | 437 | let base = as_element(children[0]); |
1854 | 437 | if is_chemical_element(base) { |
1855 | 21 | break; |
1856 | } else { | |
1857 | 416 | return false; |
1858 | } | |
1859 | 3.28k | } else if current_name != "mrow" { |
1860 | 1.22k | return false; |
1861 | 2.06k | } |
1862 | } | |
1863 | ||
1864 | 1.59k | let text = as_text(mathml).as_bytes(); // note: we know it is all ASCII chars |
1865 | // if roman numeral is in superscript and we get here, then it had a chemical element base, so we accept it | |
1866 | // note: you never has a state = I; if two letters, it must be 'II'. | |
1867 | 1.59k | if text.len() > 2 || |
1868 | 1.57k | ((name(parent) =="msup" || |
1869 | 28 | return true; |
1870 | } else { | |
1871 | 1.56k | let is_upper_case = text[0].is_ascii_uppercase(); // safe since we know it is a roman numeral |
1872 | 1.56k | let preceding = mathml.preceding_siblings(); |
1873 | 1.56k | let following = mathml.following_siblings(); |
1874 | 1.56k | if preceding.is_empty() && |
1875 | 81 | return false; // no context and too short to confirm it is a roman numeral |
1876 | 1.48k | } |
1877 | 1.48k | if preceding.is_empty() { |
1878 | 275 | return is_roman_numeral_adjacent(following.iter(), is_upper_case); |
1879 | 1.21k | } |
1880 | 1.21k | if following.is_empty() { |
1881 | 399 | return is_roman_numeral_adjacent(preceding.iter().rev(), is_upper_case); |
1882 | 813 | } |
1883 | 813 | return is_roman_numeral_adjacent(preceding.iter().rev(), is_upper_case) && |
1884 | 3 | is_roman_numeral_adjacent(following.iter(), is_upper_case); |
1885 | } | |
1886 | ||
1887 | /// make sure all the non-mo leaf siblings are roman numerals | |
1888 | /// 'mo' should only be '+', '-', '=', ',', '.' -- unlikely someone is doing anything sophisticated | |
1889 | 1.49k | fn is_roman_numeral_adjacent<'a, I>(siblings: I, must_be_upper_case: bool) -> bool |
1890 | 1.49k | where I: Iterator<Item = &'a ChildOfElement<'a>> { |
1891 | static ROMAN_NUMERAL_OPERATORS: phf::Set<&str> = phf_set! { | |
1892 | "+", "-'", "=", "<", "≤", ">", "≥", | |
1893 | // ",", ".", // [c,d] triggers this if "," is present, so omitting it | |
1894 | }; | |
1895 | 1.49k | let mut found_match = false; // guard against no siblings |
1896 | 1.49k | let mut last_was_roman_numeral = true; // started at roman numeral |
1897 | // debug!("start is_roman_numeral_adjacent"); | |
1898 | 1.74k | for child in |
1899 | 1.74k | let maybe_roman_numeral = as_element(*child); |
1900 | // debug!("maybe_roman_numeral: {}", mml_to_string(maybe_roman_numeral)); | |
1901 | 1.74k | match name(maybe_roman_numeral) { |
1902 | 1.74k | "mo" => { |
1903 | 858 | if !last_was_roman_numeral { |
1904 | 18 | return false; |
1905 | 840 | } |
1906 | 840 | let text = as_text(maybe_roman_numeral); |
1907 | 840 | if !ROMAN_NUMERAL_OPERATORS.contains(text) { |
1908 | 660 | return false; |
1909 | 180 | } |
1910 | 180 | last_was_roman_numeral = false; |
1911 | }, | |
1912 | 889 | "mi" | |
1913 | 562 | if last_was_roman_numeral { |
1914 | 429 | return false; // no implicit multiplication (or whatever) |
1915 | 133 | } |
1916 | 133 | let text = as_text(maybe_roman_numeral); |
1917 | 133 | if !(( must_be_upper_case && |
1918 | 117 | (!must_be_upper_case && |
1919 | 109 | return false; |
1920 | 24 | }; |
1921 | 24 | found_match = true; |
1922 | 24 | last_was_roman_numeral = true; |
1923 | }, | |
1924 | 327 | "mtext" | |
1925 | _ => { | |
1926 | 252 | return false; |
1927 | } | |
1928 | } | |
1929 | } | |
1930 | 22 | return found_match; |
1931 | 1.49k | } |
1932 | 3.35k | } |
1933 | ||
1934 | /// Merge adjacent mtext by increasing the width of the first mtext | |
1935 | /// The resulting merged whitespace is put on the previous child, or if there is one, on the following child | |
1936 | /// | |
1937 | /// Note: this should be called *after* the mo/mtext cleanup (i.e., after the MathML child cleanup loop). | |
1938 | 10.1k | fn merge_whitespace(children: &mut Vec<ChildOfElement>) { |
1939 | 10.1k | if children.is_empty() { |
1940 | 3 | return; |
1941 | 10.1k | } |
1942 | ||
1943 | 10.1k | let mut i = 0; |
1944 | 10.1k | let mut previous_mtext_with_width: Option<Element<'_>> = None; // prefer to spacing on previous mtext |
1945 | 10.1k | let mut whitespace: Option<f64> = None; |
1946 | 42.0k | while i < children.len() { |
1947 | 31.8k | let child = as_element(children[i]); |
1948 | 31.8k | let is_child_whitespace = name(child) == "mtext" && |
1949 | // debug!("merge_whitespace: i={}, whitespace={:?}, mtext set={} {}", | |
1950 | // i, whitespace, previous_mtext_with_width.is_some(), mml_to_string(child)); | |
1951 | 31.8k | if is_child_whitespace { |
1952 | // update the running total of whitespace | |
1953 | 340 | let child_width = child.attribute_value("data-width").unwrap_or("0") |
1954 | 340 | .parse::<f64>().unwrap_or(0.0) ; |
1955 | 340 | whitespace = match whitespace { |
1956 | 327 | None => Some(child_width), |
1957 | 13 | Some(w) => Some(w + child_width), |
1958 | }; | |
1959 | 340 | if children.len() == 1 { |
1960 | 15 | i += 1; // don't remove only child |
1961 | 325 | } else { |
1962 | 325 | children.remove(i); // remove the current child (don't inc 'i') |
1963 | 325 | } |
1964 | 31.5k | } else if let Some( |
1965 | // done with sequence of whitespaces | |
1966 | 305 | if let Some( |
1967 | 13 | // prefer to set on previous mtext |
1968 | 13 | prev_mtext.set_attribute_value("data-following-space-width", (ws).to_string().as_str()); |
1969 | 13 | previous_mtext_with_width = None; |
1970 | 13 | } else { |
1971 | // if the space is significant, set it on the current child | |
1972 | 292 | child.set_attribute_value("data-previous-space-width", ws.to_string().as_str()); |
1973 | 292 | if name(child) == "mtext" { |
1974 | 18 | previous_mtext_with_width = Some(child); |
1975 | 274 | } |
1976 | } | |
1977 | 305 | whitespace = None; |
1978 | 305 | i += 1; |
1979 | 31.2k | } else { |
1980 | 31.2k | i += 1; |
1981 | 31.2k | previous_mtext_with_width = None; |
1982 | 31.2k | } |
1983 | } | |
1984 | // debug!(" after loop: whitespace={:?}, {}", whitespace, mml_to_string(as_element(children[children.len()-1]))); | |
1985 | 10.1k | if let Some( |
1986 | // last child in mrow is white space -- mark with space *after* | |
1987 | 22 | if children.len() == 1 { |
1988 | // only child -- check to see if we need to set the space-width | |
1989 | 21 | let child = as_element(children[0]); |
1990 | 21 | let child_width = child.attribute_value("data-width").unwrap_or("0").parse::<f64>().unwrap_or(0.0); |
1991 | 21 | if (child_width - ws).abs() > 0.001 { |
1992 | 9 | ws += child_width; |
1993 | 9 | child.set_attribute_value("data-following-space-width", ws.to_string().as_str()); |
1994 | 12 | } |
1995 | 1 | } else { |
1996 | 1 | let non_space_child = as_element(children[children.len()-1]); |
1997 | 1 | non_space_child.set_attribute_value("data-following-space-width", ws.to_string().as_str()); |
1998 | 1 | } |
1999 | 10.1k | } |
2000 | 10.1k | } |
2001 | ||
2002 | /// look for potential numbers by looking for sequences with commas, spaces, and decimal points | |
2003 | 10.1k | fn merge_number_blocks(context: &CanonicalizeContext, parent_mrow: Element, children: &mut Vec<ChildOfElement>) { |
2004 | // debug!("parent:\n{}", mml_to_string(parent_mrow)); | |
2005 | // If we find a comma that is not part of a number, don't form a number | |
2006 | // (see https://github.com/NSoiffer/MathCAT/issues/271) | |
2007 | // Unfortunately, we can't do this in the loop below because we might discover the "not part of a number" after a number has been formed | |
2008 | 10.1k | let do_not_merge_comma = is_comma_not_part_of_a_number(children); |
2009 | 10.1k | let mut i = 0; |
2010 | 38.2k | while i < children.len() { // length might change after a merge |
2011 | // { | |
2012 | // debug!("merge_number_blocks: top of loop"); | |
2013 | // for (i_child, &child) in children[i..].iter().enumerate() { | |
2014 | // let child = as_element(child); | |
2015 | // debug!("child #{}: {}", i+i_child, mml_to_string(child)); | |
2016 | // } | |
2017 | // } | |
2018 | 28.0k | let child = as_element(children[i]); |
2019 | 28.0k | let child_name = name(child); |
2020 | ||
2021 | // numbers start with an mn or a decimal separator | |
2022 | 28.0k | if child_name == "mn" || |
2023 | 6.09k | let leaf_child_text = as_text(child); |
2024 | // if Roman numeral, don't merge (move on) | |
2025 | // or if the 'mn' has ',', '.', or space, consider it correctly parsed and move on | |
2026 | 6.09k | if is_roman_number_match(leaf_child_text) || |
2027 | 5.75k | context.patterns.block_separator.is_match(leaf_child_text) || |
2028 | 5.64k | (leaf_child_text.len() > 1 && |
2029 | 559 | i += 1; |
2030 | 559 | continue; |
2031 | 5.53k | } |
2032 | 21.9k | } else if child_name != "mo" || |
2033 | 9.20k | (do_not_merge_comma && |
2034 | 6.44k | !context.patterns.decimal_separator.is_match(as_text(child)) { |
2035 | 21.9k | i += 1; |
2036 | 21.9k | continue; |
2037 | 31 | } |
2038 | ||
2039 | // potential start of a number | |
2040 | 5.56k | let mut end = i + 1; |
2041 | 5.56k | let mut has_decimal_separator = false; |
2042 | 5.56k | let mut not_a_number = false; |
2043 | 5.56k | if i < children.len() { |
2044 | // look at the right siblings and pull in the longest sequence of number/separators -- then check it for validity | |
2045 | 5.56k | for |
2046 | 4.00k | let sibling = as_element(*sibling); |
2047 | 4.00k | let sibling_name = name(sibling); |
2048 | 4.00k | if sibling_name == "mn" { |
2049 | 245 | let leaf_text = as_text(sibling); |
2050 | 245 | let is_block_separator = context.patterns.block_separator.is_match(leaf_text); |
2051 | 245 | let is_decimal_separator = context.patterns.decimal_separator.is_match(leaf_text); |
2052 | 245 | if is_roman_number_match(leaf_text) || is_block_separator || is_decimal_separator { |
2053 | // consider this mn correctly parsed | |
2054 | 1 | break; |
2055 | 244 | } |
2056 | 3.75k | } else if sibling_name=="mo" || |
2057 | 1.33k | let leaf_text = as_text(sibling); |
2058 | 1.33k | let is_block_separator = context.patterns.block_separator.is_match(leaf_text); |
2059 | 1.33k | let is_decimal_separator = context.patterns.decimal_separator.is_match(leaf_text); |
2060 | 1.33k | if (leaf_text == "," && |
2061 | 1.14k | !(is_block_separator || |
2062 | 261 | (is_decimal_separator && |
2063 | // not a separator or (it is decimal separator and we've already seen a decimal separator) | |
2064 | 1.09k | not_a_number = is_decimal_separator && |
2065 | 1.09k | break; |
2066 | 244 | } |
2067 | 244 | has_decimal_separator |= is_decimal_separator; |
2068 | } else { | |
2069 | // not mn, mo, or mtext -- end of a number | |
2070 | 2.41k | break; |
2071 | } | |
2072 | 488 | end += 1; // increment at end so we can tell the difference between a 'break' and end of loop |
2073 | } | |
2074 | 0 | } |
2075 | 5.56k | if not_a_number { |
2076 | 17 | i = end + 1; |
2077 | 17 | continue; // continue looking in the rest of the mrow |
2078 | 5.55k | } |
2079 | 5.55k | if ignore_final_punctuation(context, parent_mrow, &children[i..end]) { |
2080 | 18 | end -= 1; |
2081 | 5.53k | }; |
2082 | // debug!("start={}, end={}", i, end); | |
2083 | // no need to merge if only one child (also avoids "." being considered a number) | |
2084 | 5.55k | if end > i + 1 && |
2085 | 107 | (i, end) = trim_whitespace(children, i, end); |
2086 | 107 | merge_block(children, i, end); |
2087 | 107 | // note: start..end has been collapsed, so restart after the collapsed part |
2088 | 5.44k | } else { |
2089 | 5.44k | i = end; // start looking at the end of the block we just rejected |
2090 | 5.44k | } |
2091 | 5.55k | i += 1; |
2092 | } | |
2093 | 10.1k | } |
2094 | ||
2095 | /// Return true if we find a comma that doesn't have an <mn> on both sides | |
2096 | 10.1k | fn is_comma_not_part_of_a_number(children: &[ChildOfElement])-> bool { |
2097 | 10.1k | let n_children = children.len(); |
2098 | 10.1k | if n_children == 0 { |
2099 | 3 | return false; |
2100 | 10.1k | } |
2101 | 10.1k | let mut previous_child = as_element(children[0]); |
2102 | 14.5k | for i in |
2103 | 14.5k | let child = as_element(children[i]); |
2104 | 14.5k | if name(child) == "mo" && |
2105 | 972 | (name(previous_child) != "mn" || |
2106 | 809 | return true; |
2107 | 13.7k | } |
2108 | 13.7k | previous_child = child; |
2109 | } | |
2110 | 9.37k | return false; |
2111 | 10.1k | } |
2112 | ||
2113 | /// If we have something like 'shape' ABC, we split the ABC and add IMPLIED_SEPARATOR_HIGH_PRIORITY between them | |
2114 | /// under some specific conditions (trying to be a little cautious). | |
2115 | /// The returned (mrow) element reuses the arg so tree siblings links remain correct. | |
2116 | 11.8k | fn split_points(leaf: Element) -> Option<Element> { |
2117 | 3 | static IS_UPPERCASE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z]+$").unwrap()); |
2118 | ||
2119 | 11.8k | if !IS_UPPERCASE.is_match(as_text(leaf)) { |
2120 | 9.88k | return None; |
2121 | 1.96k | } |
2122 | ||
2123 | // check to see if there is a bar, arrow, etc over the letters (line-segment, arc, ...) | |
2124 | 1.96k | let parent = get_parent(leaf); |
2125 | 1.96k | if name(parent) == "mover" { |
2126 | // look for likely overscripts (basically just rule out some definite 'no's) | |
2127 | 29 | let over = as_element(parent.children()[1]); |
2128 | 29 | if is_leaf(over) { |
2129 | 29 | let mut over_chars = as_text(over).chars(); |
2130 | 29 | let first_char = over_chars.next(); |
2131 | 29 | if first_char.is_some() && over_chars.next().is_none() && !first_char.unwrap().is_alphanumeric(){ |
2132 | // only one char and it isn't alphanumeric | |
2133 | 29 | return Some( split_element(leaf) ); |
2134 | 0 | } |
2135 | 0 | } |
2136 | 1.93k | } |
2137 | ||
2138 | // check to see if it is preceded by a geometric shape (e.g, ∠ABC) | |
2139 | 1.93k | let preceding_siblings = leaf.preceding_siblings(); |
2140 | 1.93k | if !preceding_siblings.is_empty() { |
2141 | 1.11k | let preceding_sibling = as_element(preceding_siblings[preceding_siblings.len()-1]); |
2142 | 1.11k | let preceding_sibling_name = name(preceding_sibling); |
2143 | 1.11k | if preceding_sibling_name == "mi" || |
2144 | 711 | let preceding_text = as_text(preceding_sibling); |
2145 | 711 | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
2146 | 711 | let defs = definitions.borrow(); |
2147 | 711 | let prefix_ops = defs.get_hashset("GeometryPrefixOperators").unwrap(); |
2148 | 711 | let shapes = defs.get_hashset("GeometryShapes").unwrap(); |
2149 | 711 | if prefix_ops.contains(preceding_text) || |
2150 | // split leaf | |
2151 | 9 | return Some( split_element(leaf) ); // always treated as function names |
2152 | } else { | |
2153 | 702 | return None; |
2154 | } | |
2155 | 711 | }) |
2156 | 407 | } |
2157 | 817 | } |
2158 | 1.22k | return None; |
2159 | ||
2160 | 38 | fn split_element(leaf: Element) -> Element { |
2161 | 38 | let mut children = Vec::with_capacity(leaf.children().len()); |
2162 | 51 | for ch in |
2163 | 51 | let new_leaf = create_mathml_element(&leaf.document(), "mi"); |
2164 | 51 | new_leaf.set_text(&ch.to_string()); |
2165 | 51 | children.push(new_leaf); |
2166 | 51 | } |
2167 | 38 | set_mathml_name(leaf, "mrow"); |
2168 | 38 | leaf.replace_children(children); |
2169 | 38 | return leaf; |
2170 | 38 | } |
2171 | 11.8k | } |
2172 | ||
2173 | /// If we have something like 'V e l o c i t y', merge that into a single <mi> | |
2174 | /// We only do this for sequences of at least three chars, and also exclude things like consecutive letter (e.g., 'x y z') | |
2175 | /// The returned (mi) element reuses 'mi' | |
2176 | 11.4k | fn merge_mi_sequence(mi: Element) -> Option<Element> { |
2177 | // The best solution would be to use a dictionary of words, or maybe restricted to words in a formula, | |
2178 | // but that would likely miss the words used in slope=run/rise. | |
2179 | // It would also be really expensive since we would need a dictionary for each language. | |
2180 | // We shouldn't need to worry about trig names like "cos", but people sometimes forget to use "\cos" | |
2181 | // Hence, we check against the "FunctionNames" that get read on startup. | |
2182 | 70 | fn is_vowel(ch: char) -> bool { |
2183 | 70 |
|
2184 | 'a' | 'e' | 'i' | 'o' | 'u' | 'y' | | |
2185 | 'à' | 'á' | 'â' | 'ã' | 'ä' | 'è' | 'é' | 'ê' | 'ë' | 'ì' | 'í' | 'î' | 'ï' | | |
2186 | 'ò' | 'ó' | 'ô' | 'õ' | 'ö' | 'ú' | 'Ù' | 'û' | 'ü' | 'ý' | 'ÿ' | | |
2187 | 'ả' | 'ạ' | 'ă' | 'ằ' | 'ẳ' | 'ẵ' | 'ắ' | 'ặ' | 'ầ' | 'ẩ' | 'ẫ' | 'ấ' | 'ậ' | 'ẻ' | 'ẽ' | 'ẹ' | 'ề' | 'ể' | 'ễ' | 'ế' | 'ệ' | | |
2188 | 'ỉ' | 'ĩ' | 'ị' | 'ỏ' | 'ọ' | 'ồ' | 'ổ' | 'ỗ' | 'ố' | 'ộ' | 'ơ' | 'ờ' | 'ở' | 'ỡ' | 'ớ' | 'ợ' | | |
2189 | 'ủ' | 'ũ' | 'ụ' | 'ư' | 'ừ' | 'ử' | 'ữ' | 'ứ' | 'ự' | 'ỳ' | 'ỷ' | 'ỹ' | 'ỵ' | |
2190 | ) | |
2191 | 70 | } |
2192 | 11.4k | let parent = get_parent(mi); // not canonicalized into mrows, so parent could be "math" |
2193 | 11.4k | let parent_name = name(parent); |
2194 | // don't merge if more than one char, or if not in an mrow (or implied on since we haven't normalized yet) | |
2195 | 11.4k | if as_text(mi).chars().nth(1).is_some() || !( |
2196 | 5.16k | return None; |
2197 | 6.25k | } |
2198 | 6.25k | let mut text = as_text(mi).to_string(); |
2199 | 6.25k | let text_script = Script::from(text.chars().next().unwrap_or('a')); |
2200 | 6.25k | let following_siblings = mi.following_siblings(); |
2201 | 6.25k | let mut last_char_is_scripted = None; |
2202 | 6.25k | let mut following_mi_siblings: Vec<Element> = following_siblings.iter() |
2203 | 6.25k | .map_while(|&child| |
2204 | 4.15k | let mut child = as_element(child); |
2205 | 4.15k | let mut is_ok = false; |
2206 | 4.15k | if name(child) == "msub" || |
2207 | // check if the *last* char in the sequence is scripted | |
2208 | // if so, we need to stop here anyway and deal with it specially | |
2209 | 163 | last_char_is_scripted = Some(child); // need to remember the value -- cleared later if not ok |
2210 | 163 | child = as_element(child.children()[0]); |
2211 | 233 | while name(child) == "mrow" && |
2212 | 70 | // the base may be wrapped with mrows |
2213 | 70 | child = as_element(child.children()[0]); |
2214 | 70 | } |
2215 | 3.99k | } |
2216 | 4.15k | if name(child) == "mi" { |
2217 | 402 | let mut child_text = as_text(child).chars(); |
2218 | 402 | let first_char = child_text.next().unwrap_or('a'); |
2219 | 402 | if child_text.next().is_none() && |
2220 | 365 | text.push(first_char); |
2221 | 365 | is_ok = true; |
2222 | 365 |
|
2223 | 3.75k | } |
2224 | 4.15k | if last_char_is_scripted.is_some() { |
2225 | 163 | if is_ok { |
2226 | 114 | is_ok = false; // don't want to continue |
2227 | 114 | } else { |
2228 | 49 | last_char_is_scripted = None; // reset to None |
2229 | 49 | } |
2230 | 3.99k | } |
2231 | 4.15k | if is_ok { |
2232 | 4.15k | }) |
2233 | 6.25k | .collect(); |
2234 | 6.25k | if following_mi_siblings.is_empty() { |
2235 | 6.03k | return None; |
2236 | 224 | } |
2237 | ||
2238 | 224 | if let Some( |
2239 | 14 | // add the last char to the run |
2240 | 14 | following_mi_siblings.push(last); |
2241 | 210 | } |
2242 | // debug!("merge_mi_sequence: text={}", &text); | |
2243 | 224 | if let Some( |
2244 | 224 | let definitions = definitions.borrow(); |
2245 | 224 | let function_names = definitions.get_hashset("FunctionNames").unwrap(); |
2246 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
2247 | // function name might be (wrongly) set to italic math alphanumeric chars, including bold italic | |
2248 | 224 | if let Some( |
2249 | 221 | && function_names.contains(&ascii_text.to_lowercase()) { |
2250 | 10 | return Some(merge_from_text(mi, &ascii_text, &following_mi_siblings)); |
2251 | 214 | } |
2252 | 214 | if function_names.contains(&text) { |
2253 | 0 | return Some(merge_from_text(mi, &text, &following_mi_siblings)); |
2254 | 214 | } |
2255 | // unlike "FunctionNames", "KnownWords" might not exist | |
2256 | 214 | if let Some( |
2257 | 131 | && word_map.contains(&text) { |
2258 | 1 | return Some(merge_from_text(mi, &text, &following_mi_siblings)); |
2259 | 213 | } |
2260 | 213 | return None; |
2261 | 224 | }) { |
2262 | 11 | return answer; |
2263 | 213 | } |
2264 | ||
2265 | // don't be too aggressive combining mi's when they are short | |
2266 | 213 | if text.chars().count() < 3 { |
2267 | 186 | return None; |
2268 | 27 | } |
2269 | // If it is a word, it needs a vowel and it must be a letter | |
2270 | // FIX: this check needs to be internationalized to include accented vowels, other alphabets | |
2271 | 70 | if ! |
2272 | 15 | return None; |
2273 | 12 | } |
2274 | ||
2275 | // now for some heuristics to rule out a sequence of variables | |
2276 | // rule out sequences like 'abc' and also 'axy' that are in alphabetical order | |
2277 | 12 | let mut chars = text.chars(); |
2278 | 12 | let mut left = chars.next().unwrap(); // at least 3 chars |
2279 | 12 | let mut is_in_alphabetical_order = true; |
2280 | 23 | for ch in |
2281 | 23 | if (left as u32) >= (ch as u32) { |
2282 | 3 | is_in_alphabetical_order = false; |
2283 | 3 | break; // can't be 'abc', 'axy', etc |
2284 | 20 | } |
2285 | 20 | left = ch; |
2286 | } | |
2287 | 12 | if is_in_alphabetical_order || |
2288 | // If it is in alphabetical order, it's not likely a word | |
2289 | 12 | return None; |
2290 | 0 | } |
2291 | ||
2292 | // FIX: should add more heuristics to rule out words | |
2293 | 0 | return merge_from_text(mi, &text, &following_mi_siblings); |
2294 | ||
2295 | 11 | fn merge_from_text<'a>(mi: Element<'a>, text: &str, following_siblings: &[Element<'a>]) -> Option<Element<'a>> { |
2296 | // remove trailing mi's | |
2297 | 11 | let i_last_child = following_siblings.len()-1; |
2298 | 11 | let last_child = following_siblings[i_last_child]; |
2299 | 11 | if name(last_child) == "mi" { |
2300 | 10 |
|
2301 | 5 | mi.set_text(text); |
2302 | 5 | return Some(mi); |
2303 | } else { | |
2304 | // replace the base of the scripted element (the last child) with the run (e.g. 's i n^2' -> {sin}^2) | |
2305 | 6 | mi.remove_from_parent(); |
2306 | 6 | following_siblings[..i_last_child].iter().for_each(|sibling| sibling.remove_from_parent()); |
2307 | 6 | let mut base = as_element(last_child.children()[0]); |
2308 | 9 | while name(base) == "mrow" && |
2309 | 3 | // the base may be wrapped with mrows |
2310 | 3 | base = as_element(base.children()[0]); |
2311 | 3 | base.remove_attribute(SPLIT_TOKEN); |
2312 | 3 | } |
2313 | 6 | base.set_text(text); |
2314 | 6 | return Some(last_child); |
2315 | } | |
2316 | 11 | } |
2317 | 11.4k | } |
2318 | ||
2319 | // Check if start..end is a number | |
2320 | 275 | fn is_likely_a_number(context: &CanonicalizeContext, mrow: Element, children: &[ChildOfElement]) -> bool { |
2321 | // Note: the children of math_or_mrow aren't valid ('children' represents the current state) | |
2322 | 275 | let end = children.len(); |
2323 | // { | |
2324 | // let n_preceding_siblings = as_element(children[0]).preceding_siblings().len(); | |
2325 | // debug!("is_likely_a_number: start/end={}/{}", n_preceding_siblings, n_preceding_siblings+end); | |
2326 | // for (i, &child) in children.iter().enumerate() { | |
2327 | // let child = as_element(child); | |
2328 | // debug!("child# {}: {}", n_preceding_siblings+i, mml_to_string(child)); | |
2329 | // } | |
2330 | // debug!("\n"); | |
2331 | // } | |
2332 | ||
2333 | // gather up the text of the children (all mn, mo, or mtext) | |
2334 | 275 | let mut previous_name_was_mn = false; |
2335 | 275 | let mut text = "".to_string(); |
2336 | 727 | for &child in |
2337 | 727 | let child = as_element(child); |
2338 | 727 | let child_name = name(child); |
2339 | 727 | if previous_name_was_mn && |
2340 | 94 | text.push('\u{FFFF}'); // FIX: this should come from the separator string |
2341 | 633 | } |
2342 | 727 | text.push_str(as_text(child)); |
2343 | 727 | previous_name_was_mn = child_name == "mn"; |
2344 | } | |
2345 | ||
2346 | 275 | let text = text.trim(); // could be space got merged into an mn (e.g., braille::UEB::iceb::expr_3_1_6) |
2347 | // debug!(" text='{}', decimal num={}, 3 digit match={}, 3-5 match={}, 1 digit={}", &text, | |
2348 | // context.patterns.digit_only_decimal_number.is_match(text), | |
2349 | // context.patterns.block_3digit_pattern.is_match(text), | |
2350 | // context.patterns.block_3_5digit_pattern.is_match(text), | |
2351 | // context.patterns.block_1digit_pattern.is_match(text)); | |
2352 | 275 | if !(context.patterns.digit_only_decimal_number.is_match(text) || |
2353 | 190 | context.patterns.block_3digit_pattern.is_match(text) || |
2354 | 167 | context.patterns.block_3_5digit_pattern.is_match(text) || |
2355 | 166 | context.patterns.block_4digit_hex_pattern.is_match(text) || |
2356 | 162 | ( (text.chars().count() > 5 || |
2357 | 25 | context.patterns.block_1digit_pattern.is_match(text) ) |
2358 | ) { | |
2359 | 161 | return false; |
2360 | 114 | } |
2361 | ||
2362 | // ??? might want to rule out "sequences" like '100, 200, 300' and '100, 103, 106' (if constant difference, then a sequence) | |
2363 | ||
2364 | // If surrounded by fences, and commas are used, leave as is (e.g, "{1,234}") | |
2365 | 114 | if !text.contains(',') { |
2366 | 84 | return true; // not comma separated |
2367 | 30 | } |
2368 | ||
2369 | // We have already checked for whitespace as separators, so it must be a comma. Just check the fences. | |
2370 | // This is not yet in canonical form, so the fences may be siblings or siblings of the parent | |
2371 | 30 | let preceding_siblings = as_element(children[0]).preceding_siblings(); |
2372 | 30 | let following_siblings = as_element(children[end-1]).following_siblings(); |
2373 | let first_child; | |
2374 | let last_child; | |
2375 | 30 | if preceding_siblings.is_empty() && |
2376 | // number spans all children, look to parent for fences | |
2377 | 14 | let preceding_children = mrow.preceding_siblings(); |
2378 | 14 | let following_children = mrow.following_siblings(); |
2379 | 14 | if preceding_children.is_empty() || |
2380 | 9 | return true; // doesn't have left or right fence |
2381 | 5 | } |
2382 | 5 | first_child = preceding_children[preceding_children.len()-1]; |
2383 | 5 | last_child = following_children[0]; |
2384 | 16 | } else if preceding_siblings.is_empty() || |
2385 | 13 | return true; // can't be fences around it |
2386 | 3 | } else { |
2387 | 3 | first_child = preceding_siblings[preceding_siblings.len()-1]; |
2388 | 3 | last_child = following_siblings[0]; |
2389 | 3 | } |
2390 | 8 | let first_child = as_element(first_child); |
2391 | 8 | let last_child = as_element(last_child); |
2392 | 8 | return !(name(first_child) == "mo" && is_fence(first_child) && |
2393 | 7 | name(last_child) == "mo" && is_fence(last_child) ); |
2394 | 275 | } |
2395 | ||
2396 | // fn count_decimal_pts(context: &CanonicalizeContext, children: &[ChildOfElement], start: usize, end: usize) -> usize { | |
2397 | // let mut n_decimal_pt = 0; | |
2398 | // for &child_as_element in children.iter().take(end).skip(start) { | |
2399 | // let child = as_element(child_as_element); | |
2400 | // if context.patterns.decimal_separator.is_match(as_text(child)) { | |
2401 | // n_decimal_pt += 1; | |
2402 | // } | |
2403 | // } | |
2404 | // return n_decimal_pt; | |
2405 | // } | |
2406 | ||
2407 | /// This is a special case heuristic so try and determine if a terminating punctuation should be a decimal separator | |
2408 | /// Often math expressions end with punctuations for typographic reasons, so we try to figure that out here. | |
2409 | /// 'children' is a subset of 'mrow' | |
2410 | 5.55k | fn ignore_final_punctuation(context: &CanonicalizeContext, mrow: Element, children: &[ChildOfElement]) -> bool { |
2411 | 5.55k | let last_child = children[children.len()-1]; |
2412 | 5.55k | if mrow.children()[mrow.children().len()-1] != last_child { |
2413 | 3.49k | return false; // not at end |
2414 | 2.05k | } |
2415 | 2.05k | let parent = mrow.parent().unwrap().element(); |
2416 | 2.05k | if let Some( |
2417 | 1.71k | && name(math) != "math" { |
2418 | 1.58k | return false; // mrow inside something else -- not at end |
2419 | 471 | } |
2420 | ||
2421 | 471 | let last_child = as_element(last_child); |
2422 | // debug!("ignore_final_punctuation: last child={}", mml_to_string(last_child)); | |
2423 | 471 | if name(last_child) != "mo" { |
2424 | 451 | return false; // last was not "mo", so can't be a period |
2425 | 20 | } |
2426 | ||
2427 | 20 | if !context.patterns.decimal_separator.is_match(as_text(last_child)) { |
2428 | 0 | return false; |
2429 | 20 | } |
2430 | ||
2431 | // debug!("ignore_final_punctuation: #preceding={}", as_element(children[0]).preceding_siblings().len()); | |
2432 | // look to preceding siblings and see if an of the mn's have a decimal separator | |
2433 | 20 | return !as_element(children[0]).preceding_siblings().iter() |
2434 | 101 | . |
2435 | 101 | let child = as_element(child); |
2436 | 101 | name(child) == "mn" && |
2437 | 101 | }); |
2438 | 5.55k | } |
2439 | ||
2440 | /// Trim off any children that are whitespace on either side | |
2441 | 107 | fn trim_whitespace(children: &mut [ChildOfElement], start: usize, end: usize) -> (usize, usize) { |
2442 | 107 | let mut real_start = start; |
2443 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
2444 | 107 | for i in start..end { |
2445 | 107 | let child = as_element(children[i]); |
2446 | 107 | if !as_text(child).trim().is_empty() { |
2447 | 107 | real_start = i; |
2448 | 107 | break; |
2449 | 0 | } |
2450 | } | |
2451 | ||
2452 | 107 | let mut real_end = end; |
2453 | 157 | for i in ( |
2454 | 157 | let child = as_element(children[i]); |
2455 | 157 | if !as_text(child).trim().is_empty() { |
2456 | 107 | real_end = i+1; |
2457 | 107 | break; |
2458 | 50 | } |
2459 | } | |
2460 | 107 | return (real_start, real_end); |
2461 | 107 | } |
2462 | ||
2463 | /// Merge the number block from start..end | |
2464 | 107 | fn merge_block(children: &mut Vec<ChildOfElement>, start: usize, end: usize) { |
2465 | ||
2466 | // debug!("merge_block: merging {}..{}", start, end); | |
2467 | 107 | let mut mn_text = String::with_capacity(4*(end-start)-1); // true size less than #3 digit blocks + separator |
2468 | 237 | for &child_as_element in |
2469 | 237 | let child = as_element(child_as_element); |
2470 | 237 | mn_text.push_str(as_text(child)); |
2471 | 237 | } |
2472 | 107 | let child = as_element(children[start]); |
2473 | 107 | set_mathml_name(child, "mn"); |
2474 | 107 | child.set_text(&mn_text); |
2475 | ||
2476 | 107 | children.drain(start+1..end); |
2477 | 107 | } |
2478 | ||
2479 | ||
2480 | /// merge ° C or ° F into a single <mi> with the text '℃' or '℉' -- prevents '°' from becoming a superscript | |
2481 | #[allow(non_snake_case)] | |
2482 | 5.90k | fn merge_degrees_C_F<'a>(mrow: Element<'a>) -> Element<'a> { |
2483 | 5.90k | let mut degree_child = None; |
2484 | 28.1k | for child in |
2485 | 28.1k | let child = as_element(child); |
2486 | 28.1k | if is_leaf(child) { |
2487 | 23.9k | match as_text(child) { |
2488 | 23.9k | "°" => { |
2489 | 34 | degree_child = Some(child); |
2490 | 34 | }, |
2491 | 23.9k | "°C" => { |
2492 | 12 | child.set_text("℃"); |
2493 | 12 | degree_child = None; |
2494 | 12 | }, |
2495 | 23.8k | "°F" => { |
2496 | 0 | child.set_text("℉"); |
2497 | 0 | degree_child = None; |
2498 | 0 | }, |
2499 | 23.8k | text => { |
2500 | 23.8k | if let Some( |
2501 | 23 | && (text == "C" || |
2502 | // merge the degree child with the current child | |
2503 | 3 | degree_child.set_text(if text == "C" { |
2504 | 3 | child.remove_from_parent(); |
2505 | 23.8k | } |
2506 | // merge the degree child with the current child | |
2507 | 23.8k | degree_child = None; |
2508 | }, | |
2509 | } | |
2510 | 4.25k | } |
2511 | } | |
2512 | 5.90k | return mrow; |
2513 | 5.90k | } |
2514 | ||
2515 | ||
2516 | /// merge consecutive leaves containing any of the 'chars' into the first leaf -- probably used for omission with('_') | |
2517 | 5.90k | fn merge_chars<'a>(mrow: Element<'a>, pattern: &Regex) -> Element<'a> { |
2518 | 5.90k | let mut first_child = None; |
2519 | 5.90k | let mut new_text = "".to_string(); |
2520 | 28.1k | for child in |
2521 | 28.1k | let child = as_element(child); |
2522 | 28.1k | if is_leaf(child) { |
2523 | 23.9k | let text = as_text(child); |
2524 | 23.9k | if pattern.is_match(text) { |
2525 | 134 | if new_text.is_empty() { |
2526 | 118 | // potential start of a string |
2527 | 118 | first_child = Some(child); |
2528 | 118 | new_text = as_text(child).to_string(); |
2529 | 118 | } else { |
2530 | 16 | // merge chars |
2531 | 16 | new_text.push_str(text); |
2532 | 16 | child.remove_from_parent(); |
2533 | 16 | } |
2534 | 23.8k | } else if new_text.len() > 1 { |
2535 | 99 | // end of a run |
2536 | 99 | first_child.unwrap().set_text(&new_text); |
2537 | 99 | new_text.clear(); |
2538 | 23.7k | } else { |
2539 | 23.7k | new_text.clear(); // just one entry -- no need to set the text |
2540 | 23.7k | } |
2541 | 4.25k | } else if new_text.len() > 1{ |
2542 | 7 | // end of a run |
2543 | 7 | first_child.unwrap().set_text(&new_text); |
2544 | 7 | new_text.clear(); |
2545 | 4.24k | } else { |
2546 | 4.24k | new_text.clear(); // just one entry -- no need to set the text |
2547 | 4.24k | } |
2548 | } | |
2549 | 5.90k | if new_text.len() > 1{ |
2550 | 9 | // end of a run |
2551 | 9 | first_child.unwrap().set_text(&new_text); |
2552 | 5.89k | } |
2553 | 5.90k | return mrow; |
2554 | 5.90k | } |
2555 | ||
2556 | /// curl and divergence are handled as two character operators | |
2557 | /// if found, merge them into their own (new) mrow that has an intent on it | |
2558 | /// we can have '∇' or '𝛁', or those as vectors (inside an mover) | |
2559 | 10.1k | fn merge_cross_or_dot_product_elements(children: &mut Vec<ChildOfElement>) { |
2560 | 10.1k | if children.is_empty() { |
2561 | 3 | return; |
2562 | 10.1k | } |
2563 | 10.1k | let mut i = 0; |
2564 | 10.1k | let mut is_previous_nabla = false; |
2565 | 31.5k | while i < children.len() - 1 { |
2566 | 21.3k | let child = as_element(children[i]); |
2567 | 21.3k | if is_previous_nabla { |
2568 | 14 | if is_leaf(child) { |
2569 | 14 | let text = as_text(child); |
2570 | 14 | if text == "⋅" || |
2571 | 12 | let nabla_child = as_element(children[i-1]); |
2572 | 12 | let nabla_text = as_text( get_possible_embellished_node(nabla_child) ); |
2573 | 12 | let new_mrow = create_mathml_element(&child.document(), "mrow"); |
2574 | 12 | new_mrow.set_attribute_value(ACT_AS_OPERATOR, nabla_text); |
2575 | 12 | new_mrow.append_child(nabla_child); |
2576 | 12 | new_mrow.append_child(child); |
2577 | 12 | children[i-1] = ChildOfElement::Element(new_mrow); |
2578 | 12 | children.remove(i); |
2579 | 12 |
|
2580 | 0 | } |
2581 | 14 | is_previous_nabla = false; |
2582 | } else { | |
2583 | 21.3k | let potential_nabla = if name(child) == "mover" { |
2584 | 21.3k | if is_leaf(potential_nabla) { |
2585 | 19.0k | let text = as_text(potential_nabla); |
2586 | 19.0k | if text == "∇" || |
2587 | 22 | is_previous_nabla = true; |
2588 | 19.0k | } |
2589 | 2.27k | } |
2590 | } | |
2591 | 21.3k | i += 1; |
2592 | } | |
2593 | 10.1k | } |
2594 | ||
2595 | 5.90k | fn merge_dots(mrow: Element) -> Element { |
2596 | // merge consecutive <mo>s containing '.' into ellipsis | |
2597 | 5.90k | let children = mrow.children(); |
2598 | 5.90k | let mut i = 0; |
2599 | 5.90k | let mut n_dots = 0; // number of consecutive mo's containing dots |
2600 | 34.1k | while i < children.len() { |
2601 | 28.2k | let child = as_element(children[i]); |
2602 | 28.2k | if name(child) == "mo" { |
2603 | 10.4k | let text = as_text(child); |
2604 | 10.4k | if text == "." { |
2605 | 71 | n_dots += 1; |
2606 | 71 | if n_dots == 3 { |
2607 | 3 | let first_child = as_element(children[i-2]); |
2608 | 3 | first_child.set_text("…"); |
2609 | 3 | as_element(children[i-1]).remove_from_parent(); |
2610 | 3 | child.remove_from_parent(); |
2611 | 3 | n_dots = 0; |
2612 | 68 | } |
2613 | 10.3k | } else { |
2614 | 10.3k | n_dots = 0; |
2615 | 10.3k | } |
2616 | 17.7k | } else { |
2617 | 17.7k | n_dots = 0; |
2618 | 17.7k | } |
2619 | 28.2k | i += 1; |
2620 | } | |
2621 | 5.90k | return mrow; |
2622 | 5.90k | } |
2623 | ||
2624 | 5.90k | fn merge_primes(mrow: Element) -> Element { |
2625 | // merge consecutive <mo>s containing primes (in various forms) | |
2626 | 5.90k | let mut children = mrow.children(); |
2627 | 5.90k | let mut i = 0; |
2628 | 5.90k | let mut n_primes = 0; // number of consecutive mo's containing primes |
2629 | 34.1k | while i < children.len() { |
2630 | 28.1k | let child = as_element(children[i]); |
2631 | 28.1k | if name(child) == "mo" { |
2632 | 10.4k | let text = as_text(child); |
2633 | // FIX: should we be more restrictive and change (apostrophe) only in a superscript? | |
2634 | 10.4k | if IS_PRIME.is_match(text) { |
2635 | 21 | n_primes += 1; |
2636 | 10.4k | } else if n_primes > 0 { |
2637 | 3 | merge_prime_elements(&mut children, i - n_primes, i); |
2638 | 3 | n_primes = 0; |
2639 | 10.4k | } |
2640 | 17.7k | } else if n_primes > 0 { |
2641 | 2 | merge_prime_elements(&mut children, i - n_primes, i); |
2642 | 2 | n_primes = 0; |
2643 | 17.7k | } |
2644 | 28.1k | i += 1; |
2645 | } | |
2646 | 5.90k | if n_primes > 0 { |
2647 | 12 | merge_prime_elements(&mut children, i - n_primes, i); |
2648 | 5.89k | } |
2649 | 5.90k | return mrow; |
2650 | 5.90k | } |
2651 | ||
2652 | 17 | fn merge_prime_elements(children: &mut [ChildOfElement], start: usize, end: usize) { |
2653 | // not very efficient since this is probably causing an array shift each time (array is probably not big though) | |
2654 | 17 | let first_child = as_element(children[start]); |
2655 | 17 | let mut new_text = String::with_capacity(end+3-start); // one per element plus a little extra |
2656 | 17 | new_text.push_str(as_text(first_child)); |
2657 | 17 | for & |
2658 | 4 | let child = as_element(child_as_element); |
2659 | 4 | let text = as_text(child); // only in this function because it is an <mo> |
2660 | 4 | new_text.push_str(text); |
2661 | 4 | child.remove_from_parent(); |
2662 | 4 | } |
2663 | 17 | first_child.set_text(&merge_prime_text(&new_text)); |
2664 | 17 | } |
2665 | ||
2666 | 83 | fn merge_prime_text(text: &str) -> String { |
2667 | // merge together single primes into double primes, etc. | |
2668 | 83 | let mut n_primes = 0; |
2669 | 101 | for ch in |
2670 | 101 | match ch { |
2671 | 90 | '\'' | '′' => n_primes += 1, |
2672 | 9 | '″' => n_primes += 2, |
2673 | 0 | '‴' => n_primes += 3, |
2674 | 2 | '⁗' => n_primes += 4, |
2675 | _ => { | |
2676 | 0 | eprintln!("merge_prime_text: unexpected char '{ch}' found in prime text '{text}'"); |
2677 | 0 | return text.to_string(); |
2678 | } | |
2679 | } | |
2680 | } | |
2681 | // it would be very rare to have more than a quadruple prime, so the inefficiency in the won't likely happen | |
2682 | 83 | let mut result = String::with_capacity(n_primes); // likely 4x too big, but string is short-lived and small |
2683 | 83 | for _ in 0..n_primes/4 { |
2684 | 3 | result.push('⁗'); |
2685 | 3 | } |
2686 | 83 | match n_primes % 4 { |
2687 | 61 | 1 => result.push('′'), |
2688 | 20 | 2 => result.push('″'), |
2689 | 1 | 3 => result.push('‴'), |
2690 | 1 | _ => () // can't happen |
2691 | } | |
2692 | 83 | return result; |
2693 | 83 | } |
2694 | ||
2695 | // from https://www.w3.org/TR/MathML3/chapter7.html#chars.pseudo-scripts | |
2696 | 35.1k | fn is_pseudo_script_char(ch: char) -> bool { |
2697 | 35.1k |
|
2698 | '\"' | '\'' | '*' | '`' | 'ª' | '°' | '²' | '³' | '´' | '¹' | 'º' | | |
2699 | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' | | |
2700 | '\u{2032}' | '\u{2033}' | '\u{2034}' | '\u{2035}' | '\u{2036}' | '\u{2037}' | '\u{2057}' | |
2701 | ) | |
2702 | 35.1k | } |
2703 | 5.90k | fn handle_pseudo_scripts(mrow: Element) -> Element { |
2704 | ||
2705 | 5.90k | assert!(name(mrow) == "mrow" || |
2706 | 5.90k | let mut children = mrow.children(); |
2707 | // check to see if mrow of all pseudo scripts | |
2708 | 5.91k | if |
2709 | 5.91k | is_pseudo_script(as_element(child)) |
2710 | 5.91k | }) { |
2711 | 2 | let parent = get_parent(mrow); // must exist |
2712 | 2 | let is_first_child = mrow.preceding_siblings().is_empty(); |
2713 | 2 | if is_first_child { |
2714 | 0 | return mrow; // FIX: what should happen |
2715 | 2 | } |
2716 | 2 | if crate::xpath_functions::IsNode::is_scripted(parent) { |
2717 | 2 | return mrow; // already in a script position |
2718 | 0 | } |
2719 | 0 | if name(parent) == "mrow" { |
2720 | 0 | mrow.set_attribute_value("data-pseudo-script", "true"); |
2721 | 0 | return handle_pseudo_scripts(parent); |
2722 | } else { | |
2723 | 0 | return mrow; // FIX: what should happen? |
2724 | } | |
2725 | 5.90k | } |
2726 | ||
2727 | 5.90k | let mut i = 1; |
2728 | 5.90k | let mut found = false; |
2729 | 28.1k | while i < children.len() { |
2730 | 22.2k | let child = as_element(children[i]); |
2731 | 22.2k | if is_pseudo_script(child) || |
2732 | 22.2k | child.attribute("data-pseudo-script").is_some() { |
2733 | 35 | let msup = create_mathml_element(&child.document(), "msup"); |
2734 | 35 | msup.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
2735 | 35 | msup.append_child(children[i-1]); |
2736 | 35 | msup.append_child(child); |
2737 | 35 | children[i-1] = ChildOfElement::Element(msup); |
2738 | 35 | children.remove(i); |
2739 | 35 | found = true; |
2740 | 22.2k | } else { |
2741 | 22.2k | i += 1; |
2742 | 22.2k | } |
2743 | } | |
2744 | 5.90k | if found { |
2745 | 25 | mrow.replace_children(children) |
2746 | 5.88k | } |
2747 | 5.90k | return mrow; |
2748 | ||
2749 | 28.1k | fn is_pseudo_script(child: Element) -> bool { |
2750 | 28.1k | if name(child) == "mo" { |
2751 | 10.4k | let text = as_text(child); |
2752 | 10.4k | if let Some( |
2753 | 10.3k | && is_pseudo_script_char(ch) { |
2754 | // don't script a pseudo-script | |
2755 | 55 | let preceding_siblings = child.preceding_siblings(); |
2756 | 55 | if !preceding_siblings.is_empty() { |
2757 | 42 | let last_child = as_element(preceding_siblings[preceding_siblings.len()-1]); |
2758 | 42 | if name(last_child) == "mo" && |
2759 | 10 | let Some(ch) = single_char(as_text(last_child)) |
2760 | 10 | && is_pseudo_script_char(ch) { |
2761 | 6 | return false; |
2762 | 36 | } |
2763 | 13 | } |
2764 | 49 | if text == "*" { |
2765 | // could be infix "*" -- this is a weak check to see if what follows is potentially an operand | |
2766 | 5 | let following_siblings = child.following_siblings(); |
2767 | 5 | if following_siblings.is_empty() { |
2768 | 1 | return true; |
2769 | 4 | } |
2770 | 4 | let first_child = as_element(following_siblings[0]); |
2771 | 4 | return name(first_child) != "mo" || ["(", "[", "{"].contains(&text); |
2772 | } else { | |
2773 | 44 | return true; |
2774 | } | |
2775 | 10.3k | } |
2776 | 17.7k | } |
2777 | 28.1k | return false; |
2778 | ||
2779 | /// An efficient method to get the char from a string if it is just one char or fail | |
2780 | 10.4k | fn single_char(text: &str) -> Option<char> { |
2781 | 10.4k | let mut chars = text.chars(); |
2782 | 10.4k | let ch = chars.next(); |
2783 | 10.4k | if ch.is_none() || chars.next().is_some() { |
2784 | 39 | return None; // not one character |
2785 | } else { | |
2786 | 10.3k | return ch; |
2787 | } | |
2788 | 10.4k | } |
2789 | 28.1k | } |
2790 | ||
2791 | 5.90k | } |
2792 | ||
2793 | 10.1k | fn handle_convert_to_mmultiscripts(children: &mut Vec<ChildOfElement>) { |
2794 | 10.1k | if children.len() == 1 { |
2795 | 4.45k | return; // can't convert to mmultiscripts if there is nothing to attach an empty base to |
2796 | 5.72k | } |
2797 | 5.72k | let mut i = 0; |
2798 | // convert_to_mmultiscripts changes 'children', so can't cache length | |
2799 | 32.7k | while i < children.len() { |
2800 | 26.9k | let child = as_element(children[i]); |
2801 | 26.9k | let child_name = name(child); |
2802 | 26.9k | if (child_name == "msub" || |
2803 | 115 | i = convert_to_mmultiscripts(children, i); |
2804 | 26.8k | } else { |
2805 | 26.8k | i += 1; |
2806 | 26.8k | } |
2807 | } | |
2808 | 10.1k | } |
2809 | ||
2810 | ||
2811 | /// Converts the script element with an empty base to mmultiscripts by sucking the base from the following or preceding element. | |
2812 | /// The following element is preferred so that these become prescripts (common usage is from TeX), but if the preceding element | |
2813 | /// has a closer mi/mtext, it is used. | |
2814 | /// mhchem has some ugly output (at least in MathJax) and that's where using the following element makes sense (usually) | |
2815 | /// because an empty base (mpadded width=0) is used for the scripts. A hacky attribute indicates this case. | |
2816 | 115 | fn convert_to_mmultiscripts(mrow_children: &mut Vec<ChildOfElement>, i: usize) -> usize { |
2817 | // this is a bit messy/confusing because we might scan forwards or backwards and this affects whether | |
2818 | // we are scanning for prescripts or postscripts | |
2819 | // the generic name "primary_scripts" means prescripts if going forward or postscripts if going backwards | |
2820 | // if we are going forward and hit a sub/superscript with a base, then those scripts become postscripts ("other_scripts") | |
2821 | // if we are going backwards, we never add prescripts | |
2822 | ||
2823 | // let parent = get_parent(as_element(mrow_children[i])); | |
2824 | // debug!("convert_to_mmultiscripts (i={}) -- PARENT:\n{}", i, mml_to_string(parent)); | |
2825 | ||
2826 | 115 | let i_base = choose_base_of_mmultiscripts(mrow_children, i); |
2827 | 115 | let mut base = as_element(mrow_children[i_base]); |
2828 | // debug!("convert_to_mmultiscripts -- base\n{}", mml_to_string(base)); | |
2829 | 115 | let base_name = name(base); |
2830 | 115 | let mut prescripts = vec![]; |
2831 | 115 | let mut postscripts = vec![]; |
2832 | 115 | let mut i_postscript = i_base + 1; |
2833 | ||
2834 | 115 | if (base_name == "msub" || |
2835 | 5 | !CanonicalizeContext::is_empty_element(as_element(base.children()[0])) { |
2836 | 5 | // if the base is a script element, then we want the base of that to be the base of the mmultiscripts |
2837 | 5 | let mut base_children = base.children(); |
2838 | 5 | let script_base = as_element(base.children()[0]); |
2839 | 5 | base_children[0] = ChildOfElement::Element(CanonicalizeContext::create_empty_element(&base.document())); |
2840 | 5 | base.replace_children(base_children); |
2841 | 5 | add_to_scripts(base, &mut postscripts); |
2842 | 5 | base = script_base; |
2843 | 110 | } |
2844 | ||
2845 | 115 | let mut has_chemistry_prescript = false; // chemical elements don't have both prescripts (nuclear chem) and postscripts |
2846 | 115 | if i_base > i { |
2847 | // we have prescripts -- gather them up | |
2848 | 61 | let mut i_prescript = i; |
2849 | 122 | while i_prescript < i_base { |
2850 | 61 | let script = as_element(mrow_children[i_prescript]); |
2851 | // kind of ugly -- this duplicates the first part of add_to_scripts | |
2852 | 61 | let script_name = name(script); |
2853 | 61 | if script_name == "msub" || |
2854 | 61 | let base = as_element(script.children()[0]); |
2855 | 61 | has_chemistry_prescript |= base.attribute(MHCHEM_MMULTISCRIPTS_HACK).is_some(); |
2856 | 61 |
|
2857 | 61 | if !add_to_scripts(script, &mut prescripts) { |
2858 | 0 | break; |
2859 | 61 | } |
2860 | 61 | i_prescript += 1; |
2861 | } | |
2862 | 54 | } |
2863 | ||
2864 | 115 | if !has_chemistry_prescript { |
2865 | // gather up the postscripts (if any) | |
2866 | 137 | while i_postscript < mrow_children.len() { |
2867 | 104 | let script = as_element(mrow_children[i_postscript]); |
2868 | // debug!("script: {}", mml_to_string(script)); | |
2869 | // if name(script) == "msub" && i_postscript+1 < mrow_children.len() { | |
2870 | // let superscript = as_element(mrow_children[i_postscript+1]); | |
2871 | // if name(superscript) == "msup" && CanonicalizeContext::is_empty_element(as_element(superscript.children()[0])) { | |
2872 | // set_mathml_name(script, "msubsup"); | |
2873 | // script.append_child(superscript.children()[1]); | |
2874 | // i_postscript += 1; | |
2875 | // } | |
2876 | // } | |
2877 | // debug!("adding postscript\n{}", mml_to_string(script)); | |
2878 | 104 | if !add_to_scripts(script, &mut postscripts) { |
2879 | 32 | break; |
2880 | 72 | } |
2881 | 72 | i_postscript += 1; |
2882 | } | |
2883 | 50 | } |
2884 | ||
2885 | 115 | let i_multiscript = if i_base < i { |
2886 | 115 | let script = create_mathml_element(&base.document(), "mmultiscripts"); |
2887 | 115 | let mut num_children = 1 + postscripts.len(); |
2888 | 115 | if !prescripts.is_empty() { |
2889 | 61 | num_children += 1 + prescripts.len(); |
2890 | 61 |
|
2891 | 115 | let mut new_children = Vec::with_capacity(num_children); |
2892 | 115 | new_children.push(ChildOfElement::Element(base)); |
2893 | 115 | new_children.append(&mut postscripts); |
2894 | 115 | if !prescripts.is_empty() { |
2895 | 61 | new_children.push( ChildOfElement::Element( create_mathml_element(&script.document(), "mprescripts") ) ); |
2896 | 61 | new_children.append(&mut prescripts); |
2897 | 61 |
|
2898 | ||
2899 | 115 | script.replace_children(new_children); |
2900 | 115 | let lifted_base = as_element(mrow_children[i_multiscript]); |
2901 | 115 | add_attrs(script, &lifted_base.attributes()); |
2902 | 115 | script.remove_attribute("data-split"); // doesn't make sense on mmultiscripts |
2903 | 115 | script.remove_attribute("mathvariant"); // doesn't make sense on mmultiscripts |
2904 | 115 | mrow_children[i_multiscript] = ChildOfElement::Element(script); |
2905 | 115 | mrow_children.drain(i_multiscript+1..i_postscript); // remove children after the first |
2906 | ||
2907 | 115 | let likely_chemistry = likely_adorned_chem_formula(script); |
2908 | 115 | if likely_chemistry >= 0 { |
2909 | 106 | script.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
2910 | 106 |
|
2911 | ||
2912 | // debug!("convert_to_mmultiscripts -- converted script:\n{}", mml_to_string(script)); | |
2913 | // debug!("convert_to_mmultiscripts (at end) -- #children={}", mrow_children.len()); | |
2914 | 115 | return i_multiscript + 1; // child to start on next |
2915 | 115 | } |
2916 | ||
2917 | 170 | fn add_to_scripts<'a>(el: Element<'a>, scripts: &mut Vec<ChildOfElement<'a>>) -> bool { |
2918 | 170 | let script_name = name(el); |
2919 | 170 | if !(script_name == "msub" || |
2920 | 32 | return false; |
2921 | 138 | } |
2922 | 138 | let base = as_element(el.children()[0]); |
2923 | 138 | if !CanonicalizeContext::is_empty_element(base) { // prescript that really should be a postscript |
2924 | // debug!("add_to_scripts: not empty base:\n{}", mml_to_string(base)); | |
2925 | 0 | return false; |
2926 | 138 | } |
2927 | 138 | if script_name == "msub" { |
2928 | 59 | add_pair(scripts, Some(el.children()[1]), None); |
2929 | 79 | } else if script_name == "msup" { |
2930 | 31 | add_pair(scripts, None, Some(el.children()[1])); |
2931 | 48 | } else { // msubsup |
2932 | 48 | add_pair(scripts, Some(el.children()[1]), Some(el.children()[2])); |
2933 | 48 | }; |
2934 | 138 | return true; |
2935 | 170 | } |
2936 | ||
2937 | 138 | fn add_pair<'v, 'a:'v>(script_vec: &'v mut Vec<ChildOfElement<'a>>, subscript: Option<ChildOfElement<'a>>, superscript: Option<ChildOfElement<'a>>) { |
2938 | 138 | let child_of_element = if let Some( |
2939 | 138 | let doc = as_element(child_of_element).document(); |
2940 | 138 | let subscript = if let Some( |
2941 | 107 | if CanonicalizeContext::is_empty_element(as_element(subscript)) { |
2942 | 0 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2943 | } else { | |
2944 | 107 | subscript |
2945 | } | |
2946 | } else { | |
2947 | 31 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2948 | }; | |
2949 | 138 | let superscript = if let Some( |
2950 | 79 | if CanonicalizeContext::is_empty_element(as_element(superscript)) { |
2951 | 0 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2952 | } else { | |
2953 | 79 | superscript |
2954 | } | |
2955 | } else { | |
2956 | 59 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2957 | }; | |
2958 | 138 | script_vec.push(subscript); |
2959 | 138 | script_vec.push(superscript); |
2960 | 138 | } |
2961 | ||
2962 | /// Find the closest likely base to the 'i'th child, preferring the next one over the preceding one, but want the closest. | |
2963 | /// | |
2964 | /// Note: because the base might be (...), 'mrow_children might be changed so that they are grouped into an mrow. | |
2965 | 115 | fn choose_base_of_mmultiscripts(mrow_children: &mut Vec<ChildOfElement>, i: usize) -> usize { |
2966 | // We already know there are no empty scripts to the left (because we find first empty base from left to right). | |
2967 | // However, there may be some empty bases before we get to real base on the right. | |
2968 | 115 | let script_element_base = as_element(as_element(mrow_children[i]).children()[0]); |
2969 | 115 | let mut likely_postscript = script_element_base.attribute(MHCHEM_MMULTISCRIPTS_HACK).is_some() && |
2970 | 115 | if likely_postscript { |
2971 | 86 | let base_of_postscript = as_element(mrow_children[i-1]); |
2972 | 86 | if name(base_of_postscript) != "mi" || |
2973 | 36 | likely_postscript = false; // base for potential postscript doesn't look reasonable -- consider it a prescript |
2974 | 50 | } |
2975 | 29 | } |
2976 | 115 | if i+1 < mrow_children.len() && |
2977 | 61 | return i+1; |
2978 | 54 | } |
2979 | 54 | if i > 0 { |
2980 | 54 | if let Some( |
2981 | 2 | assert!(i_start < i-1); // should be at least two children (open and close) |
2982 | // create a new mrow, add the grouped children to it, then drain all but the first of them from the original mrow vec. | |
2983 | // stick the mrow into the first of them -- this is the base | |
2984 | 2 | let new_mrow = create_mathml_element(&as_element(mrow_children[0]).document(), "mrow"); |
2985 | 2 | new_mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
2986 | 8 | for &child in |
2987 | 8 | new_mrow.append_child(child); |
2988 | 8 | } |
2989 | 2 | mrow_children.drain(i_start+1..i); |
2990 | 2 | mrow_children[i_start] = ChildOfElement::Element(new_mrow); |
2991 | 2 | return i_start; |
2992 | 52 | } |
2993 | 52 | if is_child_simple_base(mrow_children[i-1]) { |
2994 | 52 | return i-1; |
2995 | 0 | } |
2996 | 0 | } |
2997 | ||
2998 | // base very likely after multiple scripts to the right | |
2999 | 0 | for (i_base, &child) in mrow_children.iter().enumerate().skip(i+1) { |
3000 | 0 | if is_child_simple_base(child) { |
3001 | 0 | return i_base; |
3002 | } else { | |
3003 | 0 | let child = as_element(child); |
3004 | 0 | let child_name = name(child); |
3005 | 0 | if !(child_name == "msub" || child_name == "msup" || child_name == "msubsup") { |
3006 | 0 | break; |
3007 | 0 | } |
3008 | } | |
3009 | } | |
3010 | // didn't find any good candidates for a base -- pick something valid | |
3011 | 0 | assert!(mrow_children.len() > i); |
3012 | 0 | return i; |
3013 | ||
3014 | ||
3015 | 113 | fn is_child_simple_base(child: ChildOfElement) -> bool { |
3016 | 113 | let mut child = as_element(child); |
3017 | 113 | let child_name = name(child); |
3018 | 113 | if child_name == "msub" || |
3019 | 5 | child = as_element(child.children()[0]); |
3020 | 108 | } |
3021 | ||
3022 | 113 | return is_leaf(child) && !CanonicalizeContext::is_empty_element(child); // a little overly general (but hopefully doesn't matter) |
3023 | 113 | } |
3024 | ||
3025 | /// Return the index of the matched open paren/bracket if the last element is a closed paren/bracket | |
3026 | 54 | fn is_grouped_base(mrow_children: &[ChildOfElement]) -> Option<usize> { |
3027 | // FIX: this really belongs in canonicalization pass, not the clean pass | |
3028 | 54 | let i_last = mrow_children.len()-1; |
3029 | 54 | let last_child = get_possible_embellished_node(as_element(mrow_children[i_last])); |
3030 | 54 | if name(last_child) == "mo" && |
3031 | 3 | CanonicalizeContext::find_operator(None, last_child, None, None, None).is_right_fence() { |
3032 | 6 | for i_child in ( |
3033 | 6 | let child = get_possible_embellished_node(as_element(mrow_children[i_child])); |
3034 | 6 | if name(child) == "mo" && |
3035 | 2 | CanonicalizeContext::find_operator(None, child, None, None, None).is_left_fence() { |
3036 | // FIX: should make sure left and right match. Should also count for nested parens | |
3037 | 2 | return Some(i_child); |
3038 | 4 | } |
3039 | } | |
3040 | 52 | } |
3041 | 52 | return None; |
3042 | 54 | } |
3043 | 115 | } |
3044 | 52.3k | } |
3045 | ||
3046 | 64.1k | fn canonicalize_mrows<'a>(&self, mathml: Element<'a>) -> Result<Element<'a>> { |
3047 | 64.1k | let tag_name = name(mathml); |
3048 | 64.1k | set_mathml_name(mathml, tag_name); // add namespace |
3049 | 64.1k | match tag_name { |
3050 | 64.1k | "mi" | |
3051 | 15.8k | self.canonicalize_plane1(mathml); |
3052 | 15.8k | return Ok( mathml ); }, |
3053 | 48.3k | "mo" => { |
3054 | 14.6k | self.canonicalize_plane1(mathml); |
3055 | 14.6k | self.canonicalize_mo_text(mathml); |
3056 | 14.6k | return Ok( mathml ); |
3057 | }, | |
3058 | 33.7k | "mn" => { |
3059 | 11.6k | self.canonicalize_plane1(mathml); |
3060 | 11.6k | return Ok( mathml ); |
3061 | }, | |
3062 | 22.0k | "mrow" => { |
3063 | 7.48k | return self.canonicalize_mrows_in_mrow(mathml); |
3064 | }, | |
3065 | _ => { | |
3066 | // recursively try to make mrows in other structures (eg, num/denom in fraction) | |
3067 | 14.6k | let mut new_children = Vec::with_capacity(mathml.children().len()); |
3068 | 21.5k | for child in |
3069 | 21.5k | match child { |
3070 | 21.5k | ChildOfElement::Element(e) => { |
3071 | 21.5k | new_children.push( ChildOfElement::Element(self.canonicalize_mrows(e) |
3072 | }, | |
3073 | 0 | ChildOfElement::Text(t) => { |
3074 | 0 | if mathml.children().len() != 1 { |
3075 | 0 | bail!("Text '{}' found with more than one child in element '{}'", t.text(), tag_name); |
3076 | 0 | } |
3077 | 0 | return Ok( mathml ); |
3078 | }, | |
3079 | 0 | _ => bail!("Should have been an element or text in '{}'", tag_name), |
3080 | } | |
3081 | } | |
3082 | 14.6k | mathml.replace_children(new_children); |
3083 | 14.6k | return Ok( mathml ); |
3084 | }, | |
3085 | } | |
3086 | 64.1k | } |
3087 | ||
3088 | 1.91k | fn potentially_lift_script<'a>(&self, mrow: Element<'a>) -> Element<'a> { |
3089 | 1.91k | if name(mrow) != "mrow" { |
3090 | 0 | return mrow; |
3091 | 1.91k | } |
3092 | 1.91k | let mut mrow_children = mrow.children(); |
3093 | 1.91k | let first_child = as_element(mrow_children[0]); |
3094 | 1.91k | let last_child = as_element(mrow_children[mrow_children.len()-1]); |
3095 | 1.91k | let last_child_name = name(last_child); |
3096 | ||
3097 | 1.91k | if name(first_child) == "mo" && |
3098 | 1.91k | (last_child_name == "msub" || last_child_name == "msup" || |
3099 | 19 | let base = as_element(last_child.children()[0]); |
3100 | 19 | if !(name(base) == "mo" && is_fence(base)) { |
3101 | 0 | return mrow; // not a case we are interested in |
3102 | 19 | } |
3103 | // else drop through | |
3104 | } else { | |
3105 | 1.89k | return mrow; // not a case we are interested in |
3106 | } | |
3107 | ||
3108 | 19 | let script = last_child; // better name now that we know what it is |
3109 | 19 | let mut script_children = script.children(); |
3110 | 19 | let close_fence = script_children[0]; |
3111 | 19 | let mrow_children_len = mrow_children.len(); // rust complains about a borrow after move if we don't store this first |
3112 | 19 | mrow_children[mrow_children_len-1] = close_fence; // make the mrow hold the fences |
3113 | 19 | mrow.replace_children(mrow_children); |
3114 | // make the mrow the child of the script | |
3115 | 19 | script_children[0] = ChildOfElement::Element(mrow); |
3116 | 19 | script.replace_children(script_children); |
3117 | 19 | return script; |
3118 | 1.91k | } |
3119 | ||
3120 | /// Map names to start of Unicode alphanumeric blocks (Roman, digits, Greek) | |
3121 | /// Don't do this for function names -- for function names, map them back to ASCII | |
3122 | 42.1k | fn canonicalize_plane1<'a>(&self, mi: Element<'a>) -> Element<'a> { |
3123 | // if the character shouldn't be mapped, use 0 -- don't use 'A' as ASCII and Greek aren't contiguous | |
3124 | static MATH_VARIANTS: phf::Map<&str, [u32; 3]> = phf_map! { | |
3125 | // "normal" -- nothing to do | |
3126 | "italic" => [0, 0, 0x1D6E2], | |
3127 | "bold" => [0x1D400, 0x1D7CE, 0x1D6A8], | |
3128 | "bold-italic" => [0x1D468, 0x1D7CE, 0x1D71C], | |
3129 | "double-struck" => [0x1D538, 0x1D7D8, 0], | |
3130 | "bold-fraktur" => [0x1D56C, 0, 0x1D6A8], | |
3131 | "script" => [0x1D49C, 0, 0], | |
3132 | "bold-script" => [0x1D4D0, 0, 0x1D6A8], | |
3133 | "fraktur" => [0x1D504, 0, 0], | |
3134 | "sans-serif" => [0x1D5A0, 0x1D7E2, 0], | |
3135 | "bold-sans-serif" => [0x1D5D4, 0x1D7EC, 0x1D756], | |
3136 | "sans-serif-italic" => [0x1D608, 0x1D7E2, 0], | |
3137 | "sans-serif-bold-italic" => [0x1D63C, 0x1D7EC, 0x1D790], | |
3138 | "monospace" => [0x1D670, 0x1D7F6, 0], | |
3139 | }; | |
3140 | ||
3141 | 42.1k | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
3142 | // names that are always function names (e.g, "sin" and "log") | |
3143 | 42.1k | let defs = defs.borrow(); |
3144 | 42.1k | let |
3145 | 42.1k | Some(hs) => hs, |
3146 | 3 | None => return mi, // happens in some canonicalize tests but not in real use |
3147 | }; | |
3148 | ||
3149 | ||
3150 | 42.1k | let mi_text = as_text(mi); |
3151 | 42.1k | let variant = mi.attribute_value("mathvariant"); |
3152 | ||
3153 | 42.1k | if names.contains(mi_text) { |
3154 | 791 | return mi; // avoid mapping mathvariant for function names |
3155 | 41.3k | } |
3156 | // function name might be (wrongly) set to italic math alphanumeric chars, including bold italic | |
3157 | 41.3k | if let Some( |
3158 | 12.6k | && names.contains(&ascii_text) { |
3159 | 3 | mi.set_text(&ascii_text); |
3160 | 3 | return mi |
3161 | 41.3k | } |
3162 | ||
3163 | 41.3k | if variant.is_none() { |
3164 | 40.3k | return mi; |
3165 | 952 | } |
3166 | ||
3167 | 952 | let new_text = match MATH_VARIANTS.get(variant.unwrap()) { |
3168 | 755 | None => mi_text.to_string(), |
3169 | 197 | Some(start) => shift_text(mi_text, start), |
3170 | }; | |
3171 | // mi.remove_attribute("mathvariant"); // leave attr -- for Nemeth, there are italic digits etc that don't have Unicode points | |
3172 | 952 | mi.set_text(&new_text); |
3173 | 952 | return mi; |
3174 | 42.1k | }); |
3175 | ||
3176 | 197 | fn shift_text(old_text: &str, char_mapping: &[u32; 3]) -> String { |
3177 | // if there is no block for something, use 'a', 'A', 0 as that will be a no-op | |
3178 | struct Offsets { | |
3179 | ch: u32, | |
3180 | table: usize, | |
3181 | } | |
3182 | static SHIFT_AMOUNTS: phf::Map<char, Offsets> = phf_map! { | |
3183 | 'A' => Offsets{ ch: 0, table: 0}, | |
3184 | 'B' => Offsets{ ch: 1, table: 0}, | |
3185 | 'C' => Offsets{ ch: 2, table: 0}, | |
3186 | 'D' => Offsets{ ch: 3, table: 0}, | |
3187 | 'E' => Offsets{ ch: 4, table: 0}, | |
3188 | 'F' => Offsets{ ch: 5, table: 0}, | |
3189 | 'G' => Offsets{ ch: 6, table: 0}, | |
3190 | 'H' => Offsets{ ch: 7, table: 0}, | |
3191 | 'I' => Offsets{ ch: 8, table: 0}, | |
3192 | 'J' => Offsets{ ch: 9, table: 0}, | |
3193 | 'K' => Offsets{ ch: 10, table: 0}, | |
3194 | 'L' => Offsets{ ch: 11, table: 0}, | |
3195 | 'M' => Offsets{ ch: 12, table: 0}, | |
3196 | 'N' => Offsets{ ch: 13, table: 0}, | |
3197 | 'O' => Offsets{ ch: 14, table: 0}, | |
3198 | 'P' => Offsets{ ch: 15, table: 0}, | |
3199 | 'Q' => Offsets{ ch: 16, table: 0}, | |
3200 | 'R' => Offsets{ ch: 17, table: 0}, | |
3201 | 'S' => Offsets{ ch: 18, table: 0}, | |
3202 | 'T' => Offsets{ ch: 19, table: 0}, | |
3203 | 'U' => Offsets{ ch: 20, table: 0}, | |
3204 | 'V' => Offsets{ ch: 21, table: 0}, | |
3205 | 'W' => Offsets{ ch: 22, table: 0}, | |
3206 | 'X' => Offsets{ ch: 23, table: 0}, | |
3207 | 'Y' => Offsets{ ch: 24, table: 0}, | |
3208 | 'Z' => Offsets{ ch: 25, table: 0}, | |
3209 | 'a' => Offsets{ ch: 26, table: 0}, | |
3210 | 'b' => Offsets{ ch: 27, table: 0}, | |
3211 | 'c' => Offsets{ ch: 28, table: 0}, | |
3212 | 'd' => Offsets{ ch: 29, table: 0}, | |
3213 | 'e' => Offsets{ ch: 30, table: 0}, | |
3214 | 'f' => Offsets{ ch: 31, table: 0}, | |
3215 | 'g' => Offsets{ ch: 32, table: 0}, | |
3216 | 'h' => Offsets{ ch: 33, table: 0}, | |
3217 | 'i' => Offsets{ ch: 34, table: 0}, | |
3218 | 'j' => Offsets{ ch: 35, table: 0}, | |
3219 | 'k' => Offsets{ ch: 36, table: 0}, | |
3220 | 'l' => Offsets{ ch: 37, table: 0}, | |
3221 | 'm' => Offsets{ ch: 38, table: 0}, | |
3222 | 'n' => Offsets{ ch: 39, table: 0}, | |
3223 | 'o' => Offsets{ ch: 40, table: 0}, | |
3224 | 'p' => Offsets{ ch: 41, table: 0}, | |
3225 | 'q' => Offsets{ ch: 42, table: 0}, | |
3226 | 'r' => Offsets{ ch: 43, table: 0}, | |
3227 | 's' => Offsets{ ch: 44, table: 0}, | |
3228 | 't' => Offsets{ ch: 45, table: 0}, | |
3229 | 'u' => Offsets{ ch: 46, table: 0}, | |
3230 | 'v' => Offsets{ ch: 47, table: 0}, | |
3231 | 'w' => Offsets{ ch: 48, table: 0}, | |
3232 | 'x' => Offsets{ ch: 49, table: 0}, | |
3233 | 'y' => Offsets{ ch: 50, table: 0}, | |
3234 | 'z' => Offsets{ ch: 51, table: 0}, | |
3235 | '0' => Offsets{ ch: 0, table: 1}, | |
3236 | '1' => Offsets{ ch: 1, table: 1}, | |
3237 | '2' => Offsets{ ch: 2, table: 1}, | |
3238 | '3' => Offsets{ ch: 3, table: 1}, | |
3239 | '4' => Offsets{ ch: 4, table: 1}, | |
3240 | '5' => Offsets{ ch: 5, table: 1}, | |
3241 | '6' => Offsets{ ch: 6, table: 1}, | |
3242 | '7' => Offsets{ ch: 7, table: 1}, | |
3243 | '8' => Offsets{ ch: 8, table: 1}, | |
3244 | '9' => Offsets{ ch: 9, table: 1}, | |
3245 | 'Α' => Offsets{ ch: 0, table: 2}, | |
3246 | 'Β' => Offsets{ ch: 1, table: 2}, | |
3247 | 'Γ' => Offsets{ ch: 2, table: 2}, | |
3248 | 'Δ' => Offsets{ ch: 3, table: 2}, | |
3249 | 'Ε' => Offsets{ ch: 4, table: 2}, | |
3250 | 'Ζ' => Offsets{ ch: 5, table: 2}, | |
3251 | 'Η' => Offsets{ ch: 6, table: 2}, | |
3252 | 'Θ' => Offsets{ ch: 7, table: 2}, | |
3253 | 'Ι' => Offsets{ ch: 8, table: 2}, | |
3254 | 'Κ' => Offsets{ ch: 9, table: 2}, | |
3255 | 'Λ' => Offsets{ ch: 10, table: 2}, | |
3256 | 'Μ' => Offsets{ ch: 11, table: 2}, | |
3257 | 'Ν' => Offsets{ ch: 12, table: 2}, | |
3258 | 'Ξ' => Offsets{ ch: 13, table: 2}, | |
3259 | 'Ο' => Offsets{ ch: 14, table: 2}, | |
3260 | 'Π' => Offsets{ ch: 15, table: 2}, | |
3261 | 'Ρ' => Offsets{ ch: 16, table: 2}, | |
3262 | 'ϴ' => Offsets{ ch: 17, table: 2}, | |
3263 | 'Σ' => Offsets{ ch: 18, table: 2}, | |
3264 | 'Τ' => Offsets{ ch: 19, table: 2}, | |
3265 | 'Υ' => Offsets{ ch: 20, table: 2}, | |
3266 | 'Φ' => Offsets{ ch: 21, table: 2}, | |
3267 | 'Χ' => Offsets{ ch: 22, table: 2}, | |
3268 | 'Ψ' => Offsets{ ch: 23, table: 2}, | |
3269 | 'Ω' => Offsets{ ch: 24, table: 2}, | |
3270 | '∇' => Offsets{ ch: 25, table: 2}, | |
3271 | 'α' => Offsets{ ch: 26, table: 2}, | |
3272 | 'β' => Offsets{ ch: 27, table: 2}, | |
3273 | 'γ' => Offsets{ ch: 28, table: 2}, | |
3274 | 'δ' => Offsets{ ch: 29, table: 2}, | |
3275 | 'ε' => Offsets{ ch: 30, table: 2}, | |
3276 | 'ζ' => Offsets{ ch: 31, table: 2}, | |
3277 | 'η' => Offsets{ ch: 32, table: 2}, | |
3278 | 'θ' => Offsets{ ch: 33, table: 2}, | |
3279 | 'ι' => Offsets{ ch: 34, table: 2}, | |
3280 | 'κ' => Offsets{ ch: 35, table: 2}, | |
3281 | 'λ' => Offsets{ ch: 36, table: 2}, | |
3282 | 'μ' => Offsets{ ch: 37, table: 2}, | |
3283 | 'ν' => Offsets{ ch: 38, table: 2}, | |
3284 | 'ξ' => Offsets{ ch: 39, table: 2}, | |
3285 | 'ο' => Offsets{ ch: 40, table: 2}, | |
3286 | 'π' => Offsets{ ch: 41, table: 2}, | |
3287 | 'ρ' => Offsets{ ch: 42, table: 2}, | |
3288 | 'ς' => Offsets{ ch: 43, table: 2}, | |
3289 | 'σ' => Offsets{ ch: 44, table: 2}, | |
3290 | 'τ' => Offsets{ ch: 45, table: 2}, | |
3291 | 'υ' => Offsets{ ch: 46, table: 2}, | |
3292 | 'φ' => Offsets{ ch: 47, table: 2}, | |
3293 | 'χ' => Offsets{ ch: 48, table: 2}, | |
3294 | 'ψ' => Offsets{ ch: 49, table: 2}, | |
3295 | 'ω' => Offsets{ ch: 50, table: 2}, | |
3296 | '∂' => Offsets{ ch: 51, table: 2}, | |
3297 | 'ϵ' => Offsets{ ch: 52, table: 2}, | |
3298 | 'ϑ' => Offsets{ ch: 53, table: 2}, | |
3299 | 'ϰ' => Offsets{ ch: 54, table: 2}, | |
3300 | 'ϕ' => Offsets{ ch: 55, table: 2}, | |
3301 | 'ϱ' => Offsets{ ch: 56, table: 2}, | |
3302 | 'ϖ' => Offsets{ ch: 57, table: 2}, | |
3303 | }; | |
3304 | 197 | let mut new_text = String::new(); |
3305 | 321 | for ch in |
3306 | 321 | new_text.push( |
3307 | 321 | match SHIFT_AMOUNTS.get(&ch) { |
3308 | None => { | |
3309 | // there are two digamma chars only in the bold mapping. Handled here | |
3310 | 71 | if char_mapping[2] == 0x1D6A8 { |
3311 | 43 | match ch { |
3312 | 1 | 'Ϝ' => '𝟊', |
3313 | 1 | 'ϝ' => '𝟋', |
3314 | 41 | _ => ch, |
3315 | } | |
3316 | } else { | |
3317 | 28 | ch |
3318 | } | |
3319 | }, | |
3320 | 250 | Some(offsets) => { |
3321 | 250 | let start_of_mapping = char_mapping[offsets.table]; |
3322 | 250 | if start_of_mapping == 0 { |
3323 | } | |
3324 | } | |
3325 | ) | |
3326 | } | |
3327 | 197 | return new_text; |
3328 | ||
3329 | 213 | fn shift_char(ch: u32) -> char { |
3330 | // there are "holes" in the math alphanumerics due to legacy issues | |
3331 | // this table maps the holes to their legacy location | |
3332 | static EXCEPTIONS: phf::Map<u32, u32> = phf_map! { | |
3333 | 0x1D455u32 => 0x210Eu32, | |
3334 | 0x1D49Du32 => 0x212Cu32, | |
3335 | 0x1D4A0u32 => 0x2130u32, | |
3336 | 0x1D4A1u32 => 0x2131u32, | |
3337 | 0x1D4A3u32 => 0x210Bu32, | |
3338 | 0x1D4A4u32 => 0x2110u32, | |
3339 | 0x1D4A7u32 => 0x2112u32, | |
3340 | 0x1D4A8u32 => 0x2133u32, | |
3341 | 0x1D4ADu32 => 0x211Bu32, | |
3342 | 0x1D4BAu32 => 0x212Fu32, | |
3343 | 0x1D4BCu32 => 0x210Au32, | |
3344 | 0x1D4C4u32 => 0x2134u32, | |
3345 | 0x1D506u32 => 0x212Du32, | |
3346 | 0x1D50Bu32 => 0x210Cu32, | |
3347 | 0x1D50Cu32 => 0x2111u32, | |
3348 | 0x1D515u32 => 0x211Cu32, | |
3349 | 0x1D51Du32 => 0x2128u32, | |
3350 | 0x1D53Au32 => 0x2102u32, | |
3351 | 0x1D53Fu32 => 0x210Du32, | |
3352 | 0x1D545u32 => 0x2115u32, | |
3353 | 0x1D547u32 => 0x2119u32, | |
3354 | 0x1D548u32 => 0x211Au32, | |
3355 | 0x1D549u32 => 0x211Du32, | |
3356 | 0x1D551u32 => 0x2124u32, | |
3357 | }; | |
3358 | ||
3359 | 213 | return unsafe { char::from_u32_unchecked( // safe because the values are a char or from the table above |
3360 | 213 | match EXCEPTIONS.get(&ch) { |
3361 | 161 | None => ch, |
3362 | 52 | Some(exception_value) => *exception_value, |
3363 | } | |
3364 | ) } | |
3365 | 213 | } |
3366 | 197 | } |
3367 | 42.1k | } |
3368 | ||
3369 | 41.5k | fn math_alphanumeric_to_ascii(input: &str) -> Option<String> { |
3370 | 41.5k | let mut result = String::with_capacity(input.len()); |
3371 | ||
3372 | 46.6k | for c in |
3373 | 46.6k | let |
3374 | // Standard ASCII | |
3375 | 18.4k | 'a'..='z' | |
3376 | ||
3377 | // Mathematical Bold (A-Z: U+1D400, a-z: U+1D41A) | |
3378 | 482 | '\u{1D400}'..='\u{1D419}' => |
3379 | 460 | '\u{1D41A}'..='\u{1D433}' => |
3380 | ||
3381 | // Mathematical Italic (A-Z: U+1D434, a-z: U+1D44E) | |
3382 | // Note: 'h' is missing from this range (U+210E) | |
3383 | 424 | '\u{1D434}'..='\u{1D44D}' => |
3384 | 414 | '\u{1D44E}'..='\u{1D467}' => |
3385 | ||
3386 | // Mathematical Bold Italic (A-Z: U+1D468, a-z: U+1D482) | |
3387 | 400 | '\u{1D468}'..='\u{1D481}' => |
3388 | 400 | '\u{1D482}'..='\u{1D49B}' => |
3389 | ||
3390 | // Mathematical Sans-Serif (A-Z: U+1D5A0, a-z: U+1D5BA) | |
3391 | 274 | '\u{1D5A0}'..='\u{1D5B9}' => |
3392 | 264 | '\u{1D5BA}'..='\u{1D5D3}' => |
3393 | ||
3394 | // If a character isn't a letter (or supported math letter), return None | |
3395 | 28.6k | _ => return None, |
3396 | }; | |
3397 | 18.0k | result.push(converted); |
3398 | } | |
3399 | ||
3400 | 12.8k | Some(result) |
3401 | 41.5k | } |
3402 | ||
3403 | 14.6k | fn canonicalize_mo_text(&self, mo: Element) { |
3404 | // lazy_static! { (NOTE: std::sync::LazyLock is now used instead) | |
3405 | // static ref IS_LIKELY_SCALAR_VARIABLE: Regex = Regex::new("[a-eh-z]").unwrap(); | |
3406 | // } | |
3407 | ||
3408 | 14.6k | let mut mo_text = as_text(mo); |
3409 | 14.6k | let parent = get_parent(mo); |
3410 | 14.6k | let parent_name = name(parent); |
3411 | 14.6k | let is_base = mo.preceding_siblings().is_empty(); |
3412 | 14.6k | if !is_base && ( |
3413 | // canonicalize various diacritics for munder, mover, munderover | |
3414 | 309 | mo_text = match mo_text { |
3415 | 309 | "_" | |
3416 | 302 | "\u{2010}" | "\u{2011}" | "\u{2012}" | "\u{2013}" | "\u{2014}" | "\u{2015}" | |
3417 | 292 | "\u{02BC}" => |
3418 | 292 | "\u{02DC}" | "\u{223C}" => |
3419 | 292 | "\u{02C6}"| "\u{0302}" => |
3420 | 292 | "\u{0307}" => |
3421 | 292 | "\u{0308}" => |
3422 | 292 | _ => mo_text, |
3423 | } | |
3424 | // FIX: MathType generates the wrong version of union and intersection ops (binary instead of unary) | |
3425 | 14.3k | } else if !is_base && ( |
3426 | 227 | mo_text = match mo_text { |
3427 | 227 | "\u{00BA}"| "\u{2092}"| "\u{20D8}"| "\u{2218}" | |
3428 | 223 | _ => mo_text, |
3429 | }; | |
3430 | } else { | |
3431 | 14.0k | mo_text = match mo_text { |
3432 | 14.0k | "\u{02C9}"| "\u{0304}"| "\u{0305}" => |
3433 | 14.0k | "\u{02DC}" | "~" => |
3434 | 14.0k | "\u{01C1}" => |
3435 | ||
3436 | 14.0k | _ => mo_text, |
3437 | }; | |
3438 | }; | |
3439 | 14.6k | if mo_text == "\u{2212}" { |
3440 | 314 | mo_text = "-"; |
3441 | 14.2k | } |
3442 | 14.6k | mo.set_text(mo_text); |
3443 | 14.6k | } |
3444 | ||
3445 | ||
3446 | // Find the operator associated with the 'mo_node' | |
3447 | // This is complicated by potentially needing to distinguish between the | |
3448 | // prefix, infix, or postfix version of the operator. | |
3449 | // To figure out prefix, we need to look at the node on the left; for postfix, we need to look to the left | |
3450 | // If the node of the left has been parsed, then this works. | |
3451 | // For example, suppose we want to determine if the "+" in 'x < n!+1' is prefix or infix. | |
3452 | // If we simply looked left without parsing, we'd see an operator and choose prefix unless we could figure out that | |
3453 | // that "!" was postfix. But if it had been parsed, we'd see an mrow (operand) and tree "+" as infix (as it should). | |
3454 | // The same problem applies on the right for postfix operators, but a problem is rare for those | |
3455 | // e.g., n!!n -- ((n!)!)*n or (n!)*(!n) -- the latter doesn't make semantic sense though | |
3456 | // FIX: the above ignores mspace and other nodes that need to be skipped to determine the right node to determine airity | |
3457 | // FIX: the postfix problem above should be addressed | |
3458 | 19.4k | fn find_operator<'a>(context: Option<&CanonicalizeContext>, mo_node: Element<'a>, previous_operator: Option<&'static OperatorInfo>, |
3459 | 19.4k | previous_node: Option<Element<'a>>, next_node: Option<Element<'a>>) -> &'static OperatorInfo { |
3460 | // get the unicode value and return the OpKeyword associated with it | |
3461 | 19.4k | assert!( name(mo_node) == "mo"); |
3462 | ||
3463 | // if a form has been given, that takes precedence | |
3464 | 19.4k | let form = mo_node.attribute_value("form"); |
3465 | 19.4k | let op_type = match form { |
3466 | 19.4k | None => match context { |
3467 | 5.50k | None => OperatorTypes::POSTFIX, // what compute_type_from_position returns when the other args to this are all None |
3468 | 13.9k | Some(context) => compute_type_from_position(context, previous_operator, previous_node, next_node), |
3469 | }, | |
3470 | 10 | Some(form) => match form.to_lowercase().as_str() { |
3471 | 10 | "prefix" => |
3472 | 6 | "postfix" => |
3473 | 4 | _ => OperatorTypes::INFIX, |
3474 | } | |
3475 | }; | |
3476 | ||
3477 | 19.4k | let found_op_info = if mo_node.attribute_value(CHEMICAL_BOND).is_some() { |
3478 | 112 | Some(&IMPLIED_CHEMICAL_BOND) |
3479 | } else { | |
3480 | 19.3k | OPERATORS.get(as_text(mo_node)) |
3481 | }; | |
3482 | 19.4k | if found_op_info.is_none() { |
3483 | // no known operator -- return the unknown operator with the correct "fix" type | |
3484 | 49 | return op_not_in_operator_dictionary(op_type); |
3485 | 19.4k | } |
3486 | ||
3487 | 19.4k | let found_op_info = found_op_info.unwrap(); |
3488 | 19.4k | let matching_op_info = find_operator_info(found_op_info, op_type, form.is_some()); |
3489 | 19.4k | if ptr_eq(matching_op_info, &ILLEGAL_OPERATOR_INFO) { |
3490 | 0 | return op_not_in_operator_dictionary(op_type); |
3491 | } else { | |
3492 | 19.4k | return matching_op_info; |
3493 | } | |
3494 | ||
3495 | ||
3496 | 13.9k | fn compute_type_from_position<'a>(context: &CanonicalizeContext, previous_operator: Option<&'static OperatorInfo>, previous_node: Option<Element<'a>>, next_node: Option<Element<'a>>) -> OperatorTypes { |
3497 | // based on choices, pick one that fits the context | |
3498 | // if there isn't an obvious one, we have parsed the left, but not the right, so discount that | |
3499 | ||
3500 | // Trig functions have some special syntax | |
3501 | // We need to treat '-' as prefix for things like "sin -2x" | |
3502 | // Need to be careful because (sin - cos)(x) needs an infix '-' | |
3503 | // Return either the prefix or infix version of the operator | |
3504 | 13.9k | if next_node.is_some() && |
3505 | 11.9k | context.is_function_name(get_possible_embellished_node(next_node.unwrap()), None) == FunctionNameCertainty::True { |
3506 | 260 | return OperatorTypes::INFIX; |
3507 | 13.6k | } |
3508 | 13.6k | if previous_node.is_some() && |
3509 | 11.1k | context.is_function_name(get_possible_embellished_node(previous_node.unwrap()), None) == FunctionNameCertainty::True { |
3510 | 207 | return OperatorTypes::PREFIX; |
3511 | 13.4k | } |
3512 | ||
3513 | // after that special case, start with the obvious cases... | |
3514 | 13.4k | let operand_on_left = previous_operator.is_none() || |
3515 | 13.4k | let operand_on_right = next_node.is_some() && |
3516 | ||
3517 | 13.4k | if operand_on_left && |
3518 | 8.19k | return OperatorTypes::INFIX; // infix |
3519 | 5.29k | } else if !operand_on_left && |
3520 | 2.75k | return OperatorTypes::PREFIX; // prefix |
3521 | 2.54k | } else if operand_on_left && |
3522 | 2.34k | return OperatorTypes::POSTFIX; // postfix |
3523 | } else { | |
3524 | // either two operators in a row or right hand side not parsed so we don't really know what is right (same is true above) | |
3525 | // since there is nothing good to return, assume right is an operand after parsing (thus infix case) | |
3526 | 196 | return OperatorTypes::INFIX; |
3527 | } | |
3528 | 13.9k | } |
3529 | ||
3530 | 19.4k | fn find_operator_info(op_info: &OperatorInfo, op_type: OperatorTypes, from_form_attr: bool) -> &OperatorInfo { |
3531 | 19.4k | if op_info.is_operator_type(op_type) { |
3532 | 12.9k | return op_info; |
3533 | 6.45k | } else if let Some( |
3534 | 1.64k | if next_op_info.is_operator_type(op_type) { |
3535 | 730 | return next_op_info; |
3536 | 915 | } else if let Some( |
3537 | 256 | && last_op_info.is_operator_type(op_type) { |
3538 | 256 | return last_op_info; |
3539 | 659 | } |
3540 | 4.81k | } |
3541 | ||
3542 | // didn't find op_info that matches -- if type is not forced, then return first value (any is probably ok) | |
3543 | 5.47k | return if from_form_attr { |
3544 | 19.4k | } |
3545 | ||
3546 | 49 | fn op_not_in_operator_dictionary(op_type: OperatorTypes) -> &'static OperatorInfo { |
3547 | 49 | return match op_type { |
3548 | 16 | OperatorTypes::PREFIX => &DEFAULT_OPERATOR_INFO_PREFIX, |
3549 | 9 | OperatorTypes::POSTFIX => &DEFAULT_OPERATOR_INFO_POSTFIX, |
3550 | 24 | _ => &DEFAULT_OPERATOR_INFO_INFIX, // should only be infix |
3551 | }; | |
3552 | 49 | } |
3553 | 19.4k | } |
3554 | ||
3555 | 13.9k | fn n_vertical_bars_on_right(&self, remaining_children: &[ChildOfElement], vert_bar_ch: &str) -> usize { |
3556 | // return the number of children that match 'vert_bar_op' not counting the first element | |
3557 | 13.9k | let mut n = 0; |
3558 | 149k | for child_of_element in |
3559 | 149k | let child = as_element(*child_of_element); |
3560 | 149k | if name(child) == "mo" { |
3561 | 49.9k | let operator_str = as_text(child); |
3562 | 49.9k | if operator_str == vert_bar_ch { |
3563 | 42.7k | n += 1; |
3564 | 42.7k |
|
3565 | 99.9k | } |
3566 | } | |
3567 | 13.9k | return n; |
3568 | 13.9k | } |
3569 | ||
3570 | ||
3571 | 13.9k | fn determine_vertical_bar_op<'a>(&self, original_op: &'static OperatorInfo, mo_node: Element<'a>, |
3572 | 13.9k | next_child: Option<Element<'a>>, |
3573 | 13.9k | parse_stack: &'a mut Vec<StackInfo>, |
3574 | 13.9k | n_vertical_bars_on_right: usize) -> &'static OperatorInfo { |
3575 | // if in a prefix location, it is a left fence | |
3576 | // note: if there is an operator on the top of the stack, it wants an operand (otherwise it would have been reduced) | |
3577 | 13.9k | let operator_str = as_text(mo_node); |
3578 | 13.9k | let found_op_info = OPERATORS.get(operator_str); |
3579 | 13.9k | if found_op_info.is_none() { |
3580 | 48 | return original_op; |
3581 | 13.8k | } |
3582 | 13.8k | let op = found_op_info.unwrap(); |
3583 | 13.8k | if !AMBIGUOUS_OPERATORS.contains(operator_str) { |
3584 | // debug!(" op is not ambiguous"); | |
3585 | 13.4k | return original_op; |
3586 | 401 | }; |
3587 | ||
3588 | 401 | let operator_versions = OperatorVersions::new(op); |
3589 | 401 | if let Some( |
3590 | 360 | (top(parse_stack).last_child_in_mrow().is_none() || |
3591 | // debug!(" is prefix"); | |
3592 | 115 | return prefix; |
3593 | 286 | } |
3594 | ||
3595 | // We have either a right fence or an infix operand at the top of the stack | |
3596 | // If this is already parsed, we'd look to the right to see if there is an operand after this child. | |
3597 | // But it isn't parsed and there might be a prefix operator which will eventually become an operand, so it is tricky. | |
3598 | // It is even trickier because we might have an implicit times, so we can't really tell | |
3599 | // For example: |x|y|z| which can be '|x| y |z|' or '|x |y| z|', or even | (x|y)|z |' | |
3600 | // We can't really know what is intended (without @intent). | |
3601 | // It seems like the case where it could be paired with a matching vertical bar as what most people would choose, so we favor that. | |
3602 | ||
3603 | // If there is a matching open vertical bar, it is either at the top of the stack or the entry just below the top | |
3604 | ||
3605 | 286 | let has_left_match = if let Some( |
3606 | 245 | if ptr_eq(top(parse_stack).op_pair.op, op_prefix) { // match at top of stack? (empty matching bars) |
3607 | 109 | true |
3608 | 136 | } else if parse_stack.len() > 2 { |
3609 | // matching op is below top (operand between matching bars) -- pop, peek, push | |
3610 | 36 | let old_top = parse_stack.pop().unwrap(); |
3611 | 36 | let top_op = top(parse_stack).op_pair.op; // can only access top, so we need to pop off top and push back later |
3612 | 36 | parse_stack.push(old_top); |
3613 | 36 | ptr_eq(top_op, op_prefix) |
3614 | } else { | |
3615 | 100 | false |
3616 | } | |
3617 | } else { | |
3618 | 41 | false |
3619 | }; | |
3620 | 286 | if let Some( |
3621 | // last child in row (must be a close) or we have a left match | |
3622 | // debug!(" is postfix"); | |
3623 | 136 | return postfix; |
3624 | 150 | } else if next_child.is_none() { |
3625 | // operand on left, so prefer infix version | |
3626 | 18 | return if let Some(infix) = operator_versions.infix {infix} else { |
3627 | 132 | } |
3628 | ||
3629 | 132 | let next_child = next_child.unwrap(); |
3630 | 132 | if let Some( |
3631 | // (" is prefix"); | |
3632 | 3 | return prefix; // odd number of vertical bars remain, so consider this the start of a pair |
3633 | 129 | } |
3634 | ||
3635 | 129 | let next_child = get_possible_embellished_node(next_child); |
3636 | 129 | let next_child_op = if name(next_child) != "mo" { |
3637 | 128 | None |
3638 | } else { | |
3639 | 1 | let next_next_children = next_child.following_siblings(); |
3640 | 1 | let next_next_child = if next_next_children.is_empty() { |
3641 | 1 | Some( CanonicalizeContext::find_operator(Some(self), next_child, operator_versions.infix, |
3642 | 1 | top(parse_stack).last_child_in_mrow(), next_next_child) ) |
3643 | }; | |
3644 | ||
3645 | // If the next child is a prefix op or a left fence, it will reduce to an operand, so don't consider it an operator | |
3646 | 129 | if next_child_op.is_some() && |
3647 | 0 | if let Some(postfix) =operator_versions.postfix { |
3648 | // debug!(" is postfix"); | |
3649 | 0 | return postfix; |
3650 | 0 | } |
3651 | 129 | } else if let Some(infix) = operator_versions.infix { |
3652 | // debug!(" is infix"); | |
3653 | 129 | return infix; |
3654 | 0 | } |
3655 | ||
3656 | // nothing good to match | |
3657 | 0 | return op; |
3658 | 13.9k | } |
3659 | ||
3660 | ||
3661 | // return FunctionNameCertainty::False or Maybe if 'node' is a chemical element and is followed by a state (solid, liquid, ...) | |
3662 | // in other words, we are certain this can't be a function since it looks like it is or might be chemistry | |
3663 | 1.71k | fn is_likely_chemical_state<'a>(&self, node: Element<'a>, right_sibling: Element<'a>) -> FunctionNameCertainty { |
3664 | 1.71k | assert_eq!(name(get_parent(node)), "mrow"); // should be here because we are parsing an mrow |
3665 | ||
3666 | // debug!(" in is_likely_chemical_state: '{}'?",element_summary(node)); | |
3667 | 1.71k | let node_chem_likelihood= node.attribute_value(MAYBE_CHEMISTRY); |
3668 | 1.71k | if node.attribute(MAYBE_CHEMISTRY).is_none() { |
3669 | 1.16k | return FunctionNameCertainty::True; |
3670 | 549 | } |
3671 | ||
3672 | 549 | if name(right_sibling) == "mrow" { // clean_chemistry_mrow made sure any state-like structure is an mrow |
3673 | 75 | let state_likelihood = likely_chem_state(right_sibling); |
3674 | 75 | if state_likelihood > 0 { |
3675 | 49 | right_sibling.set_attribute_value(MAYBE_CHEMISTRY, state_likelihood.to_string().as_str()); |
3676 | // at this point, we know both node and right_sibling are positive, so we have at least a maybe | |
3677 | 49 | if state_likelihood + node_chem_likelihood.unwrap().parse::<i32>().unwrap() > 2 { |
3678 | 49 | return FunctionNameCertainty::False; |
3679 | } else { | |
3680 | 0 | return FunctionNameCertainty::Maybe |
3681 | } | |
3682 | 26 | } |
3683 | 474 | } |
3684 | ||
3685 | 500 | return FunctionNameCertainty::True; |
3686 | 1.71k | } |
3687 | ||
3688 | // Try to figure out whether an <mi> is a function name or not. | |
3689 | // There are two important cases depending upon whether parens/brackets are used or not. | |
3690 | // E.g, sin x and f(x) | |
3691 | // 1. If parens follow the name, then we use a more inclusive set of heuristics as it is more likely a function | |
3692 | // The heuristics used are: | |
3693 | // - it is on the list of known function names (e.g., sin" and "log") | |
3694 | // - it is on the list of likely function names (e.g, f, g, h) | |
3695 | // - multi-char names that begin with a capital letter (e.g, "Tr") | |
3696 | // - there is a single token inside the parens (why else would someone use parens), any name (e.g, a(x)) | |
3697 | // - if there are multiple comma-separated args | |
3698 | // | |
3699 | // 2. If there are no parens, then only names on the known function list are used (e.g., "sin x") | |
3700 | // | |
3701 | // If the name if followed by parens but doesn't fit into the above categories, we return a "maybe" | |
3702 | 32.0k | fn is_function_name<'a>(&self, node: Element<'a>, right_siblings: Option<&[ChildOfElement<'a>]>) -> FunctionNameCertainty { |
3703 | 32.0k | let base_of_name = get_possible_embellished_node(node); |
3704 | ||
3705 | // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables | |
3706 | // FIX: need to allow for composition of function names. E.g, (f+g)(x) and (f^2/g)'(x) | |
3707 | 32.0k | let node_name = name(base_of_name); |
3708 | 32.0k | if node_name != "mi" && |
3709 | 15.4k | return FunctionNameCertainty::False; |
3710 | 16.6k | } |
3711 | // whitespace is sometimes added to the mi since braille needs it, so do a trim here to get function name | |
3712 | 16.6k | let base_name = as_text(base_of_name).trim(); |
3713 | 16.6k | if base_name.is_empty() { |
3714 | 2 | return FunctionNameCertainty::False; |
3715 | 16.6k | } |
3716 | // debug!(" is_function_name({}), {} following nodes", base_name, if right_siblings.is_none() {"No".to_string()} else {right_siblings.unwrap().len().to_string()}); | |
3717 | 16.6k | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
3718 | // names that are always function names (e.g, "sin" and "log") | |
3719 | 16.6k | let defs = defs.borrow(); |
3720 | 16.6k | let names = defs.get_hashset("FunctionNames").unwrap(); |
3721 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
3722 | 16.6k | if names.contains(&base_name.to_ascii_lowercase()) { |
3723 | // debug!(" ...is in FunctionNames"); | |
3724 | 1.02k | return FunctionNameCertainty::True; // always treated as function names |
3725 | 15.5k | } |
3726 | ||
3727 | // We include shapes as function names so that △ABC makes sense since △ and | |
3728 | // the other shapes are not in the operator dictionary | |
3729 | 15.5k | let shapes = defs.get_hashset("GeometryShapes").unwrap(); |
3730 | 15.5k | if shapes.contains(base_name) { |
3731 | 23 | return FunctionNameCertainty::True; // always treated as function names |
3732 | 15.5k | } |
3733 | ||
3734 | 15.5k | if right_siblings.is_none() { |
3735 | 13.8k | return FunctionNameCertainty::False; // only accept known names, which is tested above |
3736 | 1.71k | } |
3737 | ||
3738 | // make sure that what follows starts and ends with parens/brackets | |
3739 | 1.71k | assert_eq!(name(get_parent(node)), "mrow"); |
3740 | 1.71k | let right_siblings = right_siblings.unwrap(); |
3741 | 1.71k | let non_whitespace = right_siblings.iter().enumerate() |
3742 | 1.71k | .find(|&(_, child)| { |
3743 | 1.71k | let child = as_element(*child); |
3744 | 1.71k | name(child) != "mtext" || |
3745 | 1.71k | }); |
3746 | 1.71k | let right_siblings = if let Some( (i, _) ) = non_whitespace {&right_siblings[i..]} else { |
3747 | 1.71k | if right_siblings.is_empty() { |
3748 | // debug!(" ...right siblings not None, but zero of them"); | |
3749 | 0 | return FunctionNameCertainty::False; |
3750 | 1.71k | } |
3751 | ||
3752 | 1.71k | let first_child = as_element(right_siblings[0]); |
3753 | ||
3754 | // clean_chemistry wrapped up a state in an mrow and this is assumed by is_likely_chemical_state() | |
3755 | 1.71k | let chem_state_certainty = self.is_likely_chemical_state(node, first_child); |
3756 | 1.71k | if chem_state_certainty != FunctionNameCertainty::True { |
3757 | // debug!(" ...is_likely_chemical_state says it is a function ={:?}", chem_state_certainty); | |
3758 | 49 | return chem_state_certainty; |
3759 | 1.66k | } |
3760 | ||
3761 | 1.66k | if name(first_child) == "mrow" && |
3762 | // debug!(" ...trying again after expanding mrow"); | |
3763 | 235 | return self.is_function_name(node, Some(&first_child.children())); |
3764 | 1.43k | } |
3765 | ||
3766 | 1.43k | if right_siblings.len() < 2 { |
3767 | // debug!(" ...not enough right siblings"); | |
3768 | 542 | return FunctionNameCertainty::False; // can't be (...) |
3769 | 892 | } |
3770 | ||
3771 | // at least two siblings are this point -- check that they are parens/brackets | |
3772 | // we can only check the open paren/bracket because the right side is unparsed and we don't know the close location | |
3773 | 892 | let first_sibling = as_element(right_siblings[0]); |
3774 | 892 | if name(first_sibling) != "mo" || |
3775 | { | |
3776 | // debug!(" ...first sibling is not '(' or '['"); | |
3777 | 522 | return FunctionNameCertainty::False; |
3778 | 370 | } |
3779 | ||
3780 | 370 | let likely_names = defs.get_hashset("LikelyFunctionNames").unwrap(); |
3781 | 370 | if likely_names.contains(base_name) { |
3782 | 206 | return FunctionNameCertainty::True; // don't bother checking contents of parens, consider these as function names |
3783 | 164 | } |
3784 | ||
3785 | 164 | if is_single_arg(as_text(first_sibling), &right_siblings[1..]) { |
3786 | // debug!(" ...is single arg"); | |
3787 | 64 | return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? |
3788 | 100 | }; |
3789 | ||
3790 | 100 | if is_comma_arg(as_text(first_sibling), &right_siblings[1..]) { |
3791 | // debug!(" ...is comma arg"); | |
3792 | 2 | return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? |
3793 | 98 | }; |
3794 | ||
3795 | // FIX: should really make sure all the args are marked as MAYBE_CHEMISTRY, but we don't know the matching close paren/bracket | |
3796 | 98 | if node.attribute(MAYBE_CHEMISTRY).is_some() && |
3797 | 34 | as_element(right_siblings[1]).attribute(MAYBE_CHEMISTRY).is_some() { |
3798 | 1 | return FunctionNameCertainty::False; |
3799 | 97 | } |
3800 | ||
3801 | // Names like "Tr" are likely function names, single letter names like "M" or "J" are iffy | |
3802 | // This needs to be after the chemical state check above to rule out Cl(g), etc | |
3803 | // This would be better if it were part of 'likely_names' as "[A-Za-z]+", but reg exprs don't work in HashSets. | |
3804 | // FIX: create our own struct and write appropriate traits for it and then it could work | |
3805 | 97 | let mut chars = base_name.chars(); |
3806 | 97 | let first_char = chars.next().unwrap(); // we know there is at least one byte in it, hence one char |
3807 | 97 | if chars.next().is_some() && |
3808 | // debug!(" ...is uppercase name"); | |
3809 | 4 | return FunctionNameCertainty::True; |
3810 | 93 | } |
3811 | ||
3812 | // debug!(" ...didn't match options to be a function"); | |
3813 | // debug!("Right siblings:\n{} ", right_siblings.iter().map(|&child| mml_to_string(as_element(child))).collect::<Vec<String>>().join("\n ")); | |
3814 | 93 | return if is_name_inside_parens(base_name, right_siblings) { |
3815 | 16.6k | }); |
3816 | ||
3817 | 164 | fn is_single_arg(open: &str, following_nodes: &[ChildOfElement]) -> bool { |
3818 | // following_nodes are nodes after "(" | |
3819 | 164 | if following_nodes.is_empty() { |
3820 | 0 | return true; // "a(" might or might not be a function call -- treat as "is" because we can't see more |
3821 | 164 | } |
3822 | ||
3823 | 164 | let first_child = as_element(following_nodes[0]); |
3824 | 164 | if is_matching_right_paren(open, first_child) { |
3825 | 0 | return true; // no-arg case "a()" |
3826 | 164 | } |
3827 | ||
3828 | // could be really picky and restrict to checking for only mi/mn | |
3829 | // that might make more sense in stranger cases, but mfrac, msqrt, etc., probably shouldn't have parens if times | |
3830 | 164 | return following_nodes.len() > 1 && |
3831 | 164 | name(first_child) != "mrow" && |
3832 | 127 | is_matching_right_paren(open, as_element(following_nodes[1])); |
3833 | 164 | } |
3834 | ||
3835 | 100 | fn is_comma_arg(open: &str, following_nodes: &[ChildOfElement]) -> bool { |
3836 | // following_nodes are nodes after "(" | |
3837 | 100 | if following_nodes.len() == 1 { |
3838 | 0 | return false; |
3839 | 100 | } |
3840 | ||
3841 | 100 | let first_child = as_element(following_nodes[1]); |
3842 | 100 | if name(first_child) == "mrow" { |
3843 | 0 | return is_comma_arg(open, &first_child.children()[..]); |
3844 | 100 | } |
3845 | ||
3846 | // FIX: this loop is very simplistic and could be improved to count parens, etc., to make sure "," is at top-level | |
3847 | 318 | for child in |
3848 | 318 | let child = as_element(*child); |
3849 | 318 | if name(child) == "mo" { |
3850 | 141 | if as_text(child) == "," { |
3851 | 2 | return true; |
3852 | 139 | } |
3853 | 139 | if is_matching_right_paren(open, child) { |
3854 | 96 | return false; |
3855 | 43 | } |
3856 | 177 | } |
3857 | } | |
3858 | ||
3859 | 2 | return false; |
3860 | 100 | } |
3861 | ||
3862 | 622 | fn is_left_paren(node: Element) -> bool { |
3863 | 622 | if name(node) != "mo" { |
3864 | 1 | return false; |
3865 | 621 | } |
3866 | 621 | let text = as_text(node); |
3867 | 621 | return text == "(" || |
3868 | 622 | } |
3869 | ||
3870 | 430 | fn is_matching_right_paren(open: &str, node: Element) -> bool { |
3871 | 430 | if name(node) != "mo" { |
3872 | 184 | return false; |
3873 | 246 | } |
3874 | 246 | let text = as_text(node); |
3875 | // debug!(" is_matching_right_paren: open={}, close={}", open, text); | |
3876 | 246 | return (open == "(" && |
3877 | 430 | } |
3878 | ||
3879 | /// Returns true if the name of the potential function is inside the parens. In that case, it is very unlikely to be a function call | |
3880 | /// For example, "n(n+1)" | |
3881 | 93 | fn is_name_inside_parens(function_name: &str, right_siblings: &[ChildOfElement]) -> bool { |
3882 | // the first child of right_siblings is either '(' or '[' | |
3883 | // right_siblings may extend well beyond the closing parens, so we first break this into finding the contents | |
3884 | // then we search the contents for the name | |
3885 | 93 | match find_contents(right_siblings) { |
3886 | 2 | None => return false, |
3887 | 91 | Some(contents) => return is_name_inside_contents(function_name, contents), |
3888 | } | |
3889 | ||
3890 | ||
3891 | 93 | fn find_contents<'a>(right_siblings: &'a[ChildOfElement<'a>]) -> Option<&'a[ChildOfElement<'a>]> { |
3892 | 93 | let open_text = as_text(as_element(right_siblings[0])); |
3893 | 93 | let close_text = if open_text == "(" { |
3894 | 93 | let mut nesting_level = 1; |
3895 | 93 | let mut i = 1; |
3896 | 296 | while i < right_siblings.len() { |
3897 | 294 | let child = as_element(right_siblings[i]); |
3898 | 294 | if name(child) == "mo" { |
3899 | 133 | let op_text = as_text(child); |
3900 | 133 | if op_text == open_text { |
3901 | 0 | nesting_level += 1; |
3902 | 133 | } else if op_text == close_text { |
3903 | 91 | if nesting_level == 1 { |
3904 | 91 | return Some(&right_siblings[1..i]); |
3905 | 0 | } |
3906 | 0 | nesting_level -= 1; |
3907 | 42 | } |
3908 | 161 | } |
3909 | 203 | i += 1; |
3910 | } | |
3911 | 2 | return None; // didn't find matching paren |
3912 | 93 | } |
3913 | ||
3914 | 134 | fn is_name_inside_contents(function_name: &str, contents: &[ChildOfElement]) -> bool { |
3915 | 304 | for &child in |
3916 | 304 | let child = as_element(child); |
3917 | // debug!("is_name_inside_contents: child={}", mml_to_string(child)); | |
3918 | 304 | if is_leaf(child) { |
3919 | 261 | let text = as_text(child); |
3920 | 261 | if (name(child) == "mi" || |
3921 | 5 | return true; |
3922 | 256 | } |
3923 | 43 | } else if is_name_inside_contents(function_name, &child.children()) { |
3924 | 4 | return true; |
3925 | 39 | } |
3926 | } | |
3927 | 125 | return false; |
3928 | 134 | } |
3929 | 93 | } |
3930 | 32.0k | } |
3931 | ||
3932 | 5.79k | fn is_mixed_fraction<'a>(&self, integer_part: Element<'a>, fraction_children: &[ChildOfElement<'a>]) -> Result<bool> { |
3933 | // do some simple disqualifying checks on the fraction part | |
3934 | 5.79k | if fraction_children.is_empty() { |
3935 | 0 | return Ok( false ); |
3936 | 5.79k | } |
3937 | 5.79k | let right_child = as_element(fraction_children[0]); |
3938 | 5.79k | let right_child_name = name(right_child); |
3939 | 5.79k | if ! (right_child_name == "mfrac" || |
3940 | 5.68k | (right_child_name == "mrow" && |
3941 | 5.48k | (right_child_name == "mn" && |
3942 | 5.46k | return Ok( false ); |
3943 | 329 | }; |
3944 | ||
3945 | 329 | if !is_integer_part_ok(integer_part) { |
3946 | 219 | return Ok( false ); |
3947 | 110 | } |
3948 | ||
3949 | 110 | if right_child_name == "mfrac" { |
3950 | 75 | return Ok( is_mfrac_ok(right_child) ); |
3951 | 35 | } |
3952 | ||
3953 | 35 | return is_linear_fraction(self, fraction_children); |
3954 | ||
3955 | ||
3956 | 351 | fn is_int(integer_part: Element) -> bool { |
3957 | 351 | return name(integer_part) == "mn" && |
3958 | 351 | } |
3959 | ||
3960 | 329 | fn is_integer_part_ok(integer_part: Element) -> bool { |
3961 | // integer part must be either 'n' or '-n' (in an mrow) | |
3962 | 329 | let integer_part_name = name(integer_part); |
3963 | 329 | if integer_part_name == "mrow" { |
3964 | 83 | let children = integer_part.children(); |
3965 | 83 | if children.len() == 2 && |
3966 | 16 | name(as_element(children[0])) == "mo" && |
3967 | 0 | as_text(as_element(children[0])) == "-" { |
3968 | 0 | let integer_part = as_element(children[1]); |
3969 | 0 | return is_int(integer_part); |
3970 | 83 | } |
3971 | 83 | return false; |
3972 | 246 | }; |
3973 | ||
3974 | 246 | return is_int(integer_part); |
3975 | 329 | } |
3976 | ||
3977 | 75 | fn is_mfrac_ok(fraction_part: Element) -> bool { |
3978 | // fraction_part needs to have integer numerator and denominator (already tested it is a frac) | |
3979 | 75 | let fraction_children = fraction_part.children(); |
3980 | 75 | if fraction_children.len() != 2 { |
3981 | 0 | return false; |
3982 | 75 | } |
3983 | 75 | let numerator = as_element(fraction_children[0]); |
3984 | 75 | if name(numerator) != "mn" || |
3985 | 8 | return false; |
3986 | 67 | } |
3987 | 67 | let denominator = as_element(fraction_children[1]); |
3988 | 67 | return is_int(denominator); |
3989 | 75 | } |
3990 | ||
3991 | 66 | fn is_linear_fraction(canonicalize: &CanonicalizeContext, fraction_children: &[ChildOfElement]) -> Result<bool> { |
3992 | // two possibilities | |
3993 | // 1. '3 / 4' is in an mrow | |
3994 | // 2. '3 / 4' are three separate elements | |
3995 | 66 | let first_child = as_element(fraction_children[0]); |
3996 | 66 | if name(first_child) == "mrow" { |
3997 | 31 | if first_child.children().len() != 3 { |
3998 | 0 | return Ok( false ); |
3999 | 31 | } |
4000 | 31 | return is_linear_fraction(canonicalize, &first_child.children()) |
4001 | 35 | } |
4002 | ||
4003 | ||
4004 | // the length has been checked | |
4005 | 35 | assert!(fraction_children.len() >= 3); |
4006 | ||
4007 | 35 | if !is_int(first_child) { |
4008 | 30 | return Ok( false ); |
4009 | 5 | } |
4010 | 5 | let slash_part = canonicalize.canonicalize_mrows(as_element(fraction_children[1])) |
4011 | 5 | if name(slash_part) == "mo" && as_text(slash_part) == "/" { |
4012 | 3 | let denom = canonicalize.canonicalize_mrows(as_element(fraction_children[2])) |
4013 | 3 | return Ok( is_int(denom) ); |
4014 | 2 | } |
4015 | 2 | return Ok( false ); |
4016 | 66 | } |
4017 | 5.79k | } |
4018 | ||
4019 | /// implied comma when two numbers are adjacent and are in a script position | |
4020 | 5.72k | fn is_implied_comma<'a>(&self, prev: Element<'a>, current: Element<'a>, mrow: Element<'a>) -> bool { |
4021 | 5.72k | if name(prev) != "mn" || |
4022 | 5.63k | return false; |
4023 | 95 | } |
4024 | ||
4025 | 95 | assert_eq!(name(mrow), "mrow"); |
4026 | 95 | let container = get_parent(mrow); |
4027 | 95 | let name = name(container); |
4028 | ||
4029 | // test for script position is that it is not the base and hence has a preceding sibling | |
4030 | 95 | return (name == "msub" || |
4031 | 5.72k | } |
4032 | ||
4033 | /// implied separator when two capital letters are adjacent or two chemical elements | |
4034 | 5.64k | fn is_implied_chemical_bond<'a>(&self, prev: Element<'a>, current: Element<'a>) -> bool { |
4035 | // debug!("is_implied_chemical_bond: previous: {:?}", prev.preceding_siblings()); | |
4036 | // debug!("is_implied_chemical_bond: following: {:?}", prev.following_siblings()); | |
4037 | 5.64k | if prev.attribute(MAYBE_CHEMISTRY).is_none() || |
4038 | 5.18k | return false; |
4039 | 462 | } |
4040 | // ABC example where B and C are chemical elements is why we need to scan further than just checking B and C | |
4041 | // look for an mi/mtext with @MAYBE_CHEMISTRY until we get to something that can't have it | |
4042 | 626 | for child in |
4043 | 626 | if !is_valid_chemistry(as_element(child)) { |
4044 | 11 | return false; |
4045 | 615 | } |
4046 | } | |
4047 | 851 | for child in |
4048 | 851 | if !is_valid_chemistry(as_element(child)) { |
4049 | 32 | return false; |
4050 | 819 | } |
4051 | } | |
4052 | 419 | return true; // sequence of all MAYBE_CHEMISTRY |
4053 | ||
4054 | 1.47k | fn is_valid_chemistry(child: Element) -> bool { |
4055 | 1.47k | let child = get_possible_embellished_node(child); |
4056 | 1.47k | return child.attribute(MAYBE_CHEMISTRY).is_some() || ( |
4057 | 1.47k | } |
4058 | 5.64k | } |
4059 | ||
4060 | /// implied separator when two capital letters are adjacent or two chemical elements | |
4061 | /// also for adjacent omission chars | |
4062 | 5.22k | fn is_implied_separator<'a>(&self, prev: Element<'a>, current: Element<'a>) -> bool { |
4063 | 5.22k | if name(prev) != "mi" || |
4064 | 4.83k | return false; |
4065 | 390 | } |
4066 | ||
4067 | // trim because whitespace might have gotten stuffed into the <mi>s | |
4068 | 390 | let prev_text = as_text(prev).trim(); |
4069 | 390 | let current_text = as_text(current).trim(); |
4070 | 390 | return prev_text.len() == 1 && |
4071 | 317 | ((is_cap(prev_text) && |
4072 | 151 | (prev_text=="_" && |
4073 | ||
4074 | ||
4075 | 491 | fn is_cap(str: &str) -> bool { |
4076 | 491 | assert_eq!(str.len(), 1); |
4077 | 491 | return str.chars().next().unwrap().is_ascii_uppercase(); |
4078 | 491 | } |
4079 | 5.22k | } |
4080 | ||
4081 | 42 | fn is_invisible_char_element(mathml: Element) -> bool { |
4082 | 42 | if !is_leaf(mathml) { |
4083 | 8 | return false |
4084 | 34 | } |
4085 | 34 | let text = as_text(mathml); |
4086 | 34 | if text.len() != 3 { // speed hack: invisible chars are three UTF-8 chars |
4087 | 28 | return false; |
4088 | 6 | } |
4089 | 6 | let ch = text.chars().next().unwrap(); |
4090 | 6 | return ('\u{2061}'..='\u{2064}').contains(&ch); |
4091 | 42 | } |
4092 | ||
4093 | // Add the current operator if it's not n-ary to the stack | |
4094 | // 'current_child' and it the operator to the stack. | |
4095 | 17.7k | fn shift_stack<'s, 'a:'s, 'op:'a>( |
4096 | 17.7k | &self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>, |
4097 | 17.7k | current_child: Element<'a>, |
4098 | 17.7k | current_op: OperatorPair<'op>) -> (Element<'a>, OperatorPair<'op>) { |
4099 | 17.7k | let mut new_current_child = current_child; |
4100 | 17.7k | let mut new_current_op = current_op.clone(); |
4101 | 17.7k | let previous_op = top(parse_stack).op_pair.clone(); |
4102 | // debug!(" shift_stack: mrow len={}", top(parse_stack).mrow.children().len().to_string()); | |
4103 | // debug!(" shift_stack: shift on '{}'; ops: prev '{}/{}', cur '{}/{}'", | |
4104 | // element_summary(current_child),show_invisible_op_char(previous_op.ch), previous_op.op.priority, | |
4105 | // show_invisible_op_char(current_op.ch), current_op.op.priority); | |
4106 | 17.7k | if !current_op.op.is_nary(previous_op.op) { |
4107 | // grab operand on top of stack (if there is one) and make it part of the new mrow since current op has higher precedence | |
4108 | // if operators are the same and are binary, then this push makes them act as left associative | |
4109 | 13.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4110 | 13.0k | if top_of_stack.mrow.children().is_empty() || ( |
4111 | 138 | // "bad" syntax - no operand on left -- don't grab operand (there is none) |
4112 | 138 | // just start a new mrow beginning with operator |
4113 | 138 | // FIX -- check this shouldn't happen: parse_stack.push(top_of_stack); |
4114 | 138 | parse_stack.push( top_of_stack ); // put top back on |
4115 | 138 | parse_stack.push( StackInfo::new(current_child.document()) ); |
4116 | 12.8k | } else if current_op.op.is_right_fence() { |
4117 | // likely, but not necessarily, there is a left fence to start the mrow | |
4118 | // this is like the postfix case except we grab the entire mrow, push on the close, and make that the mrow | |
4119 | // note: the code does these operations on the stack for consistency, but it could be optimized without push/popping the stack | |
4120 | 1.96k | let mrow = top_of_stack.mrow; |
4121 | 1.96k | top_of_stack.add_child_to_mrow(current_child, current_op); |
4122 | // debug!("shift_stack: after adding right fence to mrow:\n{}", mml_to_string(mrow)); | |
4123 | 1.96k | new_current_op = OperatorPair::new(); // treat matched brackets as operand |
4124 | 1.96k | new_current_child = mrow; |
4125 | 1.96k | let children = mrow.children(); |
4126 | 1.96k | let base_of_first_child = get_possible_embellished_node(as_element(children[0])); |
4127 | // debug!("looking for left fence: len={}, {:#?}", children.len(), CanonicalizeContext::find_operator(Some(self), base_of_first_child, None, Some(as_element(children[0])), Some(mrow))); | |
4128 | 1.96k | if children.len() == 2 && |
4129 | 64 | (name(base_of_first_child) != "mo" || |
4130 | 13 | !CanonicalizeContext::find_operator(Some(self), base_of_first_child, None, |
4131 | 51 | Some( |
4132 | 51 | // the mrow did *not* start with an open (hence no push) |
4133 | 51 | // since parser really wants balanced parens to keep stack state right, we do a push here |
4134 | 51 | parse_stack.push( StackInfo::new(mrow.document()) ); |
4135 | 51 | } else { |
4136 | // the mrow started with some open fence (which caused a push) -- add the close, pop, and push on the "operand" | |
4137 | 1.91k | new_current_child = self.potentially_lift_script(mrow) |
4138 | } | |
4139 | 10.9k | } else if current_op.op.is_postfix() { |
4140 | 81 | // grab the left operand and start a new mrow with it and the operator -- put those back on the stack |
4141 | 81 | // note: the code does these operations on the stack for consistency, but it could be optimized without push/popping the stack |
4142 | 81 | let previous_child = top_of_stack.remove_last_operand_from_mrow(); // remove operand from mrow |
4143 | 81 | parse_stack.push(top_of_stack); |
4144 | 81 | let mut new_top_of_stack = StackInfo::with_op(¤t_child.document(), previous_child, current_op.clone()); // begin new mrow with operand |
4145 | 81 | new_top_of_stack.add_child_to_mrow(current_child, current_op); // add on operator |
4146 | 81 | new_current_child = new_top_of_stack.mrow; // grab for pushing on old mrow |
4147 | 81 | new_current_op = OperatorPair::new(); // treat "reduced" postfix operator & operand as an operand |
4148 | 81 | // debug!("shift_stack: after adding postfix to mrow has len: {}", new_current_child.children().len().to_string()); |
4149 | 10.8k | } else { |
4150 | 10.8k | // normal infix op case -- grab the left operand and start a new mrow with it and the operator |
4151 | 10.8k | let previous_child = top_of_stack.remove_last_operand_from_mrow(); |
4152 | 10.8k | parse_stack.push(top_of_stack); |
4153 | 10.8k | parse_stack.push( StackInfo::with_op(¤t_child.document(),previous_child, current_op) ); |
4154 | 10.8k | } |
4155 | 4.73k | } |
4156 | 17.7k | return (new_current_child, new_current_op); |
4157 | 17.7k | } |
4158 | ||
4159 | ||
4160 | 25.2k | fn reduce_stack<'s, 'a:'s, 'op:'a>(&self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>, current_priority: usize) { |
4161 | 25.2k | let mut prev_priority = top(parse_stack).priority(); |
4162 | // debug!(" reduce_stack: stack len={}, priority: prev={}, cur={}", parse_stack.len(), prev_priority, current_priority); | |
4163 | 37.2k | while current_priority < prev_priority { // pop off operators until we are back to the right level |
4164 | 12.0k | if parse_stack.len() == 1 { |
4165 | 0 | break; // something went wrong -- break before popping too much |
4166 | 12.0k | } |
4167 | 12.0k | prev_priority = self.reduce_stack_one_time(parse_stack); |
4168 | }; | |
4169 | 25.2k | } |
4170 | ||
4171 | 12.0k | fn reduce_stack_one_time<'s, 'a:'s, 'op:'a>(&self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>) -> usize { |
4172 | 12.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4173 | // debug!(" ..popped len={} op:'{}/{}', operand: {}", | |
4174 | // top_of_stack.mrow.children().len(), | |
4175 | // show_invisible_op_char(top_of_stack.op_pair.ch), top_of_stack.op_pair.op.priority, | |
4176 | // top_of_stack.is_operand); | |
4177 | 12.0k | let mut mrow = top_of_stack.mrow; |
4178 | 12.0k | if mrow.children().len() == 1 && |
4179 | 63 | // should have added at least operator and operand, but input might not be well-formed |
4180 | 63 | // in this case, unwrap the mrow and expose the single child for pushing onto stack |
4181 | 63 | let single_child = top_of_stack.remove_last_operand_from_mrow(); |
4182 | 63 | mrow = single_child; |
4183 | 11.9k | } |
4184 | ||
4185 | 12.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4186 | 12.0k | top_of_stack.add_child_to_mrow(mrow, OperatorPair::new()); // mrow on top is "parsed" -- now add it to previous |
4187 | 12.0k | let prev_priority = top_of_stack.priority(); |
4188 | 12.0k | parse_stack.push(top_of_stack); |
4189 | 12.0k | return prev_priority; |
4190 | 12.0k | } |
4191 | ||
4192 | 5.06k | fn is_trig_arg<'a, 'op:'a>(&self, previous_child: Element<'a>, current_child: Element<'a>, parse_stack: &mut Vec<StackInfo<'a, 'op>>) -> bool { |
4193 | // We have operand-operand and know we want multiplication at this point. | |
4194 | // Check for special case where we want multiplication to bind more tightly than function app (e.g, sin 2x, sin -2xy) | |
4195 | // We only want to do this for simple args | |
4196 | // debug!(" is_trig_arg: prev {}, current {}, Stack:", element_summary(previous_child), element_summary(current_child)); | |
4197 | // parse_stack.iter().for_each(|stack_info| debug!(" {}", stack_info)); | |
4198 | 5.06k | if !IsNode::is_simple(current_child) { |
4199 | 2.98k | return false; |
4200 | 2.07k | } |
4201 | // This only matters if we are not inside of parens | |
4202 | 2.07k | if IsBracketed::is_bracketed(previous_child, "(", ")", false, false) || |
4203 | 2.01k | IsBracketed::is_bracketed(previous_child, "[", "]", false, false) { |
4204 | 63 | return false; |
4205 | 2.01k | } |
4206 | ||
4207 | // Use lower priority multiplication if current_child is a function (e.g. "cos" in "sin x cos 3y") | |
4208 | // if !is_trig(current_child) { | |
4209 | 2.01k | if self.is_function_name(current_child, None) == FunctionNameCertainty::True { |
4210 | 1 | return false; |
4211 | 2.01k | } |
4212 | // Three cases: | |
4213 | // 1. First operand-operand (e.g, sin 2x, where 'current_child' is 'x') -- top of stack is mrow('sin' f_apply '2') | |
4214 | // 2. Another First operand-operand (e.g, sin -2x, where 'current_child' is 'x') -- top of stack is mrow('-' '2'), next is mrow('sin', f_apply) | |
4215 | // 3. Subsequent operand-operand (e.g, sin 2xy, where 'current_child' is 'y') -- top of stack is mrow('2' 'times' 'x') | |
4216 | // Note: IMPLIED_TIMES_HIGH_PRIORITY is only present if we have a trig function | |
4217 | 2.01k | let op_on_top = &top(parse_stack).op_pair; |
4218 | 2.01k | if ptr_eq(op_on_top.op, *INVISIBLE_FUNCTION_APPLICATION) { |
4219 | 8 | let function_element = as_element(top(parse_stack).mrow.children()[0]); |
4220 | 8 | return is_trig(function_element); |
4221 | 2.00k | } |
4222 | 2.00k | if ptr_eq(op_on_top.op, *PREFIX_MINUS) { |
4223 | 74 | if parse_stack.len() < 2 { |
4224 | 0 | return false; |
4225 | 74 | } |
4226 | 74 | let next_stack_info = &parse_stack[parse_stack.len()-2]; |
4227 | 74 | if !ptr_eq(next_stack_info.op_pair.op, *INVISIBLE_FUNCTION_APPLICATION) { |
4228 | 72 | return false; |
4229 | 2 | } |
4230 | 2 | let function_element = as_element(next_stack_info.mrow.children()[0]); |
4231 | 2 | if is_trig(function_element) { |
4232 | // want '- 2' to be an mrow; don't want '- 2 x ...' to be the mrow (IMPLIED_TIMES_HIGH_PRIORITY is an internal hack) | |
4233 | 1 | self.reduce_stack_one_time(parse_stack); |
4234 | 1 | return true; |
4235 | 1 | } |
4236 | 1 | return false; |
4237 | 1.92k | } |
4238 | 1.92k | return ptr_eq(op_on_top.op, &IMPLIED_TIMES_HIGH_PRIORITY); |
4239 | ||
4240 | 10 | fn is_trig(node: Element) -> bool { |
4241 | 10 | let base_of_name = get_possible_embellished_node(node); |
4242 | ||
4243 | // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables | |
4244 | 10 | let node_name = name(base_of_name); |
4245 | 10 | if node_name != "mi" && |
4246 | 0 | return false; |
4247 | 10 | } |
4248 | // whitespace is sometimes added to the mi since braille needs it, so do a trim here to get function name | |
4249 | 10 | let base_name = as_text(base_of_name).trim(); |
4250 | 10 | if base_name.is_empty() { |
4251 | 0 | return false; |
4252 | 10 | } |
4253 | 10 | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
4254 | // names that are always function names (e.g, "sin" and "log") | |
4255 | 10 | let defs = defs.borrow(); |
4256 | 10 | let names = defs.get_hashset("TrigFunctionNames").unwrap(); |
4257 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
4258 | 10 | return names.contains(&base_name.to_ascii_lowercase()); |
4259 | 10 | }); |
4260 | 10 | } |
4261 | 5.06k | } |
4262 | ||
4263 | ||
4264 | /* | |
4265 | canonicalize_mrows_in_mrow is a simple(ish) operator precedence parser. | |
4266 | It works by keeping a stack of 'StackInfo': | |
4267 | 'StackInfo' has three parts: | |
4268 | 1. the mrow being build | |
4269 | 2. info about the operator in the mrow being build | |
4270 | 3. bool to say whether the last thing is an operator or an operand | |
4271 | ||
4272 | When the op priority increases (eg, have "=" and get "+"), we push on | |
4273 | 1. a new mrow -- if the operator has a left operand, we remove the last node in the mrow and it becomes | |
4274 | the first (only so far) child of the new mrow | |
4275 | 2. the operator info | |
4276 | ||
4277 | When the op priority decreases, we do the following loop until the this new priority > priority on top of stack | |
4278 | 1. pop the StackInfo | |
4279 | 2. add the StackInfo's mrow as the last child to the new top of the stack | |
4280 | We also do this when we hit the end of the mrow (we can treat this case as if we have a negative precedence) | |
4281 | ||
4282 | +/- are treated as nary operators and don't push/pop in those cases. | |
4283 | consecutive operands such as nary times are also considered n-ary operators and don't push/pop in those cases. | |
4284 | */ | |
4285 | 7.48k | fn canonicalize_mrows_in_mrow<'a>(&self, mrow: Element<'a>) -> Result<Element<'a>> { |
4286 | 7.48k | let is_ok_to_merge_child = mrow.children().len() != 1 || |
4287 | 7.48k | let saved_mrow_attrs = mrow.attributes(); |
4288 | 7.48k | assert_eq!(name(mrow), "mrow"); |
4289 | ||
4290 | // FIX: don't touch/canonicalize | |
4291 | // 1. if intent is given -- anything intent references | |
4292 | // 2. if the mrow starts or ends with a fence, don't merge into parent (parse children only) -- allows for "]a,b[" | |
4293 | 7.48k | let mut parse_stack = vec![StackInfo::new(mrow.document())]; |
4294 | 7.48k | let mut children = mrow.children(); |
4295 | 7.48k | let num_children = children.len(); |
4296 | ||
4297 | 36.7k | for i_child in |
4298 | // debug!("\nDealing with child #{}: {}", i_child, mml_to_string(as_element(children[i_child]))); | |
4299 | 36.7k | let mut current_child = self.canonicalize_mrows(as_element(children[i_child])) |
4300 | 36.7k | children[i_child] = ChildOfElement::Element( current_child ); |
4301 | 36.7k | let base_of_child = get_possible_embellished_node(current_child); |
4302 | 36.7k | let acts_as_ch = current_child.attribute_value(ACT_AS_OPERATOR); |
4303 | 36.7k | let mut current_op = OperatorPair::new(); |
4304 | // figure what the current operator is -- it either comes from the 'mo' (if we have an 'mo') or it is implied | |
4305 | 36.7k | if (name(base_of_child) == "mo" && |
4306 | 13.9k | !( base_of_child.children().is_empty() || as_text(base_of_child) == "\u{00A0}" )) || // shouldn't have empty mo node, but... |
4307 | 22.8k | acts_as_ch.is_some() { |
4308 | 13.9k | let previous_op = if top(&parse_stack).is_operand { |
4309 | 13.9k | let next_node = if i_child + 1 < num_children { |
4310 | 13.9k | if let Some( |
4311 | 20 | // ∇× (etc) hack, including ∇ being a vector (maybe eventually others) |
4312 | 20 | let temp_mo = create_mathml_element(¤t_child.document(), "mo"); |
4313 | 20 | temp_mo.set_text(acts_as_ch); |
4314 | 20 | current_op = OperatorPair{ |
4315 | 20 | ch: acts_as_ch, |
4316 | 20 | op: CanonicalizeContext::find_operator(Some(self), temp_mo, previous_op, |
4317 | 20 | top(&parse_stack).last_child_in_mrow(), next_node) |
4318 | 20 | }; |
4319 | 13.9k | } else { |
4320 | 13.9k | current_op = OperatorPair{ |
4321 | 13.9k | ch: as_text(base_of_child), |
4322 | 13.9k | op: CanonicalizeContext::find_operator(Some(self), base_of_child, previous_op, |
4323 | 13.9k | top(&parse_stack).last_child_in_mrow(), next_node) |
4324 | 13.9k | }; |
4325 | 13.9k | |
4326 | 13.9k | // deal with vertical bars which might be infix, open, or close fences |
4327 | 13.9k | // note: mrow shrinks as we iterate through it (removing children from it) |
4328 | 13.9k | current_op.op = self.determine_vertical_bar_op( |
4329 | 13.9k | current_op.op, |
4330 | 13.9k | base_of_child, |
4331 | 13.9k | next_node, |
4332 | 13.9k | &mut parse_stack, |
4333 | 13.9k | self.n_vertical_bars_on_right(&children[i_child+1..], current_op.ch) |
4334 | 13.9k | ); |
4335 | 13.9k | } |
4336 | } else { | |
4337 | 22.8k | let previous_child = top(&parse_stack).last_child_in_mrow(); |
4338 | 22.8k | if let Some( |
4339 | 17.9k | let base_of_previous_child = get_possible_embellished_node(previous_child); |
4340 | 17.9k | let acts_as_ch = previous_child.attribute_value(ACT_AS_OPERATOR); |
4341 | 17.9k | if name(base_of_previous_child) != "mo" && |
4342 | 6.55k | let likely_function_name = self.is_function_name(previous_child, Some(&children[i_child..])); |
4343 | 6.55k | if name(base_of_child) == "mtext" && |
4344 | 1 | base_of_child.set_attribute_value("data-function-likelihood", &(likely_function_name == FunctionNameCertainty::True).to_string()); |
4345 | 1 | base_of_child.remove_attribute("data-was-mo"); |
4346 | 1 | set_mathml_name(base_of_child, "mo"); |
4347 | 1 | let mut top_of_stack = parse_stack.pop().unwrap(); |
4348 | 1 | top_of_stack.add_child_to_mrow(current_child, OperatorPair{ ch: "\u{00A0}", op: *INVISIBLE_FUNCTION_APPLICATION}); // whitespace -- make part of mrow to keep out of parse |
4349 | 1 | parse_stack.push(top_of_stack); |
4350 | 1 | continue; |
4351 | 6.55k | } |
4352 | // consecutive operands -- add an invisible operator as appropriate | |
4353 | 6.55k | current_op = if likely_function_name == FunctionNameCertainty::True { |
4354 | 753 | OperatorPair{ ch: "\u{2061}", op: *INVISIBLE_FUNCTION_APPLICATION } |
4355 | 5.79k | } else if self.is_mixed_fraction(previous_child, &children[i_child..]) |
4356 | 70 | OperatorPair{ ch: "\u{2064}", op: *IMPLIED_INVISIBLE_PLUS } |
4357 | 5.72k | } else if self.is_implied_comma(previous_child, current_child, mrow) { |
4358 | 81 | OperatorPair{ch: "\u{2063}", op: *IMPLIED_INVISIBLE_COMMA } |
4359 | 5.64k | } else if self.is_implied_chemical_bond(previous_child, current_child) { |
4360 | 419 | OperatorPair{ch: "\u{2063}", op: &IMPLIED_CHEMICAL_BOND } |
4361 | 5.22k | } else if self.is_implied_separator(previous_child, current_child) { |
4362 | 166 | OperatorPair{ch: "\u{2063}", op: &IMPLIED_SEPARATOR_HIGH_PRIORITY } |
4363 | 5.06k | } else if self.is_trig_arg(base_of_previous_child, base_of_child, &mut parse_stack) { |
4364 | 9 | OperatorPair{ch: "\u{2062}", op: &IMPLIED_TIMES_HIGH_PRIORITY } |
4365 | } else { | |
4366 | 5.05k | OperatorPair{ ch: "\u{2062}", op: *IMPLIED_TIMES } |
4367 | }; | |
4368 | 6.55k | if let Some( |
4369 | 262 | && attr_val == "data-was-mo" { |
4370 | 0 | // it really should be an operator |
4371 | 0 | base_of_child.remove_attribute(CHANGED_ATTR); |
4372 | 0 | set_mathml_name(base_of_child, "mo"); |
4373 | 6.55k | } |
4374 | 6.55k | if name(base_of_child) == "mo" { |
4375 | 1 | current_op.ch = as_text(base_of_child); |
4376 | 1 | // debug!(" Found whitespace op '{}'/{}", show_invisible_op_char(current_op.ch), current_op.op.priority); |
4377 | 1 | } else { |
4378 | 6.54k | let implied_mo = create_mo(current_child.document(), current_op.ch, ADDED_ATTR_VALUE); |
4379 | 6.54k | if likely_function_name == FunctionNameCertainty::Maybe { |
4380 | 33 | implied_mo.set_attribute_value("data-function-guess", "true"); |
4381 | 6.51k | } |
4382 | // debug!(" Found implicit op {}/{} [{:?}]", show_invisible_op_char(current_op.ch), current_op.op.priority, likely_function_name); | |
4383 | 6.54k | self.reduce_stack(&mut parse_stack, current_op.op.priority); |
4384 | 6.54k | let shift_result = self.shift_stack(&mut parse_stack, implied_mo, current_op.clone()); |
4385 | // ignore shift_result.0 which is just 'implied_mo' | |
4386 | 6.54k | assert_eq!(implied_mo, shift_result.0); |
4387 | 6.54k | assert!( ptr_eq(current_op.op, shift_result.1.op) ); |
4388 | 6.54k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4389 | 6.54k | top_of_stack.add_child_to_mrow(implied_mo, current_op); |
4390 | 6.54k | parse_stack.push(top_of_stack); |
4391 | 6.54k | current_op = OperatorPair::new(); |
4392 | } | |
4393 | 11.3k | } |
4394 | 4.88k | } |
4395 | } | |
4396 | ||
4397 | 36.7k | if !ptr_eq(current_op.op, &ILLEGAL_OPERATOR_INFO) { |
4398 | 13.9k | if current_op.op.is_left_fence() || |
4399 | 2.95k | if top(&parse_stack).is_operand { |
4400 | // will end up with duplicate operands -- need to choose operator associated with prev child | |
4401 | // we use the original input here because in this case, we need to look to the right of the ()s to deal with chemical states | |
4402 | 232 | let likely_function_name = self.is_function_name(as_element(children[i_child-1]), Some(&children[i_child..])); |
4403 | 232 | let implied_operator = if likely_function_name== FunctionNameCertainty::True { |
4404 | 98 | OperatorPair{ ch: "\u{2061}", op: *INVISIBLE_FUNCTION_APPLICATION } |
4405 | } else { | |
4406 | 134 | OperatorPair{ ch: "\u{2062}", op: *IMPLIED_TIMES } |
4407 | }; | |
4408 | // debug!(" adding implied {}", if ptr_eq(implied_operator.op,*IMPLIED_TIMES) {"times"} else {"function apply"}); | |
4409 | ||
4410 | 232 | let implied_mo = create_mo(current_child.document(), implied_operator.ch, ADDED_ATTR_VALUE); |
4411 | 232 | if likely_function_name == FunctionNameCertainty::Maybe { |
4412 | 55 | implied_mo.set_attribute_value("data-function-guess", "true"); |
4413 | 177 | } |
4414 | 232 | self.reduce_stack(&mut parse_stack, implied_operator.op.priority); let shift_result = self.shift_stack(&mut parse_stack, implied_mo, implied_operator.clone()); |
4415 | // ignore shift_result.0 which is just 'implied_mo' | |
4416 | 232 | assert_eq!(implied_mo, shift_result.0); |
4417 | 232 | assert!( ptr_eq(implied_operator.op, shift_result.1.op) ); |
4418 | 232 | let mut top_of_stack = parse_stack.pop().unwrap(); |
4419 | 232 | top_of_stack.add_child_to_mrow(implied_mo, implied_operator); |
4420 | 232 | parse_stack.push(top_of_stack); |
4421 | 2.72k | } |
4422 | // starting a new mrow | |
4423 | 2.95k | parse_stack.push( StackInfo::new(current_child.document()) ); |
4424 | } else { | |
4425 | // One of infix, postfix, or right fence -- all should have a left operand | |
4426 | // pop the stack if it is lower precedence (it forms an mrow) | |
4427 | ||
4428 | // hack to get linear mixed fractions to parse correctly | |
4429 | 10.9k | if current_op.ch == "/" && |
4430 | 2 | current_op.op = &IMPLIED_PLUS_SLASH_HIGH_PRIORITY; |
4431 | 10.9k | } |
4432 | 10.9k | self.reduce_stack(&mut parse_stack, current_op.op.priority); |
4433 | // push new operator on stack (already handled n-ary case) | |
4434 | 10.9k | let shift_result = self.shift_stack(&mut parse_stack, current_child, current_op); |
4435 | 10.9k | current_child = shift_result.0; |
4436 | 10.9k | current_op = shift_result.1; |
4437 | } | |
4438 | 22.7k | } |
4439 | 36.7k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4440 | 36.7k | top_of_stack.add_child_to_mrow(current_child, current_op); |
4441 | 36.7k | parse_stack.push(top_of_stack); |
4442 | } | |
4443 | ||
4444 | // Reached the end -- force reduction of what's left on the stack | |
4445 | 7.48k | self.reduce_stack(&mut parse_stack, LEFT_FENCEPOST.priority); |
4446 | ||
4447 | // We essentially have 'terminator( mrow terminator)' | |
4448 | // in other words, we have an extra mrow with one child due to the initial start -- remove it | |
4449 | 7.48k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4450 | 7.48k | assert_eq!(parse_stack.len(), 0); |
4451 | ||
4452 | 7.48k | let mut parsed_mrow = top_of_stack.mrow; |
4453 | 7.48k | assert_eq!( name(top_of_stack.mrow), "mrow"); |
4454 | 7.48k | if parsed_mrow.children().len() == 1 && is_ok_to_merge_child { |
4455 | 7.46k | parsed_mrow = top_of_stack.remove_last_operand_from_mrow(); |
4456 | 7.46k | // was synthesized, but is really the original top level mrow |
4457 | 7.46k |
|
4458 | ||
4459 | 7.48k | parsed_mrow.remove_attribute(CHANGED_ATTR); |
4460 | 7.48k | return Ok( add_attrs(parsed_mrow, &saved_mrow_attrs) ); |
4461 | 7.48k | } |
4462 | } | |
4463 | ||
4464 | // ---------------- useful utility functions -------------------- | |
4465 | 102k | fn top<'s, 'a:'s, 'op:'a>(vec: &'s[StackInfo<'a, 'op>]) -> &'s StackInfo<'a, 'op> { |
4466 | 102k | return &vec[vec.len()-1]; |
4467 | 102k | } |
4468 | // Replace the attrs of 'mathml' with 'attrs' and keep the global attrs of 'mathml' (i.e, lift 'attrs' to 'mathml' for replacing children) | |
4469 | 10.0k | pub fn add_attrs<'a>(mathml: Element<'a>, attrs: &[Attribute]) -> Element<'a> { |
4470 | static GLOBAL_ATTRS: phf::Set<&str> = phf_set! { | |
4471 | "class", "dir", "displaystyle", "id", "mathbackground", "mathcolor", "mathsize", | |
4472 | "mathvariant", "nonce", "scriptlevel", "style", "tabindex", | |
4473 | "intent", "arg", | |
4474 | }; | |
4475 | ||
4476 | // debug!( "Adding back {} attr(s) to {}", attrs.len(), name(mathml)); | |
4477 | // remove non-global attrs | |
4478 | 10.0k | for |
4479 | 740 | let attr_name = attr.name().local_part(); |
4480 | 740 | if !( attr_name.starts_with("data-") || |
4481 | 278 | attr_name.starts_with("on") ) { // allows too much - cheapo way to allow event handlers like "onchange" |
4482 | 278 | mathml.remove_attribute(attr.name()); |
4483 | 462 | } |
4484 | } | |
4485 | ||
4486 | // add in 'attrs' | |
4487 | 10.0k | for |
4488 | 5.22k | mathml.set_attribute_value(attr.name(), attr.value()); |
4489 | 5.22k | } |
4490 | 10.0k | return mathml; |
4491 | 10.0k | } |
4492 | ||
4493 | ||
4494 | 2.91M | pub fn name(node: Element<'_>) -> &str { |
4495 | 2.91M | return node.name().local_part(); |
4496 | 2.91M | } |
4497 | ||
4498 | /// The child of a non-leaf element must be an element | |
4499 | // Note: can't use references as that results in 'returning use of local variable' | |
4500 | 1.14M | pub fn as_element(child: ChildOfElement) -> Element { |
4501 | 1.14M | return match child { |
4502 | 1.14M | ChildOfElement::Element(e) => e, |
4503 | _ => { | |
4504 | 0 | panic!("as_element: internal error -- found non-element child (text? '{:?}')", child.text()); |
4505 | }, | |
4506 | }; | |
4507 | 1.14M | } |
4508 | ||
4509 | /// The child of a leaf element must be text (previously trimmed) | |
4510 | /// Note: trim() combines all the Text children into a single string | |
4511 | 603k | pub fn as_text(leaf_child: Element<'_>) -> &str { |
4512 | 603k | assert!(is_leaf(leaf_child)); |
4513 | 603k | let children = leaf_child.children(); |
4514 | 603k | if children.is_empty() { |
4515 | 401 | return ""; |
4516 | 602k | } |
4517 | 602k | assert!(children.len() == 1); |
4518 | 602k | return match children[0] { |
4519 | 602k | ChildOfElement::Text(t) => t.text(), |
4520 | 0 | _ => panic!("as_text: internal error -- found non-text child of leaf element"), |
4521 | } | |
4522 | 603k | } |
4523 | ||
4524 | /// Returns the parent of the argument. | |
4525 | /// Warning: this assumes the parent exists | |
4526 | 239k | pub fn get_parent(mathml: Element) -> Element { |
4527 | 239k | return mathml.parent().unwrap().element().unwrap(); |
4528 | 239k | } |
4529 | ||
4530 | #[allow(dead_code)] // for debugging | |
4531 | 0 | pub fn element_summary(mathml: Element) -> String { |
4532 | 0 | return format!("{}<{}>", name(mathml), |
4533 | 0 | if is_leaf(mathml) {show_invisible_op_char(as_text(mathml)).to_string()} |
4534 | else | |
4535 | 0 | {mathml.children().len().to_string()}); |
4536 | 0 | } |
4537 | ||
4538 | 6.86k | fn create_mo<'a, 'd:'a>(doc: Document<'d>, ch: &'a str, attr_value: &str) -> Element<'d> { |
4539 | 6.86k | let implied_mo = create_mathml_element(&doc, "mo"); |
4540 | 6.86k | implied_mo.set_attribute_value(CHANGED_ATTR, attr_value); |
4541 | 6.86k | let mo_text = doc.create_text(ch); |
4542 | 6.86k | implied_mo.append_child(mo_text); |
4543 | 6.86k | return implied_mo; |
4544 | 6.86k | } |
4545 | ||
4546 | /// return 'node' or if it is adorned, return its base (recursive) | |
4547 | 130k | pub fn get_possible_embellished_node(node: Element) -> Element { |
4548 | 130k | let mut node = node; |
4549 | 138k | while IsNode::is_modified(node) { |
4550 | 8.33k | node = as_element(node.children()[0]); |
4551 | 8.33k | } |
4552 | 130k | return node; |
4553 | 130k | } |
4554 | ||
4555 | #[allow(dead_code)] // for debugging with println | |
4556 | 0 | fn show_invisible_op_char(ch: &str) -> &str { |
4557 | 0 | return match ch.chars().next().unwrap() { |
4558 | 0 | '\u{2061}' => "⁡", |
4559 | 0 | '\u{2062}' => "⁢", |
4560 | 0 | '\u{2063}' => "⁣", |
4561 | 0 | '\u{2064}' => "⁤", |
4562 | 0 | '\u{E000}' => "", |
4563 | 0 | _ => ch |
4564 | }; | |
4565 | 0 | } |
4566 | ||
4567 | ||
4568 | #[cfg(test)] | |
4569 | mod canonicalize_tests { | |
4570 | use crate::errors::Result; | |
4571 | use crate::{are_strs_canonically_equal_result, are_strs_canonically_equal_with_locale}; | |
4572 | ||
4573 | #[allow(unused_imports)] | |
4574 | use super::super::init_logger; | |
4575 | use super::super::abs_rules_dir_path; | |
4576 | use super::*; | |
4577 | use sxd_document::parser; | |
4578 | ||
4579 | ||
4580 | #[test] | |
4581 | 1 | fn canonical_same() -> Result<()> { |
4582 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
4583 | 1 | are_strs_canonically_equal_result(target_str, target_str, &[]) |
4584 | 1 | } |
4585 | ||
4586 | #[test] | |
4587 | 1 | fn plane1_common() -> Result<()> { |
4588 | 1 | let test_str = "<math> |
4589 | 1 | <mi mathvariant='normal'>sin</mi> <mo>,</mo> <!-- shouldn't change --> |
4590 | 1 | <mi mathvariant='italic'>bB4</mi> <mo>,</mo> <!-- shouldn't change --> |
4591 | 1 | <mi mathvariant='bold'>a</mi> <mo>,</mo> <!-- single char id tests --> |
4592 | 1 | <mi mathvariant='bold'>Z</mi> <mo>,</mo> |
4593 | 1 | <mn mathvariant='bold'>19=𝟗</mn> <mo>,</mo> <!-- '=' and plane1 shouldn't change --> |
4594 | 1 | <mn mathvariant='double-struck'>024689</mn> <mo>,</mo> <!-- '=' and plane1 shouldn't change --> |
4595 | 1 | <mi mathvariant='double-struck'>yzCHNPQRZ</mi> <mo>,</mo> |
4596 | 1 | <mi mathvariant='fraktur'>0yACHIRZ</mi> <mo>,</mo> <!-- 0 stays as ASCII --> |
4597 | 1 | <mi mathvariant='bold-fraktur'>nC</mi> <mo>,</mo> |
4598 | 1 | <mi mathvariant='script'>ABEFHILMRegow</mi> <mo>,</mo> |
4599 | 1 | <msup> |
4600 | 1 | <mi mathvariant='bold-script'>fG</mi> |
4601 | 1 | <mo mathvariant='bold-script'>*</mo> <!-- '*' shouldn't change --> |
4602 | 1 | </msup> |
4603 | 1 | </math>"; |
4604 | 1 | let target_str = "<math> |
4605 | 1 | <mrow data-changed='added'> |
4606 | 1 | <mi mathvariant='normal'>sin</mi> |
4607 | 1 | <mo >,</mo> |
4608 | 1 | <mi mathvariant='italic'>bB4</mi> |
4609 | 1 | <mo>,</mo> |
4610 | 1 | <mi mathvariant='bold'>𝐚</mi> |
4611 | 1 | <mo>,</mo> |
4612 | 1 | <mi mathvariant='bold'>𝐙</mi> |
4613 | 1 | <mo>,</mo> |
4614 | 1 | <mn mathvariant='bold'>𝟏𝟗=𝟗</mn> |
4615 | 1 | <mo>,</mo> |
4616 | 1 | <mn mathvariant='double-struck'>𝟘𝟚𝟜𝟞𝟠𝟡</mn> |
4617 | 1 | <mo>,</mo> |
4618 | 1 | <mi mathvariant='double-struck'>𝕪𝕫ℂℍℕℙℚℝℤ</mi> |
4619 | 1 | <mo>,</mo> |
4620 | 1 | <mi mathvariant='fraktur'>0𝔶𝔄ℭℌℑℜℨ</mi> |
4621 | 1 | <mo>,</mo> |
4622 | 1 | <mi mathvariant='bold-fraktur'>𝖓𝕮</mi> |
4623 | 1 | <mo>,</mo> |
4624 | 1 | <mi mathvariant='script'>𝒜ℬℰℱℋℐℒℳℛℯℊℴ𝓌</mi> |
4625 | 1 | <mo>,</mo> |
4626 | 1 | <msup> |
4627 | 1 | <mi mathvariant='bold-script'>𝓯𝓖</mi> |
4628 | 1 | <mo mathvariant='bold-script'>*</mo> <!-- '*' shouldn't change --> |
4629 | 1 | </msup> |
4630 | 1 | </mrow> |
4631 | 1 | </math>"; |
4632 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4633 | 1 | } |
4634 | ||
4635 | #[test] | |
4636 | 1 | fn plane1_font_styles() -> Result<()> { |
4637 | 1 | let test_str = "<math> |
4638 | 1 | <mi mathvariant='sans-serif'>aA09=</mi> <mo>,</mo> <!-- '=' shouldn't change --> |
4639 | 1 | <mi mathvariant='bold-sans-serif'>zZ09</mi> <mo>,</mo> |
4640 | 1 | <mi mathvariant='sans-serif-italic'>azAZ09</mi> <mo>,</mo> <!-- italic digits don't exist: revert to sans-serif --> |
4641 | 1 | <mi mathvariant='sans-serif-bold-italic'>AZaz09</mi> <mo>,</mo> <!-- italic digits don't exist: revert to just bold --> |
4642 | 1 | <mi mathvariant='monospace'>aA09</mi> |
4643 | 1 | </math>"; |
4644 | 1 | let target_str = "<math> |
4645 | 1 | <mrow data-changed='added'> |
4646 | 1 | <mi mathvariant='sans-serif'>𝖺𝖠𝟢𝟫=</mi> |
4647 | 1 | <mo>,</mo> |
4648 | 1 | <mi mathvariant='bold-sans-serif'>𝘇𝗭𝟬𝟵</mi> |
4649 | 1 | <mo>,</mo> |
4650 | 1 | <mi mathvariant='sans-serif-italic'>𝘢𝘻𝘈𝘡𝟢𝟫</mi> |
4651 | 1 | <mo>,</mo> |
4652 | 1 | <mi mathvariant='sans-serif-bold-italic'>𝘼𝙕𝙖𝙯𝟬𝟵</mi> |
4653 | 1 | <mo>,</mo> |
4654 | 1 | <mi mathvariant='monospace'>𝚊𝙰𝟶𝟿</mi> |
4655 | 1 | </mrow> |
4656 | 1 | </math>"; |
4657 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4658 | 1 | } |
4659 | ||
4660 | #[test] | |
4661 | 1 | fn plane1_greek() -> Result<()> { |
4662 | 1 | let test_str = "<math> |
4663 | 1 | <mi mathvariant='normal'>ΑΩαω∇∂ϵ=</mi> <mo>,</mo> <!-- shouldn't change --> |
4664 | 1 | <mi mathvariant='italic'>ϴΑΩαω∇∂ϵ</mi> <mo>,</mo> |
4665 | 1 | <mi mathvariant='bold'>ΑΩαωϝϜ</mi> <mo>,</mo> |
4666 | 1 | <mi mathvariant='double-struck'>Σβ∇</mi> <mo>,</mo> <!-- shouldn't change --> |
4667 | 1 | <mi mathvariant='fraktur'>ΞΦλϱ</mi> <mo>,</mo> <!-- shouldn't change --> |
4668 | 1 | <mi mathvariant='bold-fraktur'>ψΓ</mi> <mo>,</mo> <!-- map to bold --> |
4669 | 1 | <mi mathvariant='script'>μΨ</mi> <mo>,</mo> <!-- shouldn't change --> |
4670 | 1 | <mi mathvariant='bold-script'>Σπ</mi> <!-- map to bold --> |
4671 | 1 | </math>"; |
4672 | 1 | let target_str = "<math> |
4673 | 1 | <mrow data-changed='added'> |
4674 | 1 | <mi mathvariant='normal'>ΑΩαω∇∂ϵ=</mi> |
4675 | 1 | <mo>,</mo> |
4676 | 1 | <mi mathvariant='italic'>𝛳𝛢𝛺𝛼𝜔𝛻𝜕𝜖</mi> |
4677 | 1 | <mo>,</mo> |
4678 | 1 | <mi mathvariant='bold'>𝚨𝛀𝛂𝛚𝟋𝟊</mi> |
4679 | 1 | <mo>,</mo> |
4680 | 1 | <mi mathvariant='double-struck'>Σβ∇</mi> |
4681 | 1 | <mo>,</mo> |
4682 | 1 | <mi mathvariant='fraktur'>ΞΦλϱ</mi> |
4683 | 1 | <mo>,</mo> |
4684 | 1 | <mi mathvariant='bold-fraktur'>𝛙𝚪</mi> |
4685 | 1 | <mo>,</mo> |
4686 | 1 | <mi mathvariant='script'>μΨ</mi> |
4687 | 1 | <mo>,</mo> |
4688 | 1 | <mi mathvariant='bold-script'>𝚺𝛑</mi> |
4689 | 1 | </mrow> |
4690 | 1 | </math>"; |
4691 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4692 | 1 | } |
4693 | ||
4694 | #[test] | |
4695 | 1 | fn plane1_greek_font_styles() -> Result<()> { |
4696 | 1 | let test_str = "<math> |
4697 | 1 | <mi mathvariant='sans-serif'>ΑΩαω∇∂ϵ=</mi> <mo>,</mo> <!-- '=' shouldn't change --> |
4698 | 1 | <mi mathvariant='bold-sans-serif'>ϴ0ΑΩαω∇∂ϵ</mi> <mo>,</mo> |
4699 | 1 | <mi mathvariant='sans-serif-italic'>aΑΩαω∇∂ϵ</mi> <mo>,</mo> <!-- italic digits don't exist: revert to sans-serif --> |
4700 | 1 | <mi mathvariant='sans-serif-bold-italic'>ZΑΩαωϰϕϱϖ</mi> <mo>,</mo> <!-- italic digits don't exist: revert to just bold --> |
4701 | 1 | <mi mathvariant='monospace'>zΑΩαω∇∂</mi> |
4702 | 1 | </math>"; |
4703 | 1 | let target_str = "<math> |
4704 | 1 | <mrow data-changed='added'> |
4705 | 1 | <mi mathvariant='sans-serif'>ΑΩαω∇∂ϵ=</mi> |
4706 | 1 | <mo>,</mo> |
4707 | 1 | <mi mathvariant='bold-sans-serif'>𝝧𝟬𝝖𝝮𝝰𝞈𝝯𝞉𝞊</mi> |
4708 | 1 | <mo>,</mo> |
4709 | 1 | <mi mathvariant='sans-serif-italic'>𝘢ΑΩαω∇∂ϵ</mi> |
4710 | 1 | <mo>,</mo> |
4711 | 1 | <mi mathvariant='sans-serif-bold-italic'>𝙕𝞐𝞨𝞪𝟂𝟆𝟇𝟈𝟉</mi> |
4712 | 1 | <mo>,</mo> |
4713 | 1 | <mi mathvariant='monospace'>𝚣ΑΩαω∇∂</mi> |
4714 | 1 | </mrow> |
4715 | 1 | </math>"; |
4716 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4717 | 1 | } |
4718 | ||
4719 | #[test] | |
4720 | 1 | fn short_and_long_dash() -> Result<()> { |
4721 | 1 | let test_str = "<math><mi>x</mi> <mo>=</mo> <mi>--</mi><mo>+</mo><mtext>----</mtext></math>"; |
4722 | 1 | let target_str = "<math> |
4723 | 1 | <mrow data-changed='added'> |
4724 | 1 | <mi>x</mi> |
4725 | 1 | <mo>=</mo> |
4726 | 1 | <mrow data-changed='added'> |
4727 | 1 | <mi>—</mi> |
4728 | 1 | <mo>+</mo> |
4729 | 1 | <mtext>―</mtext> |
4730 | 1 | </mrow> |
4731 | 1 | </mrow> |
4732 | 1 | </math>"; |
4733 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4734 | 1 | } |
4735 | ||
4736 | #[test] | |
4737 | 1 | fn illegal_mathml_element() { |
4738 | use crate::interface::*; | |
4739 | 1 | let test_str = "<math><foo><mi>f</mi></foo></math>"; |
4740 | 1 | let package1 = &parser::parse(test_str).expect("Failed to parse test input"); |
4741 | 1 | let mathml = get_element(package1); |
4742 | 1 | trim_element(mathml, false); |
4743 | 1 | assert!(canonicalize(mathml).is_err()); |
4744 | 1 | } |
4745 | ||
4746 | #[test] | |
4747 | 1 | fn illegal_mtd_element() { |
4748 | use crate::interface::*; | |
4749 | 1 | let test_str = "<math> |
4750 | 1 | <mtable> |
4751 | 1 | <mtr> |
4752 | 1 | <mtd> |
4753 | 1 | <mtext></mtext> |
4754 | 1 | </mtd> |
4755 | 1 | <mrow> |
4756 | 1 | <mi>E</mi> |
4757 | 1 | <mo>=</mo> |
4758 | 1 | <mrow> |
4759 | 1 | <mtd> |
4760 | 1 | <mi>m</mi> |
4761 | 1 | <mo><!--INVISIBLE TIMES--></mo> |
4762 | 1 | <msup> |
4763 | 1 | <mi>c</mi> |
4764 | 1 | <mn>2</mn> |
4765 | 1 | </msup> |
4766 | 1 | </mtd></mrow> |
4767 | 1 | </mrow> |
4768 | 1 | |
4769 | 1 | </mtr> |
4770 | 1 | </mtable> |
4771 | 1 | </math>"; |
4772 | 1 | let package1 = &parser::parse(test_str).expect("Failed to parse test input"); |
4773 | 1 | let mathml = get_element(package1); |
4774 | 1 | trim_element(mathml, false); |
4775 | 1 | assert!(canonicalize(mathml).is_err()); |
4776 | 1 | } |
4777 | ||
4778 | ||
4779 | #[test] | |
4780 | 1 | fn a_to_mrow() -> Result<()> { |
4781 | 1 | let test_str = "<math> |
4782 | 1 | <a href='https://www.example.com'> |
4783 | 1 | <mo>(</mo> |
4784 | 1 | <a href='#its_relative'> |
4785 | 1 | <mi>x</mi> |
4786 | 1 | <mo>,</mo> |
4787 | 1 | <mi>y</mi> |
4788 | 1 | </a> |
4789 | 1 | <mo>)</mo> |
4790 | 1 | </a> |
4791 | 1 | </math> |
4792 | 1 | "; |
4793 | 1 | let target_str = " <math> |
4794 | 1 | <mrow href='https://www.example.com'> |
4795 | 1 | <mo>(</mo> |
4796 | 1 | <mrow href='#its_relative'> |
4797 | 1 | <mi>x</mi> |
4798 | 1 | <mo>,</mo> |
4799 | 1 | <mi>y</mi> |
4800 | 1 | </mrow> |
4801 | 1 | <mo>)</mo> |
4802 | 1 | </mrow> |
4803 | 1 | </math>"; |
4804 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4805 | 1 | } |
4806 | ||
4807 | #[test] | |
4808 | 1 | fn mfenced_no_children() -> Result<()> { |
4809 | 1 | let test_str = "<math><mi>f</mi><mfenced><mrow/></mfenced></math>"; |
4810 | 1 | let target_str = "<math> |
4811 | 1 | <mrow data-changed='added'> |
4812 | 1 | <mi>f</mi> |
4813 | 1 | <mo data-changed='added'>⁡</mo> |
4814 | 1 | <mrow> |
4815 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4816 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4817 | 1 | </mrow> |
4818 | 1 | </mrow> |
4819 | 1 | </math>"; |
4820 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4821 | 1 | } |
4822 | ||
4823 | #[test] | |
4824 | 1 | fn mfenced_one_child() -> Result<()> { |
4825 | 1 | let test_str = "<math><mi>f</mi><mfenced open='[' close=']'><mi>x</mi></mfenced></math>"; |
4826 | 1 | let target_str = " <math> |
4827 | 1 | <mrow data-changed='added'> |
4828 | 1 | <mi>f</mi> |
4829 | 1 | <mo data-changed='added'>⁡</mo> |
4830 | 1 | <mrow> |
4831 | 1 | <mo data-changed='from_mfenced'>[</mo> |
4832 | 1 | <mi>x</mi> |
4833 | 1 | <mo data-changed='from_mfenced'>]</mo> |
4834 | 1 | </mrow> |
4835 | 1 | </mrow> |
4836 | 1 | </math>"; |
4837 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4838 | 1 | } |
4839 | ||
4840 | #[test] | |
4841 | 1 | fn mfenced_no_attrs() -> Result<()> { |
4842 | 1 | let test_str = "<math><mi>f</mi><mfenced><mrow><mi>x</mi><mo>,</mo><mi>y</mi><mo>,</mo><mi>z</mi></mrow></mfenced></math>"; |
4843 | 1 | let target_str = " <math> |
4844 | 1 | <mrow data-changed='added'> |
4845 | 1 | <mi>f</mi> |
4846 | 1 | <mo data-changed='added'>⁡</mo> |
4847 | 1 | <mrow> |
4848 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4849 | 1 | <mrow> |
4850 | 1 | <mi>x</mi> |
4851 | 1 | <mo>,</mo> |
4852 | 1 | <mi>y</mi> |
4853 | 1 | <mo>,</mo> |
4854 | 1 | <mi>z</mi> |
4855 | 1 | </mrow> |
4856 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4857 | 1 | </mrow> |
4858 | 1 | </mrow> |
4859 | 1 | </math>"; |
4860 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4861 | 1 | } |
4862 | ||
4863 | #[test] | |
4864 | 1 | fn mfenced_with_separators() -> Result<()> { |
4865 | 1 | let test_str = "<math><mi>f</mi><mfenced separators=',;'><mi>x</mi><mi>y</mi><mi>z</mi><mi>a</mi></mfenced></math>"; |
4866 | 1 | let target_str = "<math> |
4867 | 1 | <mrow data-changed='added'> |
4868 | 1 | <mi>f</mi> |
4869 | 1 | <mo data-changed='added'>⁡</mo> |
4870 | 1 | <mrow> |
4871 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4872 | 1 | <mrow data-changed='added'> |
4873 | 1 | <mrow data-changed='added'> |
4874 | 1 | <mi>x</mi> |
4875 | 1 | <mo data-changed='from_mfenced'>,</mo> |
4876 | 1 | <mi>y</mi> |
4877 | 1 | </mrow> |
4878 | 1 | <mo data-changed='from_mfenced'>;</mo> |
4879 | 1 | <mrow data-changed='added'> |
4880 | 1 | <mi>z</mi> |
4881 | 1 | <mo data-changed='from_mfenced'>,</mo> |
4882 | 1 | <mi>a</mi> |
4883 | 1 | </mrow> |
4884 | 1 | </mrow> |
4885 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4886 | 1 | </mrow> |
4887 | 1 | </mrow> |
4888 | 1 | </math>"; |
4889 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4890 | 1 | } |
4891 | ||
4892 | #[test] | |
4893 | 1 | fn canonical_one_element_mrow_around_mrow() -> Result<()> { |
4894 | 1 | let test_str = "<math><mrow><mrow><mo>-</mo><mi>a</mi></mrow></mrow></math>"; |
4895 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
4896 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4897 | 1 | } |
4898 | ||
4899 | #[test] | |
4900 | 1 | fn canonical_mtext_in_mtd_477() -> Result<()> { |
4901 | // make sure mtext doesn't go away | |
4902 | 1 | let test_str = r#"<math> |
4903 | 1 | <mtable> |
4904 | 1 | <mtr> |
4905 | 1 | <mtd> |
4906 | 1 | <mstyle scriptlevel="0"> |
4907 | 1 | <mspace width="2em"/> |
4908 | 1 | </mstyle> |
4909 | 1 | <mstyle scriptlevel="0"> |
4910 | 1 | <mspace width="1em"/> |
4911 | 1 | </mstyle> |
4912 | 1 | </mtd> |
4913 | 1 | </mtr> |
4914 | 1 | </mtable> |
4915 | 1 | </math>"#; |
4916 | 1 | let target_str = r#" <math> |
4917 | 1 | <mtable> |
4918 | 1 | <mtr> |
4919 | 1 | <mtd> |
4920 | 1 | <mtext data-width='1' data-following-space-width='4' scriptlevel='0' data-changed='added'> </mtext> |
4921 | 1 | </mtd> |
4922 | 1 | </mtr> |
4923 | 1 | </mtable> |
4924 | 1 | </math>"#; |
4925 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4926 | 1 | } |
4927 | ||
4928 | #[test] | |
4929 | 1 | fn canonical_mtext_in_mtr() -> Result<()> { |
4930 | // make sure mtext doesn't go away | |
4931 | 1 | let test_str = "<math> <mtable> <mtr> <mtext> </mtext> </mtr> <mtr> <mtext> </mtext> </mtr> </mtable> </math>"; |
4932 | 1 | let target_str = " <math> |
4933 | 1 | <mtable> |
4934 | 1 | <mtr> |
4935 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4936 | 1 | </mtr> |
4937 | 1 | <mtr> |
4938 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4939 | 1 | </mtr> |
4940 | 1 | </mtable> |
4941 | 1 | </math>"; |
4942 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4943 | 1 | } |
4944 | ||
4945 | #[test] | |
4946 | 1 | fn canonical_mtext_in_mtable() -> Result<()> { |
4947 | // make sure mtext doesn't go away | |
4948 | 1 | let test_str = r"<math> <mtable> <mtr> <mtd> <mi>L</mi> </mtd> <mtd> <mrow> <mi><mi/></mi> <mo>=</mo> |
4949 | 1 | <mrow> <mo>[</mo> <mtable> <mtext> </mtext> </mtable> <mo>]</mo> </mrow> </mrow> </mtd> </mtr> </mtable> </math>"; |
4950 | 1 | let target_str = r"<math> |
4951 | 1 | <mtable> |
4952 | 1 | <mtr> |
4953 | 1 | <mtd> |
4954 | 1 | <mi>L</mi> |
4955 | 1 | </mtd> |
4956 | 1 | <mtd> |
4957 | 1 | <mrow> |
4958 | 1 | <mi><mi/></mi> |
4959 | 1 | <mo>=</mo> |
4960 | 1 | <mrow> |
4961 | 1 | <mo>[</mo> |
4962 | 1 | <mtable> |
4963 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4964 | 1 | </mtable> |
4965 | 1 | <mo>]</mo> |
4966 | 1 | </mrow> |
4967 | 1 | </mrow> |
4968 | 1 | </mtd> |
4969 | 1 | </mtr> |
4970 | 1 | </mtable> |
4971 | 1 | </math>"; |
4972 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4973 | 1 | } |
4974 | ||
4975 | #[test] | |
4976 | 1 | fn mrow_with_intent_and_single_child() -> Result<()> { |
4977 | use crate::interface::*; | |
4978 | use sxd_document::parser; | |
4979 | use crate::canonicalize::canonicalize; | |
4980 | // this forces initialization | |
4981 | 1 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
4982 | 1 | crate::speech::SPEECH_RULES.with(|_| true); |
4983 | ||
4984 | // we don't want to remove the mrow because the intent on the mi would reference itself | |
4985 | 1 | let test = "<math><mrow intent='log($x)'><mi arg='x'>X</mi></mrow></math>"; |
4986 | ||
4987 | 1 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
4988 | 1 | let mathml = get_element(package1); |
4989 | 1 | trim_element(mathml, false); |
4990 | 1 | let mathml_test = canonicalize(mathml).unwrap(); |
4991 | 1 | let first_child = as_element( mathml_test.children()[0] ); |
4992 | 1 | assert_eq!(name(first_child), "mrow"); |
4993 | 1 | assert_eq!(first_child.children().len(), 1); |
4994 | 1 | let mi = as_element(first_child.children()[0]); |
4995 | 1 | assert_eq!(name(mi), "mi"); |
4996 | 1 | Ok(()) |
4997 | 1 | } |
4998 | ||
4999 | #[test] | |
5000 | 1 | fn empty_mrow_with_intent() -> Result<()> { |
5001 | // we don't want to remove the mrow because the intent on the mi would reference itself | |
5002 | use crate::interface::*; | |
5003 | use sxd_document::parser; | |
5004 | use crate::canonicalize::canonicalize; | |
5005 | // this forces initialization | |
5006 | 1 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
5007 | 1 | crate::speech::SPEECH_RULES.with(|_| true); |
5008 | ||
5009 | // we don't want to remove the mrow because the intent needs to stick around | |
5010 | 1 | let test = "<math><mrow intent='log(x)'/></math>"; |
5011 | ||
5012 | 1 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
5013 | 1 | let mathml = get_element(package1); |
5014 | 1 | trim_element(mathml, false); |
5015 | 1 | let mathml_test = canonicalize(mathml).unwrap(); |
5016 | 1 | let first_child = as_element( mathml_test.children()[0] ); |
5017 | 1 | assert_eq!(name(first_child), "mrow"); |
5018 | 1 | assert_eq!(first_child.children().len(), 1); |
5019 | 1 | let mtext = as_element(first_child.children()[0]); |
5020 | 1 | assert_eq!(name(mtext), "mtext"); |
5021 | 1 | Ok(()) |
5022 | 1 | } |
5023 | ||
5024 | #[test] | |
5025 | 1 | fn mn_with_negative_sign() -> Result<()> { |
5026 | 1 | let test_str = "<math><mfrac> |
5027 | 1 | <mrow><mn>-1</mn></mrow> |
5028 | 1 | <mn>−987</mn> |
5029 | 1 | </mfrac></math>"; |
5030 | 1 | let target_str = "<math><mfrac> |
5031 | 1 | <mrow data-changed='added'><mo>-</mo><mn>1</mn></mrow> |
5032 | 1 | <mrow data-changed='added'><mo>-</mo><mn>987</mn></mrow> |
5033 | 1 | </mfrac></math>"; |
5034 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5035 | 1 | } |
5036 | ||
5037 | #[test] | |
5038 | 1 | fn mn_with_degree_sign() -> Result<()> { |
5039 | 1 | let test_str = "<math> <mrow> <mi>cos</mi> <mo></mo> <mrow> <mo>(</mo> <mn>150°</mn> <mo>)</mo> </mrow> </mrow> </math>"; |
5040 | 1 | let target_str = "<math> |
5041 | 1 | <mrow> |
5042 | 1 | <mi>cos</mi> <mo>⁡</mo> |
5043 | 1 | <mrow> |
5044 | 1 | <mo>(</mo> |
5045 | 1 | <msup data-changed='added'> <mn>150</mn> <mo>°</mo> </msup> |
5046 | 1 | <mo>)</mo> |
5047 | 1 | </mrow> |
5048 | 1 | </mrow> |
5049 | 1 | </math>"; |
5050 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5051 | 1 | } |
5052 | ||
5053 | #[test] | |
5054 | 1 | fn canonical_one_element_mrow_around_mo() -> Result<()> { |
5055 | 1 | let test_str = "<math><mrow><mrow><mo>-</mo></mrow><mi>a</mi></mrow></math>"; |
5056 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
5057 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5058 | 1 | } |
5059 | ||
5060 | #[test] | |
5061 | 1 | fn canonical_flat_to_times_and_plus() -> Result<()> { |
5062 | 1 | let test_str = "<math><mi>c</mi><mo>+</mo><mi>x</mi><mi>y</mi></math>"; |
5063 | 1 | let target_str = "<math> |
5064 | 1 | <mrow data-changed='added'><mi>c</mi><mo>+</mo> |
5065 | 1 | <mrow data-changed='added'><mi>x</mi><mo data-changed='added'>⁢</mo><mi>y</mi></mrow> |
5066 | 1 | </mrow></math>"; |
5067 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5068 | 1 | } |
5069 | ||
5070 | #[test] | |
5071 | 1 | fn canonical_prefix_and_infix() -> Result<()> { |
5072 | 1 | let test_str = "<math><mrow><mo>-</mo><mi>a</mi><mo>-</mo><mi>b</mi></mrow></math>"; |
5073 | 1 | let target_str = "<math> |
5074 | 1 | <mrow> |
5075 | 1 | <mrow data-changed='added'> |
5076 | 1 | <mo>-</mo> |
5077 | 1 | <mi>a</mi> |
5078 | 1 | </mrow> |
5079 | 1 | <mo>-</mo> |
5080 | 1 | <mi>b</mi> |
5081 | 1 | </mrow> |
5082 | 1 | </math>"; |
5083 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5084 | 1 | } |
5085 | ||
5086 | ||
5087 | #[test] | |
5088 | 1 | fn canonical_prefix_implied_times_prefix() -> Result<()> { |
5089 | 1 | let test_str = "<math><mrow><mo>∂</mo><mi>x</mi><mo>∂</mo><mi>y</mi></mrow></math>"; |
5090 | 1 | let target_str = "<math> |
5091 | 1 | <mrow> |
5092 | 1 | <mrow data-changed='added'><mo>∂</mo><mi>x</mi></mrow> |
5093 | 1 | <mo data-changed='added'>⁢</mo> |
5094 | 1 | <mrow data-changed='added'><mo>∂</mo><mi>y</mi></mrow> |
5095 | 1 | </mrow> |
5096 | 1 | </math>"; |
5097 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5098 | 1 | } |
5099 | ||
5100 | #[test] | |
5101 | 1 | fn function_with_single_arg() -> Result<()> { |
5102 | 1 | let test_str = "<math><mrow> |
5103 | 1 | <mi>sin</mi><mo>(</mo><mi>x</mi><mo>)</mo> |
5104 | 1 | <mo>+</mo> |
5105 | 1 | <mi>f</mi><mo>(</mo><mi>x</mi><mo>)</mo> |
5106 | 1 | <mo>+</mo> |
5107 | 1 | <mi>t</mi><mrow><mo>(</mo><mi>x</mi><mo>)</mo></mrow> |
5108 | 1 | </mrow></math>"; |
5109 | 1 | let target_str = "<math> |
5110 | 1 | <mrow> |
5111 | 1 | <mrow data-changed='added'> |
5112 | 1 | <mi>sin</mi> |
5113 | 1 | <mo data-changed='added'>⁡</mo> |
5114 | 1 | <mrow data-changed='added'> |
5115 | 1 | <mo>(</mo> |
5116 | 1 | <mi>x</mi> |
5117 | 1 | <mo>)</mo> |
5118 | 1 | </mrow> |
5119 | 1 | </mrow> |
5120 | 1 | <mo>+</mo> |
5121 | 1 | <mrow data-changed='added'> |
5122 | 1 | <mi>f</mi> |
5123 | 1 | <mo data-changed='added'>⁡</mo> |
5124 | 1 | <mrow data-changed='added'> |
5125 | 1 | <mo>(</mo> |
5126 | 1 | <mi>x</mi> |
5127 | 1 | <mo>)</mo> |
5128 | 1 | </mrow> |
5129 | 1 | </mrow> |
5130 | 1 | <mo>+</mo> |
5131 | 1 | <mrow data-changed='added'> |
5132 | 1 | <mi>t</mi> |
5133 | 1 | <mo data-changed='added'>⁡</mo> |
5134 | 1 | <mrow> |
5135 | 1 | <mo>(</mo> |
5136 | 1 | <mi>x</mi> |
5137 | 1 | <mo>)</mo> |
5138 | 1 | </mrow> |
5139 | 1 | </mrow> |
5140 | 1 | </mrow> |
5141 | 1 | </math>"; |
5142 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5143 | 1 | } |
5144 | ||
5145 | #[test] | |
5146 | 1 | fn maybe_function() -> Result<()> { |
5147 | 1 | let test_str = "<math> |
5148 | 1 | <mrow> |
5149 | 1 | <mi>P</mi> |
5150 | 1 | <mo>(</mo> |
5151 | 1 | <mi>A</mi> |
5152 | 1 | <mo>∩</mo> |
5153 | 1 | <mi>B</mi> |
5154 | 1 | <mo>)</mo> |
5155 | 1 | </mrow> |
5156 | 1 | </math>"; |
5157 | 1 | let target_str = "<math> |
5158 | 1 | <mrow> |
5159 | 1 | <mi>P</mi> |
5160 | 1 | <mo data-function-guess='true' data-changed='added'>⁢</mo> |
5161 | 1 | <mrow data-changed='added'> |
5162 | 1 | <mo>(</mo> |
5163 | 1 | <mrow data-changed='added'> |
5164 | 1 | <mi>A</mi> |
5165 | 1 | <mo>∩</mo> |
5166 | 1 | <mi>B</mi> |
5167 | 1 | </mrow> |
5168 | 1 | <mo>)</mo> |
5169 | 1 | </mrow> |
5170 | 1 | </mrow> |
5171 | 1 | </math>"; |
5172 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5173 | 1 | } |
5174 | ||
5175 | #[test] | |
5176 | 1 | fn function_with_multiple_args() -> Result<()> { |
5177 | 1 | let test_str = "<math> |
5178 | 1 | <mi>sin</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5179 | 1 | <mo>+</mo> |
5180 | 1 | <mi>f</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5181 | 1 | <mo>+</mo> |
5182 | 1 | <mi>t</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5183 | 1 | <mo>+</mo> |
5184 | 1 | <mi>w</mi><mo>(</mo><mi>x</mi><mo>,</mo><mi>y</mi><mo>)</mo> |
5185 | 1 | </math>"; |
5186 | 1 | let target_str = " <math> |
5187 | 1 | <mrow data-changed='added'> |
5188 | 1 | <mrow data-changed='added'> |
5189 | 1 | <mi>sin</mi> |
5190 | 1 | <mo data-changed='added'>⁡</mo> |
5191 | 1 | <mrow data-changed='added'> |
5192 | 1 | <mo>(</mo> |
5193 | 1 | <mrow data-changed='added'> |
5194 | 1 | <mi>x</mi> |
5195 | 1 | <mo>+</mo> |
5196 | 1 | <mi>y</mi> |
5197 | 1 | </mrow> |
5198 | 1 | <mo>)</mo> |
5199 | 1 | </mrow> |
5200 | 1 | </mrow> |
5201 | 1 | <mo>+</mo> |
5202 | 1 | <mrow data-changed='added'> |
5203 | 1 | <mi>f</mi> |
5204 | 1 | <mo data-changed='added'>⁡</mo> |
5205 | 1 | <mrow data-changed='added'> |
5206 | 1 | <mo>(</mo> |
5207 | 1 | <mrow data-changed='added'> |
5208 | 1 | <mi>x</mi> |
5209 | 1 | <mo>+</mo> |
5210 | 1 | <mi>y</mi> |
5211 | 1 | </mrow> |
5212 | 1 | <mo>)</mo> |
5213 | 1 | </mrow> |
5214 | 1 | </mrow> |
5215 | 1 | <mo>+</mo> |
5216 | 1 | <mrow data-changed='added'> |
5217 | 1 | <mi>t</mi> |
5218 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5219 | 1 | <mrow data-changed='added'> |
5220 | 1 | <mo>(</mo> |
5221 | 1 | <mrow data-changed='added'> |
5222 | 1 | <mi>x</mi> |
5223 | 1 | <mo>+</mo> |
5224 | 1 | <mi>y</mi> |
5225 | 1 | </mrow> |
5226 | 1 | <mo>)</mo> |
5227 | 1 | </mrow> |
5228 | 1 | </mrow> |
5229 | 1 | <mo>+</mo> |
5230 | 1 | <mrow data-changed='added'> |
5231 | 1 | <mi>w</mi> |
5232 | 1 | <mo data-changed='added'>⁡</mo> |
5233 | 1 | <mrow data-changed='added'> |
5234 | 1 | <mo>(</mo> |
5235 | 1 | <mrow data-changed='added'> |
5236 | 1 | <mi>x</mi> |
5237 | 1 | <mo>,</mo> |
5238 | 1 | <mi>y</mi> |
5239 | 1 | </mrow> |
5240 | 1 | <mo>)</mo> |
5241 | 1 | </mrow> |
5242 | 1 | </mrow> |
5243 | 1 | </mrow> |
5244 | 1 | </math>"; |
5245 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5246 | 1 | } |
5247 | ||
5248 | #[test] | |
5249 | 1 | fn function_with_no_args() -> Result<()> { |
5250 | 1 | let test_str = "<math><mrow> |
5251 | 1 | <mi>sin</mi><mi>x</mi> |
5252 | 1 | <mo>+</mo> |
5253 | 1 | <mi>f</mi><mi>x</mi> |
5254 | 1 | <mo>+</mo> |
5255 | 1 | <mi>t</mi><mi>x</mi> |
5256 | 1 | </mrow></math>"; |
5257 | 1 | let target_str = " <math> |
5258 | 1 | <mrow> |
5259 | 1 | <mrow data-changed='added'> |
5260 | 1 | <mi>sin</mi> |
5261 | 1 | <mo data-changed='added'>⁡</mo> |
5262 | 1 | <mi>x</mi> |
5263 | 1 | </mrow> |
5264 | 1 | <mo>+</mo> |
5265 | 1 | <mrow data-changed='added'> |
5266 | 1 | <mi>f</mi> |
5267 | 1 | <mo data-changed='added'>⁢</mo> |
5268 | 1 | <mi>x</mi> |
5269 | 1 | </mrow> |
5270 | 1 | <mo>+</mo> |
5271 | 1 | <mrow data-changed='added'> |
5272 | 1 | <mi>t</mi> |
5273 | 1 | <mo data-changed='added'>⁢</mo> |
5274 | 1 | <mi>x</mi> |
5275 | 1 | </mrow> |
5276 | 1 | </mrow> |
5277 | 1 | </math>"; |
5278 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5279 | ||
5280 | 1 | } |
5281 | ||
5282 | ||
5283 | #[test] | |
5284 | 1 | fn function_call_vs_implied_times() -> Result<()> { |
5285 | 1 | let test_str = "<math><mi>f</mi><mo>(</mo><mi>x</mi><mo>)</mo><mi>y</mi></math>"; |
5286 | 1 | let target_str = "<math> |
5287 | 1 | <mrow data-changed='added'> |
5288 | 1 | <mrow data-changed='added'> |
5289 | 1 | <mi>f</mi> |
5290 | 1 | <mo data-changed='added'>⁡</mo> |
5291 | 1 | <mrow data-changed='added'> <mo>(</mo> <mi>x</mi> <mo>)</mo> </mrow> |
5292 | 1 | </mrow> |
5293 | 1 | <mo data-changed='added'>⁢</mo> |
5294 | 1 | <mi>y</mi> </mrow> |
5295 | 1 | </math>"; |
5296 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5297 | 1 | } |
5298 | ||
5299 | #[test] | |
5300 | 1 | fn implied_plus() -> Result<()> { |
5301 | 1 | let test_str = "<math><mrow> |
5302 | 1 | <mn>2</mn><mfrac><mn>3</mn><mn>4</mn></mfrac> |
5303 | 1 | </mrow></math>"; |
5304 | 1 | let target_str = "<math> |
5305 | 1 | <mrow> |
5306 | 1 | <mn>2</mn> |
5307 | 1 | <mo data-changed='added'>⁤</mo> |
5308 | 1 | <mfrac> |
5309 | 1 | <mn>3</mn> |
5310 | 1 | <mn>4</mn> |
5311 | 1 | </mfrac> |
5312 | 1 | </mrow> |
5313 | 1 | </math>"; |
5314 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5315 | 1 | } |
5316 | ||
5317 | #[test] | |
5318 | 1 | fn implied_plus_linear() -> Result<()> { |
5319 | 1 | let test_str = "<math><mrow> |
5320 | 1 | <mn>2</mn><mspace width='0.278em'></mspace><mn>3</mn><mo>/</mo><mn>4</mn> |
5321 | 1 | </mrow></math>"; |
5322 | 1 | let target_str = "<math> |
5323 | 1 | <mrow> |
5324 | 1 | <mn>2</mn> |
5325 | 1 | <mo data-changed='added'>⁤</mo> |
5326 | 1 | <mrow data-changed='added'>> |
5327 | 1 | <mn data-previous-space-width='0.278'>3</mn> |
5328 | 1 | <mo>/</mo> |
5329 | 1 | <mn>4</mn> |
5330 | 1 | </mrow> |
5331 | 1 | </mrow> |
5332 | 1 | </math>"; |
5333 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5334 | 1 | } |
5335 | ||
5336 | #[test] | |
5337 | 1 | fn implied_plus_linear2() -> Result<()> { |
5338 | 1 | let test_str = "<math><mrow> |
5339 | 1 | <mn>2</mn><mrow><mn>3</mn><mo>/</mo><mn>4</mn></mrow> |
5340 | 1 | </mrow></math>"; |
5341 | 1 | let target_str = "<math> |
5342 | 1 | <mrow> |
5343 | 1 | <mn>2</mn> |
5344 | 1 | <mo data-changed='added'>⁤</mo> |
5345 | 1 | <mrow> |
5346 | 1 | <mn>3</mn> |
5347 | 1 | <mo>/</mo> |
5348 | 1 | <mn>4</mn> |
5349 | 1 | </mrow> |
5350 | 1 | </mrow> |
5351 | 1 | </math>"; |
5352 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5353 | 1 | } |
5354 | ||
5355 | #[test] | |
5356 | 1 | fn implied_comma() -> Result<()> { |
5357 | 1 | let test_str = "<math><msub><mi>b</mi><mrow><mn>1</mn><mn>2</mn></mrow></msub></math>"; |
5358 | 1 | let target_str = "<math> |
5359 | 1 | <msub><mi>b</mi><mrow><mn>1</mn><mo data-changed='added'>⁣</mo><mn>2</mn></mrow></msub> |
5360 | 1 | </math>"; |
5361 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5362 | 1 | } |
5363 | ||
5364 | #[test] | |
5365 | 1 | fn no_implied_comma() -> Result<()> { |
5366 | 1 | let test_str = "<math><mfrac><mi>b</mi><mrow><mn>1</mn><mn>2</mn></mrow></mfrac></math>"; |
5367 | 1 | let target_str = "<math> |
5368 | 1 | <mfrac><mi>b</mi><mrow><mn>1</mn><mo data-changed='added'>⁢</mo><mn>2</mn></mrow></mfrac> |
5369 | 1 | </math>"; |
5370 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5371 | 1 | } |
5372 | ||
5373 | #[test] | |
5374 | 1 | fn vertical_bars() -> Result<()> { |
5375 | 1 | let test_str = "<math> |
5376 | 1 | <mo>|</mo> <mi>x</mi> <mo>|</mo><mo>+</mo><mo>|</mo> |
5377 | 1 | <mi>a</mi><mo>+</mo><mn>1</mn> <mo>|</mo> |
5378 | 1 | </math>"; |
5379 | 1 | let target_str = " <math> |
5380 | 1 | <mrow data-changed='added'> |
5381 | 1 | <mrow data-changed='added'> |
5382 | 1 | <mo>|</mo> |
5383 | 1 | <mi>x</mi> |
5384 | 1 | <mo>|</mo> |
5385 | 1 | </mrow> |
5386 | 1 | <mo>+</mo> |
5387 | 1 | <mrow data-changed='added'> |
5388 | 1 | <mo>|</mo> |
5389 | 1 | <mrow data-changed='added'> |
5390 | 1 | <mi>a</mi> |
5391 | 1 | <mo>+</mo> |
5392 | 1 | <mn>1</mn> |
5393 | 1 | </mrow> |
5394 | 1 | <mo>|</mo> |
5395 | 1 | </mrow> |
5396 | 1 | </mrow> |
5397 | 1 | </math>"; |
5398 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5399 | 1 | } |
5400 | ||
5401 | ||
5402 | #[test] | |
5403 | 1 | fn vertical_bars_nested() -> Result<()> { |
5404 | 1 | let test_str = "<math><mo>|</mo><mi>x</mi><mo>|</mo><mi>y</mi><mo>|</mo><mi>z</mi><mo>|</mo></math>"; |
5405 | 1 | let target_str = "<math> |
5406 | 1 | <mrow data-changed='added'> |
5407 | 1 | <mrow data-changed='added'> |
5408 | 1 | <mo>|</mo> |
5409 | 1 | <mi>x</mi> |
5410 | 1 | <mo>|</mo> |
5411 | 1 | </mrow> |
5412 | 1 | <mo data-changed='added'>⁢</mo> |
5413 | 1 | <mi>y</mi> |
5414 | 1 | <mo data-changed='added'>⁢</mo> |
5415 | 1 | <mrow data-changed='added'> |
5416 | 1 | <mo>|</mo> |
5417 | 1 | <mi>z</mi> |
5418 | 1 | <mo>|</mo> |
5419 | 1 | </mrow> |
5420 | 1 | </mrow> |
5421 | 1 | </math>"; |
5422 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5423 | 1 | } |
5424 | ||
5425 | #[test] | |
5426 | 1 | fn double_vertical_bars() -> Result<()> { |
5427 | 1 | let test_str = "<math><mrow><mo>||</mo><mi>x</mi><mo>||</mo><mo>||</mo><mi>y</mi><mo>||</mo></mrow></math>"; |
5428 | 1 | let target_str = "<math> |
5429 | 1 | <mrow> |
5430 | 1 | <mrow data-changed='added'><mo>‖</mo><mi>x</mi><mo>‖</mo></mrow> |
5431 | 1 | <mo data-changed='added'>⁢</mo> |
5432 | 1 | <mrow data-changed='added'><mo>‖</mo><mi>y</mi><mo>‖</mo></mrow> |
5433 | 1 | </mrow> |
5434 | 1 | </math>"; |
5435 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5436 | 1 | } |
5437 | ||
5438 | #[test] | |
5439 | 1 | fn double_vertical_bars_mo() -> Result<()> { |
5440 | 1 | let test_str = "<math><mo>|</mo><mo>|</mo><mi>a</mi><mo>|</mo><mo>|</mo></math>"; |
5441 | 1 | let target_str = "<math><mrow data-changed='added'><mo>‖</mo><mi>a</mi><mo>‖</mo></mrow></math>"; |
5442 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5443 | 1 | } |
5444 | ||
5445 | #[test] | |
5446 | 1 | fn no_double_vertical_bars_mo() -> Result<()> { |
5447 | 1 | let test_str = "<math><mo>|</mo><mi>x</mi><mo>|</mo><mo>|</mo><mi>y</mi><mo>|</mo></math>"; |
5448 | 1 | let target_str = "<math> <mrow data-changed='added'> |
5449 | 1 | <mrow data-changed='added'><mo>|</mo><mi>x</mi><mo>|</mo></mrow> |
5450 | 1 | <mo data-changed='added'>⁢</mo> |
5451 | 1 | <mrow data-changed='added'><mo>|</mo><mi>y</mi><mo>|</mo></mrow> |
5452 | 1 | </mrow> </math>"; |
5453 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5454 | 1 | } |
5455 | ||
5456 | #[test] | |
5457 | 1 | fn vertical_bar_such_that() -> Result<()> { |
5458 | 1 | let test_str = "<math> |
5459 | 1 | <mo>{</mo><mi>x</mi><mo>|</mo><mi>x</mi><mo>∈</mo><mi>S</mi><mo>}</mo> |
5460 | 1 | </math>"; |
5461 | 1 | let target_str = "<math> |
5462 | 1 | <mrow data-changed='added'> |
5463 | 1 | <mo>{</mo> |
5464 | 1 | <mrow data-changed='added'> |
5465 | 1 | <mi>x</mi> |
5466 | 1 | <mo>|</mo> |
5467 | 1 | <mrow data-changed='added'> |
5468 | 1 | <mi>x</mi> |
5469 | 1 | <mo>∈</mo> |
5470 | 1 | <mi>S</mi> |
5471 | 1 | </mrow> |
5472 | 1 | </mrow> |
5473 | 1 | <mo>}</mo> |
5474 | 1 | </mrow> |
5475 | 1 | </math>"; |
5476 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5477 | 1 | } |
5478 | ||
5479 | #[test] | |
5480 | #[ignore] // need to figure out a test for this ("|" should have a precedence around ":" since that is an alternative notation for "such that", but "∣" is higher precedence) | |
5481 | 0 | fn vertical_bar_divides() -> Result<()> { |
5482 | 0 | let test_str = "<math> |
5483 | 0 | <mi>x</mi><mo>+</mo><mi>y</mi> <mo>|</mo><mn>12</mn> |
5484 | 0 | </math>"; |
5485 | 0 | let target_str = "<math> |
5486 | 0 | <mrow data-changed='added'> |
5487 | 0 | <mrow data-changed='added'> |
5488 | 0 | <mi>x</mi> |
5489 | 0 | <mo>+</mo> |
5490 | 0 | <mi>y</mi> |
5491 | 0 | </mrow> |
5492 | 0 | <mo>∣ <!--divides--></mo> |
5493 | 0 | <mn>12</mn> |
5494 | 0 | </mrow> |
5495 | 0 | </math>"; |
5496 | 0 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5497 | 0 | } |
5498 | ||
5499 | ||
5500 | #[test] | |
5501 | 1 | fn trig_mo() -> Result<()> { |
5502 | 1 | let test_str = "<math><mo>sin</mo><mi>x</mi> |
5503 | 1 | <mo>+</mo><mo>cos</mo><mi>y</mi> |
5504 | 1 | <mo>+</mo><munder><mo>lim</mo><mi>D</mi></munder><mi>y</mi> |
5505 | 1 | </math>"; |
5506 | 1 | let target_str = "<math> |
5507 | 1 | <mrow data-changed='added'> |
5508 | 1 | <mrow data-changed='added'> |
5509 | 1 | <mi>sin</mi> |
5510 | 1 | <mo data-changed='added'>⁡</mo> |
5511 | 1 | <mi>x</mi> |
5512 | 1 | </mrow> |
5513 | 1 | <mo>+</mo> |
5514 | 1 | <mrow data-changed='added'> |
5515 | 1 | <mi>cos</mi> |
5516 | 1 | <mo data-changed='added'>⁡</mo> |
5517 | 1 | <mi>y</mi> |
5518 | 1 | </mrow> |
5519 | 1 | <mo>+</mo> |
5520 | 1 | <mrow data-changed='added'> |
5521 | 1 | <munder> |
5522 | 1 | <mi>lim</mi> |
5523 | 1 | <mi>D</mi> |
5524 | 1 | </munder> |
5525 | 1 | <mo data-changed='added'>⁡</mo> |
5526 | 1 | <mi>y</mi> |
5527 | 1 | </mrow> |
5528 | 1 | </mrow> |
5529 | 1 | </math>"; |
5530 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5531 | 1 | } |
5532 | ||
5533 | #[test] | |
5534 | 1 | fn trig_mtext() -> Result<()> { |
5535 | 1 | let test_str = "<math><mtext>sin</mtext><mi>x</mi> |
5536 | 1 | <mo>+</mo><mtext>cos</mtext><mi>y</mi> |
5537 | 1 | <mo>+</mo><munder><mtext>lim</mtext><mi>D</mi></munder><mi>y</mi> |
5538 | 1 | </math>"; |
5539 | 1 | let target_str = "<math> |
5540 | 1 | <mrow data-changed='added'> |
5541 | 1 | <mrow data-changed='added'> |
5542 | 1 | <mi>sin</mi> |
5543 | 1 | <mo data-changed='added'>⁡</mo> |
5544 | 1 | <mi>x</mi> |
5545 | 1 | </mrow> |
5546 | 1 | <mo>+</mo> |
5547 | 1 | <mrow data-changed='added'> |
5548 | 1 | <mi>cos</mi> |
5549 | 1 | <mo data-changed='added'>⁡</mo> |
5550 | 1 | <mi>y</mi> |
5551 | 1 | </mrow> |
5552 | 1 | <mo>+</mo> |
5553 | 1 | <mrow data-changed='added'> |
5554 | 1 | <munder> |
5555 | 1 | <mi>lim</mi> |
5556 | 1 | <mi>D</mi> |
5557 | 1 | </munder> |
5558 | 1 | <mo data-changed='added'>⁡</mo> |
5559 | 1 | <mi>y</mi> |
5560 | 1 | </mrow> |
5561 | 1 | </mrow> |
5562 | 1 | </math>"; |
5563 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5564 | 1 | } |
5565 | ||
5566 | #[test] | |
5567 | 1 | fn trig_negative_args() -> Result<()> { |
5568 | 1 | let test_str = "<math><mi>sin</mi><mo>-</mo><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5569 | 1 | let target_str = "<math> |
5570 | 1 | <mrow data-changed='added'> |
5571 | 1 | <mi>sin</mi> |
5572 | 1 | <mo data-changed='added'>⁡</mo> |
5573 | 1 | <mrow data-changed='added'> |
5574 | 1 | <mrow data-changed='added'> |
5575 | 1 | <mo>-</mo> |
5576 | 1 | <mn>2</mn> |
5577 | 1 | </mrow> |
5578 | 1 | <mo data-changed='added'>⁢</mo> |
5579 | 1 | <mi>π</mi> |
5580 | 1 | <mo data-changed='added'>⁢</mo> |
5581 | 1 | <mi>x</mi> |
5582 | 1 | </mrow> |
5583 | 1 | </mrow> |
5584 | 1 | </math>"; |
5585 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5586 | 1 | } |
5587 | ||
5588 | #[test] | |
5589 | 1 | fn not_trig_negative_args() -> Result<()> { |
5590 | // this is here to make sure that only trig functions get the special treatment | |
5591 | 1 | let test_str = "<math><mi>ker</mi><mo>-</mo><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5592 | 1 | let target_str = "<math> |
5593 | 1 | <mrow data-changed='added'> |
5594 | 1 | <mrow data-changed='added'> |
5595 | 1 | <mi>ker</mi> |
5596 | 1 | <mo data-changed='added'>⁡</mo> |
5597 | 1 | <mrow data-changed='added'> |
5598 | 1 | <mo>-</mo> |
5599 | 1 | <mn>2</mn> |
5600 | 1 | </mrow> |
5601 | 1 | </mrow> |
5602 | 1 | <mo data-changed='added'>⁢</mo> |
5603 | 1 | <mi>π</mi> |
5604 | 1 | <mo data-changed='added'>⁢</mo> |
5605 | 1 | <mi>x</mi> |
5606 | 1 | </mrow> |
5607 | 1 | </math>"; |
5608 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5609 | 1 | } |
5610 | ||
5611 | #[test] | |
5612 | 1 | fn trig_args() -> Result<()> { |
5613 | 1 | let test_str = "<math><mi>sin</mi><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5614 | 1 | let target_str = "<math> |
5615 | 1 | <mrow data-changed='added'> |
5616 | 1 | <mi>sin</mi> |
5617 | 1 | <mo data-changed='added'>⁡</mo> |
5618 | 1 | <mrow data-changed='added'> |
5619 | 1 | <mn>2</mn> |
5620 | 1 | <mo data-changed='added'>⁢</mo> |
5621 | 1 | <mi>π</mi> |
5622 | 1 | <mo data-changed='added'>⁢</mo> |
5623 | 1 | <mi>x</mi> |
5624 | 1 | </mrow> |
5625 | 1 | </mrow> |
5626 | 1 | </math>"; |
5627 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5628 | 1 | } |
5629 | ||
5630 | #[test] | |
5631 | 1 | fn not_trig_args() -> Result<()> { |
5632 | // this is here to make sure that only trig functions get the special treatment | |
5633 | 1 | let test_str = "<math><mi>ker</mi><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5634 | 1 | let target_str = "<math> |
5635 | 1 | <mrow data-changed='added'> |
5636 | 1 | <mrow data-changed='added'> |
5637 | 1 | <mi>ker</mi> |
5638 | 1 | <mo data-changed='added'>⁡</mo> |
5639 | 1 | <mn>2</mn> |
5640 | 1 | </mrow> |
5641 | 1 | <mo data-changed='added'>⁢</mo> |
5642 | 1 | <mi>π</mi> |
5643 | 1 | <mo data-changed='added'>⁢</mo> |
5644 | 1 | <mi>x</mi> |
5645 | 1 | </mrow> |
5646 | 1 | </math>"; |
5647 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5648 | 1 | } |
5649 | ||
5650 | #[test] | |
5651 | 1 | fn trig_trig() -> Result<()> { |
5652 | 1 | let test_str = "<math><mi>sin</mi><mi>x</mi><mi>cos</mi><mi>y</mi></math>"; |
5653 | 1 | let target_str = "<math> |
5654 | 1 | <mrow data-changed='added'> |
5655 | 1 | <mrow data-changed='added'> |
5656 | 1 | <mi>sin</mi> |
5657 | 1 | <mo data-changed='added'>⁡</mo> |
5658 | 1 | <mi>x</mi> |
5659 | 1 | </mrow> |
5660 | 1 | <mo data-changed='added'>⁢</mo> |
5661 | 1 | <mrow data-changed='added'> |
5662 | 1 | <mi>cos</mi> |
5663 | 1 | <mo data-changed='added'>⁡</mo> |
5664 | 1 | <mi>y</mi> |
5665 | 1 | </mrow> |
5666 | 1 | </mrow> |
5667 | 1 | </math>"; |
5668 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5669 | 1 | } |
5670 | ||
5671 | #[test] | |
5672 | 1 | fn trig_function_composition() -> Result<()> { |
5673 | 1 | let test_str = "<math><mo>(</mo><mi>sin</mi><mo>-</mo><mi>cos</mi><mo>)</mo><mi>x</mi></math>"; |
5674 | 1 | let target_str = "<math> |
5675 | 1 | <mrow data-changed='added'> |
5676 | 1 | <mrow data-changed='added'> |
5677 | 1 | <mo>(</mo> |
5678 | 1 | <mrow data-changed='added'> |
5679 | 1 | <mi>sin</mi> |
5680 | 1 | <mo>-</mo> |
5681 | 1 | <mi>cos</mi> |
5682 | 1 | </mrow> |
5683 | 1 | <mo>)</mo> |
5684 | 1 | </mrow> |
5685 | 1 | <mo data-changed='added'>⁢</mo> |
5686 | 1 | <mi>x</mi> |
5687 | 1 | </mrow> |
5688 | 1 | </math>"; |
5689 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5690 | 1 | } |
5691 | ||
5692 | ||
5693 | #[test] | |
5694 | 1 | fn currency_in_leaf_prefix() -> Result<()> { |
5695 | 1 | let test_str = "<math><mn>$8.54</mn></math>"; |
5696 | 1 | let target_str = "<math> |
5697 | 1 | <mrow data-changed='added'> |
5698 | 1 | <mi>$</mi> |
5699 | 1 | <mo data-changed='added'>⁢</mo> |
5700 | 1 | <mn>8.54</mn> |
5701 | 1 | </mrow> |
5702 | 1 | </math>"; |
5703 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5704 | 1 | } |
5705 | ||
5706 | #[test] | |
5707 | 1 | fn currency_in_leaf_postfix() -> Result<()> { |
5708 | 1 | let test_str = "<math><mn>188,23€</mn></math>"; |
5709 | 1 | let target_str = " <math> |
5710 | 1 | <mrow data-changed='added'> |
5711 | 1 | <mo data-changed='added'>⁢</mo> |
5712 | 1 | <mn>188,23</mn> |
5713 | 1 | <mo data-changed='added'>⁢</mo> |
5714 | 1 | <mi>€</mi> |
5715 | 1 | </mrow> |
5716 | 1 | </math>"; |
5717 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ",") |
5718 | 1 | } |
5719 | ||
5720 | #[test] | |
5721 | 1 | fn currency_in_leaf_infix() -> Result<()> { |
5722 | 1 | let test_str = "<math><mn>1€23</mn></math>"; |
5723 | 1 | let target_str = " <math> |
5724 | 1 | <mrow data-changed='added'> |
5725 | 1 | <mn>1</mn> |
5726 | 1 | <mo data-changed='added'>⁢</mo> |
5727 | 1 | <mi>€</mi> |
5728 | 1 | <mo data-changed='added'>⁢</mo> |
5729 | 1 | <mn>23</mn> |
5730 | 1 | </mrow> |
5731 | 1 | </math>"; |
5732 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ",") |
5733 | 1 | } |
5734 | ||
5735 | #[test] | |
5736 | 1 | fn mtext_whitespace_string() -> Result<()> { |
5737 | 1 | let test_str = "<math><mi>t</mi><mtext>  </mtext></math>"; |
5738 | 1 | let target_str = "<math><mi data-following-space-width='0.922'>t</mi></math>"; |
5739 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5740 | 1 | } |
5741 | ||
5742 | #[test] | |
5743 | 1 | fn mtext_whitespace_string_before() -> Result<()> { |
5744 | 1 | let test_str = "<math><mtext>  </mtext><mi>t</mi></math>"; |
5745 | 1 | let target_str = "<math><mi data-previous-space-width='0.922'>t</mi></math>"; |
5746 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5747 | 1 | } |
5748 | ||
5749 | #[test] | |
5750 | 1 | fn mtext_whitespace_1() -> Result<()> { |
5751 | 1 | let test_str = "<math><mi>t</mi><mtext>  </mtext> |
5752 | 1 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5753 | 1 | let target_str = " <math> |
5754 | 1 | <mrow data-changed='added'> |
5755 | 1 | <mi>t</mi> |
5756 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5757 | 1 | <mrow data-previous-space-width='0.922'> |
5758 | 1 | <mo>(</mo> |
5759 | 1 | <mrow data-changed='added'> |
5760 | 1 | <mi>x</mi> |
5761 | 1 | <mo>+</mo> |
5762 | 1 | <mi>y</mi> |
5763 | 1 | </mrow> |
5764 | 1 | <mo>)</mo> |
5765 | 1 | </mrow> |
5766 | 1 | </mrow> |
5767 | 1 | </math>"; |
5768 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5769 | 1 | } |
5770 | ||
5771 | #[test] | |
5772 | 1 | fn mtext_whitespace_2() -> Result<()> { |
5773 | 1 | let test_str = "<math><mi>f</mi><mtext>  </mtext> |
5774 | 1 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5775 | 1 | let target_str = " <math> |
5776 | 1 | <mrow data-changed='added'> |
5777 | 1 | <mi>f</mi> |
5778 | 1 | <mo data-changed='added'>⁡</mo> |
5779 | 1 | <mrow data-previous-space-width='0.922'> |
5780 | 1 | <mo>(</mo> |
5781 | 1 | <mrow data-changed='added'> |
5782 | 1 | <mi>x</mi> |
5783 | 1 | <mo>+</mo> |
5784 | 1 | <mi>y</mi> |
5785 | 1 | </mrow> |
5786 | 1 | <mo>)</mo> |
5787 | 1 | </mrow> |
5788 | 1 | </mrow> |
5789 | 1 | </math>"; |
5790 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5791 | 1 | } |
5792 | ||
5793 | #[test] | |
5794 | 1 | fn remove_mtext_whitespace_3() -> Result<()> { |
5795 | 1 | let test_str = "<math><mi>t</mi> |
5796 | 1 | <mrow><mtext> </mtext><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5797 | 1 | let target_str = "<math> |
5798 | 1 | <mrow data-changed='added'> |
5799 | 1 | <mi>t</mi> |
5800 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5801 | 1 | <mrow> |
5802 | 1 | <mo data-previous-space-width='0.167'>(</mo> |
5803 | 1 | <mrow data-changed='added'> |
5804 | 1 | <mi>x</mi> |
5805 | 1 | <mo>+</mo> |
5806 | 1 | <mi>y</mi> |
5807 | 1 | </mrow> |
5808 | 1 | <mo>)</mo> |
5809 | 1 | </mrow> |
5810 | 1 | </mrow> |
5811 | 1 | </math>"; |
5812 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5813 | 1 | } |
5814 | ||
5815 | #[test] | |
5816 | 1 | fn do_not_remove_any_whitespace() -> Result<()> { |
5817 | 1 | let test_str = "<math><mfrac> |
5818 | 1 | <mrow><mspace width='3em'/></mrow> |
5819 | 1 | <mtext> </mtext> |
5820 | 1 | </mfrac></math>"; |
5821 | 1 | let target_str = " <math> |
5822 | 1 | <mfrac> |
5823 | 1 | <mtext width='3em' data-changed='was-mspace' data-width='3' data-empty-in-2D='true'> </mtext> |
5824 | 1 | <mtext data-width='0.167' data-empty-in-2D='true'> </mtext> |
5825 | 1 | </mfrac> |
5826 | 1 | </math>"; |
5827 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5828 | 1 | } |
5829 | ||
5830 | #[test] | |
5831 | 1 | fn remove_mo_whitespace() -> Result<()> { |
5832 | 1 | let test_str = "<math><mi>cos</mi><mo> </mo><mi>x</mi></math>"; |
5833 | 1 | let target_str = "<math> |
5834 | 1 | <mrow data-changed='added'> |
5835 | 1 | <mi>cos</mi> |
5836 | 1 | <mo data-changed='added'>⁡</mo> |
5837 | 1 | <mi data-previous-space-width='0.7'>x</mi> |
5838 | 1 | </mrow> |
5839 | 1 | </math>"; |
5840 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5841 | 1 | } |
5842 | ||
5843 | #[test] | |
5844 | 1 | fn do_not_remove_some_whitespace() -> Result<()> { |
5845 | 1 | let test_str = "<math><mroot> |
5846 | 1 | <mrow><mi>b</mi><mphantom><mi>y</mi></mphantom></mrow> |
5847 | 1 | <mtext> </mtext> |
5848 | 1 | </mroot></math>"; |
5849 | 1 | let target_str = "<math><mroot> |
5850 | 1 | <mi>b</mi> |
5851 | 1 | <mtext data-empty-in-2D='true' data-width='0.167'> </mtext> |
5852 | 1 | </mroot></math>"; |
5853 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5854 | 1 | } |
5855 | ||
5856 | #[test] | |
5857 | 1 | fn remove_all_extra_elements() -> Result<()> { |
5858 | 1 | let test_str = "<math><msqrt> |
5859 | 1 | <mstyle> <mi>b</mi> </mstyle> |
5860 | 1 | <mphantom><mi>y</mi></mphantom> |
5861 | 1 | <mtext> </mtext> |
5862 | 1 | <mspace width='3em'/> |
5863 | 1 | </msqrt></math>"; |
5864 | 1 | let target_str = "<math><msqrt> |
5865 | 1 | <mi data-following-space-width='3.167'>b</mi> |
5866 | 1 | </msqrt></math>"; |
5867 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5868 | 1 | } |
5869 | ||
5870 | #[test] | |
5871 | 1 | fn empty_content() -> Result<()> { |
5872 | 1 | let test_str = "<math></math>"; |
5873 | 1 | let target_str = " <math><mtext data-added='missing-content' data-width='0.700'> </mtext></math>"; |
5874 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5875 | 1 | } |
5876 | ||
5877 | #[test] | |
5878 | 1 | fn empty_content_after_cleanup() -> Result<()> { |
5879 | 1 | let test_str = "<math><mrow><mphantom><mn>1</mn></mphantom></mrow></math>"; |
5880 | 1 | let target_str = " <math><mtext data-added='missing-content' data-width='0'> </mtext></math>"; |
5881 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5882 | 1 | } |
5883 | ||
5884 | #[test] | |
5885 | 1 | fn empty_content_fix_num_children() -> Result<()> { |
5886 | 1 | let test_str = " <math><mfrac><menclose notation='box'><mrow/></menclose><mrow/></mfrac></math>"; |
5887 | 1 | let target_str = "<math> |
5888 | 1 | <mfrac> |
5889 | 1 | <menclose notation='box'> |
5890 | 1 | <mtext data-added='missing-content' data-empty-in-2D='true' data-width='0'> </mtext> |
5891 | 1 | </menclose> |
5892 | 1 | <mtext data-changed='empty_content' data-empty-in-2D='true' data-width='0'> </mtext> |
5893 | 1 | </mfrac> |
5894 | 1 | </math>"; |
5895 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5896 | 1 | } |
5897 | ||
5898 | ||
5899 | #[test] | |
5900 | 1 | fn clean_semantics() -> Result<()> { |
5901 | // this comes from LateXML | |
5902 | 1 | let test_str = "<math> |
5903 | 1 | <semantics> |
5904 | 1 | <mrow><mi>z</mi></mrow> |
5905 | 1 | <annotation-xml encoding='MathML-Content'> |
5906 | 1 | <ci>𝑧</ci> |
5907 | 1 | </annotation-xml> |
5908 | 1 | <annotation encoding='application/x-tex'>z</annotation> |
5909 | 1 | <annotation encoding='application/x-llamapun'>italic_z</annotation> |
5910 | 1 | </semantics> |
5911 | 1 | </math>"; |
5912 | // the annotation-xml value is very touchy and must exactly match what mml-to-string() generates for the test to pass | |
5913 | 1 | let target_str = " <math> |
5914 | 1 | <mi data-annotation-xml-MathML-Content=' <annotation-xml encoding='MathML-Content'> |
5915 | 1 | <ci>𝑧</ci> |
5916 | 1 | </annotation-xml> |
5917 | 1 | ' data-annotation-application_slash_x-tex='z' data-annotation-application_slash_x-llamapun='italic_z'>z</mi> |
5918 | 1 | </math>"; |
5919 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5920 | 1 | } |
5921 | ||
5922 | #[test] | |
5923 | 1 | fn clean_up_mi_operator() -> Result<()> { |
5924 | 1 | let test_str = "<math><mrow><mi>∠</mi><mi>A</mi><mi>B</mi><mi>C</mi></mrow></math>"; |
5925 | 1 | let target_str = " <math> |
5926 | 1 | <mrow> |
5927 | 1 | <mo>∠</mo> |
5928 | 1 | <mrow data-changed='added'> |
5929 | 1 | <mi>A</mi> |
5930 | 1 | <mo data-changed='added'>⁣</mo> |
5931 | 1 | <mi>B</mi> |
5932 | 1 | <mo data-changed='added'>⁣</mo> |
5933 | 1 | <mi>C</mi> |
5934 | 1 | </mrow> |
5935 | 1 | </mrow> |
5936 | 1 | </math>"; |
5937 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5938 | 1 | } |
5939 | ||
5940 | ||
5941 | #[test] | |
5942 | 1 | fn clean_up_arc() -> Result<()> { |
5943 | 1 | let test_str = "<math><mtext>arc </mtext><mi>cos</mi><mi>x</mi></math>"; |
5944 | 1 | let target_str = "<math> |
5945 | 1 | <mrow data-changed='added'> |
5946 | 1 | <mi>arccos</mi> |
5947 | 1 | <mo data-changed='added'>⁡</mo> |
5948 | 1 | <mi>x</mi> |
5949 | 1 | </mrow> |
5950 | 1 | </math>"; |
5951 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5952 | 1 | } |
5953 | ||
5954 | #[test] | |
5955 | 1 | fn clean_up_arc_nospace() -> Result<()> { |
5956 | 1 | let test_str = "<math><mtext>arc</mtext><mi>cos</mi><mi>x</mi></math>"; |
5957 | 1 | let target_str = "<math> |
5958 | 1 | <mrow data-changed='added'> |
5959 | 1 | <mi>arccos</mi> |
5960 | 1 | <mo data-changed='added'>⁡</mo> |
5961 | 1 | <mi>x</mi> |
5962 | 1 | </mrow> |
5963 | 1 | </math>"; |
5964 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5965 | 1 | } |
5966 | ||
5967 | #[test] | |
5968 | 1 | fn roman_numeral() -> Result<()> { |
5969 | 1 | let test_str = "<math><mrow><mtext>XLVIII</mtext> <mo>+</mo><mn>mmxxvi</mn></mrow></math>"; |
5970 | // turns out there is no need to mark them as Roman Numerals -- thought that was need for braille | |
5971 | 1 | let target_str = "<math><mrow> |
5972 | 1 | <mn data-roman-numeral='true' data-number='48'>XLVIII</mn> <mo>+</mo><mn data-roman-numeral='true' data-number='2026'>mmxxvi</mn> |
5973 | 1 | </mrow></math>"; |
5974 | // let target_str = "<math><mrow><mtext>XLVIII</mtext> <mo>+</mo><mn>mmxxvi</mn></mrow></math>"; | |
5975 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5976 | 1 | } |
5977 | ||
5978 | // #[test] | |
5979 | // fn roman_numeral_context() { | |
5980 | // let test_str = "<math><mi>vi</mi><mo>-</mo><mi mathvariant='normal'>i</mi><mo>=</mo><mtext>v</mtext></math>"; | |
5981 | // let target_str = "<math> <mrow data-changed='added'> | |
5982 | // <mrow data-changed='added'><mn data-roman-numeral='true'>vi</mn><mo>-</mo><mn mathvariant='normal' data-roman-numeral='true'>i</mn></mrow> | |
5983 | // <mo>=</mo> <mn data-roman-numeral='true'>v</mn> | |
5984 | // </mrow> </math>"; | |
5985 | // are_strs_canonically_equal_result(test_str, target_str, &[]) | |
5986 | // } | |
5987 | ||
5988 | #[test] | |
5989 | 1 | fn not_roman_numeral() -> Result<()> { |
5990 | 1 | let test_str = "<math><mtext>cm</mtext></math>"; |
5991 | // shouldn't change | |
5992 | 1 | let target_str = "<math><mtext>cm</mtext></math>"; |
5993 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5994 | 1 | } |
5995 | ||
5996 | #[test] | |
5997 | 1 | fn digit_block_binary() -> Result<()> { |
5998 | 1 | let test_str = "<math><mo>(</mo><mn>0110</mn><mspace width=\"thickmathspace\"></mspace><mn>1110</mn><mspace width=\"thickmathspace\"></mspace><mn>0110</mn><mo>)</mo></math>"; |
5999 | 1 | let target_str = " <math> |
6000 | 1 | <mrow data-changed='added'> |
6001 | 1 | <mo>(</mo> |
6002 | 1 | <mn>0110\u{00A0}1110\u{00A0}0110</mn> |
6003 | 1 | <mo>)</mo> |
6004 | 1 | </mrow> |
6005 | 1 | </math>"; |
6006 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6007 | 1 | } |
6008 | ||
6009 | #[test] | |
6010 | 1 | fn digit_block_decimal() -> Result<()> { |
6011 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>123</mn><mo>,</mo><mn>456</mn><mo>+</mo> |
6012 | 1 | <mn>4</mn><mo>.</mo><mn>32</mn></math>"; |
6013 | 1 | let target_str = " <math> |
6014 | 1 | <mrow data-changed='added'> |
6015 | 1 | <mn>8,123,456</mn> |
6016 | 1 | <mo>+</mo> |
6017 | 1 | <mn>4.32</mn> |
6018 | 1 | </mrow> |
6019 | 1 | </math>"; |
6020 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6021 | 1 | } |
6022 | #[test] | |
6023 | 1 | fn digit_block_comma() -> Result<()> { |
6024 | 1 | let test_str = "<math><mn>8</mn><mo>.</mo><mn>123</mn><mo>.</mo><mn>456</mn><mo>+</mo> |
6025 | 1 | <mn>4</mn><mo>,</mo><mn>32</mn></math>"; |
6026 | 1 | let target_str = " <math> |
6027 | 1 | <mrow data-changed='added'> |
6028 | 1 | <mn>8.123.456</mn> |
6029 | 1 | <mo>+</mo> |
6030 | 1 | <mn>4,32</mn> |
6031 | 1 | </mrow> |
6032 | 1 | </math>"; |
6033 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6034 | 1 | } |
6035 | ||
6036 | #[test] | |
6037 | 1 | fn digit_block_int() -> Result<()> { |
6038 | 1 | let test_str = "<math><mn>12</mn><mo>,</mo><mn>345</mn><mo>+</mo> |
6039 | 1 | <mn>1</mn><mo>,</mo><mn>000</mn></math>"; |
6040 | 1 | let target_str = " <math> |
6041 | 1 | <mrow data-changed='added'> |
6042 | 1 | <mn>12,345</mn> |
6043 | 1 | <mo>+</mo> |
6044 | 1 | <mn>1,000</mn> |
6045 | 1 | </mrow> |
6046 | 1 | </math>"; |
6047 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6048 | 1 | } |
6049 | ||
6050 | #[test] | |
6051 | 1 | fn digit_block_non_ascii_int() -> Result<()> { |
6052 | 1 | let test_str = "<math><mn>𝟏𝟐</mn><mo>,</mo><mn>3𝟰𝟻</mn><mo>+</mo> |
6053 | 1 | <mn>𝟙</mn><mo>,</mo><mn>𝟬𝟬𝟬</mn></math>"; |
6054 | 1 | let target_str = " <math> |
6055 | 1 | <mrow data-changed='added'> |
6056 | 1 | <mn>𝟏𝟐,3𝟰𝟻</mn> |
6057 | 1 | <mo>+</mo> |
6058 | 1 | <mn>𝟙,𝟬𝟬𝟬</mn> |
6059 | 1 | </mrow> |
6060 | 1 | </math>"; |
6061 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6062 | 1 | } |
6063 | ||
6064 | #[test] | |
6065 | 1 | fn digit_block_int_dots() -> Result<()> { |
6066 | 1 | let test_str = "<math><mn>12</mn><mo>.</mo><mn>345</mn><mo>+</mo> |
6067 | 1 | <mn>1</mn><mo>.</mo><mn>000</mn></math>"; |
6068 | 1 | let target_str = " <math> |
6069 | 1 | <mrow data-changed='added'> |
6070 | 1 | <mn>12.345</mn> |
6071 | 1 | <mo>+</mo> |
6072 | 1 | <mn>1.000</mn> |
6073 | 1 | </mrow> |
6074 | 1 | </math>"; |
6075 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6076 | 1 | } |
6077 | ||
6078 | #[test] | |
6079 | 1 | fn digit_block_decimal_pt() -> Result<()> { |
6080 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>123</mn><mo>.</mo> |
6081 | 1 | <mo>+</mo><mn>4</mn><mo>.</mo> |
6082 | 1 | <mo>+</mo><mo>.</mo><mn>01</mn></math>"; |
6083 | 1 | let target_str = " <math> |
6084 | 1 | <mrow data-changed='added'> |
6085 | 1 | <mn>8,123.</mn> |
6086 | 1 | <mo>+</mo> |
6087 | 1 | <mn>4.</mn> |
6088 | 1 | <mo>+</mo> |
6089 | 1 | <mn>.01</mn> |
6090 | 1 | </mrow> |
6091 | 1 | </math>"; |
6092 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6093 | 1 | } |
6094 | ||
6095 | #[test] | |
6096 | 1 | fn number_with_decimal_pt() -> Result<()> { |
6097 | // this is output from WIRIS for "12.3" | |
6098 | 1 | let test_str = "<math><mn>12</mn><mo>.</mo><mn>3</mn></math>"; |
6099 | 1 | let target_str = "<math><mn>12.3</mn></math>"; |
6100 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6101 | 1 | } |
6102 | ||
6103 | #[test] | |
6104 | 1 | fn number_with_comma_decimal_pt() -> Result<()> { |
6105 | // this is output from WIRIS for "12.3" | |
6106 | 1 | let test_str = "<math><mn>12</mn><mo>,</mo><mn>3</mn></math>"; |
6107 | 1 | let target_str = "<math><mn>12,3</mn></math>"; |
6108 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6109 | 1 | } |
6110 | ||
6111 | #[test] | |
6112 | 1 | fn addition_with_decimal_point_at_end() -> Result<()> { |
6113 | // in this case, the trailing "." is probably a decimal point" -- testing special case combine the "." | |
6114 | // this comes from WIRIS | |
6115 | 1 | let test_str = "<math><mn>1</mn><mo>.</mo><mn>3</mn><mo>+</mo><mn>2</mn><mo>.</mo></math>"; |
6116 | 1 | let target_str = "<math><mrow data-changed='added'><mn>1.3</mn><mo>+</mo><mn>2.</mn></mrow></math>"; |
6117 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6118 | 1 | } |
6119 | ||
6120 | #[test] | |
6121 | 1 | fn addition_with_decimal_point_at_end_and_comma_decimal_separator() -> Result<()> { |
6122 | // in this case, the trailing "." is probably a decimal point" -- testing special case combine the "." | |
6123 | // this comes from WIRIS | |
6124 | 1 | let test_str = "<math><mn>1</mn><mo>,</mo><mn>3</mn><mo>+</mo><mn>2</mn><mo>,</mo></math>"; |
6125 | 1 | let target_str = "<math><mrow data-changed='added'><mn>1,3</mn><mo>+</mo><mn>2,</mn></mrow></math>"; |
6126 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6127 | 1 | } |
6128 | ||
6129 | #[test] | |
6130 | 1 | fn sequence_with_period() -> Result<()> { |
6131 | // in this case, we don't want "5." -- testing special case to avoid combining the period. | |
6132 | 1 | let test_str = "<math><mn>1</mn><mo>,</mo><mn>3</mn><mo>,</mo><mn>5</mn><mo>.</mo></math>"; |
6133 | 1 | let target_str = "<math><mrow data-changed='added'> |
6134 | 1 | <mrow data-changed='added'><mn>1</mn><mo>,</mo><mn>3</mn><mo>,</mo><mn>5</mn></mrow><mo>.</mo> |
6135 | 1 | </mrow></math>"; |
6136 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6137 | 1 | } |
6138 | ||
6139 | #[test] | |
6140 | 1 | fn addition_decimal_pt() -> Result<()> { |
6141 | 1 | let test_str = "<math><mo>.</mo><mn>4</mn><mo>=</mo><mn>0</mn><mo>.</mo><mn>4</mn></math>"; |
6142 | 1 | let target_str = "<math><mrow data-changed='added'><mn>.4</mn><mo>=</mo><mn>0.4</mn></mrow></math>"; |
6143 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6144 | 1 | } |
6145 | ||
6146 | #[test] | |
6147 | 1 | fn fraction_decimal_pt() -> Result<()> { |
6148 | 1 | let test_str = "<math><mfrac><mrow><mn>1</mn><mo>.</mo></mrow><mrow><mn>2</mn><mo>.</mo></mrow></mfrac></math>"; |
6149 | 1 | let target_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6150 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6151 | 1 | } |
6152 | ||
6153 | #[test] | |
6154 | 1 | fn fraction_decimal_pt_no_split() -> Result<()> { |
6155 | // don't split off the '.' | |
6156 | 1 | let test_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6157 | 1 | let target_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6158 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6159 | 1 | } |
6160 | ||
6161 | #[test] | |
6162 | 1 | fn not_digit_block_parens() -> Result<()> { |
6163 | 1 | let test_str = "<math><mo>(</mo><mn>451</mn><mo>,</mo><mn>231</mn><mo>)</mo></math>"; |
6164 | 1 | let target_str = " <math> <mrow data-changed='added'> |
6165 | 1 | <mo>(</mo> |
6166 | 1 | <mrow data-changed='added'> |
6167 | 1 | <mn>451</mn> <mo>,</mo> <mn>231</mn> |
6168 | 1 | </mrow> |
6169 | 1 | <mo>)</mo> |
6170 | 1 | </mrow></math>"; |
6171 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6172 | 1 | } |
6173 | ||
6174 | #[test] | |
6175 | 1 | fn not_digit_block_parens_mrow() -> Result<()> { |
6176 | 1 | let test_str = "<math><mo>(</mo><mrow><mn>451</mn><mo>,</mo><mn>231</mn></mrow><mo>)</mo></math>"; |
6177 | 1 | let target_str = " <math> <mrow data-changed='added'> |
6178 | 1 | <mo>(</mo> |
6179 | 1 | <mrow> |
6180 | 1 | <mn>451</mn> <mo>,</mo> <mn>231</mn> |
6181 | 1 | </mrow> |
6182 | 1 | <mo>)</mo> |
6183 | 1 | </mrow></math>"; |
6184 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6185 | 1 | } |
6186 | ||
6187 | #[test] | |
6188 | 1 | fn not_digit_block_decimal() -> Result<()> { |
6189 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>49</mn><mo>,</mo><mn>456</mn><mo>+</mo> |
6190 | 1 | <mn>4</mn><mtext> </mtext><mn>32</mn><mo>+</mo> |
6191 | 1 | <mn>1</mn><mo>,</mo><mn>234</mn><mo>,</mo><mn>56</mn></math>"; |
6192 | 1 | let target_str = "<math> |
6193 | 1 | <mrow data-changed='added'> |
6194 | 1 | <mn>8</mn> |
6195 | 1 | <mo>,</mo> |
6196 | 1 | <mn>49</mn> |
6197 | 1 | <mo>,</mo> |
6198 | 1 | <mrow data-changed='added'> |
6199 | 1 | <mn>456</mn> |
6200 | 1 | <mo>+</mo> |
6201 | 1 | <mrow data-changed='added'> |
6202 | 1 | <mn>4</mn> |
6203 | 1 | <mo data-changed='added'>⁢</mo> |
6204 | 1 | <mn>32</mn> |
6205 | 1 | </mrow> |
6206 | 1 | <mo>+</mo> |
6207 | 1 | <mn>1</mn> |
6208 | 1 | </mrow> |
6209 | 1 | <mo>,</mo> |
6210 | 1 | <mn>234</mn> |
6211 | 1 | <mo>,</mo> |
6212 | 1 | <mn>56</mn> |
6213 | 1 | </mrow> |
6214 | 1 | </math>"; |
6215 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6216 | 1 | } |
6217 | ||
6218 | #[test] | |
6219 | 1 | fn not_digit_block_ellipsis() -> Result<()> { |
6220 | 1 | let test_str = "<math><mrow><mn>8</mn><mo>,</mo><mn>123</mn><mo>,</mo><mn>456</mn><mo>,</mo> |
6221 | 1 | <mi>…</mi></mrow></math>"; |
6222 | 1 | let target_str = "<math> |
6223 | 1 | <mrow> |
6224 | 1 | <mn>8</mn> |
6225 | 1 | <mo>,</mo> |
6226 | 1 | <mn>123</mn> |
6227 | 1 | <mo>,</mo> |
6228 | 1 | <mn>456</mn> |
6229 | 1 | <mo>,</mo> |
6230 | 1 | <mi>…</mi> |
6231 | 1 | </mrow> |
6232 | 1 | </math>"; |
6233 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6234 | 1 | } |
6235 | ||
6236 | #[test] | |
6237 | 1 | fn not_digit_block_negative_numbers_euro() -> Result<()> { |
6238 | 1 | let test_str = "<math><mrow> |
6239 | 1 | <mo>-</mo><mn>1</mn><mo>,</mo> |
6240 | 1 | <mo>-</mo><mn>2</mn><mo>,</mo> |
6241 | 1 | <mo>-</mo><mn>3</mn><mo>,</mo> |
6242 | 1 | <mo>…</mo> |
6243 | 1 | </mrow></math>"; |
6244 | 1 | let target_str = "<math><mrow> |
6245 | 1 | <mrow data-changed='added'> |
6246 | 1 | <mo>-</mo> |
6247 | 1 | <mn>1</mn> |
6248 | 1 | </mrow> |
6249 | 1 | <mo>,</mo> |
6250 | 1 | <mrow data-changed='added'> |
6251 | 1 | <mo>-</mo> |
6252 | 1 | <mn>2</mn> |
6253 | 1 | </mrow> |
6254 | 1 | <mo>,</mo> |
6255 | 1 | <mrow data-changed='added'> |
6256 | 1 | <mo>-</mo> |
6257 | 1 | <mn>3</mn> |
6258 | 1 | </mrow> |
6259 | 1 | <mo>,</mo> |
6260 | 1 | <mi>…</mi> |
6261 | 1 | </mrow></math>"; |
6262 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6263 | 1 | } |
6264 | ||
6265 | #[test] | |
6266 | 1 | fn ellipsis() -> Result<()> { |
6267 | 1 | let test_str = "<math><mn>5</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>8</mn><mo>,</mo> |
6268 | 1 | <mn>9</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>11</mn><mo>,</mo> |
6269 | 1 | <mn>5</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>8</mn> |
6270 | 1 | </math>"; |
6271 | 1 | let target_str = "<math><mrow data-changed='added'> |
6272 | 1 | <mn>5</mn><mo>,</mo><mi>…</mi><mo>,</mo><mn>8</mn><mo>,</mo> |
6273 | 1 | <mn>9</mn><mo>,</mo><mi>…</mi><mo>,</mo><mn>11</mn><mo>,</mo> |
6274 | 1 | <mn>5</mn><mo>,</mo><mrow data-changed='added'><mo>.</mo><mo>.</mo></mrow> |
6275 | 1 | <mo>,</mo><mn>8</mn></mrow></math>"; |
6276 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6277 | 1 | } |
6278 | ||
6279 | ||
6280 | #[test] | |
6281 | 1 | fn no_merge_271() -> Result<()> { |
6282 | 1 | let test_str = "<math><mrow><mo>{</mo> |
6283 | 1 | <mrow><mn>2</mn><mo>,</mo><mn>4</mn><mo>,</mo><mn>6</mn></mrow> |
6284 | 1 | <mo>}</mo></mrow></math>"; |
6285 | 1 | let target_str = "<math><mrow><mo>{</mo> |
6286 | 1 | <mrow><mn>2</mn><mo>,</mo><mn>4</mn><mo>,</mo><mn>6</mn></mrow> |
6287 | 1 | <mo>}</mo></mrow></math>"; |
6288 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6289 | 1 | } |
6290 | ||
6291 | #[test] | |
6292 | 1 | fn not_digit_block_271() -> Result<()> { |
6293 | 1 | let test_str = "<math><mrow> |
6294 | 1 | <mi>…</mi><mo>,</mo> |
6295 | 1 | <mo>-</mo><mn>2</mn><mo>,</mo> |
6296 | 1 | <mo>-</mo><mn>1</mn><mo>,</mo> |
6297 | 1 | <mn>0</mn> |
6298 | 1 | </mrow></math>"; |
6299 | 1 | let target_str = "<math> <mrow> |
6300 | 1 | <mi>…</mi> |
6301 | 1 | <mo>,</mo> |
6302 | 1 | <mrow data-changed='added'><mo>-</mo><mn>2</mn></mrow> |
6303 | 1 | <mo>,</mo> |
6304 | 1 | <mrow data-changed='added'><mo>-</mo><mn>1</mn></mrow> |
6305 | 1 | <mo>,</mo> |
6306 | 1 | <mn>0</mn> |
6307 | 1 | </mrow></math>"; |
6308 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6309 | 1 | } |
6310 | ||
6311 | #[test] | |
6312 | 1 | fn merge_decimal_in_list_271() -> Result<()> { |
6313 | 1 | let test_str = "<math><mi>x</mi><mo>,</mo><mn>2</mn><mo>.</mo><mn>5</mn><mi>g</mi><mo>,</mo><mn>3</mn></math>"; |
6314 | 1 | let target_str = "<math> <mrow data-changed='added'> |
6315 | 1 | <mi>x</mi> |
6316 | 1 | <mo>,</mo> |
6317 | 1 | <mrow data-changed='added'> <mn>2.5</mn> <mo data-changed='added'>⁢</mo> <mi>g</mi> </mrow> |
6318 | 1 | <mo>,</mo> |
6319 | 1 | <mn>3</mn> |
6320 | 1 | </mrow> </math>"; |
6321 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6322 | 1 | } |
6323 | ||
6324 | #[test] | |
6325 | 1 | fn primes_common() -> Result<()> { |
6326 | 1 | let test_str = "<math><msup><mn>5</mn><mo>'</mo></msup> |
6327 | 1 | <msup><mn>5</mn><mo>''</mo></msup> |
6328 | 1 | <msup><mn>8</mn><mrow><mo>'</mo><mo>'</mo></mrow></msup></math>"; |
6329 | 1 | let target_str = "<math> |
6330 | 1 | <mrow data-changed='added'> |
6331 | 1 | <msup> |
6332 | 1 | <mn>5</mn> |
6333 | 1 | <mo>′</mo> |
6334 | 1 | </msup> |
6335 | 1 | <mo data-changed='added'>⁢</mo> |
6336 | 1 | <msup> |
6337 | 1 | <mn>5</mn> |
6338 | 1 | <mo>″</mo> |
6339 | 1 | </msup> |
6340 | 1 | <mo data-changed='added'>⁢</mo> |
6341 | 1 | <msup> |
6342 | 1 | <mn>8</mn> |
6343 | 1 | <mo>″</mo> |
6344 | 1 | </msup> |
6345 | 1 | </mrow> |
6346 | 1 | </math>"; |
6347 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6348 | 1 | } |
6349 | ||
6350 | #[test] | |
6351 | 1 | fn primes_uncommon() -> Result<()> { |
6352 | 1 | let test_str = "<math><msup><mn>5</mn><mo>''′</mo></msup> |
6353 | 1 | <msup><mn>5</mn><mo>''''</mo></msup> |
6354 | 1 | <msup><mn>8</mn><mrow><mo>′</mo><mo>⁗</mo></mrow></msup></math>"; |
6355 | 1 | let target_str = " <math> |
6356 | 1 | <mrow data-changed='added'> |
6357 | 1 | <msup> |
6358 | 1 | <mn>5</mn> |
6359 | 1 | <mo>‴</mo> |
6360 | 1 | </msup> |
6361 | 1 | <mo data-changed='added'>⁢</mo> |
6362 | 1 | <msup> |
6363 | 1 | <mn>5</mn> |
6364 | 1 | <mo>⁗</mo> |
6365 | 1 | </msup> |
6366 | 1 | <mo data-changed='added'>⁢</mo> |
6367 | 1 | <msup> |
6368 | 1 | <mn>8</mn> |
6369 | 1 | <mo>⁗′</mo> |
6370 | 1 | </msup> |
6371 | 1 | </mrow> |
6372 | 1 | </math>"; |
6373 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6374 | 1 | } |
6375 | ||
6376 | #[test] | |
6377 | 1 | fn merge_mi_test() -> Result<()> { |
6378 | 1 | let test_str = "<math> |
6379 | 1 | <mi>c</mi><mi>o</mi><mi>s</mi><mo>=</mo> |
6380 | 1 | <mi>w</mi><mi>x</mi><mi>y</mi><mi>z</mi><mo>+</mo> |
6381 | 1 | <mi>n</mi><mi>a</mi><mi>x</mi><mo>+</mo> |
6382 | 1 | <mi>i</mi><mi>ω</mi><mi>t</mi><mo>+</mo> |
6383 | 1 | <mi>f</mi><mi>l</mi><mi>o</mi><mi>w</mi><mo>+</mo> |
6384 | 1 | <mi>m</mi><mi>a</mi><mi>x</mi> |
6385 | 1 | </math> |
6386 | 1 | "; |
6387 | 1 | let target_str = "<math> |
6388 | 1 | <mrow data-changed='added'> |
6389 | 1 | <mi>cos</mi> |
6390 | 1 | <mo>=</mo> |
6391 | 1 | <mrow data-changed='added'> |
6392 | 1 | <mrow data-changed='added'> |
6393 | 1 | <mi>w</mi> |
6394 | 1 | <mo data-changed='added'>⁢</mo> |
6395 | 1 | <mi>x</mi> |
6396 | 1 | <mo data-changed='added'>⁢</mo> |
6397 | 1 | <mi>y</mi> |
6398 | 1 | <mo data-changed='added'>⁢</mo> |
6399 | 1 | <mi>z</mi> |
6400 | 1 | </mrow> |
6401 | 1 | <mo>+</mo> |
6402 | 1 | <mrow data-changed='added'> |
6403 | 1 | <mi>n</mi> |
6404 | 1 | <mo data-changed='added'>⁢</mo> |
6405 | 1 | <mi>a</mi> |
6406 | 1 | <mo data-changed='added'>⁢</mo> |
6407 | 1 | <mi>x</mi> |
6408 | 1 | </mrow> |
6409 | 1 | <mo>+</mo> |
6410 | 1 | <mrow data-changed='added'> |
6411 | 1 | <mi>i</mi> |
6412 | 1 | <mo data-changed='added'>⁢</mo> |
6413 | 1 | <mi>ω</mi> |
6414 | 1 | <mo data-changed='added'>⁢</mo> |
6415 | 1 | <mi>t</mi> |
6416 | 1 | </mrow> |
6417 | 1 | <mo>+</mo> |
6418 | 1 | <mi>flow</mi> |
6419 | 1 | <mo>+</mo> |
6420 | 1 | <mi>max</mi> |
6421 | 1 | </mrow> |
6422 | 1 | </mrow> |
6423 | 1 | </math>"; |
6424 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6425 | 1 | } |
6426 | ||
6427 | #[test] | |
6428 | 1 | fn merge_mi_with_script_test() -> Result<()> { |
6429 | 1 | let test_str = "<math> |
6430 | 1 | <mi>c</mi><mi>o</mi><msup><mi>s</mi><mn>2</mn></msup><mi>y</mi><mo>=</mo> |
6431 | 1 | <mi>l</mi><mi>o</mi><msup><mi>g</mi><mn>2</mn></msup><mi>y</mi><mo>+</mo> |
6432 | 1 | <mi>d</mi><mi>a</mi><msup><mi>g</mi><mn>2</mn></msup> |
6433 | 1 | </math>"; |
6434 | 1 | let target_str = "<math> |
6435 | 1 | <mrow data-changed='added'> |
6436 | 1 | <mrow data-changed='added'> |
6437 | 1 | <msup> |
6438 | 1 | <mi>cos</mi> |
6439 | 1 | <mn>2</mn> |
6440 | 1 | </msup> |
6441 | 1 | <mo data-changed='added'>⁡</mo> |
6442 | 1 | <mi>y</mi> |
6443 | 1 | </mrow> |
6444 | 1 | <mo>=</mo> |
6445 | 1 | <mrow data-changed='added'> |
6446 | 1 | <mrow data-changed='added'> |
6447 | 1 | <msup> |
6448 | 1 | <mi>log</mi> |
6449 | 1 | <mn>2</mn> |
6450 | 1 | </msup> |
6451 | 1 | <mo data-changed='added'>⁡</mo> |
6452 | 1 | <mi>y</mi> |
6453 | 1 | </mrow> |
6454 | 1 | <mo>+</mo> |
6455 | 1 | <mrow data-changed='added'> |
6456 | 1 | <mi>d</mi> |
6457 | 1 | <mo data-changed='added'>⁢</mo> |
6458 | 1 | <mi>a</mi> |
6459 | 1 | <mo data-changed='added'>⁢</mo> |
6460 | 1 | <msup> |
6461 | 1 | <mi>g</mi> |
6462 | 1 | <mn>2</mn> |
6463 | 1 | </msup> |
6464 | 1 | </mrow> |
6465 | 1 | </mrow> |
6466 | 1 | </mrow> |
6467 | 1 | </math>"; |
6468 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6469 | 1 | } |
6470 | ||
6471 | #[test] | |
6472 | 1 | fn merge_mi_with_script_bug_333_test() -> Result<()> { |
6473 | 1 | let test_str = "<math> |
6474 | 1 | <mi>l</mi><mi>o</mi><msub><mrow><mi>g</mi></mrow><mrow><mn>2</mn></mrow></msub><mo>=</mo> |
6475 | 1 | <mi>l</mi><mi>i</mi><msub><mrow><mi>m</mi></mrow><mrow><mi>n</mi><mo>→</mo><mi>∞</mi></mrow></msub> |
6476 | 1 | </math> |
6477 | 1 | "; |
6478 | 1 | let target_str = " <math> |
6479 | 1 | <mrow data-changed='added'> |
6480 | 1 | <msub> |
6481 | 1 | <mi>log</mi> |
6482 | 1 | <mn>2</mn> |
6483 | 1 | </msub> |
6484 | 1 | <mo>=</mo> |
6485 | 1 | <msub> |
6486 | 1 | <mi>lim</mi> |
6487 | 1 | <mrow> |
6488 | 1 | <mi>n</mi> |
6489 | 1 | <mo>→</mo> |
6490 | 1 | <mi>∞</mi> |
6491 | 1 | </mrow> |
6492 | 1 | </msub> |
6493 | 1 | </mrow> |
6494 | 1 | </math>"; |
6495 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6496 | 1 | } |
6497 | ||
6498 | #[test] | |
6499 | 1 | fn merge_mi_bug_545() -> Result<()> { |
6500 | 1 | let test_str = "<math><mi>S</mi><mi>I</mi><msup><mi>N</mi><mrow><mo>-</mo><mn>1</mn></mrow></msup></math>"; |
6501 | 1 | let target_str = "<math><msup><mi mathvariant='normal'>SIN</mi><mrow><mo>-</mo><mn>1</mn></mrow></msup></math>"; |
6502 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6503 | 1 | } |
6504 | ||
6505 | #[test] | |
6506 | 1 | fn parent_bug_94() -> Result<()> { |
6507 | // This is a test to make sure the crash in the bug report doesn't happen. | |
6508 | // Note: in the bug, they behavior they would like is a single mn with content "0.02" | |
6509 | // However, TeX input "1 2 3" will produce three consecutive <mn>s, so merging <mn>s isn't good in general | |
6510 | // This test | |
6511 | 1 | let test_str = " <math> |
6512 | 1 | <mrow> |
6513 | 1 | <msqrt> |
6514 | 1 | <mrow> |
6515 | 1 | <mstyle mathvariant='bold' mathsize='normal'><mn>0</mn></mstyle> |
6516 | 1 | <mstyle mathvariant='bold' mathsize='normal'><mo>.</mo><mn>0</mn><mn>2</mn></mstyle> |
6517 | 1 | </mrow> |
6518 | 1 | </msqrt> |
6519 | 1 | </mrow> |
6520 | 1 | </math> |
6521 | 1 | "; |
6522 | 1 | let target_str = "<math> |
6523 | 1 | <msqrt> |
6524 | 1 | <mn mathsize='normal' mathvariant='bold' data-changed='added'>0.02</mn> |
6525 | 1 | </msqrt> |
6526 | 1 | </math>"; |
6527 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6528 | 1 | } |
6529 | ||
6530 | #[test] | |
6531 | 1 | fn mstyle_merge_bug_272() -> Result<()> { |
6532 | 1 | let test_str = r#"<math> |
6533 | 1 | <msup> |
6534 | 1 | <mstyle mathvariant="bold" mathsize="normal"> |
6535 | 1 | <mn>6</mn> |
6536 | 1 | </mstyle> |
6537 | 1 | <mstyle mathvariant="bold" mathsize="normal"> |
6538 | 1 | <mn>9</mn> |
6539 | 1 | </mstyle> |
6540 | 1 | </msup> |
6541 | 1 | </math>"#; |
6542 | 1 | let target_str = "<math> |
6543 | 1 | <msup> |
6544 | 1 | <mn mathsize='normal' mathvariant='bold'>𝟔</mn> |
6545 | 1 | <mn mathsize='normal' mathvariant='bold'>𝟗</mn> |
6546 | 1 | </msup> |
6547 | 1 | </math>"; |
6548 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6549 | 1 | } |
6550 | ||
6551 | ||
6552 | #[test] | |
6553 | 1 | fn munder_mspace_bug_296() -> Result<()> { |
6554 | // this was a "typo" bug that should have looking embellished base | |
6555 | 1 | let test_str = r#"<math> |
6556 | 1 | <mrow><mn>5</mn><mfrac><mn>9</mn><mrow><mn>10</mn></mrow></mfrac> |
6557 | 1 | <munder accentunder="true"><mspace width="2.7em" /><mo stretchy="true">_</mo></munder> |
6558 | 1 | </mrow></math>"#; |
6559 | 1 | let target_str = "<math><mrow> |
6560 | 1 | <mrow data-changed='added'> |
6561 | 1 | <mn>5</mn> |
6562 | 1 | <mo data-changed='added'>⁤</mo> |
6563 | 1 | <mfrac> <mn>9</mn><mn>10</mn> </mfrac> |
6564 | 1 | </mrow> |
6565 | 1 | <munder accentunder='true'> |
6566 | 1 | <mo width='2.7em' data-changed='was-mspace' data-width='2.7' data-empty-in-2D='true' data-function-likelihood='false'> </mo> |
6567 | 1 | <mo stretchy='true'>¯</mo> |
6568 | 1 | </munder> |
6569 | 1 | </mrow></math>"; |
6570 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6571 | 1 | } |
6572 | ||
6573 | #[test] | |
6574 | 1 | fn parse_scripted_open_paren_439() -> Result<()> { |
6575 | // this was a "typo" bug that should have looking embellished base | |
6576 | 1 | let test_str = r#"<math><mrow><msub><mo>(</mo><mn>2</mn></msub><mo>)</mo></mrow></math>"#; |
6577 | 1 | let target_str = "<math><mrow><msub><mo>(</mo><mn>2</mn></msub><mo>)</mo></mrow></math>"; |
6578 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6579 | 1 | } |
6580 | ||
6581 | #[test] | |
6582 | 1 | fn lift_script() -> Result<()> { |
6583 | 1 | let test_str = "<math xmlns='http://www.w3.org/1998/Math/MathML' > |
6584 | 1 | <mrow> |
6585 | 1 | <mstyle scriptlevel='0' displaystyle='true'> |
6586 | 1 | <mrow> |
6587 | 1 | <msqrt> |
6588 | 1 | <munder> |
6589 | 1 | <mo>∑<!-- ∑ --></mo> |
6590 | 1 | <mrow> |
6591 | 1 | <mn>0</mn> |
6592 | 1 | <mo>≤<!-- ≤ --></mo> |
6593 | 1 | <mi>k</mi> |
6594 | 1 | <mo>≤<!-- ≤ --></mo> |
6595 | 1 | <mi>n</mi> |
6596 | 1 | </mrow> |
6597 | 1 | </munder> |
6598 | 1 | <mrow> |
6599 | 1 | <mo stretchy='false'>|</mo> |
6600 | 1 | </mrow> |
6601 | 1 | <msub> |
6602 | 1 | <mi>a</mi> |
6603 | 1 | <mrow> |
6604 | 1 | <mi>k</mi> |
6605 | 1 | </mrow> |
6606 | 1 | </msub> |
6607 | 1 | <msup> |
6608 | 1 | <mrow> |
6609 | 1 | <mo stretchy='false'>|</mo> |
6610 | 1 | </mrow> |
6611 | 1 | <mrow> |
6612 | 1 | <mn>2</mn> |
6613 | 1 | </mrow> |
6614 | 1 | </msup> |
6615 | 1 | </msqrt> |
6616 | 1 | </mrow> |
6617 | 1 | </mstyle> |
6618 | 1 | </mrow> |
6619 | 1 | </math>"; |
6620 | 1 | let target_str = "<math> |
6621 | 1 | <msqrt scriptlevel='0' displaystyle='true'> |
6622 | 1 | <mrow data-changed='added'> |
6623 | 1 | <munder> |
6624 | 1 | <mo>∑</mo> |
6625 | 1 | <mrow> |
6626 | 1 | <mn>0</mn> |
6627 | 1 | <mo>≤</mo> |
6628 | 1 | <mi>k</mi> |
6629 | 1 | <mo>≤</mo> |
6630 | 1 | <mi>n</mi> |
6631 | 1 | </mrow> |
6632 | 1 | </munder> |
6633 | 1 | <msup> |
6634 | 1 | <mrow data-changed='added'> |
6635 | 1 | <mo stretchy='false'>|</mo> |
6636 | 1 | <msub> |
6637 | 1 | <mi>a</mi> |
6638 | 1 | <mi>k</mi> |
6639 | 1 | </msub> |
6640 | 1 | <mo stretchy='false'>|</mo> |
6641 | 1 | </mrow> |
6642 | 1 | <mn>2</mn> |
6643 | 1 | </msup> |
6644 | 1 | </mrow> |
6645 | 1 | </msqrt> |
6646 | 1 | </math>"; |
6647 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6648 | 1 | } |
6649 | ||
6650 | #[test] | |
6651 | 1 | fn pseudo_scripts() -> Result<()> { |
6652 | 1 | let test_str = "<math><mrow> |
6653 | 1 | <mi>cos</mi><mn>30</mn><mo>°</mo> |
6654 | 1 | <mi>sin</mi><mn>60</mn><mo>′</mo> |
6655 | 1 | </mrow></math>"; |
6656 | 1 | let target_str = "<math> |
6657 | 1 | <mrow> |
6658 | 1 | <mrow data-changed='added'> |
6659 | 1 | <mi>cos</mi> |
6660 | 1 | <mo data-changed='added'>⁡</mo> |
6661 | 1 | <msup data-changed='added'><mn>30</mn><mo>°</mo></msup> |
6662 | 1 | </mrow> |
6663 | 1 | <mo data-changed='added'>⁢</mo> |
6664 | 1 | <mrow data-changed='added'> |
6665 | 1 | <mi>sin</mi> |
6666 | 1 | <mo data-changed='added'>⁡</mo> |
6667 | 1 | <msup data-changed='added'><mn>60</mn><mo>′</mo></msup> |
6668 | 1 | </mrow> |
6669 | 1 | </mrow> |
6670 | 1 | </math>"; |
6671 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6672 | 1 | } |
6673 | ||
6674 | #[test] | |
6675 | 1 | fn pseudo_scripts_in_mi() -> Result<()> { |
6676 | 1 | let test_str = "<math><mrow><mi>p'</mi><mo>=</mo><mi>µ°C</mi></mrow></math>"; |
6677 | 1 | let target_str = "<math><mrow><msup><mi>p</mi><mo>′</mo></msup><mo>=</mo><mi>µ°C</mi></mrow></math>"; |
6678 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6679 | 1 | } |
6680 | ||
6681 | #[test] | |
6682 | 1 | fn prescript_only() -> Result<()> { |
6683 | 1 | let test_str = "<math><msub><mtext/><mn>92</mn></msub><mi>U</mi></math>"; |
6684 | 1 | let target_str = "<math><mmultiscripts><mi>U</mi><mprescripts/> <mn>92</mn><none/> </mmultiscripts></math>"; |
6685 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6686 | 1 | } |
6687 | ||
6688 | #[test] | |
6689 | 1 | fn pre_and_postscript_only() -> Result<()> { |
6690 | 1 | let test_str = "<math> |
6691 | 1 | <msub><mrow/><mn>0</mn></msub> |
6692 | 1 | <msub><mi>F</mi><mn>1</mn></msub> |
6693 | 1 | <mo stretchy='false'>(</mo> |
6694 | 1 | <mi>a</mi><mo>,</mo><mi>b</mi><mo>;</mo><mi>c</mi><mo>;</mo><mi>z</mi> |
6695 | 1 | <mo stretchy='false'>)</mo> |
6696 | 1 | </math>"; |
6697 | 1 | let target_str = " <math> |
6698 | 1 | <mrow data-changed='added'> |
6699 | 1 | <mmultiscripts> |
6700 | 1 | <mi>F</mi> |
6701 | 1 | <mn>1</mn> |
6702 | 1 | <none></none> |
6703 | 1 | <mprescripts></mprescripts> |
6704 | 1 | <mn>0</mn> |
6705 | 1 | <none></none> |
6706 | 1 | </mmultiscripts> |
6707 | 1 | <mo data-changed='added'>⁡</mo> |
6708 | 1 | <mrow data-changed='added'> |
6709 | 1 | <mo stretchy='false'>(</mo> |
6710 | 1 | <mrow data-changed='added'> |
6711 | 1 | <mrow data-changed='added'> |
6712 | 1 | <mi>a</mi> |
6713 | 1 | <mo>,</mo> |
6714 | 1 | <mi>b</mi> |
6715 | 1 | </mrow> |
6716 | 1 | <mo>;</mo> |
6717 | 1 | <mi>c</mi> |
6718 | 1 | <mo>;</mo> |
6719 | 1 | <mi>z</mi> |
6720 | 1 | </mrow> |
6721 | 1 | <mo stretchy='false'>)</mo> |
6722 | 1 | </mrow> |
6723 | 1 | </mrow> |
6724 | 1 | </math>"; |
6725 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6726 | 1 | } |
6727 | ||
6728 | #[test] | |
6729 | 1 | fn pointless_nones_in_mmultiscripts() -> Result<()> { |
6730 | 1 | let test_str = "<math><mmultiscripts> |
6731 | 1 | <mtext>C</mtext> |
6732 | 1 | <none /> |
6733 | 1 | <none /> |
6734 | 1 | <mprescripts /> |
6735 | 1 | <mn>6</mn> |
6736 | 1 | <mn>14</mn> |
6737 | 1 | </mmultiscripts></math>"; |
6738 | 1 | let target_str = "<math> |
6739 | 1 | <mmultiscripts data-chem-formula='6'> |
6740 | 1 | <mtext data-chem-element='1'>C</mtext> |
6741 | 1 | <mprescripts></mprescripts> |
6742 | 1 | <mn>6</mn> |
6743 | 1 | <mn>14</mn> |
6744 | 1 | </mmultiscripts> |
6745 | 1 | </math>"; |
6746 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6747 | 1 | } |
6748 | ||
6749 | #[test] | |
6750 | 1 | fn empty_mmultiscripts_485() -> Result<()> { |
6751 | 1 | let test_str = "<math><mmultiscripts> </mmultiscripts></math>"; |
6752 | 1 | let target_str = ""; // shouldn't get to the point of comparing because the input is illegal. |
6753 | 1 | let err = are_strs_canonically_equal_result(test_str, target_str, &[]) |
6754 | 1 | .expect_err("empty mmultiscripts should be rejected"); |
6755 | 1 | assert!( |
6756 | 1 | err.to_string().contains("mmultiscripts has the wrong number of children:\n <mmultiscripts></mmultiscripts>"), |
6757 | "unexpected error message: {err}" | |
6758 | ); | |
6759 | 1 | Ok(()) |
6760 | 1 | } |
6761 | ||
6762 | #[test] | |
6763 | 1 | fn empty_mmultiscripts_544() -> Result<()> { |
6764 | 1 | let test_str = "<math><mmultiscripts><mrow/><mprescripts></mprescripts><mrow/><mrow/></mmultiscripts></math>"; |
6765 | 1 | let target_str = "<math> <mtext data-changed='empty_content' data-width='0'> </mtext></math>"; |
6766 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6767 | 1 | } |
6768 | ||
6769 | #[test] | |
6770 | 1 | fn empty_mrows_in_mmultiscripts_306() -> Result<()> { |
6771 | 1 | let test_str = "<math display='block'> |
6772 | 1 | <mmultiscripts intent='_permutation:prefix(_of,$k,_from,$n)'> |
6773 | 1 | <mi>P</mi> |
6774 | 1 | <mi arg='k'>k</mi> |
6775 | 1 | <mrow/> |
6776 | 1 | <mprescripts/> |
6777 | 1 | <mrow/> |
6778 | 1 | <mi arg='n'>n</mi> |
6779 | 1 | </mmultiscripts> |
6780 | 1 | </math>"; |
6781 | 1 | let target_str = "<math display='block'> |
6782 | 1 | <mmultiscripts intent='_permutation:prefix(_of,$k,_from,$n)'> |
6783 | 1 | <mi>P</mi> |
6784 | 1 | <mi arg='k'>k</mi> |
6785 | 1 | <none></none> |
6786 | 1 | <mprescripts></mprescripts> |
6787 | 1 | <none></none> |
6788 | 1 | <mi arg='n'>n</mi> |
6789 | 1 | </mmultiscripts> |
6790 | 1 | </math>"; |
6791 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6792 | 1 | } |
6793 | ||
6794 | ||
6795 | #[test] | |
6796 | #[ignore] // this fails -- need to figure out grabbing base from previous or next child | |
6797 | 0 | fn tensor() -> Result<()> { |
6798 | 0 | let test_str = "<math> |
6799 | 0 | <msub><mi>R</mi><mi>i</mi></msub> |
6800 | 0 | <msup><mrow/><mi>j</mi></msup> |
6801 | 0 | <msub><mrow/><mi>k</mi></msub> |
6802 | 0 | <msub><mrow/><mi>l</mi></msub> |
6803 | 0 | </math>"; |
6804 | 0 | let target_str = "<math> |
6805 | 0 | <mmultiscripts> |
6806 | 0 | <mi> R </mi> |
6807 | 0 | <mi> i </mi> |
6808 | 0 | <none/> |
6809 | 0 | <none/> |
6810 | 0 | <mi> j </mi> |
6811 | 0 | <mi> k </mi> |
6812 | 0 | <none/> |
6813 | 0 | <mi> l </mi> |
6814 | 0 | <none/> |
6815 | 0 | </mmultiscripts> |
6816 | 0 | </math>"; |
6817 | 0 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6818 | 0 | } |
6819 | ||
6820 | ||
6821 | #[test] | |
6822 | 1 | fn test_nonascii_function_name() -> Result<()> { |
6823 | 1 | let test_str = r#"<math> |
6824 | 1 | <mi mathvariant="bold-italic">x</mi> |
6825 | 1 | <mo>=</mo> |
6826 | 1 | <mn>2</mn> |
6827 | 1 | <mrow> |
6828 | 1 | <mi>𝒔𝒊𝒏</mi> |
6829 | 1 | <mo>⁡</mo> |
6830 | 1 | <mrow><mi mathvariant="bold-italic">t</mi></mrow> |
6831 | 1 | </mrow> |
6832 | 1 | <mo>-</mo> |
6833 | 1 | <mn>1</mn> |
6834 | 1 | </math>"#; |
6835 | 1 | let target_str = r#"<math> |
6836 | 1 | <mrow data-changed='added'> |
6837 | 1 | <mi mathvariant='bold-italic'>𝒙</mi> |
6838 | 1 | <mo>=</mo> |
6839 | 1 | <mrow data-changed='added'> |
6840 | 1 | <mrow data-changed='added'> |
6841 | 1 | <mn>2</mn> |
6842 | 1 | <mo data-changed='added'>⁢</mo> |
6843 | 1 | <mrow> |
6844 | 1 | <mi>sin</mi> |
6845 | 1 | <mo>⁡</mo> |
6846 | 1 | <mi mathvariant='bold-italic'>𝒕</mi> |
6847 | 1 | </mrow> |
6848 | 1 | </mrow> |
6849 | 1 | <mo>-</mo> |
6850 | 1 | <mn>1</mn> |
6851 | 1 | </mrow> |
6852 | 1 | </mrow> |
6853 | 1 | </math>"#; |
6854 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6855 | 1 | } |
6856 | ||
6857 | #[test] | |
6858 | 1 | fn test_nonascii_function_name_as_chars() -> Result<()> { |
6859 | 1 | let test_str = r#"<math display="block"> |
6860 | 1 | <mi>𝒙</mi> |
6861 | 1 | <mo>=</mo> |
6862 | 1 | <mrow> |
6863 | 1 | <mrow> |
6864 | 1 | <mi>𝒄</mi> |
6865 | 1 | <mi>𝒐</mi> |
6866 | 1 | <mi>𝒔</mi> |
6867 | 1 | </mrow> |
6868 | 1 | <mo>⁡</mo> |
6869 | 1 | <mrow> |
6870 | 1 | <mi>𝒕</mi> |
6871 | 1 | </mrow> |
6872 | 1 | </mrow> |
6873 | 1 | <mo>+</mo> |
6874 | 1 | <mn>𝟐</mn> |
6875 | 1 | </math>"#; |
6876 | 1 | let target_str = r#"<math display='block'> |
6877 | 1 | <mrow data-changed='added'> |
6878 | 1 | <mi>𝒙</mi> |
6879 | 1 | <mo>=</mo> |
6880 | 1 | <mrow data-changed='added'> |
6881 | 1 | <mrow> |
6882 | 1 | <mi>cos</mi> |
6883 | 1 | <mo>⁡</mo> |
6884 | 1 | <mi>𝒕</mi> |
6885 | 1 | </mrow> |
6886 | 1 | <mo>+</mo> |
6887 | 1 | <mn>𝟐</mn> |
6888 | 1 | </mrow> |
6889 | 1 | </mrow> |
6890 | 1 | </math>"#; |
6891 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6892 | 1 | } |
6893 | ||
6894 | ||
6895 | } |
Line | Count | Source |
1 | //! Converts the MathML to some sort of canonical MathML. | |
2 | //! | |
3 | //! Some changes made: | |
4 | //! * &extra whitespace at the start/end of tokens is trimmed. | |
5 | //! * "equivalent" characters are converted to a chosen character. | |
6 | //! * known "bad" MathML is cleaned up (this will likely be an ongoing effort) | |
7 | //! * mrows are added based on operator priorities from the MathML Operator Dictionary | |
8 | #![allow(clippy::needless_return)] | |
9 | use crate::errors::*; | |
10 | use std::rc::Rc; | |
11 | use std::cell::RefCell; | |
12 | use sxd_document::dom::{Element, Document, ChildOfElement, Attribute}; | |
13 | use sxd_document::QName; | |
14 | use phf::{phf_map, phf_set}; | |
15 | use crate::xpath_functions::{IsBracketed, is_leaf, IsNode}; | |
16 | use std::ptr::eq as ptr_eq; | |
17 | use crate::pretty_print::*; | |
18 | use regex::Regex; | |
19 | use std::fmt; | |
20 | use crate::chemistry::*; | |
21 | use unicode_script::Script; | |
22 | use roman_numerals_rs::RomanNumeral; | |
23 | use std::sync::LazyLock; | |
24 | use log::{debug}; | |
25 | use bitflags::bitflags; | |
26 | ||
27 | // FIX: DECIMAL_SEPARATOR should be set by env, or maybe language | |
28 | const DECIMAL_SEPARATOR: &str = "."; | |
29 | pub const CHANGED_ATTR: &str = "data-changed"; | |
30 | pub const ADDED_ATTR_VALUE: &str = "added"; | |
31 | pub const INTENT_ATTR: &str = "intent"; | |
32 | pub const MATHML_FROM_NAME_ATTR: &str = "data-from-mathml"; | |
33 | const MFENCED_ATTR_VALUE: &str = "from_mfenced"; | |
34 | const EMPTY_IN_2D: &str = "data-empty-in-2D"; | |
35 | const SPACE_AFTER: &str = "data-space-after"; | |
36 | const ACT_AS_OPERATOR: &str = "data-acts_as_operator"; | |
37 | // character to use instead of the text content for priority, etc. | |
38 | pub const CHEMICAL_BOND: &str ="data-chemical-bond"; | |
39 | ||
40 | ||
41 | /// Used when mhchem is detected and we should favor postscripts rather than prescripts in constructing an mmultiscripts | |
42 | const MHCHEM_MMULTISCRIPTS_HACK: &str = "MHCHEM_SCRIPT_HACK"; | |
43 | ||
44 | // (perfect) hash of operators built from MathML's operator dictionary | |
45 | static OPERATORS: phf::Map<&str, OperatorInfo> = include!("operator-info.in"); | |
46 | ||
47 | ||
48 | // The set of fence operators that can being either a left or right fence (or infix). For example: "|". | |
49 | static AMBIGUOUS_OPERATORS: phf::Set<&str> = phf_set! { | |
50 | "|", "∥", "\u{2016}" | |
51 | }; | |
52 | ||
53 | // static vars used when canonicalizing | |
54 | // lowest priority operator so it is never popped off the stack | |
55 | static LEFT_FENCEPOST: OperatorInfo = OperatorInfo{ op_type: OperatorTypes::LEFT_FENCE, priority: 0, next: &None }; | |
56 | ||
57 | 3 | static INVISIBLE_FUNCTION_APPLICATION: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2061}").unwrap()); |
58 | 3 | static IMPLIED_TIMES: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2062}").unwrap()); |
59 | 2 | static IMPLIED_INVISIBLE_COMMA: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2063}").unwrap()); |
60 | 3 | static IMPLIED_INVISIBLE_PLUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("\u{2064}").unwrap()); |
61 | ||
62 | // FIX: any other operators that should act the same (e.g, plus-minus and minus-plus)? | |
63 | 3 | static PLUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("+").unwrap()); |
64 | 3 | static MINUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("-").unwrap()); |
65 | 3 | static PREFIX_MINUS: LazyLock<&'static OperatorInfo> = LazyLock::new(|| MINUS.next.as_ref().unwrap()); |
66 | ||
67 | 3 | static TIMES_SIGN: LazyLock<&'static OperatorInfo> = LazyLock::new(|| OPERATORS.get("×").unwrap()); |
68 | ||
69 | // IMPLIED_TIMES_HIGH_PRIORITY -- used in trig functions for things like sin 2x cos 2x where want > function app priority | |
70 | static IMPLIED_TIMES_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ | |
71 | op_type: OperatorTypes::INFIX, priority: 851, next: &None | |
72 | }; | |
73 | // IMPLIED_SEPARATOR_HIGH_PRIORITY -- used for Geometry points like ABC | |
74 | static IMPLIED_SEPARATOR_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ | |
75 | op_type: OperatorTypes::INFIX, priority: 901, next: &None | |
76 | }; | |
77 | // IMPLIED_CHEMICAL_BOND -- used for implicit and explicit bonds | |
78 | static IMPLIED_CHEMICAL_BOND: OperatorInfo = OperatorInfo{ | |
79 | op_type: OperatorTypes::INFIX, priority: 905, next: &None | |
80 | }; | |
81 | static IMPLIED_PLUS_SLASH_HIGH_PRIORITY: OperatorInfo = OperatorInfo{ // (linear) mixed fraction 2 3/4 | |
82 | op_type: OperatorTypes::INFIX, priority: 881, next: &None | |
83 | }; | |
84 | ||
85 | // Useful static defaults to have available if there is no character match | |
86 | static DEFAULT_OPERATOR_INFO_PREFIX: OperatorInfo = OperatorInfo{ | |
87 | op_type: OperatorTypes::PREFIX, priority: 260, next: &None | |
88 | }; | |
89 | static DEFAULT_OPERATOR_INFO_INFIX: OperatorInfo = OperatorInfo{ | |
90 | op_type: OperatorTypes::INFIX, priority: 260, next:& None | |
91 | }; | |
92 | static DEFAULT_OPERATOR_INFO_POSTFIX: OperatorInfo = OperatorInfo{ | |
93 | op_type: OperatorTypes::POSTFIX, priority: 260, next: &None | |
94 | }; | |
95 | ||
96 | // avoids having to use Option<OperatorInfo> in some cases | |
97 | static ILLEGAL_OPERATOR_INFO: OperatorInfo = OperatorInfo{ | |
98 | op_type: OperatorTypes::INFIX, priority: 999, next: &None | |
99 | }; | |
100 | ||
101 | // used to tell if an operator is a relational operator | |
102 | 1 | static EQUAL_PRIORITY: LazyLock<usize> = LazyLock::new(|| OPERATORS.get("=").unwrap().priority); |
103 | ||
104 | // useful for detecting whitespace | |
105 | 3 | static IS_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s+$").unwrap()); // only Unicode whitespace |
106 | ||
107 | // Operators are either PREFIX, INFIX, or POSTFIX, but can also have other properties such as LEFT_FENCE | |
108 | bitflags! { | |
109 | #[derive(Clone, Debug, Copy, PartialEq, Eq)] | |
110 | struct OperatorTypes: u32 { | |
111 | const NONE = 0x0; | |
112 | const PREFIX = 0x1; | |
113 | const INFIX = 0x2; | |
114 | const POSTFIX = 0x4; | |
115 | const FENCE = 0x8; | |
116 | const LEFT_FENCE= 0x9; | |
117 | const RIGHT_FENCE=0xc; | |
118 | const UNSPECIFIED=0xf; // 'and-ing will match anything | |
119 | } | |
120 | } | |
121 | // OperatorInfo is a key structure for parsing. | |
122 | // They OperatorInfo is this program's representation of MathML's Operator Dictionary. | |
123 | // The OperatorTypes say how the operator can group (can be overridden with @form="..." on an element). | |
124 | // Basically, it says the operator can be at the start, middle, or end of an mrow. | |
125 | // The priority field gives the relationships between operators so that lower priority operators are towards the root of the tree. | |
126 | // E.g., '=' is lower priority than (infix) '+', which in turn is lower priority than multiplication. | |
127 | // The operator info is a linked list because some operators (not many) have alternatives (e.g, '+' is both prefix and infix) | |
128 | // All OperatorInfo is static info, with some special static defaults to capture when it is not listed in the operator dictionary. | |
129 | #[derive(Clone, Debug)] | |
130 | struct OperatorInfo { | |
131 | op_type: OperatorTypes, // can be set on <mo> | |
132 | priority: usize, // not settable on an element | |
133 | next: &'static Option<OperatorInfo>, // can be both prefix & infix (etc) -- chain of options | |
134 | } | |
135 | ||
136 | // The character is separated out from the OperatorInfo as this allows the OperatorInfo to be static (can use default values) | |
137 | #[derive(Clone, Debug)] | |
138 | struct OperatorPair<'op> { | |
139 | ch: &'op str, | |
140 | op: &'static OperatorInfo | |
141 | } | |
142 | ||
143 | impl<'op> OperatorPair<'op> { | |
144 | 57.3k | fn new() -> OperatorPair<'op> { |
145 | 57.3k | return OperatorPair{ |
146 | 57.3k | ch: "illegal", // value 'illegal' used only in debugging, if then |
147 | 57.3k | op: &ILLEGAL_OPERATOR_INFO, // ILLEGAL_OPERATOR_INFO avoids using <Option> |
148 | 57.3k | }; |
149 | 57.3k | } |
150 | } | |
151 | ||
152 | // OperatorVersions is a convenient data structure when looking to see whether the operator should be prefix, infix, or postfix. | |
153 | // It is only used in one place in the code, so this could maybe be eliminated and the code localized to where it is used. | |
154 | #[derive(Debug)] | |
155 | struct OperatorVersions { | |
156 | prefix: Option<&'static OperatorInfo>, | |
157 | infix: Option<&'static OperatorInfo>, | |
158 | postfix: Option<&'static OperatorInfo>, | |
159 | } | |
160 | ||
161 | impl OperatorVersions { | |
162 | 401 | fn new(op: &'static OperatorInfo) -> OperatorVersions { |
163 | 401 | let mut op = op; |
164 | 401 | let mut prefix = None; |
165 | 401 | let mut infix = None; |
166 | 401 | let mut postfix = None; |
167 | loop { | |
168 | 1.10k | if op.is_prefix() { |
169 | 360 | prefix = Some( op ); |
170 | 745 | } else if op.is_infix() { |
171 | 385 | infix = Some( op ) |
172 | 360 | } else if op.is_postfix() { |
173 | 360 | postfix = Some( op ); |
174 | 360 | } else { |
175 | 0 | panic!("OperatorVersions::new: operator is not prefix, infix, or postfix") |
176 | } | |
177 | //let another_op = op.next; | |
178 | 1.10k | match &op.next { |
179 | 401 | None => break, |
180 | 704 | Some(alt_op) => op = alt_op, |
181 | } | |
182 | } | |
183 | 401 | return OperatorVersions{prefix, infix, postfix}; |
184 | 401 | } |
185 | } | |
186 | ||
187 | ||
188 | impl OperatorInfo { | |
189 | 13.1k | fn is_prefix(&self) -> bool { |
190 | 13.1k | return (self.op_type & OperatorTypes::PREFIX) != OperatorTypes::NONE; |
191 | 13.1k | } |
192 | ||
193 | 805 | fn is_infix(&self) -> bool { |
194 | 805 | return (self.op_type & OperatorTypes::INFIX) != OperatorTypes::NONE; |
195 | 805 | } |
196 | ||
197 | 14.2k | fn is_postfix(&self) -> bool { |
198 | 14.2k | return (self.op_type & OperatorTypes::POSTFIX) != OperatorTypes::NONE; |
199 | 14.2k | } |
200 | ||
201 | 13.9k | fn is_left_fence(&self) -> bool { |
202 | 13.9k | return self.op_type & OperatorTypes::LEFT_FENCE == OperatorTypes::LEFT_FENCE; |
203 | 13.9k | } |
204 | ||
205 | 12.9k | fn is_right_fence(&self) -> bool { |
206 | 12.9k | return self.op_type & OperatorTypes::RIGHT_FENCE ==OperatorTypes::RIGHT_FENCE; |
207 | 12.9k | } |
208 | ||
209 | 4.84k | fn is_fence(&self) -> bool { |
210 | 4.84k | return (self.op_type & (OperatorTypes::LEFT_FENCE | OperatorTypes::RIGHT_FENCE)) != OperatorTypes::NONE; |
211 | 4.84k | } |
212 | ||
213 | 21.3k | fn is_operator_type(&self, op_type: OperatorTypes) -> bool { |
214 | 21.3k | return self.op_type & op_type != OperatorTypes::NONE; |
215 | 21.3k | } |
216 | ||
217 | 13.5k | fn is_plus_or_minus(&self) -> bool { |
218 | 13.5k | return ptr_eq(self, *PLUS) || |
219 | 13.5k | } |
220 | ||
221 | 13.2k | fn is_times(&self) -> bool { |
222 | 13.2k | return ptr_eq(self, *IMPLIED_TIMES) || |
223 | 13.2k | } |
224 | ||
225 | 17.7k | fn is_nary(&self, previous_op: &OperatorInfo) -> bool { |
226 | 17.7k | return ptr_eq(previous_op,self) || |
227 | 13.0k | (previous_op.is_plus_or_minus() && |
228 | 13.0k | (previous_op.is_times() && |
229 | 17.7k | } |
230 | } | |
231 | ||
232 | // StackInfo contains all the needed information for deciding shift/reduce during parsing. | |
233 | // The stack itself is just a Vec of StackInfo (since we only push, pop, and look at the top) | |
234 | // There are a number of useful functions defined on StackInfo. | |
235 | struct StackInfo<'a, 'op>{ | |
236 | mrow: Element<'a>, // mrow being built | |
237 | op_pair: OperatorPair<'op>, // last operator placed on stack | |
238 | is_operand: bool, // true if child at end of mrow is an operand (as opposed to an operator) | |
239 | } | |
240 | ||
241 | impl fmt::Display for StackInfo<'_, '_> { | |
242 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
243 | 0 | write!(f, "StackInfo(op={}/{}, is_operand={}, mrow({}", |
244 | 0 | show_invisible_op_char(self.op_pair.ch), self.op_pair.op.priority, self.is_operand, |
245 | 0 | if self.mrow.children().is_empty() {")"} else {""})?; |
246 | 0 | for child in self.mrow.children() { |
247 | 0 | let child = as_element(child); |
248 | 0 | write!(f, "{}{}", name(child), if child.following_siblings().is_empty() {")"} else {","})?; |
249 | } | |
250 | 0 | return Ok( () ); |
251 | 0 | } |
252 | } | |
253 | ||
254 | impl<'a, 'op:'a> StackInfo<'a, 'op> { | |
255 | 10.6k | fn new(doc: Document<'a>) -> StackInfo<'a, 'op> { |
256 | // debug!(" new empty StackInfo"); | |
257 | 10.6k | let mrow = create_mathml_element(&doc, "mrow") ; |
258 | 10.6k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
259 | 10.6k | return StackInfo{ |
260 | 10.6k | mrow, |
261 | 10.6k | op_pair: OperatorPair{ ch: "\u{E000}", op: &LEFT_FENCEPOST }, |
262 | 10.6k | is_operand: false, |
263 | 10.6k | } |
264 | 10.6k | } |
265 | ||
266 | 10.9k | fn with_op<'d>(doc: &'d Document<'a>, node: Element<'a>, op_pair: OperatorPair<'op>) -> StackInfo<'a, 'op> { |
267 | // debug!(" new StackInfo with '{}' and operator {}/{}", name(node), show_invisible_op_char(op_pair.ch), op_pair.op.priority); | |
268 | 10.9k | let mrow = create_mathml_element(doc, "mrow"); |
269 | 10.9k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
270 | 10.9k | mrow.append_child(node); |
271 | 10.9k | return StackInfo { |
272 | 10.9k | mrow, |
273 | 10.9k | op_pair, |
274 | 10.9k | is_operand: false, |
275 | 10.9k | } |
276 | 10.9k | } |
277 | ||
278 | 37.2k | fn priority(&self) -> usize { |
279 | 37.2k | return self.op_pair.op.priority; |
280 | 37.2k | } |
281 | ||
282 | 37.1k | fn last_child_in_mrow(&self) -> Option<Element<'a>> { |
283 | 37.1k | let children = self.mrow.children(); |
284 | 37.1k | for & |
285 | 29.5k | let child = as_element(child); |
286 | 29.5k | if let Some( |
287 | 687 | && value == "empty_content" { |
288 | 0 | continue; |
289 | 29.5k | } |
290 | 29.5k | return Some(child); |
291 | } | |
292 | 7.58k | return None; |
293 | 37.1k | } |
294 | ||
295 | 57.6k | fn add_child_to_mrow(&mut self, child: Element<'a>, child_op: OperatorPair<'op>) { |
296 | // debug!(" adding '{}' to mrow[{}], operator '{}/{}'", | |
297 | // element_summary(child), self.mrow.children().len(), show_invisible_op_char(child_op.ch), child_op.op.priority); | |
298 | 57.6k | self.mrow.append_child(child); |
299 | 57.6k | if ptr_eq(child_op.op, &ILLEGAL_OPERATOR_INFO) { |
300 | 36.8k | assert!(!self.is_operand); // should not have two operands in a row (ok to add whitespace) |
301 | 36.8k | self.is_operand = true; |
302 | 20.7k | } else { |
303 | 20.7k | self.op_pair = child_op; |
304 | 20.7k | self.is_operand = false; |
305 | 20.7k | } |
306 | 57.6k | } |
307 | ||
308 | 18.4k | fn remove_last_operand_from_mrow(&mut self) -> Element<'a> { |
309 | 18.4k | let children = self.mrow.children(); |
310 | 18.4k | assert!( !children.is_empty() ); |
311 | 18.4k | assert!( self.is_operand || |
312 | 18.4k | self.is_operand = false; |
313 | 18.4k | let last_operand = as_element(children[children.len()-1]); |
314 | // debug!(" Removing last element '{}' from mrow[{}]",element_summary(last_operand), children.len()); | |
315 | 18.4k | last_operand.remove_from_parent(); |
316 | 18.4k | return last_operand; |
317 | 18.4k | } |
318 | ||
319 | } | |
320 | ||
321 | ||
322 | 117k | pub fn create_mathml_element<'a>(doc: &Document<'a>, name: &str) -> Element<'a> { |
323 | 117k | return doc.create_element(sxd_document::QName::with_namespace_uri( |
324 | 117k | Some("http://www.w3.org/1998/Math/MathML"), |
325 | 117k | name)); |
326 | 117k | } |
327 | ||
328 | 4.84k | pub fn is_fence(mo: Element) -> bool { |
329 | 4.84k | return CanonicalizeContext::find_operator(None, mo, None, None, None).is_fence(); |
330 | 4.84k | } |
331 | ||
332 | 664 | pub fn is_relational_op(mo: Element) -> bool { |
333 | 664 | return CanonicalizeContext::find_operator(None, mo, None, None, None).priority == *EQUAL_PRIORITY; |
334 | 664 | } |
335 | ||
336 | 113k | pub fn set_mathml_name(element: Element, new_name: &str) { |
337 | 113k | element.set_name(QName::with_namespace_uri(Some("http://www.w3.org/1998/Math/MathML"), new_name)); |
338 | 113k | } |
339 | ||
340 | /// Replace 'mathml' in the parent (must exist since this only happens for leaves) with the 'replacements' (new children). | |
341 | /// This handles adding mrows if needed. | |
342 | /// | |
343 | /// Returns first replacement | |
344 | 2.47k | pub fn replace_children<'a>(mathml: Element<'a>, replacements: Vec<Element<'a>>) -> Element<'a> { |
345 | 2.47k | let parent = get_parent(mathml); |
346 | 2.47k | let parent_name = name(parent); |
347 | // debug!("\nreplace_children: mathml\n{}", mml_to_string(mathml)); | |
348 | // debug!("replace_children: parent before replace\n{}", mml_to_string(parent)); | |
349 | // debug!("{} replacements:\n{}", replacements.len(), replacements.iter().map(|e| mml_to_string(e)).collect::<Vec<String>>().join("\n")); | |
350 | 2.47k | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(parent_name) || |
351 | 2.44k | parent_name == "mmultiscripts" { // each child acts like the parent has a fixed number of children |
352 | // gather up the preceding/following siblings before mucking with the tree structure (mrow.append_children below) | |
353 | 32 | let mut new_children = mathml.preceding_siblings(); |
354 | 32 | let mut following_siblings = mathml.following_siblings(); |
355 | ||
356 | // debug!("\nreplace_children: mathml\n{}", mml_to_string(mathml)); | |
357 | // debug!("replace_children: parent before replace\n{}", mml_to_string(parent)); | |
358 | // wrap an mrow around the replacements and then replace 'mathml' with that | |
359 | 32 | let mrow = create_mathml_element(&mathml.document(), "mrow"); |
360 | 32 | add_attrs(mrow, &replacements[0].attributes()); |
361 | 32 | mrow.append_children(replacements); |
362 | 32 | new_children.push(ChildOfElement::Element(mrow)); |
363 | 32 | new_children.append(&mut following_siblings); |
364 | 32 | parent.replace_children(new_children); |
365 | // debug!("replace_children parent after: parent\n{}", mml_to_string(parent)); | |
366 | // debug!("replace_children: returned mrow\n{}", mml_to_string(mrow)); | |
367 | 32 | return mrow; |
368 | } else { | |
369 | // replace the children of the parent with 'replacements' inserted in place of 'mathml' | |
370 | 2.44k | let mut new_children = mathml.preceding_siblings(); |
371 | 2.44k | let i_first_new_child = new_children.len(); |
372 | 6.54k | let |
373 | 2.44k | new_children.append(&mut replacements); |
374 | 2.44k | new_children.append(&mut mathml.following_siblings()); |
375 | 2.44k | parent.replace_children(new_children); |
376 | // debug!("replace_children: (will return child[{}]) parent after replace\n{}", i_first_new_child, mml_to_string(parent)); | |
377 | 2.44k | return as_element(parent.children()[i_first_new_child]); |
378 | } | |
379 | 2.47k | } |
380 | ||
381 | // returns the presentation element of a "semantics" element | |
382 | 22 | pub fn get_presentation_element(element: Element) -> (usize, Element) { |
383 | 22 | assert_eq!(name(element), "semantics"); |
384 | 22 | let children = element.children(); |
385 | 22 | if let Some( ( |
386 | 48 | if let Some( |
387 | 46 | encoding == "MathML-Presentation" |
388 | } else { | |
389 | 2 | false |
390 | 48 | }) |
391 | { | |
392 | 20 | let presentation_annotation = as_element(*child); |
393 | // debug!("get_presentation_element:\n{}", mml_to_string(presentation_annotation)); | |
394 | 20 | assert_eq!(presentation_annotation.children().len(), 1); |
395 | 20 | return (i, as_element(presentation_annotation.children()[0])); |
396 | } else { | |
397 | 2 | return (0, as_element(children[0])); |
398 | } | |
399 | 22 | } |
400 | ||
401 | /// Canonicalize does several things: | |
402 | /// 1. cleans up the tree so all extra white space is removed (should only have element and text nodes) | |
403 | /// 2. normalize the characters | |
404 | /// 3. clean up "bad" MathML based on known output from some converters (TODO: still a work in progress) | |
405 | /// 4. the tree is "parsed" based on the mo (priority)/mi/mn's in an mrow | |
406 | /// * this adds mrows and some invisible operators (implied times, function app, ...) | |
407 | /// * extra mrows are removed | |
408 | /// * implicit mrows are turned into explicit mrows (e.g, there will be a single child of 'math') | |
409 | /// | |
410 | /// Canonicalize is pretty conservative in adding new mrows and won't do it if: | |
411 | /// * there is an intent attr | |
412 | /// * if the mrow starts and ends with a fence (e.g, French open interval "]0,1[") | |
413 | /// | |
414 | /// An mrow is never deleted unless it is redundant. | |
415 | /// | |
416 | /// Whitespace handling: | |
417 | /// Whitespace complicates parsing and also pattern matching (e.g., is it a mixed number which tests for a number preceding a fraction) | |
418 | /// The first attempt which mostly worked was to shove whitespace into adjacent mi/mn/mtext. That has a problem with distinguish different uses for whitespace | |
419 | /// The second attempt was to leave it in the parse and make it an mo when appropriate, but there were some cases where it should be prefix and wasn't caught | |
420 | /// The third attempt (and the current one) is to make it an attribute on adjacent elements. | |
421 | /// This preserves the data-width attr (with new name) added in the second attempt that helps resolve whether something is tweaking, a real space, or an omission. | |
422 | /// It adds data-previous-space-width/data-following-space-width with values to indicate with the space was on the left or right (typically it placed on the previous token because that's easier) | |
423 | 5.06k | pub fn canonicalize(mathml: Element) -> Result<Element> { |
424 | 5.06k | let context = CanonicalizeContext::new(); |
425 | 5.06k | return context.canonicalize(mathml); |
426 | 5.06k | } |
427 | ||
428 | #[derive(Debug, PartialEq)] | |
429 | enum FunctionNameCertainty { | |
430 | True, | |
431 | Maybe, | |
432 | False | |
433 | } | |
434 | ||
435 | ||
436 | static ELEMENTS_WITH_ONE_CHILD: phf::Set<&str> = phf_set! { | |
437 | "math", "msqrt", "merror", "mpadded", "mphantom", "menclose", "mtd", "mscarry" | |
438 | }; | |
439 | ||
440 | static ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN: phf::Set<&str> = phf_set! { | |
441 | "mfrac", "mroot", "msub", "msup", "msubsup","munder", "mover", "munderover" | |
442 | }; | |
443 | ||
444 | static EMPTY_ELEMENTS: phf::Set<&str> = phf_set! { | |
445 | "mspace", "none", "mprescripts", "mglyph", "malignmark", "maligngroup", "msline", | |
446 | }; | |
447 | ||
448 | // turns out Roman Numerals tests aren't needed, but we do want to block VII from being a chemical match | |
449 | // two cases because we don't want to have a match for 'Cl', etc. | |
450 | 3 | static UPPER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s*$").unwrap()); |
451 | 3 | static LOWER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s*$").unwrap()); |
452 | ||
453 | ||
454 | struct CanonicalizeContextPatterns { | |
455 | decimal_separator: Regex, | |
456 | block_separator: Regex, | |
457 | digit_only_decimal_number: Regex, | |
458 | block_3digit_pattern: Regex, | |
459 | block_3_5digit_pattern: Regex, | |
460 | block_4digit_hex_pattern: Regex, | |
461 | block_1digit_pattern: Regex, // used when generator puts each digit into a single mn | |
462 | } | |
463 | ||
464 | impl CanonicalizeContextPatterns { | |
465 | 4.10k | fn new(block_separator_pref: &str, decimal_separator_pref: &str) -> CanonicalizeContextPatterns { |
466 | 4.10k | let block_separator = Regex::new(&format!("[{}]", regex::escape(block_separator_pref))).unwrap(); |
467 | 4.10k | let decimal_separator = Regex::new(&format!("[{}]", regex::escape(decimal_separator_pref))).unwrap(); |
468 | // allows just "." and also matches an empty string, but those are ruled out elsewhere | |
469 | 4.10k | let digit_only_decimal_number = Regex::new(&format!(r"^\d*{}?\d*$", regex::escape(decimal_separator_pref))).unwrap(); |
470 | 4.10k | let block_3digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 3); |
471 | 4.10k | let block_3_5digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 5); |
472 | // Note: on en.wikipedia.org/wiki/Decimal_separator, show '3.14159 26535 89793 23846' | |
473 | 4.10k | let block_4digit_hex_pattern = Regex::new(r"^[0-9a-fA-F]{4}([ \u00A0\u202F][0-9a-fA-F]{4})*$").unwrap(); |
474 | 4.10k | let block_1digit_pattern = Regex::new(r"^((\d(\uFFFF\d)?)(\d([, \u00A0\u202F]\d){2})*)?([\.](\d(\uFFFF\d)*)?)?$").unwrap(); |
475 | ||
476 | 4.10k | return CanonicalizeContextPatterns { |
477 | 4.10k | block_separator, |
478 | 4.10k | decimal_separator, |
479 | 4.10k | digit_only_decimal_number, |
480 | 4.10k | block_3digit_pattern, |
481 | 4.10k | block_3_5digit_pattern, |
482 | 4.10k | block_4digit_hex_pattern, |
483 | 4.10k | block_1digit_pattern |
484 | 4.10k | }; |
485 | ||
486 | ||
487 | 8.21k | fn get_number_pattern_regex(block_separator: &str, decimal_separator: &str, n_sep_before: usize, n_sep_after: usize) -> Regex { |
488 | // the following is a generalization of a regex like ^(\d*|\d{1,3}([, ]?\d{3})*)(\.(\d*|(\d{3}[, ])*\d{1,3}))?$ | |
489 | // that matches something like '1 234.567 8' and '1,234.', but not '1,234.12,34 | |
490 | 8.21k | return Regex::new(&format!(r"^(\d*|\d{{1,{}}}([{}]?\d{{{}}})*)([{}](\d*|(\d{{{}}}[{}])*\d{{1,{}}}))?$", |
491 | 8.21k | n_sep_before, regex::escape(block_separator), n_sep_before, regex::escape(decimal_separator), |
492 | 8.21k | n_sep_after, regex::escape(block_separator), n_sep_after) ).unwrap(); |
493 | 8.21k | } |
494 | 4.10k | } |
495 | } | |
496 | ||
497 | /// Profiling showed that creating new contexts was very time consuming because creating the RegExs is very expensive | |
498 | /// Profiling set_mathml (which does the canonicalization) spends 65% of the time in Regex::new, of which half of it is spent in this initialization. | |
499 | struct CanonicalizeContextPatternsCache { | |
500 | block_separator_pref: String, | |
501 | decimal_separator_pref: String, | |
502 | patterns: Rc<CanonicalizeContextPatterns>, | |
503 | } | |
504 | ||
505 | thread_local!{ | |
506 | static PATTERN_CACHE: RefCell<CanonicalizeContextPatternsCache> = RefCell::new(CanonicalizeContextPatternsCache::new()); | |
507 | } | |
508 | ||
509 | impl CanonicalizeContextPatternsCache { | |
510 | 4.10k | fn new() -> CanonicalizeContextPatternsCache { |
511 | 4.10k | let pref_manager = crate::prefs::PreferenceManager::get(); |
512 | 4.10k | let pref_manager = pref_manager.borrow(); |
513 | 4.10k | let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); |
514 | 4.10k | let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); |
515 | 4.10k | return CanonicalizeContextPatternsCache { |
516 | 4.10k | patterns: Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ), |
517 | 4.10k | block_separator_pref, |
518 | 4.10k | decimal_separator_pref |
519 | 4.10k | } |
520 | 4.10k | } |
521 | ||
522 | 5.06k | fn get() -> Rc<CanonicalizeContextPatterns> { |
523 | 5.06k | return PATTERN_CACHE.with( |cache| { |
524 | 5.06k | let pref_manager_rc = crate::prefs::PreferenceManager::get(); |
525 | 5.06k | let pref_manager = pref_manager_rc.borrow(); |
526 | 5.06k | let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); |
527 | 5.06k | let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); |
528 | ||
529 | 5.06k | let mut cache = cache.borrow_mut(); |
530 | 5.06k | if block_separator_pref != cache.block_separator_pref || decimal_separator_pref != cache.decimal_separator_pref { |
531 | 0 | // update the cache |
532 | 0 | cache.patterns = Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ); |
533 | 0 | cache.block_separator_pref = block_separator_pref; |
534 | 0 | cache.decimal_separator_pref = decimal_separator_pref; |
535 | 5.06k | } |
536 | 5.06k | return cache.patterns.clone(); |
537 | 5.06k | }) |
538 | 5.06k | } |
539 | } | |
540 | ||
541 | struct CanonicalizeContext { | |
542 | patterns: Rc<CanonicalizeContextPatterns>, | |
543 | } | |
544 | ||
545 | ||
546 | impl CanonicalizeContext { | |
547 | 5.06k | fn new() -> CanonicalizeContext { |
548 | 5.06k | return CanonicalizeContext { |
549 | 5.06k | patterns: CanonicalizeContextPatternsCache::get(), |
550 | 5.06k | }; |
551 | 5.06k | } |
552 | ||
553 | 5.06k | fn canonicalize<'a>(&self, mut mathml: Element<'a>) -> Result<Element<'a>> { |
554 | // debug!("MathML before canonicalize:\n{}", mml_to_string(mathml)); | |
555 | ||
556 | 5.06k | if name(mathml) != "math" { |
557 | 0 | // debug!("Didn't start with <math> element -- attempting repair"); |
558 | 0 | let math_element = create_mathml_element(&mathml.document(), "math"); |
559 | 0 | math_element.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
560 | 0 | math_element.append_child(mathml); |
561 | 0 | let root = math_element.document().root(); |
562 | 0 | root.clear_children(); |
563 | 0 | root.append_child(math_element); |
564 | 0 | mathml = root.children()[0].element().unwrap(); |
565 | 5.06k | } |
566 | 5.06k | CanonicalizeContext::assure_mathml(mathml, 0) |
567 | 5.05k | let mathml = self.clean_mathml(mathml).unwrap(); // 'math' is never removed |
568 | 5.05k | self.assure_nary_tag_has_one_child(mathml); |
569 | // debug!("Not chemistry -- retry:\n{}", mml_to_string(mathml)); | |
570 | 5.05k | let mut converted_mathml = self.canonicalize_mrows(mathml) |
571 | 5.05k | .with_context(|| |
572 | // debug!("canonicalize before canonicalize_mrows:\n{}", mml_to_string(converted_mathml)); | |
573 | 5.05k | if !crate::chemistry::scan_and_mark_chemistry(converted_mathml) { |
574 | 869 | self.assure_nary_tag_has_one_child(converted_mathml); |
575 | 869 | converted_mathml = self.canonicalize_mrows(mathml) |
576 | 869 | .with_context(|| |
577 | 4.18k | } |
578 | 5.05k | debug!("\nMathML after canonicalize:\n{}", |
579 | 5.05k | return Ok(converted_mathml); |
580 | 5.06k | } |
581 | ||
582 | /// Make sure there is exactly one child | |
583 | 19.1k | fn assure_nary_tag_has_one_child(&self, mathml: Element) { |
584 | 19.1k | let children = mathml.children(); |
585 | 19.1k | if !ELEMENTS_WITH_ONE_CHILD.contains(name(mathml)) { |
586 | 6.43k | return; |
587 | 12.7k | } |
588 | ||
589 | 12.7k | if children.is_empty() { |
590 | 3 | // make sure there is content |
591 | 3 | let child = CanonicalizeContext::create_empty_element(&mathml.document()); |
592 | 3 | mathml.append_child(child); |
593 | 12.7k | } else if children.len() > 1 { |
594 | 2.34k | // wrap the children in an mrow |
595 | 2.34k | let mrow = create_mathml_element(&mathml.document(), "mrow"); |
596 | 2.34k | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
597 | 2.34k | mrow.append_children(children); |
598 | 2.34k | mathml.replace_children(vec![ChildOfElement::Element(mrow)]); |
599 | 10.3k | } |
600 | 19.1k | } |
601 | ||
602 | /// Return an error if some element is not MathML (only look at first child of <semantics>) or if it has the wrong number of children | |
603 | 52.9k | fn assure_mathml(mathml: Element, depth: usize) -> Result<()> { |
604 | 52.9k | if depth > crate::interface::MAX_DEPTH { |
605 | 1 | bail!("MathML is too deeply nested to process"); |
606 | 52.9k | } |
607 | 52.9k | let n_children = mathml.children().len(); |
608 | 52.9k | let element_name = name(mathml); |
609 | 52.9k | if is_leaf(mathml) { |
610 | 33.1k | if EMPTY_ELEMENTS.contains(element_name) { |
611 | 464 | if n_children != 0 { |
612 | 0 | bail!("{} should only have one child:\n{}", element_name, mml_to_string(mathml)); |
613 | 464 | } |
614 | 32.7k | } else if element_name == "annotation" { |
615 | 0 | bail!("'annotation' element is not child of 'semantics' element"); |
616 | 32.7k | } else if (n_children == 1 && |
617 | 32.7k | return Ok( () ); |
618 | } else { | |
619 | 0 | bail!("Not a valid MathML leaf element:\n{}", mml_to_string(mathml)); |
620 | }; | |
621 | 19.7k | } |
622 | ||
623 | 20.2k | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(element_name) { |
624 | 3.90k | match element_name { |
625 | 3.90k | "munderover" | |
626 | 0 | bail!("{} should have 3 children:\n{}", element_name, mml_to_string(mathml)); |
627 | 277 | }, |
628 | 3.62k | _ => if n_children != 2 { |
629 | 0 | bail!("{} should have 2 children:\n{}", element_name, mml_to_string(mathml)); |
630 | 3.62k | }, |
631 | } | |
632 | 16.3k | } else if |
633 | 2.18k | let parent_name = name(get_parent(mathml)); |
634 | 2.18k | if (element_name == "mtr" || |
635 | 0 | bail!("Illegal MathML: {} is not a child of mtable. Parent is {}", element_name, mml_to_string(get_parent(mathml))); |
636 | 2.18k | } else if element_name == "mtd" && !( |
637 | 1 | bail!("Illegal MathML: mtd is not a child of {}. Parent is {}", parent_name, mml_to_string(get_parent(mathml))); |
638 | 2.17k | } |
639 | } | |
640 | 14.1k | else if element_name == "mmultiscripts" { |
641 | 182 | let has_prescripts = mathml.children().iter() |
642 | 649 | . |
643 | 182 | if has_prescripts ^ (n_children.is_multiple_of(2)) { |
644 | 1 | bail!("{} has the wrong number of children:\n{}", element_name, mml_to_string(mathml)); |
645 | 181 | } |
646 | 13.9k | } else if element_name == "mlongdiv" { |
647 | 0 | if n_children < 3 { |
648 | 0 | bail!("{} should have at least 3 children:\n{}", element_name, mml_to_string(mathml)); |
649 | 0 | } |
650 | 13.9k | } else if element_name == "semantics" { |
651 | 11 | let children = mathml.children(); |
652 | 11 | if children.is_empty() { |
653 | 0 | return Ok( () ); |
654 | } else { | |
655 | 11 | let (i_presentation, presentation_element) = get_presentation_element(mathml); |
656 | // make sure only 'annotation' and 'annotation-xml' elements are children of the non-presentation element | |
657 | 24 | for (i, child) in |
658 | 24 | if i != i_presentation { |
659 | 13 | let child = as_element(*child); |
660 | 13 | if name(child)!="annotation" && |
661 | 0 | bail!("Illegal MathML: {} is child of 'semantic'", name(child)); |
662 | 13 | } |
663 | 11 | } |
664 | } | |
665 | 11 | return CanonicalizeContext::assure_mathml(presentation_element, depth + 1); |
666 | } | |
667 | 13.9k | } else if !IsNode::is_mathml(mathml) { |
668 | 1 | if element_name == "annotation-xml" { |
669 | 0 | bail!("'annotation-xml' element is not child of 'semantics' element"); |
670 | } else { | |
671 | 1 | bail!("'{}' is not a valid MathML element", element_name); |
672 | } | |
673 | 13.9k | } |
674 | ||
675 | // valid MathML element and not a leaf -- check the children | |
676 | 47.8k | for child in |
677 | 47.8k | CanonicalizeContext::assure_mathml( as_element(child), depth + 1) |
678 | } | |
679 | 19.6k | return Ok( () ); |
680 | 52.9k | } |
681 | ||
682 | 283 | fn make_empty_element(mathml: Element) -> Element { |
683 | 283 | set_mathml_name(mathml, "mtext"); |
684 | 283 | mathml.clear_children(); |
685 | 283 | mathml.set_text("\u{00A0}"); |
686 | 283 | mathml.set_attribute_value("data-changed", "empty_content"); |
687 | 283 | mathml.set_attribute_value("data-width", "0"); |
688 | 283 | return mathml; |
689 | 283 | } |
690 | ||
691 | 24 | fn create_empty_element<'a>(doc: &Document<'a>) -> Element<'a> { |
692 | 24 | let mtext = create_mathml_element(doc, "mtext"); |
693 | 24 | mtext.set_text("\u{00A0}"); |
694 | 24 | mtext.set_attribute_value("data-added", "missing-content"); |
695 | 24 | mtext.set_attribute_value("data-width", "0"); |
696 | 24 | return mtext; |
697 | 24 | } |
698 | ||
699 | 11.5k | fn is_empty_element(el: Element) -> bool { |
700 | 11.5k | return (is_leaf(el) && |
701 | 11.0k | (name(el) == "mrow" && |
702 | 11.5k | } |
703 | ||
704 | ||
705 | // this should only be called for 2D elements | |
706 | 4.48k | fn mark_empty_content(two_d_element: Element) { |
707 | 7.32k | for child in |
708 | 7.32k | let child = as_element(child); |
709 | 7.32k | if CanonicalizeContext::is_empty_element(child) { |
710 | 20 | child.set_attribute_value(EMPTY_IN_2D, "true"); |
711 | 7.30k | } |
712 | } | |
713 | 4.48k | } |
714 | ||
715 | /// Turn leaf into an 'mn' and set attributes appropriately | |
716 | 34 | fn make_roman_numeral(leaf: Element) { |
717 | 34 | assert!(is_leaf(leaf)); |
718 | 34 | set_mathml_name(leaf, "mn"); |
719 | 34 | leaf.set_attribute_value("data-roman-numeral", "true"); // mark for easy detection |
720 | 34 | let as_number = match as_text(leaf).parse::<RomanNumeral>() { |
721 | 34 | Ok(roman) => roman.as_u16().to_string(), |
722 | 0 | Err(_) => as_text(leaf).to_string(), |
723 | }; | |
724 | 34 | leaf.set_attribute_value("data-number", &as_number); |
725 | 34 | } |
726 | ||
727 | /// most of the time it is ok to merge the mrow with its singleton child, but there are some exceptions: | |
728 | /// mrow has 'intent' -- this might reference the child and you aren't allowed to self reference | |
729 | 2.82k | fn is_ok_to_merge_mrow_child(mrow: Element) -> bool { |
730 | 2.82k | assert_eq!(name(mrow), "mrow"); |
731 | 2.82k | assert!(mrow.children().len() == 1); |
732 | 2.82k | return mrow.attribute(INTENT_ATTR).is_none(); // could check if child is referenced, but that's a chunk of code |
733 | 2.82k | } |
734 | ||
735 | /// This function does some cleanup of MathML (mostly fixing bad MathML) | |
736 | /// Unlike the main canonicalization routine, significant tree changes happen here | |
737 | /// Changes to "good" MathML: | |
738 | /// 1. mfenced -> mrow, a => mrow | |
739 | /// 2. mspace and mtext with only whitespace are canonicalized to a non-breaking space and merged in with | |
740 | /// an adjacent non-mo element unless in a required element position (need to keep for braille) | |
741 | /// | |
742 | /// Note: mspace that is potentially part of a number that was split apart is merged into a number as a single space char | |
743 | /// | |
744 | /// mstyle, mpadded, and mphantom, malignmark, maligngroup are removed (but children might be kept) | |
745 | /// | |
746 | /// Significant changes are made cleaning up empty bases of scripts, looking for chemistry, merging numbers with commas, | |
747 | /// "arg trig" functions, pseudo scripts, and others | |
748 | /// | |
749 | /// Returns 'None' if the element should not be in the tree. | |
750 | 52.3k | fn clean_mathml<'a>(&self, mathml: Element<'a>) -> Option<Element<'a>> { |
751 | // Note: this works bottom-up (clean the children first, then this element) | |
752 | 3 | static IS_PRIME: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"['′″‴⁗]").unwrap()); |
753 | ||
754 | // Note: including intervening spaces in what is likely a symbol of omission preserves any notion of separate digits (e.g., "_ _ _") | |
755 | 3 | static IS_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[_\u{00A0}]+$").unwrap()); |
756 | ||
757 | ||
758 | 23.5k | fn is_currency_symbol(ch: char) -> bool { |
759 | 23.5k |
|
760 | 23.5k | } |
761 | ||
762 | 20.0k | fn contains_currency(s: &str) -> bool { |
763 | 20.0k | s.chars().any(is_currency_symbol) |
764 | 20.0k | } |
765 | ||
766 | // begin by cleaning up empty elements | |
767 | // debug!("clean_mathml\n{}", mml_to_string(mathml)); | |
768 | 52.3k | let element_name = name(mathml); |
769 | 52.3k | let parent_name = if element_name == "math" { |
770 | 5.09k | "math".to_string() |
771 | } else { | |
772 | 47.2k | let parent = get_parent(mathml); |
773 | 47.2k | name(parent).to_string() |
774 | }; | |
775 | 52.3k | let parent_requires_child = ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(&parent_name) || |
776 | 44.0k |
|
777 | 41.8k | parent_name == "mmultiscripts"; |
778 | ||
779 | // handle empty leaves -- leaving it empty causes problems with the speech rules | |
780 | 52.3k | if is_leaf(mathml) && |
781 | 32 | return if parent_requires_child { |
782 | 52.3k | }; |
783 | ||
784 | 52.3k | if mathml.children().is_empty() && |
785 | 158 | if element_name == "mrow" && |
786 | // if it is an empty mrow that doesn't need to be there, get rid of it. Otherwise, replace it with an mtext | |
787 | 142 | if parent_name == "mmultiscripts" && |
788 | // MathML Core dropped "none" in favor of <mrow/>, but MathCAT is written with <none/> | |
789 | // Do substitutions for the scripts, not the base | |
790 | 4 | set_mathml_name(mathml, "none"); |
791 | 4 | return Some(mathml); |
792 | 138 | } |
793 | 138 | if parent_requires_child { |
794 | 14 | return Some( CanonicalizeContext::make_empty_element(mathml) ); |
795 | } else { | |
796 | 124 | return None; |
797 | } | |
798 | 16 | } else { |
799 | 16 | // create some content so that speech rules don't require special cases |
800 | 16 | let mtext = CanonicalizeContext::create_empty_element(&mathml.document()); |
801 | 16 | mathml.append_child(mtext); |
802 | 16 | // return Some(mathml); |
803 | 16 | } |
804 | 52.1k | }; |
805 | ||
806 | 52.1k | match element_name { |
807 | 52.1k | "mn" => { |
808 | 9.08k | let text = as_text(mathml); |
809 | 9.08k | let mut chars = text.chars(); |
810 | 9.08k | let first_char = chars.next().unwrap(); // we have already made sure it is non-empty |
811 | 9.08k | if !text.trim().is_empty() && is_roman_number_match(text) { |
812 | 2 | // people tend to set them in a non-italic font and software makes that 'mtext' |
813 | 2 | CanonicalizeContext::make_roman_numeral(mathml); |
814 | 9.08k | } else if |
815 | 5 | let doc = mathml.document(); |
816 | 5 | let mo = create_mathml_element(&doc, "mo"); |
817 | 5 | let mn = create_mathml_element(&doc, "mn"); |
818 | 5 | mo.set_text("-"); |
819 | 5 | mn.set_text(&text[first_char.len_utf8()..]); |
820 | 5 | set_mathml_name(mathml, "mrow"); |
821 | 5 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
822 | 5 | mathml.replace_children([mo,mn]); |
823 | 9.08k | } else if contains_currency(text) && let Some( |
824 | 3 | return Some(result); |
825 | 9.07k | } |
826 | 9.08k | if let Some((idx, last_char)) = text.char_indices().next_back() { |
827 | // look for something like 12° | |
828 | 9.08k | if is_pseudo_script_char(last_char) { |
829 | 1 | let doc = mathml.document(); |
830 | 1 | let mn = create_mathml_element(&doc, "mn"); |
831 | 1 | let mo = create_mathml_element(&doc, "mo"); |
832 | 1 | mn.set_text(&text[..idx]); |
833 | 1 | mo.set_text(last_char.to_string().as_str()); |
834 | 1 | set_mathml_name(mathml, "msup"); |
835 | 1 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
836 | 1 | mathml.replace_children([mn, mo]); |
837 | 9.08k | } |
838 | 0 | } |
839 | 9.08k | return Some(mathml); |
840 | }, | |
841 | 43.0k | "ms" | |
842 | 3 | return Some(mathml); |
843 | }, | |
844 | 43.0k | "mi" => { |
845 | 11.6k | let text = as_text(mathml); |
846 | 11.6k | if !text.trim().is_empty() && is_roman_number_match(text) && |
847 | // people tend to set them in a non-italic font and software makes that 'mtext' | |
848 | 28 | CanonicalizeContext::make_roman_numeral(mathml); |
849 | 28 | return Some(mathml); |
850 | 11.5k | } |
851 | 11.5k | if let Some( |
852 | 1 | mathml.set_text(dash); |
853 | 1 | return Some(mathml); |
854 | 11.5k | } else if text.contains('_') { |
855 | // if left or right are an mo, leave as is. Otherwise convert to an mo. | |
856 | 6 | let preceding_siblings = mathml.preceding_siblings(); |
857 | 6 | let following_siblings = mathml.following_siblings(); |
858 | 6 | if preceding_siblings.is_empty() || following_siblings.is_empty() { |
859 | 4 | return Some(mathml); |
860 | 2 | } |
861 | 2 | if name(as_element(preceding_siblings[preceding_siblings.len()-1])) != "mo" && |
862 | 2 | name(as_element(following_siblings[0])) != "mo" { |
863 | 2 | set_mathml_name(mathml, "mo"); |
864 | 2 |
|
865 | 2 | return Some(mathml); |
866 | 11.5k | } else if OPERATORS.get(text).is_some() { |
867 | 118 | if let Some( |
868 | // if it is a unit, it might be seconds, minutes, feet, ... not an operator | |
869 | 88 | if intent_value.contains(":unit") { |
870 | 88 | return Some(mathml); |
871 | 0 | } |
872 | 30 | } |
873 | 30 | set_mathml_name(mathml, "mo"); |
874 | ||
875 | // For at least pandoc, ∇ is an 'mi' and it sometimes adds an invisible times -- remove them | |
876 | 30 | let op = OPERATORS.get(text).unwrap(); |
877 | 30 | let preceding_siblings = mathml.preceding_siblings(); |
878 | 30 | if (op.is_infix() || |
879 | 16 | !preceding_siblings.is_empty() && |
880 | 0 | as_element(preceding_siblings[0]).remove_from_parent(); |
881 | 30 | } |
882 | 30 | let following_siblings = mathml.following_siblings(); |
883 | 30 | if (op.is_infix() || |
884 | 27 | !following_siblings.is_empty() && CanonicalizeContext::is_invisible_char_element(as_element(following_siblings[0])) { |
885 | 0 | as_element(following_siblings[0]).remove_from_parent(); |
886 | 30 | } |
887 | 30 | return Some(mathml); |
888 | 11.4k | } else if let Some( |
889 | 1 | return Some(result); |
890 | 11.4k | } else if let Some( |
891 | 0 | return Some(result); |
892 | 11.4k | } else if IS_PRIME.is_match(text) { |
893 | 0 | let new_text = merge_prime_text(text); |
894 | 0 | mathml.set_text(&new_text); |
895 | 0 | return Some(mathml); |
896 | 11.4k | } else if text == "..." { |
897 | 1 | mathml.set_text("…"); |
898 | 1 | return Some(mathml); |
899 | 11.4k | } else if let Some( |
900 | 27 | return Some(result); |
901 | 11.4k | } else if let Some( |
902 | 11 | return Some(result); |
903 | } else { | |
904 | 11.4k | return Some(mathml); |
905 | }; | |
906 | }, | |
907 | 31.4k | "mtext" => { |
908 | // debug!("before merge_arc_trig: {}", mml_to_string(mathml)); | |
909 | ||
910 | 401 | if let Some( |
911 | 2 | return Some(result); |
912 | 399 | } else if let Some( |
913 | 11 | return Some(result); |
914 | 388 | } |
915 | ||
916 | 388 | let text = as_text(mathml); |
917 | 388 | if !text.trim().is_empty() && |
918 | // people tend to set them in a non-italic font and software makes that 'mtext' | |
919 | 4 | CanonicalizeContext::make_roman_numeral(mathml); |
920 | 4 | return Some(mathml); |
921 | 449 | } else if |
922 | 58 |
|
923 | 1 | mathml.set_name("mn"); |
924 | 1 | return Some(mathml); |
925 | 383 | } else if contains_currency(text) && let Some( |
926 | 0 | return Some(result); |
927 | 383 | } |
928 | // common bug: trig functions, lim, etc., should be mi | |
929 | 383 | if ["…", "⋯", "∞"].contains(&text) || |
930 | 383 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| |
931 | 383 | if let Some( |
932 | 382 | hashset.contains(text) |
933 | } else { | |
934 | 1 | false |
935 | 383 | } |
936 | ) { | |
937 | 6 | set_mathml_name(mathml, "mi"); |
938 | 6 | return Some(mathml); |
939 | 377 | } |
940 | ||
941 | // allow non-breaking whitespace to stay -- needed by braille | |
942 | 377 | if IS_WHITESPACE.is_match(text) { |
943 | // normalize to just a single non-breaking space | |
944 | 71 | mathml.set_attribute_value("data-width", &format!("{:.3}", white_space_em_width(text))); |
945 | 71 | mathml.set_text("\u{00A0}"); |
946 | 71 | return Some(mathml); |
947 | 306 | } else if let Some( |
948 | 2 | mathml.set_text(dash); |
949 | 304 | } else if OPERATORS.get(text).is_some() { |
950 | 11 | set_mathml_name(mathml, "mo"); |
951 | 11 | return Some(mathml); |
952 | 293 | } |
953 | 295 | return if parent_requires_child || |
954 | }, | |
955 | 31.0k | "mo" => { |
956 | // WIRIS editor puts non-breaking whitespace as standalone in 'mo' | |
957 | 11.2k | let text = as_text(mathml); |
958 | 11.2k | if !text.is_empty() && IS_WHITESPACE.is_match(text) { |
959 | // can't throw it out because it is needed by braille -- change to what it really is | |
960 | 78 | set_mathml_name(mathml, "mtext"); |
961 | 78 | mathml.set_attribute_value("data-width", &format!("{:.3}", white_space_em_width(text))); |
962 | 78 | mathml.set_text("\u{00A0}"); |
963 | 78 | mathml.set_attribute_value(CHANGED_ATTR, "data-was-mo"); |
964 | 78 | return Some(mathml); |
965 | } else { | |
966 | 11.1k | match text { |
967 | 11.1k | "arc" | "arc " | "arc " /* non-breaking space */ => { |
968 | 0 | if let Some(result) = merge_arc_trig(mathml) { |
969 | 0 | return Some(result); |
970 | 0 | } |
971 | }, | |
972 | 11.1k | "..." => |
973 | 11.1k | ":" => { |
974 | 94 | if is_ratio(mathml) { |
975 | 8 | mathml.set_text("∶"); // ratio U+2236 |
976 | 86 | } |
977 | 94 | return Some(mathml); |
978 | }, | |
979 | 11.0k | "::" => |
980 | 11.0k | "│" => |
981 | 11.0k | "|" | |
982 | 6 | return Some(result); |
983 | } else { | |
984 | 299 | return Some(mathml); |
985 | }, | |
986 | 10.7k | _ => (), |
987 | } | |
988 | } | |
989 | ||
990 | // common bug: trig functions, lim, etc., should be mi | |
991 | // same for ellipsis ("…") | |
992 | 10.7k | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
993 | 10.7k | if ["…", "⋯", "∞"].contains(&text) || |
994 | 10.7k | definitions.borrow().get_hashset("FunctionNames").unwrap().contains(text) || |
995 | 10.6k | definitions.borrow().get_hashset("GeometryShapes").unwrap().contains(text) { |
996 | 83 | set_mathml_name(mathml, "mi"); |
997 | 83 | return Some(mathml); |
998 | 10.6k | } |
999 | 10.6k | if IS_PRIME.is_match(text) { |
1000 | 66 | let new_text = merge_prime_text(text); |
1001 | 66 | mathml.set_text(&new_text); |
1002 | 66 | return Some(mathml); |
1003 | 10.5k | } |
1004 | 10.5k | if contains_currency(text) && let Some( |
1005 | 9 | return Some(result); |
1006 | 10.5k | } |
1007 | 10.5k | return Some(mathml); |
1008 | 10.7k | }); |
1009 | // note: chemistry test is done later as part of another phase of chemistry cleanup | |
1010 | }, | |
1011 | 19.8k | "mfenced" => {return |
1012 | 19.8k | "a" => { |
1013 | // convert 'a' into 'mrow' | |
1014 | 2 | set_mathml_name(mathml, "mrow"); |
1015 | 2 | return self.clean_mathml(mathml); |
1016 | } | |
1017 | 19.8k | "mstyle" | |
1018 | // Throw out mstyle and mpadded -- to do this, we need to avoid mstyle being the arg of clean_mathml | |
1019 | // FIX: should probably push the attrs down to the children (set in 'self') | |
1020 | 714 | merge_adjacent_similar_mstyles(mathml); |
1021 | 714 | let children = mathml.children(); |
1022 | 714 | if children.is_empty() { |
1023 | 0 | return if parent_requires_child {Some( CanonicalizeContext::make_empty_element(mathml) )} else {None}; |
1024 | 714 | } else if children.len() == 1 { |
1025 | 678 | let is_from_mhchem = element_name == "mpadded" && |
1026 | 678 | if let Some( |
1027 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1028 | 269 | mathml.replace_children(new_mathml.children()); |
1029 | 269 | set_mathml_name(mathml, name(new_mathml)); |
1030 | 269 | add_attrs(mathml, &new_mathml.attributes()); |
1031 | 269 | return Some(mathml); |
1032 | 409 | } else if parent_requires_child { |
1033 | // need a placeholder -- make it empty mtext | |
1034 | 31 | let empty = CanonicalizeContext::make_empty_element(mathml); |
1035 | 31 | if is_from_mhchem { |
1036 | 27 | empty.set_attribute_value(MHCHEM_MMULTISCRIPTS_HACK, "true"); |
1037 | 27 |
|
1038 | 31 | return Some(empty); |
1039 | } else { | |
1040 | 378 | return None; |
1041 | } | |
1042 | } else { | |
1043 | // wrap the children in an mrow, but maintain tree siblings by changing mpadded/mstyle to mrow | |
1044 | 36 | set_mathml_name(mathml, "mrow"); |
1045 | 36 | mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1046 | 36 | return self.clean_mathml(mathml); // now it's an mrow so a different path next time |
1047 | } | |
1048 | }, | |
1049 | 19.0k | "mphantom" | |
1050 | 364 | return if parent_requires_child { |
1051 | }, | |
1052 | 18.7k | "mspace" => { |
1053 | // need to hold onto space for braille | |
1054 | 224 | set_mathml_name(mathml, "mtext"); |
1055 | 224 | mathml.set_text("\u{00A0}"); |
1056 | 224 | mathml.set_attribute_value(CHANGED_ATTR, "was-mspace"); |
1057 | ||
1058 | // normalize width ems | |
1059 | 224 | let width = mathml.attribute_value("width").unwrap_or("0em"); |
1060 | 224 | let normalized_width = crate::xpath_functions::FontSizeGuess::em_from_value(width); |
1061 | 224 | mathml.set_attribute_value("data-width", &normalized_width.to_string()); |
1062 | 224 | return Some(mathml); |
1063 | }, | |
1064 | 18.5k | "semantics" => { |
1065 | // The semantics tag, like the style tag, can mess with pattern matching. | |
1066 | // However, it may be the case that having the annotations could aid in determining intent, so we want to keep them. | |
1067 | // The compromise is to move the annotations into an attr named data-annotation[-xml]-<encoding-name> | |
1068 | // The attribute is put on presentation element root | |
1069 | 11 | let presentation = get_presentation_element(mathml).1; |
1070 | 11 | let new_presentation = if let Some(presentation) = self.clean_mathml(presentation) { |
1071 | 11 | presentation |
1072 | } else { | |
1073 | // probably shouldn't happen, but just in case | |
1074 | 0 | CanonicalizeContext::create_empty_element(&mathml.document()) |
1075 | }; | |
1076 | 11 | set_annotation_attrs(new_presentation, mathml); |
1077 | 11 | return Some(new_presentation); |
1078 | }, | |
1079 | _ => { | |
1080 | 18.5k | let children = mathml.children(); |
1081 | 18.5k | if element_name == "mrow" { |
1082 | // handle special cases of empty mrows and mrows which just one element | |
1083 | 6.04k | if children.is_empty() && |
1084 | 0 | return if parent_requires_child {Some(mathml)} else {None}; |
1085 | 6.04k | } else if children.len() == 1 && |
1086 | 2.56k | let is_from_mhchem = is_from_mhchem_hack(mathml); |
1087 | 2.56k | if let Some( |
1088 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1089 | 1.95k | mathml.replace_children(new_mathml.children()); |
1090 | 1.95k | set_mathml_name(mathml, name(new_mathml)); |
1091 | 1.95k | add_attrs(mathml, &new_mathml.attributes()); |
1092 | 1.95k | return Some(mathml); |
1093 | 607 | } else if parent_requires_child { |
1094 | 234 | let empty = CanonicalizeContext::make_empty_element(mathml); |
1095 | 234 | if is_from_mhchem { |
1096 | 142 | empty.set_attribute_value(MHCHEM_MMULTISCRIPTS_HACK, "true"); |
1097 | 142 |
|
1098 | 234 | return Some(empty); |
1099 | } else { | |
1100 | 373 | return None; |
1101 | } | |
1102 | 3.48k | } |
1103 | 12.4k | } |
1104 | ||
1105 | // FIX: this should be setting children, not mathml | |
1106 | 15.9k | let mathml = if element_name == "mrow" || |
1107 | 12.4k | (children.len() > 1 && |
1108 | 5.90k | let merged = merge_dots(mathml); // FIX -- switch to passing in children |
1109 | 5.90k | let merged = merge_primes(merged); |
1110 | 5.90k | let merged = merge_degrees_C_F(merged); |
1111 | 5.90k | let merged = merge_chars(merged, &IS_UNDERSCORE); |
1112 | 5.90k | handle_pseudo_scripts(merged) |
1113 | } else { | |
1114 | 10.0k | mathml |
1115 | }; | |
1116 | ||
1117 | // cleaning children can add or delete subsequent children, so we need to constantly update the children (and mathml) | |
1118 | 15.9k | let mut children = mathml.children(); |
1119 | 15.9k | let mut i = 0; |
1120 | ||
1121 | 59.7k | while i < children.len() { |
1122 | 43.9k | if let Some(child) = children[i].element() { |
1123 | 43.9k | match self.clean_mathml(child) { |
1124 | 299 | None => { |
1125 | 299 | mathml.remove_child(child); |
1126 | 299 | // don't increment 'i' because there is one less child now and so everything shifted left |
1127 | 299 | }, |
1128 | 43.6k | Some(new_child) => { |
1129 | // debug!("new_child (i={})\n{}", i, mml_to_string(new_child)); | |
1130 | 43.6k | let new_child_name = name(new_child); |
1131 | 43.6k | children = mathml.children(); // clean_mathml(child) may have changed following siblings |
1132 | 43.6k | children[i] = ChildOfElement::Element(new_child); |
1133 | 43.6k | mathml.replace_children(children); |
1134 | 43.6k | if new_child_name == "mi" || |
1135 | 12.5k | // can't do this above in 'match' because this changes the tree and |
1136 | 12.5k | // lifting single element mrows messes with structure in a conflicting way |
1137 | 12.5k | // Note: if clean_chemistry_leaf() made changes, they don't need cleaning because they will be "ok" mi's |
1138 | 12.5k | clean_chemistry_leaf(as_element(mathml.children()[i])); |
1139 | 12.5k | } else { |
1140 | // If the attach call does something, children are inserted *before* child (i.e., into parent) | |
1141 | // We return the new start at the expense of re-cleaning the script | |
1142 | // This is needed because anything before the returned element will be lost | |
1143 | 31.0k | let start_of_change = attach_scripts_to_split_element(new_child); |
1144 | 31.0k | if name(start_of_change) == "mrow" { |
1145 | 3.43k | start_of_change.remove_attribute(MAYBE_CHEMISTRY); // was lifted, and not set -- remove and it will be computed later |
1146 | 27.6k | } |
1147 | // crate::canonicalize::assure_mathml(get_parent(start_of_change)).unwrap(); // FIX: find a recovery -- we're in deep trouble if this isn't true | |
1148 | 31.0k | if start_of_change != child { |
1149 | // debug!("clean_mathml: start_of_change != mathml -- mathml={}", mml_to_string(mathml)); | |
1150 | 49 | return self.clean_mathml(mathml); // restart cleaning |
1151 | 30.9k | } |
1152 | } | |
1153 | 43.5k | i += 1; |
1154 | } | |
1155 | } | |
1156 | 43.8k | children = mathml.children(); // 'children' moved above, so need new values |
1157 | 0 | } else { |
1158 | 0 | // bad mathml such as '<annotation-xml> </annotation-xml>' -- don't add to new_children |
1159 | 0 | i += 1; |
1160 | 0 | } |
1161 | } | |
1162 | ||
1163 | // could have deleted children so only one child remains -- need to lift it | |
1164 | 15.8k | if element_name == "mrow" && |
1165 | // "lift" the child up so all the links (e.g., siblings) are correct | |
1166 | 108 | let child = as_element(children[0]); |
1167 | 108 | mathml.replace_children(child.children()); |
1168 | 108 | set_mathml_name(mathml, name(child)); |
1169 | 108 | add_attrs(mathml, &child.attributes()); |
1170 | 108 | return Some(mathml); // child has already been cleaned, so we can return |
1171 | 15.7k | } |
1172 | ||
1173 | 15.7k | if element_name == "mrow" || |
1174 | 10.1k | merge_number_blocks(self, mathml, &mut children); |
1175 | 10.1k | merge_whitespace(&mut children); |
1176 | 10.1k | merge_cross_or_dot_product_elements(&mut children); |
1177 | 10.1k | handle_convert_to_mmultiscripts(&mut children); |
1178 | 10.1k | } else if |
1179 | 3.48k | element_name == "msubsup" || |
1180 | 2.52k | if element_name != "mmultiscripts" { |
1181 | // mhchem emits some cases that boil down to a completely empty script -- see test mhchem_beta_decay | |
1182 | 2.33k | let mut is_empty_script = CanonicalizeContext::is_empty_element(as_element(children[0])) && |
1183 | 181 | CanonicalizeContext::is_empty_element(as_element(children[1])); |
1184 | 2.33k | if element_name == "msubsup" && |
1185 | 51 | is_empty_script = CanonicalizeContext::is_empty_element(as_element(children[2])); |
1186 | 2.28k | } |
1187 | 2.33k | if is_empty_script { |
1188 | 48 | if parent_requires_child { |
1189 | // need a placeholder -- make it empty mtext | |
1190 | 0 | return Some( as_element(children[0]) ); // pick one of the empty elements |
1191 | } else { | |
1192 | 48 | return None; |
1193 | } | |
1194 | 2.29k | } |
1195 | 185 | } |
1196 | 2.47k | let mathml = if element_name == "mmultiscripts" { |
1197 | 2.47k | if !is_chemistry_off(mathml) { |
1198 | 2.47k | let likely_chemistry = likely_adorned_chem_formula(mathml); |
1199 | // debug!("likely_chemistry={}, {}", likely_chemistry, mml_to_string(mathml)); | |
1200 | 2.47k | if likely_chemistry >= 0 { |
1201 | 553 | mathml.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
1202 | 1.92k | } |
1203 | 0 | } |
1204 | ||
1205 | 2.47k | if element_name == "msubsup" { |
1206 | 180 | return Some( clean_msubsup(mathml) ); |
1207 | } else { | |
1208 | 2.29k | return Some(mathml); |
1209 | } | |
1210 | 3.06k | } |
1211 | ||
1212 | 13.2k | mathml.replace_children(children); |
1213 | // debug!("clean_mathml: after loop\n{}", mml_to_string(mathml)); | |
1214 | 13.2k | if element_name == "mrow" || |
1215 | 10.1k | clean_chemistry_mrow(mathml); |
1216 | 10.1k |
|
1217 | 13.2k | self.assure_nary_tag_has_one_child(mathml); |
1218 | 13.2k | if crate::xpath_functions::IsNode::is_2D(mathml) { |
1219 | 4.48k | CanonicalizeContext::mark_empty_content(mathml); |
1220 | 8.77k | } |
1221 | ||
1222 | 13.2k | return Some(mathml); |
1223 | } | |
1224 | } | |
1225 | ||
1226 | /// Returns substitute text if hyphen sequence should be a short or long dash | |
1227 | 11.8k | fn canonicalize_dash(text: &str) -> Option<&str> { |
1228 | 11.8k | if text == "--" { |
1229 | 1 | return Some("—"); // U+2014 (em dash) |
1230 | 11.8k | } else if text == "---" || |
1231 | 2 | return Some("―"); // U+2015 (Horizontal bar) |
1232 | } else { | |
1233 | 11.8k | return None; |
1234 | } | |
1235 | 11.8k | } |
1236 | ||
1237 | 11 | fn set_annotation_attrs(new_presentation: Element, semantics: Element) { |
1238 | 24 | for child in |
1239 | 24 | let child = as_element(child); |
1240 | 24 | let child_name = name(child); |
1241 | 24 | if child == new_presentation { |
1242 | 1 | continue; |
1243 | 23 | } |
1244 | 23 | let attr_name = match child.attribute_value("encoding") { |
1245 | 23 | Some(encoding_name) => format!("data-{}-{}", child_name, encoding_name.replace('/', "_slash_")), |
1246 | 0 | None => format!("data-{child_name}"), // probably shouldn't happen |
1247 | }; | |
1248 | 23 | let attr_name = attr_name.as_str(); |
1249 | 23 | if child_name == "annotation" { |
1250 | 12 | new_presentation.set_attribute_value(attr_name, as_text(child)); |
1251 | 12 | } else { |
1252 | 11 | new_presentation.set_attribute_value(attr_name, &mml_to_string(child)); |
1253 | 11 | } |
1254 | } | |
1255 | ||
1256 | 11 | } |
1257 | ||
1258 | /// Hack to try and guess if a colon should be a ratio -- this affects parsing because of different precedences | |
1259 | /// It also guesses on the spacing after the colon and adds a space attr if it looks like set building or function mapping notation. | |
1260 | /// These conditions are really not well thought out and are just a first cut -- they do cause the braille tests to pass | |
1261 | /// If 'intent' is given, it must be intent='ratio' | |
1262 | /// 2. It must be infix and there is a proportion (∷) mo as a sibling, or | |
1263 | /// 3. It is the only mo and has numbers on each side | |
1264 | /// | |
1265 | /// Need to rule out field extensions "[K:F]" and trilinear coordinates "a:b:c" (Nemeth doesn't consider these to be ratios) | |
1266 | 94 | fn is_ratio(mathml: Element) -> bool { |
1267 | 94 | assert_eq!(name(mathml), "mo"); |
1268 | 94 | let parent = get_parent(mathml); // must exist |
1269 | 94 | if name(parent) != "mrow" && |
1270 | 0 | return false; |
1271 | 94 | } |
1272 | ||
1273 | 94 | if let Some( |
1274 | 1 | && (intent_value != "ratio" || |
1275 | 1 | return false; |
1276 | 93 | } |
1277 | ||
1278 | 93 | if let Some( |
1279 | 0 | && value == "PUNCT" { |
1280 | 0 | mathml.remove_attribute("data-mjx-texclass"); |
1281 | 0 | mathml.set_attribute_value(SPACE_AFTER, "true"); // signal to at least Nemeth rules that this is punctuation |
1282 | 93 | } |
1283 | ||
1284 | 93 | let preceding = mathml.preceding_siblings(); |
1285 | 93 | let following = mathml.following_siblings(); |
1286 | 93 | if preceding.is_empty() || |
1287 | 2 | return false; |
1288 | 91 | } |
1289 | 91 | let preceding_child = as_element( preceding[preceding.len()-1] ); |
1290 | 91 | let following_child = as_element(following[0]); |
1291 | 91 | if preceding.len() == 1 && |
1292 | 8 | following.len() == 1 && |
1293 | 2 | return true; |
1294 | 89 | } |
1295 | // only want one "∷" | |
1296 | 89 | let is_before = is_proportional_before_colon(preceding.iter().rev()); |
1297 | 89 | if let Some( |
1298 | 3 | && !is_before { |
1299 | 0 | return false; |
1300 | 89 | } |
1301 | 89 | let is_before = is_before.is_some(); // move this to true/false (found/not found) |
1302 | 89 | let is_after = is_proportional_before_colon(following.iter()); |
1303 | 89 | if let Some( |
1304 | 3 | && !is_after { |
1305 | 0 | return false; |
1306 | 89 | } |
1307 | 89 | let is_after = is_after.is_some(); // move this to true/false (found/not found) |
1308 | 89 | return is_before ^ is_after; |
1309 | ||
1310 | 178 | fn is_proportional_before_colon<'a>(siblings: impl Iterator<Item = &'a ChildOfElement<'a>>) -> Option<bool> { |
1311 | // unparsed, so we look at relative priorities to make sure the proportional operator is really the next operator | |
1312 | 3 | static PROPORTIONAL_PRIORITY: LazyLock<usize> = LazyLock::new(|| OPERATORS.get("∷").unwrap().priority); |
1313 | 461 | for sibling in |
1314 | 461 | let child = as_element(*sibling); |
1315 | 461 | if name(child) == "mo" { |
1316 | 203 | let text = as_text(child); |
1317 | 203 | match text { |
1318 | 203 | "∷" | |
1319 | 197 | "∶" => return |
1320 | _ => { | |
1321 | 197 | if let Some( |
1322 | 191 | && op.priority < *PROPORTIONAL_PRIORITY { |
1323 | 109 | return None; // no "∷" |
1324 | 88 | } |
1325 | }, | |
1326 | } | |
1327 | 258 | } |
1328 | } | |
1329 | 63 | return None; |
1330 | 178 | } |
1331 | 94 | } |
1332 | ||
1333 | ||
1334 | /// Returns true if it detects that this is likely coming from mhchem: | |
1335 | /// v3: msub/msup/msubsup with mpadded width=0/mphantom/mi=A) | |
1336 | /// v4: msub/msup/msubsup with mrow/mrow/mpadded width=0/mphantom/mi=A) | |
1337 | /// This should be called with 'mrow' being the outer mrow | |
1338 | 3.15k | fn is_from_mhchem_hack(mathml: Element) -> bool { |
1339 | 3.15k | assert!(name(mathml) == "mrow" || |
1340 | 3.15k | assert_eq!(mathml.children().len(), 1); |
1341 | 3.15k | let parent = get_parent(mathml); |
1342 | 3.15k | let parent_name = name(parent); |
1343 | 3.15k | if !(parent_name == "msub" || |
1344 | 2.56k | return false; |
1345 | 594 | } |
1346 | ||
1347 | 594 | let |
1348 | 545 | let mrow = as_element(mathml.children()[0]); |
1349 | 545 | if !(name(mrow) == "mrow" && |
1350 | 255 | return false; |
1351 | 290 | } |
1352 | 290 | let child = as_element(mrow.children()[0]); |
1353 | 290 | if name(child) != "mpadded" { |
1354 | 24 | return false; |
1355 | 266 | } |
1356 | 266 | child |
1357 | } else { | |
1358 | 49 | mathml |
1359 | }; | |
1360 | 315 | if let Some( |
1361 | 169 | if width != "0" { |
1362 | 0 | return false; |
1363 | 169 | } |
1364 | } else { | |
1365 | 146 | return false; |
1366 | } | |
1367 | ||
1368 | 169 | let mphantom = as_element(mpadded.children()[0]); |
1369 | 169 | if !(name(mphantom) == "mphantom" && mphantom.children().len() == 1) { |
1370 | 0 | return false; |
1371 | 169 | } |
1372 | ||
1373 | 169 | let child = as_element(mphantom.children()[0]); |
1374 | 169 | return name(child) == "mi" && as_text(child) == "A"; |
1375 | 3.15k | } |
1376 | ||
1377 | /// 'text' is potentially one of the many Unicode whitespace chars. Estimate the width in ems | |
1378 | 149 | fn white_space_em_width(text: &str) -> f64 { |
1379 | 149 | assert!(IS_WHITESPACE.is_match(text)); |
1380 | 149 | let mut width = 0.0; |
1381 | 163 | for ch in |
1382 | 163 | width += match ch { |
1383 | 137 | ' ' | '\u{00A0}' | '\u{1680}' | ' ' => 0.7, // space, non-breaking space, Ogham space mark, figure space |
1384 | 0 | ' ' | ' ' => 0.5, // en quad, en space |
1385 | 0 | ' ' | ' ' => 1.0, // em quad, em space |
1386 | 0 | ' ' => 1.0/3.0, // three per em space |
1387 | 0 | ' ' | ' ' => 0.25, // four per em space, punctuation space (wild guess) |
1388 | 22 | ' ' | ' ' => 3.0/18.0, // six per em space, thin space |
1389 | 0 | ' ' => 1.0/18.0, // hair space |
1390 | 0 | ' ' => 0.3, // narrow no-break space (half a regular space?) |
1391 | 4 | ' ' => 4.0/18.0, // medium math space |
1392 | 0 | ' ' => 1.5, // Ideographic Space |
1393 | 0 | _ => 0.7, // shouldn't happen |
1394 | } | |
1395 | } | |
1396 | 149 | return width; |
1397 | 149 | } |
1398 | ||
1399 | /// Splits the leaf element into chemical elements if needed | |
1400 | 12.5k | fn clean_chemistry_leaf(mathml: Element) -> Element { |
1401 | 12.5k | if !(is_chemistry_off(mathml) || mathml.attribute(MAYBE_CHEMISTRY).is_some()) { |
1402 | 12.3k | assert!(name(mathml)=="mi" || |
1403 | // this is a hack -- VII is more likely to be roman numeral than the molecule V I I so prevent that from happening | |
1404 | // FIX: come up with a less hacky way to prevent chem element misinterpretation | |
1405 | 12.3k | let text = as_text(mathml); |
1406 | 12.3k | if text.len() > 2 && |
1407 | 0 | return mathml; |
1408 | 12.3k | } |
1409 | 12.3k | if let Some( |
1410 | // children are already marked as chemical elements | |
1411 | 135 | let answer = replace_children(mathml, elements); |
1412 | 135 | if name(answer) == "mrow" { |
1413 | 29 | answer.remove_attribute(MAYBE_CHEMISTRY); // was lifted, and not set -- remove and it will be computed later |
1414 | 106 | } |
1415 | 135 | return answer; |
1416 | } else { | |
1417 | 12.1k | let likely_chemistry = likely_chem_element(mathml); |
1418 | 12.1k | if likely_chemistry >= 0 { |
1419 | 2.59k | mathml.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
1420 | 9.57k | } |
1421 | }; | |
1422 | 259 | } |
1423 | 12.4k | return mathml; |
1424 | 12.5k | } |
1425 | ||
1426 | ||
1427 | /// looks for pairs of (letter, pseudo-script) such as x' or p'q' all inside of a single token element | |
1428 | 11.4k | fn split_apart_pseudo_scripts<'a>(mi: Element<'a>) -> Option<Element<'a>> { |
1429 | 2 | static IS_DEGREES_C_OR_F: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[°º][CF]").unwrap()); |
1430 | ||
1431 | 11.4k | let text = as_text(mi); |
1432 | // debug!("split_apart_pseudo_scripts: start text=\"{text}\""); | |
1433 | 11.4k | if !text.chars().any(is_pseudo_script_char) || |
1434 | 11.4k | return None; |
1435 | 1 | } |
1436 | ||
1437 | 1 | let document = mi.document(); |
1438 | // create pairs of text | |
1439 | 1 | let chars = text.chars(); |
1440 | 1 | let next_chars = text.chars().skip(1); |
1441 | 1 | let result = chars.zip(next_chars).map(|(a, b)| |
1442 | 1 | if a.is_alphabetic() && is_pseudo_script_char(b) { |
1443 | // create msup | |
1444 | 1 | let base = create_mathml_element(&document, "mi"); |
1445 | 1 | base.set_text(&a.to_string()); |
1446 | 1 | let script = create_mathml_element(&document, "mo"); |
1447 | 1 | script.set_text(&b.to_string()); |
1448 | 1 | let msup = create_mathml_element(&document, "msup"); |
1449 | 1 | msup.append_child(base); |
1450 | 1 | msup.append_child(script); |
1451 | 1 | msup |
1452 | } else { | |
1453 | // create an mi "ab" | |
1454 | 0 | let new_mi = create_mathml_element(&document, "mi"); |
1455 | 0 | let mut new_mi_text = String::with_capacity(6); // likely will fit almost all cases |
1456 | 0 | new_mi_text.push(a); |
1457 | 0 | new_mi_text.push(b); |
1458 | 0 | new_mi.set_text(&new_mi_text); |
1459 | 0 | new_mi |
1460 | 1 | } ) |
1461 | 1 | .collect::<Vec<Element>>(); |
1462 | 1 | if result.len() == 1 { |
1463 | 1 | return Some( result[0] ); |
1464 | } else { | |
1465 | 0 | return Some( replace_children(mi, result) ); |
1466 | } | |
1467 | 11.4k | } |
1468 | ||
1469 | ||
1470 | /// If 'mathml' is a scripted element and has an mrow for a base, | |
1471 | /// attach any prescripts to the first element in mrow | |
1472 | /// attach any postscript to the last element in mrow | |
1473 | /// Return the modified element (which might now be an mrow) | |
1474 | 31.0k | fn attach_scripts_to_split_element(mathml: Element) -> Element { |
1475 | 31.0k | if !IsNode::is_scripted(mathml) { |
1476 | 28.5k | return mathml; |
1477 | 2.48k | } |
1478 | 2.48k | let base = as_element(mathml.children()[0]); |
1479 | 2.48k | if name(base) != "mrow" { |
1480 | 2.30k | return mathml; |
1481 | 185 | } |
1482 | 185 | let base_children = base.children(); |
1483 | 185 | let i_last_base = base_children.len()-1; |
1484 | 185 | let last_child = as_element(base_children[i_last_base]); |
1485 | 185 | if last_child.attribute(SPLIT_TOKEN).is_none() { |
1486 | 156 | return mathml; |
1487 | 29 | } |
1488 | // debug!("attach_scripts_to_split_element -- start: \n{}", mml_to_string(mathml)); | |
1489 | 29 | let mut mathml_replacement = Vec::with_capacity(base_children.len()); |
1490 | 29 | if name(mathml) == "mmultiscripts" { |
1491 | // pull any prescript (should be at most one prefix pair) into the first child | |
1492 | 1 | let multiscripts_children = mathml.children(); |
1493 | 1 | let n_multiscripts_children = multiscripts_children.len(); |
1494 | 1 | let potential_mprescripts_element = as_element(multiscripts_children[n_multiscripts_children-3]); |
1495 | 1 | if name(potential_mprescripts_element) == "mprescripts" { // we have potential chem prescripts |
1496 | // create a new mmultiscripts elements with first child as its base mathml's prescripts as the new element's prescripts | |
1497 | 1 | let mut new_mmultiscripts_children = Vec::with_capacity(4); |
1498 | 1 | new_mmultiscripts_children.push(base_children[0]); |
1499 | 1 | base.remove_child(as_element(base_children[0])); |
1500 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-3]); |
1501 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-2]); |
1502 | 1 | new_mmultiscripts_children.push(multiscripts_children[n_multiscripts_children-1]); |
1503 | ||
1504 | 1 | let new_mmultiscripts = create_mathml_element(&base.document(), "mmultiscripts"); |
1505 | 1 | new_mmultiscripts.append_children(new_mmultiscripts_children); |
1506 | 1 | let likely = likely_adorned_chem_formula(new_mmultiscripts); |
1507 | 1 | new_mmultiscripts.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
1508 | // debug!("attach_scripts_to_split_element -- new_mmultiscripts: \n{}", mml_to_string(new_mmultiscripts)); | |
1509 | 1 | if n_multiscripts_children == 4 { |
1510 | // we stripped all the children so only the (modified) base exists | |
1511 | // create mrow(new_mmultiscripts, mathml[0]) | |
1512 | 0 | let children = vec![new_mmultiscripts, base]; |
1513 | 0 | return replace_children(mathml, children); |
1514 | 1 | } |
1515 | 1 | mathml_replacement.push(new_mmultiscripts); |
1516 | 0 | } |
1517 | 28 | } |
1518 | ||
1519 | // Add all the middle children of the base to the mrow | |
1520 | 34 |
|
1521 | ||
1522 | // create a new script element with last child as its base | |
1523 | 29 | let mut new_mathml_children = mathml.children(); |
1524 | 29 | new_mathml_children[0] = ChildOfElement::Element(base); |
1525 | 29 | mathml.replace_children(new_mathml_children); |
1526 | 29 | mathml_replacement.push(mathml); |
1527 | // debug!("attach_scripts_to_split_element -- after adjusting ({} replacement children): \n{}", mathml_replacement.len(), mml_to_string(mathml)); | |
1528 | 29 | return replace_children(mathml, mathml_replacement); |
1529 | 31.0k | } |
1530 | ||
1531 | /// makes sure the structure is correct and also eliminates <none/> pairs | |
1532 | /// MathML core changed <none/> to <mrow/>. For now (since MathCAT has lots of "none" tests), <mrow/> => <mtext> => <none/> | |
1533 | /// (used https://chem.libretexts.org/Courses/Saint_Francis_University/CHEM_113%3A_Human_Chemistry_I_(Muino)/13%3A_Nuclear_Chemistry12/13.04%3A_Nuclear_Decay) | |
1534 | /// | |
1535 | /// This does some dubious repairs when the structure is bad, but not sure what else to do | |
1536 | 185 | fn clean_mmultiscripts(mathml: Element) -> Option<Element> { |
1537 | 185 | let mut mathml = mathml; |
1538 | 185 | let children = mathml.children(); |
1539 | 185 | let n = children.len(); |
1540 | 185 | let i_mprescripts = |
1541 | 185 | if let Some(( |
1542 | 659 | . |
1543 | 185 | let has_misplaced_mprescripts = i_mprescripts & 1 == 0; // should be first, third, ... child |
1544 | 185 | let mut has_proper_number_of_children = if i_mprescripts == n { |
1545 | 185 | if has_misplaced_mprescripts || !has_proper_number_of_children || |
1546 | // need to reset the children | |
1547 | 185 | let mut new_children = Vec::with_capacity(n+2); // adjusting position of mprescripts might add two children |
1548 | 185 | new_children.push(children[0]); |
1549 | // drop none, none script pairs | |
1550 | 185 | let mut i = 1; |
1551 | 604 | while i < n { |
1552 | 419 | let child = as_element(children[i]); |
1553 | 419 | let child_name = name(child); |
1554 | 419 | if child_name == "mprescripts" { |
1555 | 108 | if has_misplaced_mprescripts { |
1556 | 0 | let mtext = CanonicalizeContext::create_empty_element(&mathml.document()); |
1557 | 0 | new_children.push(ChildOfElement::Element(mtext)); |
1558 | 0 | has_proper_number_of_children = !has_proper_number_of_children; |
1559 | 108 | } |
1560 | 108 | new_children.push(children[i]); |
1561 | 108 | i += 1; |
1562 | 311 | } else if i+1 < n && child_name == "none" && |
1563 | 2 | i += 2; // found none, none pair |
1564 | 309 | } else { |
1565 | 309 | // copy pair |
1566 | 309 | new_children.push(children[i]); |
1567 | 309 | new_children.push(children[i+1]); |
1568 | 309 | i += 2; |
1569 | 309 | } |
1570 | } | |
1571 | 185 | if new_children.len() <= 2 { // base only, or base and </mprescripts> |
1572 | 1 | mathml = as_element(new_children[0]); |
1573 | 184 | } else { |
1574 | 184 | mathml.replace_children(new_children); |
1575 | 184 | } |
1576 | 0 | } |
1577 | ||
1578 | 185 | return Some(mathml); |
1579 | ||
1580 | 0 | fn has_none_none_script_pair(children: &[ChildOfElement]) -> bool { |
1581 | 0 | let mut i = 1; |
1582 | 0 | let n = children.len(); |
1583 | 0 | while i < n { |
1584 | 0 | let child = as_element(children[i]); |
1585 | 0 | let child_name = name(child); |
1586 | 0 | if child_name == "mprescripts" { |
1587 | 0 | i += 1; |
1588 | 0 | } else if i+1 < n && child_name == "none" && name(as_element(children[i+1])) == "none" { |
1589 | 0 | return true; // found none, none pair |
1590 | 0 | } else { |
1591 | 0 | i += 2; |
1592 | 0 | } |
1593 | } | |
1594 | 0 | return false; |
1595 | 0 | } |
1596 | 185 | } |
1597 | ||
1598 | /// converts element if there is an empty subscript or superscript | |
1599 | 180 | fn clean_msubsup(mathml: Element) -> Element { |
1600 | 180 | let children = mathml.children(); |
1601 | 180 | let subscript = as_element(children[1]); |
1602 | 180 | let has_subscript = !(name(subscript) == "mtext" && |
1603 | 180 | let superscript = as_element(children[2]); |
1604 | 180 | let has_superscript = !(name(superscript) == "mtext" && |
1605 | 180 | if has_subscript && |
1606 | 171 | return mathml; |
1607 | 9 | } else if has_subscript { |
1608 | 6 | set_mathml_name(mathml, "msub"); |
1609 | 6 | let children = vec!(children[0], children[1]); |
1610 | 6 | mathml.replace_children(children); |
1611 | 6 | return mathml; |
1612 | 3 | } else if has_superscript { |
1613 | 3 | set_mathml_name(mathml, "msup"); |
1614 | 3 | let children = vec!(children[0], children[2]); |
1615 | 3 | mathml.replace_children(children); |
1616 | 3 | return mathml; |
1617 | } else { | |
1618 | 0 | return as_element(children[0]); // no scripts |
1619 | } | |
1620 | 180 | } |
1621 | ||
1622 | /// Split off the currency symbol from the rest of the text and return an mrow with the result | |
1623 | /// Assumes it has already checked and that we have a leaf | |
1624 | 12 | fn split_currency_symbol(leaf: Element) -> Option<Element> { |
1625 | 12 | assert!(is_leaf(leaf)); |
1626 | 12 | let text = as_text(leaf); |
1627 | 12 | assert!(contains_currency(text)); |
1628 | 12 | let mut iter = text.chars(); |
1629 | 12 | match (iter.next(), iter.next()) { |
1630 | 0 | (None, _) => return None, |
1631 | (Some(_), None) => { // 1 char | |
1632 | 9 | leaf.set_name("mi"); |
1633 | 9 | return Some(leaf); } |
1634 | (Some(_), Some(_)) => { // 2 or more chars | |
1635 | // WARNING: don't use 'leaf' in the mrow -- that detaches it from its parent and could shrink the number of children causing problems | |
1636 | 4 | if |
1637 | 3 | leaf.set_name("mn"); // make sure we create an mn (might be one already) |
1638 | 3 |
|
1639 | 3 | let first_ch = text.char_indices().next().map(|(i, ch)| &text[i..i + ch.len_utf8()]).unwrap(); |
1640 | 3 | if is_currency_symbol(first_ch.chars().next().unwrap()) { |
1641 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1642 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1643 | 1 | let currency_symbol = create_mathml_element(&leaf.document(), "mi"); |
1644 | 1 | currency_symbol.set_text(first_ch); |
1645 | 1 | mrow.append_child(currency_symbol); |
1646 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1647 | 1 | mrow.append_child(implied_times); |
1648 | 1 | let currency_amount = create_mathml_element(&leaf.document(), name(leaf)); |
1649 | 1 | currency_amount.set_text(&text[first_ch.len()..]); |
1650 | 1 | mrow.append_child(currency_amount); |
1651 | 1 | return Some(mrow); |
1652 | 2 | } |
1653 | 2 | let last_ch = text.char_indices().last().map(|(i, _)| &text[i..]).unwrap(); |
1654 | 2 | if is_currency_symbol(last_ch.chars().next().unwrap()) { |
1655 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1656 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1657 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1658 | 1 | mrow.append_child(implied_times); |
1659 | 1 | let currency_amount = create_mathml_element(&leaf.document(), name(leaf)); |
1660 | 1 | currency_amount.set_text(&text[..text.len()-last_ch.len()]); |
1661 | 1 | mrow.append_child(currency_amount); |
1662 | 1 | let currency_symbol = create_mathml_element(&leaf.document(), "mi"); |
1663 | 1 | currency_symbol.set_text(last_ch); |
1664 | 1 | mrow.append_child(currency_symbol); |
1665 | 1 | return Some(mrow); |
1666 | 1 | } |
1667 | // try to find it in the middle | |
1668 | 2 | for (byte_idx, ch) in |
1669 | 2 | if contains_currency(&text[byte_idx .. byte_idx + ch.len_utf8()]) { |
1670 | // get all the substrings | |
1671 | 1 | let first_part = &text[..byte_idx]; |
1672 | 1 | let currency_symbol = &text[byte_idx .. byte_idx + ch.len_utf8()]; |
1673 | 1 | let second_part = &text[byte_idx + ch.len_utf8() ..]; |
1674 | 1 | let mrow = create_mathml_element(&leaf.document(), "mrow"); |
1675 | 1 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1676 | 1 | let first_part_element = create_mathml_element(&leaf.document(), name(leaf)); |
1677 | 1 | first_part_element.set_text(first_part); |
1678 | 1 | mrow.append_child(first_part_element); |
1679 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1680 | 1 | mrow.append_child(implied_times); |
1681 | 1 | let currency_symbol_element = create_mathml_element(&leaf.document(), "mi"); |
1682 | 1 | currency_symbol_element.set_text(currency_symbol); |
1683 | 1 | mrow.append_child(currency_symbol_element); |
1684 | 1 | let implied_times = create_mo(leaf.document(), "\u{2062}", ADDED_ATTR_VALUE); |
1685 | 1 | mrow.append_child(implied_times); |
1686 | 1 | let second_part_element = create_mathml_element(&leaf.document(), name(leaf)); |
1687 | 1 | second_part_element.set_text(second_part); |
1688 | 1 | mrow.append_child(second_part_element); |
1689 | 1 | return Some(mrow); |
1690 | 1 | } |
1691 | } | |
1692 | 0 | return None |
1693 | } | |
1694 | } | |
1695 | 12 | } |
1696 | ||
1697 | /// If arg is "arc" (with optional space), merge the following element in if a trig function (sibling is deleted) | |
1698 | 11.8k | fn merge_arc_trig(leaf: Element) -> Option<Element> { |
1699 | 11.8k | assert!(is_leaf(leaf)); |
1700 | 11.8k | let leaf_text = as_text(leaf); |
1701 | 11.8k | if !(leaf_text == "arc" || |
1702 | 11.8k | return None; |
1703 | 2 | } |
1704 | ||
1705 | 2 | let following_siblings = leaf.following_siblings(); |
1706 | 2 | if following_siblings.is_empty() { |
1707 | 0 | return None; |
1708 | 2 | } |
1709 | ||
1710 | 2 | let following_sibling = as_element(following_siblings[0]); |
1711 | 2 | let following_sibling_name = name(following_sibling); |
1712 | 2 | if !(following_sibling_name == "mi" || |
1713 | 0 | return None; |
1714 | 2 | } |
1715 | ||
1716 | 2 | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1717 | // change "arc" "cos" to "arccos" -- we look forward because calling loop stores previous node | |
1718 | 2 | let following_text = as_text(following_sibling); |
1719 | 2 | if definitions.borrow().get_hashset("TrigFunctionNames").unwrap().contains(following_text) { |
1720 | 2 | let new_text = "arc".to_string() + following_text; |
1721 | 2 | set_mathml_name(leaf, "mi"); |
1722 | 2 | leaf.set_text(&new_text); |
1723 | 2 | following_sibling.remove_from_parent(); |
1724 | 2 | return Some(leaf); |
1725 | 0 | } |
1726 | 0 | return None; |
1727 | 2 | }) |
1728 | 11.8k | } |
1729 | ||
1730 | /// Convert "||" to "‖", if in single element or in repeated 'mo's (but not "|x||y|" or "{x ||x|>0}") | |
1731 | 305 | fn merge_vertical_bars(leaf: Element) -> Option<Element> { |
1732 | 305 | assert!(is_leaf(leaf)); |
1733 | 305 | let leaf_text = as_text(leaf); |
1734 | 305 | if leaf_text == "||" { |
1735 | 4 | leaf.set_text("‖"); // U+2016 |
1736 | 4 | return Some(leaf); |
1737 | 301 | } else if leaf_text != "|" { |
1738 | 0 | return None; |
1739 | 301 | } |
1740 | 301 | let following_siblings = leaf.following_siblings(); |
1741 | 301 | if following_siblings.is_empty() { |
1742 | 96 | return None; |
1743 | 205 | } |
1744 | ||
1745 | 205 | let following_sibling = as_element(following_siblings[0]); |
1746 | 205 | if name(following_sibling) != "mo" || |
1747 | 201 | return None |
1748 | 4 | } |
1749 | ||
1750 | // have "||" -- if there a single "|" on left, rule out merge | |
1751 | 4 | let preceding_siblings = leaf.preceding_siblings(); |
1752 | 5 | if |
1753 | 5 | let child = as_element(child); |
1754 | 5 | return name(child) == "mo" && |
1755 | 5 | }) { |
1756 | 1 | return None; // found "|" on left |
1757 | 3 | } |
1758 | ||
1759 | 3 | if following_siblings.len() > 1 { |
1760 | 2 | let following_siblings = &following_siblings[1..]; |
1761 | // if there are an odd number of "|"s to the right, rule out the merge | |
1762 | 8 | if !( |
1763 | 8 | let child = as_element(child); |
1764 | 8 | return name(child) == "mo" && |
1765 | 8 | }). |
1766 | 1 | return None; |
1767 | 1 | } |
1768 | 1 | } |
1769 | ||
1770 | // didn't find any | |
1771 | 2 | leaf.set_text("‖"); // U+2016 |
1772 | 2 | following_sibling.remove_from_parent(); |
1773 | 2 | return Some(leaf); |
1774 | 305 | } |
1775 | ||
1776 | /// merge a following mstyle that has the same attrs | |
1777 | 714 | fn merge_adjacent_similar_mstyles(mathml: Element) { |
1778 | 714 | if ELEMENTS_WITH_FIXED_NUMBER_OF_CHILDREN.contains(name(get_parent(mathml))) { |
1779 | // FIX: look to see if all of the children (might be more than just the adjacent one) have the same attr and then pull them up to the parent | |
1780 | 65 | return; // can't remove subsequent child |
1781 | 649 | } |
1782 | 649 | let following_siblings = mathml.following_siblings(); |
1783 | 649 | if following_siblings.is_empty() { |
1784 | 579 | return; |
1785 | 70 | } |
1786 | 70 | let following_element = as_element(following_siblings[0]); |
1787 | 70 | if name(following_element) != "mstyle" { |
1788 | 66 | return; |
1789 | 4 | } |
1790 | 4 | let are_same = mathml.attributes().iter() |
1791 | 4 | .zip( following_element.attributes() ) |
1792 | 5 | . |
1793 | 4 | if are_same { |
1794 | 4 | mathml.append_children(following_element.children()); |
1795 | 4 | following_element.remove_from_parent(); |
1796 | 4 |
|
1797 | 714 | } |
1798 | ||
1799 | 40 | fn convert_mfenced_to_mrow(mfenced: Element) -> Element { |
1800 | // The '<'/'>' replacements are because WIRIS uses them out instead of the correct chars in its template | |
1801 | 40 | let open = mfenced.attribute_value("open").unwrap_or("(").replace('<', "⟨"); |
1802 | 40 | let close = mfenced.attribute_value("close").unwrap_or(")").replace('>', "⟩"); |
1803 | // debug!("open={}, close={}", open, close); | |
1804 | 40 | let mut separators= mfenced.attribute_value("separators").unwrap_or(",").chars(); |
1805 | 40 | set_mathml_name(mfenced, "mrow"); |
1806 | 40 | mfenced.remove_attribute("open"); |
1807 | 40 | mfenced.remove_attribute("close"); |
1808 | 40 | mfenced.remove_attribute("separators"); |
1809 | 40 | let children = mfenced.children(); |
1810 | 40 | let mut new_children = Vec::with_capacity(2*children.len() + 1); |
1811 | 40 | if !open.is_empty() { |
1812 | 40 | new_children.push(ChildOfElement::Element( create_mo(mfenced.document(), &open, MFENCED_ATTR_VALUE)) ); |
1813 | 40 |
|
1814 | 40 | if !children.is_empty() { |
1815 | 40 | new_children.push(children[0]); |
1816 | 40 | for |
1817 | 3 | let sep = separators.next().unwrap_or(',').to_string(); |
1818 | 3 | new_children.push( ChildOfElement::Element( create_mo(mfenced.document(), &sep, MFENCED_ATTR_VALUE)) ); |
1819 | 3 | new_children.push(*child); |
1820 | 3 | } |
1821 | 0 | } |
1822 | 40 | if !close.is_empty() { |
1823 | 38 | new_children.push(ChildOfElement::Element( create_mo(mfenced.document(), &close, MFENCED_ATTR_VALUE)) ); |
1824 | 38 |
|
1825 | 40 | mfenced.replace_children(new_children); |
1826 | 40 | return mfenced; |
1827 | 40 | } |
1828 | ||
1829 | 30.4k | fn is_roman_number_match(text: &str) -> bool { |
1830 | 30.4k | return UPPER_ROMAN_NUMERAL.is_match(text) || |
1831 | 30.4k | } |
1832 | ||
1833 | /// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and | |
1834 | /// if its length is < 3 chars, then there is another roman numeral near it (separated by an operator). | |
1835 | /// We want to rule out something like 'm' or 'cm' being a roman numeral. | |
1836 | /// Note: this function assumes 'mathml' is a Roman Numeral, and optimizes operations based on that. | |
1837 | /// Note: Nemeth has some rules about roman numerals (capitalization and punctuation after) | |
1838 | 3.35k | fn is_roman_numeral_number_context(mathml: Element) -> bool { |
1839 | 3.35k | assert!(name(mathml)=="mtext" || |
1840 | 3.35k | let mut parent = mathml; |
1841 | loop { | |
1842 | 5.41k | parent = get_parent(parent); |
1843 | 5.41k | let current_name = name(parent); |
1844 | 5.41k | if current_name == "math" { |
1845 | 1.57k | break; |
1846 | 3.84k | } else if current_name == "msup" || |
1847 | // could be a oxidation state in a Chemical formula | |
1848 | 559 | let children = parent.children(); |
1849 | // make sure that there is only one script and that 'mathml' is a superscript | |
1850 | 559 | if current_name == "mmultiscripts" && ( |
1851 | 122 | return false; |
1852 | 437 | } |
1853 | 437 | let base = as_element(children[0]); |
1854 | 437 | if is_chemical_element(base) { |
1855 | 21 | break; |
1856 | } else { | |
1857 | 416 | return false; |
1858 | } | |
1859 | 3.28k | } else if current_name != "mrow" { |
1860 | 1.22k | return false; |
1861 | 2.06k | } |
1862 | } | |
1863 | ||
1864 | 1.59k | let text = as_text(mathml).as_bytes(); // note: we know it is all ASCII chars |
1865 | // if roman numeral is in superscript and we get here, then it had a chemical element base, so we accept it | |
1866 | // note: you never has a state = I; if two letters, it must be 'II'. | |
1867 | 1.59k | if text.len() > 2 || |
1868 | 1.57k | ((name(parent) =="msup" || |
1869 | 28 | return true; |
1870 | } else { | |
1871 | 1.56k | let is_upper_case = text[0].is_ascii_uppercase(); // safe since we know it is a roman numeral |
1872 | 1.56k | let preceding = mathml.preceding_siblings(); |
1873 | 1.56k | let following = mathml.following_siblings(); |
1874 | 1.56k | if preceding.is_empty() && |
1875 | 81 | return false; // no context and too short to confirm it is a roman numeral |
1876 | 1.48k | } |
1877 | 1.48k | if preceding.is_empty() { |
1878 | 275 | return is_roman_numeral_adjacent(following.iter(), is_upper_case); |
1879 | 1.21k | } |
1880 | 1.21k | if following.is_empty() { |
1881 | 399 | return is_roman_numeral_adjacent(preceding.iter().rev(), is_upper_case); |
1882 | 813 | } |
1883 | 813 | return is_roman_numeral_adjacent(preceding.iter().rev(), is_upper_case) && |
1884 | 3 | is_roman_numeral_adjacent(following.iter(), is_upper_case); |
1885 | } | |
1886 | ||
1887 | /// make sure all the non-mo leaf siblings are roman numerals | |
1888 | /// 'mo' should only be '+', '-', '=', ',', '.' -- unlikely someone is doing anything sophisticated | |
1889 | 1.49k | fn is_roman_numeral_adjacent<'a, I>(siblings: I, must_be_upper_case: bool) -> bool |
1890 | 1.49k | where I: Iterator<Item = &'a ChildOfElement<'a>> { |
1891 | static ROMAN_NUMERAL_OPERATORS: phf::Set<&str> = phf_set! { | |
1892 | "+", "-'", "=", "<", "≤", ">", "≥", | |
1893 | // ",", ".", // [c,d] triggers this if "," is present, so omitting it | |
1894 | }; | |
1895 | 1.49k | let mut found_match = false; // guard against no siblings |
1896 | 1.49k | let mut last_was_roman_numeral = true; // started at roman numeral |
1897 | // debug!("start is_roman_numeral_adjacent"); | |
1898 | 1.74k | for child in |
1899 | 1.74k | let maybe_roman_numeral = as_element(*child); |
1900 | // debug!("maybe_roman_numeral: {}", mml_to_string(maybe_roman_numeral)); | |
1901 | 1.74k | match name(maybe_roman_numeral) { |
1902 | 1.74k | "mo" => { |
1903 | 858 | if !last_was_roman_numeral { |
1904 | 18 | return false; |
1905 | 840 | } |
1906 | 840 | let text = as_text(maybe_roman_numeral); |
1907 | 840 | if !ROMAN_NUMERAL_OPERATORS.contains(text) { |
1908 | 660 | return false; |
1909 | 180 | } |
1910 | 180 | last_was_roman_numeral = false; |
1911 | }, | |
1912 | 889 | "mi" | |
1913 | 562 | if last_was_roman_numeral { |
1914 | 429 | return false; // no implicit multiplication (or whatever) |
1915 | 133 | } |
1916 | 133 | let text = as_text(maybe_roman_numeral); |
1917 | 133 | if !(( must_be_upper_case && |
1918 | 117 | (!must_be_upper_case && |
1919 | 109 | return false; |
1920 | 24 | }; |
1921 | 24 | found_match = true; |
1922 | 24 | last_was_roman_numeral = true; |
1923 | }, | |
1924 | 327 | "mtext" | |
1925 | _ => { | |
1926 | 252 | return false; |
1927 | } | |
1928 | } | |
1929 | } | |
1930 | 22 | return found_match; |
1931 | 1.49k | } |
1932 | 3.35k | } |
1933 | ||
1934 | /// Merge adjacent mtext by increasing the width of the first mtext | |
1935 | /// The resulting merged whitespace is put on the previous child, or if there is one, on the following child | |
1936 | /// | |
1937 | /// Note: this should be called *after* the mo/mtext cleanup (i.e., after the MathML child cleanup loop). | |
1938 | 10.1k | fn merge_whitespace(children: &mut Vec<ChildOfElement>) { |
1939 | 10.1k | if children.is_empty() { |
1940 | 3 | return; |
1941 | 10.1k | } |
1942 | ||
1943 | 10.1k | let mut i = 0; |
1944 | 10.1k | let mut previous_mtext_with_width: Option<Element<'_>> = None; // prefer to spacing on previous mtext |
1945 | 10.1k | let mut whitespace: Option<f64> = None; |
1946 | 42.0k | while i < children.len() { |
1947 | 31.8k | let child = as_element(children[i]); |
1948 | 31.8k | let is_child_whitespace = name(child) == "mtext" && |
1949 | // debug!("merge_whitespace: i={}, whitespace={:?}, mtext set={} {}", | |
1950 | // i, whitespace, previous_mtext_with_width.is_some(), mml_to_string(child)); | |
1951 | 31.8k | if is_child_whitespace { |
1952 | // update the running total of whitespace | |
1953 | 340 | let child_width = child.attribute_value("data-width").unwrap_or("0") |
1954 | 340 | .parse::<f64>().unwrap_or(0.0) ; |
1955 | 340 | whitespace = match whitespace { |
1956 | 327 | None => Some(child_width), |
1957 | 13 | Some(w) => Some(w + child_width), |
1958 | }; | |
1959 | 340 | if children.len() == 1 { |
1960 | 15 | i += 1; // don't remove only child |
1961 | 325 | } else { |
1962 | 325 | children.remove(i); // remove the current child (don't inc 'i') |
1963 | 325 | } |
1964 | 31.5k | } else if let Some( |
1965 | // done with sequence of whitespaces | |
1966 | 305 | if let Some( |
1967 | 13 | // prefer to set on previous mtext |
1968 | 13 | prev_mtext.set_attribute_value("data-following-space-width", (ws).to_string().as_str()); |
1969 | 13 | previous_mtext_with_width = None; |
1970 | 13 | } else { |
1971 | // if the space is significant, set it on the current child | |
1972 | 292 | child.set_attribute_value("data-previous-space-width", ws.to_string().as_str()); |
1973 | 292 | if name(child) == "mtext" { |
1974 | 18 | previous_mtext_with_width = Some(child); |
1975 | 274 | } |
1976 | } | |
1977 | 305 | whitespace = None; |
1978 | 305 | i += 1; |
1979 | 31.2k | } else { |
1980 | 31.2k | i += 1; |
1981 | 31.2k | previous_mtext_with_width = None; |
1982 | 31.2k | } |
1983 | } | |
1984 | // debug!(" after loop: whitespace={:?}, {}", whitespace, mml_to_string(as_element(children[children.len()-1]))); | |
1985 | 10.1k | if let Some( |
1986 | // last child in mrow is white space -- mark with space *after* | |
1987 | 22 | if children.len() == 1 { |
1988 | // only child -- check to see if we need to set the space-width | |
1989 | 21 | let child = as_element(children[0]); |
1990 | 21 | let child_width = child.attribute_value("data-width").unwrap_or("0").parse::<f64>().unwrap_or(0.0); |
1991 | 21 | if (child_width - ws).abs() > 0.001 { |
1992 | 9 | ws += child_width; |
1993 | 9 | child.set_attribute_value("data-following-space-width", ws.to_string().as_str()); |
1994 | 12 | } |
1995 | 1 | } else { |
1996 | 1 | let non_space_child = as_element(children[children.len()-1]); |
1997 | 1 | non_space_child.set_attribute_value("data-following-space-width", ws.to_string().as_str()); |
1998 | 1 | } |
1999 | 10.1k | } |
2000 | 10.1k | } |
2001 | ||
2002 | /// look for potential numbers by looking for sequences with commas, spaces, and decimal points | |
2003 | 10.1k | fn merge_number_blocks(context: &CanonicalizeContext, parent_mrow: Element, children: &mut Vec<ChildOfElement>) { |
2004 | // debug!("parent:\n{}", mml_to_string(parent_mrow)); | |
2005 | // If we find a comma that is not part of a number, don't form a number | |
2006 | // (see https://github.com/NSoiffer/MathCAT/issues/271) | |
2007 | // Unfortunately, we can't do this in the loop below because we might discover the "not part of a number" after a number has been formed | |
2008 | 10.1k | let do_not_merge_comma = is_comma_not_part_of_a_number(children); |
2009 | 10.1k | let mut i = 0; |
2010 | 38.2k | while i < children.len() { // length might change after a merge |
2011 | // { | |
2012 | // debug!("merge_number_blocks: top of loop"); | |
2013 | // for (i_child, &child) in children[i..].iter().enumerate() { | |
2014 | // let child = as_element(child); | |
2015 | // debug!("child #{}: {}", i+i_child, mml_to_string(child)); | |
2016 | // } | |
2017 | // } | |
2018 | 28.0k | let child = as_element(children[i]); |
2019 | 28.0k | let child_name = name(child); |
2020 | ||
2021 | // numbers start with an mn or a decimal separator | |
2022 | 28.0k | if child_name == "mn" || |
2023 | 6.09k | let leaf_child_text = as_text(child); |
2024 | // if Roman numeral, don't merge (move on) | |
2025 | // or if the 'mn' has ',', '.', or space, consider it correctly parsed and move on | |
2026 | 6.09k | if is_roman_number_match(leaf_child_text) || |
2027 | 5.75k | context.patterns.block_separator.is_match(leaf_child_text) || |
2028 | 5.64k | (leaf_child_text.len() > 1 && |
2029 | 559 | i += 1; |
2030 | 559 | continue; |
2031 | 5.53k | } |
2032 | 21.9k | } else if child_name != "mo" || |
2033 | 9.20k | (do_not_merge_comma && |
2034 | 6.44k | !context.patterns.decimal_separator.is_match(as_text(child)) { |
2035 | 21.9k | i += 1; |
2036 | 21.9k | continue; |
2037 | 31 | } |
2038 | ||
2039 | // potential start of a number | |
2040 | 5.56k | let mut end = i + 1; |
2041 | 5.56k | let mut has_decimal_separator = false; |
2042 | 5.56k | let mut not_a_number = false; |
2043 | 5.56k | if i < children.len() { |
2044 | // look at the right siblings and pull in the longest sequence of number/separators -- then check it for validity | |
2045 | 5.56k | for |
2046 | 4.00k | let sibling = as_element(*sibling); |
2047 | 4.00k | let sibling_name = name(sibling); |
2048 | 4.00k | if sibling_name == "mn" { |
2049 | 245 | let leaf_text = as_text(sibling); |
2050 | 245 | let is_block_separator = context.patterns.block_separator.is_match(leaf_text); |
2051 | 245 | let is_decimal_separator = context.patterns.decimal_separator.is_match(leaf_text); |
2052 | 245 | if is_roman_number_match(leaf_text) || is_block_separator || is_decimal_separator { |
2053 | // consider this mn correctly parsed | |
2054 | 1 | break; |
2055 | 244 | } |
2056 | 3.75k | } else if sibling_name=="mo" || |
2057 | 1.33k | let leaf_text = as_text(sibling); |
2058 | 1.33k | let is_block_separator = context.patterns.block_separator.is_match(leaf_text); |
2059 | 1.33k | let is_decimal_separator = context.patterns.decimal_separator.is_match(leaf_text); |
2060 | 1.33k | if (leaf_text == "," && |
2061 | 1.14k | !(is_block_separator || |
2062 | 261 | (is_decimal_separator && |
2063 | // not a separator or (it is decimal separator and we've already seen a decimal separator) | |
2064 | 1.09k | not_a_number = is_decimal_separator && |
2065 | 1.09k | break; |
2066 | 244 | } |
2067 | 244 | has_decimal_separator |= is_decimal_separator; |
2068 | } else { | |
2069 | // not mn, mo, or mtext -- end of a number | |
2070 | 2.41k | break; |
2071 | } | |
2072 | 488 | end += 1; // increment at end so we can tell the difference between a 'break' and end of loop |
2073 | } | |
2074 | 0 | } |
2075 | 5.56k | if not_a_number { |
2076 | 17 | i = end + 1; |
2077 | 17 | continue; // continue looking in the rest of the mrow |
2078 | 5.55k | } |
2079 | 5.55k | if ignore_final_punctuation(context, parent_mrow, &children[i..end]) { |
2080 | 18 | end -= 1; |
2081 | 5.53k | }; |
2082 | // debug!("start={}, end={}", i, end); | |
2083 | // no need to merge if only one child (also avoids "." being considered a number) | |
2084 | 5.55k | if end > i + 1 && |
2085 | 107 | (i, end) = trim_whitespace(children, i, end); |
2086 | 107 | merge_block(children, i, end); |
2087 | 107 | // note: start..end has been collapsed, so restart after the collapsed part |
2088 | 5.44k | } else { |
2089 | 5.44k | i = end; // start looking at the end of the block we just rejected |
2090 | 5.44k | } |
2091 | 5.55k | i += 1; |
2092 | } | |
2093 | 10.1k | } |
2094 | ||
2095 | /// Return true if we find a comma that doesn't have an <mn> on both sides | |
2096 | 10.1k | fn is_comma_not_part_of_a_number(children: &[ChildOfElement])-> bool { |
2097 | 10.1k | let n_children = children.len(); |
2098 | 10.1k | if n_children == 0 { |
2099 | 3 | return false; |
2100 | 10.1k | } |
2101 | 10.1k | let mut previous_child = as_element(children[0]); |
2102 | 14.5k | for i in |
2103 | 14.5k | let child = as_element(children[i]); |
2104 | 14.5k | if name(child) == "mo" && |
2105 | 972 | (name(previous_child) != "mn" || |
2106 | 809 | return true; |
2107 | 13.7k | } |
2108 | 13.7k | previous_child = child; |
2109 | } | |
2110 | 9.37k | return false; |
2111 | 10.1k | } |
2112 | ||
2113 | /// If we have something like 'shape' ABC, we split the ABC and add IMPLIED_SEPARATOR_HIGH_PRIORITY between them | |
2114 | /// under some specific conditions (trying to be a little cautious). | |
2115 | /// The returned (mrow) element reuses the arg so tree siblings links remain correct. | |
2116 | 11.8k | fn split_points(leaf: Element) -> Option<Element> { |
2117 | 3 | static IS_UPPERCASE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z]+$").unwrap()); |
2118 | ||
2119 | 11.8k | if !IS_UPPERCASE.is_match(as_text(leaf)) { |
2120 | 9.88k | return None; |
2121 | 1.96k | } |
2122 | ||
2123 | // check to see if there is a bar, arrow, etc over the letters (line-segment, arc, ...) | |
2124 | 1.96k | let parent = get_parent(leaf); |
2125 | 1.96k | if name(parent) == "mover" { |
2126 | // look for likely overscripts (basically just rule out some definite 'no's) | |
2127 | 29 | let over = as_element(parent.children()[1]); |
2128 | 29 | if is_leaf(over) { |
2129 | 29 | let mut over_chars = as_text(over).chars(); |
2130 | 29 | let first_char = over_chars.next(); |
2131 | 29 | if first_char.is_some() && over_chars.next().is_none() && !first_char.unwrap().is_alphanumeric(){ |
2132 | // only one char and it isn't alphanumeric | |
2133 | 29 | return Some( split_element(leaf) ); |
2134 | 0 | } |
2135 | 0 | } |
2136 | 1.93k | } |
2137 | ||
2138 | // check to see if it is preceded by a geometric shape (e.g, ∠ABC) | |
2139 | 1.93k | let preceding_siblings = leaf.preceding_siblings(); |
2140 | 1.93k | if !preceding_siblings.is_empty() { |
2141 | 1.11k | let preceding_sibling = as_element(preceding_siblings[preceding_siblings.len()-1]); |
2142 | 1.11k | let preceding_sibling_name = name(preceding_sibling); |
2143 | 1.11k | if preceding_sibling_name == "mi" || |
2144 | 711 | let preceding_text = as_text(preceding_sibling); |
2145 | 711 | return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
2146 | 711 | let defs = definitions.borrow(); |
2147 | 711 | let prefix_ops = defs.get_hashset("GeometryPrefixOperators").unwrap(); |
2148 | 711 | let shapes = defs.get_hashset("GeometryShapes").unwrap(); |
2149 | 711 | if prefix_ops.contains(preceding_text) || |
2150 | // split leaf | |
2151 | 9 | return Some( split_element(leaf) ); // always treated as function names |
2152 | } else { | |
2153 | 702 | return None; |
2154 | } | |
2155 | 711 | }) |
2156 | 407 | } |
2157 | 817 | } |
2158 | 1.22k | return None; |
2159 | ||
2160 | 38 | fn split_element(leaf: Element) -> Element { |
2161 | 38 | let mut children = Vec::with_capacity(leaf.children().len()); |
2162 | 51 | for ch in |
2163 | 51 | let new_leaf = create_mathml_element(&leaf.document(), "mi"); |
2164 | 51 | new_leaf.set_text(&ch.to_string()); |
2165 | 51 | children.push(new_leaf); |
2166 | 51 | } |
2167 | 38 | set_mathml_name(leaf, "mrow"); |
2168 | 38 | leaf.replace_children(children); |
2169 | 38 | return leaf; |
2170 | 38 | } |
2171 | 11.8k | } |
2172 | ||
2173 | /// If we have something like 'V e l o c i t y', merge that into a single <mi> | |
2174 | /// We only do this for sequences of at least three chars, and also exclude things like consecutive letter (e.g., 'x y z') | |
2175 | /// The returned (mi) element reuses 'mi' | |
2176 | 11.4k | fn merge_mi_sequence(mi: Element) -> Option<Element> { |
2177 | // The best solution would be to use a dictionary of words, or maybe restricted to words in a formula, | |
2178 | // but that would likely miss the words used in slope=run/rise. | |
2179 | // It would also be really expensive since we would need a dictionary for each language. | |
2180 | // We shouldn't need to worry about trig names like "cos", but people sometimes forget to use "\cos" | |
2181 | // Hence, we check against the "FunctionNames" that get read on startup. | |
2182 | 70 | fn is_vowel(ch: char) -> bool { |
2183 | 70 |
|
2184 | 'a' | 'e' | 'i' | 'o' | 'u' | 'y' | | |
2185 | 'à' | 'á' | 'â' | 'ã' | 'ä' | 'è' | 'é' | 'ê' | 'ë' | 'ì' | 'í' | 'î' | 'ï' | | |
2186 | 'ò' | 'ó' | 'ô' | 'õ' | 'ö' | 'ú' | 'Ù' | 'û' | 'ü' | 'ý' | 'ÿ' | | |
2187 | 'ả' | 'ạ' | 'ă' | 'ằ' | 'ẳ' | 'ẵ' | 'ắ' | 'ặ' | 'ầ' | 'ẩ' | 'ẫ' | 'ấ' | 'ậ' | 'ẻ' | 'ẽ' | 'ẹ' | 'ề' | 'ể' | 'ễ' | 'ế' | 'ệ' | | |
2188 | 'ỉ' | 'ĩ' | 'ị' | 'ỏ' | 'ọ' | 'ồ' | 'ổ' | 'ỗ' | 'ố' | 'ộ' | 'ơ' | 'ờ' | 'ở' | 'ỡ' | 'ớ' | 'ợ' | | |
2189 | 'ủ' | 'ũ' | 'ụ' | 'ư' | 'ừ' | 'ử' | 'ữ' | 'ứ' | 'ự' | 'ỳ' | 'ỷ' | 'ỹ' | 'ỵ' | |
2190 | ) | |
2191 | 70 | } |
2192 | 11.4k | let parent = get_parent(mi); // not canonicalized into mrows, so parent could be "math" |
2193 | 11.4k | let parent_name = name(parent); |
2194 | // don't merge if more than one char, or if not in an mrow (or implied on since we haven't normalized yet) | |
2195 | 11.4k | if as_text(mi).chars().nth(1).is_some() || !( |
2196 | 5.16k | return None; |
2197 | 6.25k | } |
2198 | 6.25k | let mut text = as_text(mi).to_string(); |
2199 | 6.25k | let text_script = Script::from(text.chars().next().unwrap_or('a')); |
2200 | 6.25k | let following_siblings = mi.following_siblings(); |
2201 | 6.25k | let mut last_char_is_scripted = None; |
2202 | 6.25k | let mut following_mi_siblings: Vec<Element> = following_siblings.iter() |
2203 | 6.25k | .map_while(|&child| |
2204 | 4.15k | let mut child = as_element(child); |
2205 | 4.15k | let mut is_ok = false; |
2206 | 4.15k | if name(child) == "msub" || |
2207 | // check if the *last* char in the sequence is scripted | |
2208 | // if so, we need to stop here anyway and deal with it specially | |
2209 | 163 | last_char_is_scripted = Some(child); // need to remember the value -- cleared later if not ok |
2210 | 163 | child = as_element(child.children()[0]); |
2211 | 233 | while name(child) == "mrow" && |
2212 | 70 | // the base may be wrapped with mrows |
2213 | 70 | child = as_element(child.children()[0]); |
2214 | 70 | } |
2215 | 3.99k | } |
2216 | 4.15k | if name(child) == "mi" { |
2217 | 402 | let mut child_text = as_text(child).chars(); |
2218 | 402 | let first_char = child_text.next().unwrap_or('a'); |
2219 | 402 | if child_text.next().is_none() && |
2220 | 365 | text.push(first_char); |
2221 | 365 | is_ok = true; |
2222 | 365 |
|
2223 | 3.75k | } |
2224 | 4.15k | if last_char_is_scripted.is_some() { |
2225 | 163 | if is_ok { |
2226 | 114 | is_ok = false; // don't want to continue |
2227 | 114 | } else { |
2228 | 49 | last_char_is_scripted = None; // reset to None |
2229 | 49 | } |
2230 | 3.99k | } |
2231 | 4.15k | if is_ok { |
2232 | 4.15k | }) |
2233 | 6.25k | .collect(); |
2234 | 6.25k | if following_mi_siblings.is_empty() { |
2235 | 6.03k | return None; |
2236 | 224 | } |
2237 | ||
2238 | 224 | if let Some( |
2239 | 14 | // add the last char to the run |
2240 | 14 | following_mi_siblings.push(last); |
2241 | 210 | } |
2242 | // debug!("merge_mi_sequence: text={}", &text); | |
2243 | 224 | if let Some( |
2244 | 224 | let definitions = definitions.borrow(); |
2245 | 224 | let function_names = definitions.get_hashset("FunctionNames").unwrap(); |
2246 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
2247 | // function name might be (wrongly) set to italic math alphanumeric chars, including bold italic | |
2248 | 224 | if let Some( |
2249 | 221 | && function_names.contains(&ascii_text.to_lowercase()) { |
2250 | 10 | return Some(merge_from_text(mi, &ascii_text, &following_mi_siblings)); |
2251 | 214 | } |
2252 | 214 | if function_names.contains(&text) { |
2253 | 0 | return Some(merge_from_text(mi, &text, &following_mi_siblings)); |
2254 | 214 | } |
2255 | // unlike "FunctionNames", "KnownWords" might not exist | |
2256 | 214 | if let Some( |
2257 | 131 | && word_map.contains(&text) { |
2258 | 1 | return Some(merge_from_text(mi, &text, &following_mi_siblings)); |
2259 | 213 | } |
2260 | 213 | return None; |
2261 | 224 | }) { |
2262 | 11 | return answer; |
2263 | 213 | } |
2264 | ||
2265 | // don't be too aggressive combining mi's when they are short | |
2266 | 213 | if text.chars().count() < 3 { |
2267 | 186 | return None; |
2268 | 27 | } |
2269 | // If it is a word, it needs a vowel and it must be a letter | |
2270 | // FIX: this check needs to be internationalized to include accented vowels, other alphabets | |
2271 | 70 | if ! |
2272 | 15 | return None; |
2273 | 12 | } |
2274 | ||
2275 | // now for some heuristics to rule out a sequence of variables | |
2276 | // rule out sequences like 'abc' and also 'axy' that are in alphabetical order | |
2277 | 12 | let mut chars = text.chars(); |
2278 | 12 | let mut left = chars.next().unwrap(); // at least 3 chars |
2279 | 12 | let mut is_in_alphabetical_order = true; |
2280 | 23 | for ch in |
2281 | 23 | if (left as u32) >= (ch as u32) { |
2282 | 3 | is_in_alphabetical_order = false; |
2283 | 3 | break; // can't be 'abc', 'axy', etc |
2284 | 20 | } |
2285 | 20 | left = ch; |
2286 | } | |
2287 | 12 | if is_in_alphabetical_order || |
2288 | // If it is in alphabetical order, it's not likely a word | |
2289 | 12 | return None; |
2290 | 0 | } |
2291 | ||
2292 | // FIX: should add more heuristics to rule out words | |
2293 | 0 | return merge_from_text(mi, &text, &following_mi_siblings); |
2294 | ||
2295 | 11 | fn merge_from_text<'a>(mi: Element<'a>, text: &str, following_siblings: &[Element<'a>]) -> Option<Element<'a>> { |
2296 | // remove trailing mi's | |
2297 | 11 | let i_last_child = following_siblings.len()-1; |
2298 | 11 | let last_child = following_siblings[i_last_child]; |
2299 | 11 | if name(last_child) == "mi" { |
2300 | 10 |
|
2301 | 5 | mi.set_text(text); |
2302 | 5 | return Some(mi); |
2303 | } else { | |
2304 | // replace the base of the scripted element (the last child) with the run (e.g. 's i n^2' -> {sin}^2) | |
2305 | 6 | mi.remove_from_parent(); |
2306 | 6 | following_siblings[..i_last_child].iter().for_each(|sibling| sibling.remove_from_parent()); |
2307 | 6 | let mut base = as_element(last_child.children()[0]); |
2308 | 9 | while name(base) == "mrow" && |
2309 | 3 | // the base may be wrapped with mrows |
2310 | 3 | base = as_element(base.children()[0]); |
2311 | 3 | base.remove_attribute(SPLIT_TOKEN); |
2312 | 3 | } |
2313 | 6 | base.set_text(text); |
2314 | 6 | return Some(last_child); |
2315 | } | |
2316 | 11 | } |
2317 | 11.4k | } |
2318 | ||
2319 | // Check if start..end is a number | |
2320 | 275 | fn is_likely_a_number(context: &CanonicalizeContext, mrow: Element, children: &[ChildOfElement]) -> bool { |
2321 | // Note: the children of math_or_mrow aren't valid ('children' represents the current state) | |
2322 | 275 | let end = children.len(); |
2323 | // { | |
2324 | // let n_preceding_siblings = as_element(children[0]).preceding_siblings().len(); | |
2325 | // debug!("is_likely_a_number: start/end={}/{}", n_preceding_siblings, n_preceding_siblings+end); | |
2326 | // for (i, &child) in children.iter().enumerate() { | |
2327 | // let child = as_element(child); | |
2328 | // debug!("child# {}: {}", n_preceding_siblings+i, mml_to_string(child)); | |
2329 | // } | |
2330 | // debug!("\n"); | |
2331 | // } | |
2332 | ||
2333 | // gather up the text of the children (all mn, mo, or mtext) | |
2334 | 275 | let mut previous_name_was_mn = false; |
2335 | 275 | let mut text = "".to_string(); |
2336 | 727 | for &child in |
2337 | 727 | let child = as_element(child); |
2338 | 727 | let child_name = name(child); |
2339 | 727 | if previous_name_was_mn && |
2340 | 94 | text.push('\u{FFFF}'); // FIX: this should come from the separator string |
2341 | 633 | } |
2342 | 727 | text.push_str(as_text(child)); |
2343 | 727 | previous_name_was_mn = child_name == "mn"; |
2344 | } | |
2345 | ||
2346 | 275 | let text = text.trim(); // could be space got merged into an mn (e.g., braille::UEB::iceb::expr_3_1_6) |
2347 | // debug!(" text='{}', decimal num={}, 3 digit match={}, 3-5 match={}, 1 digit={}", &text, | |
2348 | // context.patterns.digit_only_decimal_number.is_match(text), | |
2349 | // context.patterns.block_3digit_pattern.is_match(text), | |
2350 | // context.patterns.block_3_5digit_pattern.is_match(text), | |
2351 | // context.patterns.block_1digit_pattern.is_match(text)); | |
2352 | 275 | if !(context.patterns.digit_only_decimal_number.is_match(text) || |
2353 | 190 | context.patterns.block_3digit_pattern.is_match(text) || |
2354 | 167 | context.patterns.block_3_5digit_pattern.is_match(text) || |
2355 | 166 | context.patterns.block_4digit_hex_pattern.is_match(text) || |
2356 | 162 | ( (text.chars().count() > 5 || |
2357 | 25 | context.patterns.block_1digit_pattern.is_match(text) ) |
2358 | ) { | |
2359 | 161 | return false; |
2360 | 114 | } |
2361 | ||
2362 | // ??? might want to rule out "sequences" like '100, 200, 300' and '100, 103, 106' (if constant difference, then a sequence) | |
2363 | ||
2364 | // If surrounded by fences, and commas are used, leave as is (e.g, "{1,234}") | |
2365 | 114 | if !text.contains(',') { |
2366 | 84 | return true; // not comma separated |
2367 | 30 | } |
2368 | ||
2369 | // We have already checked for whitespace as separators, so it must be a comma. Just check the fences. | |
2370 | // This is not yet in canonical form, so the fences may be siblings or siblings of the parent | |
2371 | 30 | let preceding_siblings = as_element(children[0]).preceding_siblings(); |
2372 | 30 | let following_siblings = as_element(children[end-1]).following_siblings(); |
2373 | let first_child; | |
2374 | let last_child; | |
2375 | 30 | if preceding_siblings.is_empty() && |
2376 | // number spans all children, look to parent for fences | |
2377 | 14 | let preceding_children = mrow.preceding_siblings(); |
2378 | 14 | let following_children = mrow.following_siblings(); |
2379 | 14 | if preceding_children.is_empty() || |
2380 | 9 | return true; // doesn't have left or right fence |
2381 | 5 | } |
2382 | 5 | first_child = preceding_children[preceding_children.len()-1]; |
2383 | 5 | last_child = following_children[0]; |
2384 | 16 | } else if preceding_siblings.is_empty() || |
2385 | 13 | return true; // can't be fences around it |
2386 | 3 | } else { |
2387 | 3 | first_child = preceding_siblings[preceding_siblings.len()-1]; |
2388 | 3 | last_child = following_siblings[0]; |
2389 | 3 | } |
2390 | 8 | let first_child = as_element(first_child); |
2391 | 8 | let last_child = as_element(last_child); |
2392 | 8 | return !(name(first_child) == "mo" && is_fence(first_child) && |
2393 | 7 | name(last_child) == "mo" && is_fence(last_child) ); |
2394 | 275 | } |
2395 | ||
2396 | // fn count_decimal_pts(context: &CanonicalizeContext, children: &[ChildOfElement], start: usize, end: usize) -> usize { | |
2397 | // let mut n_decimal_pt = 0; | |
2398 | // for &child_as_element in children.iter().take(end).skip(start) { | |
2399 | // let child = as_element(child_as_element); | |
2400 | // if context.patterns.decimal_separator.is_match(as_text(child)) { | |
2401 | // n_decimal_pt += 1; | |
2402 | // } | |
2403 | // } | |
2404 | // return n_decimal_pt; | |
2405 | // } | |
2406 | ||
2407 | /// This is a special case heuristic so try and determine if a terminating punctuation should be a decimal separator | |
2408 | /// Often math expressions end with punctuations for typographic reasons, so we try to figure that out here. | |
2409 | /// 'children' is a subset of 'mrow' | |
2410 | 5.55k | fn ignore_final_punctuation(context: &CanonicalizeContext, mrow: Element, children: &[ChildOfElement]) -> bool { |
2411 | 5.55k | let last_child = children[children.len()-1]; |
2412 | 5.55k | if mrow.children()[mrow.children().len()-1] != last_child { |
2413 | 3.49k | return false; // not at end |
2414 | 2.05k | } |
2415 | 2.05k | let parent = mrow.parent().unwrap().element(); |
2416 | 2.05k | if let Some( |
2417 | 1.71k | && name(math) != "math" { |
2418 | 1.58k | return false; // mrow inside something else -- not at end |
2419 | 471 | } |
2420 | ||
2421 | 471 | let last_child = as_element(last_child); |
2422 | // debug!("ignore_final_punctuation: last child={}", mml_to_string(last_child)); | |
2423 | 471 | if name(last_child) != "mo" { |
2424 | 451 | return false; // last was not "mo", so can't be a period |
2425 | 20 | } |
2426 | ||
2427 | 20 | if !context.patterns.decimal_separator.is_match(as_text(last_child)) { |
2428 | 0 | return false; |
2429 | 20 | } |
2430 | ||
2431 | // debug!("ignore_final_punctuation: #preceding={}", as_element(children[0]).preceding_siblings().len()); | |
2432 | // look to preceding siblings and see if an of the mn's have a decimal separator | |
2433 | 20 | return !as_element(children[0]).preceding_siblings().iter() |
2434 | 101 | . |
2435 | 101 | let child = as_element(child); |
2436 | 101 | name(child) == "mn" && |
2437 | 101 | }); |
2438 | 5.55k | } |
2439 | ||
2440 | /// Trim off any children that are whitespace on either side | |
2441 | 107 | fn trim_whitespace(children: &mut [ChildOfElement], start: usize, end: usize) -> (usize, usize) { |
2442 | 107 | let mut real_start = start; |
2443 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
2444 | 107 | for i in start..end { |
2445 | 107 | let child = as_element(children[i]); |
2446 | 107 | if !as_text(child).trim().is_empty() { |
2447 | 107 | real_start = i; |
2448 | 107 | break; |
2449 | 0 | } |
2450 | } | |
2451 | ||
2452 | 107 | let mut real_end = end; |
2453 | 157 | for i in ( |
2454 | 157 | let child = as_element(children[i]); |
2455 | 157 | if !as_text(child).trim().is_empty() { |
2456 | 107 | real_end = i+1; |
2457 | 107 | break; |
2458 | 50 | } |
2459 | } | |
2460 | 107 | return (real_start, real_end); |
2461 | 107 | } |
2462 | ||
2463 | /// Merge the number block from start..end | |
2464 | 107 | fn merge_block(children: &mut Vec<ChildOfElement>, start: usize, end: usize) { |
2465 | ||
2466 | // debug!("merge_block: merging {}..{}", start, end); | |
2467 | 107 | let mut mn_text = String::with_capacity(4*(end-start)-1); // true size less than #3 digit blocks + separator |
2468 | 237 | for &child_as_element in |
2469 | 237 | let child = as_element(child_as_element); |
2470 | 237 | mn_text.push_str(as_text(child)); |
2471 | 237 | } |
2472 | 107 | let child = as_element(children[start]); |
2473 | 107 | set_mathml_name(child, "mn"); |
2474 | 107 | child.set_text(&mn_text); |
2475 | ||
2476 | 107 | children.drain(start+1..end); |
2477 | 107 | } |
2478 | ||
2479 | ||
2480 | /// merge ° C or ° F into a single <mi> with the text '℃' or '℉' -- prevents '°' from becoming a superscript | |
2481 | #[allow(non_snake_case)] | |
2482 | 5.90k | fn merge_degrees_C_F<'a>(mrow: Element<'a>) -> Element<'a> { |
2483 | 5.90k | let mut degree_child = None; |
2484 | 28.1k | for child in |
2485 | 28.1k | let child = as_element(child); |
2486 | 28.1k | if is_leaf(child) { |
2487 | 23.9k | match as_text(child) { |
2488 | 23.9k | "°" => { |
2489 | 34 | degree_child = Some(child); |
2490 | 34 | }, |
2491 | 23.9k | "°C" => { |
2492 | 12 | child.set_text("℃"); |
2493 | 12 | degree_child = None; |
2494 | 12 | }, |
2495 | 23.8k | "°F" => { |
2496 | 0 | child.set_text("℉"); |
2497 | 0 | degree_child = None; |
2498 | 0 | }, |
2499 | 23.8k | text => { |
2500 | 23.8k | if let Some( |
2501 | 23 | && (text == "C" || |
2502 | // merge the degree child with the current child | |
2503 | 3 | degree_child.set_text(if text == "C" { |
2504 | 3 | child.remove_from_parent(); |
2505 | 23.8k | } |
2506 | // merge the degree child with the current child | |
2507 | 23.8k | degree_child = None; |
2508 | }, | |
2509 | } | |
2510 | 4.25k | } |
2511 | } | |
2512 | 5.90k | return mrow; |
2513 | 5.90k | } |
2514 | ||
2515 | ||
2516 | /// merge consecutive leaves containing any of the 'chars' into the first leaf -- probably used for omission with('_') | |
2517 | 5.90k | fn merge_chars<'a>(mrow: Element<'a>, pattern: &Regex) -> Element<'a> { |
2518 | 5.90k | let mut first_child = None; |
2519 | 5.90k | let mut new_text = "".to_string(); |
2520 | 28.1k | for child in |
2521 | 28.1k | let child = as_element(child); |
2522 | 28.1k | if is_leaf(child) { |
2523 | 23.9k | let text = as_text(child); |
2524 | 23.9k | if pattern.is_match(text) { |
2525 | 134 | if new_text.is_empty() { |
2526 | 118 | // potential start of a string |
2527 | 118 | first_child = Some(child); |
2528 | 118 | new_text = as_text(child).to_string(); |
2529 | 118 | } else { |
2530 | 16 | // merge chars |
2531 | 16 | new_text.push_str(text); |
2532 | 16 | child.remove_from_parent(); |
2533 | 16 | } |
2534 | 23.8k | } else if new_text.len() > 1 { |
2535 | 99 | // end of a run |
2536 | 99 | first_child.unwrap().set_text(&new_text); |
2537 | 99 | new_text.clear(); |
2538 | 23.7k | } else { |
2539 | 23.7k | new_text.clear(); // just one entry -- no need to set the text |
2540 | 23.7k | } |
2541 | 4.25k | } else if new_text.len() > 1{ |
2542 | 7 | // end of a run |
2543 | 7 | first_child.unwrap().set_text(&new_text); |
2544 | 7 | new_text.clear(); |
2545 | 4.24k | } else { |
2546 | 4.24k | new_text.clear(); // just one entry -- no need to set the text |
2547 | 4.24k | } |
2548 | } | |
2549 | 5.90k | if new_text.len() > 1{ |
2550 | 9 | // end of a run |
2551 | 9 | first_child.unwrap().set_text(&new_text); |
2552 | 5.89k | } |
2553 | 5.90k | return mrow; |
2554 | 5.90k | } |
2555 | ||
2556 | /// curl and divergence are handled as two character operators | |
2557 | /// if found, merge them into their own (new) mrow that has an intent on it | |
2558 | /// we can have '∇' or '𝛁', or those as vectors (inside an mover) | |
2559 | 10.1k | fn merge_cross_or_dot_product_elements(children: &mut Vec<ChildOfElement>) { |
2560 | 10.1k | if children.is_empty() { |
2561 | 3 | return; |
2562 | 10.1k | } |
2563 | 10.1k | let mut i = 0; |
2564 | 10.1k | let mut is_previous_nabla = false; |
2565 | 31.5k | while i < children.len() - 1 { |
2566 | 21.3k | let child = as_element(children[i]); |
2567 | 21.3k | if is_previous_nabla { |
2568 | 14 | if is_leaf(child) { |
2569 | 14 | let text = as_text(child); |
2570 | 14 | if text == "⋅" || |
2571 | 12 | let nabla_child = as_element(children[i-1]); |
2572 | 12 | let nabla_text = as_text( get_possible_embellished_node(nabla_child) ); |
2573 | 12 | let new_mrow = create_mathml_element(&child.document(), "mrow"); |
2574 | 12 | new_mrow.set_attribute_value(ACT_AS_OPERATOR, nabla_text); |
2575 | 12 | new_mrow.append_child(nabla_child); |
2576 | 12 | new_mrow.append_child(child); |
2577 | 12 | children[i-1] = ChildOfElement::Element(new_mrow); |
2578 | 12 | children.remove(i); |
2579 | 12 |
|
2580 | 0 | } |
2581 | 14 | is_previous_nabla = false; |
2582 | } else { | |
2583 | 21.3k | let potential_nabla = if name(child) == "mover" { |
2584 | 21.3k | if is_leaf(potential_nabla) { |
2585 | 19.0k | let text = as_text(potential_nabla); |
2586 | 19.0k | if text == "∇" || |
2587 | 22 | is_previous_nabla = true; |
2588 | 19.0k | } |
2589 | 2.27k | } |
2590 | } | |
2591 | 21.3k | i += 1; |
2592 | } | |
2593 | 10.1k | } |
2594 | ||
2595 | 5.90k | fn merge_dots(mrow: Element) -> Element { |
2596 | // merge consecutive <mo>s containing '.' into ellipsis | |
2597 | 5.90k | let children = mrow.children(); |
2598 | 5.90k | let mut i = 0; |
2599 | 5.90k | let mut n_dots = 0; // number of consecutive mo's containing dots |
2600 | 34.1k | while i < children.len() { |
2601 | 28.2k | let child = as_element(children[i]); |
2602 | 28.2k | if name(child) == "mo" { |
2603 | 10.4k | let text = as_text(child); |
2604 | 10.4k | if text == "." { |
2605 | 71 | n_dots += 1; |
2606 | 71 | if n_dots == 3 { |
2607 | 3 | let first_child = as_element(children[i-2]); |
2608 | 3 | first_child.set_text("…"); |
2609 | 3 | as_element(children[i-1]).remove_from_parent(); |
2610 | 3 | child.remove_from_parent(); |
2611 | 3 | n_dots = 0; |
2612 | 68 | } |
2613 | 10.3k | } else { |
2614 | 10.3k | n_dots = 0; |
2615 | 10.3k | } |
2616 | 17.7k | } else { |
2617 | 17.7k | n_dots = 0; |
2618 | 17.7k | } |
2619 | 28.2k | i += 1; |
2620 | } | |
2621 | 5.90k | return mrow; |
2622 | 5.90k | } |
2623 | ||
2624 | 5.90k | fn merge_primes(mrow: Element) -> Element { |
2625 | // merge consecutive <mo>s containing primes (in various forms) | |
2626 | 5.90k | let mut children = mrow.children(); |
2627 | 5.90k | let mut i = 0; |
2628 | 5.90k | let mut n_primes = 0; // number of consecutive mo's containing primes |
2629 | 34.1k | while i < children.len() { |
2630 | 28.1k | let child = as_element(children[i]); |
2631 | 28.1k | if name(child) == "mo" { |
2632 | 10.4k | let text = as_text(child); |
2633 | // FIX: should we be more restrictive and change (apostrophe) only in a superscript? | |
2634 | 10.4k | if IS_PRIME.is_match(text) { |
2635 | 21 | n_primes += 1; |
2636 | 10.4k | } else if n_primes > 0 { |
2637 | 3 | merge_prime_elements(&mut children, i - n_primes, i); |
2638 | 3 | n_primes = 0; |
2639 | 10.4k | } |
2640 | 17.7k | } else if n_primes > 0 { |
2641 | 2 | merge_prime_elements(&mut children, i - n_primes, i); |
2642 | 2 | n_primes = 0; |
2643 | 17.7k | } |
2644 | 28.1k | i += 1; |
2645 | } | |
2646 | 5.90k | if n_primes > 0 { |
2647 | 12 | merge_prime_elements(&mut children, i - n_primes, i); |
2648 | 5.89k | } |
2649 | 5.90k | return mrow; |
2650 | 5.90k | } |
2651 | ||
2652 | 17 | fn merge_prime_elements(children: &mut [ChildOfElement], start: usize, end: usize) { |
2653 | // not very efficient since this is probably causing an array shift each time (array is probably not big though) | |
2654 | 17 | let first_child = as_element(children[start]); |
2655 | 17 | let mut new_text = String::with_capacity(end+3-start); // one per element plus a little extra |
2656 | 17 | new_text.push_str(as_text(first_child)); |
2657 | 17 | for & |
2658 | 4 | let child = as_element(child_as_element); |
2659 | 4 | let text = as_text(child); // only in this function because it is an <mo> |
2660 | 4 | new_text.push_str(text); |
2661 | 4 | child.remove_from_parent(); |
2662 | 4 | } |
2663 | 17 | first_child.set_text(&merge_prime_text(&new_text)); |
2664 | 17 | } |
2665 | ||
2666 | 83 | fn merge_prime_text(text: &str) -> String { |
2667 | // merge together single primes into double primes, etc. | |
2668 | 83 | let mut n_primes = 0; |
2669 | 101 | for ch in |
2670 | 101 | match ch { |
2671 | 90 | '\'' | '′' => n_primes += 1, |
2672 | 9 | '″' => n_primes += 2, |
2673 | 0 | '‴' => n_primes += 3, |
2674 | 2 | '⁗' => n_primes += 4, |
2675 | _ => { | |
2676 | 0 | eprintln!("merge_prime_text: unexpected char '{ch}' found in prime text '{text}'"); |
2677 | 0 | return text.to_string(); |
2678 | } | |
2679 | } | |
2680 | } | |
2681 | // it would be very rare to have more than a quadruple prime, so the inefficiency in the won't likely happen | |
2682 | 83 | let mut result = String::with_capacity(n_primes); // likely 4x too big, but string is short-lived and small |
2683 | 83 | for _ in 0..n_primes/4 { |
2684 | 3 | result.push('⁗'); |
2685 | 3 | } |
2686 | 83 | match n_primes % 4 { |
2687 | 61 | 1 => result.push('′'), |
2688 | 20 | 2 => result.push('″'), |
2689 | 1 | 3 => result.push('‴'), |
2690 | 1 | _ => () // can't happen |
2691 | } | |
2692 | 83 | return result; |
2693 | 83 | } |
2694 | ||
2695 | // from https://www.w3.org/TR/MathML3/chapter7.html#chars.pseudo-scripts | |
2696 | 35.1k | fn is_pseudo_script_char(ch: char) -> bool { |
2697 | 35.1k |
|
2698 | '\"' | '\'' | '*' | '`' | 'ª' | '°' | '²' | '³' | '´' | '¹' | 'º' | | |
2699 | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' | | |
2700 | '\u{2032}' | '\u{2033}' | '\u{2034}' | '\u{2035}' | '\u{2036}' | '\u{2037}' | '\u{2057}' | |
2701 | ) | |
2702 | 35.1k | } |
2703 | 5.90k | fn handle_pseudo_scripts(mrow: Element) -> Element { |
2704 | ||
2705 | 5.90k | assert!(name(mrow) == "mrow" || |
2706 | 5.90k | let mut children = mrow.children(); |
2707 | // check to see if mrow of all pseudo scripts | |
2708 | 5.91k | if |
2709 | 5.91k | is_pseudo_script(as_element(child)) |
2710 | 5.91k | }) { |
2711 | 2 | let parent = get_parent(mrow); // must exist |
2712 | 2 | let is_first_child = mrow.preceding_siblings().is_empty(); |
2713 | 2 | if is_first_child { |
2714 | 0 | return mrow; // FIX: what should happen |
2715 | 2 | } |
2716 | 2 | if crate::xpath_functions::IsNode::is_scripted(parent) { |
2717 | 2 | return mrow; // already in a script position |
2718 | 0 | } |
2719 | 0 | if name(parent) == "mrow" { |
2720 | 0 | mrow.set_attribute_value("data-pseudo-script", "true"); |
2721 | 0 | return handle_pseudo_scripts(parent); |
2722 | } else { | |
2723 | 0 | return mrow; // FIX: what should happen? |
2724 | } | |
2725 | 5.90k | } |
2726 | ||
2727 | 5.90k | let mut i = 1; |
2728 | 5.90k | let mut found = false; |
2729 | 28.1k | while i < children.len() { |
2730 | 22.2k | let child = as_element(children[i]); |
2731 | 22.2k | if is_pseudo_script(child) || |
2732 | 22.2k | child.attribute("data-pseudo-script").is_some() { |
2733 | 35 | let msup = create_mathml_element(&child.document(), "msup"); |
2734 | 35 | msup.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
2735 | 35 | msup.append_child(children[i-1]); |
2736 | 35 | msup.append_child(child); |
2737 | 35 | children[i-1] = ChildOfElement::Element(msup); |
2738 | 35 | children.remove(i); |
2739 | 35 | found = true; |
2740 | 22.2k | } else { |
2741 | 22.2k | i += 1; |
2742 | 22.2k | } |
2743 | } | |
2744 | 5.90k | if found { |
2745 | 25 | mrow.replace_children(children) |
2746 | 5.88k | } |
2747 | 5.90k | return mrow; |
2748 | ||
2749 | 28.1k | fn is_pseudo_script(child: Element) -> bool { |
2750 | 28.1k | if name(child) == "mo" { |
2751 | 10.4k | let text = as_text(child); |
2752 | 10.4k | if let Some( |
2753 | 10.3k | && is_pseudo_script_char(ch) { |
2754 | // don't script a pseudo-script | |
2755 | 55 | let preceding_siblings = child.preceding_siblings(); |
2756 | 55 | if !preceding_siblings.is_empty() { |
2757 | 42 | let last_child = as_element(preceding_siblings[preceding_siblings.len()-1]); |
2758 | 42 | if name(last_child) == "mo" && |
2759 | 10 | let Some(ch) = single_char(as_text(last_child)) |
2760 | 10 | && is_pseudo_script_char(ch) { |
2761 | 6 | return false; |
2762 | 36 | } |
2763 | 13 | } |
2764 | 49 | if text == "*" { |
2765 | // could be infix "*" -- this is a weak check to see if what follows is potentially an operand | |
2766 | 5 | let following_siblings = child.following_siblings(); |
2767 | 5 | if following_siblings.is_empty() { |
2768 | 1 | return true; |
2769 | 4 | } |
2770 | 4 | let first_child = as_element(following_siblings[0]); |
2771 | 4 | return name(first_child) != "mo" || ["(", "[", "{"].contains(&text); |
2772 | } else { | |
2773 | 44 | return true; |
2774 | } | |
2775 | 10.3k | } |
2776 | 17.7k | } |
2777 | 28.1k | return false; |
2778 | ||
2779 | /// An efficient method to get the char from a string if it is just one char or fail | |
2780 | 10.4k | fn single_char(text: &str) -> Option<char> { |
2781 | 10.4k | let mut chars = text.chars(); |
2782 | 10.4k | let ch = chars.next(); |
2783 | 10.4k | if ch.is_none() || chars.next().is_some() { |
2784 | 39 | return None; // not one character |
2785 | } else { | |
2786 | 10.3k | return ch; |
2787 | } | |
2788 | 10.4k | } |
2789 | 28.1k | } |
2790 | ||
2791 | 5.90k | } |
2792 | ||
2793 | 10.1k | fn handle_convert_to_mmultiscripts(children: &mut Vec<ChildOfElement>) { |
2794 | 10.1k | if children.len() == 1 { |
2795 | 4.45k | return; // can't convert to mmultiscripts if there is nothing to attach an empty base to |
2796 | 5.72k | } |
2797 | 5.72k | let mut i = 0; |
2798 | // convert_to_mmultiscripts changes 'children', so can't cache length | |
2799 | 32.7k | while i < children.len() { |
2800 | 26.9k | let child = as_element(children[i]); |
2801 | 26.9k | let child_name = name(child); |
2802 | 26.9k | if (child_name == "msub" || |
2803 | 115 | i = convert_to_mmultiscripts(children, i); |
2804 | 26.8k | } else { |
2805 | 26.8k | i += 1; |
2806 | 26.8k | } |
2807 | } | |
2808 | 10.1k | } |
2809 | ||
2810 | ||
2811 | /// Converts the script element with an empty base to mmultiscripts by sucking the base from the following or preceding element. | |
2812 | /// The following element is preferred so that these become prescripts (common usage is from TeX), but if the preceding element | |
2813 | /// has a closer mi/mtext, it is used. | |
2814 | /// mhchem has some ugly output (at least in MathJax) and that's where using the following element makes sense (usually) | |
2815 | /// because an empty base (mpadded width=0) is used for the scripts. A hacky attribute indicates this case. | |
2816 | 115 | fn convert_to_mmultiscripts(mrow_children: &mut Vec<ChildOfElement>, i: usize) -> usize { |
2817 | // this is a bit messy/confusing because we might scan forwards or backwards and this affects whether | |
2818 | // we are scanning for prescripts or postscripts | |
2819 | // the generic name "primary_scripts" means prescripts if going forward or postscripts if going backwards | |
2820 | // if we are going forward and hit a sub/superscript with a base, then those scripts become postscripts ("other_scripts") | |
2821 | // if we are going backwards, we never add prescripts | |
2822 | ||
2823 | // let parent = get_parent(as_element(mrow_children[i])); | |
2824 | // debug!("convert_to_mmultiscripts (i={}) -- PARENT:\n{}", i, mml_to_string(parent)); | |
2825 | ||
2826 | 115 | let i_base = choose_base_of_mmultiscripts(mrow_children, i); |
2827 | 115 | let mut base = as_element(mrow_children[i_base]); |
2828 | // debug!("convert_to_mmultiscripts -- base\n{}", mml_to_string(base)); | |
2829 | 115 | let base_name = name(base); |
2830 | 115 | let mut prescripts = vec![]; |
2831 | 115 | let mut postscripts = vec![]; |
2832 | 115 | let mut i_postscript = i_base + 1; |
2833 | ||
2834 | 115 | if (base_name == "msub" || |
2835 | 5 | !CanonicalizeContext::is_empty_element(as_element(base.children()[0])) { |
2836 | 5 | // if the base is a script element, then we want the base of that to be the base of the mmultiscripts |
2837 | 5 | let mut base_children = base.children(); |
2838 | 5 | let script_base = as_element(base.children()[0]); |
2839 | 5 | base_children[0] = ChildOfElement::Element(CanonicalizeContext::create_empty_element(&base.document())); |
2840 | 5 | base.replace_children(base_children); |
2841 | 5 | add_to_scripts(base, &mut postscripts); |
2842 | 5 | base = script_base; |
2843 | 110 | } |
2844 | ||
2845 | 115 | let mut has_chemistry_prescript = false; // chemical elements don't have both prescripts (nuclear chem) and postscripts |
2846 | 115 | if i_base > i { |
2847 | // we have prescripts -- gather them up | |
2848 | 61 | let mut i_prescript = i; |
2849 | 122 | while i_prescript < i_base { |
2850 | 61 | let script = as_element(mrow_children[i_prescript]); |
2851 | // kind of ugly -- this duplicates the first part of add_to_scripts | |
2852 | 61 | let script_name = name(script); |
2853 | 61 | if script_name == "msub" || |
2854 | 61 | let base = as_element(script.children()[0]); |
2855 | 61 | has_chemistry_prescript |= base.attribute(MHCHEM_MMULTISCRIPTS_HACK).is_some(); |
2856 | 61 |
|
2857 | 61 | if !add_to_scripts(script, &mut prescripts) { |
2858 | 0 | break; |
2859 | 61 | } |
2860 | 61 | i_prescript += 1; |
2861 | } | |
2862 | 54 | } |
2863 | ||
2864 | 115 | if !has_chemistry_prescript { |
2865 | // gather up the postscripts (if any) | |
2866 | 137 | while i_postscript < mrow_children.len() { |
2867 | 104 | let script = as_element(mrow_children[i_postscript]); |
2868 | // debug!("script: {}", mml_to_string(script)); | |
2869 | // if name(script) == "msub" && i_postscript+1 < mrow_children.len() { | |
2870 | // let superscript = as_element(mrow_children[i_postscript+1]); | |
2871 | // if name(superscript) == "msup" && CanonicalizeContext::is_empty_element(as_element(superscript.children()[0])) { | |
2872 | // set_mathml_name(script, "msubsup"); | |
2873 | // script.append_child(superscript.children()[1]); | |
2874 | // i_postscript += 1; | |
2875 | // } | |
2876 | // } | |
2877 | // debug!("adding postscript\n{}", mml_to_string(script)); | |
2878 | 104 | if !add_to_scripts(script, &mut postscripts) { |
2879 | 32 | break; |
2880 | 72 | } |
2881 | 72 | i_postscript += 1; |
2882 | } | |
2883 | 50 | } |
2884 | ||
2885 | 115 | let i_multiscript = if i_base < i { |
2886 | 115 | let script = create_mathml_element(&base.document(), "mmultiscripts"); |
2887 | 115 | let mut num_children = 1 + postscripts.len(); |
2888 | 115 | if !prescripts.is_empty() { |
2889 | 61 | num_children += 1 + prescripts.len(); |
2890 | 61 |
|
2891 | 115 | let mut new_children = Vec::with_capacity(num_children); |
2892 | 115 | new_children.push(ChildOfElement::Element(base)); |
2893 | 115 | new_children.append(&mut postscripts); |
2894 | 115 | if !prescripts.is_empty() { |
2895 | 61 | new_children.push( ChildOfElement::Element( create_mathml_element(&script.document(), "mprescripts") ) ); |
2896 | 61 | new_children.append(&mut prescripts); |
2897 | 61 |
|
2898 | ||
2899 | 115 | script.replace_children(new_children); |
2900 | 115 | let lifted_base = as_element(mrow_children[i_multiscript]); |
2901 | 115 | add_attrs(script, &lifted_base.attributes()); |
2902 | 115 | script.remove_attribute("data-split"); // doesn't make sense on mmultiscripts |
2903 | 115 | script.remove_attribute("mathvariant"); // doesn't make sense on mmultiscripts |
2904 | 115 | mrow_children[i_multiscript] = ChildOfElement::Element(script); |
2905 | 115 | mrow_children.drain(i_multiscript+1..i_postscript); // remove children after the first |
2906 | ||
2907 | 115 | let likely_chemistry = likely_adorned_chem_formula(script); |
2908 | 115 | if likely_chemistry >= 0 { |
2909 | 106 | script.set_attribute_value(MAYBE_CHEMISTRY, likely_chemistry.to_string().as_str()); |
2910 | 106 |
|
2911 | ||
2912 | // debug!("convert_to_mmultiscripts -- converted script:\n{}", mml_to_string(script)); | |
2913 | // debug!("convert_to_mmultiscripts (at end) -- #children={}", mrow_children.len()); | |
2914 | 115 | return i_multiscript + 1; // child to start on next |
2915 | 115 | } |
2916 | ||
2917 | 170 | fn add_to_scripts<'a>(el: Element<'a>, scripts: &mut Vec<ChildOfElement<'a>>) -> bool { |
2918 | 170 | let script_name = name(el); |
2919 | 170 | if !(script_name == "msub" || |
2920 | 32 | return false; |
2921 | 138 | } |
2922 | 138 | let base = as_element(el.children()[0]); |
2923 | 138 | if !CanonicalizeContext::is_empty_element(base) { // prescript that really should be a postscript |
2924 | // debug!("add_to_scripts: not empty base:\n{}", mml_to_string(base)); | |
2925 | 0 | return false; |
2926 | 138 | } |
2927 | 138 | if script_name == "msub" { |
2928 | 59 | add_pair(scripts, Some(el.children()[1]), None); |
2929 | 79 | } else if script_name == "msup" { |
2930 | 31 | add_pair(scripts, None, Some(el.children()[1])); |
2931 | 48 | } else { // msubsup |
2932 | 48 | add_pair(scripts, Some(el.children()[1]), Some(el.children()[2])); |
2933 | 48 | }; |
2934 | 138 | return true; |
2935 | 170 | } |
2936 | ||
2937 | 138 | fn add_pair<'v, 'a:'v>(script_vec: &'v mut Vec<ChildOfElement<'a>>, subscript: Option<ChildOfElement<'a>>, superscript: Option<ChildOfElement<'a>>) { |
2938 | 138 | let child_of_element = if let Some( |
2939 | 138 | let doc = as_element(child_of_element).document(); |
2940 | 138 | let subscript = if let Some( |
2941 | 107 | if CanonicalizeContext::is_empty_element(as_element(subscript)) { |
2942 | 0 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2943 | } else { | |
2944 | 107 | subscript |
2945 | } | |
2946 | } else { | |
2947 | 31 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2948 | }; | |
2949 | 138 | let superscript = if let Some( |
2950 | 79 | if CanonicalizeContext::is_empty_element(as_element(superscript)) { |
2951 | 0 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2952 | } else { | |
2953 | 79 | superscript |
2954 | } | |
2955 | } else { | |
2956 | 59 | ChildOfElement::Element(create_mathml_element(&doc, "none")) |
2957 | }; | |
2958 | 138 | script_vec.push(subscript); |
2959 | 138 | script_vec.push(superscript); |
2960 | 138 | } |
2961 | ||
2962 | /// Find the closest likely base to the 'i'th child, preferring the next one over the preceding one, but want the closest. | |
2963 | /// | |
2964 | /// Note: because the base might be (...), 'mrow_children might be changed so that they are grouped into an mrow. | |
2965 | 115 | fn choose_base_of_mmultiscripts(mrow_children: &mut Vec<ChildOfElement>, i: usize) -> usize { |
2966 | // We already know there are no empty scripts to the left (because we find first empty base from left to right). | |
2967 | // However, there may be some empty bases before we get to real base on the right. | |
2968 | 115 | let script_element_base = as_element(as_element(mrow_children[i]).children()[0]); |
2969 | 115 | let mut likely_postscript = script_element_base.attribute(MHCHEM_MMULTISCRIPTS_HACK).is_some() && |
2970 | 115 | if likely_postscript { |
2971 | 86 | let base_of_postscript = as_element(mrow_children[i-1]); |
2972 | 86 | if name(base_of_postscript) != "mi" || |
2973 | 36 | likely_postscript = false; // base for potential postscript doesn't look reasonable -- consider it a prescript |
2974 | 50 | } |
2975 | 29 | } |
2976 | 115 | if i+1 < mrow_children.len() && |
2977 | 61 | return i+1; |
2978 | 54 | } |
2979 | 54 | if i > 0 { |
2980 | 54 | if let Some( |
2981 | 2 | assert!(i_start < i-1); // should be at least two children (open and close) |
2982 | // create a new mrow, add the grouped children to it, then drain all but the first of them from the original mrow vec. | |
2983 | // stick the mrow into the first of them -- this is the base | |
2984 | 2 | let new_mrow = create_mathml_element(&as_element(mrow_children[0]).document(), "mrow"); |
2985 | 2 | new_mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
2986 | 8 | for &child in |
2987 | 8 | new_mrow.append_child(child); |
2988 | 8 | } |
2989 | 2 | mrow_children.drain(i_start+1..i); |
2990 | 2 | mrow_children[i_start] = ChildOfElement::Element(new_mrow); |
2991 | 2 | return i_start; |
2992 | 52 | } |
2993 | 52 | if is_child_simple_base(mrow_children[i-1]) { |
2994 | 52 | return i-1; |
2995 | 0 | } |
2996 | 0 | } |
2997 | ||
2998 | // base very likely after multiple scripts to the right | |
2999 | 0 | for (i_base, &child) in mrow_children.iter().enumerate().skip(i+1) { |
3000 | 0 | if is_child_simple_base(child) { |
3001 | 0 | return i_base; |
3002 | } else { | |
3003 | 0 | let child = as_element(child); |
3004 | 0 | let child_name = name(child); |
3005 | 0 | if !(child_name == "msub" || child_name == "msup" || child_name == "msubsup") { |
3006 | 0 | break; |
3007 | 0 | } |
3008 | } | |
3009 | } | |
3010 | // didn't find any good candidates for a base -- pick something valid | |
3011 | 0 | assert!(mrow_children.len() > i); |
3012 | 0 | return i; |
3013 | ||
3014 | ||
3015 | 113 | fn is_child_simple_base(child: ChildOfElement) -> bool { |
3016 | 113 | let mut child = as_element(child); |
3017 | 113 | let child_name = name(child); |
3018 | 113 | if child_name == "msub" || |
3019 | 5 | child = as_element(child.children()[0]); |
3020 | 108 | } |
3021 | ||
3022 | 113 | return is_leaf(child) && !CanonicalizeContext::is_empty_element(child); // a little overly general (but hopefully doesn't matter) |
3023 | 113 | } |
3024 | ||
3025 | /// Return the index of the matched open paren/bracket if the last element is a closed paren/bracket | |
3026 | 54 | fn is_grouped_base(mrow_children: &[ChildOfElement]) -> Option<usize> { |
3027 | // FIX: this really belongs in canonicalization pass, not the clean pass | |
3028 | 54 | let i_last = mrow_children.len()-1; |
3029 | 54 | let last_child = get_possible_embellished_node(as_element(mrow_children[i_last])); |
3030 | 54 | if name(last_child) == "mo" && |
3031 | 3 | CanonicalizeContext::find_operator(None, last_child, None, None, None).is_right_fence() { |
3032 | 6 | for i_child in ( |
3033 | 6 | let child = get_possible_embellished_node(as_element(mrow_children[i_child])); |
3034 | 6 | if name(child) == "mo" && |
3035 | 2 | CanonicalizeContext::find_operator(None, child, None, None, None).is_left_fence() { |
3036 | // FIX: should make sure left and right match. Should also count for nested parens | |
3037 | 2 | return Some(i_child); |
3038 | 4 | } |
3039 | } | |
3040 | 52 | } |
3041 | 52 | return None; |
3042 | 54 | } |
3043 | 115 | } |
3044 | 52.3k | } |
3045 | ||
3046 | 64.1k | fn canonicalize_mrows<'a>(&self, mathml: Element<'a>) -> Result<Element<'a>> { |
3047 | 64.1k | let tag_name = name(mathml); |
3048 | 64.1k | set_mathml_name(mathml, tag_name); // add namespace |
3049 | 64.1k | match tag_name { |
3050 | 64.1k | "mi" | |
3051 | 15.8k | self.canonicalize_plane1(mathml); |
3052 | 15.8k | return Ok( mathml ); }, |
3053 | 48.3k | "mo" => { |
3054 | 14.6k | self.canonicalize_plane1(mathml); |
3055 | 14.6k | self.canonicalize_mo_text(mathml); |
3056 | 14.6k | return Ok( mathml ); |
3057 | }, | |
3058 | 33.7k | "mn" => { |
3059 | 11.6k | self.canonicalize_plane1(mathml); |
3060 | 11.6k | return Ok( mathml ); |
3061 | }, | |
3062 | 22.0k | "mrow" => { |
3063 | 7.48k | return self.canonicalize_mrows_in_mrow(mathml); |
3064 | }, | |
3065 | _ => { | |
3066 | // recursively try to make mrows in other structures (eg, num/denom in fraction) | |
3067 | 14.6k | let mut new_children = Vec::with_capacity(mathml.children().len()); |
3068 | 21.5k | for child in |
3069 | 21.5k | match child { |
3070 | 21.5k | ChildOfElement::Element(e) => { |
3071 | 21.5k | new_children.push( ChildOfElement::Element(self.canonicalize_mrows(e) |
3072 | }, | |
3073 | 0 | ChildOfElement::Text(t) => { |
3074 | 0 | if mathml.children().len() != 1 { |
3075 | 0 | bail!("Text '{}' found with more than one child in element '{}'", t.text(), tag_name); |
3076 | 0 | } |
3077 | 0 | return Ok( mathml ); |
3078 | }, | |
3079 | 0 | _ => bail!("Should have been an element or text in '{}'", tag_name), |
3080 | } | |
3081 | } | |
3082 | 14.6k | mathml.replace_children(new_children); |
3083 | 14.6k | return Ok( mathml ); |
3084 | }, | |
3085 | } | |
3086 | 64.1k | } |
3087 | ||
3088 | 1.91k | fn potentially_lift_script<'a>(&self, mrow: Element<'a>) -> Element<'a> { |
3089 | 1.91k | if name(mrow) != "mrow" { |
3090 | 0 | return mrow; |
3091 | 1.91k | } |
3092 | 1.91k | let mut mrow_children = mrow.children(); |
3093 | 1.91k | let first_child = as_element(mrow_children[0]); |
3094 | 1.91k | let last_child = as_element(mrow_children[mrow_children.len()-1]); |
3095 | 1.91k | let last_child_name = name(last_child); |
3096 | ||
3097 | 1.91k | if name(first_child) == "mo" && |
3098 | 1.91k | (last_child_name == "msub" || last_child_name == "msup" || |
3099 | 19 | let base = as_element(last_child.children()[0]); |
3100 | 19 | if !(name(base) == "mo" && is_fence(base)) { |
3101 | 0 | return mrow; // not a case we are interested in |
3102 | 19 | } |
3103 | // else drop through | |
3104 | } else { | |
3105 | 1.89k | return mrow; // not a case we are interested in |
3106 | } | |
3107 | ||
3108 | 19 | let script = last_child; // better name now that we know what it is |
3109 | 19 | let mut script_children = script.children(); |
3110 | 19 | let close_fence = script_children[0]; |
3111 | 19 | let mrow_children_len = mrow_children.len(); // rust complains about a borrow after move if we don't store this first |
3112 | 19 | mrow_children[mrow_children_len-1] = close_fence; // make the mrow hold the fences |
3113 | 19 | mrow.replace_children(mrow_children); |
3114 | // make the mrow the child of the script | |
3115 | 19 | script_children[0] = ChildOfElement::Element(mrow); |
3116 | 19 | script.replace_children(script_children); |
3117 | 19 | return script; |
3118 | 1.91k | } |
3119 | ||
3120 | /// Map names to start of Unicode alphanumeric blocks (Roman, digits, Greek) | |
3121 | /// Don't do this for function names -- for function names, map them back to ASCII | |
3122 | 42.1k | fn canonicalize_plane1<'a>(&self, mi: Element<'a>) -> Element<'a> { |
3123 | // if the character shouldn't be mapped, use 0 -- don't use 'A' as ASCII and Greek aren't contiguous | |
3124 | static MATH_VARIANTS: phf::Map<&str, [u32; 3]> = phf_map! { | |
3125 | // "normal" -- nothing to do | |
3126 | "italic" => [0, 0, 0x1D6E2], | |
3127 | "bold" => [0x1D400, 0x1D7CE, 0x1D6A8], | |
3128 | "bold-italic" => [0x1D468, 0x1D7CE, 0x1D71C], | |
3129 | "double-struck" => [0x1D538, 0x1D7D8, 0], | |
3130 | "bold-fraktur" => [0x1D56C, 0, 0x1D6A8], | |
3131 | "script" => [0x1D49C, 0, 0], | |
3132 | "bold-script" => [0x1D4D0, 0, 0x1D6A8], | |
3133 | "fraktur" => [0x1D504, 0, 0], | |
3134 | "sans-serif" => [0x1D5A0, 0x1D7E2, 0], | |
3135 | "bold-sans-serif" => [0x1D5D4, 0x1D7EC, 0x1D756], | |
3136 | "sans-serif-italic" => [0x1D608, 0x1D7E2, 0], | |
3137 | "sans-serif-bold-italic" => [0x1D63C, 0x1D7EC, 0x1D790], | |
3138 | "monospace" => [0x1D670, 0x1D7F6, 0], | |
3139 | }; | |
3140 | ||
3141 | 42.1k | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
3142 | // names that are always function names (e.g, "sin" and "log") | |
3143 | 42.1k | let defs = defs.borrow(); |
3144 | 42.1k | let |
3145 | 42.1k | Some(hs) => hs, |
3146 | 3 | None => return mi, // happens in some canonicalize tests but not in real use |
3147 | }; | |
3148 | ||
3149 | ||
3150 | 42.1k | let mi_text = as_text(mi); |
3151 | 42.1k | let variant = mi.attribute_value("mathvariant"); |
3152 | ||
3153 | 42.1k | if names.contains(mi_text) { |
3154 | 791 | return mi; // avoid mapping mathvariant for function names |
3155 | 41.3k | } |
3156 | // function name might be (wrongly) set to italic math alphanumeric chars, including bold italic | |
3157 | 41.3k | if let Some( |
3158 | 12.6k | && names.contains(&ascii_text) { |
3159 | 3 | mi.set_text(&ascii_text); |
3160 | 3 | return mi |
3161 | 41.3k | } |
3162 | ||
3163 | 41.3k | if variant.is_none() { |
3164 | 40.3k | return mi; |
3165 | 952 | } |
3166 | ||
3167 | 952 | let new_text = match MATH_VARIANTS.get(variant.unwrap()) { |
3168 | 755 | None => mi_text.to_string(), |
3169 | 197 | Some(start) => shift_text(mi_text, start), |
3170 | }; | |
3171 | // mi.remove_attribute("mathvariant"); // leave attr -- for Nemeth, there are italic digits etc that don't have Unicode points | |
3172 | 952 | mi.set_text(&new_text); |
3173 | 952 | return mi; |
3174 | 42.1k | }); |
3175 | ||
3176 | 197 | fn shift_text(old_text: &str, char_mapping: &[u32; 3]) -> String { |
3177 | // if there is no block for something, use 'a', 'A', 0 as that will be a no-op | |
3178 | struct Offsets { | |
3179 | ch: u32, | |
3180 | table: usize, | |
3181 | } | |
3182 | static SHIFT_AMOUNTS: phf::Map<char, Offsets> = phf_map! { | |
3183 | 'A' => Offsets{ ch: 0, table: 0}, | |
3184 | 'B' => Offsets{ ch: 1, table: 0}, | |
3185 | 'C' => Offsets{ ch: 2, table: 0}, | |
3186 | 'D' => Offsets{ ch: 3, table: 0}, | |
3187 | 'E' => Offsets{ ch: 4, table: 0}, | |
3188 | 'F' => Offsets{ ch: 5, table: 0}, | |
3189 | 'G' => Offsets{ ch: 6, table: 0}, | |
3190 | 'H' => Offsets{ ch: 7, table: 0}, | |
3191 | 'I' => Offsets{ ch: 8, table: 0}, | |
3192 | 'J' => Offsets{ ch: 9, table: 0}, | |
3193 | 'K' => Offsets{ ch: 10, table: 0}, | |
3194 | 'L' => Offsets{ ch: 11, table: 0}, | |
3195 | 'M' => Offsets{ ch: 12, table: 0}, | |
3196 | 'N' => Offsets{ ch: 13, table: 0}, | |
3197 | 'O' => Offsets{ ch: 14, table: 0}, | |
3198 | 'P' => Offsets{ ch: 15, table: 0}, | |
3199 | 'Q' => Offsets{ ch: 16, table: 0}, | |
3200 | 'R' => Offsets{ ch: 17, table: 0}, | |
3201 | 'S' => Offsets{ ch: 18, table: 0}, | |
3202 | 'T' => Offsets{ ch: 19, table: 0}, | |
3203 | 'U' => Offsets{ ch: 20, table: 0}, | |
3204 | 'V' => Offsets{ ch: 21, table: 0}, | |
3205 | 'W' => Offsets{ ch: 22, table: 0}, | |
3206 | 'X' => Offsets{ ch: 23, table: 0}, | |
3207 | 'Y' => Offsets{ ch: 24, table: 0}, | |
3208 | 'Z' => Offsets{ ch: 25, table: 0}, | |
3209 | 'a' => Offsets{ ch: 26, table: 0}, | |
3210 | 'b' => Offsets{ ch: 27, table: 0}, | |
3211 | 'c' => Offsets{ ch: 28, table: 0}, | |
3212 | 'd' => Offsets{ ch: 29, table: 0}, | |
3213 | 'e' => Offsets{ ch: 30, table: 0}, | |
3214 | 'f' => Offsets{ ch: 31, table: 0}, | |
3215 | 'g' => Offsets{ ch: 32, table: 0}, | |
3216 | 'h' => Offsets{ ch: 33, table: 0}, | |
3217 | 'i' => Offsets{ ch: 34, table: 0}, | |
3218 | 'j' => Offsets{ ch: 35, table: 0}, | |
3219 | 'k' => Offsets{ ch: 36, table: 0}, | |
3220 | 'l' => Offsets{ ch: 37, table: 0}, | |
3221 | 'm' => Offsets{ ch: 38, table: 0}, | |
3222 | 'n' => Offsets{ ch: 39, table: 0}, | |
3223 | 'o' => Offsets{ ch: 40, table: 0}, | |
3224 | 'p' => Offsets{ ch: 41, table: 0}, | |
3225 | 'q' => Offsets{ ch: 42, table: 0}, | |
3226 | 'r' => Offsets{ ch: 43, table: 0}, | |
3227 | 's' => Offsets{ ch: 44, table: 0}, | |
3228 | 't' => Offsets{ ch: 45, table: 0}, | |
3229 | 'u' => Offsets{ ch: 46, table: 0}, | |
3230 | 'v' => Offsets{ ch: 47, table: 0}, | |
3231 | 'w' => Offsets{ ch: 48, table: 0}, | |
3232 | 'x' => Offsets{ ch: 49, table: 0}, | |
3233 | 'y' => Offsets{ ch: 50, table: 0}, | |
3234 | 'z' => Offsets{ ch: 51, table: 0}, | |
3235 | '0' => Offsets{ ch: 0, table: 1}, | |
3236 | '1' => Offsets{ ch: 1, table: 1}, | |
3237 | '2' => Offsets{ ch: 2, table: 1}, | |
3238 | '3' => Offsets{ ch: 3, table: 1}, | |
3239 | '4' => Offsets{ ch: 4, table: 1}, | |
3240 | '5' => Offsets{ ch: 5, table: 1}, | |
3241 | '6' => Offsets{ ch: 6, table: 1}, | |
3242 | '7' => Offsets{ ch: 7, table: 1}, | |
3243 | '8' => Offsets{ ch: 8, table: 1}, | |
3244 | '9' => Offsets{ ch: 9, table: 1}, | |
3245 | 'Α' => Offsets{ ch: 0, table: 2}, | |
3246 | 'Β' => Offsets{ ch: 1, table: 2}, | |
3247 | 'Γ' => Offsets{ ch: 2, table: 2}, | |
3248 | 'Δ' => Offsets{ ch: 3, table: 2}, | |
3249 | 'Ε' => Offsets{ ch: 4, table: 2}, | |
3250 | 'Ζ' => Offsets{ ch: 5, table: 2}, | |
3251 | 'Η' => Offsets{ ch: 6, table: 2}, | |
3252 | 'Θ' => Offsets{ ch: 7, table: 2}, | |
3253 | 'Ι' => Offsets{ ch: 8, table: 2}, | |
3254 | 'Κ' => Offsets{ ch: 9, table: 2}, | |
3255 | 'Λ' => Offsets{ ch: 10, table: 2}, | |
3256 | 'Μ' => Offsets{ ch: 11, table: 2}, | |
3257 | 'Ν' => Offsets{ ch: 12, table: 2}, | |
3258 | 'Ξ' => Offsets{ ch: 13, table: 2}, | |
3259 | 'Ο' => Offsets{ ch: 14, table: 2}, | |
3260 | 'Π' => Offsets{ ch: 15, table: 2}, | |
3261 | 'Ρ' => Offsets{ ch: 16, table: 2}, | |
3262 | 'ϴ' => Offsets{ ch: 17, table: 2}, | |
3263 | 'Σ' => Offsets{ ch: 18, table: 2}, | |
3264 | 'Τ' => Offsets{ ch: 19, table: 2}, | |
3265 | 'Υ' => Offsets{ ch: 20, table: 2}, | |
3266 | 'Φ' => Offsets{ ch: 21, table: 2}, | |
3267 | 'Χ' => Offsets{ ch: 22, table: 2}, | |
3268 | 'Ψ' => Offsets{ ch: 23, table: 2}, | |
3269 | 'Ω' => Offsets{ ch: 24, table: 2}, | |
3270 | '∇' => Offsets{ ch: 25, table: 2}, | |
3271 | 'α' => Offsets{ ch: 26, table: 2}, | |
3272 | 'β' => Offsets{ ch: 27, table: 2}, | |
3273 | 'γ' => Offsets{ ch: 28, table: 2}, | |
3274 | 'δ' => Offsets{ ch: 29, table: 2}, | |
3275 | 'ε' => Offsets{ ch: 30, table: 2}, | |
3276 | 'ζ' => Offsets{ ch: 31, table: 2}, | |
3277 | 'η' => Offsets{ ch: 32, table: 2}, | |
3278 | 'θ' => Offsets{ ch: 33, table: 2}, | |
3279 | 'ι' => Offsets{ ch: 34, table: 2}, | |
3280 | 'κ' => Offsets{ ch: 35, table: 2}, | |
3281 | 'λ' => Offsets{ ch: 36, table: 2}, | |
3282 | 'μ' => Offsets{ ch: 37, table: 2}, | |
3283 | 'ν' => Offsets{ ch: 38, table: 2}, | |
3284 | 'ξ' => Offsets{ ch: 39, table: 2}, | |
3285 | 'ο' => Offsets{ ch: 40, table: 2}, | |
3286 | 'π' => Offsets{ ch: 41, table: 2}, | |
3287 | 'ρ' => Offsets{ ch: 42, table: 2}, | |
3288 | 'ς' => Offsets{ ch: 43, table: 2}, | |
3289 | 'σ' => Offsets{ ch: 44, table: 2}, | |
3290 | 'τ' => Offsets{ ch: 45, table: 2}, | |
3291 | 'υ' => Offsets{ ch: 46, table: 2}, | |
3292 | 'φ' => Offsets{ ch: 47, table: 2}, | |
3293 | 'χ' => Offsets{ ch: 48, table: 2}, | |
3294 | 'ψ' => Offsets{ ch: 49, table: 2}, | |
3295 | 'ω' => Offsets{ ch: 50, table: 2}, | |
3296 | '∂' => Offsets{ ch: 51, table: 2}, | |
3297 | 'ϵ' => Offsets{ ch: 52, table: 2}, | |
3298 | 'ϑ' => Offsets{ ch: 53, table: 2}, | |
3299 | 'ϰ' => Offsets{ ch: 54, table: 2}, | |
3300 | 'ϕ' => Offsets{ ch: 55, table: 2}, | |
3301 | 'ϱ' => Offsets{ ch: 56, table: 2}, | |
3302 | 'ϖ' => Offsets{ ch: 57, table: 2}, | |
3303 | }; | |
3304 | 197 | let mut new_text = String::new(); |
3305 | 321 | for ch in |
3306 | 321 | new_text.push( |
3307 | 321 | match SHIFT_AMOUNTS.get(&ch) { |
3308 | None => { | |
3309 | // there are two digamma chars only in the bold mapping. Handled here | |
3310 | 71 | if char_mapping[2] == 0x1D6A8 { |
3311 | 43 | match ch { |
3312 | 1 | 'Ϝ' => '𝟊', |
3313 | 1 | 'ϝ' => '𝟋', |
3314 | 41 | _ => ch, |
3315 | } | |
3316 | } else { | |
3317 | 28 | ch |
3318 | } | |
3319 | }, | |
3320 | 250 | Some(offsets) => { |
3321 | 250 | let start_of_mapping = char_mapping[offsets.table]; |
3322 | 250 | if start_of_mapping == 0 { |
3323 | } | |
3324 | } | |
3325 | ) | |
3326 | } | |
3327 | 197 | return new_text; |
3328 | ||
3329 | 213 | fn shift_char(ch: u32) -> char { |
3330 | // there are "holes" in the math alphanumerics due to legacy issues | |
3331 | // this table maps the holes to their legacy location | |
3332 | static EXCEPTIONS: phf::Map<u32, u32> = phf_map! { | |
3333 | 0x1D455u32 => 0x210Eu32, | |
3334 | 0x1D49Du32 => 0x212Cu32, | |
3335 | 0x1D4A0u32 => 0x2130u32, | |
3336 | 0x1D4A1u32 => 0x2131u32, | |
3337 | 0x1D4A3u32 => 0x210Bu32, | |
3338 | 0x1D4A4u32 => 0x2110u32, | |
3339 | 0x1D4A7u32 => 0x2112u32, | |
3340 | 0x1D4A8u32 => 0x2133u32, | |
3341 | 0x1D4ADu32 => 0x211Bu32, | |
3342 | 0x1D4BAu32 => 0x212Fu32, | |
3343 | 0x1D4BCu32 => 0x210Au32, | |
3344 | 0x1D4C4u32 => 0x2134u32, | |
3345 | 0x1D506u32 => 0x212Du32, | |
3346 | 0x1D50Bu32 => 0x210Cu32, | |
3347 | 0x1D50Cu32 => 0x2111u32, | |
3348 | 0x1D515u32 => 0x211Cu32, | |
3349 | 0x1D51Du32 => 0x2128u32, | |
3350 | 0x1D53Au32 => 0x2102u32, | |
3351 | 0x1D53Fu32 => 0x210Du32, | |
3352 | 0x1D545u32 => 0x2115u32, | |
3353 | 0x1D547u32 => 0x2119u32, | |
3354 | 0x1D548u32 => 0x211Au32, | |
3355 | 0x1D549u32 => 0x211Du32, | |
3356 | 0x1D551u32 => 0x2124u32, | |
3357 | }; | |
3358 | ||
3359 | 213 | return unsafe { char::from_u32_unchecked( // safe because the values are a char or from the table above |
3360 | 213 | match EXCEPTIONS.get(&ch) { |
3361 | 161 | None => ch, |
3362 | 52 | Some(exception_value) => *exception_value, |
3363 | } | |
3364 | ) } | |
3365 | 213 | } |
3366 | 197 | } |
3367 | 42.1k | } |
3368 | ||
3369 | 41.5k | fn math_alphanumeric_to_ascii(input: &str) -> Option<String> { |
3370 | 41.5k | let mut result = String::with_capacity(input.len()); |
3371 | ||
3372 | 46.6k | for c in |
3373 | 46.6k | let |
3374 | // Standard ASCII | |
3375 | 18.4k | 'a'..='z' | |
3376 | ||
3377 | // Mathematical Bold (A-Z: U+1D400, a-z: U+1D41A) | |
3378 | 482 | '\u{1D400}'..='\u{1D419}' => |
3379 | 460 | '\u{1D41A}'..='\u{1D433}' => |
3380 | ||
3381 | // Mathematical Italic (A-Z: U+1D434, a-z: U+1D44E) | |
3382 | // Note: 'h' is missing from this range (U+210E) | |
3383 | 424 | '\u{1D434}'..='\u{1D44D}' => |
3384 | 414 | '\u{1D44E}'..='\u{1D467}' => |
3385 | ||
3386 | // Mathematical Bold Italic (A-Z: U+1D468, a-z: U+1D482) | |
3387 | 400 | '\u{1D468}'..='\u{1D481}' => |
3388 | 400 | '\u{1D482}'..='\u{1D49B}' => |
3389 | ||
3390 | // Mathematical Sans-Serif (A-Z: U+1D5A0, a-z: U+1D5BA) | |
3391 | 274 | '\u{1D5A0}'..='\u{1D5B9}' => |
3392 | 264 | '\u{1D5BA}'..='\u{1D5D3}' => |
3393 | ||
3394 | // If a character isn't a letter (or supported math letter), return None | |
3395 | 28.6k | _ => return None, |
3396 | }; | |
3397 | 18.0k | result.push(converted); |
3398 | } | |
3399 | ||
3400 | 12.8k | Some(result) |
3401 | 41.5k | } |
3402 | ||
3403 | 14.6k | fn canonicalize_mo_text(&self, mo: Element) { |
3404 | // lazy_static! { (NOTE: std::sync::LazyLock is now used instead) | |
3405 | // static ref IS_LIKELY_SCALAR_VARIABLE: Regex = Regex::new("[a-eh-z]").unwrap(); | |
3406 | // } | |
3407 | ||
3408 | 14.6k | let mut mo_text = as_text(mo); |
3409 | 14.6k | let parent = get_parent(mo); |
3410 | 14.6k | let parent_name = name(parent); |
3411 | 14.6k | let is_base = mo.preceding_siblings().is_empty(); |
3412 | 14.6k | if !is_base && ( |
3413 | // canonicalize various diacritics for munder, mover, munderover | |
3414 | 309 | mo_text = match mo_text { |
3415 | 309 | "_" | |
3416 | 302 | "\u{2010}" | "\u{2011}" | "\u{2012}" | "\u{2013}" | "\u{2014}" | "\u{2015}" | |
3417 | 292 | "\u{02BC}" => |
3418 | 292 | "\u{02DC}" | "\u{223C}" => |
3419 | 292 | "\u{02C6}"| "\u{0302}" => |
3420 | 292 | "\u{0307}" => |
3421 | 292 | "\u{0308}" => |
3422 | 292 | _ => mo_text, |
3423 | } | |
3424 | // FIX: MathType generates the wrong version of union and intersection ops (binary instead of unary) | |
3425 | 14.3k | } else if !is_base && ( |
3426 | 227 | mo_text = match mo_text { |
3427 | 227 | "\u{00BA}"| "\u{2092}"| "\u{20D8}"| "\u{2218}" | |
3428 | 223 | _ => mo_text, |
3429 | }; | |
3430 | } else { | |
3431 | 14.0k | mo_text = match mo_text { |
3432 | 14.0k | "\u{02C9}"| "\u{0304}"| "\u{0305}" => |
3433 | 14.0k | "\u{02DC}" | "~" => |
3434 | 14.0k | "\u{01C1}" => |
3435 | ||
3436 | 14.0k | _ => mo_text, |
3437 | }; | |
3438 | }; | |
3439 | 14.6k | if mo_text == "\u{2212}" { |
3440 | 314 | mo_text = "-"; |
3441 | 14.2k | } |
3442 | 14.6k | mo.set_text(mo_text); |
3443 | 14.6k | } |
3444 | ||
3445 | ||
3446 | // Find the operator associated with the 'mo_node' | |
3447 | // This is complicated by potentially needing to distinguish between the | |
3448 | // prefix, infix, or postfix version of the operator. | |
3449 | // To figure out prefix, we need to look at the node on the left; for postfix, we need to look to the left | |
3450 | // If the node of the left has been parsed, then this works. | |
3451 | // For example, suppose we want to determine if the "+" in 'x < n!+1' is prefix or infix. | |
3452 | // If we simply looked left without parsing, we'd see an operator and choose prefix unless we could figure out that | |
3453 | // that "!" was postfix. But if it had been parsed, we'd see an mrow (operand) and tree "+" as infix (as it should). | |
3454 | // The same problem applies on the right for postfix operators, but a problem is rare for those | |
3455 | // e.g., n!!n -- ((n!)!)*n or (n!)*(!n) -- the latter doesn't make semantic sense though | |
3456 | // FIX: the above ignores mspace and other nodes that need to be skipped to determine the right node to determine airity | |
3457 | // FIX: the postfix problem above should be addressed | |
3458 | 19.4k | fn find_operator<'a>(context: Option<&CanonicalizeContext>, mo_node: Element<'a>, previous_operator: Option<&'static OperatorInfo>, |
3459 | 19.4k | previous_node: Option<Element<'a>>, next_node: Option<Element<'a>>) -> &'static OperatorInfo { |
3460 | // get the unicode value and return the OpKeyword associated with it | |
3461 | 19.4k | assert!( name(mo_node) == "mo"); |
3462 | ||
3463 | // if a form has been given, that takes precedence | |
3464 | 19.4k | let form = mo_node.attribute_value("form"); |
3465 | 19.4k | let op_type = match form { |
3466 | 19.4k | None => match context { |
3467 | 5.50k | None => OperatorTypes::POSTFIX, // what compute_type_from_position returns when the other args to this are all None |
3468 | 13.9k | Some(context) => compute_type_from_position(context, previous_operator, previous_node, next_node), |
3469 | }, | |
3470 | 10 | Some(form) => match form.to_lowercase().as_str() { |
3471 | 10 | "prefix" => |
3472 | 6 | "postfix" => |
3473 | 4 | _ => OperatorTypes::INFIX, |
3474 | } | |
3475 | }; | |
3476 | ||
3477 | 19.4k | let found_op_info = if mo_node.attribute_value(CHEMICAL_BOND).is_some() { |
3478 | 112 | Some(&IMPLIED_CHEMICAL_BOND) |
3479 | } else { | |
3480 | 19.3k | OPERATORS.get(as_text(mo_node)) |
3481 | }; | |
3482 | 19.4k | if found_op_info.is_none() { |
3483 | // no known operator -- return the unknown operator with the correct "fix" type | |
3484 | 49 | return op_not_in_operator_dictionary(op_type); |
3485 | 19.4k | } |
3486 | ||
3487 | 19.4k | let found_op_info = found_op_info.unwrap(); |
3488 | 19.4k | let matching_op_info = find_operator_info(found_op_info, op_type, form.is_some()); |
3489 | 19.4k | if ptr_eq(matching_op_info, &ILLEGAL_OPERATOR_INFO) { |
3490 | 0 | return op_not_in_operator_dictionary(op_type); |
3491 | } else { | |
3492 | 19.4k | return matching_op_info; |
3493 | } | |
3494 | ||
3495 | ||
3496 | 13.9k | fn compute_type_from_position<'a>(context: &CanonicalizeContext, previous_operator: Option<&'static OperatorInfo>, previous_node: Option<Element<'a>>, next_node: Option<Element<'a>>) -> OperatorTypes { |
3497 | // based on choices, pick one that fits the context | |
3498 | // if there isn't an obvious one, we have parsed the left, but not the right, so discount that | |
3499 | ||
3500 | // Trig functions have some special syntax | |
3501 | // We need to treat '-' as prefix for things like "sin -2x" | |
3502 | // Need to be careful because (sin - cos)(x) needs an infix '-' | |
3503 | // Return either the prefix or infix version of the operator | |
3504 | 13.9k | if next_node.is_some() && |
3505 | 11.9k | context.is_function_name(get_possible_embellished_node(next_node.unwrap()), None) == FunctionNameCertainty::True { |
3506 | 260 | return OperatorTypes::INFIX; |
3507 | 13.6k | } |
3508 | 13.6k | if previous_node.is_some() && |
3509 | 11.1k | context.is_function_name(get_possible_embellished_node(previous_node.unwrap()), None) == FunctionNameCertainty::True { |
3510 | 207 | return OperatorTypes::PREFIX; |
3511 | 13.4k | } |
3512 | ||
3513 | // after that special case, start with the obvious cases... | |
3514 | 13.4k | let operand_on_left = previous_operator.is_none() || |
3515 | 13.4k | let operand_on_right = next_node.is_some() && |
3516 | ||
3517 | 13.4k | if operand_on_left && |
3518 | 8.19k | return OperatorTypes::INFIX; // infix |
3519 | 5.29k | } else if !operand_on_left && |
3520 | 2.75k | return OperatorTypes::PREFIX; // prefix |
3521 | 2.54k | } else if operand_on_left && |
3522 | 2.34k | return OperatorTypes::POSTFIX; // postfix |
3523 | } else { | |
3524 | // either two operators in a row or right hand side not parsed so we don't really know what is right (same is true above) | |
3525 | // since there is nothing good to return, assume right is an operand after parsing (thus infix case) | |
3526 | 196 | return OperatorTypes::INFIX; |
3527 | } | |
3528 | 13.9k | } |
3529 | ||
3530 | 19.4k | fn find_operator_info(op_info: &OperatorInfo, op_type: OperatorTypes, from_form_attr: bool) -> &OperatorInfo { |
3531 | 19.4k | if op_info.is_operator_type(op_type) { |
3532 | 12.9k | return op_info; |
3533 | 6.45k | } else if let Some( |
3534 | 1.64k | if next_op_info.is_operator_type(op_type) { |
3535 | 730 | return next_op_info; |
3536 | 915 | } else if let Some( |
3537 | 256 | && last_op_info.is_operator_type(op_type) { |
3538 | 256 | return last_op_info; |
3539 | 659 | } |
3540 | 4.81k | } |
3541 | ||
3542 | // didn't find op_info that matches -- if type is not forced, then return first value (any is probably ok) | |
3543 | 5.47k | return if from_form_attr { |
3544 | 19.4k | } |
3545 | ||
3546 | 49 | fn op_not_in_operator_dictionary(op_type: OperatorTypes) -> &'static OperatorInfo { |
3547 | 49 | return match op_type { |
3548 | 16 | OperatorTypes::PREFIX => &DEFAULT_OPERATOR_INFO_PREFIX, |
3549 | 9 | OperatorTypes::POSTFIX => &DEFAULT_OPERATOR_INFO_POSTFIX, |
3550 | 24 | _ => &DEFAULT_OPERATOR_INFO_INFIX, // should only be infix |
3551 | }; | |
3552 | 49 | } |
3553 | 19.4k | } |
3554 | ||
3555 | 13.9k | fn n_vertical_bars_on_right(&self, remaining_children: &[ChildOfElement], vert_bar_ch: &str) -> usize { |
3556 | // return the number of children that match 'vert_bar_op' not counting the first element | |
3557 | 13.9k | let mut n = 0; |
3558 | 149k | for child_of_element in |
3559 | 149k | let child = as_element(*child_of_element); |
3560 | 149k | if name(child) == "mo" { |
3561 | 49.9k | let operator_str = as_text(child); |
3562 | 49.9k | if operator_str == vert_bar_ch { |
3563 | 42.7k | n += 1; |
3564 | 42.7k |
|
3565 | 99.9k | } |
3566 | } | |
3567 | 13.9k | return n; |
3568 | 13.9k | } |
3569 | ||
3570 | ||
3571 | 13.9k | fn determine_vertical_bar_op<'a>(&self, original_op: &'static OperatorInfo, mo_node: Element<'a>, |
3572 | 13.9k | next_child: Option<Element<'a>>, |
3573 | 13.9k | parse_stack: &'a mut Vec<StackInfo>, |
3574 | 13.9k | n_vertical_bars_on_right: usize) -> &'static OperatorInfo { |
3575 | // if in a prefix location, it is a left fence | |
3576 | // note: if there is an operator on the top of the stack, it wants an operand (otherwise it would have been reduced) | |
3577 | 13.9k | let operator_str = as_text(mo_node); |
3578 | 13.9k | let found_op_info = OPERATORS.get(operator_str); |
3579 | 13.9k | if found_op_info.is_none() { |
3580 | 48 | return original_op; |
3581 | 13.8k | } |
3582 | 13.8k | let op = found_op_info.unwrap(); |
3583 | 13.8k | if !AMBIGUOUS_OPERATORS.contains(operator_str) { |
3584 | // debug!(" op is not ambiguous"); | |
3585 | 13.4k | return original_op; |
3586 | 401 | }; |
3587 | ||
3588 | 401 | let operator_versions = OperatorVersions::new(op); |
3589 | 401 | if let Some( |
3590 | 360 | (top(parse_stack).last_child_in_mrow().is_none() || |
3591 | // debug!(" is prefix"); | |
3592 | 115 | return prefix; |
3593 | 286 | } |
3594 | ||
3595 | // We have either a right fence or an infix operand at the top of the stack | |
3596 | // If this is already parsed, we'd look to the right to see if there is an operand after this child. | |
3597 | // But it isn't parsed and there might be a prefix operator which will eventually become an operand, so it is tricky. | |
3598 | // It is even trickier because we might have an implicit times, so we can't really tell | |
3599 | // For example: |x|y|z| which can be '|x| y |z|' or '|x |y| z|', or even | (x|y)|z |' | |
3600 | // We can't really know what is intended (without @intent). | |
3601 | // It seems like the case where it could be paired with a matching vertical bar as what most people would choose, so we favor that. | |
3602 | ||
3603 | // If there is a matching open vertical bar, it is either at the top of the stack or the entry just below the top | |
3604 | ||
3605 | 286 | let has_left_match = if let Some( |
3606 | 245 | if ptr_eq(top(parse_stack).op_pair.op, op_prefix) { // match at top of stack? (empty matching bars) |
3607 | 109 | true |
3608 | 136 | } else if parse_stack.len() > 2 { |
3609 | // matching op is below top (operand between matching bars) -- pop, peek, push | |
3610 | 36 | let old_top = parse_stack.pop().unwrap(); |
3611 | 36 | let top_op = top(parse_stack).op_pair.op; // can only access top, so we need to pop off top and push back later |
3612 | 36 | parse_stack.push(old_top); |
3613 | 36 | ptr_eq(top_op, op_prefix) |
3614 | } else { | |
3615 | 100 | false |
3616 | } | |
3617 | } else { | |
3618 | 41 | false |
3619 | }; | |
3620 | 286 | if let Some( |
3621 | // last child in row (must be a close) or we have a left match | |
3622 | // debug!(" is postfix"); | |
3623 | 136 | return postfix; |
3624 | 150 | } else if next_child.is_none() { |
3625 | // operand on left, so prefer infix version | |
3626 | 18 | return if let Some(infix) = operator_versions.infix {infix} else { |
3627 | 132 | } |
3628 | ||
3629 | 132 | let next_child = next_child.unwrap(); |
3630 | 132 | if let Some( |
3631 | // (" is prefix"); | |
3632 | 3 | return prefix; // odd number of vertical bars remain, so consider this the start of a pair |
3633 | 129 | } |
3634 | ||
3635 | 129 | let next_child = get_possible_embellished_node(next_child); |
3636 | 129 | let next_child_op = if name(next_child) != "mo" { |
3637 | 128 | None |
3638 | } else { | |
3639 | 1 | let next_next_children = next_child.following_siblings(); |
3640 | 1 | let next_next_child = if next_next_children.is_empty() { |
3641 | 1 | Some( CanonicalizeContext::find_operator(Some(self), next_child, operator_versions.infix, |
3642 | 1 | top(parse_stack).last_child_in_mrow(), next_next_child) ) |
3643 | }; | |
3644 | ||
3645 | // If the next child is a prefix op or a left fence, it will reduce to an operand, so don't consider it an operator | |
3646 | 129 | if next_child_op.is_some() && |
3647 | 0 | if let Some(postfix) =operator_versions.postfix { |
3648 | // debug!(" is postfix"); | |
3649 | 0 | return postfix; |
3650 | 0 | } |
3651 | 129 | } else if let Some(infix) = operator_versions.infix { |
3652 | // debug!(" is infix"); | |
3653 | 129 | return infix; |
3654 | 0 | } |
3655 | ||
3656 | // nothing good to match | |
3657 | 0 | return op; |
3658 | 13.9k | } |
3659 | ||
3660 | ||
3661 | // return FunctionNameCertainty::False or Maybe if 'node' is a chemical element and is followed by a state (solid, liquid, ...) | |
3662 | // in other words, we are certain this can't be a function since it looks like it is or might be chemistry | |
3663 | 1.71k | fn is_likely_chemical_state<'a>(&self, node: Element<'a>, right_sibling: Element<'a>) -> FunctionNameCertainty { |
3664 | 1.71k | assert_eq!(name(get_parent(node)), "mrow"); // should be here because we are parsing an mrow |
3665 | ||
3666 | // debug!(" in is_likely_chemical_state: '{}'?",element_summary(node)); | |
3667 | 1.71k | let node_chem_likelihood= node.attribute_value(MAYBE_CHEMISTRY); |
3668 | 1.71k | if node.attribute(MAYBE_CHEMISTRY).is_none() { |
3669 | 1.16k | return FunctionNameCertainty::True; |
3670 | 549 | } |
3671 | ||
3672 | 549 | if name(right_sibling) == "mrow" { // clean_chemistry_mrow made sure any state-like structure is an mrow |
3673 | 75 | let state_likelihood = likely_chem_state(right_sibling); |
3674 | 75 | if state_likelihood > 0 { |
3675 | 49 | right_sibling.set_attribute_value(MAYBE_CHEMISTRY, state_likelihood.to_string().as_str()); |
3676 | // at this point, we know both node and right_sibling are positive, so we have at least a maybe | |
3677 | 49 | if state_likelihood + node_chem_likelihood.unwrap().parse::<i32>().unwrap() > 2 { |
3678 | 49 | return FunctionNameCertainty::False; |
3679 | } else { | |
3680 | 0 | return FunctionNameCertainty::Maybe |
3681 | } | |
3682 | 26 | } |
3683 | 474 | } |
3684 | ||
3685 | 500 | return FunctionNameCertainty::True; |
3686 | 1.71k | } |
3687 | ||
3688 | // Try to figure out whether an <mi> is a function name or not. | |
3689 | // There are two important cases depending upon whether parens/brackets are used or not. | |
3690 | // E.g, sin x and f(x) | |
3691 | // 1. If parens follow the name, then we use a more inclusive set of heuristics as it is more likely a function | |
3692 | // The heuristics used are: | |
3693 | // - it is on the list of known function names (e.g., sin" and "log") | |
3694 | // - it is on the list of likely function names (e.g, f, g, h) | |
3695 | // - multi-char names that begin with a capital letter (e.g, "Tr") | |
3696 | // - there is a single token inside the parens (why else would someone use parens), any name (e.g, a(x)) | |
3697 | // - if there are multiple comma-separated args | |
3698 | // | |
3699 | // 2. If there are no parens, then only names on the known function list are used (e.g., "sin x") | |
3700 | // | |
3701 | // If the name if followed by parens but doesn't fit into the above categories, we return a "maybe" | |
3702 | 32.0k | fn is_function_name<'a>(&self, node: Element<'a>, right_siblings: Option<&[ChildOfElement<'a>]>) -> FunctionNameCertainty { |
3703 | 32.0k | let base_of_name = get_possible_embellished_node(node); |
3704 | ||
3705 | // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables | |
3706 | // FIX: need to allow for composition of function names. E.g, (f+g)(x) and (f^2/g)'(x) | |
3707 | 32.0k | let node_name = name(base_of_name); |
3708 | 32.0k | if node_name != "mi" && |
3709 | 15.4k | return FunctionNameCertainty::False; |
3710 | 16.6k | } |
3711 | // whitespace is sometimes added to the mi since braille needs it, so do a trim here to get function name | |
3712 | 16.6k | let base_name = as_text(base_of_name).trim(); |
3713 | 16.6k | if base_name.is_empty() { |
3714 | 2 | return FunctionNameCertainty::False; |
3715 | 16.6k | } |
3716 | // debug!(" is_function_name({}), {} following nodes", base_name, if right_siblings.is_none() {"No".to_string()} else {right_siblings.unwrap().len().to_string()}); | |
3717 | 16.6k | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
3718 | // names that are always function names (e.g, "sin" and "log") | |
3719 | 16.6k | let defs = defs.borrow(); |
3720 | 16.6k | let names = defs.get_hashset("FunctionNames").unwrap(); |
3721 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
3722 | 16.6k | if names.contains(&base_name.to_ascii_lowercase()) { |
3723 | // debug!(" ...is in FunctionNames"); | |
3724 | 1.02k | return FunctionNameCertainty::True; // always treated as function names |
3725 | 15.5k | } |
3726 | ||
3727 | // We include shapes as function names so that △ABC makes sense since △ and | |
3728 | // the other shapes are not in the operator dictionary | |
3729 | 15.5k | let shapes = defs.get_hashset("GeometryShapes").unwrap(); |
3730 | 15.5k | if shapes.contains(base_name) { |
3731 | 23 | return FunctionNameCertainty::True; // always treated as function names |
3732 | 15.5k | } |
3733 | ||
3734 | 15.5k | if right_siblings.is_none() { |
3735 | 13.8k | return FunctionNameCertainty::False; // only accept known names, which is tested above |
3736 | 1.71k | } |
3737 | ||
3738 | // make sure that what follows starts and ends with parens/brackets | |
3739 | 1.71k | assert_eq!(name(get_parent(node)), "mrow"); |
3740 | 1.71k | let right_siblings = right_siblings.unwrap(); |
3741 | 1.71k | let non_whitespace = right_siblings.iter().enumerate() |
3742 | 1.71k | .find(|&(_, child)| { |
3743 | 1.71k | let child = as_element(*child); |
3744 | 1.71k | name(child) != "mtext" || |
3745 | 1.71k | }); |
3746 | 1.71k | let right_siblings = if let Some( (i, _) ) = non_whitespace {&right_siblings[i..]} else { |
3747 | 1.71k | if right_siblings.is_empty() { |
3748 | // debug!(" ...right siblings not None, but zero of them"); | |
3749 | 0 | return FunctionNameCertainty::False; |
3750 | 1.71k | } |
3751 | ||
3752 | 1.71k | let first_child = as_element(right_siblings[0]); |
3753 | ||
3754 | // clean_chemistry wrapped up a state in an mrow and this is assumed by is_likely_chemical_state() | |
3755 | 1.71k | let chem_state_certainty = self.is_likely_chemical_state(node, first_child); |
3756 | 1.71k | if chem_state_certainty != FunctionNameCertainty::True { |
3757 | // debug!(" ...is_likely_chemical_state says it is a function ={:?}", chem_state_certainty); | |
3758 | 49 | return chem_state_certainty; |
3759 | 1.66k | } |
3760 | ||
3761 | 1.66k | if name(first_child) == "mrow" && |
3762 | // debug!(" ...trying again after expanding mrow"); | |
3763 | 235 | return self.is_function_name(node, Some(&first_child.children())); |
3764 | 1.43k | } |
3765 | ||
3766 | 1.43k | if right_siblings.len() < 2 { |
3767 | // debug!(" ...not enough right siblings"); | |
3768 | 542 | return FunctionNameCertainty::False; // can't be (...) |
3769 | 892 | } |
3770 | ||
3771 | // at least two siblings are this point -- check that they are parens/brackets | |
3772 | // we can only check the open paren/bracket because the right side is unparsed and we don't know the close location | |
3773 | 892 | let first_sibling = as_element(right_siblings[0]); |
3774 | 892 | if name(first_sibling) != "mo" || |
3775 | { | |
3776 | // debug!(" ...first sibling is not '(' or '['"); | |
3777 | 522 | return FunctionNameCertainty::False; |
3778 | 370 | } |
3779 | ||
3780 | 370 | let likely_names = defs.get_hashset("LikelyFunctionNames").unwrap(); |
3781 | 370 | if likely_names.contains(base_name) { |
3782 | 206 | return FunctionNameCertainty::True; // don't bother checking contents of parens, consider these as function names |
3783 | 164 | } |
3784 | ||
3785 | 164 | if is_single_arg(as_text(first_sibling), &right_siblings[1..]) { |
3786 | // debug!(" ...is single arg"); | |
3787 | 64 | return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? |
3788 | 100 | }; |
3789 | ||
3790 | 100 | if is_comma_arg(as_text(first_sibling), &right_siblings[1..]) { |
3791 | // debug!(" ...is comma arg"); | |
3792 | 2 | return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? |
3793 | 98 | }; |
3794 | ||
3795 | // FIX: should really make sure all the args are marked as MAYBE_CHEMISTRY, but we don't know the matching close paren/bracket | |
3796 | 98 | if node.attribute(MAYBE_CHEMISTRY).is_some() && |
3797 | 34 | as_element(right_siblings[1]).attribute(MAYBE_CHEMISTRY).is_some() { |
3798 | 1 | return FunctionNameCertainty::False; |
3799 | 97 | } |
3800 | ||
3801 | // Names like "Tr" are likely function names, single letter names like "M" or "J" are iffy | |
3802 | // This needs to be after the chemical state check above to rule out Cl(g), etc | |
3803 | // This would be better if it were part of 'likely_names' as "[A-Za-z]+", but reg exprs don't work in HashSets. | |
3804 | // FIX: create our own struct and write appropriate traits for it and then it could work | |
3805 | 97 | let mut chars = base_name.chars(); |
3806 | 97 | let first_char = chars.next().unwrap(); // we know there is at least one byte in it, hence one char |
3807 | 97 | if chars.next().is_some() && |
3808 | // debug!(" ...is uppercase name"); | |
3809 | 4 | return FunctionNameCertainty::True; |
3810 | 93 | } |
3811 | ||
3812 | // debug!(" ...didn't match options to be a function"); | |
3813 | // debug!("Right siblings:\n{} ", right_siblings.iter().map(|&child| mml_to_string(as_element(child))).collect::<Vec<String>>().join("\n ")); | |
3814 | 93 | return if is_name_inside_parens(base_name, right_siblings) { |
3815 | 16.6k | }); |
3816 | ||
3817 | 164 | fn is_single_arg(open: &str, following_nodes: &[ChildOfElement]) -> bool { |
3818 | // following_nodes are nodes after "(" | |
3819 | 164 | if following_nodes.is_empty() { |
3820 | 0 | return true; // "a(" might or might not be a function call -- treat as "is" because we can't see more |
3821 | 164 | } |
3822 | ||
3823 | 164 | let first_child = as_element(following_nodes[0]); |
3824 | 164 | if is_matching_right_paren(open, first_child) { |
3825 | 0 | return true; // no-arg case "a()" |
3826 | 164 | } |
3827 | ||
3828 | // could be really picky and restrict to checking for only mi/mn | |
3829 | // that might make more sense in stranger cases, but mfrac, msqrt, etc., probably shouldn't have parens if times | |
3830 | 164 | return following_nodes.len() > 1 && |
3831 | 164 | name(first_child) != "mrow" && |
3832 | 127 | is_matching_right_paren(open, as_element(following_nodes[1])); |
3833 | 164 | } |
3834 | ||
3835 | 100 | fn is_comma_arg(open: &str, following_nodes: &[ChildOfElement]) -> bool { |
3836 | // following_nodes are nodes after "(" | |
3837 | 100 | if following_nodes.len() == 1 { |
3838 | 0 | return false; |
3839 | 100 | } |
3840 | ||
3841 | 100 | let first_child = as_element(following_nodes[1]); |
3842 | 100 | if name(first_child) == "mrow" { |
3843 | 0 | return is_comma_arg(open, &first_child.children()[..]); |
3844 | 100 | } |
3845 | ||
3846 | // FIX: this loop is very simplistic and could be improved to count parens, etc., to make sure "," is at top-level | |
3847 | 318 | for child in |
3848 | 318 | let child = as_element(*child); |
3849 | 318 | if name(child) == "mo" { |
3850 | 141 | if as_text(child) == "," { |
3851 | 2 | return true; |
3852 | 139 | } |
3853 | 139 | if is_matching_right_paren(open, child) { |
3854 | 96 | return false; |
3855 | 43 | } |
3856 | 177 | } |
3857 | } | |
3858 | ||
3859 | 2 | return false; |
3860 | 100 | } |
3861 | ||
3862 | 622 | fn is_left_paren(node: Element) -> bool { |
3863 | 622 | if name(node) != "mo" { |
3864 | 1 | return false; |
3865 | 621 | } |
3866 | 621 | let text = as_text(node); |
3867 | 621 | return text == "(" || |
3868 | 622 | } |
3869 | ||
3870 | 430 | fn is_matching_right_paren(open: &str, node: Element) -> bool { |
3871 | 430 | if name(node) != "mo" { |
3872 | 184 | return false; |
3873 | 246 | } |
3874 | 246 | let text = as_text(node); |
3875 | // debug!(" is_matching_right_paren: open={}, close={}", open, text); | |
3876 | 246 | return (open == "(" && |
3877 | 430 | } |
3878 | ||
3879 | /// Returns true if the name of the potential function is inside the parens. In that case, it is very unlikely to be a function call | |
3880 | /// For example, "n(n+1)" | |
3881 | 93 | fn is_name_inside_parens(function_name: &str, right_siblings: &[ChildOfElement]) -> bool { |
3882 | // the first child of right_siblings is either '(' or '[' | |
3883 | // right_siblings may extend well beyond the closing parens, so we first break this into finding the contents | |
3884 | // then we search the contents for the name | |
3885 | 93 | match find_contents(right_siblings) { |
3886 | 2 | None => return false, |
3887 | 91 | Some(contents) => return is_name_inside_contents(function_name, contents), |
3888 | } | |
3889 | ||
3890 | ||
3891 | 93 | fn find_contents<'a>(right_siblings: &'a[ChildOfElement<'a>]) -> Option<&'a[ChildOfElement<'a>]> { |
3892 | 93 | let open_text = as_text(as_element(right_siblings[0])); |
3893 | 93 | let close_text = if open_text == "(" { |
3894 | 93 | let mut nesting_level = 1; |
3895 | 93 | let mut i = 1; |
3896 | 296 | while i < right_siblings.len() { |
3897 | 294 | let child = as_element(right_siblings[i]); |
3898 | 294 | if name(child) == "mo" { |
3899 | 133 | let op_text = as_text(child); |
3900 | 133 | if op_text == open_text { |
3901 | 0 | nesting_level += 1; |
3902 | 133 | } else if op_text == close_text { |
3903 | 91 | if nesting_level == 1 { |
3904 | 91 | return Some(&right_siblings[1..i]); |
3905 | 0 | } |
3906 | 0 | nesting_level -= 1; |
3907 | 42 | } |
3908 | 161 | } |
3909 | 203 | i += 1; |
3910 | } | |
3911 | 2 | return None; // didn't find matching paren |
3912 | 93 | } |
3913 | ||
3914 | 134 | fn is_name_inside_contents(function_name: &str, contents: &[ChildOfElement]) -> bool { |
3915 | 304 | for &child in |
3916 | 304 | let child = as_element(child); |
3917 | // debug!("is_name_inside_contents: child={}", mml_to_string(child)); | |
3918 | 304 | if is_leaf(child) { |
3919 | 261 | let text = as_text(child); |
3920 | 261 | if (name(child) == "mi" || |
3921 | 5 | return true; |
3922 | 256 | } |
3923 | 43 | } else if is_name_inside_contents(function_name, &child.children()) { |
3924 | 4 | return true; |
3925 | 39 | } |
3926 | } | |
3927 | 125 | return false; |
3928 | 134 | } |
3929 | 93 | } |
3930 | 32.0k | } |
3931 | ||
3932 | 5.79k | fn is_mixed_fraction<'a>(&self, integer_part: Element<'a>, fraction_children: &[ChildOfElement<'a>]) -> Result<bool> { |
3933 | // do some simple disqualifying checks on the fraction part | |
3934 | 5.79k | if fraction_children.is_empty() { |
3935 | 0 | return Ok( false ); |
3936 | 5.79k | } |
3937 | 5.79k | let right_child = as_element(fraction_children[0]); |
3938 | 5.79k | let right_child_name = name(right_child); |
3939 | 5.79k | if ! (right_child_name == "mfrac" || |
3940 | 5.68k | (right_child_name == "mrow" && |
3941 | 5.48k | (right_child_name == "mn" && |
3942 | 5.46k | return Ok( false ); |
3943 | 329 | }; |
3944 | ||
3945 | 329 | if !is_integer_part_ok(integer_part) { |
3946 | 219 | return Ok( false ); |
3947 | 110 | } |
3948 | ||
3949 | 110 | if right_child_name == "mfrac" { |
3950 | 75 | return Ok( is_mfrac_ok(right_child) ); |
3951 | 35 | } |
3952 | ||
3953 | 35 | return is_linear_fraction(self, fraction_children); |
3954 | ||
3955 | ||
3956 | 351 | fn is_int(integer_part: Element) -> bool { |
3957 | 351 | return name(integer_part) == "mn" && |
3958 | 351 | } |
3959 | ||
3960 | 329 | fn is_integer_part_ok(integer_part: Element) -> bool { |
3961 | // integer part must be either 'n' or '-n' (in an mrow) | |
3962 | 329 | let integer_part_name = name(integer_part); |
3963 | 329 | if integer_part_name == "mrow" { |
3964 | 83 | let children = integer_part.children(); |
3965 | 83 | if children.len() == 2 && |
3966 | 16 | name(as_element(children[0])) == "mo" && |
3967 | 0 | as_text(as_element(children[0])) == "-" { |
3968 | 0 | let integer_part = as_element(children[1]); |
3969 | 0 | return is_int(integer_part); |
3970 | 83 | } |
3971 | 83 | return false; |
3972 | 246 | }; |
3973 | ||
3974 | 246 | return is_int(integer_part); |
3975 | 329 | } |
3976 | ||
3977 | 75 | fn is_mfrac_ok(fraction_part: Element) -> bool { |
3978 | // fraction_part needs to have integer numerator and denominator (already tested it is a frac) | |
3979 | 75 | let fraction_children = fraction_part.children(); |
3980 | 75 | if fraction_children.len() != 2 { |
3981 | 0 | return false; |
3982 | 75 | } |
3983 | 75 | let numerator = as_element(fraction_children[0]); |
3984 | 75 | if name(numerator) != "mn" || |
3985 | 8 | return false; |
3986 | 67 | } |
3987 | 67 | let denominator = as_element(fraction_children[1]); |
3988 | 67 | return is_int(denominator); |
3989 | 75 | } |
3990 | ||
3991 | 66 | fn is_linear_fraction(canonicalize: &CanonicalizeContext, fraction_children: &[ChildOfElement]) -> Result<bool> { |
3992 | // two possibilities | |
3993 | // 1. '3 / 4' is in an mrow | |
3994 | // 2. '3 / 4' are three separate elements | |
3995 | 66 | let first_child = as_element(fraction_children[0]); |
3996 | 66 | if name(first_child) == "mrow" { |
3997 | 31 | if first_child.children().len() != 3 { |
3998 | 0 | return Ok( false ); |
3999 | 31 | } |
4000 | 31 | return is_linear_fraction(canonicalize, &first_child.children()) |
4001 | 35 | } |
4002 | ||
4003 | ||
4004 | // the length has been checked | |
4005 | 35 | assert!(fraction_children.len() >= 3); |
4006 | ||
4007 | 35 | if !is_int(first_child) { |
4008 | 30 | return Ok( false ); |
4009 | 5 | } |
4010 | 5 | let slash_part = canonicalize.canonicalize_mrows(as_element(fraction_children[1])) |
4011 | 5 | if name(slash_part) == "mo" && as_text(slash_part) == "/" { |
4012 | 3 | let denom = canonicalize.canonicalize_mrows(as_element(fraction_children[2])) |
4013 | 3 | return Ok( is_int(denom) ); |
4014 | 2 | } |
4015 | 2 | return Ok( false ); |
4016 | 66 | } |
4017 | 5.79k | } |
4018 | ||
4019 | /// implied comma when two numbers are adjacent and are in a script position | |
4020 | 5.72k | fn is_implied_comma<'a>(&self, prev: Element<'a>, current: Element<'a>, mrow: Element<'a>) -> bool { |
4021 | 5.72k | if name(prev) != "mn" || |
4022 | 5.63k | return false; |
4023 | 95 | } |
4024 | ||
4025 | 95 | assert_eq!(name(mrow), "mrow"); |
4026 | 95 | let container = get_parent(mrow); |
4027 | 95 | let name = name(container); |
4028 | ||
4029 | // test for script position is that it is not the base and hence has a preceding sibling | |
4030 | 95 | return (name == "msub" || |
4031 | 5.72k | } |
4032 | ||
4033 | /// implied separator when two capital letters are adjacent or two chemical elements | |
4034 | 5.64k | fn is_implied_chemical_bond<'a>(&self, prev: Element<'a>, current: Element<'a>) -> bool { |
4035 | // debug!("is_implied_chemical_bond: previous: {:?}", prev.preceding_siblings()); | |
4036 | // debug!("is_implied_chemical_bond: following: {:?}", prev.following_siblings()); | |
4037 | 5.64k | if prev.attribute(MAYBE_CHEMISTRY).is_none() || |
4038 | 5.18k | return false; |
4039 | 462 | } |
4040 | // ABC example where B and C are chemical elements is why we need to scan further than just checking B and C | |
4041 | // look for an mi/mtext with @MAYBE_CHEMISTRY until we get to something that can't have it | |
4042 | 626 | for child in |
4043 | 626 | if !is_valid_chemistry(as_element(child)) { |
4044 | 11 | return false; |
4045 | 615 | } |
4046 | } | |
4047 | 851 | for child in |
4048 | 851 | if !is_valid_chemistry(as_element(child)) { |
4049 | 32 | return false; |
4050 | 819 | } |
4051 | } | |
4052 | 419 | return true; // sequence of all MAYBE_CHEMISTRY |
4053 | ||
4054 | 1.47k | fn is_valid_chemistry(child: Element) -> bool { |
4055 | 1.47k | let child = get_possible_embellished_node(child); |
4056 | 1.47k | return child.attribute(MAYBE_CHEMISTRY).is_some() || ( |
4057 | 1.47k | } |
4058 | 5.64k | } |
4059 | ||
4060 | /// implied separator when two capital letters are adjacent or two chemical elements | |
4061 | /// also for adjacent omission chars | |
4062 | 5.22k | fn is_implied_separator<'a>(&self, prev: Element<'a>, current: Element<'a>) -> bool { |
4063 | 5.22k | if name(prev) != "mi" || |
4064 | 4.83k | return false; |
4065 | 390 | } |
4066 | ||
4067 | // trim because whitespace might have gotten stuffed into the <mi>s | |
4068 | 390 | let prev_text = as_text(prev).trim(); |
4069 | 390 | let current_text = as_text(current).trim(); |
4070 | 390 | return prev_text.len() == 1 && |
4071 | 317 | ((is_cap(prev_text) && |
4072 | 151 | (prev_text=="_" && |
4073 | ||
4074 | ||
4075 | 491 | fn is_cap(str: &str) -> bool { |
4076 | 491 | assert_eq!(str.len(), 1); |
4077 | 491 | return str.chars().next().unwrap().is_ascii_uppercase(); |
4078 | 491 | } |
4079 | 5.22k | } |
4080 | ||
4081 | 42 | fn is_invisible_char_element(mathml: Element) -> bool { |
4082 | 42 | if !is_leaf(mathml) { |
4083 | 8 | return false |
4084 | 34 | } |
4085 | 34 | let text = as_text(mathml); |
4086 | 34 | if text.len() != 3 { // speed hack: invisible chars are three UTF-8 chars |
4087 | 28 | return false; |
4088 | 6 | } |
4089 | 6 | let ch = text.chars().next().unwrap(); |
4090 | 6 | return ('\u{2061}'..='\u{2064}').contains(&ch); |
4091 | 42 | } |
4092 | ||
4093 | // Add the current operator if it's not n-ary to the stack | |
4094 | // 'current_child' and it the operator to the stack. | |
4095 | 17.7k | fn shift_stack<'s, 'a:'s, 'op:'a>( |
4096 | 17.7k | &self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>, |
4097 | 17.7k | current_child: Element<'a>, |
4098 | 17.7k | current_op: OperatorPair<'op>) -> (Element<'a>, OperatorPair<'op>) { |
4099 | 17.7k | let mut new_current_child = current_child; |
4100 | 17.7k | let mut new_current_op = current_op.clone(); |
4101 | 17.7k | let previous_op = top(parse_stack).op_pair.clone(); |
4102 | // debug!(" shift_stack: mrow len={}", top(parse_stack).mrow.children().len().to_string()); | |
4103 | // debug!(" shift_stack: shift on '{}'; ops: prev '{}/{}', cur '{}/{}'", | |
4104 | // element_summary(current_child),show_invisible_op_char(previous_op.ch), previous_op.op.priority, | |
4105 | // show_invisible_op_char(current_op.ch), current_op.op.priority); | |
4106 | 17.7k | if !current_op.op.is_nary(previous_op.op) { |
4107 | // grab operand on top of stack (if there is one) and make it part of the new mrow since current op has higher precedence | |
4108 | // if operators are the same and are binary, then this push makes them act as left associative | |
4109 | 13.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4110 | 13.0k | if top_of_stack.mrow.children().is_empty() || ( |
4111 | 138 | // "bad" syntax - no operand on left -- don't grab operand (there is none) |
4112 | 138 | // just start a new mrow beginning with operator |
4113 | 138 | // FIX -- check this shouldn't happen: parse_stack.push(top_of_stack); |
4114 | 138 | parse_stack.push( top_of_stack ); // put top back on |
4115 | 138 | parse_stack.push( StackInfo::new(current_child.document()) ); |
4116 | 12.8k | } else if current_op.op.is_right_fence() { |
4117 | // likely, but not necessarily, there is a left fence to start the mrow | |
4118 | // this is like the postfix case except we grab the entire mrow, push on the close, and make that the mrow | |
4119 | // note: the code does these operations on the stack for consistency, but it could be optimized without push/popping the stack | |
4120 | 1.96k | let mrow = top_of_stack.mrow; |
4121 | 1.96k | top_of_stack.add_child_to_mrow(current_child, current_op); |
4122 | // debug!("shift_stack: after adding right fence to mrow:\n{}", mml_to_string(mrow)); | |
4123 | 1.96k | new_current_op = OperatorPair::new(); // treat matched brackets as operand |
4124 | 1.96k | new_current_child = mrow; |
4125 | 1.96k | let children = mrow.children(); |
4126 | 1.96k | let base_of_first_child = get_possible_embellished_node(as_element(children[0])); |
4127 | // debug!("looking for left fence: len={}, {:#?}", children.len(), CanonicalizeContext::find_operator(Some(self), base_of_first_child, None, Some(as_element(children[0])), Some(mrow))); | |
4128 | 1.96k | if children.len() == 2 && |
4129 | 64 | (name(base_of_first_child) != "mo" || |
4130 | 13 | !CanonicalizeContext::find_operator(Some(self), base_of_first_child, None, |
4131 | 51 | Some( |
4132 | 51 | // the mrow did *not* start with an open (hence no push) |
4133 | 51 | // since parser really wants balanced parens to keep stack state right, we do a push here |
4134 | 51 | parse_stack.push( StackInfo::new(mrow.document()) ); |
4135 | 51 | } else { |
4136 | // the mrow started with some open fence (which caused a push) -- add the close, pop, and push on the "operand" | |
4137 | 1.91k | new_current_child = self.potentially_lift_script(mrow) |
4138 | } | |
4139 | 10.9k | } else if current_op.op.is_postfix() { |
4140 | 81 | // grab the left operand and start a new mrow with it and the operator -- put those back on the stack |
4141 | 81 | // note: the code does these operations on the stack for consistency, but it could be optimized without push/popping the stack |
4142 | 81 | let previous_child = top_of_stack.remove_last_operand_from_mrow(); // remove operand from mrow |
4143 | 81 | parse_stack.push(top_of_stack); |
4144 | 81 | let mut new_top_of_stack = StackInfo::with_op(¤t_child.document(), previous_child, current_op.clone()); // begin new mrow with operand |
4145 | 81 | new_top_of_stack.add_child_to_mrow(current_child, current_op); // add on operator |
4146 | 81 | new_current_child = new_top_of_stack.mrow; // grab for pushing on old mrow |
4147 | 81 | new_current_op = OperatorPair::new(); // treat "reduced" postfix operator & operand as an operand |
4148 | 81 | // debug!("shift_stack: after adding postfix to mrow has len: {}", new_current_child.children().len().to_string()); |
4149 | 10.8k | } else { |
4150 | 10.8k | // normal infix op case -- grab the left operand and start a new mrow with it and the operator |
4151 | 10.8k | let previous_child = top_of_stack.remove_last_operand_from_mrow(); |
4152 | 10.8k | parse_stack.push(top_of_stack); |
4153 | 10.8k | parse_stack.push( StackInfo::with_op(¤t_child.document(),previous_child, current_op) ); |
4154 | 10.8k | } |
4155 | 4.73k | } |
4156 | 17.7k | return (new_current_child, new_current_op); |
4157 | 17.7k | } |
4158 | ||
4159 | ||
4160 | 25.2k | fn reduce_stack<'s, 'a:'s, 'op:'a>(&self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>, current_priority: usize) { |
4161 | 25.2k | let mut prev_priority = top(parse_stack).priority(); |
4162 | // debug!(" reduce_stack: stack len={}, priority: prev={}, cur={}", parse_stack.len(), prev_priority, current_priority); | |
4163 | 37.2k | while current_priority < prev_priority { // pop off operators until we are back to the right level |
4164 | 12.0k | if parse_stack.len() == 1 { |
4165 | 0 | break; // something went wrong -- break before popping too much |
4166 | 12.0k | } |
4167 | 12.0k | prev_priority = self.reduce_stack_one_time(parse_stack); |
4168 | }; | |
4169 | 25.2k | } |
4170 | ||
4171 | 12.0k | fn reduce_stack_one_time<'s, 'a:'s, 'op:'a>(&self, parse_stack: &'s mut Vec<StackInfo<'a, 'op>>) -> usize { |
4172 | 12.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4173 | // debug!(" ..popped len={} op:'{}/{}', operand: {}", | |
4174 | // top_of_stack.mrow.children().len(), | |
4175 | // show_invisible_op_char(top_of_stack.op_pair.ch), top_of_stack.op_pair.op.priority, | |
4176 | // top_of_stack.is_operand); | |
4177 | 12.0k | let mut mrow = top_of_stack.mrow; |
4178 | 12.0k | if mrow.children().len() == 1 && |
4179 | 63 | // should have added at least operator and operand, but input might not be well-formed |
4180 | 63 | // in this case, unwrap the mrow and expose the single child for pushing onto stack |
4181 | 63 | let single_child = top_of_stack.remove_last_operand_from_mrow(); |
4182 | 63 | mrow = single_child; |
4183 | 11.9k | } |
4184 | ||
4185 | 12.0k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4186 | 12.0k | top_of_stack.add_child_to_mrow(mrow, OperatorPair::new()); // mrow on top is "parsed" -- now add it to previous |
4187 | 12.0k | let prev_priority = top_of_stack.priority(); |
4188 | 12.0k | parse_stack.push(top_of_stack); |
4189 | 12.0k | return prev_priority; |
4190 | 12.0k | } |
4191 | ||
4192 | 5.06k | fn is_trig_arg<'a, 'op:'a>(&self, previous_child: Element<'a>, current_child: Element<'a>, parse_stack: &mut Vec<StackInfo<'a, 'op>>) -> bool { |
4193 | // We have operand-operand and know we want multiplication at this point. | |
4194 | // Check for special case where we want multiplication to bind more tightly than function app (e.g, sin 2x, sin -2xy) | |
4195 | // We only want to do this for simple args | |
4196 | // debug!(" is_trig_arg: prev {}, current {}, Stack:", element_summary(previous_child), element_summary(current_child)); | |
4197 | // parse_stack.iter().for_each(|stack_info| debug!(" {}", stack_info)); | |
4198 | 5.06k | if !IsNode::is_simple(current_child) { |
4199 | 2.98k | return false; |
4200 | 2.07k | } |
4201 | // This only matters if we are not inside of parens | |
4202 | 2.07k | if IsBracketed::is_bracketed(previous_child, "(", ")", false, false) || |
4203 | 2.01k | IsBracketed::is_bracketed(previous_child, "[", "]", false, false) { |
4204 | 63 | return false; |
4205 | 2.01k | } |
4206 | ||
4207 | // Use lower priority multiplication if current_child is a function (e.g. "cos" in "sin x cos 3y") | |
4208 | // if !is_trig(current_child) { | |
4209 | 2.01k | if self.is_function_name(current_child, None) == FunctionNameCertainty::True { |
4210 | 1 | return false; |
4211 | 2.01k | } |
4212 | // Three cases: | |
4213 | // 1. First operand-operand (e.g, sin 2x, where 'current_child' is 'x') -- top of stack is mrow('sin' f_apply '2') | |
4214 | // 2. Another First operand-operand (e.g, sin -2x, where 'current_child' is 'x') -- top of stack is mrow('-' '2'), next is mrow('sin', f_apply) | |
4215 | // 3. Subsequent operand-operand (e.g, sin 2xy, where 'current_child' is 'y') -- top of stack is mrow('2' 'times' 'x') | |
4216 | // Note: IMPLIED_TIMES_HIGH_PRIORITY is only present if we have a trig function | |
4217 | 2.01k | let op_on_top = &top(parse_stack).op_pair; |
4218 | 2.01k | if ptr_eq(op_on_top.op, *INVISIBLE_FUNCTION_APPLICATION) { |
4219 | 8 | let function_element = as_element(top(parse_stack).mrow.children()[0]); |
4220 | 8 | return is_trig(function_element); |
4221 | 2.00k | } |
4222 | 2.00k | if ptr_eq(op_on_top.op, *PREFIX_MINUS) { |
4223 | 74 | if parse_stack.len() < 2 { |
4224 | 0 | return false; |
4225 | 74 | } |
4226 | 74 | let next_stack_info = &parse_stack[parse_stack.len()-2]; |
4227 | 74 | if !ptr_eq(next_stack_info.op_pair.op, *INVISIBLE_FUNCTION_APPLICATION) { |
4228 | 72 | return false; |
4229 | 2 | } |
4230 | 2 | let function_element = as_element(next_stack_info.mrow.children()[0]); |
4231 | 2 | if is_trig(function_element) { |
4232 | // want '- 2' to be an mrow; don't want '- 2 x ...' to be the mrow (IMPLIED_TIMES_HIGH_PRIORITY is an internal hack) | |
4233 | 1 | self.reduce_stack_one_time(parse_stack); |
4234 | 1 | return true; |
4235 | 1 | } |
4236 | 1 | return false; |
4237 | 1.92k | } |
4238 | 1.92k | return ptr_eq(op_on_top.op, &IMPLIED_TIMES_HIGH_PRIORITY); |
4239 | ||
4240 | 10 | fn is_trig(node: Element) -> bool { |
4241 | 10 | let base_of_name = get_possible_embellished_node(node); |
4242 | ||
4243 | // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables | |
4244 | 10 | let node_name = name(base_of_name); |
4245 | 10 | if node_name != "mi" && |
4246 | 0 | return false; |
4247 | 10 | } |
4248 | // whitespace is sometimes added to the mi since braille needs it, so do a trim here to get function name | |
4249 | 10 | let base_name = as_text(base_of_name).trim(); |
4250 | 10 | if base_name.is_empty() { |
4251 | 0 | return false; |
4252 | 10 | } |
4253 | 10 | return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { |
4254 | // names that are always function names (e.g, "sin" and "log") | |
4255 | 10 | let defs = defs.borrow(); |
4256 | 10 | let names = defs.get_hashset("TrigFunctionNames").unwrap(); |
4257 | // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case | |
4258 | 10 | return names.contains(&base_name.to_ascii_lowercase()); |
4259 | 10 | }); |
4260 | 10 | } |
4261 | 5.06k | } |
4262 | ||
4263 | ||
4264 | /* | |
4265 | canonicalize_mrows_in_mrow is a simple(ish) operator precedence parser. | |
4266 | It works by keeping a stack of 'StackInfo': | |
4267 | 'StackInfo' has three parts: | |
4268 | 1. the mrow being build | |
4269 | 2. info about the operator in the mrow being build | |
4270 | 3. bool to say whether the last thing is an operator or an operand | |
4271 | ||
4272 | When the op priority increases (eg, have "=" and get "+"), we push on | |
4273 | 1. a new mrow -- if the operator has a left operand, we remove the last node in the mrow and it becomes | |
4274 | the first (only so far) child of the new mrow | |
4275 | 2. the operator info | |
4276 | ||
4277 | When the op priority decreases, we do the following loop until the this new priority > priority on top of stack | |
4278 | 1. pop the StackInfo | |
4279 | 2. add the StackInfo's mrow as the last child to the new top of the stack | |
4280 | We also do this when we hit the end of the mrow (we can treat this case as if we have a negative precedence) | |
4281 | ||
4282 | +/- are treated as nary operators and don't push/pop in those cases. | |
4283 | consecutive operands such as nary times are also considered n-ary operators and don't push/pop in those cases. | |
4284 | */ | |
4285 | 7.48k | fn canonicalize_mrows_in_mrow<'a>(&self, mrow: Element<'a>) -> Result<Element<'a>> { |
4286 | 7.48k | let is_ok_to_merge_child = mrow.children().len() != 1 || |
4287 | 7.48k | let saved_mrow_attrs = mrow.attributes(); |
4288 | 7.48k | assert_eq!(name(mrow), "mrow"); |
4289 | ||
4290 | // FIX: don't touch/canonicalize | |
4291 | // 1. if intent is given -- anything intent references | |
4292 | // 2. if the mrow starts or ends with a fence, don't merge into parent (parse children only) -- allows for "]a,b[" | |
4293 | 7.48k | let mut parse_stack = vec![StackInfo::new(mrow.document())]; |
4294 | 7.48k | let mut children = mrow.children(); |
4295 | 7.48k | let num_children = children.len(); |
4296 | ||
4297 | 36.7k | for i_child in |
4298 | // debug!("\nDealing with child #{}: {}", i_child, mml_to_string(as_element(children[i_child]))); | |
4299 | 36.7k | let mut current_child = self.canonicalize_mrows(as_element(children[i_child])) |
4300 | 36.7k | children[i_child] = ChildOfElement::Element( current_child ); |
4301 | 36.7k | let base_of_child = get_possible_embellished_node(current_child); |
4302 | 36.7k | let acts_as_ch = current_child.attribute_value(ACT_AS_OPERATOR); |
4303 | 36.7k | let mut current_op = OperatorPair::new(); |
4304 | // figure what the current operator is -- it either comes from the 'mo' (if we have an 'mo') or it is implied | |
4305 | 36.7k | if (name(base_of_child) == "mo" && |
4306 | 13.9k | !( base_of_child.children().is_empty() || as_text(base_of_child) == "\u{00A0}" )) || // shouldn't have empty mo node, but... |
4307 | 22.8k | acts_as_ch.is_some() { |
4308 | 13.9k | let previous_op = if top(&parse_stack).is_operand { |
4309 | 13.9k | let next_node = if i_child + 1 < num_children { |
4310 | 13.9k | if let Some( |
4311 | 20 | // ∇× (etc) hack, including ∇ being a vector (maybe eventually others) |
4312 | 20 | let temp_mo = create_mathml_element(¤t_child.document(), "mo"); |
4313 | 20 | temp_mo.set_text(acts_as_ch); |
4314 | 20 | current_op = OperatorPair{ |
4315 | 20 | ch: acts_as_ch, |
4316 | 20 | op: CanonicalizeContext::find_operator(Some(self), temp_mo, previous_op, |
4317 | 20 | top(&parse_stack).last_child_in_mrow(), next_node) |
4318 | 20 | }; |
4319 | 13.9k | } else { |
4320 | 13.9k | current_op = OperatorPair{ |
4321 | 13.9k | ch: as_text(base_of_child), |
4322 | 13.9k | op: CanonicalizeContext::find_operator(Some(self), base_of_child, previous_op, |
4323 | 13.9k | top(&parse_stack).last_child_in_mrow(), next_node) |
4324 | 13.9k | }; |
4325 | 13.9k | |
4326 | 13.9k | // deal with vertical bars which might be infix, open, or close fences |
4327 | 13.9k | // note: mrow shrinks as we iterate through it (removing children from it) |
4328 | 13.9k | current_op.op = self.determine_vertical_bar_op( |
4329 | 13.9k | current_op.op, |
4330 | 13.9k | base_of_child, |
4331 | 13.9k | next_node, |
4332 | 13.9k | &mut parse_stack, |
4333 | 13.9k | self.n_vertical_bars_on_right(&children[i_child+1..], current_op.ch) |
4334 | 13.9k | ); |
4335 | 13.9k | } |
4336 | } else { | |
4337 | 22.8k | let previous_child = top(&parse_stack).last_child_in_mrow(); |
4338 | 22.8k | if let Some( |
4339 | 17.9k | let base_of_previous_child = get_possible_embellished_node(previous_child); |
4340 | 17.9k | let acts_as_ch = previous_child.attribute_value(ACT_AS_OPERATOR); |
4341 | 17.9k | if name(base_of_previous_child) != "mo" && |
4342 | 6.55k | let likely_function_name = self.is_function_name(previous_child, Some(&children[i_child..])); |
4343 | 6.55k | if name(base_of_child) == "mtext" && |
4344 | 1 | base_of_child.set_attribute_value("data-function-likelihood", &(likely_function_name == FunctionNameCertainty::True).to_string()); |
4345 | 1 | base_of_child.remove_attribute("data-was-mo"); |
4346 | 1 | set_mathml_name(base_of_child, "mo"); |
4347 | 1 | let mut top_of_stack = parse_stack.pop().unwrap(); |
4348 | 1 | top_of_stack.add_child_to_mrow(current_child, OperatorPair{ ch: "\u{00A0}", op: *INVISIBLE_FUNCTION_APPLICATION}); // whitespace -- make part of mrow to keep out of parse |
4349 | 1 | parse_stack.push(top_of_stack); |
4350 | 1 | continue; |
4351 | 6.55k | } |
4352 | // consecutive operands -- add an invisible operator as appropriate | |
4353 | 6.55k | current_op = if likely_function_name == FunctionNameCertainty::True { |
4354 | 753 | OperatorPair{ ch: "\u{2061}", op: *INVISIBLE_FUNCTION_APPLICATION } |
4355 | 5.79k | } else if self.is_mixed_fraction(previous_child, &children[i_child..]) |
4356 | 70 | OperatorPair{ ch: "\u{2064}", op: *IMPLIED_INVISIBLE_PLUS } |
4357 | 5.72k | } else if self.is_implied_comma(previous_child, current_child, mrow) { |
4358 | 81 | OperatorPair{ch: "\u{2063}", op: *IMPLIED_INVISIBLE_COMMA } |
4359 | 5.64k | } else if self.is_implied_chemical_bond(previous_child, current_child) { |
4360 | 419 | OperatorPair{ch: "\u{2063}", op: &IMPLIED_CHEMICAL_BOND } |
4361 | 5.22k | } else if self.is_implied_separator(previous_child, current_child) { |
4362 | 166 | OperatorPair{ch: "\u{2063}", op: &IMPLIED_SEPARATOR_HIGH_PRIORITY } |
4363 | 5.06k | } else if self.is_trig_arg(base_of_previous_child, base_of_child, &mut parse_stack) { |
4364 | 9 | OperatorPair{ch: "\u{2062}", op: &IMPLIED_TIMES_HIGH_PRIORITY } |
4365 | } else { | |
4366 | 5.05k | OperatorPair{ ch: "\u{2062}", op: *IMPLIED_TIMES } |
4367 | }; | |
4368 | 6.55k | if let Some( |
4369 | 262 | && attr_val == "data-was-mo" { |
4370 | 0 | // it really should be an operator |
4371 | 0 | base_of_child.remove_attribute(CHANGED_ATTR); |
4372 | 0 | set_mathml_name(base_of_child, "mo"); |
4373 | 6.55k | } |
4374 | 6.55k | if name(base_of_child) == "mo" { |
4375 | 1 | current_op.ch = as_text(base_of_child); |
4376 | 1 | // debug!(" Found whitespace op '{}'/{}", show_invisible_op_char(current_op.ch), current_op.op.priority); |
4377 | 1 | } else { |
4378 | 6.54k | let implied_mo = create_mo(current_child.document(), current_op.ch, ADDED_ATTR_VALUE); |
4379 | 6.54k | if likely_function_name == FunctionNameCertainty::Maybe { |
4380 | 33 | implied_mo.set_attribute_value("data-function-guess", "true"); |
4381 | 6.51k | } |
4382 | // debug!(" Found implicit op {}/{} [{:?}]", show_invisible_op_char(current_op.ch), current_op.op.priority, likely_function_name); | |
4383 | 6.54k | self.reduce_stack(&mut parse_stack, current_op.op.priority); |
4384 | 6.54k | let shift_result = self.shift_stack(&mut parse_stack, implied_mo, current_op.clone()); |
4385 | // ignore shift_result.0 which is just 'implied_mo' | |
4386 | 6.54k | assert_eq!(implied_mo, shift_result.0); |
4387 | 6.54k | assert!( ptr_eq(current_op.op, shift_result.1.op) ); |
4388 | 6.54k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4389 | 6.54k | top_of_stack.add_child_to_mrow(implied_mo, current_op); |
4390 | 6.54k | parse_stack.push(top_of_stack); |
4391 | 6.54k | current_op = OperatorPair::new(); |
4392 | } | |
4393 | 11.3k | } |
4394 | 4.88k | } |
4395 | } | |
4396 | ||
4397 | 36.7k | if !ptr_eq(current_op.op, &ILLEGAL_OPERATOR_INFO) { |
4398 | 13.9k | if current_op.op.is_left_fence() || |
4399 | 2.95k | if top(&parse_stack).is_operand { |
4400 | // will end up with duplicate operands -- need to choose operator associated with prev child | |
4401 | // we use the original input here because in this case, we need to look to the right of the ()s to deal with chemical states | |
4402 | 232 | let likely_function_name = self.is_function_name(as_element(children[i_child-1]), Some(&children[i_child..])); |
4403 | 232 | let implied_operator = if likely_function_name== FunctionNameCertainty::True { |
4404 | 98 | OperatorPair{ ch: "\u{2061}", op: *INVISIBLE_FUNCTION_APPLICATION } |
4405 | } else { | |
4406 | 134 | OperatorPair{ ch: "\u{2062}", op: *IMPLIED_TIMES } |
4407 | }; | |
4408 | // debug!(" adding implied {}", if ptr_eq(implied_operator.op,*IMPLIED_TIMES) {"times"} else {"function apply"}); | |
4409 | ||
4410 | 232 | let implied_mo = create_mo(current_child.document(), implied_operator.ch, ADDED_ATTR_VALUE); |
4411 | 232 | if likely_function_name == FunctionNameCertainty::Maybe { |
4412 | 55 | implied_mo.set_attribute_value("data-function-guess", "true"); |
4413 | 177 | } |
4414 | 232 | self.reduce_stack(&mut parse_stack, implied_operator.op.priority); let shift_result = self.shift_stack(&mut parse_stack, implied_mo, implied_operator.clone()); |
4415 | // ignore shift_result.0 which is just 'implied_mo' | |
4416 | 232 | assert_eq!(implied_mo, shift_result.0); |
4417 | 232 | assert!( ptr_eq(implied_operator.op, shift_result.1.op) ); |
4418 | 232 | let mut top_of_stack = parse_stack.pop().unwrap(); |
4419 | 232 | top_of_stack.add_child_to_mrow(implied_mo, implied_operator); |
4420 | 232 | parse_stack.push(top_of_stack); |
4421 | 2.72k | } |
4422 | // starting a new mrow | |
4423 | 2.95k | parse_stack.push( StackInfo::new(current_child.document()) ); |
4424 | } else { | |
4425 | // One of infix, postfix, or right fence -- all should have a left operand | |
4426 | // pop the stack if it is lower precedence (it forms an mrow) | |
4427 | ||
4428 | // hack to get linear mixed fractions to parse correctly | |
4429 | 10.9k | if current_op.ch == "/" && |
4430 | 2 | current_op.op = &IMPLIED_PLUS_SLASH_HIGH_PRIORITY; |
4431 | 10.9k | } |
4432 | 10.9k | self.reduce_stack(&mut parse_stack, current_op.op.priority); |
4433 | // push new operator on stack (already handled n-ary case) | |
4434 | 10.9k | let shift_result = self.shift_stack(&mut parse_stack, current_child, current_op); |
4435 | 10.9k | current_child = shift_result.0; |
4436 | 10.9k | current_op = shift_result.1; |
4437 | } | |
4438 | 22.7k | } |
4439 | 36.7k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4440 | 36.7k | top_of_stack.add_child_to_mrow(current_child, current_op); |
4441 | 36.7k | parse_stack.push(top_of_stack); |
4442 | } | |
4443 | ||
4444 | // Reached the end -- force reduction of what's left on the stack | |
4445 | 7.48k | self.reduce_stack(&mut parse_stack, LEFT_FENCEPOST.priority); |
4446 | ||
4447 | // We essentially have 'terminator( mrow terminator)' | |
4448 | // in other words, we have an extra mrow with one child due to the initial start -- remove it | |
4449 | 7.48k | let mut top_of_stack = parse_stack.pop().unwrap(); |
4450 | 7.48k | assert_eq!(parse_stack.len(), 0); |
4451 | ||
4452 | 7.48k | let mut parsed_mrow = top_of_stack.mrow; |
4453 | 7.48k | assert_eq!( name(top_of_stack.mrow), "mrow"); |
4454 | 7.48k | if parsed_mrow.children().len() == 1 && is_ok_to_merge_child { |
4455 | 7.46k | parsed_mrow = top_of_stack.remove_last_operand_from_mrow(); |
4456 | 7.46k | // was synthesized, but is really the original top level mrow |
4457 | 7.46k |
|
4458 | ||
4459 | 7.48k | parsed_mrow.remove_attribute(CHANGED_ATTR); |
4460 | 7.48k | return Ok( add_attrs(parsed_mrow, &saved_mrow_attrs) ); |
4461 | 7.48k | } |
4462 | } | |
4463 | ||
4464 | // ---------------- useful utility functions -------------------- | |
4465 | 102k | fn top<'s, 'a:'s, 'op:'a>(vec: &'s[StackInfo<'a, 'op>]) -> &'s StackInfo<'a, 'op> { |
4466 | 102k | return &vec[vec.len()-1]; |
4467 | 102k | } |
4468 | // Replace the attrs of 'mathml' with 'attrs' and keep the global attrs of 'mathml' (i.e, lift 'attrs' to 'mathml' for replacing children) | |
4469 | 10.0k | pub fn add_attrs<'a>(mathml: Element<'a>, attrs: &[Attribute]) -> Element<'a> { |
4470 | static GLOBAL_ATTRS: phf::Set<&str> = phf_set! { | |
4471 | "class", "dir", "displaystyle", "id", "mathbackground", "mathcolor", "mathsize", | |
4472 | "mathvariant", "nonce", "scriptlevel", "style", "tabindex", | |
4473 | "intent", "arg", | |
4474 | }; | |
4475 | ||
4476 | // debug!( "Adding back {} attr(s) to {}", attrs.len(), name(mathml)); | |
4477 | // remove non-global attrs | |
4478 | 10.0k | for |
4479 | 740 | let attr_name = attr.name().local_part(); |
4480 | 740 | if !( attr_name.starts_with("data-") || |
4481 | 278 | attr_name.starts_with("on") ) { // allows too much - cheapo way to allow event handlers like "onchange" |
4482 | 278 | mathml.remove_attribute(attr.name()); |
4483 | 462 | } |
4484 | } | |
4485 | ||
4486 | // add in 'attrs' | |
4487 | 10.0k | for |
4488 | 5.22k | mathml.set_attribute_value(attr.name(), attr.value()); |
4489 | 5.22k | } |
4490 | 10.0k | return mathml; |
4491 | 10.0k | } |
4492 | ||
4493 | ||
4494 | 2.91M | pub fn name(node: Element<'_>) -> &str { |
4495 | 2.91M | return node.name().local_part(); |
4496 | 2.91M | } |
4497 | ||
4498 | /// The child of a non-leaf element must be an element | |
4499 | // Note: can't use references as that results in 'returning use of local variable' | |
4500 | 1.14M | pub fn as_element(child: ChildOfElement) -> Element { |
4501 | 1.14M | return match child { |
4502 | 1.14M | ChildOfElement::Element(e) => e, |
4503 | _ => { | |
4504 | 0 | panic!("as_element: internal error -- found non-element child (text? '{:?}')", child.text()); |
4505 | }, | |
4506 | }; | |
4507 | 1.14M | } |
4508 | ||
4509 | /// The child of a leaf element must be text (previously trimmed) | |
4510 | /// Note: trim() combines all the Text children into a single string | |
4511 | 603k | pub fn as_text(leaf_child: Element<'_>) -> &str { |
4512 | 603k | assert!(is_leaf(leaf_child)); |
4513 | 603k | let children = leaf_child.children(); |
4514 | 603k | if children.is_empty() { |
4515 | 401 | return ""; |
4516 | 602k | } |
4517 | 602k | assert!(children.len() == 1); |
4518 | 602k | return match children[0] { |
4519 | 602k | ChildOfElement::Text(t) => t.text(), |
4520 | 0 | _ => panic!("as_text: internal error -- found non-text child of leaf element"), |
4521 | } | |
4522 | 603k | } |
4523 | ||
4524 | /// Returns the parent of the argument. | |
4525 | /// Warning: this assumes the parent exists | |
4526 | 239k | pub fn get_parent(mathml: Element) -> Element { |
4527 | 239k | return mathml.parent().unwrap().element().unwrap(); |
4528 | 239k | } |
4529 | ||
4530 | #[allow(dead_code)] // for debugging | |
4531 | 0 | pub fn element_summary(mathml: Element) -> String { |
4532 | 0 | return format!("{}<{}>", name(mathml), |
4533 | 0 | if is_leaf(mathml) {show_invisible_op_char(as_text(mathml)).to_string()} |
4534 | else | |
4535 | 0 | {mathml.children().len().to_string()}); |
4536 | 0 | } |
4537 | ||
4538 | 6.86k | fn create_mo<'a, 'd:'a>(doc: Document<'d>, ch: &'a str, attr_value: &str) -> Element<'d> { |
4539 | 6.86k | let implied_mo = create_mathml_element(&doc, "mo"); |
4540 | 6.86k | implied_mo.set_attribute_value(CHANGED_ATTR, attr_value); |
4541 | 6.86k | let mo_text = doc.create_text(ch); |
4542 | 6.86k | implied_mo.append_child(mo_text); |
4543 | 6.86k | return implied_mo; |
4544 | 6.86k | } |
4545 | ||
4546 | /// return 'node' or if it is adorned, return its base (recursive) | |
4547 | 130k | pub fn get_possible_embellished_node(node: Element) -> Element { |
4548 | 130k | let mut node = node; |
4549 | 138k | while IsNode::is_modified(node) { |
4550 | 8.33k | node = as_element(node.children()[0]); |
4551 | 8.33k | } |
4552 | 130k | return node; |
4553 | 130k | } |
4554 | ||
4555 | #[allow(dead_code)] // for debugging with println | |
4556 | 0 | fn show_invisible_op_char(ch: &str) -> &str { |
4557 | 0 | return match ch.chars().next().unwrap() { |
4558 | 0 | '\u{2061}' => "⁡", |
4559 | 0 | '\u{2062}' => "⁢", |
4560 | 0 | '\u{2063}' => "⁣", |
4561 | 0 | '\u{2064}' => "⁤", |
4562 | 0 | '\u{E000}' => "", |
4563 | 0 | _ => ch |
4564 | }; | |
4565 | 0 | } |
4566 | ||
4567 | ||
4568 | #[cfg(test)] | |
4569 | mod canonicalize_tests { | |
4570 | use crate::errors::Result; | |
4571 | use crate::{are_strs_canonically_equal_result, are_strs_canonically_equal_with_locale}; | |
4572 | ||
4573 | #[allow(unused_imports)] | |
4574 | use super::super::init_logger; | |
4575 | use super::super::abs_rules_dir_path; | |
4576 | use super::*; | |
4577 | use sxd_document::parser; | |
4578 | ||
4579 | ||
4580 | #[test] | |
4581 | 1 | fn canonical_same() -> Result<()> { |
4582 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
4583 | 1 | are_strs_canonically_equal_result(target_str, target_str, &[]) |
4584 | 1 | } |
4585 | ||
4586 | #[test] | |
4587 | 1 | fn plane1_common() -> Result<()> { |
4588 | 1 | let test_str = "<math> |
4589 | 1 | <mi mathvariant='normal'>sin</mi> <mo>,</mo> <!-- shouldn't change --> |
4590 | 1 | <mi mathvariant='italic'>bB4</mi> <mo>,</mo> <!-- shouldn't change --> |
4591 | 1 | <mi mathvariant='bold'>a</mi> <mo>,</mo> <!-- single char id tests --> |
4592 | 1 | <mi mathvariant='bold'>Z</mi> <mo>,</mo> |
4593 | 1 | <mn mathvariant='bold'>19=𝟗</mn> <mo>,</mo> <!-- '=' and plane1 shouldn't change --> |
4594 | 1 | <mn mathvariant='double-struck'>024689</mn> <mo>,</mo> <!-- '=' and plane1 shouldn't change --> |
4595 | 1 | <mi mathvariant='double-struck'>yzCHNPQRZ</mi> <mo>,</mo> |
4596 | 1 | <mi mathvariant='fraktur'>0yACHIRZ</mi> <mo>,</mo> <!-- 0 stays as ASCII --> |
4597 | 1 | <mi mathvariant='bold-fraktur'>nC</mi> <mo>,</mo> |
4598 | 1 | <mi mathvariant='script'>ABEFHILMRegow</mi> <mo>,</mo> |
4599 | 1 | <msup> |
4600 | 1 | <mi mathvariant='bold-script'>fG</mi> |
4601 | 1 | <mo mathvariant='bold-script'>*</mo> <!-- '*' shouldn't change --> |
4602 | 1 | </msup> |
4603 | 1 | </math>"; |
4604 | 1 | let target_str = "<math> |
4605 | 1 | <mrow data-changed='added'> |
4606 | 1 | <mi mathvariant='normal'>sin</mi> |
4607 | 1 | <mo >,</mo> |
4608 | 1 | <mi mathvariant='italic'>bB4</mi> |
4609 | 1 | <mo>,</mo> |
4610 | 1 | <mi mathvariant='bold'>𝐚</mi> |
4611 | 1 | <mo>,</mo> |
4612 | 1 | <mi mathvariant='bold'>𝐙</mi> |
4613 | 1 | <mo>,</mo> |
4614 | 1 | <mn mathvariant='bold'>𝟏𝟗=𝟗</mn> |
4615 | 1 | <mo>,</mo> |
4616 | 1 | <mn mathvariant='double-struck'>𝟘𝟚𝟜𝟞𝟠𝟡</mn> |
4617 | 1 | <mo>,</mo> |
4618 | 1 | <mi mathvariant='double-struck'>𝕪𝕫ℂℍℕℙℚℝℤ</mi> |
4619 | 1 | <mo>,</mo> |
4620 | 1 | <mi mathvariant='fraktur'>0𝔶𝔄ℭℌℑℜℨ</mi> |
4621 | 1 | <mo>,</mo> |
4622 | 1 | <mi mathvariant='bold-fraktur'>𝖓𝕮</mi> |
4623 | 1 | <mo>,</mo> |
4624 | 1 | <mi mathvariant='script'>𝒜ℬℰℱℋℐℒℳℛℯℊℴ𝓌</mi> |
4625 | 1 | <mo>,</mo> |
4626 | 1 | <msup> |
4627 | 1 | <mi mathvariant='bold-script'>𝓯𝓖</mi> |
4628 | 1 | <mo mathvariant='bold-script'>*</mo> <!-- '*' shouldn't change --> |
4629 | 1 | </msup> |
4630 | 1 | </mrow> |
4631 | 1 | </math>"; |
4632 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4633 | 1 | } |
4634 | ||
4635 | #[test] | |
4636 | 1 | fn plane1_font_styles() -> Result<()> { |
4637 | 1 | let test_str = "<math> |
4638 | 1 | <mi mathvariant='sans-serif'>aA09=</mi> <mo>,</mo> <!-- '=' shouldn't change --> |
4639 | 1 | <mi mathvariant='bold-sans-serif'>zZ09</mi> <mo>,</mo> |
4640 | 1 | <mi mathvariant='sans-serif-italic'>azAZ09</mi> <mo>,</mo> <!-- italic digits don't exist: revert to sans-serif --> |
4641 | 1 | <mi mathvariant='sans-serif-bold-italic'>AZaz09</mi> <mo>,</mo> <!-- italic digits don't exist: revert to just bold --> |
4642 | 1 | <mi mathvariant='monospace'>aA09</mi> |
4643 | 1 | </math>"; |
4644 | 1 | let target_str = "<math> |
4645 | 1 | <mrow data-changed='added'> |
4646 | 1 | <mi mathvariant='sans-serif'>𝖺𝖠𝟢𝟫=</mi> |
4647 | 1 | <mo>,</mo> |
4648 | 1 | <mi mathvariant='bold-sans-serif'>𝘇𝗭𝟬𝟵</mi> |
4649 | 1 | <mo>,</mo> |
4650 | 1 | <mi mathvariant='sans-serif-italic'>𝘢𝘻𝘈𝘡𝟢𝟫</mi> |
4651 | 1 | <mo>,</mo> |
4652 | 1 | <mi mathvariant='sans-serif-bold-italic'>𝘼𝙕𝙖𝙯𝟬𝟵</mi> |
4653 | 1 | <mo>,</mo> |
4654 | 1 | <mi mathvariant='monospace'>𝚊𝙰𝟶𝟿</mi> |
4655 | 1 | </mrow> |
4656 | 1 | </math>"; |
4657 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4658 | 1 | } |
4659 | ||
4660 | #[test] | |
4661 | 1 | fn plane1_greek() -> Result<()> { |
4662 | 1 | let test_str = "<math> |
4663 | 1 | <mi mathvariant='normal'>ΑΩαω∇∂ϵ=</mi> <mo>,</mo> <!-- shouldn't change --> |
4664 | 1 | <mi mathvariant='italic'>ϴΑΩαω∇∂ϵ</mi> <mo>,</mo> |
4665 | 1 | <mi mathvariant='bold'>ΑΩαωϝϜ</mi> <mo>,</mo> |
4666 | 1 | <mi mathvariant='double-struck'>Σβ∇</mi> <mo>,</mo> <!-- shouldn't change --> |
4667 | 1 | <mi mathvariant='fraktur'>ΞΦλϱ</mi> <mo>,</mo> <!-- shouldn't change --> |
4668 | 1 | <mi mathvariant='bold-fraktur'>ψΓ</mi> <mo>,</mo> <!-- map to bold --> |
4669 | 1 | <mi mathvariant='script'>μΨ</mi> <mo>,</mo> <!-- shouldn't change --> |
4670 | 1 | <mi mathvariant='bold-script'>Σπ</mi> <!-- map to bold --> |
4671 | 1 | </math>"; |
4672 | 1 | let target_str = "<math> |
4673 | 1 | <mrow data-changed='added'> |
4674 | 1 | <mi mathvariant='normal'>ΑΩαω∇∂ϵ=</mi> |
4675 | 1 | <mo>,</mo> |
4676 | 1 | <mi mathvariant='italic'>𝛳𝛢𝛺𝛼𝜔𝛻𝜕𝜖</mi> |
4677 | 1 | <mo>,</mo> |
4678 | 1 | <mi mathvariant='bold'>𝚨𝛀𝛂𝛚𝟋𝟊</mi> |
4679 | 1 | <mo>,</mo> |
4680 | 1 | <mi mathvariant='double-struck'>Σβ∇</mi> |
4681 | 1 | <mo>,</mo> |
4682 | 1 | <mi mathvariant='fraktur'>ΞΦλϱ</mi> |
4683 | 1 | <mo>,</mo> |
4684 | 1 | <mi mathvariant='bold-fraktur'>𝛙𝚪</mi> |
4685 | 1 | <mo>,</mo> |
4686 | 1 | <mi mathvariant='script'>μΨ</mi> |
4687 | 1 | <mo>,</mo> |
4688 | 1 | <mi mathvariant='bold-script'>𝚺𝛑</mi> |
4689 | 1 | </mrow> |
4690 | 1 | </math>"; |
4691 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4692 | 1 | } |
4693 | ||
4694 | #[test] | |
4695 | 1 | fn plane1_greek_font_styles() -> Result<()> { |
4696 | 1 | let test_str = "<math> |
4697 | 1 | <mi mathvariant='sans-serif'>ΑΩαω∇∂ϵ=</mi> <mo>,</mo> <!-- '=' shouldn't change --> |
4698 | 1 | <mi mathvariant='bold-sans-serif'>ϴ0ΑΩαω∇∂ϵ</mi> <mo>,</mo> |
4699 | 1 | <mi mathvariant='sans-serif-italic'>aΑΩαω∇∂ϵ</mi> <mo>,</mo> <!-- italic digits don't exist: revert to sans-serif --> |
4700 | 1 | <mi mathvariant='sans-serif-bold-italic'>ZΑΩαωϰϕϱϖ</mi> <mo>,</mo> <!-- italic digits don't exist: revert to just bold --> |
4701 | 1 | <mi mathvariant='monospace'>zΑΩαω∇∂</mi> |
4702 | 1 | </math>"; |
4703 | 1 | let target_str = "<math> |
4704 | 1 | <mrow data-changed='added'> |
4705 | 1 | <mi mathvariant='sans-serif'>ΑΩαω∇∂ϵ=</mi> |
4706 | 1 | <mo>,</mo> |
4707 | 1 | <mi mathvariant='bold-sans-serif'>𝝧𝟬𝝖𝝮𝝰𝞈𝝯𝞉𝞊</mi> |
4708 | 1 | <mo>,</mo> |
4709 | 1 | <mi mathvariant='sans-serif-italic'>𝘢ΑΩαω∇∂ϵ</mi> |
4710 | 1 | <mo>,</mo> |
4711 | 1 | <mi mathvariant='sans-serif-bold-italic'>𝙕𝞐𝞨𝞪𝟂𝟆𝟇𝟈𝟉</mi> |
4712 | 1 | <mo>,</mo> |
4713 | 1 | <mi mathvariant='monospace'>𝚣ΑΩαω∇∂</mi> |
4714 | 1 | </mrow> |
4715 | 1 | </math>"; |
4716 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4717 | 1 | } |
4718 | ||
4719 | #[test] | |
4720 | 1 | fn short_and_long_dash() -> Result<()> { |
4721 | 1 | let test_str = "<math><mi>x</mi> <mo>=</mo> <mi>--</mi><mo>+</mo><mtext>----</mtext></math>"; |
4722 | 1 | let target_str = "<math> |
4723 | 1 | <mrow data-changed='added'> |
4724 | 1 | <mi>x</mi> |
4725 | 1 | <mo>=</mo> |
4726 | 1 | <mrow data-changed='added'> |
4727 | 1 | <mi>—</mi> |
4728 | 1 | <mo>+</mo> |
4729 | 1 | <mtext>―</mtext> |
4730 | 1 | </mrow> |
4731 | 1 | </mrow> |
4732 | 1 | </math>"; |
4733 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4734 | 1 | } |
4735 | ||
4736 | #[test] | |
4737 | 1 | fn illegal_mathml_element() { |
4738 | use crate::interface::*; | |
4739 | 1 | let test_str = "<math><foo><mi>f</mi></foo></math>"; |
4740 | 1 | let package1 = &parser::parse(test_str).expect("Failed to parse test input"); |
4741 | 1 | let mathml = get_element(package1); |
4742 | 1 | trim_element(mathml, false); |
4743 | 1 | assert!(canonicalize(mathml).is_err()); |
4744 | 1 | } |
4745 | ||
4746 | #[test] | |
4747 | 1 | fn illegal_mtd_element() { |
4748 | use crate::interface::*; | |
4749 | 1 | let test_str = "<math> |
4750 | 1 | <mtable> |
4751 | 1 | <mtr> |
4752 | 1 | <mtd> |
4753 | 1 | <mtext></mtext> |
4754 | 1 | </mtd> |
4755 | 1 | <mrow> |
4756 | 1 | <mi>E</mi> |
4757 | 1 | <mo>=</mo> |
4758 | 1 | <mrow> |
4759 | 1 | <mtd> |
4760 | 1 | <mi>m</mi> |
4761 | 1 | <mo><!--INVISIBLE TIMES--></mo> |
4762 | 1 | <msup> |
4763 | 1 | <mi>c</mi> |
4764 | 1 | <mn>2</mn> |
4765 | 1 | </msup> |
4766 | 1 | </mtd></mrow> |
4767 | 1 | </mrow> |
4768 | 1 | |
4769 | 1 | </mtr> |
4770 | 1 | </mtable> |
4771 | 1 | </math>"; |
4772 | 1 | let package1 = &parser::parse(test_str).expect("Failed to parse test input"); |
4773 | 1 | let mathml = get_element(package1); |
4774 | 1 | trim_element(mathml, false); |
4775 | 1 | assert!(canonicalize(mathml).is_err()); |
4776 | 1 | } |
4777 | ||
4778 | ||
4779 | #[test] | |
4780 | 1 | fn a_to_mrow() -> Result<()> { |
4781 | 1 | let test_str = "<math> |
4782 | 1 | <a href='https://www.example.com'> |
4783 | 1 | <mo>(</mo> |
4784 | 1 | <a href='#its_relative'> |
4785 | 1 | <mi>x</mi> |
4786 | 1 | <mo>,</mo> |
4787 | 1 | <mi>y</mi> |
4788 | 1 | </a> |
4789 | 1 | <mo>)</mo> |
4790 | 1 | </a> |
4791 | 1 | </math> |
4792 | 1 | "; |
4793 | 1 | let target_str = " <math> |
4794 | 1 | <mrow href='https://www.example.com'> |
4795 | 1 | <mo>(</mo> |
4796 | 1 | <mrow href='#its_relative'> |
4797 | 1 | <mi>x</mi> |
4798 | 1 | <mo>,</mo> |
4799 | 1 | <mi>y</mi> |
4800 | 1 | </mrow> |
4801 | 1 | <mo>)</mo> |
4802 | 1 | </mrow> |
4803 | 1 | </math>"; |
4804 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4805 | 1 | } |
4806 | ||
4807 | #[test] | |
4808 | 1 | fn mfenced_no_children() -> Result<()> { |
4809 | 1 | let test_str = "<math><mi>f</mi><mfenced><mrow/></mfenced></math>"; |
4810 | 1 | let target_str = "<math> |
4811 | 1 | <mrow data-changed='added'> |
4812 | 1 | <mi>f</mi> |
4813 | 1 | <mo data-changed='added'>⁡</mo> |
4814 | 1 | <mrow> |
4815 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4816 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4817 | 1 | </mrow> |
4818 | 1 | </mrow> |
4819 | 1 | </math>"; |
4820 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4821 | 1 | } |
4822 | ||
4823 | #[test] | |
4824 | 1 | fn mfenced_one_child() -> Result<()> { |
4825 | 1 | let test_str = "<math><mi>f</mi><mfenced open='[' close=']'><mi>x</mi></mfenced></math>"; |
4826 | 1 | let target_str = " <math> |
4827 | 1 | <mrow data-changed='added'> |
4828 | 1 | <mi>f</mi> |
4829 | 1 | <mo data-changed='added'>⁡</mo> |
4830 | 1 | <mrow> |
4831 | 1 | <mo data-changed='from_mfenced'>[</mo> |
4832 | 1 | <mi>x</mi> |
4833 | 1 | <mo data-changed='from_mfenced'>]</mo> |
4834 | 1 | </mrow> |
4835 | 1 | </mrow> |
4836 | 1 | </math>"; |
4837 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4838 | 1 | } |
4839 | ||
4840 | #[test] | |
4841 | 1 | fn mfenced_no_attrs() -> Result<()> { |
4842 | 1 | let test_str = "<math><mi>f</mi><mfenced><mrow><mi>x</mi><mo>,</mo><mi>y</mi><mo>,</mo><mi>z</mi></mrow></mfenced></math>"; |
4843 | 1 | let target_str = " <math> |
4844 | 1 | <mrow data-changed='added'> |
4845 | 1 | <mi>f</mi> |
4846 | 1 | <mo data-changed='added'>⁡</mo> |
4847 | 1 | <mrow> |
4848 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4849 | 1 | <mrow> |
4850 | 1 | <mi>x</mi> |
4851 | 1 | <mo>,</mo> |
4852 | 1 | <mi>y</mi> |
4853 | 1 | <mo>,</mo> |
4854 | 1 | <mi>z</mi> |
4855 | 1 | </mrow> |
4856 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4857 | 1 | </mrow> |
4858 | 1 | </mrow> |
4859 | 1 | </math>"; |
4860 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4861 | 1 | } |
4862 | ||
4863 | #[test] | |
4864 | 1 | fn mfenced_with_separators() -> Result<()> { |
4865 | 1 | let test_str = "<math><mi>f</mi><mfenced separators=',;'><mi>x</mi><mi>y</mi><mi>z</mi><mi>a</mi></mfenced></math>"; |
4866 | 1 | let target_str = "<math> |
4867 | 1 | <mrow data-changed='added'> |
4868 | 1 | <mi>f</mi> |
4869 | 1 | <mo data-changed='added'>⁡</mo> |
4870 | 1 | <mrow> |
4871 | 1 | <mo data-changed='from_mfenced'>(</mo> |
4872 | 1 | <mrow data-changed='added'> |
4873 | 1 | <mrow data-changed='added'> |
4874 | 1 | <mi>x</mi> |
4875 | 1 | <mo data-changed='from_mfenced'>,</mo> |
4876 | 1 | <mi>y</mi> |
4877 | 1 | </mrow> |
4878 | 1 | <mo data-changed='from_mfenced'>;</mo> |
4879 | 1 | <mrow data-changed='added'> |
4880 | 1 | <mi>z</mi> |
4881 | 1 | <mo data-changed='from_mfenced'>,</mo> |
4882 | 1 | <mi>a</mi> |
4883 | 1 | </mrow> |
4884 | 1 | </mrow> |
4885 | 1 | <mo data-changed='from_mfenced'>)</mo> |
4886 | 1 | </mrow> |
4887 | 1 | </mrow> |
4888 | 1 | </math>"; |
4889 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4890 | 1 | } |
4891 | ||
4892 | #[test] | |
4893 | 1 | fn canonical_one_element_mrow_around_mrow() -> Result<()> { |
4894 | 1 | let test_str = "<math><mrow><mrow><mo>-</mo><mi>a</mi></mrow></mrow></math>"; |
4895 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
4896 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4897 | 1 | } |
4898 | ||
4899 | #[test] | |
4900 | 1 | fn canonical_mtext_in_mtd_477() -> Result<()> { |
4901 | // make sure mtext doesn't go away | |
4902 | 1 | let test_str = r#"<math> |
4903 | 1 | <mtable> |
4904 | 1 | <mtr> |
4905 | 1 | <mtd> |
4906 | 1 | <mstyle scriptlevel="0"> |
4907 | 1 | <mspace width="2em"/> |
4908 | 1 | </mstyle> |
4909 | 1 | <mstyle scriptlevel="0"> |
4910 | 1 | <mspace width="1em"/> |
4911 | 1 | </mstyle> |
4912 | 1 | </mtd> |
4913 | 1 | </mtr> |
4914 | 1 | </mtable> |
4915 | 1 | </math>"#; |
4916 | 1 | let target_str = r#" <math> |
4917 | 1 | <mtable> |
4918 | 1 | <mtr> |
4919 | 1 | <mtd> |
4920 | 1 | <mtext data-width='1' data-following-space-width='4' scriptlevel='0' data-changed='added'> </mtext> |
4921 | 1 | </mtd> |
4922 | 1 | </mtr> |
4923 | 1 | </mtable> |
4924 | 1 | </math>"#; |
4925 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4926 | 1 | } |
4927 | ||
4928 | #[test] | |
4929 | 1 | fn canonical_mtext_in_mtr() -> Result<()> { |
4930 | // make sure mtext doesn't go away | |
4931 | 1 | let test_str = "<math> <mtable> <mtr> <mtext> </mtext> </mtr> <mtr> <mtext> </mtext> </mtr> </mtable> </math>"; |
4932 | 1 | let target_str = " <math> |
4933 | 1 | <mtable> |
4934 | 1 | <mtr> |
4935 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4936 | 1 | </mtr> |
4937 | 1 | <mtr> |
4938 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4939 | 1 | </mtr> |
4940 | 1 | </mtable> |
4941 | 1 | </math>"; |
4942 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4943 | 1 | } |
4944 | ||
4945 | #[test] | |
4946 | 1 | fn canonical_mtext_in_mtable() -> Result<()> { |
4947 | // make sure mtext doesn't go away | |
4948 | 1 | let test_str = r"<math> <mtable> <mtr> <mtd> <mi>L</mi> </mtd> <mtd> <mrow> <mi><mi/></mi> <mo>=</mo> |
4949 | 1 | <mrow> <mo>[</mo> <mtable> <mtext> </mtext> </mtable> <mo>]</mo> </mrow> </mrow> </mtd> </mtr> </mtable> </math>"; |
4950 | 1 | let target_str = r"<math> |
4951 | 1 | <mtable> |
4952 | 1 | <mtr> |
4953 | 1 | <mtd> |
4954 | 1 | <mi>L</mi> |
4955 | 1 | </mtd> |
4956 | 1 | <mtd> |
4957 | 1 | <mrow> |
4958 | 1 | <mi><mi/></mi> |
4959 | 1 | <mo>=</mo> |
4960 | 1 | <mrow> |
4961 | 1 | <mo>[</mo> |
4962 | 1 | <mtable> |
4963 | 1 | <mtext data-changed='empty_content' data-width='0' data-empty-in-2D='true'> </mtext> |
4964 | 1 | </mtable> |
4965 | 1 | <mo>]</mo> |
4966 | 1 | </mrow> |
4967 | 1 | </mrow> |
4968 | 1 | </mtd> |
4969 | 1 | </mtr> |
4970 | 1 | </mtable> |
4971 | 1 | </math>"; |
4972 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
4973 | 1 | } |
4974 | ||
4975 | #[test] | |
4976 | 1 | fn mrow_with_intent_and_single_child() -> Result<()> { |
4977 | use crate::interface::*; | |
4978 | use sxd_document::parser; | |
4979 | use crate::canonicalize::canonicalize; | |
4980 | // this forces initialization | |
4981 | 1 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
4982 | 1 | crate::speech::SPEECH_RULES.with(|_| true); |
4983 | ||
4984 | // we don't want to remove the mrow because the intent on the mi would reference itself | |
4985 | 1 | let test = "<math><mrow intent='log($x)'><mi arg='x'>X</mi></mrow></math>"; |
4986 | ||
4987 | 1 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
4988 | 1 | let mathml = get_element(package1); |
4989 | 1 | trim_element(mathml, false); |
4990 | 1 | let mathml_test = canonicalize(mathml).unwrap(); |
4991 | 1 | let first_child = as_element( mathml_test.children()[0] ); |
4992 | 1 | assert_eq!(name(first_child), "mrow"); |
4993 | 1 | assert_eq!(first_child.children().len(), 1); |
4994 | 1 | let mi = as_element(first_child.children()[0]); |
4995 | 1 | assert_eq!(name(mi), "mi"); |
4996 | 1 | Ok(()) |
4997 | 1 | } |
4998 | ||
4999 | #[test] | |
5000 | 1 | fn empty_mrow_with_intent() -> Result<()> { |
5001 | // we don't want to remove the mrow because the intent on the mi would reference itself | |
5002 | use crate::interface::*; | |
5003 | use sxd_document::parser; | |
5004 | use crate::canonicalize::canonicalize; | |
5005 | // this forces initialization | |
5006 | 1 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
5007 | 1 | crate::speech::SPEECH_RULES.with(|_| true); |
5008 | ||
5009 | // we don't want to remove the mrow because the intent needs to stick around | |
5010 | 1 | let test = "<math><mrow intent='log(x)'/></math>"; |
5011 | ||
5012 | 1 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
5013 | 1 | let mathml = get_element(package1); |
5014 | 1 | trim_element(mathml, false); |
5015 | 1 | let mathml_test = canonicalize(mathml).unwrap(); |
5016 | 1 | let first_child = as_element( mathml_test.children()[0] ); |
5017 | 1 | assert_eq!(name(first_child), "mrow"); |
5018 | 1 | assert_eq!(first_child.children().len(), 1); |
5019 | 1 | let mtext = as_element(first_child.children()[0]); |
5020 | 1 | assert_eq!(name(mtext), "mtext"); |
5021 | 1 | Ok(()) |
5022 | 1 | } |
5023 | ||
5024 | #[test] | |
5025 | 1 | fn mn_with_negative_sign() -> Result<()> { |
5026 | 1 | let test_str = "<math><mfrac> |
5027 | 1 | <mrow><mn>-1</mn></mrow> |
5028 | 1 | <mn>−987</mn> |
5029 | 1 | </mfrac></math>"; |
5030 | 1 | let target_str = "<math><mfrac> |
5031 | 1 | <mrow data-changed='added'><mo>-</mo><mn>1</mn></mrow> |
5032 | 1 | <mrow data-changed='added'><mo>-</mo><mn>987</mn></mrow> |
5033 | 1 | </mfrac></math>"; |
5034 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5035 | 1 | } |
5036 | ||
5037 | #[test] | |
5038 | 1 | fn mn_with_degree_sign() -> Result<()> { |
5039 | 1 | let test_str = "<math> <mrow> <mi>cos</mi> <mo></mo> <mrow> <mo>(</mo> <mn>150°</mn> <mo>)</mo> </mrow> </mrow> </math>"; |
5040 | 1 | let target_str = "<math> |
5041 | 1 | <mrow> |
5042 | 1 | <mi>cos</mi> <mo>⁡</mo> |
5043 | 1 | <mrow> |
5044 | 1 | <mo>(</mo> |
5045 | 1 | <msup data-changed='added'> <mn>150</mn> <mo>°</mo> </msup> |
5046 | 1 | <mo>)</mo> |
5047 | 1 | </mrow> |
5048 | 1 | </mrow> |
5049 | 1 | </math>"; |
5050 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5051 | 1 | } |
5052 | ||
5053 | #[test] | |
5054 | 1 | fn canonical_one_element_mrow_around_mo() -> Result<()> { |
5055 | 1 | let test_str = "<math><mrow><mrow><mo>-</mo></mrow><mi>a</mi></mrow></math>"; |
5056 | 1 | let target_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
5057 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5058 | 1 | } |
5059 | ||
5060 | #[test] | |
5061 | 1 | fn canonical_flat_to_times_and_plus() -> Result<()> { |
5062 | 1 | let test_str = "<math><mi>c</mi><mo>+</mo><mi>x</mi><mi>y</mi></math>"; |
5063 | 1 | let target_str = "<math> |
5064 | 1 | <mrow data-changed='added'><mi>c</mi><mo>+</mo> |
5065 | 1 | <mrow data-changed='added'><mi>x</mi><mo data-changed='added'>⁢</mo><mi>y</mi></mrow> |
5066 | 1 | </mrow></math>"; |
5067 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5068 | 1 | } |
5069 | ||
5070 | #[test] | |
5071 | 1 | fn canonical_prefix_and_infix() -> Result<()> { |
5072 | 1 | let test_str = "<math><mrow><mo>-</mo><mi>a</mi><mo>-</mo><mi>b</mi></mrow></math>"; |
5073 | 1 | let target_str = "<math> |
5074 | 1 | <mrow> |
5075 | 1 | <mrow data-changed='added'> |
5076 | 1 | <mo>-</mo> |
5077 | 1 | <mi>a</mi> |
5078 | 1 | </mrow> |
5079 | 1 | <mo>-</mo> |
5080 | 1 | <mi>b</mi> |
5081 | 1 | </mrow> |
5082 | 1 | </math>"; |
5083 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5084 | 1 | } |
5085 | ||
5086 | ||
5087 | #[test] | |
5088 | 1 | fn canonical_prefix_implied_times_prefix() -> Result<()> { |
5089 | 1 | let test_str = "<math><mrow><mo>∂</mo><mi>x</mi><mo>∂</mo><mi>y</mi></mrow></math>"; |
5090 | 1 | let target_str = "<math> |
5091 | 1 | <mrow> |
5092 | 1 | <mrow data-changed='added'><mo>∂</mo><mi>x</mi></mrow> |
5093 | 1 | <mo data-changed='added'>⁢</mo> |
5094 | 1 | <mrow data-changed='added'><mo>∂</mo><mi>y</mi></mrow> |
5095 | 1 | </mrow> |
5096 | 1 | </math>"; |
5097 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5098 | 1 | } |
5099 | ||
5100 | #[test] | |
5101 | 1 | fn function_with_single_arg() -> Result<()> { |
5102 | 1 | let test_str = "<math><mrow> |
5103 | 1 | <mi>sin</mi><mo>(</mo><mi>x</mi><mo>)</mo> |
5104 | 1 | <mo>+</mo> |
5105 | 1 | <mi>f</mi><mo>(</mo><mi>x</mi><mo>)</mo> |
5106 | 1 | <mo>+</mo> |
5107 | 1 | <mi>t</mi><mrow><mo>(</mo><mi>x</mi><mo>)</mo></mrow> |
5108 | 1 | </mrow></math>"; |
5109 | 1 | let target_str = "<math> |
5110 | 1 | <mrow> |
5111 | 1 | <mrow data-changed='added'> |
5112 | 1 | <mi>sin</mi> |
5113 | 1 | <mo data-changed='added'>⁡</mo> |
5114 | 1 | <mrow data-changed='added'> |
5115 | 1 | <mo>(</mo> |
5116 | 1 | <mi>x</mi> |
5117 | 1 | <mo>)</mo> |
5118 | 1 | </mrow> |
5119 | 1 | </mrow> |
5120 | 1 | <mo>+</mo> |
5121 | 1 | <mrow data-changed='added'> |
5122 | 1 | <mi>f</mi> |
5123 | 1 | <mo data-changed='added'>⁡</mo> |
5124 | 1 | <mrow data-changed='added'> |
5125 | 1 | <mo>(</mo> |
5126 | 1 | <mi>x</mi> |
5127 | 1 | <mo>)</mo> |
5128 | 1 | </mrow> |
5129 | 1 | </mrow> |
5130 | 1 | <mo>+</mo> |
5131 | 1 | <mrow data-changed='added'> |
5132 | 1 | <mi>t</mi> |
5133 | 1 | <mo data-changed='added'>⁡</mo> |
5134 | 1 | <mrow> |
5135 | 1 | <mo>(</mo> |
5136 | 1 | <mi>x</mi> |
5137 | 1 | <mo>)</mo> |
5138 | 1 | </mrow> |
5139 | 1 | </mrow> |
5140 | 1 | </mrow> |
5141 | 1 | </math>"; |
5142 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5143 | 1 | } |
5144 | ||
5145 | #[test] | |
5146 | 1 | fn maybe_function() -> Result<()> { |
5147 | 1 | let test_str = "<math> |
5148 | 1 | <mrow> |
5149 | 1 | <mi>P</mi> |
5150 | 1 | <mo>(</mo> |
5151 | 1 | <mi>A</mi> |
5152 | 1 | <mo>∩</mo> |
5153 | 1 | <mi>B</mi> |
5154 | 1 | <mo>)</mo> |
5155 | 1 | </mrow> |
5156 | 1 | </math>"; |
5157 | 1 | let target_str = "<math> |
5158 | 1 | <mrow> |
5159 | 1 | <mi>P</mi> |
5160 | 1 | <mo data-function-guess='true' data-changed='added'>⁢</mo> |
5161 | 1 | <mrow data-changed='added'> |
5162 | 1 | <mo>(</mo> |
5163 | 1 | <mrow data-changed='added'> |
5164 | 1 | <mi>A</mi> |
5165 | 1 | <mo>∩</mo> |
5166 | 1 | <mi>B</mi> |
5167 | 1 | </mrow> |
5168 | 1 | <mo>)</mo> |
5169 | 1 | </mrow> |
5170 | 1 | </mrow> |
5171 | 1 | </math>"; |
5172 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5173 | 1 | } |
5174 | ||
5175 | #[test] | |
5176 | 1 | fn function_with_multiple_args() -> Result<()> { |
5177 | 1 | let test_str = "<math> |
5178 | 1 | <mi>sin</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5179 | 1 | <mo>+</mo> |
5180 | 1 | <mi>f</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5181 | 1 | <mo>+</mo> |
5182 | 1 | <mi>t</mi><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo> |
5183 | 1 | <mo>+</mo> |
5184 | 1 | <mi>w</mi><mo>(</mo><mi>x</mi><mo>,</mo><mi>y</mi><mo>)</mo> |
5185 | 1 | </math>"; |
5186 | 1 | let target_str = " <math> |
5187 | 1 | <mrow data-changed='added'> |
5188 | 1 | <mrow data-changed='added'> |
5189 | 1 | <mi>sin</mi> |
5190 | 1 | <mo data-changed='added'>⁡</mo> |
5191 | 1 | <mrow data-changed='added'> |
5192 | 1 | <mo>(</mo> |
5193 | 1 | <mrow data-changed='added'> |
5194 | 1 | <mi>x</mi> |
5195 | 1 | <mo>+</mo> |
5196 | 1 | <mi>y</mi> |
5197 | 1 | </mrow> |
5198 | 1 | <mo>)</mo> |
5199 | 1 | </mrow> |
5200 | 1 | </mrow> |
5201 | 1 | <mo>+</mo> |
5202 | 1 | <mrow data-changed='added'> |
5203 | 1 | <mi>f</mi> |
5204 | 1 | <mo data-changed='added'>⁡</mo> |
5205 | 1 | <mrow data-changed='added'> |
5206 | 1 | <mo>(</mo> |
5207 | 1 | <mrow data-changed='added'> |
5208 | 1 | <mi>x</mi> |
5209 | 1 | <mo>+</mo> |
5210 | 1 | <mi>y</mi> |
5211 | 1 | </mrow> |
5212 | 1 | <mo>)</mo> |
5213 | 1 | </mrow> |
5214 | 1 | </mrow> |
5215 | 1 | <mo>+</mo> |
5216 | 1 | <mrow data-changed='added'> |
5217 | 1 | <mi>t</mi> |
5218 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5219 | 1 | <mrow data-changed='added'> |
5220 | 1 | <mo>(</mo> |
5221 | 1 | <mrow data-changed='added'> |
5222 | 1 | <mi>x</mi> |
5223 | 1 | <mo>+</mo> |
5224 | 1 | <mi>y</mi> |
5225 | 1 | </mrow> |
5226 | 1 | <mo>)</mo> |
5227 | 1 | </mrow> |
5228 | 1 | </mrow> |
5229 | 1 | <mo>+</mo> |
5230 | 1 | <mrow data-changed='added'> |
5231 | 1 | <mi>w</mi> |
5232 | 1 | <mo data-changed='added'>⁡</mo> |
5233 | 1 | <mrow data-changed='added'> |
5234 | 1 | <mo>(</mo> |
5235 | 1 | <mrow data-changed='added'> |
5236 | 1 | <mi>x</mi> |
5237 | 1 | <mo>,</mo> |
5238 | 1 | <mi>y</mi> |
5239 | 1 | </mrow> |
5240 | 1 | <mo>)</mo> |
5241 | 1 | </mrow> |
5242 | 1 | </mrow> |
5243 | 1 | </mrow> |
5244 | 1 | </math>"; |
5245 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5246 | 1 | } |
5247 | ||
5248 | #[test] | |
5249 | 1 | fn function_with_no_args() -> Result<()> { |
5250 | 1 | let test_str = "<math><mrow> |
5251 | 1 | <mi>sin</mi><mi>x</mi> |
5252 | 1 | <mo>+</mo> |
5253 | 1 | <mi>f</mi><mi>x</mi> |
5254 | 1 | <mo>+</mo> |
5255 | 1 | <mi>t</mi><mi>x</mi> |
5256 | 1 | </mrow></math>"; |
5257 | 1 | let target_str = " <math> |
5258 | 1 | <mrow> |
5259 | 1 | <mrow data-changed='added'> |
5260 | 1 | <mi>sin</mi> |
5261 | 1 | <mo data-changed='added'>⁡</mo> |
5262 | 1 | <mi>x</mi> |
5263 | 1 | </mrow> |
5264 | 1 | <mo>+</mo> |
5265 | 1 | <mrow data-changed='added'> |
5266 | 1 | <mi>f</mi> |
5267 | 1 | <mo data-changed='added'>⁢</mo> |
5268 | 1 | <mi>x</mi> |
5269 | 1 | </mrow> |
5270 | 1 | <mo>+</mo> |
5271 | 1 | <mrow data-changed='added'> |
5272 | 1 | <mi>t</mi> |
5273 | 1 | <mo data-changed='added'>⁢</mo> |
5274 | 1 | <mi>x</mi> |
5275 | 1 | </mrow> |
5276 | 1 | </mrow> |
5277 | 1 | </math>"; |
5278 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5279 | ||
5280 | 1 | } |
5281 | ||
5282 | ||
5283 | #[test] | |
5284 | 1 | fn function_call_vs_implied_times() -> Result<()> { |
5285 | 1 | let test_str = "<math><mi>f</mi><mo>(</mo><mi>x</mi><mo>)</mo><mi>y</mi></math>"; |
5286 | 1 | let target_str = "<math> |
5287 | 1 | <mrow data-changed='added'> |
5288 | 1 | <mrow data-changed='added'> |
5289 | 1 | <mi>f</mi> |
5290 | 1 | <mo data-changed='added'>⁡</mo> |
5291 | 1 | <mrow data-changed='added'> <mo>(</mo> <mi>x</mi> <mo>)</mo> </mrow> |
5292 | 1 | </mrow> |
5293 | 1 | <mo data-changed='added'>⁢</mo> |
5294 | 1 | <mi>y</mi> </mrow> |
5295 | 1 | </math>"; |
5296 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5297 | 1 | } |
5298 | ||
5299 | #[test] | |
5300 | 1 | fn implied_plus() -> Result<()> { |
5301 | 1 | let test_str = "<math><mrow> |
5302 | 1 | <mn>2</mn><mfrac><mn>3</mn><mn>4</mn></mfrac> |
5303 | 1 | </mrow></math>"; |
5304 | 1 | let target_str = "<math> |
5305 | 1 | <mrow> |
5306 | 1 | <mn>2</mn> |
5307 | 1 | <mo data-changed='added'>⁤</mo> |
5308 | 1 | <mfrac> |
5309 | 1 | <mn>3</mn> |
5310 | 1 | <mn>4</mn> |
5311 | 1 | </mfrac> |
5312 | 1 | </mrow> |
5313 | 1 | </math>"; |
5314 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5315 | 1 | } |
5316 | ||
5317 | #[test] | |
5318 | 1 | fn implied_plus_linear() -> Result<()> { |
5319 | 1 | let test_str = "<math><mrow> |
5320 | 1 | <mn>2</mn><mspace width='0.278em'></mspace><mn>3</mn><mo>/</mo><mn>4</mn> |
5321 | 1 | </mrow></math>"; |
5322 | 1 | let target_str = "<math> |
5323 | 1 | <mrow> |
5324 | 1 | <mn>2</mn> |
5325 | 1 | <mo data-changed='added'>⁤</mo> |
5326 | 1 | <mrow data-changed='added'>> |
5327 | 1 | <mn data-previous-space-width='0.278'>3</mn> |
5328 | 1 | <mo>/</mo> |
5329 | 1 | <mn>4</mn> |
5330 | 1 | </mrow> |
5331 | 1 | </mrow> |
5332 | 1 | </math>"; |
5333 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5334 | 1 | } |
5335 | ||
5336 | #[test] | |
5337 | 1 | fn implied_plus_linear2() -> Result<()> { |
5338 | 1 | let test_str = "<math><mrow> |
5339 | 1 | <mn>2</mn><mrow><mn>3</mn><mo>/</mo><mn>4</mn></mrow> |
5340 | 1 | </mrow></math>"; |
5341 | 1 | let target_str = "<math> |
5342 | 1 | <mrow> |
5343 | 1 | <mn>2</mn> |
5344 | 1 | <mo data-changed='added'>⁤</mo> |
5345 | 1 | <mrow> |
5346 | 1 | <mn>3</mn> |
5347 | 1 | <mo>/</mo> |
5348 | 1 | <mn>4</mn> |
5349 | 1 | </mrow> |
5350 | 1 | </mrow> |
5351 | 1 | </math>"; |
5352 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5353 | 1 | } |
5354 | ||
5355 | #[test] | |
5356 | 1 | fn implied_comma() -> Result<()> { |
5357 | 1 | let test_str = "<math><msub><mi>b</mi><mrow><mn>1</mn><mn>2</mn></mrow></msub></math>"; |
5358 | 1 | let target_str = "<math> |
5359 | 1 | <msub><mi>b</mi><mrow><mn>1</mn><mo data-changed='added'>⁣</mo><mn>2</mn></mrow></msub> |
5360 | 1 | </math>"; |
5361 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5362 | 1 | } |
5363 | ||
5364 | #[test] | |
5365 | 1 | fn no_implied_comma() -> Result<()> { |
5366 | 1 | let test_str = "<math><mfrac><mi>b</mi><mrow><mn>1</mn><mn>2</mn></mrow></mfrac></math>"; |
5367 | 1 | let target_str = "<math> |
5368 | 1 | <mfrac><mi>b</mi><mrow><mn>1</mn><mo data-changed='added'>⁢</mo><mn>2</mn></mrow></mfrac> |
5369 | 1 | </math>"; |
5370 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5371 | 1 | } |
5372 | ||
5373 | #[test] | |
5374 | 1 | fn vertical_bars() -> Result<()> { |
5375 | 1 | let test_str = "<math> |
5376 | 1 | <mo>|</mo> <mi>x</mi> <mo>|</mo><mo>+</mo><mo>|</mo> |
5377 | 1 | <mi>a</mi><mo>+</mo><mn>1</mn> <mo>|</mo> |
5378 | 1 | </math>"; |
5379 | 1 | let target_str = " <math> |
5380 | 1 | <mrow data-changed='added'> |
5381 | 1 | <mrow data-changed='added'> |
5382 | 1 | <mo>|</mo> |
5383 | 1 | <mi>x</mi> |
5384 | 1 | <mo>|</mo> |
5385 | 1 | </mrow> |
5386 | 1 | <mo>+</mo> |
5387 | 1 | <mrow data-changed='added'> |
5388 | 1 | <mo>|</mo> |
5389 | 1 | <mrow data-changed='added'> |
5390 | 1 | <mi>a</mi> |
5391 | 1 | <mo>+</mo> |
5392 | 1 | <mn>1</mn> |
5393 | 1 | </mrow> |
5394 | 1 | <mo>|</mo> |
5395 | 1 | </mrow> |
5396 | 1 | </mrow> |
5397 | 1 | </math>"; |
5398 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5399 | 1 | } |
5400 | ||
5401 | ||
5402 | #[test] | |
5403 | 1 | fn vertical_bars_nested() -> Result<()> { |
5404 | 1 | let test_str = "<math><mo>|</mo><mi>x</mi><mo>|</mo><mi>y</mi><mo>|</mo><mi>z</mi><mo>|</mo></math>"; |
5405 | 1 | let target_str = "<math> |
5406 | 1 | <mrow data-changed='added'> |
5407 | 1 | <mrow data-changed='added'> |
5408 | 1 | <mo>|</mo> |
5409 | 1 | <mi>x</mi> |
5410 | 1 | <mo>|</mo> |
5411 | 1 | </mrow> |
5412 | 1 | <mo data-changed='added'>⁢</mo> |
5413 | 1 | <mi>y</mi> |
5414 | 1 | <mo data-changed='added'>⁢</mo> |
5415 | 1 | <mrow data-changed='added'> |
5416 | 1 | <mo>|</mo> |
5417 | 1 | <mi>z</mi> |
5418 | 1 | <mo>|</mo> |
5419 | 1 | </mrow> |
5420 | 1 | </mrow> |
5421 | 1 | </math>"; |
5422 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5423 | 1 | } |
5424 | ||
5425 | #[test] | |
5426 | 1 | fn double_vertical_bars() -> Result<()> { |
5427 | 1 | let test_str = "<math><mrow><mo>||</mo><mi>x</mi><mo>||</mo><mo>||</mo><mi>y</mi><mo>||</mo></mrow></math>"; |
5428 | 1 | let target_str = "<math> |
5429 | 1 | <mrow> |
5430 | 1 | <mrow data-changed='added'><mo>‖</mo><mi>x</mi><mo>‖</mo></mrow> |
5431 | 1 | <mo data-changed='added'>⁢</mo> |
5432 | 1 | <mrow data-changed='added'><mo>‖</mo><mi>y</mi><mo>‖</mo></mrow> |
5433 | 1 | </mrow> |
5434 | 1 | </math>"; |
5435 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5436 | 1 | } |
5437 | ||
5438 | #[test] | |
5439 | 1 | fn double_vertical_bars_mo() -> Result<()> { |
5440 | 1 | let test_str = "<math><mo>|</mo><mo>|</mo><mi>a</mi><mo>|</mo><mo>|</mo></math>"; |
5441 | 1 | let target_str = "<math><mrow data-changed='added'><mo>‖</mo><mi>a</mi><mo>‖</mo></mrow></math>"; |
5442 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5443 | 1 | } |
5444 | ||
5445 | #[test] | |
5446 | 1 | fn no_double_vertical_bars_mo() -> Result<()> { |
5447 | 1 | let test_str = "<math><mo>|</mo><mi>x</mi><mo>|</mo><mo>|</mo><mi>y</mi><mo>|</mo></math>"; |
5448 | 1 | let target_str = "<math> <mrow data-changed='added'> |
5449 | 1 | <mrow data-changed='added'><mo>|</mo><mi>x</mi><mo>|</mo></mrow> |
5450 | 1 | <mo data-changed='added'>⁢</mo> |
5451 | 1 | <mrow data-changed='added'><mo>|</mo><mi>y</mi><mo>|</mo></mrow> |
5452 | 1 | </mrow> </math>"; |
5453 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5454 | 1 | } |
5455 | ||
5456 | #[test] | |
5457 | 1 | fn vertical_bar_such_that() -> Result<()> { |
5458 | 1 | let test_str = "<math> |
5459 | 1 | <mo>{</mo><mi>x</mi><mo>|</mo><mi>x</mi><mo>∈</mo><mi>S</mi><mo>}</mo> |
5460 | 1 | </math>"; |
5461 | 1 | let target_str = "<math> |
5462 | 1 | <mrow data-changed='added'> |
5463 | 1 | <mo>{</mo> |
5464 | 1 | <mrow data-changed='added'> |
5465 | 1 | <mi>x</mi> |
5466 | 1 | <mo>|</mo> |
5467 | 1 | <mrow data-changed='added'> |
5468 | 1 | <mi>x</mi> |
5469 | 1 | <mo>∈</mo> |
5470 | 1 | <mi>S</mi> |
5471 | 1 | </mrow> |
5472 | 1 | </mrow> |
5473 | 1 | <mo>}</mo> |
5474 | 1 | </mrow> |
5475 | 1 | </math>"; |
5476 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5477 | 1 | } |
5478 | ||
5479 | #[test] | |
5480 | #[ignore] // need to figure out a test for this ("|" should have a precedence around ":" since that is an alternative notation for "such that", but "∣" is higher precedence) | |
5481 | 0 | fn vertical_bar_divides() -> Result<()> { |
5482 | 0 | let test_str = "<math> |
5483 | 0 | <mi>x</mi><mo>+</mo><mi>y</mi> <mo>|</mo><mn>12</mn> |
5484 | 0 | </math>"; |
5485 | 0 | let target_str = "<math> |
5486 | 0 | <mrow data-changed='added'> |
5487 | 0 | <mrow data-changed='added'> |
5488 | 0 | <mi>x</mi> |
5489 | 0 | <mo>+</mo> |
5490 | 0 | <mi>y</mi> |
5491 | 0 | </mrow> |
5492 | 0 | <mo>∣ <!--divides--></mo> |
5493 | 0 | <mn>12</mn> |
5494 | 0 | </mrow> |
5495 | 0 | </math>"; |
5496 | 0 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5497 | 0 | } |
5498 | ||
5499 | ||
5500 | #[test] | |
5501 | 1 | fn trig_mo() -> Result<()> { |
5502 | 1 | let test_str = "<math><mo>sin</mo><mi>x</mi> |
5503 | 1 | <mo>+</mo><mo>cos</mo><mi>y</mi> |
5504 | 1 | <mo>+</mo><munder><mo>lim</mo><mi>D</mi></munder><mi>y</mi> |
5505 | 1 | </math>"; |
5506 | 1 | let target_str = "<math> |
5507 | 1 | <mrow data-changed='added'> |
5508 | 1 | <mrow data-changed='added'> |
5509 | 1 | <mi>sin</mi> |
5510 | 1 | <mo data-changed='added'>⁡</mo> |
5511 | 1 | <mi>x</mi> |
5512 | 1 | </mrow> |
5513 | 1 | <mo>+</mo> |
5514 | 1 | <mrow data-changed='added'> |
5515 | 1 | <mi>cos</mi> |
5516 | 1 | <mo data-changed='added'>⁡</mo> |
5517 | 1 | <mi>y</mi> |
5518 | 1 | </mrow> |
5519 | 1 | <mo>+</mo> |
5520 | 1 | <mrow data-changed='added'> |
5521 | 1 | <munder> |
5522 | 1 | <mi>lim</mi> |
5523 | 1 | <mi>D</mi> |
5524 | 1 | </munder> |
5525 | 1 | <mo data-changed='added'>⁡</mo> |
5526 | 1 | <mi>y</mi> |
5527 | 1 | </mrow> |
5528 | 1 | </mrow> |
5529 | 1 | </math>"; |
5530 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5531 | 1 | } |
5532 | ||
5533 | #[test] | |
5534 | 1 | fn trig_mtext() -> Result<()> { |
5535 | 1 | let test_str = "<math><mtext>sin</mtext><mi>x</mi> |
5536 | 1 | <mo>+</mo><mtext>cos</mtext><mi>y</mi> |
5537 | 1 | <mo>+</mo><munder><mtext>lim</mtext><mi>D</mi></munder><mi>y</mi> |
5538 | 1 | </math>"; |
5539 | 1 | let target_str = "<math> |
5540 | 1 | <mrow data-changed='added'> |
5541 | 1 | <mrow data-changed='added'> |
5542 | 1 | <mi>sin</mi> |
5543 | 1 | <mo data-changed='added'>⁡</mo> |
5544 | 1 | <mi>x</mi> |
5545 | 1 | </mrow> |
5546 | 1 | <mo>+</mo> |
5547 | 1 | <mrow data-changed='added'> |
5548 | 1 | <mi>cos</mi> |
5549 | 1 | <mo data-changed='added'>⁡</mo> |
5550 | 1 | <mi>y</mi> |
5551 | 1 | </mrow> |
5552 | 1 | <mo>+</mo> |
5553 | 1 | <mrow data-changed='added'> |
5554 | 1 | <munder> |
5555 | 1 | <mi>lim</mi> |
5556 | 1 | <mi>D</mi> |
5557 | 1 | </munder> |
5558 | 1 | <mo data-changed='added'>⁡</mo> |
5559 | 1 | <mi>y</mi> |
5560 | 1 | </mrow> |
5561 | 1 | </mrow> |
5562 | 1 | </math>"; |
5563 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5564 | 1 | } |
5565 | ||
5566 | #[test] | |
5567 | 1 | fn trig_negative_args() -> Result<()> { |
5568 | 1 | let test_str = "<math><mi>sin</mi><mo>-</mo><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5569 | 1 | let target_str = "<math> |
5570 | 1 | <mrow data-changed='added'> |
5571 | 1 | <mi>sin</mi> |
5572 | 1 | <mo data-changed='added'>⁡</mo> |
5573 | 1 | <mrow data-changed='added'> |
5574 | 1 | <mrow data-changed='added'> |
5575 | 1 | <mo>-</mo> |
5576 | 1 | <mn>2</mn> |
5577 | 1 | </mrow> |
5578 | 1 | <mo data-changed='added'>⁢</mo> |
5579 | 1 | <mi>π</mi> |
5580 | 1 | <mo data-changed='added'>⁢</mo> |
5581 | 1 | <mi>x</mi> |
5582 | 1 | </mrow> |
5583 | 1 | </mrow> |
5584 | 1 | </math>"; |
5585 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5586 | 1 | } |
5587 | ||
5588 | #[test] | |
5589 | 1 | fn not_trig_negative_args() -> Result<()> { |
5590 | // this is here to make sure that only trig functions get the special treatment | |
5591 | 1 | let test_str = "<math><mi>ker</mi><mo>-</mo><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5592 | 1 | let target_str = "<math> |
5593 | 1 | <mrow data-changed='added'> |
5594 | 1 | <mrow data-changed='added'> |
5595 | 1 | <mi>ker</mi> |
5596 | 1 | <mo data-changed='added'>⁡</mo> |
5597 | 1 | <mrow data-changed='added'> |
5598 | 1 | <mo>-</mo> |
5599 | 1 | <mn>2</mn> |
5600 | 1 | </mrow> |
5601 | 1 | </mrow> |
5602 | 1 | <mo data-changed='added'>⁢</mo> |
5603 | 1 | <mi>π</mi> |
5604 | 1 | <mo data-changed='added'>⁢</mo> |
5605 | 1 | <mi>x</mi> |
5606 | 1 | </mrow> |
5607 | 1 | </math>"; |
5608 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5609 | 1 | } |
5610 | ||
5611 | #[test] | |
5612 | 1 | fn trig_args() -> Result<()> { |
5613 | 1 | let test_str = "<math><mi>sin</mi><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5614 | 1 | let target_str = "<math> |
5615 | 1 | <mrow data-changed='added'> |
5616 | 1 | <mi>sin</mi> |
5617 | 1 | <mo data-changed='added'>⁡</mo> |
5618 | 1 | <mrow data-changed='added'> |
5619 | 1 | <mn>2</mn> |
5620 | 1 | <mo data-changed='added'>⁢</mo> |
5621 | 1 | <mi>π</mi> |
5622 | 1 | <mo data-changed='added'>⁢</mo> |
5623 | 1 | <mi>x</mi> |
5624 | 1 | </mrow> |
5625 | 1 | </mrow> |
5626 | 1 | </math>"; |
5627 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5628 | 1 | } |
5629 | ||
5630 | #[test] | |
5631 | 1 | fn not_trig_args() -> Result<()> { |
5632 | // this is here to make sure that only trig functions get the special treatment | |
5633 | 1 | let test_str = "<math><mi>ker</mi><mn>2</mn><mi>π</mi><mi>x</mi></math>"; |
5634 | 1 | let target_str = "<math> |
5635 | 1 | <mrow data-changed='added'> |
5636 | 1 | <mrow data-changed='added'> |
5637 | 1 | <mi>ker</mi> |
5638 | 1 | <mo data-changed='added'>⁡</mo> |
5639 | 1 | <mn>2</mn> |
5640 | 1 | </mrow> |
5641 | 1 | <mo data-changed='added'>⁢</mo> |
5642 | 1 | <mi>π</mi> |
5643 | 1 | <mo data-changed='added'>⁢</mo> |
5644 | 1 | <mi>x</mi> |
5645 | 1 | </mrow> |
5646 | 1 | </math>"; |
5647 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5648 | 1 | } |
5649 | ||
5650 | #[test] | |
5651 | 1 | fn trig_trig() -> Result<()> { |
5652 | 1 | let test_str = "<math><mi>sin</mi><mi>x</mi><mi>cos</mi><mi>y</mi></math>"; |
5653 | 1 | let target_str = "<math> |
5654 | 1 | <mrow data-changed='added'> |
5655 | 1 | <mrow data-changed='added'> |
5656 | 1 | <mi>sin</mi> |
5657 | 1 | <mo data-changed='added'>⁡</mo> |
5658 | 1 | <mi>x</mi> |
5659 | 1 | </mrow> |
5660 | 1 | <mo data-changed='added'>⁢</mo> |
5661 | 1 | <mrow data-changed='added'> |
5662 | 1 | <mi>cos</mi> |
5663 | 1 | <mo data-changed='added'>⁡</mo> |
5664 | 1 | <mi>y</mi> |
5665 | 1 | </mrow> |
5666 | 1 | </mrow> |
5667 | 1 | </math>"; |
5668 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5669 | 1 | } |
5670 | ||
5671 | #[test] | |
5672 | 1 | fn trig_function_composition() -> Result<()> { |
5673 | 1 | let test_str = "<math><mo>(</mo><mi>sin</mi><mo>-</mo><mi>cos</mi><mo>)</mo><mi>x</mi></math>"; |
5674 | 1 | let target_str = "<math> |
5675 | 1 | <mrow data-changed='added'> |
5676 | 1 | <mrow data-changed='added'> |
5677 | 1 | <mo>(</mo> |
5678 | 1 | <mrow data-changed='added'> |
5679 | 1 | <mi>sin</mi> |
5680 | 1 | <mo>-</mo> |
5681 | 1 | <mi>cos</mi> |
5682 | 1 | </mrow> |
5683 | 1 | <mo>)</mo> |
5684 | 1 | </mrow> |
5685 | 1 | <mo data-changed='added'>⁢</mo> |
5686 | 1 | <mi>x</mi> |
5687 | 1 | </mrow> |
5688 | 1 | </math>"; |
5689 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5690 | 1 | } |
5691 | ||
5692 | ||
5693 | #[test] | |
5694 | 1 | fn currency_in_leaf_prefix() -> Result<()> { |
5695 | 1 | let test_str = "<math><mn>$8.54</mn></math>"; |
5696 | 1 | let target_str = "<math> |
5697 | 1 | <mrow data-changed='added'> |
5698 | 1 | <mi>$</mi> |
5699 | 1 | <mo data-changed='added'>⁢</mo> |
5700 | 1 | <mn>8.54</mn> |
5701 | 1 | </mrow> |
5702 | 1 | </math>"; |
5703 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5704 | 1 | } |
5705 | ||
5706 | #[test] | |
5707 | 1 | fn currency_in_leaf_postfix() -> Result<()> { |
5708 | 1 | let test_str = "<math><mn>188,23€</mn></math>"; |
5709 | 1 | let target_str = " <math> |
5710 | 1 | <mrow data-changed='added'> |
5711 | 1 | <mo data-changed='added'>⁢</mo> |
5712 | 1 | <mn>188,23</mn> |
5713 | 1 | <mo data-changed='added'>⁢</mo> |
5714 | 1 | <mi>€</mi> |
5715 | 1 | </mrow> |
5716 | 1 | </math>"; |
5717 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ",") |
5718 | 1 | } |
5719 | ||
5720 | #[test] | |
5721 | 1 | fn currency_in_leaf_infix() -> Result<()> { |
5722 | 1 | let test_str = "<math><mn>1€23</mn></math>"; |
5723 | 1 | let target_str = " <math> |
5724 | 1 | <mrow data-changed='added'> |
5725 | 1 | <mn>1</mn> |
5726 | 1 | <mo data-changed='added'>⁢</mo> |
5727 | 1 | <mi>€</mi> |
5728 | 1 | <mo data-changed='added'>⁢</mo> |
5729 | 1 | <mn>23</mn> |
5730 | 1 | </mrow> |
5731 | 1 | </math>"; |
5732 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ",") |
5733 | 1 | } |
5734 | ||
5735 | #[test] | |
5736 | 1 | fn mtext_whitespace_string() -> Result<()> { |
5737 | 1 | let test_str = "<math><mi>t</mi><mtext>  </mtext></math>"; |
5738 | 1 | let target_str = "<math><mi data-following-space-width='0.922'>t</mi></math>"; |
5739 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5740 | 1 | } |
5741 | ||
5742 | #[test] | |
5743 | 1 | fn mtext_whitespace_string_before() -> Result<()> { |
5744 | 1 | let test_str = "<math><mtext>  </mtext><mi>t</mi></math>"; |
5745 | 1 | let target_str = "<math><mi data-previous-space-width='0.922'>t</mi></math>"; |
5746 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5747 | 1 | } |
5748 | ||
5749 | #[test] | |
5750 | 1 | fn mtext_whitespace_1() -> Result<()> { |
5751 | 1 | let test_str = "<math><mi>t</mi><mtext>  </mtext> |
5752 | 1 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5753 | 1 | let target_str = " <math> |
5754 | 1 | <mrow data-changed='added'> |
5755 | 1 | <mi>t</mi> |
5756 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5757 | 1 | <mrow data-previous-space-width='0.922'> |
5758 | 1 | <mo>(</mo> |
5759 | 1 | <mrow data-changed='added'> |
5760 | 1 | <mi>x</mi> |
5761 | 1 | <mo>+</mo> |
5762 | 1 | <mi>y</mi> |
5763 | 1 | </mrow> |
5764 | 1 | <mo>)</mo> |
5765 | 1 | </mrow> |
5766 | 1 | </mrow> |
5767 | 1 | </math>"; |
5768 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5769 | 1 | } |
5770 | ||
5771 | #[test] | |
5772 | 1 | fn mtext_whitespace_2() -> Result<()> { |
5773 | 1 | let test_str = "<math><mi>f</mi><mtext>  </mtext> |
5774 | 1 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5775 | 1 | let target_str = " <math> |
5776 | 1 | <mrow data-changed='added'> |
5777 | 1 | <mi>f</mi> |
5778 | 1 | <mo data-changed='added'>⁡</mo> |
5779 | 1 | <mrow data-previous-space-width='0.922'> |
5780 | 1 | <mo>(</mo> |
5781 | 1 | <mrow data-changed='added'> |
5782 | 1 | <mi>x</mi> |
5783 | 1 | <mo>+</mo> |
5784 | 1 | <mi>y</mi> |
5785 | 1 | </mrow> |
5786 | 1 | <mo>)</mo> |
5787 | 1 | </mrow> |
5788 | 1 | </mrow> |
5789 | 1 | </math>"; |
5790 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5791 | 1 | } |
5792 | ||
5793 | #[test] | |
5794 | 1 | fn remove_mtext_whitespace_3() -> Result<()> { |
5795 | 1 | let test_str = "<math><mi>t</mi> |
5796 | 1 | <mrow><mtext> </mtext><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></math>"; |
5797 | 1 | let target_str = "<math> |
5798 | 1 | <mrow data-changed='added'> |
5799 | 1 | <mi>t</mi> |
5800 | 1 | <mo data-changed='added' data-function-guess='true'>⁢</mo> |
5801 | 1 | <mrow> |
5802 | 1 | <mo data-previous-space-width='0.167'>(</mo> |
5803 | 1 | <mrow data-changed='added'> |
5804 | 1 | <mi>x</mi> |
5805 | 1 | <mo>+</mo> |
5806 | 1 | <mi>y</mi> |
5807 | 1 | </mrow> |
5808 | 1 | <mo>)</mo> |
5809 | 1 | </mrow> |
5810 | 1 | </mrow> |
5811 | 1 | </math>"; |
5812 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5813 | 1 | } |
5814 | ||
5815 | #[test] | |
5816 | 1 | fn do_not_remove_any_whitespace() -> Result<()> { |
5817 | 1 | let test_str = "<math><mfrac> |
5818 | 1 | <mrow><mspace width='3em'/></mrow> |
5819 | 1 | <mtext> </mtext> |
5820 | 1 | </mfrac></math>"; |
5821 | 1 | let target_str = " <math> |
5822 | 1 | <mfrac> |
5823 | 1 | <mtext width='3em' data-changed='was-mspace' data-width='3' data-empty-in-2D='true'> </mtext> |
5824 | 1 | <mtext data-width='0.167' data-empty-in-2D='true'> </mtext> |
5825 | 1 | </mfrac> |
5826 | 1 | </math>"; |
5827 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5828 | 1 | } |
5829 | ||
5830 | #[test] | |
5831 | 1 | fn remove_mo_whitespace() -> Result<()> { |
5832 | 1 | let test_str = "<math><mi>cos</mi><mo> </mo><mi>x</mi></math>"; |
5833 | 1 | let target_str = "<math> |
5834 | 1 | <mrow data-changed='added'> |
5835 | 1 | <mi>cos</mi> |
5836 | 1 | <mo data-changed='added'>⁡</mo> |
5837 | 1 | <mi data-previous-space-width='0.7'>x</mi> |
5838 | 1 | </mrow> |
5839 | 1 | </math>"; |
5840 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5841 | 1 | } |
5842 | ||
5843 | #[test] | |
5844 | 1 | fn do_not_remove_some_whitespace() -> Result<()> { |
5845 | 1 | let test_str = "<math><mroot> |
5846 | 1 | <mrow><mi>b</mi><mphantom><mi>y</mi></mphantom></mrow> |
5847 | 1 | <mtext> </mtext> |
5848 | 1 | </mroot></math>"; |
5849 | 1 | let target_str = "<math><mroot> |
5850 | 1 | <mi>b</mi> |
5851 | 1 | <mtext data-empty-in-2D='true' data-width='0.167'> </mtext> |
5852 | 1 | </mroot></math>"; |
5853 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5854 | 1 | } |
5855 | ||
5856 | #[test] | |
5857 | 1 | fn remove_all_extra_elements() -> Result<()> { |
5858 | 1 | let test_str = "<math><msqrt> |
5859 | 1 | <mstyle> <mi>b</mi> </mstyle> |
5860 | 1 | <mphantom><mi>y</mi></mphantom> |
5861 | 1 | <mtext> </mtext> |
5862 | 1 | <mspace width='3em'/> |
5863 | 1 | </msqrt></math>"; |
5864 | 1 | let target_str = "<math><msqrt> |
5865 | 1 | <mi data-following-space-width='3.167'>b</mi> |
5866 | 1 | </msqrt></math>"; |
5867 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5868 | 1 | } |
5869 | ||
5870 | #[test] | |
5871 | 1 | fn empty_content() -> Result<()> { |
5872 | 1 | let test_str = "<math></math>"; |
5873 | 1 | let target_str = " <math><mtext data-added='missing-content' data-width='0.700'> </mtext></math>"; |
5874 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5875 | 1 | } |
5876 | ||
5877 | #[test] | |
5878 | 1 | fn empty_content_after_cleanup() -> Result<()> { |
5879 | 1 | let test_str = "<math><mrow><mphantom><mn>1</mn></mphantom></mrow></math>"; |
5880 | 1 | let target_str = " <math><mtext data-added='missing-content' data-width='0'> </mtext></math>"; |
5881 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5882 | 1 | } |
5883 | ||
5884 | #[test] | |
5885 | 1 | fn empty_content_fix_num_children() -> Result<()> { |
5886 | 1 | let test_str = " <math><mfrac><menclose notation='box'><mrow/></menclose><mrow/></mfrac></math>"; |
5887 | 1 | let target_str = "<math> |
5888 | 1 | <mfrac> |
5889 | 1 | <menclose notation='box'> |
5890 | 1 | <mtext data-added='missing-content' data-empty-in-2D='true' data-width='0'> </mtext> |
5891 | 1 | </menclose> |
5892 | 1 | <mtext data-changed='empty_content' data-empty-in-2D='true' data-width='0'> </mtext> |
5893 | 1 | </mfrac> |
5894 | 1 | </math>"; |
5895 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5896 | 1 | } |
5897 | ||
5898 | ||
5899 | #[test] | |
5900 | 1 | fn clean_semantics() -> Result<()> { |
5901 | // this comes from LateXML | |
5902 | 1 | let test_str = "<math> |
5903 | 1 | <semantics> |
5904 | 1 | <mrow><mi>z</mi></mrow> |
5905 | 1 | <annotation-xml encoding='MathML-Content'> |
5906 | 1 | <ci>𝑧</ci> |
5907 | 1 | </annotation-xml> |
5908 | 1 | <annotation encoding='application/x-tex'>z</annotation> |
5909 | 1 | <annotation encoding='application/x-llamapun'>italic_z</annotation> |
5910 | 1 | </semantics> |
5911 | 1 | </math>"; |
5912 | // the annotation-xml value is very touchy and must exactly match what mml-to-string() generates for the test to pass | |
5913 | 1 | let target_str = " <math> |
5914 | 1 | <mi data-annotation-xml-MathML-Content=' <annotation-xml encoding='MathML-Content'> |
5915 | 1 | <ci>𝑧</ci> |
5916 | 1 | </annotation-xml> |
5917 | 1 | ' data-annotation-application_slash_x-tex='z' data-annotation-application_slash_x-llamapun='italic_z'>z</mi> |
5918 | 1 | </math>"; |
5919 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5920 | 1 | } |
5921 | ||
5922 | #[test] | |
5923 | 1 | fn clean_up_mi_operator() -> Result<()> { |
5924 | 1 | let test_str = "<math><mrow><mi>∠</mi><mi>A</mi><mi>B</mi><mi>C</mi></mrow></math>"; |
5925 | 1 | let target_str = " <math> |
5926 | 1 | <mrow> |
5927 | 1 | <mo>∠</mo> |
5928 | 1 | <mrow data-changed='added'> |
5929 | 1 | <mi>A</mi> |
5930 | 1 | <mo data-changed='added'>⁣</mo> |
5931 | 1 | <mi>B</mi> |
5932 | 1 | <mo data-changed='added'>⁣</mo> |
5933 | 1 | <mi>C</mi> |
5934 | 1 | </mrow> |
5935 | 1 | </mrow> |
5936 | 1 | </math>"; |
5937 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5938 | 1 | } |
5939 | ||
5940 | ||
5941 | #[test] | |
5942 | 1 | fn clean_up_arc() -> Result<()> { |
5943 | 1 | let test_str = "<math><mtext>arc </mtext><mi>cos</mi><mi>x</mi></math>"; |
5944 | 1 | let target_str = "<math> |
5945 | 1 | <mrow data-changed='added'> |
5946 | 1 | <mi>arccos</mi> |
5947 | 1 | <mo data-changed='added'>⁡</mo> |
5948 | 1 | <mi>x</mi> |
5949 | 1 | </mrow> |
5950 | 1 | </math>"; |
5951 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5952 | 1 | } |
5953 | ||
5954 | #[test] | |
5955 | 1 | fn clean_up_arc_nospace() -> Result<()> { |
5956 | 1 | let test_str = "<math><mtext>arc</mtext><mi>cos</mi><mi>x</mi></math>"; |
5957 | 1 | let target_str = "<math> |
5958 | 1 | <mrow data-changed='added'> |
5959 | 1 | <mi>arccos</mi> |
5960 | 1 | <mo data-changed='added'>⁡</mo> |
5961 | 1 | <mi>x</mi> |
5962 | 1 | </mrow> |
5963 | 1 | </math>"; |
5964 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5965 | 1 | } |
5966 | ||
5967 | #[test] | |
5968 | 1 | fn roman_numeral() -> Result<()> { |
5969 | 1 | let test_str = "<math><mrow><mtext>XLVIII</mtext> <mo>+</mo><mn>mmxxvi</mn></mrow></math>"; |
5970 | // turns out there is no need to mark them as Roman Numerals -- thought that was need for braille | |
5971 | 1 | let target_str = "<math><mrow> |
5972 | 1 | <mn data-roman-numeral='true' data-number='48'>XLVIII</mn> <mo>+</mo><mn data-roman-numeral='true' data-number='2026'>mmxxvi</mn> |
5973 | 1 | </mrow></math>"; |
5974 | // let target_str = "<math><mrow><mtext>XLVIII</mtext> <mo>+</mo><mn>mmxxvi</mn></mrow></math>"; | |
5975 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5976 | 1 | } |
5977 | ||
5978 | // #[test] | |
5979 | // fn roman_numeral_context() { | |
5980 | // let test_str = "<math><mi>vi</mi><mo>-</mo><mi mathvariant='normal'>i</mi><mo>=</mo><mtext>v</mtext></math>"; | |
5981 | // let target_str = "<math> <mrow data-changed='added'> | |
5982 | // <mrow data-changed='added'><mn data-roman-numeral='true'>vi</mn><mo>-</mo><mn mathvariant='normal' data-roman-numeral='true'>i</mn></mrow> | |
5983 | // <mo>=</mo> <mn data-roman-numeral='true'>v</mn> | |
5984 | // </mrow> </math>"; | |
5985 | // are_strs_canonically_equal_result(test_str, target_str, &[]) | |
5986 | // } | |
5987 | ||
5988 | #[test] | |
5989 | 1 | fn not_roman_numeral() -> Result<()> { |
5990 | 1 | let test_str = "<math><mtext>cm</mtext></math>"; |
5991 | // shouldn't change | |
5992 | 1 | let target_str = "<math><mtext>cm</mtext></math>"; |
5993 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
5994 | 1 | } |
5995 | ||
5996 | #[test] | |
5997 | 1 | fn digit_block_binary() -> Result<()> { |
5998 | 1 | let test_str = "<math><mo>(</mo><mn>0110</mn><mspace width=\"thickmathspace\"></mspace><mn>1110</mn><mspace width=\"thickmathspace\"></mspace><mn>0110</mn><mo>)</mo></math>"; |
5999 | 1 | let target_str = " <math> |
6000 | 1 | <mrow data-changed='added'> |
6001 | 1 | <mo>(</mo> |
6002 | 1 | <mn>0110\u{00A0}1110\u{00A0}0110</mn> |
6003 | 1 | <mo>)</mo> |
6004 | 1 | </mrow> |
6005 | 1 | </math>"; |
6006 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6007 | 1 | } |
6008 | ||
6009 | #[test] | |
6010 | 1 | fn digit_block_decimal() -> Result<()> { |
6011 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>123</mn><mo>,</mo><mn>456</mn><mo>+</mo> |
6012 | 1 | <mn>4</mn><mo>.</mo><mn>32</mn></math>"; |
6013 | 1 | let target_str = " <math> |
6014 | 1 | <mrow data-changed='added'> |
6015 | 1 | <mn>8,123,456</mn> |
6016 | 1 | <mo>+</mo> |
6017 | 1 | <mn>4.32</mn> |
6018 | 1 | </mrow> |
6019 | 1 | </math>"; |
6020 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6021 | 1 | } |
6022 | #[test] | |
6023 | 1 | fn digit_block_comma() -> Result<()> { |
6024 | 1 | let test_str = "<math><mn>8</mn><mo>.</mo><mn>123</mn><mo>.</mo><mn>456</mn><mo>+</mo> |
6025 | 1 | <mn>4</mn><mo>,</mo><mn>32</mn></math>"; |
6026 | 1 | let target_str = " <math> |
6027 | 1 | <mrow data-changed='added'> |
6028 | 1 | <mn>8.123.456</mn> |
6029 | 1 | <mo>+</mo> |
6030 | 1 | <mn>4,32</mn> |
6031 | 1 | </mrow> |
6032 | 1 | </math>"; |
6033 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6034 | 1 | } |
6035 | ||
6036 | #[test] | |
6037 | 1 | fn digit_block_int() -> Result<()> { |
6038 | 1 | let test_str = "<math><mn>12</mn><mo>,</mo><mn>345</mn><mo>+</mo> |
6039 | 1 | <mn>1</mn><mo>,</mo><mn>000</mn></math>"; |
6040 | 1 | let target_str = " <math> |
6041 | 1 | <mrow data-changed='added'> |
6042 | 1 | <mn>12,345</mn> |
6043 | 1 | <mo>+</mo> |
6044 | 1 | <mn>1,000</mn> |
6045 | 1 | </mrow> |
6046 | 1 | </math>"; |
6047 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6048 | 1 | } |
6049 | ||
6050 | #[test] | |
6051 | 1 | fn digit_block_non_ascii_int() -> Result<()> { |
6052 | 1 | let test_str = "<math><mn>𝟏𝟐</mn><mo>,</mo><mn>3𝟰𝟻</mn><mo>+</mo> |
6053 | 1 | <mn>𝟙</mn><mo>,</mo><mn>𝟬𝟬𝟬</mn></math>"; |
6054 | 1 | let target_str = " <math> |
6055 | 1 | <mrow data-changed='added'> |
6056 | 1 | <mn>𝟏𝟐,3𝟰𝟻</mn> |
6057 | 1 | <mo>+</mo> |
6058 | 1 | <mn>𝟙,𝟬𝟬𝟬</mn> |
6059 | 1 | </mrow> |
6060 | 1 | </math>"; |
6061 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6062 | 1 | } |
6063 | ||
6064 | #[test] | |
6065 | 1 | fn digit_block_int_dots() -> Result<()> { |
6066 | 1 | let test_str = "<math><mn>12</mn><mo>.</mo><mn>345</mn><mo>+</mo> |
6067 | 1 | <mn>1</mn><mo>.</mo><mn>000</mn></math>"; |
6068 | 1 | let target_str = " <math> |
6069 | 1 | <mrow data-changed='added'> |
6070 | 1 | <mn>12.345</mn> |
6071 | 1 | <mo>+</mo> |
6072 | 1 | <mn>1.000</mn> |
6073 | 1 | </mrow> |
6074 | 1 | </math>"; |
6075 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6076 | 1 | } |
6077 | ||
6078 | #[test] | |
6079 | 1 | fn digit_block_decimal_pt() -> Result<()> { |
6080 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>123</mn><mo>.</mo> |
6081 | 1 | <mo>+</mo><mn>4</mn><mo>.</mo> |
6082 | 1 | <mo>+</mo><mo>.</mo><mn>01</mn></math>"; |
6083 | 1 | let target_str = " <math> |
6084 | 1 | <mrow data-changed='added'> |
6085 | 1 | <mn>8,123.</mn> |
6086 | 1 | <mo>+</mo> |
6087 | 1 | <mn>4.</mn> |
6088 | 1 | <mo>+</mo> |
6089 | 1 | <mn>.01</mn> |
6090 | 1 | </mrow> |
6091 | 1 | </math>"; |
6092 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6093 | 1 | } |
6094 | ||
6095 | #[test] | |
6096 | 1 | fn number_with_decimal_pt() -> Result<()> { |
6097 | // this is output from WIRIS for "12.3" | |
6098 | 1 | let test_str = "<math><mn>12</mn><mo>.</mo><mn>3</mn></math>"; |
6099 | 1 | let target_str = "<math><mn>12.3</mn></math>"; |
6100 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6101 | 1 | } |
6102 | ||
6103 | #[test] | |
6104 | 1 | fn number_with_comma_decimal_pt() -> Result<()> { |
6105 | // this is output from WIRIS for "12.3" | |
6106 | 1 | let test_str = "<math><mn>12</mn><mo>,</mo><mn>3</mn></math>"; |
6107 | 1 | let target_str = "<math><mn>12,3</mn></math>"; |
6108 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6109 | 1 | } |
6110 | ||
6111 | #[test] | |
6112 | 1 | fn addition_with_decimal_point_at_end() -> Result<()> { |
6113 | // in this case, the trailing "." is probably a decimal point" -- testing special case combine the "." | |
6114 | // this comes from WIRIS | |
6115 | 1 | let test_str = "<math><mn>1</mn><mo>.</mo><mn>3</mn><mo>+</mo><mn>2</mn><mo>.</mo></math>"; |
6116 | 1 | let target_str = "<math><mrow data-changed='added'><mn>1.3</mn><mo>+</mo><mn>2.</mn></mrow></math>"; |
6117 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6118 | 1 | } |
6119 | ||
6120 | #[test] | |
6121 | 1 | fn addition_with_decimal_point_at_end_and_comma_decimal_separator() -> Result<()> { |
6122 | // in this case, the trailing "." is probably a decimal point" -- testing special case combine the "." | |
6123 | // this comes from WIRIS | |
6124 | 1 | let test_str = "<math><mn>1</mn><mo>,</mo><mn>3</mn><mo>+</mo><mn>2</mn><mo>,</mo></math>"; |
6125 | 1 | let target_str = "<math><mrow data-changed='added'><mn>1,3</mn><mo>+</mo><mn>2,</mn></mrow></math>"; |
6126 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], ".", ", ") |
6127 | 1 | } |
6128 | ||
6129 | #[test] | |
6130 | 1 | fn sequence_with_period() -> Result<()> { |
6131 | // in this case, we don't want "5." -- testing special case to avoid combining the period. | |
6132 | 1 | let test_str = "<math><mn>1</mn><mo>,</mo><mn>3</mn><mo>,</mo><mn>5</mn><mo>.</mo></math>"; |
6133 | 1 | let target_str = "<math><mrow data-changed='added'> |
6134 | 1 | <mrow data-changed='added'><mn>1</mn><mo>,</mo><mn>3</mn><mo>,</mo><mn>5</mn></mrow><mo>.</mo> |
6135 | 1 | </mrow></math>"; |
6136 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6137 | 1 | } |
6138 | ||
6139 | #[test] | |
6140 | 1 | fn addition_decimal_pt() -> Result<()> { |
6141 | 1 | let test_str = "<math><mo>.</mo><mn>4</mn><mo>=</mo><mn>0</mn><mo>.</mo><mn>4</mn></math>"; |
6142 | 1 | let target_str = "<math><mrow data-changed='added'><mn>.4</mn><mo>=</mo><mn>0.4</mn></mrow></math>"; |
6143 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6144 | 1 | } |
6145 | ||
6146 | #[test] | |
6147 | 1 | fn fraction_decimal_pt() -> Result<()> { |
6148 | 1 | let test_str = "<math><mfrac><mrow><mn>1</mn><mo>.</mo></mrow><mrow><mn>2</mn><mo>.</mo></mrow></mfrac></math>"; |
6149 | 1 | let target_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6150 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6151 | 1 | } |
6152 | ||
6153 | #[test] | |
6154 | 1 | fn fraction_decimal_pt_no_split() -> Result<()> { |
6155 | // don't split off the '.' | |
6156 | 1 | let test_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6157 | 1 | let target_str = "<math><mfrac><mn>1.</mn><mn>2.</mn></mfrac></math>"; |
6158 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6159 | 1 | } |
6160 | ||
6161 | #[test] | |
6162 | 1 | fn not_digit_block_parens() -> Result<()> { |
6163 | 1 | let test_str = "<math><mo>(</mo><mn>451</mn><mo>,</mo><mn>231</mn><mo>)</mo></math>"; |
6164 | 1 | let target_str = " <math> <mrow data-changed='added'> |
6165 | 1 | <mo>(</mo> |
6166 | 1 | <mrow data-changed='added'> |
6167 | 1 | <mn>451</mn> <mo>,</mo> <mn>231</mn> |
6168 | 1 | </mrow> |
6169 | 1 | <mo>)</mo> |
6170 | 1 | </mrow></math>"; |
6171 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6172 | 1 | } |
6173 | ||
6174 | #[test] | |
6175 | 1 | fn not_digit_block_parens_mrow() -> Result<()> { |
6176 | 1 | let test_str = "<math><mo>(</mo><mrow><mn>451</mn><mo>,</mo><mn>231</mn></mrow><mo>)</mo></math>"; |
6177 | 1 | let target_str = " <math> <mrow data-changed='added'> |
6178 | 1 | <mo>(</mo> |
6179 | 1 | <mrow> |
6180 | 1 | <mn>451</mn> <mo>,</mo> <mn>231</mn> |
6181 | 1 | </mrow> |
6182 | 1 | <mo>)</mo> |
6183 | 1 | </mrow></math>"; |
6184 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6185 | 1 | } |
6186 | ||
6187 | #[test] | |
6188 | 1 | fn not_digit_block_decimal() -> Result<()> { |
6189 | 1 | let test_str = "<math><mn>8</mn><mo>,</mo><mn>49</mn><mo>,</mo><mn>456</mn><mo>+</mo> |
6190 | 1 | <mn>4</mn><mtext> </mtext><mn>32</mn><mo>+</mo> |
6191 | 1 | <mn>1</mn><mo>,</mo><mn>234</mn><mo>,</mo><mn>56</mn></math>"; |
6192 | 1 | let target_str = "<math> |
6193 | 1 | <mrow data-changed='added'> |
6194 | 1 | <mn>8</mn> |
6195 | 1 | <mo>,</mo> |
6196 | 1 | <mn>49</mn> |
6197 | 1 | <mo>,</mo> |
6198 | 1 | <mrow data-changed='added'> |
6199 | 1 | <mn>456</mn> |
6200 | 1 | <mo>+</mo> |
6201 | 1 | <mrow data-changed='added'> |
6202 | 1 | <mn>4</mn> |
6203 | 1 | <mo data-changed='added'>⁢</mo> |
6204 | 1 | <mn>32</mn> |
6205 | 1 | </mrow> |
6206 | 1 | <mo>+</mo> |
6207 | 1 | <mn>1</mn> |
6208 | 1 | </mrow> |
6209 | 1 | <mo>,</mo> |
6210 | 1 | <mn>234</mn> |
6211 | 1 | <mo>,</mo> |
6212 | 1 | <mn>56</mn> |
6213 | 1 | </mrow> |
6214 | 1 | </math>"; |
6215 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6216 | 1 | } |
6217 | ||
6218 | #[test] | |
6219 | 1 | fn not_digit_block_ellipsis() -> Result<()> { |
6220 | 1 | let test_str = "<math><mrow><mn>8</mn><mo>,</mo><mn>123</mn><mo>,</mo><mn>456</mn><mo>,</mo> |
6221 | 1 | <mi>…</mi></mrow></math>"; |
6222 | 1 | let target_str = "<math> |
6223 | 1 | <mrow> |
6224 | 1 | <mn>8</mn> |
6225 | 1 | <mo>,</mo> |
6226 | 1 | <mn>123</mn> |
6227 | 1 | <mo>,</mo> |
6228 | 1 | <mn>456</mn> |
6229 | 1 | <mo>,</mo> |
6230 | 1 | <mi>…</mi> |
6231 | 1 | </mrow> |
6232 | 1 | </math>"; |
6233 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6234 | 1 | } |
6235 | ||
6236 | #[test] | |
6237 | 1 | fn not_digit_block_negative_numbers_euro() -> Result<()> { |
6238 | 1 | let test_str = "<math><mrow> |
6239 | 1 | <mo>-</mo><mn>1</mn><mo>,</mo> |
6240 | 1 | <mo>-</mo><mn>2</mn><mo>,</mo> |
6241 | 1 | <mo>-</mo><mn>3</mn><mo>,</mo> |
6242 | 1 | <mo>…</mo> |
6243 | 1 | </mrow></math>"; |
6244 | 1 | let target_str = "<math><mrow> |
6245 | 1 | <mrow data-changed='added'> |
6246 | 1 | <mo>-</mo> |
6247 | 1 | <mn>1</mn> |
6248 | 1 | </mrow> |
6249 | 1 | <mo>,</mo> |
6250 | 1 | <mrow data-changed='added'> |
6251 | 1 | <mo>-</mo> |
6252 | 1 | <mn>2</mn> |
6253 | 1 | </mrow> |
6254 | 1 | <mo>,</mo> |
6255 | 1 | <mrow data-changed='added'> |
6256 | 1 | <mo>-</mo> |
6257 | 1 | <mn>3</mn> |
6258 | 1 | </mrow> |
6259 | 1 | <mo>,</mo> |
6260 | 1 | <mi>…</mi> |
6261 | 1 | </mrow></math>"; |
6262 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6263 | 1 | } |
6264 | ||
6265 | #[test] | |
6266 | 1 | fn ellipsis() -> Result<()> { |
6267 | 1 | let test_str = "<math><mn>5</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>8</mn><mo>,</mo> |
6268 | 1 | <mn>9</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>11</mn><mo>,</mo> |
6269 | 1 | <mn>5</mn><mo>,</mo><mo>.</mo><mo>.</mo><mo>,</mo><mn>8</mn> |
6270 | 1 | </math>"; |
6271 | 1 | let target_str = "<math><mrow data-changed='added'> |
6272 | 1 | <mn>5</mn><mo>,</mo><mi>…</mi><mo>,</mo><mn>8</mn><mo>,</mo> |
6273 | 1 | <mn>9</mn><mo>,</mo><mi>…</mi><mo>,</mo><mn>11</mn><mo>,</mo> |
6274 | 1 | <mn>5</mn><mo>,</mo><mrow data-changed='added'><mo>.</mo><mo>.</mo></mrow> |
6275 | 1 | <mo>,</mo><mn>8</mn></mrow></math>"; |
6276 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6277 | 1 | } |
6278 | ||
6279 | ||
6280 | #[test] | |
6281 | 1 | fn no_merge_271() -> Result<()> { |
6282 | 1 | let test_str = "<math><mrow><mo>{</mo> |
6283 | 1 | <mrow><mn>2</mn><mo>,</mo><mn>4</mn><mo>,</mo><mn>6</mn></mrow> |
6284 | 1 | <mo>}</mo></mrow></math>"; |
6285 | 1 | let target_str = "<math><mrow><mo>{</mo> |
6286 | 1 | <mrow><mn>2</mn><mo>,</mo><mn>4</mn><mo>,</mo><mn>6</mn></mrow> |
6287 | 1 | <mo>}</mo></mrow></math>"; |
6288 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6289 | 1 | } |
6290 | ||
6291 | #[test] | |
6292 | 1 | fn not_digit_block_271() -> Result<()> { |
6293 | 1 | let test_str = "<math><mrow> |
6294 | 1 | <mi>…</mi><mo>,</mo> |
6295 | 1 | <mo>-</mo><mn>2</mn><mo>,</mo> |
6296 | 1 | <mo>-</mo><mn>1</mn><mo>,</mo> |
6297 | 1 | <mn>0</mn> |
6298 | 1 | </mrow></math>"; |
6299 | 1 | let target_str = "<math> <mrow> |
6300 | 1 | <mi>…</mi> |
6301 | 1 | <mo>,</mo> |
6302 | 1 | <mrow data-changed='added'><mo>-</mo><mn>2</mn></mrow> |
6303 | 1 | <mo>,</mo> |
6304 | 1 | <mrow data-changed='added'><mo>-</mo><mn>1</mn></mrow> |
6305 | 1 | <mo>,</mo> |
6306 | 1 | <mn>0</mn> |
6307 | 1 | </mrow></math>"; |
6308 | 1 | are_strs_canonically_equal_with_locale(test_str, target_str, &[], " .", ",") |
6309 | 1 | } |
6310 | ||
6311 | #[test] | |
6312 | 1 | fn merge_decimal_in_list_271() -> Result<()> { |
6313 | 1 | let test_str = "<math><mi>x</mi><mo>,</mo><mn>2</mn><mo>.</mo><mn>5</mn><mi>g</mi><mo>,</mo><mn>3</mn></math>"; |
6314 | 1 | let target_str = "<math> <mrow data-changed='added'> |
6315 | 1 | <mi>x</mi> |
6316 | 1 | <mo>,</mo> |
6317 | 1 | <mrow data-changed='added'> <mn>2.5</mn> <mo data-changed='added'>⁢</mo> <mi>g</mi> </mrow> |
6318 | 1 | <mo>,</mo> |
6319 | 1 | <mn>3</mn> |
6320 | 1 | </mrow> </math>"; |
6321 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6322 | 1 | } |
6323 | ||
6324 | #[test] | |
6325 | 1 | fn primes_common() -> Result<()> { |
6326 | 1 | let test_str = "<math><msup><mn>5</mn><mo>'</mo></msup> |
6327 | 1 | <msup><mn>5</mn><mo>''</mo></msup> |
6328 | 1 | <msup><mn>8</mn><mrow><mo>'</mo><mo>'</mo></mrow></msup></math>"; |
6329 | 1 | let target_str = "<math> |
6330 | 1 | <mrow data-changed='added'> |
6331 | 1 | <msup> |
6332 | 1 | <mn>5</mn> |
6333 | 1 | <mo>′</mo> |
6334 | 1 | </msup> |
6335 | 1 | <mo data-changed='added'>⁢</mo> |
6336 | 1 | <msup> |
6337 | 1 | <mn>5</mn> |
6338 | 1 | <mo>″</mo> |
6339 | 1 | </msup> |
6340 | 1 | <mo data-changed='added'>⁢</mo> |
6341 | 1 | <msup> |
6342 | 1 | <mn>8</mn> |
6343 | 1 | <mo>″</mo> |
6344 | 1 | </msup> |
6345 | 1 | </mrow> |
6346 | 1 | </math>"; |
6347 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6348 | 1 | } |
6349 | ||
6350 | #[test] | |
6351 | 1 | fn primes_uncommon() -> Result<()> { |
6352 | 1 | let test_str = "<math><msup><mn>5</mn><mo>''′</mo></msup> |
6353 | 1 | <msup><mn>5</mn><mo>''''</mo></msup> |
6354 | 1 | <msup><mn>8</mn><mrow><mo>′</mo><mo>⁗</mo></mrow></msup></math>"; |
6355 | 1 | let target_str = " <math> |
6356 | 1 | <mrow data-changed='added'> |
6357 | 1 | <msup> |
6358 | 1 | <mn>5</mn> |
6359 | 1 | <mo>‴</mo> |
6360 | 1 | </msup> |
6361 | 1 | <mo data-changed='added'>⁢</mo> |
6362 | 1 | <msup> |
6363 | 1 | <mn>5</mn> |
6364 | 1 | <mo>⁗</mo> |
6365 | 1 | </msup> |
6366 | 1 | <mo data-changed='added'>⁢</mo> |
6367 | 1 | <msup> |
6368 | 1 | <mn>8</mn> |
6369 | 1 | <mo>⁗′</mo> |
6370 | 1 | </msup> |
6371 | 1 | </mrow> |
6372 | 1 | </math>"; |
6373 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6374 | 1 | } |
6375 | ||
6376 | #[test] | |
6377 | 1 | fn merge_mi_test() -> Result<()> { |
6378 | 1 | let test_str = "<math> |
6379 | 1 | <mi>c</mi><mi>o</mi><mi>s</mi><mo>=</mo> |
6380 | 1 | <mi>w</mi><mi>x</mi><mi>y</mi><mi>z</mi><mo>+</mo> |
6381 | 1 | <mi>n</mi><mi>a</mi><mi>x</mi><mo>+</mo> |
6382 | 1 | <mi>i</mi><mi>ω</mi><mi>t</mi><mo>+</mo> |
6383 | 1 | <mi>f</mi><mi>l</mi><mi>o</mi><mi>w</mi><mo>+</mo> |
6384 | 1 | <mi>m</mi><mi>a</mi><mi>x</mi> |
6385 | 1 | </math> |
6386 | 1 | "; |
6387 | 1 | let target_str = "<math> |
6388 | 1 | <mrow data-changed='added'> |
6389 | 1 | <mi>cos</mi> |
6390 | 1 | <mo>=</mo> |
6391 | 1 | <mrow data-changed='added'> |
6392 | 1 | <mrow data-changed='added'> |
6393 | 1 | <mi>w</mi> |
6394 | 1 | <mo data-changed='added'>⁢</mo> |
6395 | 1 | <mi>x</mi> |
6396 | 1 | <mo data-changed='added'>⁢</mo> |
6397 | 1 | <mi>y</mi> |
6398 | 1 | <mo data-changed='added'>⁢</mo> |
6399 | 1 | <mi>z</mi> |
6400 | 1 | </mrow> |
6401 | 1 | <mo>+</mo> |
6402 | 1 | <mrow data-changed='added'> |
6403 | 1 | <mi>n</mi> |
6404 | 1 | <mo data-changed='added'>⁢</mo> |
6405 | 1 | <mi>a</mi> |
6406 | 1 | <mo data-changed='added'>⁢</mo> |
6407 | 1 | <mi>x</mi> |
6408 | 1 | </mrow> |
6409 | 1 | <mo>+</mo> |
6410 | 1 | <mrow data-changed='added'> |
6411 | 1 | <mi>i</mi> |
6412 | 1 | <mo data-changed='added'>⁢</mo> |
6413 | 1 | <mi>ω</mi> |
6414 | 1 | <mo data-changed='added'>⁢</mo> |
6415 | 1 | <mi>t</mi> |
6416 | 1 | </mrow> |
6417 | 1 | <mo>+</mo> |
6418 | 1 | <mi>flow</mi> |
6419 | 1 | <mo>+</mo> |
6420 | 1 | <mi>max</mi> |
6421 | 1 | </mrow> |
6422 | 1 | </mrow> |
6423 | 1 | </math>"; |
6424 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6425 | 1 | } |
6426 | ||
6427 | #[test] | |
6428 | 1 | fn merge_mi_with_script_test() -> Result<()> { |
6429 | 1 | let test_str = "<math> |
6430 | 1 | <mi>c</mi><mi>o</mi><msup><mi>s</mi><mn>2</mn></msup><mi>y</mi><mo>=</mo> |
6431 | 1 | <mi>l</mi><mi>o</mi><msup><mi>g</mi><mn>2</mn></msup><mi>y</mi><mo>+</mo> |
6432 | 1 | <mi>d</mi><mi>a</mi><msup><mi>g</mi><mn>2</mn></msup> |
6433 | 1 | </math>"; |
6434 | 1 | let target_str = "<math> |
6435 | 1 | <mrow data-changed='added'> |
6436 | 1 | <mrow data-changed='added'> |
6437 | 1 | <msup> |
6438 | 1 | <mi>cos</mi> |
6439 | 1 | <mn>2</mn> |
6440 | 1 | </msup> |
6441 | 1 | <mo data-changed='added'>⁡</mo> |
6442 | 1 | <mi>y</mi> |
6443 | 1 | </mrow> |
6444 | 1 | <mo>=</mo> |
6445 | 1 | <mrow data-changed='added'> |
6446 | 1 | <mrow data-changed='added'> |
6447 | 1 | <msup> |
6448 | 1 | <mi>log</mi> |
6449 | 1 | <mn>2</mn> |
6450 | 1 | </msup> |
6451 | 1 | <mo data-changed='added'>⁡</mo> |
6452 | 1 | <mi>y</mi> |
6453 | 1 | </mrow> |
6454 | 1 | <mo>+</mo> |
6455 | 1 | <mrow data-changed='added'> |
6456 | 1 | <mi>d</mi> |
6457 | 1 | <mo data-changed='added'>⁢</mo> |
6458 | 1 | <mi>a</mi> |
6459 | 1 | <mo data-changed='added'>⁢</mo> |
6460 | 1 | <msup> |
6461 | 1 | <mi>g</mi> |
6462 | 1 | <mn>2</mn> |
6463 | 1 | </msup> |
6464 | 1 | </mrow> |
6465 | 1 | </mrow> |
6466 | 1 | </mrow> |
6467 | 1 | </math>"; |
6468 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6469 | 1 | } |
6470 | ||
6471 | #[test] | |
6472 | 1 | fn merge_mi_with_script_bug_333_test() -> Result<()> { |
6473 | 1 | let test_str = "<math> |
6474 | 1 | <mi>l</mi><mi>o</mi><msub><mrow><mi>g</mi></mrow><mrow><mn>2</mn></mrow></msub><mo>=</mo> |
6475 | 1 | <mi>l</mi><mi>i</mi><msub><mrow><mi>m</mi></mrow><mrow><mi>n</mi><mo>→</mo><mi>∞</mi></mrow></msub> |
6476 | 1 | </math> |
6477 | 1 | "; |
6478 | 1 | let target_str = " <math> |
6479 | 1 | <mrow data-changed='added'> |
6480 | 1 | <msub> |
6481 | 1 | <mi>log</mi> |
6482 | 1 | <mn>2</mn> |
6483 | 1 | </msub> |
6484 | 1 | <mo>=</mo> |
6485 | 1 | <msub> |
6486 | 1 | <mi>lim</mi> |
6487 | 1 | <mrow> |
6488 | 1 | <mi>n</mi> |
6489 | 1 | <mo>→</mo> |
6490 | 1 | <mi>∞</mi> |
6491 | 1 | </mrow> |
6492 | 1 | </msub> |
6493 | 1 | </mrow> |
6494 | 1 | </math>"; |
6495 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6496 | 1 | } |
6497 | ||
6498 | #[test] | |
6499 | 1 | fn merge_mi_bug_545() -> Result<()> { |
6500 | 1 | let test_str = "<math><mi>S</mi><mi>I</mi><msup><mi>N</mi><mrow><mo>-</mo><mn>1</mn></mrow></msup></math>"; |
6501 | 1 | let target_str = "<math><msup><mi mathvariant='normal'>SIN</mi><mrow><mo>-</mo><mn>1</mn></mrow></msup></math>"; |
6502 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6503 | 1 | } |
6504 | ||
6505 | #[test] | |
6506 | 1 | fn parent_bug_94() -> Result<()> { |
6507 | // This is a test to make sure the crash in the bug report doesn't happen. | |
6508 | // Note: in the bug, they behavior they would like is a single mn with content "0.02" | |
6509 | // However, TeX input "1 2 3" will produce three consecutive <mn>s, so merging <mn>s isn't good in general | |
6510 | // This test | |
6511 | 1 | let test_str = " <math> |
6512 | 1 | <mrow> |
6513 | 1 | <msqrt> |
6514 | 1 | <mrow> |
6515 | 1 | <mstyle mathvariant='bold' mathsize='normal'><mn>0</mn></mstyle> |
6516 | 1 | <mstyle mathvariant='bold' mathsize='normal'><mo>.</mo><mn>0</mn><mn>2</mn></mstyle> |
6517 | 1 | </mrow> |
6518 | 1 | </msqrt> |
6519 | 1 | </mrow> |
6520 | 1 | </math> |
6521 | 1 | "; |
6522 | 1 | let target_str = "<math> |
6523 | 1 | <msqrt> |
6524 | 1 | <mn mathsize='normal' mathvariant='bold' data-changed='added'>0.02</mn> |
6525 | 1 | </msqrt> |
6526 | 1 | </math>"; |
6527 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6528 | 1 | } |
6529 | ||
6530 | #[test] | |
6531 | 1 | fn mstyle_merge_bug_272() -> Result<()> { |
6532 | 1 | let test_str = r#"<math> |
6533 | 1 | <msup> |
6534 | 1 | <mstyle mathvariant="bold" mathsize="normal"> |
6535 | 1 | <mn>6</mn> |
6536 | 1 | </mstyle> |
6537 | 1 | <mstyle mathvariant="bold" mathsize="normal"> |
6538 | 1 | <mn>9</mn> |
6539 | 1 | </mstyle> |
6540 | 1 | </msup> |
6541 | 1 | </math>"#; |
6542 | 1 | let target_str = "<math> |
6543 | 1 | <msup> |
6544 | 1 | <mn mathsize='normal' mathvariant='bold'>𝟔</mn> |
6545 | 1 | <mn mathsize='normal' mathvariant='bold'>𝟗</mn> |
6546 | 1 | </msup> |
6547 | 1 | </math>"; |
6548 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6549 | 1 | } |
6550 | ||
6551 | ||
6552 | #[test] | |
6553 | 1 | fn munder_mspace_bug_296() -> Result<()> { |
6554 | // this was a "typo" bug that should have looking embellished base | |
6555 | 1 | let test_str = r#"<math> |
6556 | 1 | <mrow><mn>5</mn><mfrac><mn>9</mn><mrow><mn>10</mn></mrow></mfrac> |
6557 | 1 | <munder accentunder="true"><mspace width="2.7em" /><mo stretchy="true">_</mo></munder> |
6558 | 1 | </mrow></math>"#; |
6559 | 1 | let target_str = "<math><mrow> |
6560 | 1 | <mrow data-changed='added'> |
6561 | 1 | <mn>5</mn> |
6562 | 1 | <mo data-changed='added'>⁤</mo> |
6563 | 1 | <mfrac> <mn>9</mn><mn>10</mn> </mfrac> |
6564 | 1 | </mrow> |
6565 | 1 | <munder accentunder='true'> |
6566 | 1 | <mo width='2.7em' data-changed='was-mspace' data-width='2.7' data-empty-in-2D='true' data-function-likelihood='false'> </mo> |
6567 | 1 | <mo stretchy='true'>¯</mo> |
6568 | 1 | </munder> |
6569 | 1 | </mrow></math>"; |
6570 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6571 | 1 | } |
6572 | ||
6573 | #[test] | |
6574 | 1 | fn parse_scripted_open_paren_439() -> Result<()> { |
6575 | // this was a "typo" bug that should have looking embellished base | |
6576 | 1 | let test_str = r#"<math><mrow><msub><mo>(</mo><mn>2</mn></msub><mo>)</mo></mrow></math>"#; |
6577 | 1 | let target_str = "<math><mrow><msub><mo>(</mo><mn>2</mn></msub><mo>)</mo></mrow></math>"; |
6578 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6579 | 1 | } |
6580 | ||
6581 | #[test] | |
6582 | 1 | fn lift_script() -> Result<()> { |
6583 | 1 | let test_str = "<math xmlns='http://www.w3.org/1998/Math/MathML' > |
6584 | 1 | <mrow> |
6585 | 1 | <mstyle scriptlevel='0' displaystyle='true'> |
6586 | 1 | <mrow> |
6587 | 1 | <msqrt> |
6588 | 1 | <munder> |
6589 | 1 | <mo>∑<!-- ∑ --></mo> |
6590 | 1 | <mrow> |
6591 | 1 | <mn>0</mn> |
6592 | 1 | <mo>≤<!-- ≤ --></mo> |
6593 | 1 | <mi>k</mi> |
6594 | 1 | <mo>≤<!-- ≤ --></mo> |
6595 | 1 | <mi>n</mi> |
6596 | 1 | </mrow> |
6597 | 1 | </munder> |
6598 | 1 | <mrow> |
6599 | 1 | <mo stretchy='false'>|</mo> |
6600 | 1 | </mrow> |
6601 | 1 | <msub> |
6602 | 1 | <mi>a</mi> |
6603 | 1 | <mrow> |
6604 | 1 | <mi>k</mi> |
6605 | 1 | </mrow> |
6606 | 1 | </msub> |
6607 | 1 | <msup> |
6608 | 1 | <mrow> |
6609 | 1 | <mo stretchy='false'>|</mo> |
6610 | 1 | </mrow> |
6611 | 1 | <mrow> |
6612 | 1 | <mn>2</mn> |
6613 | 1 | </mrow> |
6614 | 1 | </msup> |
6615 | 1 | </msqrt> |
6616 | 1 | </mrow> |
6617 | 1 | </mstyle> |
6618 | 1 | </mrow> |
6619 | 1 | </math>"; |
6620 | 1 | let target_str = "<math> |
6621 | 1 | <msqrt scriptlevel='0' displaystyle='true'> |
6622 | 1 | <mrow data-changed='added'> |
6623 | 1 | <munder> |
6624 | 1 | <mo>∑</mo> |
6625 | 1 | <mrow> |
6626 | 1 | <mn>0</mn> |
6627 | 1 | <mo>≤</mo> |
6628 | 1 | <mi>k</mi> |
6629 | 1 | <mo>≤</mo> |
6630 | 1 | <mi>n</mi> |
6631 | 1 | </mrow> |
6632 | 1 | </munder> |
6633 | 1 | <msup> |
6634 | 1 | <mrow data-changed='added'> |
6635 | 1 | <mo stretchy='false'>|</mo> |
6636 | 1 | <msub> |
6637 | 1 | <mi>a</mi> |
6638 | 1 | <mi>k</mi> |
6639 | 1 | </msub> |
6640 | 1 | <mo stretchy='false'>|</mo> |
6641 | 1 | </mrow> |
6642 | 1 | <mn>2</mn> |
6643 | 1 | </msup> |
6644 | 1 | </mrow> |
6645 | 1 | </msqrt> |
6646 | 1 | </math>"; |
6647 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6648 | 1 | } |
6649 | ||
6650 | #[test] | |
6651 | 1 | fn pseudo_scripts() -> Result<()> { |
6652 | 1 | let test_str = "<math><mrow> |
6653 | 1 | <mi>cos</mi><mn>30</mn><mo>°</mo> |
6654 | 1 | <mi>sin</mi><mn>60</mn><mo>′</mo> |
6655 | 1 | </mrow></math>"; |
6656 | 1 | let target_str = "<math> |
6657 | 1 | <mrow> |
6658 | 1 | <mrow data-changed='added'> |
6659 | 1 | <mi>cos</mi> |
6660 | 1 | <mo data-changed='added'>⁡</mo> |
6661 | 1 | <msup data-changed='added'><mn>30</mn><mo>°</mo></msup> |
6662 | 1 | </mrow> |
6663 | 1 | <mo data-changed='added'>⁢</mo> |
6664 | 1 | <mrow data-changed='added'> |
6665 | 1 | <mi>sin</mi> |
6666 | 1 | <mo data-changed='added'>⁡</mo> |
6667 | 1 | <msup data-changed='added'><mn>60</mn><mo>′</mo></msup> |
6668 | 1 | </mrow> |
6669 | 1 | </mrow> |
6670 | 1 | </math>"; |
6671 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6672 | 1 | } |
6673 | ||
6674 | #[test] | |
6675 | 1 | fn pseudo_scripts_in_mi() -> Result<()> { |
6676 | 1 | let test_str = "<math><mrow><mi>p'</mi><mo>=</mo><mi>µ°C</mi></mrow></math>"; |
6677 | 1 | let target_str = "<math><mrow><msup><mi>p</mi><mo>′</mo></msup><mo>=</mo><mi>µ°C</mi></mrow></math>"; |
6678 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6679 | 1 | } |
6680 | ||
6681 | #[test] | |
6682 | 1 | fn prescript_only() -> Result<()> { |
6683 | 1 | let test_str = "<math><msub><mtext/><mn>92</mn></msub><mi>U</mi></math>"; |
6684 | 1 | let target_str = "<math><mmultiscripts><mi>U</mi><mprescripts/> <mn>92</mn><none/> </mmultiscripts></math>"; |
6685 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6686 | 1 | } |
6687 | ||
6688 | #[test] | |
6689 | 1 | fn pre_and_postscript_only() -> Result<()> { |
6690 | 1 | let test_str = "<math> |
6691 | 1 | <msub><mrow/><mn>0</mn></msub> |
6692 | 1 | <msub><mi>F</mi><mn>1</mn></msub> |
6693 | 1 | <mo stretchy='false'>(</mo> |
6694 | 1 | <mi>a</mi><mo>,</mo><mi>b</mi><mo>;</mo><mi>c</mi><mo>;</mo><mi>z</mi> |
6695 | 1 | <mo stretchy='false'>)</mo> |
6696 | 1 | </math>"; |
6697 | 1 | let target_str = " <math> |
6698 | 1 | <mrow data-changed='added'> |
6699 | 1 | <mmultiscripts> |
6700 | 1 | <mi>F</mi> |
6701 | 1 | <mn>1</mn> |
6702 | 1 | <none></none> |
6703 | 1 | <mprescripts></mprescripts> |
6704 | 1 | <mn>0</mn> |
6705 | 1 | <none></none> |
6706 | 1 | </mmultiscripts> |
6707 | 1 | <mo data-changed='added'>⁡</mo> |
6708 | 1 | <mrow data-changed='added'> |
6709 | 1 | <mo stretchy='false'>(</mo> |
6710 | 1 | <mrow data-changed='added'> |
6711 | 1 | <mrow data-changed='added'> |
6712 | 1 | <mi>a</mi> |
6713 | 1 | <mo>,</mo> |
6714 | 1 | <mi>b</mi> |
6715 | 1 | </mrow> |
6716 | 1 | <mo>;</mo> |
6717 | 1 | <mi>c</mi> |
6718 | 1 | <mo>;</mo> |
6719 | 1 | <mi>z</mi> |
6720 | 1 | </mrow> |
6721 | 1 | <mo stretchy='false'>)</mo> |
6722 | 1 | </mrow> |
6723 | 1 | </mrow> |
6724 | 1 | </math>"; |
6725 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6726 | 1 | } |
6727 | ||
6728 | #[test] | |
6729 | 1 | fn pointless_nones_in_mmultiscripts() -> Result<()> { |
6730 | 1 | let test_str = "<math><mmultiscripts> |
6731 | 1 | <mtext>C</mtext> |
6732 | 1 | <none /> |
6733 | 1 | <none /> |
6734 | 1 | <mprescripts /> |
6735 | 1 | <mn>6</mn> |
6736 | 1 | <mn>14</mn> |
6737 | 1 | </mmultiscripts></math>"; |
6738 | 1 | let target_str = "<math> |
6739 | 1 | <mmultiscripts data-chem-formula='6'> |
6740 | 1 | <mtext data-chem-element='1'>C</mtext> |
6741 | 1 | <mprescripts></mprescripts> |
6742 | 1 | <mn>6</mn> |
6743 | 1 | <mn>14</mn> |
6744 | 1 | </mmultiscripts> |
6745 | 1 | </math>"; |
6746 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6747 | 1 | } |
6748 | ||
6749 | #[test] | |
6750 | 1 | fn empty_mmultiscripts_485() -> Result<()> { |
6751 | 1 | let test_str = "<math><mmultiscripts> </mmultiscripts></math>"; |
6752 | 1 | let target_str = ""; // shouldn't get to the point of comparing because the input is illegal. |
6753 | 1 | let err = are_strs_canonically_equal_result(test_str, target_str, &[]) |
6754 | 1 | .expect_err("empty mmultiscripts should be rejected"); |
6755 | 1 | assert!( |
6756 | 1 | err.to_string().contains("mmultiscripts has the wrong number of children:\n <mmultiscripts></mmultiscripts>"), |
6757 | "unexpected error message: {err}" | |
6758 | ); | |
6759 | 1 | Ok(()) |
6760 | 1 | } |
6761 | ||
6762 | #[test] | |
6763 | 1 | fn empty_mmultiscripts_544() -> Result<()> { |
6764 | 1 | let test_str = "<math><mmultiscripts><mrow/><mprescripts></mprescripts><mrow/><mrow/></mmultiscripts></math>"; |
6765 | 1 | let target_str = "<math> <mtext data-changed='empty_content' data-width='0'> </mtext></math>"; |
6766 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6767 | 1 | } |
6768 | ||
6769 | #[test] | |
6770 | 1 | fn empty_mrows_in_mmultiscripts_306() -> Result<()> { |
6771 | 1 | let test_str = "<math display='block'> |
6772 | 1 | <mmultiscripts intent='_permutation:prefix(_of,$k,_from,$n)'> |
6773 | 1 | <mi>P</mi> |
6774 | 1 | <mi arg='k'>k</mi> |
6775 | 1 | <mrow/> |
6776 | 1 | <mprescripts/> |
6777 | 1 | <mrow/> |
6778 | 1 | <mi arg='n'>n</mi> |
6779 | 1 | </mmultiscripts> |
6780 | 1 | </math>"; |
6781 | 1 | let target_str = "<math display='block'> |
6782 | 1 | <mmultiscripts intent='_permutation:prefix(_of,$k,_from,$n)'> |
6783 | 1 | <mi>P</mi> |
6784 | 1 | <mi arg='k'>k</mi> |
6785 | 1 | <none></none> |
6786 | 1 | <mprescripts></mprescripts> |
6787 | 1 | <none></none> |
6788 | 1 | <mi arg='n'>n</mi> |
6789 | 1 | </mmultiscripts> |
6790 | 1 | </math>"; |
6791 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6792 | 1 | } |
6793 | ||
6794 | ||
6795 | #[test] | |
6796 | #[ignore] // this fails -- need to figure out grabbing base from previous or next child | |
6797 | 0 | fn tensor() -> Result<()> { |
6798 | 0 | let test_str = "<math> |
6799 | 0 | <msub><mi>R</mi><mi>i</mi></msub> |
6800 | 0 | <msup><mrow/><mi>j</mi></msup> |
6801 | 0 | <msub><mrow/><mi>k</mi></msub> |
6802 | 0 | <msub><mrow/><mi>l</mi></msub> |
6803 | 0 | </math>"; |
6804 | 0 | let target_str = "<math> |
6805 | 0 | <mmultiscripts> |
6806 | 0 | <mi> R </mi> |
6807 | 0 | <mi> i </mi> |
6808 | 0 | <none/> |
6809 | 0 | <none/> |
6810 | 0 | <mi> j </mi> |
6811 | 0 | <mi> k </mi> |
6812 | 0 | <none/> |
6813 | 0 | <mi> l </mi> |
6814 | 0 | <none/> |
6815 | 0 | </mmultiscripts> |
6816 | 0 | </math>"; |
6817 | 0 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6818 | 0 | } |
6819 | ||
6820 | ||
6821 | #[test] | |
6822 | 1 | fn test_nonascii_function_name() -> Result<()> { |
6823 | 1 | let test_str = r#"<math> |
6824 | 1 | <mi mathvariant="bold-italic">x</mi> |
6825 | 1 | <mo>=</mo> |
6826 | 1 | <mn>2</mn> |
6827 | 1 | <mrow> |
6828 | 1 | <mi>𝒔𝒊𝒏</mi> |
6829 | 1 | <mo>⁡</mo> |
6830 | 1 | <mrow><mi mathvariant="bold-italic">t</mi></mrow> |
6831 | 1 | </mrow> |
6832 | 1 | <mo>-</mo> |
6833 | 1 | <mn>1</mn> |
6834 | 1 | </math>"#; |
6835 | 1 | let target_str = r#"<math> |
6836 | 1 | <mrow data-changed='added'> |
6837 | 1 | <mi mathvariant='bold-italic'>𝒙</mi> |
6838 | 1 | <mo>=</mo> |
6839 | 1 | <mrow data-changed='added'> |
6840 | 1 | <mrow data-changed='added'> |
6841 | 1 | <mn>2</mn> |
6842 | 1 | <mo data-changed='added'>⁢</mo> |
6843 | 1 | <mrow> |
6844 | 1 | <mi>sin</mi> |
6845 | 1 | <mo>⁡</mo> |
6846 | 1 | <mi mathvariant='bold-italic'>𝒕</mi> |
6847 | 1 | </mrow> |
6848 | 1 | </mrow> |
6849 | 1 | <mo>-</mo> |
6850 | 1 | <mn>1</mn> |
6851 | 1 | </mrow> |
6852 | 1 | </mrow> |
6853 | 1 | </math>"#; |
6854 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6855 | 1 | } |
6856 | ||
6857 | #[test] | |
6858 | 1 | fn test_nonascii_function_name_as_chars() -> Result<()> { |
6859 | 1 | let test_str = r#"<math display="block"> |
6860 | 1 | <mi>𝒙</mi> |
6861 | 1 | <mo>=</mo> |
6862 | 1 | <mrow> |
6863 | 1 | <mrow> |
6864 | 1 | <mi>𝒄</mi> |
6865 | 1 | <mi>𝒐</mi> |
6866 | 1 | <mi>𝒔</mi> |
6867 | 1 | </mrow> |
6868 | 1 | <mo>⁡</mo> |
6869 | 1 | <mrow> |
6870 | 1 | <mi>𝒕</mi> |
6871 | 1 | </mrow> |
6872 | 1 | </mrow> |
6873 | 1 | <mo>+</mo> |
6874 | 1 | <mn>𝟐</mn> |
6875 | 1 | </math>"#; |
6876 | 1 | let target_str = r#"<math display='block'> |
6877 | 1 | <mrow data-changed='added'> |
6878 | 1 | <mi>𝒙</mi> |
6879 | 1 | <mo>=</mo> |
6880 | 1 | <mrow data-changed='added'> |
6881 | 1 | <mrow> |
6882 | 1 | <mi>cos</mi> |
6883 | 1 | <mo>⁡</mo> |
6884 | 1 | <mi>𝒕</mi> |
6885 | 1 | </mrow> |
6886 | 1 | <mo>+</mo> |
6887 | 1 | <mn>𝟐</mn> |
6888 | 1 | </mrow> |
6889 | 1 | </mrow> |
6890 | 1 | </math>"#; |
6891 | 1 | are_strs_canonically_equal_result(test_str, target_str, &[]) |
6892 | 1 | } |
6893 | ||
6894 | ||
6895 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | ||
3 | // Chemistry terms used here: | |
4 | // chemical formula -- this references a molecule (one or more elements with bonds between them), including its state. | |
5 | // chemical equation -- this is a notation specialized to chemistry -- it has concentration, arrows, equality, "addition" along with | |
6 | // some special symbols for operators and (mostly) chemical formulas for operands. | |
7 | // Operand exceptions are the equilibrium constant, numbers, and identifiers. | |
8 | // Although a chemical equation is a superset of a chemical formula, because we want to distinguish the two (e.g., '=' is in both), | |
9 | // we require that chemical equation is an mrow | |
10 | // FIX?? -- can it be an adorned mrow? | |
11 | // Note: with the current definition, if any element in a potential chem equation is ruled out, the entire mrow is ruled out. | |
12 | // | |
13 | // The general flow is that for every element that looks like a chem formula/equation, we mark it with data-likely-[equation/formula] | |
14 | // After we are done marking "likely", we go back and either delete them or replace them with data-[equation/formula]. | |
15 | // Note: anything already marked with data-[equation/formula] doesn't need recomputation later (essentially the result is cached) | |
16 | // | |
17 | // There is a chicken and egg problem with detecting chemistry: to more reliably detect it, we need good structure. | |
18 | // However, to get the structure right (e.,g "=" being a double bond, not equality; chem elements being in 'mi's; ...), | |
19 | // we need to know "=" is part of a chemical formula. | |
20 | // The imperfect solution used is: | |
21 | // As the final step of each recursive call to 'clean_mathml', | |
22 | // 1. mi/mtext: is it a chemical element(s) or one of the symbols used in chemical formulas (not equations). | |
23 | // If so, mark it MAYBE_CHEMISTRY. | |
24 | // 2. msub/msup/msubsup/mmultiscripts: is base marked MAYBE_CHEMISTRY and the scripts are potential adornments, mark it MAYBE_CHEMISTRY | |
25 | // 3. mrows: these take a few passes (remember, they aren't structured properly yet) | |
26 | // On the assumption that chemistry is not common we implement a "show me" attitude before changing the structure. | |
27 | // Pass 1: | |
28 | // a) for any run of mi/mtext that can be re-split into chem elements, split them and mark them if it is at least 3 chars long | |
29 | // b) if there are any potential chem formula operators (e.g., "=" and ":") and the previous node is marked MAYBE_CHEMISTRY, | |
30 | // mark this as MAYBE_CHEMISTRY | |
31 | // Pass 2: (assuming something was marked in pass 1) | |
32 | // a) find the first marked child and then the last consecutive marked child and trim any mo's from the ends | |
33 | // b) evaluate the likelihood that the sequence is chemistry | |
34 | // yes: replace mathml children with new (potentially restructured) children | |
35 | // no: clear all the marks for the old children | |
36 | // After canonicalization, we take another pass looking for chemical equations and marking them if found. | |
37 | ||
38 | use sxd_document::dom::{Element, Document, ChildOfElement}; | |
39 | use crate::canonicalize::*; | |
40 | use crate::pretty_print::mml_to_string; | |
41 | use crate::xpath_functions::{is_leaf, IsNode}; | |
42 | use regex::Regex; | |
43 | use crate::xpath_functions::IsBracketed; | |
44 | use phf::{phf_map, phf_set}; | |
45 | use std::convert::TryInto; | |
46 | #[allow(unused_imports)] | |
47 | use log::{error, debug}; | |
48 | use std::collections::HashSet; | |
49 | use std::cmp::Ordering; | |
50 | use crate::errors::*; | |
51 | use std::sync::LazyLock; | |
52 | ||
53 | ||
54 | pub static NOT_CHEMISTRY: i32 = -10000; // should overwhelm any positive signal | |
55 | static NOT_CHEMISTRY_THRESHOLD: i32 = -10000/2; // value for testing -- that way some can be added to NOT_CHEMISTRY and still meet the test | |
56 | static CHEMISTRY_THRESHOLD: i32 = 5; // if this changes, change CHEMISTRY_THRESHOLD_STR | |
57 | ||
58 | ||
59 | /// this might be chemistry -- should only exist during canonicalization | |
60 | pub static MAYBE_CHEMISTRY: &str = "data-maybe-chemistry"; | |
61 | ||
62 | /// Attr flag to indicate chemical equation | |
63 | static CHEM_EQUATION: &str = "data-chem-equation"; | |
64 | /// Attr flag to indicate chemical formula | |
65 | static CHEM_FORMULA: &str = "data-chem-formula"; | |
66 | /// Attr flag to indicate chemical element | |
67 | static CHEM_ELEMENT: &str = "data-chem-element"; | |
68 | static CHEM_FORMULA_OPERATOR: &str = "data-chem-formula-op"; | |
69 | static CHEM_EQUATION_OPERATOR: &str = "data-chem-equation-op"; | |
70 | static CHEM_STATE: &str = "data-chem-state"; | |
71 | ||
72 | /// mark a new chem element that happened due to splitting a leaf | |
73 | pub static SPLIT_TOKEN: &str = "data-split"; | |
74 | ||
75 | /// mark a new chem element that happened due to merging two leaves | |
76 | static MERGED_TOKEN: &str = "data-merged"; | |
77 | ||
78 | /// these can be in the base of an under/over script | |
79 | 6.64k | fn is_chem_equation_arrow(ch: char) -> bool { |
80 | 6.64k |
|
81 | '→' | '➔' | '←' | '⟶' | '⟵' | '⤻' | '⇋' | '⇌' | | |
82 | '↑' | '↓' | '↿' | '↾' | '⇃' | '⇂' | '⥮' | '⥯' | '⇷' | '⇸' | '⤉' | '⤈' | | |
83 | '⥂' | '⥄' | '⥃' | | |
84 | '\u{1f8d0}' | '\u{1f8d1}' | '\u{1f8d2}' | '\u{1f8d3}' | '\u{1f8d4}' | '\u{1f8d5}' // proposed Unicode equilibrium arrows | |
85 | ) | |
86 | 6.64k | } |
87 | ||
88 | // Returns true if the 'property' (should have ":") is in the intent | |
89 | 196k | fn has_chem_intent(mathml: Element, property: &str) -> bool { |
90 | 196k | if let Some( |
91 | 16.9k | let head = intent.split('(').next().unwrap(); |
92 | 16.9k | return head.contains(property); |
93 | 179k | } |
94 | 179k | return false; |
95 | 196k | } |
96 | ||
97 | 26.7k | fn has_inherited_property(mathml: Element, property: &str) -> bool { |
98 | 26.7k | let mut current = mathml; |
99 | loop { | |
100 | 101k | if has_chem_intent(current, property) { |
101 | 0 | return true; |
102 | 101k | } |
103 | // chem might not be temp node without a 'math' parent | |
104 | 101k | if name(current) == "math" || |
105 | 26.7k | break; |
106 | 74.6k | } |
107 | 74.6k | current = get_parent(current); |
108 | } | |
109 | 26.7k | return false; |
110 | 26.7k | } |
111 | ||
112 | 30.2k | pub fn is_chemistry_off(mathml: Element) -> bool { |
113 | 30.2k | if has_chem_intent(mathml, ":chemical-formula") || has_chem_intent(mathml, ":chemical-equation") { |
114 | 4 | return false; |
115 | 30.2k | } |
116 | 30.2k | let pref_manager = crate::prefs::PreferenceManager::get(); |
117 | 30.2k | return pref_manager.borrow().pref_to_string("Chemistry") == "Off"; |
118 | 30.2k | } |
119 | ||
120 | 10.1k | pub fn clean_chemistry_mrow(mathml: Element) { |
121 | 10.1k | if is_chemistry_off(mathml) { |
122 | 0 | return; |
123 | 10.1k | } |
124 | // debug!("clean_chemistry_mrow:\n{}", mml_to_string(mathml)); | |
125 | 10.1k | let mut children = mathml.children().iter() |
126 | 31.3k | . |
127 | 10.1k | .collect::<Vec<Element>>(); |
128 | 10.1k | if let Some( |
129 | 246 | mathml.replace_children(&new_children); |
130 | 246 | children = new_children; |
131 | 9.93k | } |
132 | 10.1k | clean_mrow_children_mark_pass(&children); |
133 | 10.1k | } |
134 | ||
135 | /// Do some aggressive structural changes and if they make this look like a chemistry formula, mark it as one else remove other marks | |
136 | /// Note: the element is replaced with a new restructured element if it is marked as chemistry | |
137 | /// Pass 1: | |
138 | /// a) for any run of mi/mtext that can be re-split into chem elements, split them and mark them if it is at least 3 chars long. | |
139 | /// Also split "(g)", etc., when in mi/mtext | |
140 | /// b) if there are any potential chem formula operators (e.g., "=" and ":") and the previous node is marked MAYBE_CHEMISTRY, | |
141 | /// mark this as MAYBE_CHEMISTRY | |
142 | 10.1k | fn clean_mrow_children_restructure_pass<'a>(old_children: &[Element<'a>]) -> Option<Vec<Element<'a>>> { |
143 | 10.1k | let mut changed = false; |
144 | 10.1k | let mut new_children = Vec::with_capacity(2*old_children.len()); |
145 | 10.1k | let mut i = 0; |
146 | 40.7k | while i < old_children.len() { |
147 | 30.6k | if let Some( |
148 | 1 | new_children.push(paren_mrow_aq); |
149 | 1 | i += 4; // skipping "( a q )" |
150 | 1 | changed = true; |
151 | 1 | continue; |
152 | } else { | |
153 | 30.6k | let child = old_children[i]; |
154 | 30.6k | let child_name = name(child); |
155 | 30.6k | if child_name == "mi" || ( |
156 | // break mi/mtext that is done as "(g)", etc. Even if it isn't 'g', 'l', etc., it probably shouldn't be an mi/text. | |
157 | 8.62k | let text = as_text(child); |
158 | 8.62k | if text.starts_with('(') && |
159 | 4 | let doc = child.document(); |
160 | 4 | let state = create_mathml_element(&doc, "mi"); |
161 | 4 | state.set_text(&text[1..text.len()-1]); |
162 | 4 | let open = create_mathml_element(&doc, "mo"); |
163 | 4 | open.set_text("("); |
164 | 4 | let close = create_mathml_element(&doc, "mo"); |
165 | 4 | close.set_text(")"); |
166 | 4 | let mrow = create_mathml_element(&doc, "mrow"); |
167 | 4 | mrow.append_children(&[open,state,close]); |
168 | 4 | new_children.push(mrow); |
169 | 4 | i += 1; |
170 | 4 | changed = true; |
171 | 4 | continue; |
172 | 8.62k | } |
173 | 21.9k | } else if i + 2 < old_children.len() { |
174 | // wrap with an mrow if we are not already an 'mrow' | |
175 | 9.68k | let parent = get_parent(child); // safe since 'math' is always at root |
176 | 9.68k | if !(name(parent) == "mrow" && |
177 | 8.68k | let Some( |
178 | // debug!("make_mrow added mrow"); | |
179 | 377 | new_children.push(paren_mrow); |
180 | 377 | i += 3; |
181 | 377 | changed = true; |
182 | 377 | continue; |
183 | 9.30k | } |
184 | 12.3k | } |
185 | 30.2k | if child_name == "mo" { |
186 | 9.50k | let likely_chemistry_op = likely_chem_formula_operator(child); |
187 | // debug!("clean_mrow_children_restructure_pass -- in mo: likely {}, {}", likely_chemistry_op, mml_to_string(child)); | |
188 | 9.50k | if likely_chemistry_op >= 0 { |
189 | // if possible chemistry to left and right, then override text for operator lookup | |
190 | // note: on the right, we haven't set chem flag for operators yet, so we skip them | |
191 | 2.98k | let preceding = child.preceding_siblings(); |
192 | 2.98k | let following = child.following_siblings(); |
193 | 2.98k | if !preceding.is_empty() && |
194 | 1.84k | ( has_inherited_property(child, "chemical-formula") || |
195 | 2.27k |
|
196 | 2.27k | let child = as_element(child); |
197 | 2.27k | name(child)=="mn" || |
198 | 574 |
|
199 | 574 | let child = as_element(child); |
200 | 574 | name(child)=="mo" || |
201 | 574 | })) { |
202 | 146 | // "=", etc., should be treated as high priority separators |
203 | 146 | // debug!("clean_mrow_children_restructure: child = {}", mml_to_string(child)); |
204 | 146 | child.set_attribute_value(CHEMICAL_BOND, "true"); |
205 | 146 | child.set_attribute_value(CHEM_FORMULA_OPERATOR, &likely_chemistry_op.to_string()); |
206 | 146 | child.set_attribute_value(MAYBE_CHEMISTRY, &likely_chemistry_op.to_string()); |
207 | 2.83k | } |
208 | 6.52k | } else { |
209 | 6.52k | likely_chem_equation_operator(child); // need to mark MAYBE_CHEMISTRY for CHEMICAL_BOND tests |
210 | 6.52k | } |
211 | 20.7k | } else if child_name == "mrow" && |
212 | 2.05k | let Some( |
213 | 1 | latex_value == r"\mathrel{\longrightleftharpoons}" { |
214 | 0 | child.set_attribute_value("data-unicode", "\u{1f8d2}"); |
215 | 0 | child.set_attribute_value(MAYBE_CHEMISTRY, "2"); // same as is_hack_for_missing_arrows() |
216 | 20.7k | } |
217 | 30.2k | i += 1; |
218 | 30.2k | new_children.push(child); |
219 | } | |
220 | } | |
221 | ||
222 | 10.1k | return if changed { |
223 | ||
224 | ||
225 | /// if it looks like we have ChemFormula ( a q ), merge the 'a' and 'q' together into an 'mi' | |
226 | /// if not already true, structure '( aq )' into a single mrow (might be other elements on either side) | |
227 | /// returns the last char matched | |
228 | 30.6k | fn clean_aq_state<'a>(children: &[Element<'a>], i: usize) -> Option<Element<'a>> { |
229 | 30.6k | if i+3 >= children.len() || ( |
230 | 27.8k | return None; // can't be '( a q )' -- not enough elements left or not Chem Formula on left |
231 | 2.79k | } |
232 | ||
233 | // this is a little sloppy in that we allow matching text in any leaf element, but we can use the same function | |
234 | 2.79k | if is_text(children[i], "(") && |
235 | 244 | is_text(children[i+1], "a") && |
236 | 1 | is_text(children[i+3], ")") { |
237 | 1 | let mi = create_mathml_element(&children[i].document(), "mi"); |
238 | 1 | mi.set_text("aq"); |
239 | 1 | return make_mrow([children[i], mi, children[i+3]]); |
240 | 2.79k | } |
241 | 2.79k | return None; |
242 | 30.6k | } |
243 | ||
244 | 12.3k | fn is_text(node: Element, target: &str) -> bool { |
245 | 12.3k | return is_leaf(node) && |
246 | 12.3k | } |
247 | ||
248 | /// Converts "( child )" to mrow with those elements as children. | |
249 | /// This is to make ascertaining whether this is a chemical state easier, but it is correct even if not a chemical state. | |
250 | 8.68k | fn make_mrow(children: [Element; 3]) -> Option<Element> { |
251 | // this is a little sloppy in that we allow matching text in any leaf element, but we can use the same function | |
252 | 8.68k | if is_text(children[0], "(") && |
253 | 631 | is_text(children[2], ")") { |
254 | 378 | let mrow = create_mathml_element(&children[0].document(), "mrow"); |
255 | 378 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
256 | 378 | mrow.append_children(children); |
257 | 378 | return Some(mrow); |
258 | 8.31k | } |
259 | 8.31k | return None; |
260 | 8.68k | } |
261 | 10.1k | } |
262 | ||
263 | /// Pass 2: (assuming something was marked in pass 1) | |
264 | /// a) find the first marked child and then the last consecutive marked child and trim any mo's from the ends | |
265 | /// b) evaluate the likelihood that the sequence is chemistry | |
266 | 10.1k | fn clean_mrow_children_mark_pass(children: &[Element]) { |
267 | 10.1k | let mut start = None; |
268 | 30.6k | for i in |
269 | 30.6k | let child = children[i]; |
270 | 30.6k | if child.attribute(MAYBE_CHEMISTRY).is_some() { |
271 | 4.64k | if start.is_none() { |
272 | 3.63k | if name(child) == "mo" { |
273 | 2.38k | // debug!(" start.is_none(): removing MAYBE_CHEMISTRY on {}", as_text(child)); |
274 | 2.38k | child.remove_attribute(MAYBE_CHEMISTRY); |
275 | 2.38k | child.remove_attribute(CHEM_FORMULA_OPERATOR); |
276 | 2.38k | child.remove_attribute(CHEM_EQUATION_OPERATOR); |
277 | 2.38k | child.remove_attribute(CHEMICAL_BOND); |
278 | 2.38k | } else { |
279 | 1.25k | start = Some(i); |
280 | 1.25k | } |
281 | 1.00k | } |
282 | 25.9k | } else if let Some( |
283 | 804 | remove_operators_at_end_of_sequence(children, seq_start, i) { |
284 | 804 | start = None; |
285 | 25.1k | } |
286 | } | |
287 | ||
288 | 10.1k | if let Some( |
289 | 452 | remove_operators_at_end_of_sequence(children, seq_start, children.len()); |
290 | 9.73k | } |
291 | 10.1k | return; |
292 | ||
293 | ||
294 | 1.25k | fn remove_operators_at_end_of_sequence(children: &[Element], start: usize, end: usize) -> bool { |
295 | // debug!(" looking for ops at end of {}..{}, last is:{}", start, end, mml_to_string(children[end-1])); | |
296 | 1.45k | for stop in ( |
297 | 1.45k | let end_child = children[stop]; |
298 | 1.45k | if name(end_child) == "mo" { |
299 | 202 | end_child.remove_attribute(MAYBE_CHEMISTRY); |
300 | 202 | } else { |
301 | 1.25k | return true; |
302 | } | |
303 | } | |
304 | 0 | return false |
305 | 1.25k | } |
306 | 10.1k | } |
307 | ||
308 | ||
309 | /// Very little software gets the token elements for chemistry right. | |
310 | /// Sometimes multiple elements are in a single token (e.g. "NaCl") and sometimes | |
311 | /// a single element is spread across multiple tokens (e.g. "N", "a"). | |
312 | /// | |
313 | /// Here we attempt one or the other repair, but not both on the assumption there is | |
314 | /// consistency in the error. | |
315 | /// | |
316 | /// Returns a Vec of the chemical elements or None. If a merge happened, the tree is altered. | |
317 | 12.3k | pub fn convert_leaves_to_chem_elements(mathml: Element) -> Option<Vec<Element>> { |
318 | // gather up all the consecutive mi/mtext | |
319 | 12.3k | if !(name(mathml) == "mi" || |
320 | 0 | return None; // do nothing |
321 | 12.3k | } |
322 | ||
323 | // we play games with the string to avoid allocation... | |
324 | 12.3k | let token_string = as_text(mathml); |
325 | 12.3k | if !token_string.is_ascii() { |
326 | 2.67k | return None; // chemical elements are ASCII |
327 | 9.62k | } |
328 | 9.62k | let doc = mathml.document(); |
329 | 9.62k | if token_string.len() > 1 { // safe because all chars are ASCII |
330 | 2.54k | return split_string_chem_element(&doc, mathml); |
331 | 7.08k | } |
332 | 7.08k | let parent = get_parent(mathml); |
333 | 7.08k | let parent_name = name(parent); |
334 | 7.08k | if !(parent_name == "mrow" || |
335 | 2.57k | return None; // only try to merge if in an mrow |
336 | 4.50k | } |
337 | 4.50k | let answer = merge_tokens_chem_element(&doc, mathml, &mathml.following_siblings()); |
338 | 4.50k | return answer; |
339 | ||
340 | ||
341 | 4.50k | fn merge_tokens_chem_element<'a>(doc: &Document<'a>, leaf: Element<'a>, following_siblings: &[ChildOfElement<'a>]) -> Option<Vec<Element<'a>>> { |
342 | 4.50k | if following_siblings.is_empty() { |
343 | 1.22k | return None; |
344 | 3.28k | } |
345 | 3.28k | let second_element = as_element(following_siblings[0]); |
346 | 3.28k | let second_element_name = name(second_element); |
347 | 3.28k | if second_element_name != "mi" && |
348 | 3.02k | return None; |
349 | 256 | } |
350 | 256 | let second_element_text = as_text(second_element); |
351 | 256 | if second_element_text.len() != 1 { |
352 | 57 | return None; |
353 | 199 | } |
354 | 199 | let token_string = as_text(leaf); |
355 | 199 | let chem_token_string = vec![token_string.as_bytes()[0], second_element_text.as_bytes()[0]]; |
356 | 199 | if let Some( |
357 | 4 | chem_element.set_text(as_text(chem_element)); |
358 | 4 | chem_element.set_attribute_value(MAYBE_CHEMISTRY, chem_element.attribute_value(MAYBE_CHEMISTRY).unwrap()); |
359 | 4 | chem_element.set_attribute_value(MERGED_TOKEN, "true"); |
360 | 4 | second_element.remove_from_parent(); |
361 | 4 | return Some(vec![chem_element]); |
362 | 195 | } |
363 | 195 | return None; |
364 | 4.50k | } |
365 | ||
366 | /// split the string which has been checked to be all ASCII chars | |
367 | 2.54k | fn split_string_chem_element<'a>(doc: &Document<'a>, leaf: Element<'a>) -> Option<Vec<Element<'a>>> { |
368 | 2.54k | let token_string = as_text(leaf).as_bytes(); |
369 | 2.54k | let token_len = token_string.len(); |
370 | 2.54k | let mut j = 0; |
371 | 2.54k | let mut new_children = Vec::with_capacity(token_string.len()); |
372 | 3.31k | while j < token_len { |
373 | // try elements of length 2 and 1, preferring longer elements (e.g., prefer "Na" over "N") | |
374 | 2.94k | if let Some( |
375 | 310 | new_children.push(chem_element); |
376 | 310 | j += 2; |
377 | 310 | continue; |
378 | 2.63k | } else if let Some( |
379 | 457 | new_children.push(chem_element); |
380 | 457 | j += 1; |
381 | 457 | continue; |
382 | 2.18k | } |
383 | 2.18k | return None; // didn't find a valid chem element |
384 | } | |
385 | 362 | if new_children.len() <= 1 { |
386 | 231 | return None; |
387 | 131 | } |
388 | 131 | add_attrs(new_children[new_children.len()-1], &leaf.attributes()); |
389 | 131 | new_children[new_children.len()-1].set_attribute_value(SPLIT_TOKEN, "true"); |
390 | // debug!("split_string_chem_element: {} -> {}", String::from_utf8(token_string.to_vec()).unwrap(), new_children.len()); | |
391 | 131 | return Some(new_children); |
392 | 2.54k | } |
393 | ||
394 | /// Returns element or None | |
395 | 5.78k | fn get_chem_element<'a>(doc: &Document<'a>, bytes_str: &[u8], n: usize) -> Option<Element<'a>> { |
396 | use std::str; | |
397 | 5.78k | let len = bytes_str.len(); |
398 | 5.78k | if n > len { |
399 | 277 | return None; // can't be an chemical letter |
400 | 5.50k | } |
401 | 5.50k | match str::from_utf8(&bytes_str[..n]) { |
402 | 5.50k | Ok(chem_element) => { |
403 | 5.50k | if CHEMICAL_ELEMENT_ELECTRONEGATIVITY.contains_key(chem_element) { |
404 | 771 | return Some(new_chemical_element(doc, chem_element)); |
405 | 4.73k | } |
406 | 4.73k | return None; |
407 | } | |
408 | 0 | Err(_) => return None, |
409 | } | |
410 | 5.78k | } |
411 | ||
412 | 771 | fn new_chemical_element<'a>(doc: &Document<'a>, chem_element_str: &str) -> Element<'a> { |
413 | 771 | let result = create_mathml_element(doc, "mi"); |
414 | 771 | result.set_text(chem_element_str); |
415 | 771 | result.set_attribute_value(MAYBE_CHEMISTRY, if chem_element_str.len() == 1 { |
416 | 771 | if chem_element_str.len() == 1 { |
417 | 457 | result.set_attribute_value("mathvariant", "normal"); |
418 | 457 |
|
419 | 771 | return result; |
420 | 771 | } |
421 | 12.3k | } |
422 | ||
423 | /// Looks at the children of the element and uses heuristics to decide whether this is a chemical equation/formula | |
424 | /// If it is, it is marked with either data-chem-equation or data-chem-formula | |
425 | /// This function assumes proper structure | |
426 | /// | |
427 | /// Returns true if not chemistry -- added attrs, mrows, and leaves are removed in preparation for a second parse | |
428 | 5.05k | pub fn scan_and_mark_chemistry(mathml: Element) -> bool { |
429 | 5.05k | if is_chemistry_off(mathml) { |
430 | 0 | return true; |
431 | 5.05k | } |
432 | ||
433 | 5.05k | let child = as_element(mathml.children()[0]); |
434 | // debug!("scan_and_mark_chemistry:\n{}", mml_to_string(child)); | |
435 | 5.05k | assert_eq!(name(mathml), "math"); |
436 | 5.05k | let is_chemistry = if let Some( |
437 | // MathJax v4 includes this really useful info -- if it starts \ce -- we have Chemistry | |
438 | // need to determine if it is an equation or a formula | |
439 | 5 | latex.trim_start().starts_with(r"\ce") |
440 | } else { | |
441 | 5.05k | has_chem_intent(mathml, ":chemical-formula") || has_chem_intent(mathml, ":chemical-equation") |
442 | }; | |
443 | ||
444 | 5.05k | if is_chemistry || |
445 | 669 | assert_eq!(mathml.children().len(), 1); |
446 | 669 | let likelihood = likely_chem_formula(child); |
447 | 669 | if likelihood >= CHEMISTRY_THRESHOLD || |
448 | 211 | child.set_attribute_value(MAYBE_CHEMISTRY, std::cmp::max(CHEMISTRY_THRESHOLD, likelihood).to_string().as_str()); |
449 | 211 | set_marked_chemistry_attr(child, CHEM_FORMULA); |
450 | 458 | } |
451 | ||
452 | 669 | if child.attribute(CHEM_FORMULA).is_none() { |
453 | // can't be both an equation and a formula... | |
454 | 458 | let likelihood = likely_chem_equation(child); |
455 | 458 | if is_chemistry || |
456 | 36 | child.set_attribute_value(MAYBE_CHEMISTRY, std::cmp::max(CHEMISTRY_THRESHOLD, likelihood).to_string().as_str()); |
457 | 36 | set_marked_chemistry_attr(child, CHEM_EQUATION); |
458 | 422 | } |
459 | 211 | } |
460 | 4.38k | } |
461 | // debug!("...after marking:\n{}", mml_to_string(child)); | |
462 | ||
463 | 5.05k | if child.attribute(CHEM_FORMULA).is_none() && |
464 | 4.80k | if !has_maybe_chemistry(mathml) { |
465 | 3.68k | return true; // quick check avoids needing a second parse due to removing added elements |
466 | 1.12k | } |
467 | 1.12k | return !is_changed_after_unmarking_chemistry(mathml); |
468 | } else { | |
469 | 247 | return true; |
470 | } | |
471 | 5.05k | } |
472 | ||
473 | // returns the marked attr value or None | |
474 | 16.2k | fn get_marked_value(mathml: Element) -> Option<i32> { |
475 | 16.2k | return mathml.attribute_value(MAYBE_CHEMISTRY).map(|value| |
476 | 16.2k | } |
477 | ||
478 | /// Sets the attr 'chem' | |
479 | /// Recurse through all the children that have MAYBE_CHEMISTRY set | |
480 | 4.24k | fn set_marked_chemistry_attr(mathml: Element, chem: &str) { |
481 | 4.24k | let tag_name = name(mathml); |
482 | 4.24k | if let Some( |
483 | 2.88k | maybe_attr.remove_from_parent(); |
484 | ||
485 | 2.88k | match tag_name { |
486 | 2.88k | "mi" | |
487 | 2.03k | "mo" => { |
488 | 686 | if mathml.attribute(CHEM_FORMULA_OPERATOR).is_none() && |
489 | // don't mark as both formula and equation | |
490 | 433 | mathml.set_attribute_value(if chem == CHEM_FORMULA { |
491 | 253 | } |
492 | }, | |
493 | 1.35k | "mn" => |
494 | 1.26k | "mrow" | |
495 | 1.25k | let mut chem_name = chem; |
496 | 1.25k | if tag_name != "mrow" && |
497 | // look at base -- if an mi/mtext then this is really a chemical formula | |
498 | 69 | let base = as_element(mathml.children()[0]); |
499 | 69 | let base_name = name(base); |
500 | 69 | if base_name == "mi" || |
501 | 63 | chem_name = CHEM_FORMULA; |
502 | 63 |
|
503 | 1.18k | } |
504 | ||
505 | 1.25k | if mathml.attribute(CHEM_FORMULA).is_none() { |
506 | 1.23k | // don't mark as both formula and equation |
507 | 1.23k | mathml.set_attribute_value(chem_name, maybe_attr.value()); |
508 | 1.23k |
|
509 | 3.92k | for child in |
510 | 3.92k | set_marked_chemistry_attr(as_element(child), chem); |
511 | 3.92k | }; |
512 | } | |
513 | 10 | "mfrac" => { |
514 | 0 | let children = mathml.children(); |
515 | // debug!("mfrac children: {}", mml_to_string(mathml)); | |
516 | 0 | let numerator_is_chem_equation = IsBracketed::is_bracketed(as_element(children[0]), "[", "]", false, true); |
517 | 0 | let denominator_is_chem_equation = IsBracketed::is_bracketed(as_element(children[1]), "[", "]", false, true); |
518 | 0 | if numerator_is_chem_equation && denominator_is_chem_equation { |
519 | 0 | mathml.set_attribute_value(CHEM_EQUATION, "true"); |
520 | 0 | } |
521 | } | |
522 | 10 | _ => error!("Internal error: {tag_name} should not be marked as 'MAYBE_CHEMISTRY'"), |
523 | } | |
524 | 1.35k | } else if tag_name == "mrow" { |
525 | // could have been added during canonicalization, so never marked. Recurse to the children | |
526 | 68 | for child in |
527 | 68 | set_marked_chemistry_attr(as_element(child), chem); |
528 | 68 | }; |
529 | 1.32k | } |
530 | 4.24k | } |
531 | ||
532 | /// returns true if MAYBE_CHEMISTRY's occur within the element | |
533 | 41.3k | fn has_maybe_chemistry(mathml: Element) -> bool { |
534 | 41.3k | if mathml.attribute(MAYBE_CHEMISTRY).is_some() { |
535 | 1.12k | return true; |
536 | 40.2k | } |
537 | 40.2k | if !is_leaf(mathml) { |
538 | 36.5k | for child in |
539 | 36.5k | if has_maybe_chemistry(as_element(child)) { |
540 | 3.15k | return true; |
541 | 33.3k | } |
542 | } | |
543 | 22.2k | } |
544 | 37.0k | return false; |
545 | 41.3k | } |
546 | ||
547 | /// Clears MAYBE_CHEMISTRY from this element and its decedents | |
548 | /// Also deletes added mrows and leaves; returns true if anything is deleted | |
549 | 19.7k | fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool { |
550 | 19.7k | mathml.remove_attribute(MAYBE_CHEMISTRY); |
551 | 19.7k | if is_leaf(mathml) { |
552 | // don't bother testing for the attr -- just remove and nothing bad happens if they aren't there | |
553 | 13.3k | mathml.remove_attribute(CHEM_FORMULA_OPERATOR); |
554 | 13.3k | mathml.remove_attribute(CHEM_EQUATION_OPERATOR); |
555 | 13.3k | mathml.remove_attribute(CHEMICAL_BOND); |
556 | 13.3k | if mathml.attribute(MERGED_TOKEN).is_some() { |
557 | 3 | unmerge_element(mathml); |
558 | 3 | return true; // need to re-parse |
559 | 13.3k | } else if mathml.attribute(SPLIT_TOKEN).is_some() { |
560 | 33 | if let Err( |
561 | 0 | panic!("{}", err); |
562 | 33 | } |
563 | // debug!("After merge_element:{}", mml_to_string(mathml)); | |
564 | // let parent = get_parent(mathml); | |
565 | // debug!("After merge_element: -- parent{}", mml_to_string(parent)); | |
566 | ||
567 | 13.3k | } else if let Some( |
568 | 2.14k | changed_value == ADDED_ATTR_VALUE && |
569 | 2.11k | name(mathml) != "mtext" { // a hack fix for #477 (chem never modifies mtext, so this is ok) |
570 | 2.11k | mathml.remove_from_parent(); |
571 | 2.11k | return true; |
572 | 11.1k | } |
573 | 11.2k | return false; |
574 | 6.38k | } else if IsNode::is_scripted(mathml) && |
575 | 1.04k | name(as_element(mathml.children()[0])) == "mi" && |
576 | 575 | as_element(mathml.children()[0]).attribute(SPLIT_TOKEN).is_some() { |
577 | // Undo a split that happened in a scripted element. | |
578 | // We put the preceding elements into the base and call merge_element on the last element of the base | |
579 | // The first and/or the last child in the sequence could be a script that needs to be unwrapped | |
580 | 1 | let mut parent = get_parent(mathml); // there is always a "math" node |
581 | // debug!("mathml:\n{}", mml_to_string(mathml)); | |
582 | // debug!("parent before merge:\n{}", mml_to_string(parent)); | |
583 | // debug!("grandparent before merge:\n{}", mml_to_string(get_parent(parent))); | |
584 | ||
585 | 1 | let mut preceding_children = mathml.preceding_siblings(); |
586 | // could be no preceding children to canonicalization creating mrows (see issue #303), so might need to use parent, etc | |
587 | 2 | while preceding_children.is_empty() { |
588 | 1 | preceding_children = parent.preceding_siblings(); |
589 | 1 | if name(parent) == "math" { |
590 | 0 | break; // consider {SIN}^{-1} -- no preceding child |
591 | 1 | } |
592 | 1 | parent = get_parent(parent); |
593 | } | |
594 | ||
595 | 1 | let mut new_script_children = vec![]; |
596 | 1 | if !preceding_children.is_empty() { |
597 | // deal with the first element (if it needs unwrapping, it has only prescripts) | |
598 | 1 | let first_element_of_split = as_element(preceding_children[preceding_children.len()-1]); |
599 | // debug!("first_element_of_split: \n{}", mml_to_string(first_element_of_split)); | |
600 | 1 | if name(first_element_of_split) == "mmultiscripts" { |
601 | // take the base and make it the first child of preceding_children (what will get merged) | |
602 | // put the rest of the elements (the prescripts) at the end of the parent last element (mathml) which must be an mmultiscripts | |
603 | 0 | let first_element_children = first_element_of_split.children(); |
604 | 0 | assert_eq!(name(mathml), "mmultiscripts"); |
605 | 0 | let mut script_children = mathml.children(); |
606 | 0 | assert_eq!(name(as_element(script_children[0])), "mi"); |
607 | 0 | assert!(!script_children.len().is_multiple_of(2)); // doesn't have <mprescripts/> |
608 | 0 | script_children.push(first_element_children[1]); // mprescripts |
609 | 0 | script_children.push(first_element_children[2]); // prescripts subscript |
610 | 0 | script_children.push(first_element_children[3]); // prescripts superscript |
611 | ||
612 | 0 | let base_of_first_element = first_element_children[0]; // base |
613 | 0 | assert_eq!(name(as_element(base_of_first_element)), "mi"); |
614 | 0 | let script_base = as_element(script_children[0]); |
615 | 0 | let mut merged_base_text = as_text( as_element(base_of_first_element)).to_string(); |
616 | 0 | merged_base_text.push_str(as_text(script_base)); |
617 | 0 | script_base.set_text(&merged_base_text); |
618 | 0 | script_base.remove_attribute("mathvariant"); |
619 | 0 | script_base.remove_attribute(ADDED_ATTR_VALUE); |
620 | 0 | script_base.remove_attribute(MAYBE_CHEMISTRY); |
621 | 0 | script_base.remove_attribute(SPLIT_TOKEN); |
622 | 0 | mathml.replace_children(script_children); |
623 | ||
624 | 0 | first_element_of_split.remove_from_parent(); |
625 | 0 | return true; |
626 | 1 | } |
627 | 1 | new_script_children.push(ChildOfElement::Element(first_element_of_split)); |
628 | 0 | } |
629 | 1 | debug!("mathml after handling preceding children:\n{}", |
630 | 1 | let mut children_of_script = mathml.children(); |
631 | 1 | let split_child = as_element(children_of_script[0]); |
632 | 1 | new_script_children.append(&mut children_of_script); |
633 | 1 | mathml.replace_children(new_script_children); // temporarily has bad number of children |
634 | // debug!("After making bad script:\n{}", mml_to_string(mathml)); | |
635 | 1 | if let Err( |
636 | 0 | panic!("{}", err); |
637 | 1 | } |
638 | 1 | return true; |
639 | } else { | |
640 | 6.37k | let mut answer = false; |
641 | 18.5k | for child in |
642 | 18.5k | let child = as_element(child); |
643 | 18.5k | if name(child) == "mtd" && |
644 | 2 | answer = true; // each mtd acts as a potential island for chemistry, so don't clear it |
645 | 18.5k | } else { |
646 | 18.5k | answer |= is_changed_after_unmarking_chemistry(child); |
647 | 18.5k | } |
648 | } | |
649 | 6.37k | if name(mathml) == "mrow" { |
650 | 3.58k | if let Some( |
651 | // we added an mrow, we can remove it -- but this might be already processed which is the case if "data-id-added" is true (exists) | |
652 | 2.86k | if changed_value == ADDED_ATTR_VALUE && mathml.attribute("data-id-added").is_none() { |
653 | // mrows get added for several reasons. One of them is to canonicalize elements like msqrt that can have 1 or more children; | |
654 | // those should not get removed because the re-parse doesn't add those | |
655 | // Although they would never be added, elements with fixed number of children also shouldn't have the mrow go away | |
656 | // We are left with only removing mrows with one child or mrows that are children of mrows (simpler test than ELEMENTS_WITH_ONE_CHILD) | |
657 | 2.86k | let parent = get_parent(mathml); // mathml is mrow, so parent always exists |
658 | 2.86k | if mathml.children().len() == 1 || |
659 | 6.26k | let |
660 | 2.31k | mathml.remove_attribute(CHANGED_ATTR); // if just one child, the attrs are pushed onto the child |
661 | // debug!("is_changed_after_unmarking: before replace - parent\n{}", mml_to_string(parent)); | |
662 | 2.31k | replace_children(mathml, children); |
663 | // debug!("is_changed_after_unmarking: parent\n{}", mml_to_string(parent)); | |
664 | ||
665 | 557 | } |
666 | 0 | } |
667 | 720 | } |
668 | 3.58k | return true; |
669 | 2.79k | } |
670 | 2.79k | return answer; |
671 | } | |
672 | ||
673 | 3 | fn unmerge_element(mathml: Element) { |
674 | // a merged token occurs when two single letters get merged into one. Here we recreate the two tokens | |
675 | 3 | assert!(is_leaf(mathml)); |
676 | // debug!("unmerge_element: {}", mml_to_string(mathml)); | |
677 | 3 | let mut token_str = as_text(mathml).chars(); |
678 | 3 | let first = create_mathml_element(&mathml.document(), name(mathml)); |
679 | 3 | first.set_text(&token_str.next().unwrap().to_string()); |
680 | 3 | let second = create_mathml_element(&mathml.document(), name(mathml)); |
681 | 3 | second.set_text(&token_str.next().unwrap().to_string()); |
682 | 3 | replace_children(mathml, vec![first, second]); |
683 | 3 | } |
684 | ||
685 | /// Put the split pieces back together (undo the split) | |
686 | 34 | fn merge_element(mathml: Element) -> Result<()> { |
687 | // debug!("merge_element: {}", mml_to_string(mathml)); | |
688 | // debug!("merge_element parent: {}", mml_to_string(get_parent(mathml))); | |
689 | 34 | assert!(is_leaf(mathml)); |
690 | 34 | let mut preceding_children = mathml.preceding_siblings(); |
691 | // debug!("preceding_children: {}", preceding_children.iter().map(|&el| name(as_element(el)).to_string()).collect::<Vec<String>>().join(", ")); | |
692 | 34 | if preceding_children.is_empty() { |
693 | // handle: | |
694 | // * case where we have mi mmultiscripts mi ... where the second mi needs to join with the first (see test mhchem_so4) | |
695 | // * case where the child got buried in an added mrow (can only happen one level deep because invisible times should get inserted) | |
696 | 0 | let parent = get_parent(mathml); // mathml is leaf, so parent always exists |
697 | 0 | preceding_children = parent.preceding_siblings(); |
698 | 0 | if preceding_children.is_empty() || |
699 | 0 | !(name(parent) == "mmultiscripts" || |
700 | 0 | (name(parent) == "mrow" && parent.attribute_value(CHANGED_ATTR).is_some() && |
701 | 0 | parent.attribute_value(CHANGED_ATTR).unwrap() == ADDED_ATTR_VALUE)) { |
702 | 0 | bail!("Internal error: {} should not have been split'", mml_to_string(mathml)); |
703 | 0 | } |
704 | 34 | } |
705 | // Note: there was an invisible U+2063, but it was removed before we got here | |
706 | // The parent mrow could have many children that couldn't have been part of a split -- only consider feasible children to split (mi/mtext) | |
707 | // To figure this out, we walk backwards adding the text in reverse and then reverse that text in the end | |
708 | 34 | let mut merged_text = Vec::default(); |
709 | 46 | for &child in |
710 | 46 | let child = as_element(child); |
711 | // because this is before canonicalization, there could be an mrow with just mi/mtext | |
712 | 46 | if name(child) == "mrow" && |
713 | 0 | // "lift" the child up so all the links (e.g., siblings) are correct |
714 | 0 | let child = as_element(child.children()[0]); |
715 | 0 | set_mathml_name(child, name(child)); |
716 | 0 | crate::canonicalize::add_attrs(child, &child.attributes()); |
717 | 0 | child.replace_children(child.children()); |
718 | 46 | } |
719 | 46 | if name(child) != "mi" && |
720 | 12 | break; |
721 | 34 | } |
722 | 34 | merged_text.push(as_text(child)); |
723 | 34 | child.remove_from_parent(); |
724 | } | |
725 | 34 | merged_text.reverse(); |
726 | 34 | let mut merged_text = merged_text.join(""); |
727 | 34 | merged_text.push_str(as_text(mathml)); |
728 | 34 | mathml.set_text(&merged_text); |
729 | 34 | mathml.remove_attribute("mathvariant"); |
730 | 34 | mathml.remove_attribute(ADDED_ATTR_VALUE); |
731 | 34 | mathml.remove_attribute(MAYBE_CHEMISTRY); |
732 | 34 | mathml.remove_attribute(SPLIT_TOKEN); |
733 | 34 | return Ok( () ); |
734 | 34 | } |
735 | 19.7k | } |
736 | ||
737 | /// Returns true only if 'mathml' potentially is chemistry. | |
738 | /// This assumes canonicalization has happened and that 'mathml' is the 'math' element | |
739 | 5.05k | fn is_chemistry_sanity_check(mathml: Element) -> bool { |
740 | // This does some sanity checking. More can definitely be done | |
741 | // Checks: | |
742 | // * there should be chemical elements | |
743 | // * if the child is an mrow with three children, the operator should be '=' (not CHEMICAL_BOND) or an arrow | |
744 | // in this case, we gather up the elements on the lhs and rhs. The sets should be equal and non-empty. | |
745 | // the exception is if there are prescripts, in which as we might have radioactive decay so we don't require the sets to be equal | |
746 | // * otherwise, we gather up all the chemical elements and make sure the set is non-empty | |
747 | // * if it isn't an mrow, we leave it to likely_chem_equation() to rule it out | |
748 | 5.05k | assert_eq!(name(mathml), "math"); |
749 | 5.05k | assert_eq!(mathml.children().len(), 1); |
750 | 5.05k | let mathml = as_element(mathml.children()[0]); |
751 | 5.05k | if name(mathml) == "mrow" { |
752 | 3.29k | let mrow_children = mathml.children(); |
753 | 3.29k | if mrow_children.len() == 3 && |
754 | 371 | let mut lhs_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
755 | 371 | let lhs_has_prescripts = gather_chemical_elements(as_element(mrow_children[0]), &mut lhs_elements); |
756 | // need to include the arrow as it might have the addition of some chemical elements (see UEB/iceb.rs/chem_16_5_2) | |
757 | 371 | gather_chemical_elements(as_element(mrow_children[1]), &mut lhs_elements); |
758 | 371 | let mut rhs_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
759 | 371 | let rhs_has_prescripts = gather_chemical_elements(as_element(mrow_children[2]), &mut rhs_elements); |
760 | 371 | if lhs_elements.is_empty() { |
761 | 269 | return false; |
762 | 102 | } |
763 | // debug!("lhs/rhs elements: {:?}, {:?}", lhs_elements, rhs_elements); | |
764 | // debug!("lhs/rhs has prescripts: {}, {}", lhs_has_prescripts, rhs_has_prescripts); | |
765 | 102 | if lhs_elements == rhs_elements { |
766 | 37 | return !(lhs_has_prescripts ^ rhs_has_prescripts); // seems reasonable that if the lhs has prescripts, so should the rhs |
767 | 65 | } |
768 | 65 | return lhs_has_prescripts && |
769 | 2.92k | } |
770 | 1.76k | } |
771 | 4.68k | let mut chem_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
772 | 4.68k | gather_chemical_elements(mathml, &mut chem_elements); |
773 | 4.68k | return !chem_elements.is_empty(); |
774 | ||
775 | ||
776 | 2.52k | fn is_arrow_or_equal(mathml: Element) -> bool { |
777 | 2.52k | let base = get_possible_embellished_node(mathml); |
778 | 2.52k | if name(base) != "mo" || |
779 | 542 | return false; |
780 | 1.98k | } |
781 | 1.98k | let text = as_text(base); |
782 | 1.98k | return text == "=" || |
783 | ||
784 | 2.52k | } |
785 | ||
786 | /// Gather up all the chemical elements in the element and return true if it has numerical prescripts | |
787 | 48.3k | fn gather_chemical_elements<'a>(mathml: Element<'a>, chem_elements: &mut HashSet<&'a str>) -> bool { |
788 | 48.3k | match name(mathml) { |
789 | 48.3k | "mi" | |
790 | 10.8k | if is_chemical_element(mathml) { |
791 | 1.60k | chem_elements.insert(as_text(mathml)); |
792 | 9.27k | } |
793 | 10.8k | return false; |
794 | }, | |
795 | 37.4k | "msub" | |
796 | 2.16k | gather_chemical_elements(get_possible_embellished_node(mathml), chem_elements); |
797 | 2.16k | return name(mathml) == "mmultiscripts" && |
798 | }, | |
799 | 35.2k | "semantics" => { |
800 | 0 | return gather_chemical_elements( get_presentation_element(mathml).1, chem_elements ); |
801 | }, | |
802 | 35.2k | _ => if is_leaf(mathml) { return |
803 | } | |
804 | ||
805 | // mrow, msqrt, etc | |
806 | 13.7k | let mut has_prescripts = false; |
807 | 40.3k | for child in |
808 | 40.3k | let child = as_element(child); |
809 | 40.3k | has_prescripts |= gather_chemical_elements(child, chem_elements); |
810 | 40.3k | } |
811 | 13.7k | return has_prescripts; |
812 | 48.3k | } |
813 | ||
814 | /// find the mprescripts child and then check the following siblings for numerical prescripts | |
815 | 291 | fn has_numerical_prescripts(mathml: Element) -> bool { |
816 | 291 | let children = mathml.children(); |
817 | // quick check to see if there is an mprescripts child | |
818 | 291 | if !children.len().is_multiple_of(2) { // <mprescripts/> => even number of children |
819 | 129 | return false; |
820 | 162 | } |
821 | // we need enumerate because the "step_by" will cause any returned iterator to jump ahead by 2 | |
822 | 162 | let i_mprescripts = children.iter() |
823 | 162 | .enumerate() |
824 | 162 | .skip(1) |
825 | 162 | .step_by(2) |
826 | 222 | . |
827 | 162 | .map(|(i, _)| i); |
828 | ||
829 | 162 | if let Some(i) = i_mprescripts { |
830 | 162 | let subscript = as_element(children[i+1]); // can be +1/-1 for beta decay |
831 | 162 | let superscript = as_element(children[i+2]); // mass number, so always >= 0 |
832 | 162 | if name(superscript) != "mn" { |
833 | 55 | return false; |
834 | 107 | } |
835 | 107 | return name(subscript) == "mn" || |
836 | 36 | (name(subscript) == "mrow" && |
837 | 0 | name(as_element(subscript.children()[3])) == "mm" && |
838 | 0 | name(as_element(subscript.children()[1])) == "mo" && |
839 | 0 | matches!(as_text(as_element(subscript.children()[1])), "+" | "-")); |
840 | 0 | } |
841 | 0 | return false; |
842 | 291 | } |
843 | 5.05k | } |
844 | ||
845 | /// Looks at the children of the element and uses heuristics to decide whether this is a chemical equation. | |
846 | /// This assumes canonicalization of characters has happened | |
847 | 713 | fn likely_chem_equation(mathml: Element) -> i32 { |
848 | // mfrac -- could be a ratio of concentrations | |
849 | 713 | if name(mathml) != "mrow" && |
850 | 119 | return NOT_CHEMISTRY; |
851 | 594 | } |
852 | ||
853 | // debug!("start likely_chem_equation:\n{}", mml_to_string(mathml)); | |
854 | // mrow -- check the children to see if we are likely to be a chemical equation | |
855 | ||
856 | // concentrations should either be unscripted or have a superscript that isn't a charge | |
857 | // they occur in an mrow or mfrac | |
858 | 594 | if IsBracketed::is_bracketed(mathml, "[", "]", false, true) { |
859 | 10 | let parent_name = name(get_parent(mathml)); |
860 | 10 | if parent_name == "mfrac" || parent_name == "mrow" || |
861 | 0 | (parent_name == "msup" && likely_chem_superscript(as_element(mathml.following_siblings()[0])) < 0){ |
862 | 10 | return if as_element(mathml.children()[0]).attribute(CHEM_FORMULA).is_some() { |
863 | 0 | } |
864 | 584 | } |
865 | ||
866 | // possible improvement -- give bonus points for consecutive (not counting invisible separators) chemical elements on top of the existing points | |
867 | 584 | let mut likelihood = 0; // indicator of likely match |
868 | 584 | let mut has_equilibrium_constant = false; |
869 | 584 | let children = mathml.children(); |
870 | 1.22k | for i in |
871 | 1.22k | let child = as_element(children[i]); |
872 | // debug!(" i={}, likelihood={}, child={}", i, likelihood, crate::canonicalize::element_summary(child)); | |
873 | 1.22k | if let Some( |
874 | 457 | likelihood += likely; |
875 | 457 | continue; |
876 | 771 | } |
877 | 771 | if i == children.len()-1 { |
878 | 195 | let likely = likely_chem_state(child); |
879 | 195 | if likely > 0 { |
880 | 0 | likelihood += likely; |
881 | 0 | break; |
882 | 195 | } |
883 | // otherwise, check the last element as normal | |
884 | 576 | } |
885 | 771 | let tag_name = name(child); |
886 | 771 | let likely = match tag_name { |
887 | 771 | "mi" => |
888 | 625 | "mn" => |
889 | 616 | "mo" | |
890 | 286 | "msub" | |
891 | 38 | if is_equilibrium_constant(child) { |
892 | 0 | has_equilibrium_constant = true; |
893 | 0 | 2 |
894 | } else { | |
895 | 38 | likely_adorned_chem_formula(child) |
896 | } | |
897 | }, | |
898 | 248 | "mfrac" => { |
899 | 0 | if has_equilibrium_constant { |
900 | 0 | 2 |
901 | } else { | |
902 | 0 | -3 // fraction tend only to appear after an equilibrium constant |
903 | } | |
904 | }, | |
905 | 248 | "mrow" => { |
906 | 248 | let likely = likely_chem_formula(child); |
907 | 248 | if likely < 0 { |
908 | 248 | likely_chem_equation(child) |
909 | } else { | |
910 | 0 | likely |
911 | } | |
912 | }, | |
913 | // no need to check for mtr or mtd because they only exist in a table and the recursion is dealt with here. | |
914 | 0 | "mtable" => { |
915 | 0 | for mrow in child.children() { |
916 | 0 | let mrow = as_element(mrow); |
917 | 0 | for mtd in mrow.children() { |
918 | 0 | let mtd = as_element(mtd); |
919 | 0 | let mut likely = likely_chem_formula(mtd); |
920 | 0 | if likely < CHEMISTRY_THRESHOLD { |
921 | 0 | likely = likely_chem_equation(mtd); |
922 | 0 | } |
923 | 0 | if likely < CHEMISTRY_THRESHOLD { |
924 | 0 | is_changed_after_unmarking_chemistry(mtd); |
925 | 0 | } |
926 | } | |
927 | } | |
928 | 0 | NOT_CHEMISTRY |
929 | }, | |
930 | 0 | "semantics" => { |
931 | 0 | likely_chem_equation(get_presentation_element(mathml).1) |
932 | }, | |
933 | 0 | _ => NOT_CHEMISTRY, |
934 | }; | |
935 | 771 | if likely >= 0 { |
936 | 164 | child.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
937 | 607 | } |
938 | 771 | likelihood += likely; |
939 | 771 | if likelihood < NOT_CHEMISTRY_THRESHOLD { |
940 | 396 | return NOT_CHEMISTRY; |
941 | 375 | } |
942 | } | |
943 | ||
944 | 188 | if likelihood >= 0 { |
945 | 108 | mathml.set_attribute_value(MAYBE_CHEMISTRY, &likelihood.to_string()); |
946 | 108 |
|
947 | 188 | return likelihood; |
948 | 713 | } |
949 | ||
950 | ||
951 | /// could be a number, a state ("(l)", "(g)", etc), or a number followed by a state | |
952 | 1.19k | fn likely_chem_subscript(subscript: Element) -> i32 { |
953 | 1.19k | let subscript_name = name(subscript); |
954 | 1.19k | if subscript_name == "mn" && |
955 | 674 | return 0; // not really much chem info about an integer subscript |
956 | 525 | } else if subscript_name == "mi" { |
957 | 328 | let text = as_text(subscript); |
958 | 328 | if text == "s" || |
959 | 6 | subscript.set_attribute_value(CHEM_STATE, "true"); |
960 | 6 | return 2; |
961 | 322 | } |
962 | 197 | } else if subscript_name == "mrow" { |
963 | // debug!("likely_chem_subscript:\n{}", mml_to_string(subscript)); | |
964 | 184 | let children = subscript.children(); |
965 | 184 | if children.len() == 3 && |
966 | 6 | return likely_chem_subscript(as_element(children[1])); |
967 | 178 | } |
968 | 178 | let i_first_child = as_element(children[0]); |
969 | 178 | if children.len() == 2 && |
970 | 103 | name(i_first_child) == "mn" && |
971 | 81 | name(as_element(children[1])) == "mrow" && |
972 | 0 | likely_chem_state(as_element(children[1])) > 0 { // notation used in en.wikipedia.org/wiki/Electrolyte#Formation |
973 | 0 | return 2; |
974 | 178 | } |
975 | 13 | } |
976 | // could be a variable 'n' or something else -- just not likely | |
977 | 513 | return -3 |
978 | 1.19k | } |
979 | ||
980 | 17 | fn small_roman_to_number(text: &str) -> &str { |
981 | // simplest to do a look up | |
982 | static ROMAN_TO_NUMBER: phf::Map<&str, &str> = phf_map! { | |
983 | "I" => "1", "II" => "2", "III" => "3", "IV" => "4", "V" => "5", "VI" => "6", "VII" => "7", "VIII" => "8", "IX" => "9", | |
984 | }; | |
985 | 17 | return ROMAN_TO_NUMBER.get(text).unwrap_or(&""); |
986 | ||
987 | 17 | } |
988 | ||
989 | 1.65k | fn likely_chem_superscript(sup: Element) -> i32 { |
990 | // either one or more '+'s (or '-'s) or a number followed by +/- | |
991 | // also could be state (en.wikipedia.org/wiki/Nuclear_chemistry#PUREX_chemistry) | |
992 | // bullet is radical (en.wikipedia.org/wiki/Radical_(chemistry)#Depiction_in_chemical_reactions); mhchem uses dot operator | |
993 | // these can stand alone, be followed by +/- or have a number in front "(2•)-"" [examples from mhchem documentation] | |
994 | // roman numerals are "oxidation state" and range from -4 to +9 | |
995 | 3 | static MULTIPLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap()); |
996 | 3 | static SINGLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap()); |
997 | static DOTS: &[char; 3] = &['⋅', '∙', '•']; | |
998 | 1.65k | let sup_name = name(sup); |
999 | 1.65k | if sup_name == "mo" && |
1000 | 113 | if as_text(sup).find(DOTS).is_some() { |
1001 | 7 | sup.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1002 | 7 | sup.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); // value doesn't really matter |
1003 | 106 | } |
1004 | 113 | return if as_text(sup).len()==1 { |
1005 | 1.54k | } else if (sup_name == "mi" || |
1006 | 17 | sup.set_attribute_value("data-number", small_roman_to_number(as_text(sup))); |
1007 | 17 | sup.set_attribute_value(MAYBE_CHEMISTRY, "2"); |
1008 | 17 | return 2; |
1009 | 1.52k | } else if sup_name == "mrow" { |
1010 | // look for something like '2+' | |
1011 | 311 | let children = sup.children(); |
1012 | 311 | if children.len() == 2 { |
1013 | 177 | let first = as_element(children[0]); |
1014 | 177 | let second = as_element(children[1]); |
1015 | 177 | if name(first) == "mn" && |
1016 | 55 | let second_text = as_text(second); |
1017 | 55 | if SINGLE_PLUS_OR_MINUS_OR_DOT.is_match(second_text) { |
1018 | 55 | if second_text.find(DOTS).is_some() { |
1019 | 0 | second.set_attribute_value(MAYBE_CHEMISTRY, "2"); |
1020 | 0 | second.set_attribute_value(CHEM_FORMULA_OPERATOR, "2"); // value doesn't really matter |
1021 | 55 | } |
1022 | 55 | sup.set_attribute_value(MAYBE_CHEMISTRY, "3"); |
1023 | 55 | return 3; // ending with a +/- makes it likely this is an ion |
1024 | 0 | } |
1025 | 122 | } |
1026 | 134 | } |
1027 | // gather up the text and see if it is all +, -, etc | |
1028 | 256 | let mut text = "".to_string(); |
1029 | 414 | for child in |
1030 | 414 | let child = as_element(*child); |
1031 | 414 | if name(child) == "mo" { |
1032 | 169 | text.push_str(as_text(child)); |
1033 | 169 | } else { |
1034 | // could have something like 'mrow(mrow 2n, -) (chem example 5-9) -- so fallback to still ok if ends with + or - | |
1035 | 245 | let last_super_child = as_element(children[children.len()-1]); |
1036 | 245 | if name(last_super_child) == "mo" { |
1037 | 7 | let text = as_text(last_super_child); |
1038 | 7 | if text == "+" || text == "-" { |
1039 | 1 | sup.set_attribute_value(MAYBE_CHEMISTRY, "3"); |
1040 | 1 | return 3; |
1041 | 6 | } |
1042 | 238 | } |
1043 | 244 | return NOT_CHEMISTRY; |
1044 | } | |
1045 | } | |
1046 | 11 | if MULTIPLE_PLUS_OR_MINUS_OR_DOT.is_match(&text) { |
1047 | 13 | for child in |
1048 | 13 | let child = as_element(child); |
1049 | 13 | if name(child) == "mo" && as_text(child).find(DOTS).is_some() { |
1050 | 0 | child.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1051 | 0 | child.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); // value doesn't really matter |
1052 | 13 | } |
1053 | } | |
1054 | 6 | let likely = 2*text.len() as i32; |
1055 | 6 | sup.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
1056 | 6 | return likely; |
1057 | 5 | } |
1058 | 1.21k | } |
1059 | 1.21k | return NOT_CHEMISTRY |
1060 | 1.65k | } |
1061 | ||
1062 | ||
1063 | /// chem_formula is likely if it is one of: | |
1064 | /// * a (possibly adorned) chemical element | |
1065 | /// * an operator that represents a bond | |
1066 | /// * fences around a chemical formula | |
1067 | /// * an mrow made up of only chemical formulas | |
1068 | 15.0k | fn likely_chem_formula(mathml: Element) -> i32 { |
1069 | // debug!("start likely_chem_formula:\n{}", mml_to_string(mathml)); | |
1070 | 15.0k | if let Some( |
1071 | 2.65k | return value; // already marked |
1072 | 12.3k | } |
1073 | ||
1074 | 12.3k | let tag_name = name(mathml); |
1075 | 12.3k | let likelihood = match tag_name { |
1076 | // a parent may clear the chem flags if something says can't be chemistry (e.g, a non chemically valid script) | |
1077 | 12.3k | "mi" => |
1078 | 10.3k | "mo" => |
1079 | 5.90k | "mtext" => |
1080 | 5.85k | "mn" => |
1081 | 3.87k | "msub" | |
1082 | 225 | likely_chem_formula(as_element(mathml.children()[0])); // set MAYBE_CHEMISTRY attribute |
1083 | 225 | likely_adorned_chem_formula(mathml) |
1084 | }, | |
1085 | 3.64k | "mrow" => { |
1086 | 3.41k | let chem_state = likely_chem_state(mathml); |
1087 | 3.41k | if chem_state > 0 { |
1088 | 18 | chem_state |
1089 | } else { | |
1090 | 3.39k | likely_mrow_chem_formula(mathml) |
1091 | } | |
1092 | }, | |
1093 | 232 | "mfrac" => { |
1094 | 73 | let children = mathml.children(); |
1095 | 73 | let num_likely = likely_chem_formula(as_element(children[0])); |
1096 | 73 | let denom_likely = likely_chem_formula(as_element(children[1])); |
1097 | 73 | let likely = num_likely.max(denom_likely); |
1098 | 73 | if likely < CHEMISTRY_THRESHOLD {NOT_CHEMISTRY} else { |
1099 | } | |
1100 | 159 | "mtd" => { |
1101 | 5 | let mut likely = likely_chem_formula(as_element(mathml.children()[0])); |
1102 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1103 | 4 | likely = likely_chem_equation(mathml); |
1104 | 4 |
|
1105 | 5 | likely |
1106 | } | |
1107 | 154 | "mtable" => { |
1108 | 4 | for mrow in |
1109 | 4 | let mrow = as_element(mrow); |
1110 | 5 | for mtd in |
1111 | 5 | let mtd = as_element(mtd); |
1112 | 5 | let mut likely = likely_chem_formula(mtd); |
1113 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1114 | 3 | likely = likely_chem_equation(mtd); |
1115 | 3 |
|
1116 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1117 | 3 | is_changed_after_unmarking_chemistry(mtd); |
1118 | 3 |
|
1119 | } | |
1120 | } | |
1121 | 2 | NOT_CHEMISTRY |
1122 | }, | |
1123 | 152 | "semantics" => { |
1124 | 0 | likely_chem_formula(get_presentation_element(mathml).1) |
1125 | }, | |
1126 | _ => { | |
1127 | 152 | if !is_leaf(mathml) { |
1128 | // mfrac, msqrt, etc | |
1129 | 320 | for child in |
1130 | 320 | let child = as_element(child); |
1131 | 320 | let likelihood = likely_chem_formula(child); |
1132 | 320 | if likelihood > 0 { |
1133 | 77 | child.set_attribute_value(MAYBE_CHEMISTRY, likelihood.to_string().as_str()); |
1134 | 243 | }; |
1135 | } | |
1136 | 0 | } |
1137 | // debug!("NOT_CHEMISTRY:\n{}", mml_to_string(mathml)); | |
1138 | 152 | NOT_CHEMISTRY |
1139 | } | |
1140 | }; | |
1141 | 12.3k | if likelihood >= 0 { |
1142 | 5.09k | mathml.set_attribute_value(MAYBE_CHEMISTRY, &likelihood.to_string()); |
1143 | 7.30k | } |
1144 | // debug!("likely_chem_formula {}:\n{}", likelihood, mml_to_string(mathml)); | |
1145 | ||
1146 | 12.3k | return likelihood; |
1147 | ||
1148 | 3.39k | fn likely_mrow_chem_formula(mrow: Element) -> i32 { |
1149 | // For parens, the only reason to add them is to group the children and then indicate that there is more than one molecule | |
1150 | 3.39k | if IsBracketed::is_bracketed(mrow, "(", ")", false, false) || |
1151 | 3.14k | IsBracketed::is_bracketed(mrow, "[", "]", false, false) { |
1152 | // If it is bracketed, it should have a subscript to indicate the number of the element. | |
1153 | // We give a pass to unadorned bracketing chars | |
1154 | 310 | if mrow.children().len() != 3 { |
1155 | 0 | return NOT_CHEMISTRY; |
1156 | 310 | } |
1157 | 310 | let contents = as_element(mrow.children()[1]); |
1158 | 310 | let parent = get_parent(mrow); |
1159 | 310 | let parent_is_scripted = IsNode::is_scripted(parent); |
1160 | 310 | if name(contents) != "mrow" && |
1161 | 53 | return NOT_CHEMISTRY; |
1162 | 257 | } |
1163 | 257 | let likely = likely_chem_formula(contents); |
1164 | 257 | if parent_is_scripted { |
1165 | 149 | return likely + 3; |
1166 | } else { | |
1167 | 108 | return likely; |
1168 | } | |
1169 | 3.08k | } |
1170 | ||
1171 | 3.08k | let mut likelihood = if is_order_ok(mrow) { |
1172 | ||
1173 | // check all the children and compute the likelihood of that this is a chemical formula | |
1174 | // bonus point for consecutive chemical formula children (not counting invisible children) | |
1175 | 3.08k | let mut last_was_likely_formula = 0; // 0 is false, 1 is true |
1176 | 3.08k | let mut is_chem_formula = true; // assume true until we prove otherwise (still want to mark the children) |
1177 | 12.5k | for child in |
1178 | 12.5k | let child = as_element(child); |
1179 | 12.5k | let likely = likely_chem_formula(child); |
1180 | // debug!(" in mrow: likely={}, likelihood={}", likely, likelihood); | |
1181 | 12.5k | match likely.cmp(&0) { |
1182 | Ordering::Greater => { | |
1183 | 2.56k | likelihood += likely + last_was_likely_formula; |
1184 | 2.56k | last_was_likely_formula = if name(child) == "mo" { |
1185 | }, | |
1186 | 5.86k | Ordering::Less => { |
1187 | 5.86k | // debug!("in likely_chem_formula: FALSE: likelihood={}, child\n{}", likelihood, mml_to_string(child)); |
1188 | 5.86k | is_chem_formula = false; |
1189 | 5.86k | last_was_likely_formula = 0; |
1190 | 5.86k | likelihood += likely; |
1191 | 5.86k | }, |
1192 | Ordering::Equal => { | |
1193 | 4.08k | if name(child) == "mo" { |
1194 | 2.27k | let text = as_text(child); |
1195 | 2.27k | if text != "\u{2062}" && |
1196 | 8 | last_was_likely_formula = 0; |
1197 | 2.26k | } |
1198 | 1.81k | } |
1199 | }, | |
1200 | } | |
1201 | // debug!("in likely_chem_formula likelihood={}, child\n{}", likelihood, mml_to_string(child)); | |
1202 | // debug!(" likelihood={} (likely={})", likelihood, likely); | |
1203 | } | |
1204 | ||
1205 | 3.08k | if !is_chem_formula || |
1206 | // the children may have looked have looked right, but something has said "not likely" | |
1207 | 2.25k | return NOT_CHEMISTRY; |
1208 | 832 | } else if likelihood < CHEMISTRY_THRESHOLD && |
1209 | // debug!("is_short_formula is true for:\n{}", mml_to_string(mrow)); | |
1210 | 47 | return CHEMISTRY_THRESHOLD |
1211 | 785 | } |
1212 | 785 | return likelihood; |
1213 | 3.39k | } |
1214 | ||
1215 | 15.0k | } |
1216 | ||
1217 | /// This does some checks that sort of follow IUPAC's "Red Book" in section IR-4.4. | |
1218 | /// Those rules require knowledge that the program doesn't have (e.g., which bond is closest to the central atom). | |
1219 | /// Instead, we mainly use the two main types of orderings: alphabetical and electronegativity. | |
1220 | /// We first do a test to see if this looks like a structural formula -- if so, ordering doesn't apply. | |
1221 | /// If a formula has groupings, each grouping is checked independently of the rest since | |
1222 | /// there are cases where the outer ordering doesn't match the inner ordering. | |
1223 | /// For "generalized salts", we need to split the elements into positive and negative ions, and within each group | |
1224 | /// the order is suppose to be alphabetical but many use electronegativity (the point being there are two separate groups). | |
1225 | /// This site has a nice summary of the rules: https://chemistry.stackexchange.com/questions/537/why-is-arsenous-acid-denoted-h3aso3/538#538 | |
1226 | /// Note: "(OH)" doesn't fit with the above, and Susan Jolly suggests allowing any sequence that ends with H, so we allow that. | |
1227 | /// Also, Susan Jolly suggested allowing any compound with C, H, and O | |
1228 | 3.08k | fn is_order_ok(mrow: Element) -> bool { |
1229 | 3.08k | assert_eq!(name(mrow), "mrow"); |
1230 | 3.08k | if let Some( |
1231 | 2.73k | if |
1232 | 1.48k | return false; |
1233 | 846 | } |
1234 | 846 | let n_elements = elements.len(); |
1235 | 846 | if n_elements < 2 { |
1236 | 475 | return true; |
1237 | 371 | } else if has_noble_element(&elements) { |
1238 | 0 | return false; // noble elements don't form compounds |
1239 | } else { | |
1240 | 371 | return elements[n_elements-1] == "H" || // special case that includes "OH" |
1241 | // has_non_metal_element(&elements) && !has_non_metal_element(&elements) && // must have a metal and non-metal | |
1242 | 295 | has_c_h_o(&elements) || |
1243 | 291 | is_structural(&elements) || |
1244 | 271 | is_alphabetical(&elements) || |
1245 | 169 | is_ordered_by_electronegativity(&elements) || |
1246 | 12 | is_generalized_salt(&elements); |
1247 | } | |
1248 | } else { | |
1249 | 759 | return false; |
1250 | } | |
1251 | 3.08k | } |
1252 | ||
1253 | // from https://learnwithdrscott.com/ionic-bond-definition/ | |
1254 | // I don't include the noble gases since they don't interact with other elements and are ruled out elsewhere | |
1255 | // fn has_non_metal_element(elements: &[&str]) -> bool { | |
1256 | // static NON_METAL_ELEMENTS: phf::Set<&str> = phf_set! { | |
1257 | // "H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "As", "Se", "Br", "Te", "I", "At", | |
1258 | // }; | |
1259 | // return elements.iter().any(|&e| NON_METAL_ELEMENTS.contains(e)); | |
1260 | // } | |
1261 | ||
1262 | ||
1263 | 374 | fn has_noble_element(elements: &[&str]) -> bool { |
1264 | static NOBLE_ELEMENTS: phf::Set<&str> = phf_set! { | |
1265 | "He", "Ne", "Ar", "Kr", "Xe", "Rn", "Og" // Og might be reactive, but it is unstable | |
1266 | }; | |
1267 | 893 | return |
1268 | 374 | } |
1269 | ||
1270 | 295 | fn has_c_h_o(elements: &[&str]) -> bool { |
1271 | 295 | return elements.contains(&"C") && |
1272 | 295 | } |
1273 | ||
1274 | ||
1275 | 295 | fn is_structural(elements: &[&str]) -> bool { |
1276 | 295 | assert!(elements.len() > 1); // already handled |
1277 | ||
1278 | // debug!("is_structural: {:?}", elements); | |
1279 | 295 | let mut element_set = HashSet::with_capacity(elements.len()); |
1280 | 627 |
|
1281 | 295 | return element_set.len() < elements.len(); |
1282 | 295 | } |
1283 | ||
1284 | /// collect up all the elements in the mrow. | |
1285 | /// Returns the elements (which can be an empty vector) or None if something (right now an operator) rules out them being elements | |
1286 | 3.10k | fn collect_elements(mrow: Element<'_>) -> Option<Vec<&str>> { |
1287 | 3.10k | let mut elements = Vec::with_capacity(mrow.children().len()/2+1); // don't bother with slots for operators |
1288 | 8.86k | for child in |
1289 | 8.86k | let child = as_element(child); |
1290 | 8.86k | match name(child) { |
1291 | 8.86k | "mi" | |
1292 | 6.06k | "msub" | |
1293 | 584 | let base = as_element(child.children()[0]); |
1294 | 584 | let base_name = name(base); |
1295 | 584 | if base_name == "mi" || |
1296 | 514 | elements.push(as_text(base)); |
1297 | 514 |
|
1298 | }, | |
1299 | 5.48k | "mo" if |
1300 | 2.46k | "mo" => (), |
1301 | 2.25k | _ => (), // let loop in likely_chem_formula() deal with all the negatives |
1302 | } | |
1303 | } | |
1304 | 2.34k | return Some(elements); |
1305 | 3.10k | } |
1306 | ||
1307 | /// check to make sure elements are ordered alphabetically | |
1308 | /// Actually check Hill's system that puts 'C' followed by 'H' first if 'C' is present | |
1309 | 275 | fn is_alphabetical(elements: &[&str]) -> bool { |
1310 | 275 | assert!(elements.len() > 1); // already handled |
1311 | // debug!("is_alphabetical: {:?}", elements); | |
1312 | 275 | let mut elements = elements; |
1313 | 275 | if elements[1..].contains(&"C") { // "C" must be first if present |
1314 | 22 | return false; |
1315 | 253 | } |
1316 | 253 | if elements[0] == "C" { |
1317 | 10 | elements = if elements[1]=="H" { |
1318 | 243 | } |
1319 | 253 | return elements.len() < 2 || |
1320 | 275 | } |
1321 | ||
1322 | 174 | fn is_ordered_by_electronegativity(elements: &[&str]) -> bool { |
1323 | // HPO_4^2 (Mono-hydrogen phosphate) doesn't fit this pattern, nor does HCO_3^- (Hydrogen carbonate) and some others | |
1324 | // FIX: drop "H" from the ordering?? | |
1325 | 174 | assert!(elements.len() > 1); // already handled |
1326 | 188 | return |
1327 | 174 | } |
1328 | ||
1329 | 12 | fn is_generalized_salt(elements: &[&str]) -> bool { |
1330 | 12 | assert!(!elements.is_empty()); |
1331 | 12 | return false; |
1332 | 12 | } |
1333 | ||
1334 | ||
1335 | /// Returns the likelihood that the arg is an adorned chem formula | |
1336 | /// Adornments are: | |
1337 | /// superscripts with +/- and optionally a number (charge) | |
1338 | /// numeric subscripts (e.g. H_2) | |
1339 | /// In addition to chemical elements, we include nuclear decay since there is a lot of overlap in notation | |
1340 | /// The nuclear decay notation is mostly taken from https://tinyurl.com/2f6b8e3a | |
1341 | /// Basically it is a chemical element or 'e', 'p', 'n', 'α', 'β', or 'γ' with pre-sub/superscript | |
1342 | /// There is also an instance with a charge on the referenced page, so we allow that also. | |
1343 | /// | |
1344 | /// Note: https://tinyurl.com/ysmr8cw2 says "++"/"--", etc., is sometimes used in a superscript particle physics instead of a "2" | |
1345 | /// | |
1346 | /// Note: msubsup cleaning for an empty script hasn't happened and we consider an empty script a sign of attempting to vertically align sub/superscripts | |
1347 | /// | |
1348 | /// Note: 'mathml' is not necessarily canonicalized | |
1349 | 2.85k | pub fn likely_adorned_chem_formula(mathml: Element) -> i32 { |
1350 | 2.85k | if ! |
1351 | 1 | return NOT_CHEMISTRY; |
1352 | 2.85k | } |
1353 | // some simple sanity checks on the scripts... | |
1354 | 2.85k | let tag_name = name(mathml); |
1355 | 2.85k | let children = mathml.children(); |
1356 | 2.85k | let mut likelihood = 0; |
1357 | 2.85k | let mut is_empty_subscript = false; |
1358 | // debug!("likely_adorned_chem_formula:\n{}", mml_to_string(mathml)); | |
1359 | 2.85k | if tag_name == "msub" || |
1360 | // subscripts should be just a number, although they could be 'n' or '2n' or other exprs. | |
1361 | 1.10k | let subscript = as_element(children[1]); |
1362 | 1.10k | is_empty_subscript = name(subscript) == "mtext" && |
1363 | 1.10k | if !is_empty_subscript { |
1364 | 1.10k | likelihood += likely_chem_subscript(subscript); |
1365 | 1.10k |
|
1366 | 1.74k | } |
1367 | ||
1368 | 2.85k | let mut empty_superscript = false; |
1369 | 2.85k | if tag_name == "msup" || |
1370 | // debug!("likely_adorned_chem_formula: mathml\n{}", mml_to_string(mathml)); | |
1371 | 1.59k | let superscript = as_element(children[if tag_name == "msup" { |
1372 | 1.59k | empty_superscript = name(superscript) == "mtext" && |
1373 | 1.59k | if !empty_superscript { |
1374 | 1.58k | likelihood += likely_chem_superscript(superscript); |
1375 | 1.58k |
|
1376 | 1.26k | } |
1377 | 2.85k | if tag_name == "msubsup" && ( |
1378 | 9 | likelihood += 1; // might be trying to vertically align scripts as in done in chemistry |
1379 | 2.84k | } |
1380 | ||
1381 | 2.85k | if tag_name == "mmultiscripts" { |
1382 | // prescripts are normally positive integers, chem 2.5.1 allows for a superscript for a Lewis dot | |
1383 | // postscript should be a charge | |
1384 | ||
1385 | let prescripts; | |
1386 | let postscripts; | |
1387 | 351 | if children.len() == 4 && |
1388 | 138 | prescripts = &children[2..4]; |
1389 | 138 | postscripts = &children[0..0]; // empty |
1390 | 213 | } else if children.len() == 6 && |
1391 | 55 | prescripts = &children[4..6]; |
1392 | 55 | postscripts = &children[1..3]; // empty |
1393 | 158 | } else if children.len() == 3 || |
1394 | 118 | prescripts = &children[0..0]; // empty |
1395 | 118 | postscripts = &children[1..]; |
1396 | 118 | } else { |
1397 | 40 | return NOT_CHEMISTRY; |
1398 | }; | |
1399 | ||
1400 | 311 | if !prescripts.is_empty() { |
1401 | 193 | let pre_subscript = as_element(prescripts[0]); |
1402 | 193 | let pre_subscript_name = name(pre_subscript); |
1403 | ||
1404 | 193 | let pre_superscript = as_element(prescripts[1]); |
1405 | 193 | let pre_superscript_name = name(pre_superscript); |
1406 | ||
1407 | // deal with special case of 'e' with prescripts of -1 and 0 | |
1408 | 193 | if is_adorned_electron(children[0], prescripts) { |
1409 | 31 | return 100; // very likely chemistry |
1410 | 162 | } |
1411 | 162 | let base = as_element(children[0]); |
1412 | 162 | let base_name = name(base); |
1413 | 162 | let |
1414 | 154 | let Some( |
1415 | 127 | *atomic_number |
1416 | } else { | |
1417 | 35 | return NOT_CHEMISTRY; |
1418 | }; | |
1419 | 127 | if pre_superscript_name == "mo" { |
1420 | // Lewis dot prescript case | |
1421 | 3 | if pre_subscript_name != "none" { |
1422 | 0 | return NOT_CHEMISTRY; |
1423 | 3 | } |
1424 | 3 | likelihood += likely_chem_superscript(pre_superscript); |
1425 | 124 | } else if pre_superscript_name == "mn" { // must have a pre-superscript (neutrons + protons) |
1426 | 75 | if let Ok(mass) = as_text(pre_superscript).parse::<u32>() { |
1427 | // "drip line" is 1.5 * mass < 3.5 * mass -- it is possible to outside of this range, but VERY unlikely | |
1428 | // to avoid floating point, we multiply by 2 and compare to 3 and 7 | |
1429 | 75 | if 3*atomic_number < 2*mass && |
1430 | 74 | likelihood += 3; |
1431 | 74 |
|
1432 | 0 | } |
1433 | 75 | if pre_subscript_name == "mn" && |
1434 | 69 | likelihood = CHEMISTRY_THRESHOLD; |
1435 | 69 |
|
1436 | } else { | |
1437 | 49 | return NOT_CHEMISTRY; |
1438 | } | |
1439 | 118 | } |
1440 | ||
1441 | 196 | if !postscripts.is_empty() { |
1442 | 119 | let mut i = 0; |
1443 | 266 | while i < postscripts.len() { |
1444 | 147 | let sub = as_element(postscripts[i]); |
1445 | // debug!("sub: {}", mml_to_string(sub)); | |
1446 | 147 | if name(sub) != "none" { |
1447 | 91 | likelihood += likely_chem_subscript(sub); |
1448 | 91 |
|
1449 | 147 | let sup = as_element(postscripts[i+1]); |
1450 | 147 | if name(sup) != "none" { |
1451 | 65 | // debug!("sup: {}", mml_to_string(sub)); |
1452 | 65 | likelihood += likely_chem_superscript(sup); |
1453 | 82 | } |
1454 | 147 | i += 2; |
1455 | } | |
1456 | 77 | } |
1457 | 2.50k | } |
1458 | ||
1459 | 2.69k | let base = as_element(children[0]); |
1460 | 2.69k | let base_name = name(base); |
1461 | 2.69k | if base_name == "mi" || |
1462 | 2.05k | likelihood += likely_chem_element(base); |
1463 | 2.05k | } else if |
1464 | // debug!("mrow addition:\n{}", mml_to_string(base)); | |
1465 | // a safe minor canonicalization that allows "short_form" calculations if appropriate | |
1466 | 187 | if (IsBracketed::is_bracketed(base, "(", ")", false, false) || |
1467 | 89 | IsBracketed::is_bracketed(base, "[", "]", false, false)) && |
1468 | 148 | base.children().len() > 3 { |
1469 | 77 | let inner_mrow = create_mathml_element(&base.document(), "mrow"); |
1470 | 77 | inner_mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1471 | 77 | let mut children = base.children(); |
1472 | 77 | let inside_of_parens = children.drain(1..children.len()-1); |
1473 | 77 | inner_mrow.append_children(inside_of_parens); |
1474 | 77 | base.replace_children(vec![children[0], ChildOfElement::Element(inner_mrow), children[children.len()-1]]); |
1475 | 110 | } |
1476 | 187 | likelihood += likely_chem_formula(base); |
1477 | 454 | } else { |
1478 | 454 | likelihood += likely_chem_formula(base); |
1479 | 454 | } |
1480 | ||
1481 | // debug!("returning from likely_adorned_chem_formula: likelihood={}, mathml\n{}", likelihood, mml_to_string(mathml)); | |
1482 | 2.69k | return likelihood; |
1483 | ||
1484 | ||
1485 | 193 | fn is_adorned_electron(base: ChildOfElement, prescripts: &[ChildOfElement]) -> bool { |
1486 | // looking for 'e' with prescripts of -1 and 0 | |
1487 | 193 | let base = as_element(base); |
1488 | 193 | let pre_lower = as_element(prescripts[0]); |
1489 | 193 | let pre_upper = as_element(prescripts[1]); |
1490 | 193 | if (name(base) == "mi" || |
1491 | 31 | name(pre_upper) == "mn" && as_text(pre_upper) == "0" && |
1492 | 31 | name(pre_lower) == "mrow" && pre_lower.children().len() == 2 { |
1493 | // looking '-' and '1' | |
1494 | 31 | let lower_children = pre_lower.children(); |
1495 | 31 | let minus = as_element(lower_children[0]); |
1496 | 31 | let one = as_element(lower_children[1]); |
1497 | // not yet normalized, so we need to compare against ASCII minus and u+2212 | |
1498 | 31 | return name(minus) == "mo" && (as_text(minus) == "-" || as_text(minus) == "−") && |
1499 | 31 | name(one) == "mn" && as_text(one) == "1"; |
1500 | } else { | |
1501 | 162 | return false; |
1502 | } | |
1503 | 193 | } |
1504 | 2.85k | } |
1505 | ||
1506 | /// useful function to see if the str is a single char matching the predicate | |
1507 | 29.6k | fn is_single_char_matching(leaf_text: &str, pred: impl Fn(char) -> bool) -> bool { |
1508 | 29.6k | let mut chars = leaf_text.chars(); |
1509 | 29.6k | if let Some(ch) = chars.next() && chars.next().is_none() { |
1510 | 29.5k | return pred(ch); |
1511 | 87 | } |
1512 | 87 | return false; |
1513 | 29.6k | } |
1514 | ||
1515 | 17.2k | fn likely_chem_formula_operator(mathml: Element) -> i32 { |
1516 | // mostly from chenzhijin.com/en/article/Useful%20Unicode%20for%20Chemists (Arrows and Other) | |
1517 | // also en.wikipedia.org/wiki/Chemical_formula#Condensed_formula | |
1518 | #[derive(PartialEq, Eq)] | |
1519 | enum BondType {DoubleBond, TripleBond} // options for is_legal_bond() | |
1520 | // "⋅" is used in GTM 16.2 and en.wikipedia.org/wiki/Cement_chemist_notation -- may want to add some similar chars | |
1521 | static CHEM_FORMULA_OPERATORS: phf::Set<&str> = phf_set! { | |
1522 | "-", "\u{2212}", "⋅", ":", "=", "∷", "≡", ":::", "≣", "::::", // bond symbols (need both 2212 and minus because maybe not canonicalized) | |
1523 | "⋮", // lewis dots, part of "⋮⋮" - triple bond (see Nemeth chem guide 2.5.4) | |
1524 | }; | |
1525 | 16.1k | fn is_chem_formula_ok(ch: char) -> bool { |
1526 | 16.1k |
|
1527 | 16.1k | } |
1528 | ||
1529 | 17.2k | assert_eq!(name(mathml), "mo"); |
1530 | 17.2k | let leaf_text = as_text(mathml); |
1531 | 17.2k | if CHEM_FORMULA_OPERATORS.contains(leaf_text) && |
1532 | 1.85k | (has_inherited_property(mathml, "chemical-formula") || |
1533 | 1.85k | ( !(leaf_text == "=" || |
1534 | 1.05k | ( !(leaf_text == "≡" || |
1535 | ) { | |
1536 | 1.04k | mathml.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1537 | 1.04k | mathml.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); |
1538 | 1.04k | return 1; |
1539 | 16.1k | } else if is_single_char_matching(leaf_text, is_chem_formula_ok) { |
1540 | 6.49k | return 0; // not much info |
1541 | } else { | |
1542 | 9.67k | return -3; // still a small chance; |
1543 | } | |
1544 | ||
1545 | 874 | fn is_legal_bond(mathml: Element, bond_type: BondType) -> bool { |
1546 | 874 | let preceding = mathml.preceding_siblings(); |
1547 | 874 | let following = mathml.following_siblings(); |
1548 | 874 | if preceding.is_empty() || |
1549 | 115 | return false; |
1550 | 759 | } |
1551 | ||
1552 | 759 | let mut preceding_element = as_element(preceding[preceding.len()-1]); |
1553 | // special check for CH_2 -- double bond is really with C | |
1554 | 759 | if bond_type == BondType::DoubleBond && |
1555 | 31 | preceding.len() > 1 && |
1556 | 2 | preceding_element = as_element(preceding[preceding.len()-2]); |
1557 | 2 | if !is_leaf(preceding_element) || as_text(preceding_element) != "C" { |
1558 | 0 | return false; |
1559 | 2 | } |
1560 | 757 | } else if name(preceding_element) != "mi" && |
1561 | 320 | return false; |
1562 | 437 | } |
1563 | 439 | let following_element = get_possible_embellished_node(as_element(following[0])); |
1564 | 439 | if name(following_element) != "mi" && |
1565 | 313 | return false; |
1566 | 126 | } |
1567 | 126 | let preceding_text = as_text(preceding_element); |
1568 | 126 | let following_text = as_text(following_element); |
1569 | 126 | return match bond_type { |
1570 | 105 | BondType::DoubleBond => is_legal_double_bond(preceding_text, following_text), |
1571 | 21 | BondType::TripleBond => is_legal_triple_bond(preceding_text, following_text), |
1572 | }; | |
1573 | ||
1574 | 105 | fn is_legal_double_bond(left: &str, right: &str) -> bool { |
1575 | // this is based on table in en.wikipedia.org/wiki/Double_bond#Types_of_double_bonds_between_atoms | |
1576 | static DOUBLE_BOND_TO_SELF: phf::Set<&str> = phf_set! { | |
1577 | "C", "O", "N", "S", "Si", "Ge", "Sn", "Pb" | |
1578 | }; | |
1579 | // "C" => &["O", "N", "S"], | |
1580 | // "O" => &["N", "S"], | |
1581 | 105 | if left == right && |
1582 | 44 | return true; |
1583 | 61 | } |
1584 | 61 | return match left { |
1585 | 61 | "C" => |
1586 | 58 | "O" => |
1587 | 57 | "Si" => |
1588 | 57 | _ => false, |
1589 | } | |
1590 | 105 | } |
1591 | ||
1592 | 21 | fn is_legal_triple_bond(left: &str, right: &str) -> bool { |
1593 | // According to https://tinyurl.com/rkynhwj3 (from physics.org) | |
1594 | // triple bonds can be formed between any of B, C, N, and O | |
1595 | // Apparently they can also be forced in other cases, but they are rare. | |
1596 | // 'B' is from studiousguy.com/triple-bond-examples/ | |
1597 | 21 | return (left == "B" || left == "C" || |
1598 | 18 | (right == "B" || right == "C" || |
1599 | 21 | } |
1600 | 874 | } |
1601 | 17.2k | } |
1602 | ||
1603 | /// This assumes canonicalization of characters has happened | |
1604 | 6.85k | fn likely_chem_equation_operator(mathml: Element) -> i32 { |
1605 | ||
1606 | 6.73k | fn is_chem_equation_operator(ch: char) -> bool { |
1607 | 6.73k |
|
1608 | 6.73k | } |
1609 | ||
1610 | 6.85k | let elem_name = name(mathml); |
1611 | 6.85k | if elem_name == "munder" || |
1612 | 86 | let base = as_element(mathml.children()[0]); |
1613 | 86 | if name(base) == "mo" && |
1614 | 1 | base.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1615 | 1 | base.set_attribute_value(CHEM_EQUATION_OPERATOR, "1"); |
1616 | 1 | return 1; |
1617 | 85 | } else if elem_name == "mover" && |
1618 | 9 | return 2; |
1619 | } else { | |
1620 | 76 | return NOT_CHEMISTRY; |
1621 | } | |
1622 | 6.76k | } |
1623 | ||
1624 | 6.76k | if name(mathml) == "mo" { |
1625 | 6.76k | let text = as_text(mathml); |
1626 | 6.76k | if is_single_char_matching(text, is_chem_equation_operator) || |
1627 | 1.96k | mathml.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1628 | 1.96k | mathml.set_attribute_value(CHEM_EQUATION_OPERATOR, "1"); |
1629 | 1.96k | return 1; |
1630 | 4.79k | } else if text == "\u{2062}" || text == "\u{2063}" { |
1631 | // FIX: the invisible operator between elements should be well-defined, but this likely needs work, so both accepted for now | |
1632 | 0 | return 0; |
1633 | 4.79k | } |
1634 | 0 | } |
1635 | 4.79k | return -3; // there is still a chance |
1636 | ||
1637 | /// Detects output of mhchem for some equilibrium arrows that currently (11/22) don't have Unicode points | |
1638 | /// See github.com/NSoiffer/MathCAT/issues/60 for the patterns being matched | |
1639 | 20 | fn is_hack_for_missing_arrows(mover: Element) -> bool { |
1640 | 20 | assert_eq!(name(mover), "mover"); |
1641 | 20 | let children = mover.children(); |
1642 | 20 | let base = as_element(children[0]); |
1643 | 20 | let mo_base = if name(base) == "mrow" && |
1644 | 9 | as_element(base.children()[0]) |
1645 | } else { | |
1646 | 11 | base |
1647 | }; | |
1648 | 20 | let upper = as_element(children[1]); |
1649 | 20 | let mo_upper = if name(upper) == "mrow" && |
1650 | 9 | as_element(upper.children()[1]) |
1651 | } else { | |
1652 | 11 | upper |
1653 | }; | |
1654 | // slightly sloppy match, but almost certainly good enough | |
1655 | 20 | return name(mo_base) == "mo" && |
1656 | 9 | as_text(mo_base) == "↽" && as_text(mo_upper) == "⇀"; |
1657 | 20 | } |
1658 | 6.85k | } |
1659 | ||
1660 | 38 | fn is_equilibrium_constant(mut mathml: Element) -> bool { |
1661 | 38 | if name(mathml) == "msub" { |
1662 | 27 | mathml = as_element(mathml.children()[0]); |
1663 | 27 |
|
1664 | ||
1665 | 38 | return name(mathml) == "mi" && |
1666 | 38 | } |
1667 | ||
1668 | // Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. | |
1669 | // All instances seem to be upper case that I've seen. | |
1670 | 3 | static SMALL_UPPER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap()); |
1671 | ||
1672 | /// look for "(s), "(l)", "(g)", "(aq)" (could also use [...]) | |
1673 | /// this might be called before canonicalization, but in clean_chemistry_mrow, we made sure "( xxx )" is grouped properly | |
1674 | 3.68k | pub fn likely_chem_state(mathml: Element) -> i32 { |
1675 | ||
1676 | 3.68k | if IsBracketed::is_bracketed(mathml, "(", ")", false, false) || |
1677 | 3.30k | IsBracketed::is_bracketed(mathml, "[", "]", false, false) { |
1678 | 438 | let contents = as_element(mathml.children()[1]); |
1679 | 438 | let contents_name = name(contents); |
1680 | 438 | if contents_name == "mi" || |
1681 | 109 | let text = as_text(contents); |
1682 | 109 | if text == "s" || |
1683 | 67 | return text.len() as i32 + 1; // hack to count chars -- works because all are ASCII |
1684 | 42 | }; |
1685 | 329 | } |
1686 | 3.24k | } |
1687 | 3.61k | return NOT_CHEMISTRY; |
1688 | 3.68k | } |
1689 | ||
1690 | /// Returns the likelihood that the arg is an element | |
1691 | 16.4k | pub fn likely_chem_element(mathml: Element) -> i32 { |
1692 | static NUCLEAR_SYMBOLS: [&str; 6] = ["e", "p", "n", "α", "β","γ"]; | |
1693 | ||
1694 | 16.4k | assert!(name(mathml) == "mi" || |
1695 | 16.4k | let text = as_text(mathml); |
1696 | 16.4k | if as_text(mathml).trim().is_empty() { |
1697 | 782 | return 0; // whitespace |
1698 | 15.6k | } else if is_chemical_element(mathml) { |
1699 | // single letter = 1; single letter with mathvariant="normal" = 2; double = 3 -- all elements are ASCII | |
1700 | 2.21k | return if text.len() == 1 { |
1701 | 1.90k | if mathml.attribute_value("mathvariant").unwrap_or_default() == "normal" { |
1702 | } else { | |
1703 | 311 | 3 |
1704 | }; | |
1705 | 13.4k | } else if NUCLEAR_SYMBOLS.contains(&text) { |
1706 | 659 | return 0; |
1707 | // not much special about them; | |
1708 | } else { | |
1709 | 12.7k | return NOT_CHEMISTRY; |
1710 | } | |
1711 | 16.4k | } |
1712 | ||
1713 | static SHORT_SINGLE_LETTER_ELEMENT_FORMULAE: phf::Set<&str> = phf_set! { | |
1714 | // from en.wikipedia.org/wiki/Glossary_of_chemical_formulae (via chem_formula_from_wikipedia.py) | |
1715 | "BF_3", "BI_3", "BN", "BP", "B_2F_4", "B_2H_6", "B_2O_3", "B_2S_3", "B_4C", | |
1716 | "CB_4", "CF_4", "CH_2", "CH_4", "CO", "CO_2", "CO_3", "CS_2", "CW", "C_2F_4", | |
1717 | "C_2H_4", "C_2H_6", "C_2U", "C_2Y", "C_3H_4", "C_3H_6", "C_3H_8", "C_4H_2", | |
1718 | "C_4H_8", "C_4I_2", "C_6H_6", "C_6N_4", "C_7H_8", "C_8H_8", "DI", "D_2O", | |
1719 | "FI", "FI_2", "FK", "FN", "FO", "FO_2", "FP", "FS", "FW", "FY", "F_2", | |
1720 | "F_2N", "F_2O", "F_2O_2", "F_2P", "F_2S", "F_2S_2", "F_2W", "F_2Y", "F_3B", | |
1721 | "F_3P", "F_3S", "F_3W", "F_3Y", "F_4B_2", "F_4C", "F_4C_2", "F_4N_2", | |
1722 | "F_4S", "F_4U", "F_4W", "F_5I", "F_5P", "F_5S", "F_5U", "F_5W", "F_6S", | |
1723 | "F_6W", "F_7I", "HF", "HI", "HK", "HN_3", "H_2", "H_2C", "H_2C_2", "H_2C_4", | |
1724 | "H_2O", "H_2O_2", "H_2S", "H_3N", "H_3P", "H_4C", "H_4C_2", "H_4C_3", | |
1725 | "H_4N_2", "H_4N_4", "H_6B_2", "H_6C_2", "H_6C_3", "H_6C_6", "H_8C_3", | |
1726 | "H_8C_7", "H_8C_8", "ID", "IF", "IF_5", "IF_7", "IH", "IK", "IO_3", "I_2", | |
1727 | "I_2F", "I_2O_5", "I_2W", "I_3B", "I_3N", "I_3U", "I_3V", "I_4P_2", "I_4W", | |
1728 | "KH", "KI", "K_2F_2", "K_2O", "K_2O_2", "K_2S", "NB", "NF", "NF_2", "NF_3", | |
1729 | "NI_3", "NO", "NO_2", "NU", "NV", "N_2", "N_2F_4", "N_2H_2", "N_2H_4", | |
1730 | "N_2O_3", "N_2O_4", "N_2O_5", "N_3H", "N_4C_6", "N_4H_4", "N_5P_3", "O", | |
1731 | "OD_2", "OF", "OF_2", "OH_2", "OK_2", "ON", "ON_2", "OT_2", "O_2", "O_2C", | |
1732 | "O_2F_2", "O_2H_2", "O_2K_2", "O_2N", "O_2S", "O_2U", "O_2W", "O_3", | |
1733 | "O_3C", "O_3I", "O_3N_2", "O_3S", "O_3U", "O_3V_2", "O_3W", "O_3Y_2", | |
1734 | "O_5I_2", "O_5N_2", "O_5P_2", "O_5V_2", "O_8U_3", "PB", "PF", "PF_2", "PF_3", | |
1735 | "PH_3", "PY", "P_2F_4", "P_2I_4", "P_2O_5", "P_2S_3", "P_3N_5", "SF", "SF_2", | |
1736 | "SF_4", "SF_5", "SF_6", "SH_2", "SK_2", "SO_2", "SO_3", "S_2C", "S_2F_2", | |
1737 | "S_2W", "S_3B_2", "S_3P_2", "S_3W", "S_3Y_2", "T_2O", "UC_2", "UF_4", "UF_5", | |
1738 | "UI_3", "UN", "UO_2", "UO_3", "US_2", "U_3O_8", "VI_3", "VN", "V_2O_3", | |
1739 | "WC", "WF", "WF_2", "WF_3", "WF_4", "WF_5", "WF_6", "WI_2", "WI_4", "WO_2", | |
1740 | "WS_2", "WS_3", "YB_6", "YC_2", "YF", "YF_2", "YF_3", "YP", "Y_2O_3", | |
1741 | ||
1742 | // from en.wikipedia.org/wiki/Ion#Common_ions (via chem_formula_from_wikipedia.py) | |
1743 | "CH_3COO^−", "CN^−", "CO_3^2−", "C^−", "C_2O_4^2−", "F^−", "HCOO^−", | |
1744 | "HPO_4^2−", "HSO_3^−", "HSO_4^−", "H^+", "H^−", "H_2PO_4^−", "H_3O^+", "I^−", | |
1745 | "NH_4^+", "NO_2^−", "NO_3^−", "N^3−", "N_3^−", "OH^−", "O^2−", "O_2^2−", | |
1746 | "PO_4^3−", "P^3−", "SO_3^2−", "SO_4^2−", "S^2−", "S_2O_3^2−", | |
1747 | ||
1748 | // from gchem.cm.utexas.edu/canvas.php?target=bonding/ionic/polyatomic-ions.html | |
1749 | "PO_3^3−", "IO_3^−", | |
1750 | ||
1751 | // others | |
1752 | "CH_3", /* methyl */ | |
1753 | "NH_3", // ammonium | |
1754 | }; | |
1755 | ||
1756 | /// Returns true if the formula is composed of 1 or 2 single letter elements and it matches a known compound/ion | |
1757 | /// This might be called (via likely_adorned_chem_formula) unparsed | |
1758 | 387 | fn is_short_formula(mrow: Element) -> bool { |
1759 | 387 | assert_eq!(name(mrow), "mrow"); |
1760 | 387 | let children = mrow.children(); |
1761 | 387 | let n_children = children.len(); |
1762 | 387 | if n_children == 0 || n_children > 3 || ( |
1763 | 12 | return false; |
1764 | 375 | } |
1765 | ||
1766 | 375 | let first_element = convert_to_short_form( as_element(children[0]) ); |
1767 | 375 | if n_children == 1 { |
1768 | 2 | return first_element.is_ok(); |
1769 | 373 | } |
1770 | 373 | let second_element = convert_to_short_form( as_element(children[if n_children == 2 { |
1771 | 373 | return match (first_element, second_element) { |
1772 | 365 | (Ok(first), Ok(second)) => { |
1773 | 365 | let short_form = first + second.as_str(); |
1774 | // debug!("short_form: {}", short_form); | |
1775 | 365 | return SHORT_SINGLE_LETTER_ELEMENT_FORMULAE.contains(&short_form); |
1776 | }, | |
1777 | 8 | _ => false, |
1778 | } | |
1779 | 387 | } |
1780 | ||
1781 | 931 | fn convert_to_short_form(mathml: Element) -> Result<String> { |
1782 | 931 | let mathml_name = name(mathml); |
1783 | 931 | return match mathml_name { |
1784 | 931 | "mi" | |
1785 | 95 | "none" => |
1786 | 95 | "msub" | |
1787 | 86 | let is_mmultiscripts = mathml_name == "mmultiscripts"; |
1788 | 86 | let children = mathml.children(); |
1789 | 86 | let mut result = convert_to_short_form(as_element(children[0])) |
1790 | 86 | if is_mmultiscripts && |
1791 | 0 | bail!("mmultiscripts found with {} children -- not part of chemical formula", children.len()); |
1792 | 86 | } |
1793 | 86 | if mathml_name == "msub" || |
1794 | 83 | result += "_"; |
1795 | 83 | result += &convert_to_short_form(as_element(children[1])) |
1796 | 3 | } |
1797 | 85 | if mathml_name == "msup" || |
1798 | 3 | result += "^"; |
1799 | 3 | result += &convert_to_short_form(as_element(children[if mathml_name=="msup" {1} else { |
1800 | 82 | } |
1801 | 85 | Ok( result ) |
1802 | }, | |
1803 | 9 | "mrow" => { |
1804 | // the only time this is valid is if the superscript is something like "+" or "2+", so we do a few checks and short circuit false now | |
1805 | 9 | let mrow_children = mathml.children(); |
1806 | 9 | if mrow_children.len() == 1 || mrow_children.len() == 2 { |
1807 | 0 | let mut result = convert_to_short_form(as_element(mrow_children[0]))?; |
1808 | 0 | if mrow_children.len() == 2 { |
1809 | 0 | result += &convert_to_short_form(as_element(mrow_children[1]))?; |
1810 | 0 | } |
1811 | 0 | return Ok(result) |
1812 | } else { | |
1813 | 9 | bail!("mrow found with {} children -- not part of chemical formula", mrow_children.len()); |
1814 | } | |
1815 | } | |
1816 | 0 | _ => bail!("{} found -- not part of chemical formula", mathml_name), |
1817 | } | |
1818 | 931 | } |
1819 | ||
1820 | /// A map of chemical elements and their relative IUPAC electronegativity (https://i.stack.imgur.com/VCSzW.png) | |
1821 | /// That list uses a horizontal line for the Lanthanide and Actinide Series. | |
1822 | /// Because I had already ordered the elements before realizing that, I opened a gap and started the higher ones again with a '1' in front. | |
1823 | /// The list is missing recent (unstable) elements -- I added them with the same value as the element above them in the periodic table. | |
1824 | static CHEMICAL_ELEMENT_ELECTRONEGATIVITY: phf::Map<&str, u32> = phf_map! { | |
1825 | "Ac" => 40, "Ag" => 155, "Al" => 163, "Am" => 29, "Ar" => 4, "As" => 172, "At" => 181, "Au" => 154, | |
1826 | "B" => 164, "Ba" => 14, "Be" => 18, "Bh" => 137, "Bi" => 170, "Bk" => 27, "Br" => 183, | |
1827 | "C" => 169, "Ca" => 16, "Cd" => 158, "Ce" => 56, "Cf" => 26, "Cl" => 184, "Cm" => 28, "Cn" => 157, "Co" => 148, "Cr" => 136, "Cs" => 8, "Cu" => 156, | |
1828 | "Db" => 129, "Ds" => 149, "Dy" => 48, | |
1829 | "Er" => 46, "Es" => 25, "Eu" => 51, "F" => 185, "Fe" => 144, "Fl" => 165, "Fm" => 24, "Fr" => 7, "Ga" => 162, "Gd" => 50, "Ge" => 167, | |
1830 | "H" => 175, "He" => 6, "Hf" => 126, "Hg" => 157, "Ho" => 47, "Hs" => 141, "I" => 182, "In" => 161, "Ir" => 146, "K" => 10, "Kr" => 3, | |
1831 | "La" => 62, "Li" => 12, "Lr" => 19, "Lu" => 41, "Lv" => 176, "Mc" => 170, "Md" => 23, "Mg" => 17, "Mn" => 140, "Mo" => 135, "Mt" => 145, | |
1832 | "N" => 174, "Na" => 11, "Nb" => 131, "Nd" => 54, "Ne" => 5, "Nh" => 160, "Ni" => 152, "No" => 22, "Np" => 31, "O" => 180, "Og" => 1, "Os" => 142, | |
1833 | "P" => 173, "Pa" => 33, "Pb" => 165, "Pd" => 151, "Pm" => 53, "Po" => 176, "Pr" => 55, "Pt" => 150, "Pu" => 30, | |
1834 | "Ra" => 13, "Rb" => 9, "Re" => 138, "Rf" => 125, "Rg" => 153, "Rh" => 147, "Rn" => 1, "Ru" => 143, | |
1835 | "S" => 179, "Sb" => 171, "Sc" => 124, "Se" => 178, "Sg" => 133, "Si" => 168, "Sm" => 52, "Sn" => 166, "Sr" => 15, | |
1836 | "Ta" => 130, "Tb" => 49, "Tc" => 139, "Te" => 177, "Th" => 34, "Ti" => 128, "Tl" => 160, "Tm" => 45, "Ts" => 181, | |
1837 | "U" => 32, "V" => 132, "W" => 134, "Xe" => 2, "Y" => 123, "Yb" => 44, "Zn" => 159, "Zr" => 127, | |
1838 | // The following come from E.A. Moore who said to treat them like chemicals | |
1839 | // These stand for methyl, ethyl, alkyl, acetyl and phenyl and apparently are quite commonly used ("Ac" is already a chemical) | |
1840 | // A full(er?) list is at en.wikipedia.org/wiki/Skeletal_formula#Alkyl_groups and in following sections | |
1841 | "Me" => 0, "Et" => 0, "R" => 0, /* "Ac" => 0, */ "Ph" => 0, | |
1842 | "X" => 0, /* treated as an unknown */ | |
1843 | }; | |
1844 | ||
1845 | // A map of the chemical elements and their atomic numbers | |
1846 | static CHEMICAL_ELEMENT_ATOMIC_NUMBER: phf::Map<&str, u32> = phf_map! { | |
1847 | "H" => 1, "He" => 2, "Li" => 3, "Be" => 4, "B" => 5, "C" => 6, "N" => 7, "O" => 8, "F" => 9, "Ne" => 10, | |
1848 | "Na" => 11, "Mg" => 12, "Al" => 13, "Si" => 14, "P" => 15, "S" => 16, "Cl" => 17, "Ar" => 18, "K" => 19, "Ca" => 20, | |
1849 | "Sc" => 21, "Ti" => 22, "V" => 23, "Cr" => 24, "Mn" => 25, "Fe" => 26, "Co" => 27, "Ni" => 28, "Cu" => 29, "Zn" => 30, | |
1850 | "Ga" => 31, "Ge" => 32, "As" => 33, "Se" => 34, "Br" => 35, "Kr" => 36, "Rb" => 37, "Sr" => 38, "Y" => 39, "Zr" => 40, | |
1851 | "Nb" => 41, "Mo" => 42, "Tc" => 43, "Ru" => 44, "Rh" => 45, "Pd" => 46, "Ag" => 47, "Cd" => 48, "In" => 49, "Sn" => 50, | |
1852 | "Sb" => 51, "Te" => 52, "I" => 53, "Xe" => 54, "Cs" => 55, "Ba" => 56, "La" => 57, "Ce" => 58, "Pr" => 59, "Nd" => 60, | |
1853 | "Pm" => 61, "Sm" => 62, "Eu" => 63, "Gd" => 64, "Tb" => 65, "Dy" => 66, "Ho" => 67, "Er" => 68, "Tm" => 69, "Yb" => 70, | |
1854 | "Lu" => 71, "Hf" => 72, "Ta" => 73, "W" => 74, "Re" => 75, "Os" => 76, "Ir" => 77, "Pt" => 78, "Au" => 79, "Hg" => 80, | |
1855 | "Tl" => 81, "Pb" => 82, "Bi" => 83, "Po" => 84, "At" => 85, "Rn" => 86, "Fr" => 87, "Ra" => 88, "Ac" => 89, "Th" => 90, | |
1856 | "Pa" => 91, "U" => 92, "Np" => 93, "Pu" => 94, "Am" => 95, "Cm" => 96, "Bk" => 97, "Cf" => 98, "Es" => 99, "Fm" => 100, | |
1857 | "Md" => 101, "No" => 102, "Lr" => 103, "Rf" => 104, "Db" => 105, "Sg" => 106, "Bh" => 107, "Hs" => 108, "Mt" => 109, "Ds" => 110, | |
1858 | "Rg" => 111, "Cn" => 112, "Nh" => 113, "Fl" => 114, "Mc" => 115, "Lv" => 116, "Ts" => 117, "Og" => 118, | |
1859 | }; | |
1860 | ||
1861 | 26.9k | pub fn is_chemical_element(node: Element) -> bool { |
1862 | // FIX: allow name to be in an mrow (e.g., <mi>N</mi><mi>a</mi> | |
1863 | 26.9k | let name = name(node); |
1864 | 26.9k | if name != "mi" && |
1865 | 71 | return false; |
1866 | 26.9k | } |
1867 | ||
1868 | 26.9k | let text = as_text(node); |
1869 | 26.9k | return CHEMICAL_ELEMENT_ELECTRONEGATIVITY.contains_key(text) || |
1870 | 23.0k | has_chem_intent(node, "chemical-element") || |
1871 | 23.0k | has_inherited_property(node, "chemical-formula"); |
1872 | 26.9k | } |
1873 | ||
1874 | ||
1875 | #[cfg(test)] | |
1876 | mod chem_tests { | |
1877 | ||
1878 | ||
1879 | #[allow(unused_imports)] | |
1880 | use super::super::init_logger; | |
1881 | use super::super::are_strs_canonically_equal; | |
1882 | use super::*; | |
1883 | ||
1884 | 40 | fn parse_mathml_string<F>(test: &str, test_mathml: F) -> bool |
1885 | 40 | where F: Fn(Element) -> bool { |
1886 | use sxd_document::parser; | |
1887 | use crate::interface::{get_element, trim_element}; | |
1888 | ||
1889 | ||
1890 | 40 | let test = if test.starts_with("<math") { |
1891 | 40 | let new_package = parser::parse(test); |
1892 | 40 | if let Err( |
1893 | 0 | panic!("Invalid MathML input:\n{}\nError is: {}", &test, &e.to_string()); |
1894 | 40 | } |
1895 | ||
1896 | 40 | let new_package = new_package.unwrap(); |
1897 | 40 | let mut mathml = get_element(&new_package); |
1898 | 40 | trim_element(mathml, false); |
1899 | 40 | mathml = as_element(mathml.children()[0]); |
1900 | 40 | return test_mathml(mathml); |
1901 | 40 | } |
1902 | ||
1903 | #[test] | |
1904 | 1 | fn test_noble_element() { |
1905 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1906 | 1 | let test = "<mrow> <mi>Na</mi> <mo>⁣</mo> <mi>Cl</mi> </mrow>"; // |
1907 | 1 | assert!( !parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1908 | 1 | let test = "<mrow> <mi>Ar</mi> <mo>⁣</mo> <mi>Cl</mi> </mrow>"; // |
1909 | 1 | assert!( parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1910 | 1 | let test = "<mrow> <mi>Ne</mi> </mrow>"; // |
1911 | 1 | assert!( parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1912 | 1 | } |
1913 | ||
1914 | #[test] | |
1915 | 1 | fn test_alphabetical_order() { |
1916 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1917 | 1 | let test = r#"<mrow> |
1918 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1919 | 1 | <msub><mi>H</mi><mn>14</mn></msub> |
1920 | 1 | </mrow>"#; |
1921 | 1 | assert!( parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1922 | 1 | let test = r#"<mrow> |
1923 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1924 | 1 | <msub><mi>H</mi><mn>12</mn></msub><mo>⁣</mo> |
1925 | 1 | <msub><mi>O</mi><mn>6</mn></msub> |
1926 | 1 | </mrow>"#; |
1927 | 1 | assert!( parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1928 | 1 | let test = "<mrow> <mi>B</mi> <mo>⁣</mo> <mi>C</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; // "C" should be first |
1929 | 1 | assert!( !parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1930 | 1 | let test = "<mrow> <mi>P</mi> <mo>⁣</mo> <mi>B</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; // not alphabetical |
1931 | 1 | assert!( !parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1932 | 1 | } |
1933 | ||
1934 | #[test] | |
1935 | 1 | fn test_is_structural() { |
1936 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1937 | 1 | let test = r#"<mrow> |
1938 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1939 | 1 | <msub><mi>H</mi><mn>14</mn></msub> |
1940 | 1 | </mrow>"#; |
1941 | 1 | assert!( !parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1942 | 1 | let test = "<mrow> <mi>B</mi> <mo>⁣</mo> <mi>C</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; |
1943 | 1 | assert!( !parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1944 | 1 | let test = "<mrow> <mi>H</mi> <mo>⁣</mo> <mi>O</mi> <mo>⁣</mo> <mi>H</mi></mrow>"; |
1945 | 1 | assert!( parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1946 | 1 | let test = "<mrow data-chem-formula='9'> |
1947 | 1 | <mmultiscripts data-chem-formula='1'> |
1948 | 1 | <mi mathvariant='normal' data-chem-element='1'>H</mi> |
1949 | 1 | <mn>2</mn> |
1950 | 1 | <none></none> |
1951 | 1 | </mmultiscripts> |
1952 | 1 | <mo data-changed='added'>⁣</mo> |
1953 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
1954 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>=</mo> |
1955 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
1956 | 1 | <mo data-changed='added'>⁣</mo> |
1957 | 1 | <mmultiscripts data-chem-formula='1'> |
1958 | 1 | <mi mathvariant='normal' data-chem-element='1'>H</mi> |
1959 | 1 | <mn>2</mn> |
1960 | 1 | <none></none> |
1961 | 1 | </mmultiscripts> |
1962 | 1 | </mrow>"; |
1963 | 1 | assert!( parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1964 | 1 | } |
1965 | ||
1966 | ||
1967 | #[test] | |
1968 | 1 | fn test_electronegativity_order() { |
1969 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1970 | 1 | let test = r#"<mrow> |
1971 | 1 | <mi>N</mi><mo>⁣</mo> |
1972 | 1 | <msub><mi>H</mi><mn>3</mn></msub> |
1973 | 1 | </mrow>"#; |
1974 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1975 | 1 | let test = r#"<mrow> |
1976 | 1 | <mi>O</mi><mo>⁣</mo> |
1977 | 1 | <msub><mi>F</mi><mn>2</mn></msub> |
1978 | 1 | </mrow>"#; |
1979 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1980 | 1 | let test = r#"<mrow> |
1981 | 1 | <msub><mi>Rb</mi><mn>15</mn></msub><mo>⁣</mo> |
1982 | 1 | <msub><mi>Hg</mi><mn>16</mn></msub> |
1983 | 1 | </mrow>"#; |
1984 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1985 | 1 | let test = r#" |
1986 | 1 | <mrow><msup> |
1987 | 1 | <mo>[</mo> |
1988 | 1 | <mi>Si</mi><mo>⁣</mo> |
1989 | 1 | <msub><mi>As</mi><mn>4</mn></msub> |
1990 | 1 | <mo>]</mo> |
1991 | 1 | <mrow><mn>8</mn><mo>-</mo></mrow> |
1992 | 1 | </msup></mrow>"#; |
1993 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(as_element(mathml.children()[0])).unwrap() )) ); |
1994 | 1 | let test = r#"<mrow> |
1995 | 1 | <mi>Si</mi><mo>⁣</mo> |
1996 | 1 | <msub><mi>H</mi><mn>2</mn></msub> |
1997 | 1 | <mi>Br</mi><mo>⁣</mo> |
1998 | 1 | <mi>Cl</mi> |
1999 | 1 | </mrow>"#; |
2000 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
2001 | 1 | } |
2002 | ||
2003 | #[test] | |
2004 | 1 | fn test_order() { |
2005 | 1 | let test = r#"<mrow> |
2006 | 1 | <msub><mi>C</mi><mn>2</mn></msub><mo>⁣</mo> |
2007 | 1 | <msub><mi>H</mi><mn>4</mn></msub><mo>⁣</mo> |
2008 | 1 | <msub><mrow> <mo>(</mo><mi>N</mi> <mo>⁣</mo> <msub> <mi>H</mi> <mn>2</mn> </msub><mo>)</mo> </mrow><mn>2</mn></msub> |
2009 | 1 | </mrow>"#; |
2010 | 1 | assert!( parse_mathml_string(test, is_order_ok) ); |
2011 | 1 | let test = r#"<mrow> |
2012 | 1 | <mi>Fe</mi><mo>⁣</mo> |
2013 | 1 | <mi>O</mi><mo>⁣</mo> |
2014 | 1 | <mrow> <mo>(</mo><mrow><mi>O</mi> <mo>⁣</mo><mi>H</mi> </mrow><mo>)</mo> </mrow> |
2015 | 1 | </mrow>"#; |
2016 | 1 | assert!( parse_mathml_string(test, is_order_ok) ); |
2017 | 1 | let test = r#"<mrow> // R-4.4.3.3 -- Chain compound doesn't fit rules but should be accepted |
2018 | 1 | <mi>Br</mi><mo>⁣</mo> |
2019 | 1 | <mi>S</mi><mo>⁣</mo> |
2020 | 1 | <mi>C</mi><mo>⁣</mo> |
2021 | 1 | <mi>N</mi> |
2022 | 1 | </mrow>"#; |
2023 | 1 | assert!( parse_mathml_string(test, |mathml| likely_chem_formula(mathml)==5) ); |
2024 | 1 | } |
2025 | ||
2026 | #[test] | |
2027 | 1 | fn test_simple_double_bond() { |
2028 | 1 | let test1 = r#"<mrow><mi>C</mi><mo>=</mo><mi>C</mi></mrow>"#; |
2029 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2030 | 1 | let test2 = r#"<mrow><mi>C</mi><mo>∷</mo><mi>O</mi></mrow>"#; |
2031 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==CHEMISTRY_THRESHOLD) ); |
2032 | 1 | let test3 = r#"<mrow><mi>N</mi><mo>=</mo><mi>N</mi></mrow>"#; |
2033 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2034 | 1 | let test4 = r#"<mrow><mi>Sn</mi><mo>=</mo><mi>Sn</mi></mrow>"#; |
2035 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml) == 8) ); |
2036 | 1 | let test5 = r#"<mrow><mi>O</mi><mo>=</mo><mi>S</mi></mrow>"#; |
2037 | 1 | assert!( parse_mathml_string(test5, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2038 | 1 | let test10 = r#"<mrow><mi>K</mi><mo>=</mo><mi>K</mi></mrow>"#; |
2039 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml) == NOT_CHEMISTRY) ); |
2040 | 1 | let test11 = r#"<mrow><mi>C</mi><mo>=</mo><mi>K</mi></mrow>"#; |
2041 | 1 | assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml) == NOT_CHEMISTRY) ); |
2042 | 1 | } |
2043 | ||
2044 | #[test] | |
2045 | 1 | fn test_double_bond() { |
2046 | 1 | let test1 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mo>=</mo><mi>C</mi></mrow>"#; |
2047 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml)==8) ); |
2048 | 1 | let test2 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mo>=</mo> |
2049 | 1 | <mi>C</mi><mi>H</mi><mi>R</mi></mrow>"#; |
2050 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==12) ); |
2051 | 1 | let test3 = r#"<mrow><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>C</mi><mo>=</mo> |
2052 | 1 | <mi>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub></mrow>"#; |
2053 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml)==11) ); |
2054 | 1 | let test4 = r#"<mrow><mi>H</mi><mo>-</mo><mi>N</mi><mo>=</mo><mi>N</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2055 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml)==10) ); |
2056 | 1 | let test10 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>3</mn></msub><mo>=</mo><mi>C</mi></mrow>"#; |
2057 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2058 | 1 | } |
2059 | ||
2060 | #[test] | |
2061 | #[ignore] // It would be good to say "not chemistry" for this, but there aren't rules for that at the moment | |
2062 | 0 | fn test_water_bond() { |
2063 | 0 | let test11 = r#"<mrow><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>O</mi><mo>=</mo><mi>O</mi></mrow>"#; |
2064 | 0 | assert!( parse_mathml_string(test11, |mathml| {println!("val={}", likely_chem_formula(mathml)); likely_chem_formula(mathml)==8}) ); |
2065 | // assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); | |
2066 | 0 | } |
2067 | ||
2068 | ||
2069 | #[test] | |
2070 | 1 | fn test_triple_bond() { |
2071 | 1 | let test1 = r#"<mrow><mi>C</mi><mo>≡</mo><mi>C</mi></mrow>"#; |
2072 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); |
2073 | 1 | let test2 = r#"<mrow><mi>C</mi><mo>:::</mo><mi>O</mi></mrow>"#; |
2074 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==CHEMISTRY_THRESHOLD) ); |
2075 | 1 | let test3 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2076 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml)==10) ); |
2077 | 1 | let test4 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2078 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml)==10) ); |
2079 | 1 | let test5 = r#"<mrow><mi>N</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>N</mi></mrow>"#; |
2080 | 1 | assert!( parse_mathml_string(test5, |mathml| likely_chem_formula(mathml)==10) ); |
2081 | 1 | let test6 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo> |
2082 | 1 | <mi>C</mi><mo>-</mo><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>3</mn></msub></mrow>"#; // 1-Propyne |
2083 | 1 | assert!( parse_mathml_string(test6, |mathml| likely_chem_formula(mathml)==14) ); |
2084 | // assert!( parse_mathml_string(test6, |mathml| {println!("val={}", likely_chem_formula(mathml)); likely_chem_formula(mathml)==10}) ); | |
2085 | 1 | let test10 = r#"<mrow><mi>O</mi><mo>:::</mo><mi>S</mi></mrow>"#; |
2086 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2087 | 1 | let test11 = r#"<mrow><mi>Pb</mi><mo>≡</mo><mi>Pb</mi></mrow>"#; |
2088 | 1 | assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2089 | 1 | let test12 = r#"<mrow><mi>C</mi><mo>≡</mo><mi>K</mi></mrow>"#; |
2090 | 1 | assert!( parse_mathml_string(test12, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2091 | 1 | } |
2092 | ||
2093 | #[test] | |
2094 | 1 | fn split_mi() { |
2095 | 1 | let test = "<math><mi>LiF</mi></math>"; |
2096 | 1 | let target = "<math> |
2097 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2098 | 1 | <mi data-chem-element='3'>Li</mi> |
2099 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2100 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>F</mi> |
2101 | 1 | </mrow> |
2102 | 1 | </math>"; |
2103 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2104 | 1 | } |
2105 | ||
2106 | #[test] | |
2107 | 1 | fn no_split_mi() { |
2108 | 1 | let test = "<math><mi>HC</mi></math>"; |
2109 | 1 | let target = "<math> |
2110 | 1 | <mi>HC</mi> |
2111 | 1 | </math>"; |
2112 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2113 | 1 | } |
2114 | ||
2115 | #[test] | |
2116 | 1 | fn combine_mi() { |
2117 | 1 | let test = "<math><mi>H</mi><mi>C</mi><mi>l</mi></math>"; |
2118 | 1 | let target = " <math> |
2119 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2120 | 1 | <mi data-chem-element='1'>H</mi> |
2121 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2122 | 1 | <mi data-merged='true' data-chem-element='3'>Cl</mi> |
2123 | 1 | </mrow> |
2124 | 1 | </math>"; |
2125 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2126 | 1 | } |
2127 | ||
2128 | #[test] | |
2129 | 1 | fn no_combine() { |
2130 | 1 | let test = "<math><mi>C</mi><mi>l</mi></math>"; |
2131 | 1 | let target = "<math> |
2132 | 1 | <mrow data-changed='added'> |
2133 | 1 | <mi>C</mi> |
2134 | 1 | <mo data-changed='added'>⁢</mo> |
2135 | 1 | <mi>l</mi> |
2136 | 1 | </mrow> |
2137 | 1 | </math>"; |
2138 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2139 | 1 | } |
2140 | ||
2141 | #[test] | |
2142 | 1 | fn add_script() { |
2143 | 1 | let test = "<math> <mi>SO</mi> <msub> <mrow></mrow> <mn>2</mn> </msub> </math>"; |
2144 | 1 | let target = "<math> |
2145 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2146 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2147 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2148 | 1 | <mmultiscripts data-chem-formula='2'> |
2149 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2150 | 1 | <mn>2</mn> |
2151 | 1 | <none></none> |
2152 | 1 | </mmultiscripts> |
2153 | 1 | </mrow> |
2154 | 1 | </math>"; |
2155 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2156 | 1 | } |
2157 | ||
2158 | #[test] | |
2159 | 1 | fn add_script_bug_287() { |
2160 | 1 | let test = r#"<math><mrow> |
2161 | 1 | <msubsup> |
2162 | 1 | <mrow><mi mathvariant="normal">SO</mi></mrow> |
2163 | 1 | <mn>4</mn> |
2164 | 1 | <mrow><mn>2</mn><mo>−</mo></mrow> |
2165 | 1 | </msubsup> |
2166 | 1 | </mrow></math>"#; |
2167 | 1 | let target = r#"<math> |
2168 | 1 | <mrow data-changed='added' data-chem-formula='7'> |
2169 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2170 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2171 | 1 | <msubsup data-chem-formula='5'> |
2172 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2173 | 1 | <mn>4</mn> |
2174 | 1 | <mrow data-chem-formula='3'><mn>2</mn><mo>-</mo></mrow> |
2175 | 1 | </msubsup> |
2176 | 1 | </mrow> |
2177 | 1 | </math>"#; |
2178 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2179 | 1 | } |
2180 | ||
2181 | #[test] | |
2182 | 1 | fn salt() { |
2183 | 1 | let test = "<math><mi>Na</mi><mi>Cl</mi></math>"; |
2184 | 1 | let target = "<math> |
2185 | 1 | <mrow data-changed='added' data-chem-formula='7'> |
2186 | 1 | <mi data-chem-element='3'>Na</mi> |
2187 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2188 | 1 | <mi data-chem-element='3'>Cl</mi> |
2189 | 1 | </mrow> |
2190 | 1 | </math>"; |
2191 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2192 | 1 | } |
2193 | ||
2194 | #[test] | |
2195 | 1 | fn water() { |
2196 | 1 | let test = "<math><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>O</mi></math>"; |
2197 | 1 | let target = "<math> |
2198 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2199 | 1 | <msub data-chem-formula='2'> |
2200 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2201 | 1 | <mn>2</mn> |
2202 | 1 | </msub> |
2203 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2204 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2205 | 1 | </mrow> |
2206 | 1 | </math>"; |
2207 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2208 | 1 | } |
2209 | ||
2210 | #[test] | |
2211 | 1 | fn mhchem_water() { |
2212 | 1 | let test = "<math> |
2213 | 1 | <mrow> |
2214 | 1 | <mrow> |
2215 | 1 | <mi mathvariant='normal'>H</mi> |
2216 | 1 | </mrow> |
2217 | 1 | <msub> |
2218 | 1 | <mrow> |
2219 | 1 | <mrow> |
2220 | 1 | <mpadded width='0'> |
2221 | 1 | <mphantom> |
2222 | 1 | <mi>A</mi> |
2223 | 1 | </mphantom> |
2224 | 1 | </mpadded> |
2225 | 1 | </mrow> |
2226 | 1 | </mrow> |
2227 | 1 | <mrow> |
2228 | 1 | <mrow> |
2229 | 1 | <mpadded height='0'> |
2230 | 1 | <mn>2</mn> |
2231 | 1 | </mpadded> |
2232 | 1 | </mrow> |
2233 | 1 | </mrow> |
2234 | 1 | </msub> |
2235 | 1 | <mrow> |
2236 | 1 | <mi mathvariant='normal'>O</mi> |
2237 | 1 | </mrow> |
2238 | 1 | </mrow> |
2239 | 1 | </math>"; |
2240 | 1 | let target = "<math> |
2241 | 1 | <mrow data-chem-formula='5'> |
2242 | 1 | <mmultiscripts data-chem-formula='2'> |
2243 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2244 | 1 | <mn>2</mn> |
2245 | 1 | <none></none> |
2246 | 1 | </mmultiscripts> |
2247 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2248 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2249 | 1 | </mrow> |
2250 | 1 | </math>"; |
2251 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2252 | 1 | } |
2253 | ||
2254 | #[test] | |
2255 | 1 | fn carbon() { |
2256 | 1 | let test = "<math><mi>C</mi></math>"; // not enough to trigger recognition |
2257 | 1 | let target = " <math> |
2258 | 1 | <mi>C</mi> |
2259 | 1 | </math>"; |
2260 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2261 | 1 | } |
2262 | ||
2263 | #[test] | |
2264 | 1 | fn sulfate() { |
2265 | 1 | let test = "<math><mrow><msup> |
2266 | 1 | <mrow><mo>[</mo><mi>S</mi><msub><mi>O</mi><mn>4</mn></msub><mo>]</mo></mrow> |
2267 | 1 | <mrow><mn>2</mn><mo>−</mo></mrow> |
2268 | 1 | </msup></mrow></math>"; |
2269 | 1 | let target = "<math> |
2270 | 1 | <msup data-chem-formula='9'> |
2271 | 1 | <mrow data-chem-formula='6'> |
2272 | 1 | <mo>[</mo> |
2273 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2274 | 1 | <mi data-chem-element='1'>S</mi> |
2275 | 1 | <mo data-changed='added'>⁣</mo> |
2276 | 1 | <msub data-chem-formula='1'> |
2277 | 1 | <mi data-chem-element='1'>O</mi> |
2278 | 1 | <mn>4</mn> |
2279 | 1 | </msub> |
2280 | 1 | </mrow> |
2281 | 1 | <mo>]</mo> |
2282 | 1 | </mrow> |
2283 | 1 | <mrow data-chem-formula='3'> |
2284 | 1 | <mn>2</mn> |
2285 | 1 | <mo>-</mo> |
2286 | 1 | </mrow> |
2287 | 1 | </msup> |
2288 | 1 | </math>"; |
2289 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2290 | 1 | } |
2291 | ||
2292 | #[test] | |
2293 | 1 | fn aluminum_sulfate() { |
2294 | 1 | let test = "<math><mrow><msub><mi>Al</mi><mn>2</mn></msub> |
2295 | 1 | <msub><mrow><mo>(</mo><mi>S</mi><msub><mi>O</mi><mn>4</mn></msub><mo>)</mo></mrow><mn>3</mn></msub></mrow></math>"; |
2296 | 1 | let target = " <math> |
2297 | 1 | <mrow data-chem-formula='10'> |
2298 | 1 | <msub data-chem-formula='3'> |
2299 | 1 | <mi data-chem-element='3'>Al</mi> |
2300 | 1 | <mn>2</mn> |
2301 | 1 | </msub> |
2302 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2303 | 1 | <msub data-chem-formula='6'> |
2304 | 1 | <mrow data-chem-formula='6'> |
2305 | 1 | <mo>(</mo> |
2306 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2307 | 1 | <mi data-chem-element='1'>S</mi> |
2308 | 1 | <mo data-changed='added'>⁣</mo> |
2309 | 1 | <msub data-chem-formula='1'> |
2310 | 1 | <mi data-chem-element='1'>O</mi> |
2311 | 1 | <mn>4</mn> |
2312 | 1 | </msub> |
2313 | 1 | </mrow> |
2314 | 1 | <mo>)</mo> |
2315 | 1 | </mrow> |
2316 | 1 | <mn>3</mn> |
2317 | 1 | </msub> |
2318 | 1 | </mrow> |
2319 | 1 | </math>"; |
2320 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2321 | 1 | } |
2322 | ||
2323 | #[test] | |
2324 | 1 | fn ethanol_bonds() { |
2325 | 1 | let test = "<math> |
2326 | 1 | <mrow> |
2327 | 1 | <mi>C</mi> |
2328 | 1 | <msub> <mi>H</mi> <mn>3</mn> </msub> |
2329 | 1 | <mo>−</mo> |
2330 | 1 | <mi>C</mi> |
2331 | 1 | <msub> <mi>H</mi> <mn>2</mn> </msub> |
2332 | 1 | <mo>−</mo> |
2333 | 1 | <mi>O</mi> |
2334 | 1 | <mi>H</mi> |
2335 | 1 | </mrow> |
2336 | 1 | </math>"; |
2337 | 1 | let target = "<math> |
2338 | 1 | <mrow data-chem-formula='13'> |
2339 | 1 | <mi data-chem-element='1'>C</mi> |
2340 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2341 | 1 | <msub data-chem-formula='1'> |
2342 | 1 | <mi data-chem-element='1'>H</mi> |
2343 | 1 | <mn>3</mn> |
2344 | 1 | </msub> |
2345 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>-</mo> |
2346 | 1 | <mi data-chem-element='1'>C</mi> |
2347 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2348 | 1 | <msub data-chem-formula='1'> |
2349 | 1 | <mi data-chem-element='1'>H</mi> |
2350 | 1 | <mn>2</mn> |
2351 | 1 | </msub> |
2352 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>-</mo> |
2353 | 1 | <mi data-chem-element='1'>O</mi> |
2354 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2355 | 1 | <mi data-chem-element='1'>H</mi> |
2356 | 1 | </mrow> |
2357 | 1 | </math>"; |
2358 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2359 | 1 | } |
2360 | ||
2361 | #[test] | |
2362 | 1 | fn dichlorine_hexoxide() { |
2363 | // init_logger(); | |
2364 | 1 | let test = "<math><mrow> |
2365 | 1 | <msup> |
2366 | 1 | <mrow><mo>[</mo><mi>Cl</mi><msub><mi>O</mi><mn>2</mn></msub><mo>]</mo></mrow> |
2367 | 1 | <mo>+</mo> |
2368 | 1 | </msup> |
2369 | 1 | <msup> |
2370 | 1 | <mrow><mo>[</mo><mi>Cl</mi><msub><mi>O</mi><mn>4</mn></msub><mo>]</mo></mrow> |
2371 | 1 | <mo>-</mo> |
2372 | 1 | </msup> |
2373 | 1 | </mrow></math>"; |
2374 | 1 | let target = "<math> |
2375 | 1 | <mrow data-chem-formula='19'> |
2376 | 1 | <msup data-chem-formula='9'> |
2377 | 1 | <mrow data-chem-formula='8'> |
2378 | 1 | <mo>[</mo> |
2379 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2380 | 1 | <mi data-chem-element='3'>Cl</mi> |
2381 | 1 | <mo data-changed='added'>⁣</mo> |
2382 | 1 | <msub data-chem-formula='1'> |
2383 | 1 | <mi data-chem-element='1'>O</mi> |
2384 | 1 | <mn>2</mn> |
2385 | 1 | </msub> |
2386 | 1 | </mrow> |
2387 | 1 | <mo>]</mo> |
2388 | 1 | </mrow> |
2389 | 1 | <mo>+</mo> |
2390 | 1 | </msup> |
2391 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2392 | 1 | <msup data-chem-formula='9'> |
2393 | 1 | <mrow data-chem-formula='8'> |
2394 | 1 | <mo>[</mo> |
2395 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2396 | 1 | <mi data-chem-element='3'>Cl</mi> |
2397 | 1 | <mo data-changed='added'>⁣</mo> |
2398 | 1 | <msub data-chem-formula='1'> |
2399 | 1 | <mi data-chem-element='1'>O</mi> |
2400 | 1 | <mn>4</mn> |
2401 | 1 | </msub> |
2402 | 1 | </mrow> |
2403 | 1 | <mo>]</mo> |
2404 | 1 | </mrow> |
2405 | 1 | <mo>-</mo> |
2406 | 1 | </msup> |
2407 | 1 | </mrow> |
2408 | 1 | </math>"; |
2409 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2410 | 1 | } |
2411 | ||
2412 | #[test] | |
2413 | 1 | fn ethylene_with_bond() { |
2414 | 1 | let test = "<math><mrow> |
2415 | 1 | <msub><mi>H</mi><mn>2</mn></msub><mi>C</mi> |
2416 | 1 | <mo>=</mo> |
2417 | 1 | <mi>C</mi><msub><mi>H</mi><mn>2</mn></msub> |
2418 | 1 | </mrow></math>"; |
2419 | 1 | let target = "<math> |
2420 | 1 | <mrow data-chem-formula='8'> |
2421 | 1 | <msub data-chem-formula='1'> |
2422 | 1 | <mi data-chem-element='1'>H</mi> |
2423 | 1 | <mn>2</mn> |
2424 | 1 | </msub> |
2425 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2426 | 1 | <mi data-chem-element='1'>C</mi> |
2427 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>=</mo> |
2428 | 1 | <mi data-chem-element='1'>C</mi> |
2429 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2430 | 1 | <msub data-chem-formula='1'> |
2431 | 1 | <mi data-chem-element='1'>H</mi> |
2432 | 1 | <mn>2</mn> |
2433 | 1 | </msub> |
2434 | 1 | </mrow> |
2435 | 1 | </math>"; |
2436 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2437 | 1 | } |
2438 | ||
2439 | #[test] | |
2440 | 1 | fn ferric_chloride_aq() { |
2441 | 1 | let test = "<math><mrow> |
2442 | 1 | <mi>Fe</mi> |
2443 | 1 | <msub><mi>Cl</mi><mn>3</mn></msub> |
2444 | 1 | <mrow><mo>(</mo><mrow><mi>aq</mi></mrow><mo>)</mo></mrow> |
2445 | 1 | </mrow></math>"; |
2446 | 1 | let target = "<math> |
2447 | 1 | <mrow data-chem-formula='11'> |
2448 | 1 | <mi data-chem-element='3'>Fe</mi> |
2449 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2450 | 1 | <msub data-chem-formula='3'> |
2451 | 1 | <mi data-chem-element='3'>Cl</mi> |
2452 | 1 | <mn>3</mn> |
2453 | 1 | </msub> |
2454 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2455 | 1 | <mrow data-chem-formula='3'> |
2456 | 1 | <mo>(</mo> |
2457 | 1 | <mi>aq</mi> |
2458 | 1 | <mo>)</mo> |
2459 | 1 | </mrow> |
2460 | 1 | </mrow> |
2461 | 1 | </math>"; |
2462 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2463 | 1 | } |
2464 | ||
2465 | #[test] | |
2466 | 1 | fn ferric_chloride_aq_as_mi() { |
2467 | 1 | let test = "<math><mrow> |
2468 | 1 | <mi>Fe</mi> |
2469 | 1 | <msub><mi>Cl</mi><mn>3</mn></msub> |
2470 | 1 | <mi>(aq)</mi> |
2471 | 1 | </mrow></math>"; |
2472 | 1 | let target = "<math> |
2473 | 1 | <mrow data-chem-formula='11'> |
2474 | 1 | <mi data-chem-element='3'>Fe</mi> |
2475 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2476 | 1 | <msub data-chem-formula='3'> |
2477 | 1 | <mi data-chem-element='3'>Cl</mi> |
2478 | 1 | <mn>3</mn> |
2479 | 1 | </msub> |
2480 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2481 | 1 | <mrow data-chem-formula='3'> |
2482 | 1 | <mo>(</mo> |
2483 | 1 | <mi>aq</mi> |
2484 | 1 | <mo>)</mo> |
2485 | 1 | </mrow> |
2486 | 1 | </mrow> |
2487 | 1 | </math>"; |
2488 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2489 | 1 | } |
2490 | ||
2491 | #[test] | |
2492 | 1 | fn chemtype_ammonia() { |
2493 | 1 | let test = r#"<math><msub><mi>NH</mi><mn>3</mn></msub></math>"#; |
2494 | 1 | let target = " <math> |
2495 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2496 | 1 | <mi mathvariant='normal' data-chem-element='1'>N</mi> |
2497 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2498 | 1 | <msub data-chem-formula='2'> |
2499 | 1 | <mi mathvariant='normal' data-chem-element='1' data-split='true'>H</mi> |
2500 | 1 | <mn>3</mn> |
2501 | 1 | </msub> |
2502 | 1 | </mrow> |
2503 | 1 | </math>"; |
2504 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2505 | 1 | } |
2506 | ||
2507 | #[test] | |
2508 | 1 | fn mhchem_ammonia() { |
2509 | 1 | let test = r#"<math> |
2510 | 1 | <mrow> |
2511 | 1 | <mi data-mjx-auto-op="false">NH</mi> |
2512 | 1 | <msub> |
2513 | 1 | <mpadded width="0"> |
2514 | 1 | <mphantom> |
2515 | 1 | <mi>A</mi> |
2516 | 1 | </mphantom> |
2517 | 1 | </mpadded> |
2518 | 1 | <mpadded height="0"> |
2519 | 1 | <mn>3</mn> |
2520 | 1 | </mpadded> |
2521 | 1 | </msub> |
2522 | 1 | </mrow> |
2523 | 1 | </math>"#; |
2524 | 1 | let target = "<math> |
2525 | 1 | <mrow data-chem-formula='5'> |
2526 | 1 | <mi mathvariant='normal' data-chem-element='1'>N</mi> |
2527 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2528 | 1 | <mmultiscripts data-mjx-auto-op='false' data-chem-formula='2'> |
2529 | 1 | <mi mathvariant='normal' data-mjx-auto-op='false' data-split='true' data-chem-element='1'>H</mi> |
2530 | 1 | <mn>3</mn> |
2531 | 1 | <none></none> |
2532 | 1 | </mmultiscripts> |
2533 | 1 | </mrow> |
2534 | 1 | </math>"; |
2535 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2536 | 1 | } |
2537 | ||
2538 | #[test] | |
2539 | 1 | fn mhchem_so4() { |
2540 | 1 | let test = "<math> |
2541 | 1 | <mrow> |
2542 | 1 | <mi>SO</mi> |
2543 | 1 | <msub> |
2544 | 1 | <mpadded width='0'> |
2545 | 1 | <mphantom> |
2546 | 1 | <mi>A</mi> |
2547 | 1 | </mphantom> |
2548 | 1 | </mpadded> |
2549 | 1 | <mpadded height='0'> |
2550 | 1 | <mn>4</mn> |
2551 | 1 | </mpadded> |
2552 | 1 | </msub> |
2553 | 1 | <msup> |
2554 | 1 | <mpadded width='0'> |
2555 | 1 | <mphantom> |
2556 | 1 | <mi>A</mi> |
2557 | 1 | </mphantom> |
2558 | 1 | </mpadded> |
2559 | 1 | <mrow> |
2560 | 1 | <mn>2</mn> |
2561 | 1 | <mo>−</mo> |
2562 | 1 | </mrow> |
2563 | 1 | </msup> |
2564 | 1 | </mrow> |
2565 | 1 | </math>"; |
2566 | 1 | let target = "<math> |
2567 | 1 | <mrow data-chem-formula='7'> |
2568 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2569 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2570 | 1 | <mmultiscripts data-chem-formula='5'> |
2571 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2572 | 1 | <mn>4</mn> |
2573 | 1 | <none/> |
2574 | 1 | <none/> |
2575 | 1 | <mrow data-chem-formula='3'> |
2576 | 1 | <mn>2</mn> |
2577 | 1 | <mo>-</mo> |
2578 | 1 | </mrow> |
2579 | 1 | </mmultiscripts> |
2580 | 1 | </mrow> |
2581 | 1 | </math>"; |
2582 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2583 | 1 | } |
2584 | ||
2585 | #[test] | |
2586 | 1 | fn mhchem_short_ion() { |
2587 | 1 | let test = " <math> |
2588 | 1 | <mrow> |
2589 | 1 | <mi mathvariant='normal'>H</mi> |
2590 | 1 | <msub> |
2591 | 1 | <mpadded width='0'> <mphantom> <mi>A</mi> </mphantom> </mpadded> |
2592 | 1 | <mpadded height='0'> <mn>3</mn></mpadded> |
2593 | 1 | </msub> |
2594 | 1 | <mi mathvariant='normal'>O</mi> |
2595 | 1 | <msup> |
2596 | 1 | <mpadded width='0'> <mphantom> <mi>A</mi> </mphantom> </mpadded> |
2597 | 1 | <mo>+</mo> |
2598 | 1 | </msup> |
2599 | 1 | </mrow> |
2600 | 1 | </math>"; |
2601 | 1 | let target = "<math> |
2602 | 1 | <mrow data-chem-formula='6'> |
2603 | 1 | <mmultiscripts data-chem-formula='2'> |
2604 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2605 | 1 | <mn>3</mn> |
2606 | 1 | <none></none> |
2607 | 1 | </mmultiscripts> |
2608 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2609 | 1 | <mmultiscripts data-chem-formula='3'> |
2610 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2611 | 1 | <none></none> |
2612 | 1 | <mo>+</mo> |
2613 | 1 | </mmultiscripts> |
2614 | 1 | </mrow> |
2615 | 1 | </math>"; |
2616 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2617 | 1 | } |
2618 | ||
2619 | #[test] | |
2620 | 1 | fn mhchem_ions_and_state() { |
2621 | 1 | let test = "<math> |
2622 | 1 | <mrow> |
2623 | 1 | <mrow> |
2624 | 1 | <mi>Na</mi> |
2625 | 1 | </mrow> |
2626 | 1 | <msup> |
2627 | 1 | <mrow> |
2628 | 1 | <mrow> |
2629 | 1 | <mpadded width='0'> |
2630 | 1 | <mphantom> |
2631 | 1 | <mi>A</mi> |
2632 | 1 | </mphantom> |
2633 | 1 | </mpadded> |
2634 | 1 | </mrow> |
2635 | 1 | </mrow> |
2636 | 1 | <mrow> |
2637 | 1 | <mo>+</mo> |
2638 | 1 | </mrow> |
2639 | 1 | </msup> |
2640 | 1 | <mo stretchy='false'>(</mo> |
2641 | 1 | <mrow> |
2642 | 1 | <mi>aq</mi> |
2643 | 1 | </mrow> |
2644 | 1 | <mo stretchy='false'>)</mo> |
2645 | 1 | <mrow> |
2646 | 1 | <mi>Cl</mi> |
2647 | 1 | </mrow> |
2648 | 1 | <msup> |
2649 | 1 | <mrow> |
2650 | 1 | <mrow> |
2651 | 1 | <mpadded width='0'> |
2652 | 1 | <mphantom> |
2653 | 1 | <mi>A</mi> |
2654 | 1 | </mphantom> |
2655 | 1 | </mpadded> |
2656 | 1 | </mrow> |
2657 | 1 | </mrow> |
2658 | 1 | <mrow> |
2659 | 1 | <mo>−</mo> |
2660 | 1 | </mrow> |
2661 | 1 | </msup> |
2662 | 1 | <mspace width='0.111em'></mspace> |
2663 | 1 | <mo stretchy='false'>(</mo> |
2664 | 1 | <mrow> |
2665 | 1 | <mi>aq</mi> |
2666 | 1 | </mrow> |
2667 | 1 | <mo stretchy='false'>)</mo> |
2668 | 1 | </mrow> |
2669 | 1 | </math>"; |
2670 | 1 | let target = "<math> |
2671 | 1 | <mrow data-chem-formula='18'> |
2672 | 1 | <mmultiscripts data-chem-formula='4'> |
2673 | 1 | <mi data-chem-element='3'>Na</mi> |
2674 | 1 | <none></none> |
2675 | 1 | <mo>+</mo> |
2676 | 1 | </mmultiscripts> |
2677 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2678 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2679 | 1 | <mo stretchy='false'>(</mo> |
2680 | 1 | <mi>aq</mi> |
2681 | 1 | <mo stretchy='false'>)</mo> |
2682 | 1 | </mrow> |
2683 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2684 | 1 | <mmultiscripts data-chem-formula='5'> |
2685 | 1 | <mi data-chem-element='3'>Cl</mi> |
2686 | 1 | <none></none> |
2687 | 1 | <mo>-</mo> |
2688 | 1 | </mmultiscripts> |
2689 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2690 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2691 | 1 | <mo stretchy='false' data-previous-space-width='0.111'>(</mo> |
2692 | 1 | <mi>aq</mi> |
2693 | 1 | <mo stretchy='false'>)</mo> |
2694 | 1 | </mrow> |
2695 | 1 | </mrow> |
2696 | 1 | </math>"; |
2697 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2698 | 1 | } |
2699 | ||
2700 | #[test] | |
2701 | 1 | fn ethylene_with_colon_bond() { |
2702 | 1 | let test = "<math><mrow> |
2703 | 1 | <msub><mi>H</mi><mn>2</mn></msub><mi>C</mi> |
2704 | 1 | <mo>::</mo> |
2705 | 1 | <mi>C</mi><msub><mi>H</mi><mn>2</mn></msub> |
2706 | 1 | </mrow></math>"; |
2707 | 1 | let target = "<math> |
2708 | 1 | <mrow data-chem-formula='8'> |
2709 | 1 | <msub data-chem-formula='1'> |
2710 | 1 | <mi data-chem-element='1'>H</mi> |
2711 | 1 | <mn>2</mn> |
2712 | 1 | </msub> |
2713 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2714 | 1 | <mi data-chem-element='1'>C</mi> |
2715 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>∷</mo> |
2716 | 1 | <mi data-chem-element='1'>C</mi> |
2717 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2718 | 1 | <msub data-chem-formula='1'> |
2719 | 1 | <mi data-chem-element='1'>H</mi> |
2720 | 1 | <mn>2</mn> |
2721 | 1 | </msub> |
2722 | 1 | </mrow> |
2723 | 1 | </math>"; |
2724 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2725 | 1 | } |
2726 | ||
2727 | #[test] | |
2728 | 1 | fn mhchem_u238() { |
2729 | 1 | let test = "<math> |
2730 | 1 | <mrow> |
2731 | 1 | <msubsup> |
2732 | 1 | <mrow> |
2733 | 1 | <mrow> |
2734 | 1 | <mpadded width='0'> |
2735 | 1 | <mphantom> |
2736 | 1 | <mi>A</mi> |
2737 | 1 | </mphantom> |
2738 | 1 | </mpadded> |
2739 | 1 | </mrow> |
2740 | 1 | </mrow> |
2741 | 1 | <mrow> |
2742 | 1 | <mrow> |
2743 | 1 | <mpadded height='0' depth='0'> |
2744 | 1 | <mphantom></mphantom> |
2745 | 1 | </mpadded> |
2746 | 1 | </mrow> |
2747 | 1 | </mrow> |
2748 | 1 | <mrow> |
2749 | 1 | <mrow> |
2750 | 1 | <mpadded height='0' depth='0'> |
2751 | 1 | <mphantom> |
2752 | 1 | <mn>238</mn> |
2753 | 1 | </mphantom> |
2754 | 1 | </mpadded> |
2755 | 1 | </mrow> |
2756 | 1 | </mrow> |
2757 | 1 | </msubsup> |
2758 | 1 | <mspace width='-0.083em' linebreak='nobreak'></mspace> |
2759 | 1 | <msubsup> |
2760 | 1 | <mrow> |
2761 | 1 | <mrow> |
2762 | 1 | <mpadded width='0'> |
2763 | 1 | <mphantom> |
2764 | 1 | <mi>A</mi> |
2765 | 1 | </mphantom> |
2766 | 1 | </mpadded> |
2767 | 1 | </mrow> |
2768 | 1 | </mrow> |
2769 | 1 | <mrow> |
2770 | 1 | <mrow> |
2771 | 1 | <mpadded width='0'> |
2772 | 1 | <mphantom> |
2773 | 1 | <mn>2</mn> |
2774 | 1 | </mphantom> |
2775 | 1 | </mpadded> |
2776 | 1 | </mrow> |
2777 | 1 | <mrow> |
2778 | 1 | <mpadded width='0' lspace='-1width'> |
2779 | 1 | <mrow> |
2780 | 1 | <mpadded height='0'></mpadded> |
2781 | 1 | </mrow> |
2782 | 1 | </mpadded> |
2783 | 1 | </mrow> |
2784 | 1 | </mrow> |
2785 | 1 | <mrow> |
2786 | 1 | <mrow> |
2787 | 1 | <mpadded height='0'> |
2788 | 1 | <mrow> |
2789 | 1 | <mpadded width='0'> |
2790 | 1 | <mphantom> |
2791 | 1 | <mn>2</mn> |
2792 | 1 | </mphantom> |
2793 | 1 | </mpadded> |
2794 | 1 | </mrow> |
2795 | 1 | </mpadded> |
2796 | 1 | </mrow> |
2797 | 1 | <mrow> |
2798 | 1 | <mpadded width='0' lspace='-1width'> |
2799 | 1 | <mn>238</mn> |
2800 | 1 | </mpadded> |
2801 | 1 | </mrow> |
2802 | 1 | </mrow> |
2803 | 1 | </msubsup> |
2804 | 1 | <mrow> |
2805 | 1 | <mi mathvariant='normal'>U</mi> |
2806 | 1 | </mrow> |
2807 | 1 | </mrow> |
2808 | 1 | </math>"; |
2809 | 1 | let target = " <math> |
2810 | 1 | <mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'> |
2811 | 1 | <mi mathvariant='normal' data-chem-element='2'>U</mi> |
2812 | 1 | <mprescripts></mprescripts> |
2813 | 1 | <none></none> |
2814 | 1 | <mn>238</mn> |
2815 | 1 | </mmultiscripts> |
2816 | 1 | </math>"; |
2817 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2818 | 1 | } |
2819 | ||
2820 | #[test] | |
2821 | 1 | fn mhchem_hcl_aq() { |
2822 | 1 | let test = "<math> |
2823 | 1 | <mrow> |
2824 | 1 | <mn>2</mn> |
2825 | 1 | <mstyle scriptlevel='0'> |
2826 | 1 | <mspace width='0.167em'></mspace> |
2827 | 1 | </mstyle> |
2828 | 1 | <mrow> |
2829 | 1 | <mi>HCl</mi> |
2830 | 1 | </mrow> |
2831 | 1 | <mspace width='0.111em'></mspace> |
2832 | 1 | <mo stretchy='false'>(</mo> |
2833 | 1 | <mrow> |
2834 | 1 | <mi>aq</mi> |
2835 | 1 | </mrow> |
2836 | 1 | <mo stretchy='false'>)</mo> |
2837 | 1 | </mrow> |
2838 | 1 | </math>"; |
2839 | 1 | let target = "<math> |
2840 | 1 | <mrow data-chem-formula='9'> |
2841 | 1 | <mn>2</mn> |
2842 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁢</mo> |
2843 | 1 | <mrow data-changed='added' data-chem-formula='9'> |
2844 | 1 | <mi mathvariant='normal' data-previous-space-width='0.167' data-chem-element='1'>H</mi> |
2845 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2846 | 1 | <mi data-split='true' data-chem-element='3'>Cl</mi> |
2847 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2848 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2849 | 1 | <mo stretchy='false' data-previous-space-width='0.111'>(</mo> |
2850 | 1 | <mi>aq</mi> |
2851 | 1 | <mo stretchy='false'>)</mo> |
2852 | 1 | </mrow> |
2853 | 1 | </mrow> |
2854 | 1 | </mrow> |
2855 | 1 | </math>"; |
2856 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2857 | 1 | } |
2858 | ||
2859 | #[test] | |
2860 | 1 | fn mhchem_nested_sub() { |
2861 | // from \ce{(CH3)3} | |
2862 | 1 | let test = "<math> |
2863 | 1 | <mrow> |
2864 | 1 | <mo stretchy='false'>(</mo> |
2865 | 1 | <mrow> |
2866 | 1 | <mi>CH</mi> |
2867 | 1 | </mrow> |
2868 | 1 | <msub> |
2869 | 1 | <mrow> |
2870 | 1 | <mrow> |
2871 | 1 | <mpadded width='0'> |
2872 | 1 | <mphantom> |
2873 | 1 | <mi>A</mi> |
2874 | 1 | </mphantom> |
2875 | 1 | </mpadded> |
2876 | 1 | </mrow> |
2877 | 1 | </mrow> |
2878 | 1 | <mrow> |
2879 | 1 | <mrow> |
2880 | 1 | <mpadded height='0'> |
2881 | 1 | <mn>3</mn> |
2882 | 1 | </mpadded> |
2883 | 1 | </mrow> |
2884 | 1 | </mrow> |
2885 | 1 | </msub> |
2886 | 1 | <mo stretchy='false'>)</mo> |
2887 | 1 | <msub> |
2888 | 1 | <mrow> |
2889 | 1 | <mrow> |
2890 | 1 | <mpadded width='0'> |
2891 | 1 | <mphantom> |
2892 | 1 | <mi>A</mi> |
2893 | 1 | </mphantom> |
2894 | 1 | </mpadded> |
2895 | 1 | </mrow> |
2896 | 1 | </mrow> |
2897 | 1 | <mrow> |
2898 | 1 | <mrow> |
2899 | 1 | <mpadded height='0'> |
2900 | 1 | <mn>3</mn> |
2901 | 1 | </mpadded> |
2902 | 1 | </mrow> |
2903 | 1 | </mrow> |
2904 | 1 | </msub> |
2905 | 1 | </mrow> |
2906 | 1 | </math>"; |
2907 | 1 | let target = "<math> |
2908 | 1 | <mmultiscripts data-chem-formula='8'> |
2909 | 1 | <mrow data-changed='added' data-chem-formula='8'> |
2910 | 1 | <mo stretchy='false'>(</mo> |
2911 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2912 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
2913 | 1 | <mo data-changed='added'>⁣</mo> |
2914 | 1 | <mmultiscripts data-chem-formula='2'> |
2915 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>H</mi> |
2916 | 1 | <mn>3</mn> |
2917 | 1 | <none></none> |
2918 | 1 | </mmultiscripts> |
2919 | 1 | </mrow> |
2920 | 1 | <mo stretchy='false'>)</mo> |
2921 | 1 | </mrow> |
2922 | 1 | <mn>3</mn> |
2923 | 1 | <none></none> |
2924 | 1 | </mmultiscripts> |
2925 | 1 | </math>"; |
2926 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2927 | 1 | } |
2928 | ||
2929 | #[test] | |
2930 | 1 | fn mhchem_isotopes() { |
2931 | // from \ce{^{18}O{}^{16}O} | |
2932 | 1 | let test = "<math> |
2933 | 1 | <mrow> |
2934 | 1 | <msubsup> |
2935 | 1 | <mpadded width='0'> |
2936 | 1 | <mphantom> |
2937 | 1 | <mi>A</mi> |
2938 | 1 | </mphantom> |
2939 | 1 | </mpadded> |
2940 | 1 | <mpadded height='0' depth='0'> |
2941 | 1 | <mphantom></mphantom> |
2942 | 1 | </mpadded> |
2943 | 1 | <mpadded height='0' depth='0'> |
2944 | 1 | <mphantom> |
2945 | 1 | <mn>18</mn> |
2946 | 1 | </mphantom> |
2947 | 1 | </mpadded> |
2948 | 1 | </msubsup> |
2949 | 1 | <mspace width='-0.083em'></mspace> |
2950 | 1 | <msubsup> |
2951 | 1 | <mpadded width='0'> |
2952 | 1 | <mphantom> |
2953 | 1 | <mi>A</mi> |
2954 | 1 | </mphantom> |
2955 | 1 | </mpadded> |
2956 | 1 | <mrow> |
2957 | 1 | <mpadded width='0'> |
2958 | 1 | <mphantom> |
2959 | 1 | <mn>2</mn> |
2960 | 1 | </mphantom> |
2961 | 1 | </mpadded> |
2962 | 1 | <mpadded width='0' lspace='-1width'> |
2963 | 1 | <mpadded height='0'></mpadded> |
2964 | 1 | </mpadded> |
2965 | 1 | </mrow> |
2966 | 1 | <mrow> |
2967 | 1 | <mpadded height='0'> |
2968 | 1 | <mpadded width='0'> |
2969 | 1 | <mphantom> |
2970 | 1 | <mn>2</mn> |
2971 | 1 | </mphantom> |
2972 | 1 | </mpadded> |
2973 | 1 | </mpadded> |
2974 | 1 | <mpadded width='0' lspace='-1width'> |
2975 | 1 | <mn>18</mn> |
2976 | 1 | </mpadded> |
2977 | 1 | </mrow> |
2978 | 1 | </msubsup> |
2979 | 1 | <mi mathvariant='normal'>O</mi> |
2980 | 1 | <mspace width='0.111em'></mspace> |
2981 | 1 | <msubsup> |
2982 | 1 | <mpadded width='0'> |
2983 | 1 | <mphantom> |
2984 | 1 | <mi>A</mi> |
2985 | 1 | </mphantom> |
2986 | 1 | </mpadded> |
2987 | 1 | <mpadded height='0' depth='0'> |
2988 | 1 | <mphantom></mphantom> |
2989 | 1 | </mpadded> |
2990 | 1 | <mpadded height='0' depth='0'> |
2991 | 1 | <mphantom> |
2992 | 1 | <mn>16</mn> |
2993 | 1 | </mphantom> |
2994 | 1 | </mpadded> |
2995 | 1 | </msubsup> |
2996 | 1 | <mspace width='-0.083em'></mspace> |
2997 | 1 | <msubsup> |
2998 | 1 | <mpadded width='0'> |
2999 | 1 | <mphantom> |
3000 | 1 | <mi>A</mi> |
3001 | 1 | </mphantom> |
3002 | 1 | </mpadded> |
3003 | 1 | <mrow> |
3004 | 1 | <mpadded width='0'> |
3005 | 1 | <mphantom> |
3006 | 1 | <mn>2</mn> |
3007 | 1 | </mphantom> |
3008 | 1 | </mpadded> |
3009 | 1 | <mpadded width='0' lspace='-1width'> |
3010 | 1 | <mpadded height='0'></mpadded> |
3011 | 1 | </mpadded> |
3012 | 1 | </mrow> |
3013 | 1 | <mrow> |
3014 | 1 | <mpadded height='0'> |
3015 | 1 | <mpadded width='0'> |
3016 | 1 | <mphantom> |
3017 | 1 | <mn>2</mn> |
3018 | 1 | </mphantom> |
3019 | 1 | </mpadded> |
3020 | 1 | </mpadded> |
3021 | 1 | <mpadded width='0' lspace='-1width'> |
3022 | 1 | <mn>16</mn> |
3023 | 1 | </mpadded> |
3024 | 1 | </mrow> |
3025 | 1 | </msubsup> |
3026 | 1 | <mi mathvariant='normal'>O</mi> |
3027 | 1 | </mrow> |
3028 | 1 | </math>"; |
3029 | 1 | let target = "<math> |
3030 | 1 | <mrow data-chem-formula='11'> |
3031 | 1 | <mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'> |
3032 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
3033 | 1 | <mprescripts></mprescripts> |
3034 | 1 | <none></none> |
3035 | 1 | <mn>18</mn> |
3036 | 1 | </mmultiscripts> |
3037 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
3038 | 1 | <mmultiscripts data-previous-space-width='0.027999999999999997' data-chem-formula='5'> |
3039 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
3040 | 1 | <mprescripts></mprescripts> |
3041 | 1 | <none></none> |
3042 | 1 | <mn>16</mn> |
3043 | 1 | </mmultiscripts> |
3044 | 1 | </mrow> |
3045 | 1 | </math>"; |
3046 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3047 | 1 | } |
3048 | ||
3049 | ||
3050 | #[test] | |
3051 | 1 | fn merge_bug_274() { |
3052 | 1 | let test = r#" |
3053 | 1 | <math> |
3054 | 1 | <mrow> |
3055 | 1 | <mtable> |
3056 | 1 | <mtr> |
3057 | 1 | <mtd> |
3058 | 1 | <mrow> |
3059 | 1 | <msub><mtext>H</mtext><mn>2</mn></msub> |
3060 | 1 | <mtext>g</mtext> |
3061 | 1 | <mtext/> |
3062 | 1 | <mtext>+</mtext> |
3063 | 1 | <mtext/> |
3064 | 1 | <msub><mrow><mtext>Cl</mtext></mrow><mn>2</mn></msub> |
3065 | 1 | <mo stretchy="false">(</mo> |
3066 | 1 | <mtext>g</mtext> |
3067 | 1 | <mo stretchy="false">)</mo> |
3068 | 1 | <mo>→</mo> |
3069 | 1 | <mn>2</mn> |
3070 | 1 | <mtext>HCl(g)</mtext> |
3071 | 1 | </mrow> |
3072 | 1 | </mtd> |
3073 | 1 | </mtr> |
3074 | 1 | <mtr> |
3075 | 1 | <mtd> |
3076 | 1 | <mrow> |
3077 | 1 | <mn>1</mn> |
3078 | 1 | <mo>:</mo> |
3079 | 1 | <mn>1</mn> |
3080 | 1 | <mo>:</mo> |
3081 | 1 | <mn>2</mn> |
3082 | 1 | </mrow> |
3083 | 1 | </mtd> |
3084 | 1 | </mtr> |
3085 | 1 | <mtr> |
3086 | 1 | <mtd> |
3087 | 1 | <mrow> |
3088 | 1 | <mn>1</mn> |
3089 | 1 | <mtext/> |
3090 | 1 | <msub><mtext>H</mtext><mn>2</mn></msub> |
3091 | 1 | <mtext/> |
3092 | 1 | <mtext>to</mtext> |
3093 | 1 | <mtext/> |
3094 | 1 | <mn>1</mn> |
3095 | 1 | <mtext/> |
3096 | 1 | <msub><mrow><mtext>Cl</mtext></mrow><mn>2</mn></msub> |
3097 | 1 | <mtext/> |
3098 | 1 | <mtext>to</mtext> |
3099 | 1 | <mtext/> |
3100 | 1 | <mtext>2</mtext> |
3101 | 1 | <mtext/> |
3102 | 1 | <mtext>HCl</mtext> |
3103 | 1 | </mrow> |
3104 | 1 | </mtd> |
3105 | 1 | </mtr> |
3106 | 1 | </mtable> |
3107 | 1 | </mrow> |
3108 | 1 | </math> |
3109 | 1 | "#; |
3110 | 1 | let target = " |
3111 | 1 | <math> |
3112 | 1 | <mtable> |
3113 | 1 | <mtr> |
3114 | 1 | <mtd data-maybe-chemistry='9'> |
3115 | 1 | <mrow data-maybe-chemistry='9'> |
3116 | 1 | <mrow data-changed='added' data-maybe-chemistry='8'> |
3117 | 1 | <mrow data-changed='added' data-maybe-chemistry='1'> |
3118 | 1 | <msub data-maybe-chemistry='1'> |
3119 | 1 | <mtext data-maybe-chemistry='1'>H</mtext> |
3120 | 1 | <mn>2</mn> |
3121 | 1 | </msub> |
3122 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3123 | 1 | <mtext data-maybe-chemistry='0'>g</mtext> |
3124 | 1 | </mrow> |
3125 | 1 | <mo data-chem-equation-op='1' data-maybe-chemistry='1'>+</mo> |
3126 | 1 | <mrow data-changed='added' data-maybe-chemistry='6'> |
3127 | 1 | <msub data-maybe-chemistry='3'> |
3128 | 1 | <mtext data-maybe-chemistry='3'>Cl</mtext> |
3129 | 1 | <mn>2</mn> |
3130 | 1 | </msub> |
3131 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁣</mo> |
3132 | 1 | <mrow data-changed='added' data-maybe-chemistry='2'> |
3133 | 1 | <mo stretchy='false'>(</mo> |
3134 | 1 | <mtext>g</mtext> |
3135 | 1 | <mo stretchy='false'>)</mo> |
3136 | 1 | </mrow> |
3137 | 1 | </mrow> |
3138 | 1 | </mrow> |
3139 | 1 | <mo data-chem-equation-op='1' data-maybe-chemistry='1'>→</mo> |
3140 | 1 | <mrow data-changed='added' data-maybe-chemistry='0'> |
3141 | 1 | <mn data-maybe-chemistry='0'>2</mn> |
3142 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3143 | 1 | <mtext data-maybe-chemistry='0'>HCl(g)</mtext> |
3144 | 1 | </mrow> |
3145 | 1 | </mrow> |
3146 | 1 | </mtd> |
3147 | 1 | </mtr> |
3148 | 1 | <mtr> |
3149 | 1 | <mtd> |
3150 | 1 | <mrow> |
3151 | 1 | <mn>1</mn> |
3152 | 1 | <mo>:</mo> |
3153 | 1 | <mn>1</mn> |
3154 | 1 | <mo>:</mo> |
3155 | 1 | <mn>2</mn> |
3156 | 1 | </mrow> |
3157 | 1 | </mtd> |
3158 | 1 | </mtr> |
3159 | 1 | <mtr> |
3160 | 1 | <mtd data-maybe-chemistry='7'> |
3161 | 1 | <mrow data-maybe-chemistry='7'> |
3162 | 1 | <mn data-maybe-chemistry='0'>1</mn> |
3163 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3164 | 1 | <msub data-maybe-chemistry='1'> |
3165 | 1 | <mtext data-maybe-chemistry='1'>H</mtext> |
3166 | 1 | <mn>2</mn> |
3167 | 1 | </msub> |
3168 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3169 | 1 | <mtext data-maybe-chemistry='0'>to</mtext> |
3170 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3171 | 1 | <mn data-maybe-chemistry='0'>1</mn> |
3172 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3173 | 1 | <msub data-maybe-chemistry='3'> |
3174 | 1 | <mtext data-maybe-chemistry='3'>Cl</mtext> |
3175 | 1 | <mn>2</mn> |
3176 | 1 | </msub> |
3177 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3178 | 1 | <mtext data-maybe-chemistry='0'>to</mtext> |
3179 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3180 | 1 | <mn data-maybe-chemistry='0'>2</mn> |
3181 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3182 | 1 | <mi data-maybe-chemistry='1' mathvariant='normal'>H</mi> |
3183 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3184 | 1 | <mi data-maybe-chemistry='3' data-split='true'>Cl</mi> |
3185 | 1 | </mrow> |
3186 | 1 | </mtd> |
3187 | 1 | </mtr> |
3188 | 1 | </mtable> |
3189 | 1 | </math> |
3190 | 1 | "; |
3191 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3192 | 1 | } |
3193 | ||
3194 | #[test] | |
3195 | 1 | fn merge_bug_303() { |
3196 | 1 | let test = r#" |
3197 | 1 | <math> |
3198 | 1 | <mn>2</mn> |
3199 | 1 | <msup><mtext>OH</mtext><mo>−</mo></msup> |
3200 | 1 | <mo stretchy="false">(</mo> |
3201 | 1 | <mtext>aq</mtext> |
3202 | 1 | <mo stretchy="false">)</mo> |
3203 | 1 | <mo>+</mo> |
3204 | 1 | <mtext>C</mtext> |
3205 | 1 | <msup><mtext>u</mtext><mrow><mn>2</mn><mo>+</mo></mrow></msup> |
3206 | 1 | </math> |
3207 | 1 | "#; |
3208 | 1 | let target = " |
3209 | 1 | <math> |
3210 | 1 | <mrow data-changed='added'> |
3211 | 1 | <mrow data-changed='added'> |
3212 | 1 | <mn>2</mn> |
3213 | 1 | <mo data-changed='added'>⁢</mo> |
3214 | 1 | <mrow data-changed='added'> |
3215 | 1 | <msup><mi>OH</mi><mo>-</mo></msup> |
3216 | 1 | <mo data-changed='added'>⁡</mo> |
3217 | 1 | <mrow data-changed='added'> |
3218 | 1 | <mo stretchy='false'>(</mo> |
3219 | 1 | <mtext>aq</mtext> |
3220 | 1 | <mo stretchy='false'>)</mo> |
3221 | 1 | </mrow> |
3222 | 1 | </mrow> |
3223 | 1 | </mrow> |
3224 | 1 | <mo>+</mo> |
3225 | 1 | <mrow data-changed='added'> |
3226 | 1 | <mtext>C</mtext> |
3227 | 1 | <mo data-changed='added'>⁢</mo> |
3228 | 1 | <msup> <mtext>u</mtext> <mrow><mn>2</mn><mo>+</mo></mrow> </msup> |
3229 | 1 | </mrow> |
3230 | 1 | </mrow> |
3231 | 1 | </math> |
3232 | 1 | "; |
3233 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3234 | 1 | } |
3235 | ||
3236 | #[test] | |
3237 | 1 | fn mtd_assert_bug_393() { |
3238 | 1 | let test = r#" |
3239 | 1 | <math display="block"> |
3240 | 1 | <mtable> |
3241 | 1 | <mtr> |
3242 | 1 | <mtd> |
3243 | 1 | <mrow> |
3244 | 1 | <mi>A</mi> |
3245 | 1 | <mi>c</mi> |
3246 | 1 | </mrow> |
3247 | 1 | </mtd> |
3248 | 1 | <mtd> |
3249 | 1 | <mi>A</mi> |
3250 | 1 | <mfenced> |
3251 | 1 | <mtable> |
3252 | 1 | <mtr> |
3253 | 1 | <mtd> |
3254 | 1 | <mrow> |
3255 | 1 | <mi>c</mi> |
3256 | 1 | <mi>n</mi> |
3257 | 1 | </mrow> |
3258 | 1 | </mtd> |
3259 | 1 | </mtr> |
3260 | 1 | </mtable> |
3261 | 1 | </mfenced> |
3262 | 1 | </mtd> |
3263 | 1 | </mtr> |
3264 | 1 | </mtable> |
3265 | 1 | </math>"#; |
3266 | 1 | let target = " |
3267 | 1 | <math display='block'> |
3268 | 1 | <mtable> |
3269 | 1 | <mtr> |
3270 | 1 | <mtd> |
3271 | 1 | <mi>A</mi> |
3272 | 1 | <mi>c</mi> |
3273 | 1 | </mtd> |
3274 | 1 | <mtd> |
3275 | 1 | <mrow data-changed='added'> |
3276 | 1 | <mi>A</mi> |
3277 | 1 | <mrow> |
3278 | 1 | <mo data-changed='from_mfenced'>(</mo> |
3279 | 1 | <mtable> |
3280 | 1 | <mtr> |
3281 | 1 | <mtd> |
3282 | 1 | <mrow> |
3283 | 1 | <mi>c</mi> |
3284 | 1 | <mi>n</mi> |
3285 | 1 | </mrow> |
3286 | 1 | </mtd> |
3287 | 1 | </mtr> |
3288 | 1 | </mtable> |
3289 | 1 | <mo data-changed='from_mfenced'>)</mo> |
3290 | 1 | </mrow> |
3291 | 1 | </mrow> |
3292 | 1 | </mtd> |
3293 | 1 | </mtr> |
3294 | 1 | </mtable> |
3295 | 1 | </math>"; |
3296 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3297 | 1 | } |
3298 | ||
3299 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | ||
3 | // Chemistry terms used here: | |
4 | // chemical formula -- this references a molecule (one or more elements with bonds between them), including its state. | |
5 | // chemical equation -- this is a notation specialized to chemistry -- it has concentration, arrows, equality, "addition" along with | |
6 | // some special symbols for operators and (mostly) chemical formulas for operands. | |
7 | // Operand exceptions are the equilibrium constant, numbers, and identifiers. | |
8 | // Although a chemical equation is a superset of a chemical formula, because we want to distinguish the two (e.g., '=' is in both), | |
9 | // we require that chemical equation is an mrow | |
10 | // FIX?? -- can it be an adorned mrow? | |
11 | // Note: with the current definition, if any element in a potential chem equation is ruled out, the entire mrow is ruled out. | |
12 | // | |
13 | // The general flow is that for every element that looks like a chem formula/equation, we mark it with data-likely-[equation/formula] | |
14 | // After we are done marking "likely", we go back and either delete them or replace them with data-[equation/formula]. | |
15 | // Note: anything already marked with data-[equation/formula] doesn't need recomputation later (essentially the result is cached) | |
16 | // | |
17 | // There is a chicken and egg problem with detecting chemistry: to more reliably detect it, we need good structure. | |
18 | // However, to get the structure right (e.,g "=" being a double bond, not equality; chem elements being in 'mi's; ...), | |
19 | // we need to know "=" is part of a chemical formula. | |
20 | // The imperfect solution used is: | |
21 | // As the final step of each recursive call to 'clean_mathml', | |
22 | // 1. mi/mtext: is it a chemical element(s) or one of the symbols used in chemical formulas (not equations). | |
23 | // If so, mark it MAYBE_CHEMISTRY. | |
24 | // 2. msub/msup/msubsup/mmultiscripts: is base marked MAYBE_CHEMISTRY and the scripts are potential adornments, mark it MAYBE_CHEMISTRY | |
25 | // 3. mrows: these take a few passes (remember, they aren't structured properly yet) | |
26 | // On the assumption that chemistry is not common we implement a "show me" attitude before changing the structure. | |
27 | // Pass 1: | |
28 | // a) for any run of mi/mtext that can be re-split into chem elements, split them and mark them if it is at least 3 chars long | |
29 | // b) if there are any potential chem formula operators (e.g., "=" and ":") and the previous node is marked MAYBE_CHEMISTRY, | |
30 | // mark this as MAYBE_CHEMISTRY | |
31 | // Pass 2: (assuming something was marked in pass 1) | |
32 | // a) find the first marked child and then the last consecutive marked child and trim any mo's from the ends | |
33 | // b) evaluate the likelihood that the sequence is chemistry | |
34 | // yes: replace mathml children with new (potentially restructured) children | |
35 | // no: clear all the marks for the old children | |
36 | // After canonicalization, we take another pass looking for chemical equations and marking them if found. | |
37 | ||
38 | use sxd_document::dom::{Element, Document, ChildOfElement}; | |
39 | use crate::canonicalize::*; | |
40 | use crate::pretty_print::mml_to_string; | |
41 | use crate::xpath_functions::{is_leaf, IsNode}; | |
42 | use regex::Regex; | |
43 | use crate::xpath_functions::IsBracketed; | |
44 | use phf::{phf_map, phf_set}; | |
45 | use std::convert::TryInto; | |
46 | #[allow(unused_imports)] | |
47 | use log::{error, debug}; | |
48 | use std::collections::HashSet; | |
49 | use std::cmp::Ordering; | |
50 | use crate::errors::*; | |
51 | use std::sync::LazyLock; | |
52 | ||
53 | ||
54 | pub static NOT_CHEMISTRY: i32 = -10000; // should overwhelm any positive signal | |
55 | static NOT_CHEMISTRY_THRESHOLD: i32 = -10000/2; // value for testing -- that way some can be added to NOT_CHEMISTRY and still meet the test | |
56 | static CHEMISTRY_THRESHOLD: i32 = 5; // if this changes, change CHEMISTRY_THRESHOLD_STR | |
57 | ||
58 | ||
59 | /// this might be chemistry -- should only exist during canonicalization | |
60 | pub static MAYBE_CHEMISTRY: &str = "data-maybe-chemistry"; | |
61 | ||
62 | /// Attr flag to indicate chemical equation | |
63 | static CHEM_EQUATION: &str = "data-chem-equation"; | |
64 | /// Attr flag to indicate chemical formula | |
65 | static CHEM_FORMULA: &str = "data-chem-formula"; | |
66 | /// Attr flag to indicate chemical element | |
67 | static CHEM_ELEMENT: &str = "data-chem-element"; | |
68 | static CHEM_FORMULA_OPERATOR: &str = "data-chem-formula-op"; | |
69 | static CHEM_EQUATION_OPERATOR: &str = "data-chem-equation-op"; | |
70 | static CHEM_STATE: &str = "data-chem-state"; | |
71 | ||
72 | /// mark a new chem element that happened due to splitting a leaf | |
73 | pub static SPLIT_TOKEN: &str = "data-split"; | |
74 | ||
75 | /// mark a new chem element that happened due to merging two leaves | |
76 | static MERGED_TOKEN: &str = "data-merged"; | |
77 | ||
78 | /// these can be in the base of an under/over script | |
79 | 6.64k | fn is_chem_equation_arrow(ch: char) -> bool { |
80 | 6.64k |
|
81 | '→' | '➔' | '←' | '⟶' | '⟵' | '⤻' | '⇋' | '⇌' | | |
82 | '↑' | '↓' | '↿' | '↾' | '⇃' | '⇂' | '⥮' | '⥯' | '⇷' | '⇸' | '⤉' | '⤈' | | |
83 | '⥂' | '⥄' | '⥃' | | |
84 | '\u{1f8d0}' | '\u{1f8d1}' | '\u{1f8d2}' | '\u{1f8d3}' | '\u{1f8d4}' | '\u{1f8d5}' // proposed Unicode equilibrium arrows | |
85 | ) | |
86 | 6.64k | } |
87 | ||
88 | // Returns true if the 'property' (should have ":") is in the intent | |
89 | 196k | fn has_chem_intent(mathml: Element, property: &str) -> bool { |
90 | 196k | if let Some( |
91 | 16.9k | let head = intent.split('(').next().unwrap(); |
92 | 16.9k | return head.contains(property); |
93 | 179k | } |
94 | 179k | return false; |
95 | 196k | } |
96 | ||
97 | 26.7k | fn has_inherited_property(mathml: Element, property: &str) -> bool { |
98 | 26.7k | let mut current = mathml; |
99 | loop { | |
100 | 101k | if has_chem_intent(current, property) { |
101 | 0 | return true; |
102 | 101k | } |
103 | // chem might not be temp node without a 'math' parent | |
104 | 101k | if name(current) == "math" || |
105 | 26.7k | break; |
106 | 74.6k | } |
107 | 74.6k | current = get_parent(current); |
108 | } | |
109 | 26.7k | return false; |
110 | 26.7k | } |
111 | ||
112 | 30.2k | pub fn is_chemistry_off(mathml: Element) -> bool { |
113 | 30.2k | if has_chem_intent(mathml, ":chemical-formula") || has_chem_intent(mathml, ":chemical-equation") { |
114 | 4 | return false; |
115 | 30.2k | } |
116 | 30.2k | let pref_manager = crate::prefs::PreferenceManager::get(); |
117 | 30.2k | return pref_manager.borrow().pref_to_string("Chemistry") == "Off"; |
118 | 30.2k | } |
119 | ||
120 | 10.1k | pub fn clean_chemistry_mrow(mathml: Element) { |
121 | 10.1k | if is_chemistry_off(mathml) { |
122 | 0 | return; |
123 | 10.1k | } |
124 | // debug!("clean_chemistry_mrow:\n{}", mml_to_string(mathml)); | |
125 | 10.1k | let mut children = mathml.children().iter() |
126 | 31.3k | . |
127 | 10.1k | .collect::<Vec<Element>>(); |
128 | 10.1k | if let Some( |
129 | 246 | mathml.replace_children(&new_children); |
130 | 246 | children = new_children; |
131 | 9.93k | } |
132 | 10.1k | clean_mrow_children_mark_pass(&children); |
133 | 10.1k | } |
134 | ||
135 | /// Do some aggressive structural changes and if they make this look like a chemistry formula, mark it as one else remove other marks | |
136 | /// Note: the element is replaced with a new restructured element if it is marked as chemistry | |
137 | /// Pass 1: | |
138 | /// a) for any run of mi/mtext that can be re-split into chem elements, split them and mark them if it is at least 3 chars long. | |
139 | /// Also split "(g)", etc., when in mi/mtext | |
140 | /// b) if there are any potential chem formula operators (e.g., "=" and ":") and the previous node is marked MAYBE_CHEMISTRY, | |
141 | /// mark this as MAYBE_CHEMISTRY | |
142 | 10.1k | fn clean_mrow_children_restructure_pass<'a>(old_children: &[Element<'a>]) -> Option<Vec<Element<'a>>> { |
143 | 10.1k | let mut changed = false; |
144 | 10.1k | let mut new_children = Vec::with_capacity(2*old_children.len()); |
145 | 10.1k | let mut i = 0; |
146 | 40.7k | while i < old_children.len() { |
147 | 30.6k | if let Some( |
148 | 1 | new_children.push(paren_mrow_aq); |
149 | 1 | i += 4; // skipping "( a q )" |
150 | 1 | changed = true; |
151 | 1 | continue; |
152 | } else { | |
153 | 30.6k | let child = old_children[i]; |
154 | 30.6k | let child_name = name(child); |
155 | 30.6k | if child_name == "mi" || ( |
156 | // break mi/mtext that is done as "(g)", etc. Even if it isn't 'g', 'l', etc., it probably shouldn't be an mi/text. | |
157 | 8.62k | let text = as_text(child); |
158 | 8.62k | if text.starts_with('(') && |
159 | 4 | let doc = child.document(); |
160 | 4 | let state = create_mathml_element(&doc, "mi"); |
161 | 4 | state.set_text(&text[1..text.len()-1]); |
162 | 4 | let open = create_mathml_element(&doc, "mo"); |
163 | 4 | open.set_text("("); |
164 | 4 | let close = create_mathml_element(&doc, "mo"); |
165 | 4 | close.set_text(")"); |
166 | 4 | let mrow = create_mathml_element(&doc, "mrow"); |
167 | 4 | mrow.append_children(&[open,state,close]); |
168 | 4 | new_children.push(mrow); |
169 | 4 | i += 1; |
170 | 4 | changed = true; |
171 | 4 | continue; |
172 | 8.62k | } |
173 | 21.9k | } else if i + 2 < old_children.len() { |
174 | // wrap with an mrow if we are not already an 'mrow' | |
175 | 9.68k | let parent = get_parent(child); // safe since 'math' is always at root |
176 | 9.68k | if !(name(parent) == "mrow" && |
177 | 8.68k | let Some( |
178 | // debug!("make_mrow added mrow"); | |
179 | 377 | new_children.push(paren_mrow); |
180 | 377 | i += 3; |
181 | 377 | changed = true; |
182 | 377 | continue; |
183 | 9.30k | } |
184 | 12.3k | } |
185 | 30.2k | if child_name == "mo" { |
186 | 9.50k | let likely_chemistry_op = likely_chem_formula_operator(child); |
187 | // debug!("clean_mrow_children_restructure_pass -- in mo: likely {}, {}", likely_chemistry_op, mml_to_string(child)); | |
188 | 9.50k | if likely_chemistry_op >= 0 { |
189 | // if possible chemistry to left and right, then override text for operator lookup | |
190 | // note: on the right, we haven't set chem flag for operators yet, so we skip them | |
191 | 2.98k | let preceding = child.preceding_siblings(); |
192 | 2.98k | let following = child.following_siblings(); |
193 | 2.98k | if !preceding.is_empty() && |
194 | 1.84k | ( has_inherited_property(child, "chemical-formula") || |
195 | 2.27k |
|
196 | 2.27k | let child = as_element(child); |
197 | 2.27k | name(child)=="mn" || |
198 | 574 |
|
199 | 574 | let child = as_element(child); |
200 | 574 | name(child)=="mo" || |
201 | 574 | })) { |
202 | 146 | // "=", etc., should be treated as high priority separators |
203 | 146 | // debug!("clean_mrow_children_restructure: child = {}", mml_to_string(child)); |
204 | 146 | child.set_attribute_value(CHEMICAL_BOND, "true"); |
205 | 146 | child.set_attribute_value(CHEM_FORMULA_OPERATOR, &likely_chemistry_op.to_string()); |
206 | 146 | child.set_attribute_value(MAYBE_CHEMISTRY, &likely_chemistry_op.to_string()); |
207 | 2.83k | } |
208 | 6.52k | } else { |
209 | 6.52k | likely_chem_equation_operator(child); // need to mark MAYBE_CHEMISTRY for CHEMICAL_BOND tests |
210 | 6.52k | } |
211 | 20.7k | } else if child_name == "mrow" && |
212 | 2.05k | let Some( |
213 | 1 | latex_value == r"\mathrel{\longrightleftharpoons}" { |
214 | 0 | child.set_attribute_value("data-unicode", "\u{1f8d2}"); |
215 | 0 | child.set_attribute_value(MAYBE_CHEMISTRY, "2"); // same as is_hack_for_missing_arrows() |
216 | 20.7k | } |
217 | 30.2k | i += 1; |
218 | 30.2k | new_children.push(child); |
219 | } | |
220 | } | |
221 | ||
222 | 10.1k | return if changed { |
223 | ||
224 | ||
225 | /// if it looks like we have ChemFormula ( a q ), merge the 'a' and 'q' together into an 'mi' | |
226 | /// if not already true, structure '( aq )' into a single mrow (might be other elements on either side) | |
227 | /// returns the last char matched | |
228 | 30.6k | fn clean_aq_state<'a>(children: &[Element<'a>], i: usize) -> Option<Element<'a>> { |
229 | 30.6k | if i+3 >= children.len() || ( |
230 | 27.8k | return None; // can't be '( a q )' -- not enough elements left or not Chem Formula on left |
231 | 2.79k | } |
232 | ||
233 | // this is a little sloppy in that we allow matching text in any leaf element, but we can use the same function | |
234 | 2.79k | if is_text(children[i], "(") && |
235 | 244 | is_text(children[i+1], "a") && |
236 | 1 | is_text(children[i+3], ")") { |
237 | 1 | let mi = create_mathml_element(&children[i].document(), "mi"); |
238 | 1 | mi.set_text("aq"); |
239 | 1 | return make_mrow([children[i], mi, children[i+3]]); |
240 | 2.79k | } |
241 | 2.79k | return None; |
242 | 30.6k | } |
243 | ||
244 | 12.3k | fn is_text(node: Element, target: &str) -> bool { |
245 | 12.3k | return is_leaf(node) && |
246 | 12.3k | } |
247 | ||
248 | /// Converts "( child )" to mrow with those elements as children. | |
249 | /// This is to make ascertaining whether this is a chemical state easier, but it is correct even if not a chemical state. | |
250 | 8.68k | fn make_mrow(children: [Element; 3]) -> Option<Element> { |
251 | // this is a little sloppy in that we allow matching text in any leaf element, but we can use the same function | |
252 | 8.68k | if is_text(children[0], "(") && |
253 | 631 | is_text(children[2], ")") { |
254 | 378 | let mrow = create_mathml_element(&children[0].document(), "mrow"); |
255 | 378 | mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
256 | 378 | mrow.append_children(children); |
257 | 378 | return Some(mrow); |
258 | 8.31k | } |
259 | 8.31k | return None; |
260 | 8.68k | } |
261 | 10.1k | } |
262 | ||
263 | /// Pass 2: (assuming something was marked in pass 1) | |
264 | /// a) find the first marked child and then the last consecutive marked child and trim any mo's from the ends | |
265 | /// b) evaluate the likelihood that the sequence is chemistry | |
266 | 10.1k | fn clean_mrow_children_mark_pass(children: &[Element]) { |
267 | 10.1k | let mut start = None; |
268 | 30.6k | for i in |
269 | 30.6k | let child = children[i]; |
270 | 30.6k | if child.attribute(MAYBE_CHEMISTRY).is_some() { |
271 | 4.64k | if start.is_none() { |
272 | 3.63k | if name(child) == "mo" { |
273 | 2.38k | // debug!(" start.is_none(): removing MAYBE_CHEMISTRY on {}", as_text(child)); |
274 | 2.38k | child.remove_attribute(MAYBE_CHEMISTRY); |
275 | 2.38k | child.remove_attribute(CHEM_FORMULA_OPERATOR); |
276 | 2.38k | child.remove_attribute(CHEM_EQUATION_OPERATOR); |
277 | 2.38k | child.remove_attribute(CHEMICAL_BOND); |
278 | 2.38k | } else { |
279 | 1.25k | start = Some(i); |
280 | 1.25k | } |
281 | 1.00k | } |
282 | 25.9k | } else if let Some( |
283 | 804 | remove_operators_at_end_of_sequence(children, seq_start, i) { |
284 | 804 | start = None; |
285 | 25.1k | } |
286 | } | |
287 | ||
288 | 10.1k | if let Some( |
289 | 452 | remove_operators_at_end_of_sequence(children, seq_start, children.len()); |
290 | 9.73k | } |
291 | 10.1k | return; |
292 | ||
293 | ||
294 | 1.25k | fn remove_operators_at_end_of_sequence(children: &[Element], start: usize, end: usize) -> bool { |
295 | // debug!(" looking for ops at end of {}..{}, last is:{}", start, end, mml_to_string(children[end-1])); | |
296 | 1.45k | for stop in ( |
297 | 1.45k | let end_child = children[stop]; |
298 | 1.45k | if name(end_child) == "mo" { |
299 | 202 | end_child.remove_attribute(MAYBE_CHEMISTRY); |
300 | 202 | } else { |
301 | 1.25k | return true; |
302 | } | |
303 | } | |
304 | 0 | return false |
305 | 1.25k | } |
306 | 10.1k | } |
307 | ||
308 | ||
309 | /// Very little software gets the token elements for chemistry right. | |
310 | /// Sometimes multiple elements are in a single token (e.g. "NaCl") and sometimes | |
311 | /// a single element is spread across multiple tokens (e.g. "N", "a"). | |
312 | /// | |
313 | /// Here we attempt one or the other repair, but not both on the assumption there is | |
314 | /// consistency in the error. | |
315 | /// | |
316 | /// Returns a Vec of the chemical elements or None. If a merge happened, the tree is altered. | |
317 | 12.3k | pub fn convert_leaves_to_chem_elements(mathml: Element) -> Option<Vec<Element>> { |
318 | // gather up all the consecutive mi/mtext | |
319 | 12.3k | if !(name(mathml) == "mi" || |
320 | 0 | return None; // do nothing |
321 | 12.3k | } |
322 | ||
323 | // we play games with the string to avoid allocation... | |
324 | 12.3k | let token_string = as_text(mathml); |
325 | 12.3k | if !token_string.is_ascii() { |
326 | 2.67k | return None; // chemical elements are ASCII |
327 | 9.62k | } |
328 | 9.62k | let doc = mathml.document(); |
329 | 9.62k | if token_string.len() > 1 { // safe because all chars are ASCII |
330 | 2.54k | return split_string_chem_element(&doc, mathml); |
331 | 7.08k | } |
332 | 7.08k | let parent = get_parent(mathml); |
333 | 7.08k | let parent_name = name(parent); |
334 | 7.08k | if !(parent_name == "mrow" || |
335 | 2.57k | return None; // only try to merge if in an mrow |
336 | 4.50k | } |
337 | 4.50k | let answer = merge_tokens_chem_element(&doc, mathml, &mathml.following_siblings()); |
338 | 4.50k | return answer; |
339 | ||
340 | ||
341 | 4.50k | fn merge_tokens_chem_element<'a>(doc: &Document<'a>, leaf: Element<'a>, following_siblings: &[ChildOfElement<'a>]) -> Option<Vec<Element<'a>>> { |
342 | 4.50k | if following_siblings.is_empty() { |
343 | 1.22k | return None; |
344 | 3.28k | } |
345 | 3.28k | let second_element = as_element(following_siblings[0]); |
346 | 3.28k | let second_element_name = name(second_element); |
347 | 3.28k | if second_element_name != "mi" && |
348 | 3.02k | return None; |
349 | 256 | } |
350 | 256 | let second_element_text = as_text(second_element); |
351 | 256 | if second_element_text.len() != 1 { |
352 | 57 | return None; |
353 | 199 | } |
354 | 199 | let token_string = as_text(leaf); |
355 | 199 | let chem_token_string = vec![token_string.as_bytes()[0], second_element_text.as_bytes()[0]]; |
356 | 199 | if let Some( |
357 | 4 | chem_element.set_text(as_text(chem_element)); |
358 | 4 | chem_element.set_attribute_value(MAYBE_CHEMISTRY, chem_element.attribute_value(MAYBE_CHEMISTRY).unwrap()); |
359 | 4 | chem_element.set_attribute_value(MERGED_TOKEN, "true"); |
360 | 4 | second_element.remove_from_parent(); |
361 | 4 | return Some(vec![chem_element]); |
362 | 195 | } |
363 | 195 | return None; |
364 | 4.50k | } |
365 | ||
366 | /// split the string which has been checked to be all ASCII chars | |
367 | 2.54k | fn split_string_chem_element<'a>(doc: &Document<'a>, leaf: Element<'a>) -> Option<Vec<Element<'a>>> { |
368 | 2.54k | let token_string = as_text(leaf).as_bytes(); |
369 | 2.54k | let token_len = token_string.len(); |
370 | 2.54k | let mut j = 0; |
371 | 2.54k | let mut new_children = Vec::with_capacity(token_string.len()); |
372 | 3.31k | while j < token_len { |
373 | // try elements of length 2 and 1, preferring longer elements (e.g., prefer "Na" over "N") | |
374 | 2.94k | if let Some( |
375 | 310 | new_children.push(chem_element); |
376 | 310 | j += 2; |
377 | 310 | continue; |
378 | 2.63k | } else if let Some( |
379 | 457 | new_children.push(chem_element); |
380 | 457 | j += 1; |
381 | 457 | continue; |
382 | 2.18k | } |
383 | 2.18k | return None; // didn't find a valid chem element |
384 | } | |
385 | 362 | if new_children.len() <= 1 { |
386 | 231 | return None; |
387 | 131 | } |
388 | 131 | add_attrs(new_children[new_children.len()-1], &leaf.attributes()); |
389 | 131 | new_children[new_children.len()-1].set_attribute_value(SPLIT_TOKEN, "true"); |
390 | // debug!("split_string_chem_element: {} -> {}", String::from_utf8(token_string.to_vec()).unwrap(), new_children.len()); | |
391 | 131 | return Some(new_children); |
392 | 2.54k | } |
393 | ||
394 | /// Returns element or None | |
395 | 5.78k | fn get_chem_element<'a>(doc: &Document<'a>, bytes_str: &[u8], n: usize) -> Option<Element<'a>> { |
396 | use std::str; | |
397 | 5.78k | let len = bytes_str.len(); |
398 | 5.78k | if n > len { |
399 | 277 | return None; // can't be an chemical letter |
400 | 5.50k | } |
401 | 5.50k | match str::from_utf8(&bytes_str[..n]) { |
402 | 5.50k | Ok(chem_element) => { |
403 | 5.50k | if CHEMICAL_ELEMENT_ELECTRONEGATIVITY.contains_key(chem_element) { |
404 | 771 | return Some(new_chemical_element(doc, chem_element)); |
405 | 4.73k | } |
406 | 4.73k | return None; |
407 | } | |
408 | 0 | Err(_) => return None, |
409 | } | |
410 | 5.78k | } |
411 | ||
412 | 771 | fn new_chemical_element<'a>(doc: &Document<'a>, chem_element_str: &str) -> Element<'a> { |
413 | 771 | let result = create_mathml_element(doc, "mi"); |
414 | 771 | result.set_text(chem_element_str); |
415 | 771 | result.set_attribute_value(MAYBE_CHEMISTRY, if chem_element_str.len() == 1 { |
416 | 771 | if chem_element_str.len() == 1 { |
417 | 457 | result.set_attribute_value("mathvariant", "normal"); |
418 | 457 |
|
419 | 771 | return result; |
420 | 771 | } |
421 | 12.3k | } |
422 | ||
423 | /// Looks at the children of the element and uses heuristics to decide whether this is a chemical equation/formula | |
424 | /// If it is, it is marked with either data-chem-equation or data-chem-formula | |
425 | /// This function assumes proper structure | |
426 | /// | |
427 | /// Returns true if not chemistry -- added attrs, mrows, and leaves are removed in preparation for a second parse | |
428 | 5.05k | pub fn scan_and_mark_chemistry(mathml: Element) -> bool { |
429 | 5.05k | if is_chemistry_off(mathml) { |
430 | 0 | return true; |
431 | 5.05k | } |
432 | ||
433 | 5.05k | let child = as_element(mathml.children()[0]); |
434 | // debug!("scan_and_mark_chemistry:\n{}", mml_to_string(child)); | |
435 | 5.05k | assert_eq!(name(mathml), "math"); |
436 | 5.05k | let is_chemistry = if let Some( |
437 | // MathJax v4 includes this really useful info -- if it starts \ce -- we have Chemistry | |
438 | // need to determine if it is an equation or a formula | |
439 | 5 | latex.trim_start().starts_with(r"\ce") |
440 | } else { | |
441 | 5.05k | has_chem_intent(mathml, ":chemical-formula") || has_chem_intent(mathml, ":chemical-equation") |
442 | }; | |
443 | ||
444 | 5.05k | if is_chemistry || |
445 | 669 | assert_eq!(mathml.children().len(), 1); |
446 | 669 | let likelihood = likely_chem_formula(child); |
447 | 669 | if likelihood >= CHEMISTRY_THRESHOLD || |
448 | 211 | child.set_attribute_value(MAYBE_CHEMISTRY, std::cmp::max(CHEMISTRY_THRESHOLD, likelihood).to_string().as_str()); |
449 | 211 | set_marked_chemistry_attr(child, CHEM_FORMULA); |
450 | 458 | } |
451 | ||
452 | 669 | if child.attribute(CHEM_FORMULA).is_none() { |
453 | // can't be both an equation and a formula... | |
454 | 458 | let likelihood = likely_chem_equation(child); |
455 | 458 | if is_chemistry || |
456 | 36 | child.set_attribute_value(MAYBE_CHEMISTRY, std::cmp::max(CHEMISTRY_THRESHOLD, likelihood).to_string().as_str()); |
457 | 36 | set_marked_chemistry_attr(child, CHEM_EQUATION); |
458 | 422 | } |
459 | 211 | } |
460 | 4.38k | } |
461 | // debug!("...after marking:\n{}", mml_to_string(child)); | |
462 | ||
463 | 5.05k | if child.attribute(CHEM_FORMULA).is_none() && |
464 | 4.80k | if !has_maybe_chemistry(mathml) { |
465 | 3.68k | return true; // quick check avoids needing a second parse due to removing added elements |
466 | 1.12k | } |
467 | 1.12k | return !is_changed_after_unmarking_chemistry(mathml); |
468 | } else { | |
469 | 247 | return true; |
470 | } | |
471 | 5.05k | } |
472 | ||
473 | // returns the marked attr value or None | |
474 | 16.2k | fn get_marked_value(mathml: Element) -> Option<i32> { |
475 | 16.2k | return mathml.attribute_value(MAYBE_CHEMISTRY).map(|value| |
476 | 16.2k | } |
477 | ||
478 | /// Sets the attr 'chem' | |
479 | /// Recurse through all the children that have MAYBE_CHEMISTRY set | |
480 | 4.24k | fn set_marked_chemistry_attr(mathml: Element, chem: &str) { |
481 | 4.24k | let tag_name = name(mathml); |
482 | 4.24k | if let Some( |
483 | 2.88k | maybe_attr.remove_from_parent(); |
484 | ||
485 | 2.88k | match tag_name { |
486 | 2.88k | "mi" | |
487 | 2.03k | "mo" => { |
488 | 686 | if mathml.attribute(CHEM_FORMULA_OPERATOR).is_none() && |
489 | // don't mark as both formula and equation | |
490 | 433 | mathml.set_attribute_value(if chem == CHEM_FORMULA { |
491 | 253 | } |
492 | }, | |
493 | 1.35k | "mn" => |
494 | 1.26k | "mrow" | |
495 | 1.25k | let mut chem_name = chem; |
496 | 1.25k | if tag_name != "mrow" && |
497 | // look at base -- if an mi/mtext then this is really a chemical formula | |
498 | 69 | let base = as_element(mathml.children()[0]); |
499 | 69 | let base_name = name(base); |
500 | 69 | if base_name == "mi" || |
501 | 63 | chem_name = CHEM_FORMULA; |
502 | 63 |
|
503 | 1.18k | } |
504 | ||
505 | 1.25k | if mathml.attribute(CHEM_FORMULA).is_none() { |
506 | 1.23k | // don't mark as both formula and equation |
507 | 1.23k | mathml.set_attribute_value(chem_name, maybe_attr.value()); |
508 | 1.23k |
|
509 | 3.92k | for child in |
510 | 3.92k | set_marked_chemistry_attr(as_element(child), chem); |
511 | 3.92k | }; |
512 | } | |
513 | 10 | "mfrac" => { |
514 | 0 | let children = mathml.children(); |
515 | // debug!("mfrac children: {}", mml_to_string(mathml)); | |
516 | 0 | let numerator_is_chem_equation = IsBracketed::is_bracketed(as_element(children[0]), "[", "]", false, true); |
517 | 0 | let denominator_is_chem_equation = IsBracketed::is_bracketed(as_element(children[1]), "[", "]", false, true); |
518 | 0 | if numerator_is_chem_equation && denominator_is_chem_equation { |
519 | 0 | mathml.set_attribute_value(CHEM_EQUATION, "true"); |
520 | 0 | } |
521 | } | |
522 | 10 | _ => error!("Internal error: {tag_name} should not be marked as 'MAYBE_CHEMISTRY'"), |
523 | } | |
524 | 1.35k | } else if tag_name == "mrow" { |
525 | // could have been added during canonicalization, so never marked. Recurse to the children | |
526 | 68 | for child in |
527 | 68 | set_marked_chemistry_attr(as_element(child), chem); |
528 | 68 | }; |
529 | 1.32k | } |
530 | 4.24k | } |
531 | ||
532 | /// returns true if MAYBE_CHEMISTRY's occur within the element | |
533 | 41.3k | fn has_maybe_chemistry(mathml: Element) -> bool { |
534 | 41.3k | if mathml.attribute(MAYBE_CHEMISTRY).is_some() { |
535 | 1.12k | return true; |
536 | 40.2k | } |
537 | 40.2k | if !is_leaf(mathml) { |
538 | 36.5k | for child in |
539 | 36.5k | if has_maybe_chemistry(as_element(child)) { |
540 | 3.15k | return true; |
541 | 33.3k | } |
542 | } | |
543 | 22.2k | } |
544 | 37.0k | return false; |
545 | 41.3k | } |
546 | ||
547 | /// Clears MAYBE_CHEMISTRY from this element and its decedents | |
548 | /// Also deletes added mrows and leaves; returns true if anything is deleted | |
549 | 19.7k | fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool { |
550 | 19.7k | mathml.remove_attribute(MAYBE_CHEMISTRY); |
551 | 19.7k | if is_leaf(mathml) { |
552 | // don't bother testing for the attr -- just remove and nothing bad happens if they aren't there | |
553 | 13.3k | mathml.remove_attribute(CHEM_FORMULA_OPERATOR); |
554 | 13.3k | mathml.remove_attribute(CHEM_EQUATION_OPERATOR); |
555 | 13.3k | mathml.remove_attribute(CHEMICAL_BOND); |
556 | 13.3k | if mathml.attribute(MERGED_TOKEN).is_some() { |
557 | 3 | unmerge_element(mathml); |
558 | 3 | return true; // need to re-parse |
559 | 13.3k | } else if mathml.attribute(SPLIT_TOKEN).is_some() { |
560 | 33 | if let Err( |
561 | 0 | panic!("{}", err); |
562 | 33 | } |
563 | // debug!("After merge_element:{}", mml_to_string(mathml)); | |
564 | // let parent = get_parent(mathml); | |
565 | // debug!("After merge_element: -- parent{}", mml_to_string(parent)); | |
566 | ||
567 | 13.3k | } else if let Some( |
568 | 2.14k | changed_value == ADDED_ATTR_VALUE && |
569 | 2.11k | name(mathml) != "mtext" { // a hack fix for #477 (chem never modifies mtext, so this is ok) |
570 | 2.11k | mathml.remove_from_parent(); |
571 | 2.11k | return true; |
572 | 11.1k | } |
573 | 11.2k | return false; |
574 | 6.38k | } else if IsNode::is_scripted(mathml) && |
575 | 1.04k | name(as_element(mathml.children()[0])) == "mi" && |
576 | 575 | as_element(mathml.children()[0]).attribute(SPLIT_TOKEN).is_some() { |
577 | // Undo a split that happened in a scripted element. | |
578 | // We put the preceding elements into the base and call merge_element on the last element of the base | |
579 | // The first and/or the last child in the sequence could be a script that needs to be unwrapped | |
580 | 1 | let mut parent = get_parent(mathml); // there is always a "math" node |
581 | // debug!("mathml:\n{}", mml_to_string(mathml)); | |
582 | // debug!("parent before merge:\n{}", mml_to_string(parent)); | |
583 | // debug!("grandparent before merge:\n{}", mml_to_string(get_parent(parent))); | |
584 | ||
585 | 1 | let mut preceding_children = mathml.preceding_siblings(); |
586 | // could be no preceding children to canonicalization creating mrows (see issue #303), so might need to use parent, etc | |
587 | 2 | while preceding_children.is_empty() { |
588 | 1 | preceding_children = parent.preceding_siblings(); |
589 | 1 | if name(parent) == "math" { |
590 | 0 | break; // consider {SIN}^{-1} -- no preceding child |
591 | 1 | } |
592 | 1 | parent = get_parent(parent); |
593 | } | |
594 | ||
595 | 1 | let mut new_script_children = vec![]; |
596 | 1 | if !preceding_children.is_empty() { |
597 | // deal with the first element (if it needs unwrapping, it has only prescripts) | |
598 | 1 | let first_element_of_split = as_element(preceding_children[preceding_children.len()-1]); |
599 | // debug!("first_element_of_split: \n{}", mml_to_string(first_element_of_split)); | |
600 | 1 | if name(first_element_of_split) == "mmultiscripts" { |
601 | // take the base and make it the first child of preceding_children (what will get merged) | |
602 | // put the rest of the elements (the prescripts) at the end of the parent last element (mathml) which must be an mmultiscripts | |
603 | 0 | let first_element_children = first_element_of_split.children(); |
604 | 0 | assert_eq!(name(mathml), "mmultiscripts"); |
605 | 0 | let mut script_children = mathml.children(); |
606 | 0 | assert_eq!(name(as_element(script_children[0])), "mi"); |
607 | 0 | assert!(!script_children.len().is_multiple_of(2)); // doesn't have <mprescripts/> |
608 | 0 | script_children.push(first_element_children[1]); // mprescripts |
609 | 0 | script_children.push(first_element_children[2]); // prescripts subscript |
610 | 0 | script_children.push(first_element_children[3]); // prescripts superscript |
611 | ||
612 | 0 | let base_of_first_element = first_element_children[0]; // base |
613 | 0 | assert_eq!(name(as_element(base_of_first_element)), "mi"); |
614 | 0 | let script_base = as_element(script_children[0]); |
615 | 0 | let mut merged_base_text = as_text( as_element(base_of_first_element)).to_string(); |
616 | 0 | merged_base_text.push_str(as_text(script_base)); |
617 | 0 | script_base.set_text(&merged_base_text); |
618 | 0 | script_base.remove_attribute("mathvariant"); |
619 | 0 | script_base.remove_attribute(ADDED_ATTR_VALUE); |
620 | 0 | script_base.remove_attribute(MAYBE_CHEMISTRY); |
621 | 0 | script_base.remove_attribute(SPLIT_TOKEN); |
622 | 0 | mathml.replace_children(script_children); |
623 | ||
624 | 0 | first_element_of_split.remove_from_parent(); |
625 | 0 | return true; |
626 | 1 | } |
627 | 1 | new_script_children.push(ChildOfElement::Element(first_element_of_split)); |
628 | 0 | } |
629 | 1 | debug!("mathml after handling preceding children:\n{}", |
630 | 1 | let mut children_of_script = mathml.children(); |
631 | 1 | let split_child = as_element(children_of_script[0]); |
632 | 1 | new_script_children.append(&mut children_of_script); |
633 | 1 | mathml.replace_children(new_script_children); // temporarily has bad number of children |
634 | // debug!("After making bad script:\n{}", mml_to_string(mathml)); | |
635 | 1 | if let Err( |
636 | 0 | panic!("{}", err); |
637 | 1 | } |
638 | 1 | return true; |
639 | } else { | |
640 | 6.37k | let mut answer = false; |
641 | 18.5k | for child in |
642 | 18.5k | let child = as_element(child); |
643 | 18.5k | if name(child) == "mtd" && |
644 | 2 | answer = true; // each mtd acts as a potential island for chemistry, so don't clear it |
645 | 18.5k | } else { |
646 | 18.5k | answer |= is_changed_after_unmarking_chemistry(child); |
647 | 18.5k | } |
648 | } | |
649 | 6.37k | if name(mathml) == "mrow" { |
650 | 3.58k | if let Some( |
651 | // we added an mrow, we can remove it -- but this might be already processed which is the case if "data-id-added" is true (exists) | |
652 | 2.86k | if changed_value == ADDED_ATTR_VALUE && mathml.attribute("data-id-added").is_none() { |
653 | // mrows get added for several reasons. One of them is to canonicalize elements like msqrt that can have 1 or more children; | |
654 | // those should not get removed because the re-parse doesn't add those | |
655 | // Although they would never be added, elements with fixed number of children also shouldn't have the mrow go away | |
656 | // We are left with only removing mrows with one child or mrows that are children of mrows (simpler test than ELEMENTS_WITH_ONE_CHILD) | |
657 | 2.86k | let parent = get_parent(mathml); // mathml is mrow, so parent always exists |
658 | 2.86k | if mathml.children().len() == 1 || |
659 | 6.26k | let |
660 | 2.31k | mathml.remove_attribute(CHANGED_ATTR); // if just one child, the attrs are pushed onto the child |
661 | // debug!("is_changed_after_unmarking: before replace - parent\n{}", mml_to_string(parent)); | |
662 | 2.31k | replace_children(mathml, children); |
663 | // debug!("is_changed_after_unmarking: parent\n{}", mml_to_string(parent)); | |
664 | ||
665 | 557 | } |
666 | 0 | } |
667 | 720 | } |
668 | 3.58k | return true; |
669 | 2.79k | } |
670 | 2.79k | return answer; |
671 | } | |
672 | ||
673 | 3 | fn unmerge_element(mathml: Element) { |
674 | // a merged token occurs when two single letters get merged into one. Here we recreate the two tokens | |
675 | 3 | assert!(is_leaf(mathml)); |
676 | // debug!("unmerge_element: {}", mml_to_string(mathml)); | |
677 | 3 | let mut token_str = as_text(mathml).chars(); |
678 | 3 | let first = create_mathml_element(&mathml.document(), name(mathml)); |
679 | 3 | first.set_text(&token_str.next().unwrap().to_string()); |
680 | 3 | let second = create_mathml_element(&mathml.document(), name(mathml)); |
681 | 3 | second.set_text(&token_str.next().unwrap().to_string()); |
682 | 3 | replace_children(mathml, vec![first, second]); |
683 | 3 | } |
684 | ||
685 | /// Put the split pieces back together (undo the split) | |
686 | 34 | fn merge_element(mathml: Element) -> Result<()> { |
687 | // debug!("merge_element: {}", mml_to_string(mathml)); | |
688 | // debug!("merge_element parent: {}", mml_to_string(get_parent(mathml))); | |
689 | 34 | assert!(is_leaf(mathml)); |
690 | 34 | let mut preceding_children = mathml.preceding_siblings(); |
691 | // debug!("preceding_children: {}", preceding_children.iter().map(|&el| name(as_element(el)).to_string()).collect::<Vec<String>>().join(", ")); | |
692 | 34 | if preceding_children.is_empty() { |
693 | // handle: | |
694 | // * case where we have mi mmultiscripts mi ... where the second mi needs to join with the first (see test mhchem_so4) | |
695 | // * case where the child got buried in an added mrow (can only happen one level deep because invisible times should get inserted) | |
696 | 0 | let parent = get_parent(mathml); // mathml is leaf, so parent always exists |
697 | 0 | preceding_children = parent.preceding_siblings(); |
698 | 0 | if preceding_children.is_empty() || |
699 | 0 | !(name(parent) == "mmultiscripts" || |
700 | 0 | (name(parent) == "mrow" && parent.attribute_value(CHANGED_ATTR).is_some() && |
701 | 0 | parent.attribute_value(CHANGED_ATTR).unwrap() == ADDED_ATTR_VALUE)) { |
702 | 0 | bail!("Internal error: {} should not have been split'", mml_to_string(mathml)); |
703 | 0 | } |
704 | 34 | } |
705 | // Note: there was an invisible U+2063, but it was removed before we got here | |
706 | // The parent mrow could have many children that couldn't have been part of a split -- only consider feasible children to split (mi/mtext) | |
707 | // To figure this out, we walk backwards adding the text in reverse and then reverse that text in the end | |
708 | 34 | let mut merged_text = Vec::default(); |
709 | 46 | for &child in |
710 | 46 | let child = as_element(child); |
711 | // because this is before canonicalization, there could be an mrow with just mi/mtext | |
712 | 46 | if name(child) == "mrow" && |
713 | 0 | // "lift" the child up so all the links (e.g., siblings) are correct |
714 | 0 | let child = as_element(child.children()[0]); |
715 | 0 | set_mathml_name(child, name(child)); |
716 | 0 | crate::canonicalize::add_attrs(child, &child.attributes()); |
717 | 0 | child.replace_children(child.children()); |
718 | 46 | } |
719 | 46 | if name(child) != "mi" && |
720 | 12 | break; |
721 | 34 | } |
722 | 34 | merged_text.push(as_text(child)); |
723 | 34 | child.remove_from_parent(); |
724 | } | |
725 | 34 | merged_text.reverse(); |
726 | 34 | let mut merged_text = merged_text.join(""); |
727 | 34 | merged_text.push_str(as_text(mathml)); |
728 | 34 | mathml.set_text(&merged_text); |
729 | 34 | mathml.remove_attribute("mathvariant"); |
730 | 34 | mathml.remove_attribute(ADDED_ATTR_VALUE); |
731 | 34 | mathml.remove_attribute(MAYBE_CHEMISTRY); |
732 | 34 | mathml.remove_attribute(SPLIT_TOKEN); |
733 | 34 | return Ok( () ); |
734 | 34 | } |
735 | 19.7k | } |
736 | ||
737 | /// Returns true only if 'mathml' potentially is chemistry. | |
738 | /// This assumes canonicalization has happened and that 'mathml' is the 'math' element | |
739 | 5.05k | fn is_chemistry_sanity_check(mathml: Element) -> bool { |
740 | // This does some sanity checking. More can definitely be done | |
741 | // Checks: | |
742 | // * there should be chemical elements | |
743 | // * if the child is an mrow with three children, the operator should be '=' (not CHEMICAL_BOND) or an arrow | |
744 | // in this case, we gather up the elements on the lhs and rhs. The sets should be equal and non-empty. | |
745 | // the exception is if there are prescripts, in which as we might have radioactive decay so we don't require the sets to be equal | |
746 | // * otherwise, we gather up all the chemical elements and make sure the set is non-empty | |
747 | // * if it isn't an mrow, we leave it to likely_chem_equation() to rule it out | |
748 | 5.05k | assert_eq!(name(mathml), "math"); |
749 | 5.05k | assert_eq!(mathml.children().len(), 1); |
750 | 5.05k | let mathml = as_element(mathml.children()[0]); |
751 | 5.05k | if name(mathml) == "mrow" { |
752 | 3.29k | let mrow_children = mathml.children(); |
753 | 3.29k | if mrow_children.len() == 3 && |
754 | 371 | let mut lhs_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
755 | 371 | let lhs_has_prescripts = gather_chemical_elements(as_element(mrow_children[0]), &mut lhs_elements); |
756 | // need to include the arrow as it might have the addition of some chemical elements (see UEB/iceb.rs/chem_16_5_2) | |
757 | 371 | gather_chemical_elements(as_element(mrow_children[1]), &mut lhs_elements); |
758 | 371 | let mut rhs_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
759 | 371 | let rhs_has_prescripts = gather_chemical_elements(as_element(mrow_children[2]), &mut rhs_elements); |
760 | 371 | if lhs_elements.is_empty() { |
761 | 269 | return false; |
762 | 102 | } |
763 | // debug!("lhs/rhs elements: {:?}, {:?}", lhs_elements, rhs_elements); | |
764 | // debug!("lhs/rhs has prescripts: {}, {}", lhs_has_prescripts, rhs_has_prescripts); | |
765 | 102 | if lhs_elements == rhs_elements { |
766 | 37 | return !(lhs_has_prescripts ^ rhs_has_prescripts); // seems reasonable that if the lhs has prescripts, so should the rhs |
767 | 65 | } |
768 | 65 | return lhs_has_prescripts && |
769 | 2.92k | } |
770 | 1.76k | } |
771 | 4.68k | let mut chem_elements = HashSet::with_capacity(8); // likely more than anything we'll encounter -- bigger affects '=' op |
772 | 4.68k | gather_chemical_elements(mathml, &mut chem_elements); |
773 | 4.68k | return !chem_elements.is_empty(); |
774 | ||
775 | ||
776 | 2.52k | fn is_arrow_or_equal(mathml: Element) -> bool { |
777 | 2.52k | let base = get_possible_embellished_node(mathml); |
778 | 2.52k | if name(base) != "mo" || |
779 | 542 | return false; |
780 | 1.98k | } |
781 | 1.98k | let text = as_text(base); |
782 | 1.98k | return text == "=" || |
783 | ||
784 | 2.52k | } |
785 | ||
786 | /// Gather up all the chemical elements in the element and return true if it has numerical prescripts | |
787 | 48.3k | fn gather_chemical_elements<'a>(mathml: Element<'a>, chem_elements: &mut HashSet<&'a str>) -> bool { |
788 | 48.3k | match name(mathml) { |
789 | 48.3k | "mi" | |
790 | 10.8k | if is_chemical_element(mathml) { |
791 | 1.60k | chem_elements.insert(as_text(mathml)); |
792 | 9.27k | } |
793 | 10.8k | return false; |
794 | }, | |
795 | 37.4k | "msub" | |
796 | 2.16k | gather_chemical_elements(get_possible_embellished_node(mathml), chem_elements); |
797 | 2.16k | return name(mathml) == "mmultiscripts" && |
798 | }, | |
799 | 35.2k | "semantics" => { |
800 | 0 | return gather_chemical_elements( get_presentation_element(mathml).1, chem_elements ); |
801 | }, | |
802 | 35.2k | _ => if is_leaf(mathml) { return |
803 | } | |
804 | ||
805 | // mrow, msqrt, etc | |
806 | 13.7k | let mut has_prescripts = false; |
807 | 40.3k | for child in |
808 | 40.3k | let child = as_element(child); |
809 | 40.3k | has_prescripts |= gather_chemical_elements(child, chem_elements); |
810 | 40.3k | } |
811 | 13.7k | return has_prescripts; |
812 | 48.3k | } |
813 | ||
814 | /// find the mprescripts child and then check the following siblings for numerical prescripts | |
815 | 291 | fn has_numerical_prescripts(mathml: Element) -> bool { |
816 | 291 | let children = mathml.children(); |
817 | // quick check to see if there is an mprescripts child | |
818 | 291 | if !children.len().is_multiple_of(2) { // <mprescripts/> => even number of children |
819 | 129 | return false; |
820 | 162 | } |
821 | // we need enumerate because the "step_by" will cause any returned iterator to jump ahead by 2 | |
822 | 162 | let i_mprescripts = children.iter() |
823 | 162 | .enumerate() |
824 | 162 | .skip(1) |
825 | 162 | .step_by(2) |
826 | 222 | . |
827 | 162 | .map(|(i, _)| i); |
828 | ||
829 | 162 | if let Some(i) = i_mprescripts { |
830 | 162 | let subscript = as_element(children[i+1]); // can be +1/-1 for beta decay |
831 | 162 | let superscript = as_element(children[i+2]); // mass number, so always >= 0 |
832 | 162 | if name(superscript) != "mn" { |
833 | 55 | return false; |
834 | 107 | } |
835 | 107 | return name(subscript) == "mn" || |
836 | 36 | (name(subscript) == "mrow" && |
837 | 0 | name(as_element(subscript.children()[3])) == "mm" && |
838 | 0 | name(as_element(subscript.children()[1])) == "mo" && |
839 | 0 | matches!(as_text(as_element(subscript.children()[1])), "+" | "-")); |
840 | 0 | } |
841 | 0 | return false; |
842 | 291 | } |
843 | 5.05k | } |
844 | ||
845 | /// Looks at the children of the element and uses heuristics to decide whether this is a chemical equation. | |
846 | /// This assumes canonicalization of characters has happened | |
847 | 713 | fn likely_chem_equation(mathml: Element) -> i32 { |
848 | // mfrac -- could be a ratio of concentrations | |
849 | 713 | if name(mathml) != "mrow" && |
850 | 119 | return NOT_CHEMISTRY; |
851 | 594 | } |
852 | ||
853 | // debug!("start likely_chem_equation:\n{}", mml_to_string(mathml)); | |
854 | // mrow -- check the children to see if we are likely to be a chemical equation | |
855 | ||
856 | // concentrations should either be unscripted or have a superscript that isn't a charge | |
857 | // they occur in an mrow or mfrac | |
858 | 594 | if IsBracketed::is_bracketed(mathml, "[", "]", false, true) { |
859 | 10 | let parent_name = name(get_parent(mathml)); |
860 | 10 | if parent_name == "mfrac" || parent_name == "mrow" || |
861 | 0 | (parent_name == "msup" && likely_chem_superscript(as_element(mathml.following_siblings()[0])) < 0){ |
862 | 10 | return if as_element(mathml.children()[0]).attribute(CHEM_FORMULA).is_some() { |
863 | 0 | } |
864 | 584 | } |
865 | ||
866 | // possible improvement -- give bonus points for consecutive (not counting invisible separators) chemical elements on top of the existing points | |
867 | 584 | let mut likelihood = 0; // indicator of likely match |
868 | 584 | let mut has_equilibrium_constant = false; |
869 | 584 | let children = mathml.children(); |
870 | 1.22k | for i in |
871 | 1.22k | let child = as_element(children[i]); |
872 | // debug!(" i={}, likelihood={}, child={}", i, likelihood, crate::canonicalize::element_summary(child)); | |
873 | 1.22k | if let Some( |
874 | 457 | likelihood += likely; |
875 | 457 | continue; |
876 | 771 | } |
877 | 771 | if i == children.len()-1 { |
878 | 195 | let likely = likely_chem_state(child); |
879 | 195 | if likely > 0 { |
880 | 0 | likelihood += likely; |
881 | 0 | break; |
882 | 195 | } |
883 | // otherwise, check the last element as normal | |
884 | 576 | } |
885 | 771 | let tag_name = name(child); |
886 | 771 | let likely = match tag_name { |
887 | 771 | "mi" => |
888 | 625 | "mn" => |
889 | 616 | "mo" | |
890 | 286 | "msub" | |
891 | 38 | if is_equilibrium_constant(child) { |
892 | 0 | has_equilibrium_constant = true; |
893 | 0 | 2 |
894 | } else { | |
895 | 38 | likely_adorned_chem_formula(child) |
896 | } | |
897 | }, | |
898 | 248 | "mfrac" => { |
899 | 0 | if has_equilibrium_constant { |
900 | 0 | 2 |
901 | } else { | |
902 | 0 | -3 // fraction tend only to appear after an equilibrium constant |
903 | } | |
904 | }, | |
905 | 248 | "mrow" => { |
906 | 248 | let likely = likely_chem_formula(child); |
907 | 248 | if likely < 0 { |
908 | 248 | likely_chem_equation(child) |
909 | } else { | |
910 | 0 | likely |
911 | } | |
912 | }, | |
913 | // no need to check for mtr or mtd because they only exist in a table and the recursion is dealt with here. | |
914 | 0 | "mtable" => { |
915 | 0 | for mrow in child.children() { |
916 | 0 | let mrow = as_element(mrow); |
917 | 0 | for mtd in mrow.children() { |
918 | 0 | let mtd = as_element(mtd); |
919 | 0 | let mut likely = likely_chem_formula(mtd); |
920 | 0 | if likely < CHEMISTRY_THRESHOLD { |
921 | 0 | likely = likely_chem_equation(mtd); |
922 | 0 | } |
923 | 0 | if likely < CHEMISTRY_THRESHOLD { |
924 | 0 | is_changed_after_unmarking_chemistry(mtd); |
925 | 0 | } |
926 | } | |
927 | } | |
928 | 0 | NOT_CHEMISTRY |
929 | }, | |
930 | 0 | "semantics" => { |
931 | 0 | likely_chem_equation(get_presentation_element(mathml).1) |
932 | }, | |
933 | 0 | _ => NOT_CHEMISTRY, |
934 | }; | |
935 | 771 | if likely >= 0 { |
936 | 164 | child.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
937 | 607 | } |
938 | 771 | likelihood += likely; |
939 | 771 | if likelihood < NOT_CHEMISTRY_THRESHOLD { |
940 | 396 | return NOT_CHEMISTRY; |
941 | 375 | } |
942 | } | |
943 | ||
944 | 188 | if likelihood >= 0 { |
945 | 108 | mathml.set_attribute_value(MAYBE_CHEMISTRY, &likelihood.to_string()); |
946 | 108 |
|
947 | 188 | return likelihood; |
948 | 713 | } |
949 | ||
950 | ||
951 | /// could be a number, a state ("(l)", "(g)", etc), or a number followed by a state | |
952 | 1.19k | fn likely_chem_subscript(subscript: Element) -> i32 { |
953 | 1.19k | let subscript_name = name(subscript); |
954 | 1.19k | if subscript_name == "mn" && |
955 | 674 | return 0; // not really much chem info about an integer subscript |
956 | 525 | } else if subscript_name == "mi" { |
957 | 328 | let text = as_text(subscript); |
958 | 328 | if text == "s" || |
959 | 6 | subscript.set_attribute_value(CHEM_STATE, "true"); |
960 | 6 | return 2; |
961 | 322 | } |
962 | 197 | } else if subscript_name == "mrow" { |
963 | // debug!("likely_chem_subscript:\n{}", mml_to_string(subscript)); | |
964 | 184 | let children = subscript.children(); |
965 | 184 | if children.len() == 3 && |
966 | 6 | return likely_chem_subscript(as_element(children[1])); |
967 | 178 | } |
968 | 178 | let i_first_child = as_element(children[0]); |
969 | 178 | if children.len() == 2 && |
970 | 103 | name(i_first_child) == "mn" && |
971 | 81 | name(as_element(children[1])) == "mrow" && |
972 | 0 | likely_chem_state(as_element(children[1])) > 0 { // notation used in en.wikipedia.org/wiki/Electrolyte#Formation |
973 | 0 | return 2; |
974 | 178 | } |
975 | 13 | } |
976 | // could be a variable 'n' or something else -- just not likely | |
977 | 513 | return -3 |
978 | 1.19k | } |
979 | ||
980 | 17 | fn small_roman_to_number(text: &str) -> &str { |
981 | // simplest to do a look up | |
982 | static ROMAN_TO_NUMBER: phf::Map<&str, &str> = phf_map! { | |
983 | "I" => "1", "II" => "2", "III" => "3", "IV" => "4", "V" => "5", "VI" => "6", "VII" => "7", "VIII" => "8", "IX" => "9", | |
984 | }; | |
985 | 17 | return ROMAN_TO_NUMBER.get(text).unwrap_or(&""); |
986 | ||
987 | 17 | } |
988 | ||
989 | 1.65k | fn likely_chem_superscript(sup: Element) -> i32 { |
990 | // either one or more '+'s (or '-'s) or a number followed by +/- | |
991 | // also could be state (en.wikipedia.org/wiki/Nuclear_chemistry#PUREX_chemistry) | |
992 | // bullet is radical (en.wikipedia.org/wiki/Radical_(chemistry)#Depiction_in_chemical_reactions); mhchem uses dot operator | |
993 | // these can stand alone, be followed by +/- or have a number in front "(2•)-"" [examples from mhchem documentation] | |
994 | // roman numerals are "oxidation state" and range from -4 to +9 | |
995 | 3 | static MULTIPLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap()); |
996 | 3 | static SINGLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap()); |
997 | static DOTS: &[char; 3] = &['⋅', '∙', '•']; | |
998 | 1.65k | let sup_name = name(sup); |
999 | 1.65k | if sup_name == "mo" && |
1000 | 113 | if as_text(sup).find(DOTS).is_some() { |
1001 | 7 | sup.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1002 | 7 | sup.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); // value doesn't really matter |
1003 | 106 | } |
1004 | 113 | return if as_text(sup).len()==1 { |
1005 | 1.54k | } else if (sup_name == "mi" || |
1006 | 17 | sup.set_attribute_value("data-number", small_roman_to_number(as_text(sup))); |
1007 | 17 | sup.set_attribute_value(MAYBE_CHEMISTRY, "2"); |
1008 | 17 | return 2; |
1009 | 1.52k | } else if sup_name == "mrow" { |
1010 | // look for something like '2+' | |
1011 | 311 | let children = sup.children(); |
1012 | 311 | if children.len() == 2 { |
1013 | 177 | let first = as_element(children[0]); |
1014 | 177 | let second = as_element(children[1]); |
1015 | 177 | if name(first) == "mn" && |
1016 | 55 | let second_text = as_text(second); |
1017 | 55 | if SINGLE_PLUS_OR_MINUS_OR_DOT.is_match(second_text) { |
1018 | 55 | if second_text.find(DOTS).is_some() { |
1019 | 0 | second.set_attribute_value(MAYBE_CHEMISTRY, "2"); |
1020 | 0 | second.set_attribute_value(CHEM_FORMULA_OPERATOR, "2"); // value doesn't really matter |
1021 | 55 | } |
1022 | 55 | sup.set_attribute_value(MAYBE_CHEMISTRY, "3"); |
1023 | 55 | return 3; // ending with a +/- makes it likely this is an ion |
1024 | 0 | } |
1025 | 122 | } |
1026 | 134 | } |
1027 | // gather up the text and see if it is all +, -, etc | |
1028 | 256 | let mut text = "".to_string(); |
1029 | 414 | for child in |
1030 | 414 | let child = as_element(*child); |
1031 | 414 | if name(child) == "mo" { |
1032 | 169 | text.push_str(as_text(child)); |
1033 | 169 | } else { |
1034 | // could have something like 'mrow(mrow 2n, -) (chem example 5-9) -- so fallback to still ok if ends with + or - | |
1035 | 245 | let last_super_child = as_element(children[children.len()-1]); |
1036 | 245 | if name(last_super_child) == "mo" { |
1037 | 7 | let text = as_text(last_super_child); |
1038 | 7 | if text == "+" || text == "-" { |
1039 | 1 | sup.set_attribute_value(MAYBE_CHEMISTRY, "3"); |
1040 | 1 | return 3; |
1041 | 6 | } |
1042 | 238 | } |
1043 | 244 | return NOT_CHEMISTRY; |
1044 | } | |
1045 | } | |
1046 | 11 | if MULTIPLE_PLUS_OR_MINUS_OR_DOT.is_match(&text) { |
1047 | 13 | for child in |
1048 | 13 | let child = as_element(child); |
1049 | 13 | if name(child) == "mo" && as_text(child).find(DOTS).is_some() { |
1050 | 0 | child.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1051 | 0 | child.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); // value doesn't really matter |
1052 | 13 | } |
1053 | } | |
1054 | 6 | let likely = 2*text.len() as i32; |
1055 | 6 | sup.set_attribute_value(MAYBE_CHEMISTRY, &likely.to_string()); |
1056 | 6 | return likely; |
1057 | 5 | } |
1058 | 1.21k | } |
1059 | 1.21k | return NOT_CHEMISTRY |
1060 | 1.65k | } |
1061 | ||
1062 | ||
1063 | /// chem_formula is likely if it is one of: | |
1064 | /// * a (possibly adorned) chemical element | |
1065 | /// * an operator that represents a bond | |
1066 | /// * fences around a chemical formula | |
1067 | /// * an mrow made up of only chemical formulas | |
1068 | 15.0k | fn likely_chem_formula(mathml: Element) -> i32 { |
1069 | // debug!("start likely_chem_formula:\n{}", mml_to_string(mathml)); | |
1070 | 15.0k | if let Some( |
1071 | 2.65k | return value; // already marked |
1072 | 12.3k | } |
1073 | ||
1074 | 12.3k | let tag_name = name(mathml); |
1075 | 12.3k | let likelihood = match tag_name { |
1076 | // a parent may clear the chem flags if something says can't be chemistry (e.g, a non chemically valid script) | |
1077 | 12.3k | "mi" => |
1078 | 10.3k | "mo" => |
1079 | 5.90k | "mtext" => |
1080 | 5.85k | "mn" => |
1081 | 3.87k | "msub" | |
1082 | 225 | likely_chem_formula(as_element(mathml.children()[0])); // set MAYBE_CHEMISTRY attribute |
1083 | 225 | likely_adorned_chem_formula(mathml) |
1084 | }, | |
1085 | 3.64k | "mrow" => { |
1086 | 3.41k | let chem_state = likely_chem_state(mathml); |
1087 | 3.41k | if chem_state > 0 { |
1088 | 18 | chem_state |
1089 | } else { | |
1090 | 3.39k | likely_mrow_chem_formula(mathml) |
1091 | } | |
1092 | }, | |
1093 | 232 | "mfrac" => { |
1094 | 73 | let children = mathml.children(); |
1095 | 73 | let num_likely = likely_chem_formula(as_element(children[0])); |
1096 | 73 | let denom_likely = likely_chem_formula(as_element(children[1])); |
1097 | 73 | let likely = num_likely.max(denom_likely); |
1098 | 73 | if likely < CHEMISTRY_THRESHOLD {NOT_CHEMISTRY} else { |
1099 | } | |
1100 | 159 | "mtd" => { |
1101 | 5 | let mut likely = likely_chem_formula(as_element(mathml.children()[0])); |
1102 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1103 | 4 | likely = likely_chem_equation(mathml); |
1104 | 4 |
|
1105 | 5 | likely |
1106 | } | |
1107 | 154 | "mtable" => { |
1108 | 4 | for mrow in |
1109 | 4 | let mrow = as_element(mrow); |
1110 | 5 | for mtd in |
1111 | 5 | let mtd = as_element(mtd); |
1112 | 5 | let mut likely = likely_chem_formula(mtd); |
1113 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1114 | 3 | likely = likely_chem_equation(mtd); |
1115 | 3 |
|
1116 | 5 | if likely < CHEMISTRY_THRESHOLD { |
1117 | 3 | is_changed_after_unmarking_chemistry(mtd); |
1118 | 3 |
|
1119 | } | |
1120 | } | |
1121 | 2 | NOT_CHEMISTRY |
1122 | }, | |
1123 | 152 | "semantics" => { |
1124 | 0 | likely_chem_formula(get_presentation_element(mathml).1) |
1125 | }, | |
1126 | _ => { | |
1127 | 152 | if !is_leaf(mathml) { |
1128 | // mfrac, msqrt, etc | |
1129 | 320 | for child in |
1130 | 320 | let child = as_element(child); |
1131 | 320 | let likelihood = likely_chem_formula(child); |
1132 | 320 | if likelihood > 0 { |
1133 | 77 | child.set_attribute_value(MAYBE_CHEMISTRY, likelihood.to_string().as_str()); |
1134 | 243 | }; |
1135 | } | |
1136 | 0 | } |
1137 | // debug!("NOT_CHEMISTRY:\n{}", mml_to_string(mathml)); | |
1138 | 152 | NOT_CHEMISTRY |
1139 | } | |
1140 | }; | |
1141 | 12.3k | if likelihood >= 0 { |
1142 | 5.09k | mathml.set_attribute_value(MAYBE_CHEMISTRY, &likelihood.to_string()); |
1143 | 7.30k | } |
1144 | // debug!("likely_chem_formula {}:\n{}", likelihood, mml_to_string(mathml)); | |
1145 | ||
1146 | 12.3k | return likelihood; |
1147 | ||
1148 | 3.39k | fn likely_mrow_chem_formula(mrow: Element) -> i32 { |
1149 | // For parens, the only reason to add them is to group the children and then indicate that there is more than one molecule | |
1150 | 3.39k | if IsBracketed::is_bracketed(mrow, "(", ")", false, false) || |
1151 | 3.14k | IsBracketed::is_bracketed(mrow, "[", "]", false, false) { |
1152 | // If it is bracketed, it should have a subscript to indicate the number of the element. | |
1153 | // We give a pass to unadorned bracketing chars | |
1154 | 310 | if mrow.children().len() != 3 { |
1155 | 0 | return NOT_CHEMISTRY; |
1156 | 310 | } |
1157 | 310 | let contents = as_element(mrow.children()[1]); |
1158 | 310 | let parent = get_parent(mrow); |
1159 | 310 | let parent_is_scripted = IsNode::is_scripted(parent); |
1160 | 310 | if name(contents) != "mrow" && |
1161 | 53 | return NOT_CHEMISTRY; |
1162 | 257 | } |
1163 | 257 | let likely = likely_chem_formula(contents); |
1164 | 257 | if parent_is_scripted { |
1165 | 149 | return likely + 3; |
1166 | } else { | |
1167 | 108 | return likely; |
1168 | } | |
1169 | 3.08k | } |
1170 | ||
1171 | 3.08k | let mut likelihood = if is_order_ok(mrow) { |
1172 | ||
1173 | // check all the children and compute the likelihood of that this is a chemical formula | |
1174 | // bonus point for consecutive chemical formula children (not counting invisible children) | |
1175 | 3.08k | let mut last_was_likely_formula = 0; // 0 is false, 1 is true |
1176 | 3.08k | let mut is_chem_formula = true; // assume true until we prove otherwise (still want to mark the children) |
1177 | 12.5k | for child in |
1178 | 12.5k | let child = as_element(child); |
1179 | 12.5k | let likely = likely_chem_formula(child); |
1180 | // debug!(" in mrow: likely={}, likelihood={}", likely, likelihood); | |
1181 | 12.5k | match likely.cmp(&0) { |
1182 | Ordering::Greater => { | |
1183 | 2.56k | likelihood += likely + last_was_likely_formula; |
1184 | 2.56k | last_was_likely_formula = if name(child) == "mo" { |
1185 | }, | |
1186 | 5.86k | Ordering::Less => { |
1187 | 5.86k | // debug!("in likely_chem_formula: FALSE: likelihood={}, child\n{}", likelihood, mml_to_string(child)); |
1188 | 5.86k | is_chem_formula = false; |
1189 | 5.86k | last_was_likely_formula = 0; |
1190 | 5.86k | likelihood += likely; |
1191 | 5.86k | }, |
1192 | Ordering::Equal => { | |
1193 | 4.08k | if name(child) == "mo" { |
1194 | 2.27k | let text = as_text(child); |
1195 | 2.27k | if text != "\u{2062}" && |
1196 | 8 | last_was_likely_formula = 0; |
1197 | 2.26k | } |
1198 | 1.81k | } |
1199 | }, | |
1200 | } | |
1201 | // debug!("in likely_chem_formula likelihood={}, child\n{}", likelihood, mml_to_string(child)); | |
1202 | // debug!(" likelihood={} (likely={})", likelihood, likely); | |
1203 | } | |
1204 | ||
1205 | 3.08k | if !is_chem_formula || |
1206 | // the children may have looked have looked right, but something has said "not likely" | |
1207 | 2.25k | return NOT_CHEMISTRY; |
1208 | 832 | } else if likelihood < CHEMISTRY_THRESHOLD && |
1209 | // debug!("is_short_formula is true for:\n{}", mml_to_string(mrow)); | |
1210 | 47 | return CHEMISTRY_THRESHOLD |
1211 | 785 | } |
1212 | 785 | return likelihood; |
1213 | 3.39k | } |
1214 | ||
1215 | 15.0k | } |
1216 | ||
1217 | /// This does some checks that sort of follow IUPAC's "Red Book" in section IR-4.4. | |
1218 | /// Those rules require knowledge that the program doesn't have (e.g., which bond is closest to the central atom). | |
1219 | /// Instead, we mainly use the two main types of orderings: alphabetical and electronegativity. | |
1220 | /// We first do a test to see if this looks like a structural formula -- if so, ordering doesn't apply. | |
1221 | /// If a formula has groupings, each grouping is checked independently of the rest since | |
1222 | /// there are cases where the outer ordering doesn't match the inner ordering. | |
1223 | /// For "generalized salts", we need to split the elements into positive and negative ions, and within each group | |
1224 | /// the order is suppose to be alphabetical but many use electronegativity (the point being there are two separate groups). | |
1225 | /// This site has a nice summary of the rules: https://chemistry.stackexchange.com/questions/537/why-is-arsenous-acid-denoted-h3aso3/538#538 | |
1226 | /// Note: "(OH)" doesn't fit with the above, and Susan Jolly suggests allowing any sequence that ends with H, so we allow that. | |
1227 | /// Also, Susan Jolly suggested allowing any compound with C, H, and O | |
1228 | 3.08k | fn is_order_ok(mrow: Element) -> bool { |
1229 | 3.08k | assert_eq!(name(mrow), "mrow"); |
1230 | 3.08k | if let Some( |
1231 | 2.73k | if |
1232 | 1.48k | return false; |
1233 | 846 | } |
1234 | 846 | let n_elements = elements.len(); |
1235 | 846 | if n_elements < 2 { |
1236 | 475 | return true; |
1237 | 371 | } else if has_noble_element(&elements) { |
1238 | 0 | return false; // noble elements don't form compounds |
1239 | } else { | |
1240 | 371 | return elements[n_elements-1] == "H" || // special case that includes "OH" |
1241 | // has_non_metal_element(&elements) && !has_non_metal_element(&elements) && // must have a metal and non-metal | |
1242 | 295 | has_c_h_o(&elements) || |
1243 | 291 | is_structural(&elements) || |
1244 | 271 | is_alphabetical(&elements) || |
1245 | 169 | is_ordered_by_electronegativity(&elements) || |
1246 | 12 | is_generalized_salt(&elements); |
1247 | } | |
1248 | } else { | |
1249 | 759 | return false; |
1250 | } | |
1251 | 3.08k | } |
1252 | ||
1253 | // from https://learnwithdrscott.com/ionic-bond-definition/ | |
1254 | // I don't include the noble gases since they don't interact with other elements and are ruled out elsewhere | |
1255 | // fn has_non_metal_element(elements: &[&str]) -> bool { | |
1256 | // static NON_METAL_ELEMENTS: phf::Set<&str> = phf_set! { | |
1257 | // "H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "As", "Se", "Br", "Te", "I", "At", | |
1258 | // }; | |
1259 | // return elements.iter().any(|&e| NON_METAL_ELEMENTS.contains(e)); | |
1260 | // } | |
1261 | ||
1262 | ||
1263 | 374 | fn has_noble_element(elements: &[&str]) -> bool { |
1264 | static NOBLE_ELEMENTS: phf::Set<&str> = phf_set! { | |
1265 | "He", "Ne", "Ar", "Kr", "Xe", "Rn", "Og" // Og might be reactive, but it is unstable | |
1266 | }; | |
1267 | 893 | return |
1268 | 374 | } |
1269 | ||
1270 | 295 | fn has_c_h_o(elements: &[&str]) -> bool { |
1271 | 295 | return elements.contains(&"C") && |
1272 | 295 | } |
1273 | ||
1274 | ||
1275 | 295 | fn is_structural(elements: &[&str]) -> bool { |
1276 | 295 | assert!(elements.len() > 1); // already handled |
1277 | ||
1278 | // debug!("is_structural: {:?}", elements); | |
1279 | 295 | let mut element_set = HashSet::with_capacity(elements.len()); |
1280 | 627 |
|
1281 | 295 | return element_set.len() < elements.len(); |
1282 | 295 | } |
1283 | ||
1284 | /// collect up all the elements in the mrow. | |
1285 | /// Returns the elements (which can be an empty vector) or None if something (right now an operator) rules out them being elements | |
1286 | 3.10k | fn collect_elements(mrow: Element<'_>) -> Option<Vec<&str>> { |
1287 | 3.10k | let mut elements = Vec::with_capacity(mrow.children().len()/2+1); // don't bother with slots for operators |
1288 | 8.86k | for child in |
1289 | 8.86k | let child = as_element(child); |
1290 | 8.86k | match name(child) { |
1291 | 8.86k | "mi" | |
1292 | 6.06k | "msub" | |
1293 | 584 | let base = as_element(child.children()[0]); |
1294 | 584 | let base_name = name(base); |
1295 | 584 | if base_name == "mi" || |
1296 | 514 | elements.push(as_text(base)); |
1297 | 514 |
|
1298 | }, | |
1299 | 5.48k | "mo" if |
1300 | 2.46k | "mo" => (), |
1301 | 2.25k | _ => (), // let loop in likely_chem_formula() deal with all the negatives |
1302 | } | |
1303 | } | |
1304 | 2.34k | return Some(elements); |
1305 | 3.10k | } |
1306 | ||
1307 | /// check to make sure elements are ordered alphabetically | |
1308 | /// Actually check Hill's system that puts 'C' followed by 'H' first if 'C' is present | |
1309 | 275 | fn is_alphabetical(elements: &[&str]) -> bool { |
1310 | 275 | assert!(elements.len() > 1); // already handled |
1311 | // debug!("is_alphabetical: {:?}", elements); | |
1312 | 275 | let mut elements = elements; |
1313 | 275 | if elements[1..].contains(&"C") { // "C" must be first if present |
1314 | 22 | return false; |
1315 | 253 | } |
1316 | 253 | if elements[0] == "C" { |
1317 | 10 | elements = if elements[1]=="H" { |
1318 | 243 | } |
1319 | 253 | return elements.len() < 2 || |
1320 | 275 | } |
1321 | ||
1322 | 174 | fn is_ordered_by_electronegativity(elements: &[&str]) -> bool { |
1323 | // HPO_4^2 (Mono-hydrogen phosphate) doesn't fit this pattern, nor does HCO_3^- (Hydrogen carbonate) and some others | |
1324 | // FIX: drop "H" from the ordering?? | |
1325 | 174 | assert!(elements.len() > 1); // already handled |
1326 | 188 | return |
1327 | 174 | } |
1328 | ||
1329 | 12 | fn is_generalized_salt(elements: &[&str]) -> bool { |
1330 | 12 | assert!(!elements.is_empty()); |
1331 | 12 | return false; |
1332 | 12 | } |
1333 | ||
1334 | ||
1335 | /// Returns the likelihood that the arg is an adorned chem formula | |
1336 | /// Adornments are: | |
1337 | /// superscripts with +/- and optionally a number (charge) | |
1338 | /// numeric subscripts (e.g. H_2) | |
1339 | /// In addition to chemical elements, we include nuclear decay since there is a lot of overlap in notation | |
1340 | /// The nuclear decay notation is mostly taken from https://tinyurl.com/2f6b8e3a | |
1341 | /// Basically it is a chemical element or 'e', 'p', 'n', 'α', 'β', or 'γ' with pre-sub/superscript | |
1342 | /// There is also an instance with a charge on the referenced page, so we allow that also. | |
1343 | /// | |
1344 | /// Note: https://tinyurl.com/ysmr8cw2 says "++"/"--", etc., is sometimes used in a superscript particle physics instead of a "2" | |
1345 | /// | |
1346 | /// Note: msubsup cleaning for an empty script hasn't happened and we consider an empty script a sign of attempting to vertically align sub/superscripts | |
1347 | /// | |
1348 | /// Note: 'mathml' is not necessarily canonicalized | |
1349 | 2.85k | pub fn likely_adorned_chem_formula(mathml: Element) -> i32 { |
1350 | 2.85k | if ! |
1351 | 1 | return NOT_CHEMISTRY; |
1352 | 2.85k | } |
1353 | // some simple sanity checks on the scripts... | |
1354 | 2.85k | let tag_name = name(mathml); |
1355 | 2.85k | let children = mathml.children(); |
1356 | 2.85k | let mut likelihood = 0; |
1357 | 2.85k | let mut is_empty_subscript = false; |
1358 | // debug!("likely_adorned_chem_formula:\n{}", mml_to_string(mathml)); | |
1359 | 2.85k | if tag_name == "msub" || |
1360 | // subscripts should be just a number, although they could be 'n' or '2n' or other exprs. | |
1361 | 1.10k | let subscript = as_element(children[1]); |
1362 | 1.10k | is_empty_subscript = name(subscript) == "mtext" && |
1363 | 1.10k | if !is_empty_subscript { |
1364 | 1.10k | likelihood += likely_chem_subscript(subscript); |
1365 | 1.10k |
|
1366 | 1.74k | } |
1367 | ||
1368 | 2.85k | let mut empty_superscript = false; |
1369 | 2.85k | if tag_name == "msup" || |
1370 | // debug!("likely_adorned_chem_formula: mathml\n{}", mml_to_string(mathml)); | |
1371 | 1.59k | let superscript = as_element(children[if tag_name == "msup" { |
1372 | 1.59k | empty_superscript = name(superscript) == "mtext" && |
1373 | 1.59k | if !empty_superscript { |
1374 | 1.58k | likelihood += likely_chem_superscript(superscript); |
1375 | 1.58k |
|
1376 | 1.26k | } |
1377 | 2.85k | if tag_name == "msubsup" && ( |
1378 | 9 | likelihood += 1; // might be trying to vertically align scripts as in done in chemistry |
1379 | 2.84k | } |
1380 | ||
1381 | 2.85k | if tag_name == "mmultiscripts" { |
1382 | // prescripts are normally positive integers, chem 2.5.1 allows for a superscript for a Lewis dot | |
1383 | // postscript should be a charge | |
1384 | ||
1385 | let prescripts; | |
1386 | let postscripts; | |
1387 | 351 | if children.len() == 4 && |
1388 | 138 | prescripts = &children[2..4]; |
1389 | 138 | postscripts = &children[0..0]; // empty |
1390 | 213 | } else if children.len() == 6 && |
1391 | 55 | prescripts = &children[4..6]; |
1392 | 55 | postscripts = &children[1..3]; // empty |
1393 | 158 | } else if children.len() == 3 || |
1394 | 118 | prescripts = &children[0..0]; // empty |
1395 | 118 | postscripts = &children[1..]; |
1396 | 118 | } else { |
1397 | 40 | return NOT_CHEMISTRY; |
1398 | }; | |
1399 | ||
1400 | 311 | if !prescripts.is_empty() { |
1401 | 193 | let pre_subscript = as_element(prescripts[0]); |
1402 | 193 | let pre_subscript_name = name(pre_subscript); |
1403 | ||
1404 | 193 | let pre_superscript = as_element(prescripts[1]); |
1405 | 193 | let pre_superscript_name = name(pre_superscript); |
1406 | ||
1407 | // deal with special case of 'e' with prescripts of -1 and 0 | |
1408 | 193 | if is_adorned_electron(children[0], prescripts) { |
1409 | 31 | return 100; // very likely chemistry |
1410 | 162 | } |
1411 | 162 | let base = as_element(children[0]); |
1412 | 162 | let base_name = name(base); |
1413 | 162 | let |
1414 | 154 | let Some( |
1415 | 127 | *atomic_number |
1416 | } else { | |
1417 | 35 | return NOT_CHEMISTRY; |
1418 | }; | |
1419 | 127 | if pre_superscript_name == "mo" { |
1420 | // Lewis dot prescript case | |
1421 | 3 | if pre_subscript_name != "none" { |
1422 | 0 | return NOT_CHEMISTRY; |
1423 | 3 | } |
1424 | 3 | likelihood += likely_chem_superscript(pre_superscript); |
1425 | 124 | } else if pre_superscript_name == "mn" { // must have a pre-superscript (neutrons + protons) |
1426 | 75 | if let Ok(mass) = as_text(pre_superscript).parse::<u32>() { |
1427 | // "drip line" is 1.5 * mass < 3.5 * mass -- it is possible to outside of this range, but VERY unlikely | |
1428 | // to avoid floating point, we multiply by 2 and compare to 3 and 7 | |
1429 | 75 | if 3*atomic_number < 2*mass && |
1430 | 74 | likelihood += 3; |
1431 | 74 |
|
1432 | 0 | } |
1433 | 75 | if pre_subscript_name == "mn" && |
1434 | 69 | likelihood = CHEMISTRY_THRESHOLD; |
1435 | 69 |
|
1436 | } else { | |
1437 | 49 | return NOT_CHEMISTRY; |
1438 | } | |
1439 | 118 | } |
1440 | ||
1441 | 196 | if !postscripts.is_empty() { |
1442 | 119 | let mut i = 0; |
1443 | 266 | while i < postscripts.len() { |
1444 | 147 | let sub = as_element(postscripts[i]); |
1445 | // debug!("sub: {}", mml_to_string(sub)); | |
1446 | 147 | if name(sub) != "none" { |
1447 | 91 | likelihood += likely_chem_subscript(sub); |
1448 | 91 |
|
1449 | 147 | let sup = as_element(postscripts[i+1]); |
1450 | 147 | if name(sup) != "none" { |
1451 | 65 | // debug!("sup: {}", mml_to_string(sub)); |
1452 | 65 | likelihood += likely_chem_superscript(sup); |
1453 | 82 | } |
1454 | 147 | i += 2; |
1455 | } | |
1456 | 77 | } |
1457 | 2.50k | } |
1458 | ||
1459 | 2.69k | let base = as_element(children[0]); |
1460 | 2.69k | let base_name = name(base); |
1461 | 2.69k | if base_name == "mi" || |
1462 | 2.05k | likelihood += likely_chem_element(base); |
1463 | 2.05k | } else if |
1464 | // debug!("mrow addition:\n{}", mml_to_string(base)); | |
1465 | // a safe minor canonicalization that allows "short_form" calculations if appropriate | |
1466 | 187 | if (IsBracketed::is_bracketed(base, "(", ")", false, false) || |
1467 | 89 | IsBracketed::is_bracketed(base, "[", "]", false, false)) && |
1468 | 148 | base.children().len() > 3 { |
1469 | 77 | let inner_mrow = create_mathml_element(&base.document(), "mrow"); |
1470 | 77 | inner_mrow.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); |
1471 | 77 | let mut children = base.children(); |
1472 | 77 | let inside_of_parens = children.drain(1..children.len()-1); |
1473 | 77 | inner_mrow.append_children(inside_of_parens); |
1474 | 77 | base.replace_children(vec![children[0], ChildOfElement::Element(inner_mrow), children[children.len()-1]]); |
1475 | 110 | } |
1476 | 187 | likelihood += likely_chem_formula(base); |
1477 | 454 | } else { |
1478 | 454 | likelihood += likely_chem_formula(base); |
1479 | 454 | } |
1480 | ||
1481 | // debug!("returning from likely_adorned_chem_formula: likelihood={}, mathml\n{}", likelihood, mml_to_string(mathml)); | |
1482 | 2.69k | return likelihood; |
1483 | ||
1484 | ||
1485 | 193 | fn is_adorned_electron(base: ChildOfElement, prescripts: &[ChildOfElement]) -> bool { |
1486 | // looking for 'e' with prescripts of -1 and 0 | |
1487 | 193 | let base = as_element(base); |
1488 | 193 | let pre_lower = as_element(prescripts[0]); |
1489 | 193 | let pre_upper = as_element(prescripts[1]); |
1490 | 193 | if (name(base) == "mi" || |
1491 | 31 | name(pre_upper) == "mn" && as_text(pre_upper) == "0" && |
1492 | 31 | name(pre_lower) == "mrow" && pre_lower.children().len() == 2 { |
1493 | // looking '-' and '1' | |
1494 | 31 | let lower_children = pre_lower.children(); |
1495 | 31 | let minus = as_element(lower_children[0]); |
1496 | 31 | let one = as_element(lower_children[1]); |
1497 | // not yet normalized, so we need to compare against ASCII minus and u+2212 | |
1498 | 31 | return name(minus) == "mo" && (as_text(minus) == "-" || as_text(minus) == "−") && |
1499 | 31 | name(one) == "mn" && as_text(one) == "1"; |
1500 | } else { | |
1501 | 162 | return false; |
1502 | } | |
1503 | 193 | } |
1504 | 2.85k | } |
1505 | ||
1506 | /// useful function to see if the str is a single char matching the predicate | |
1507 | 29.6k | fn is_single_char_matching(leaf_text: &str, pred: impl Fn(char) -> bool) -> bool { |
1508 | 29.6k | let mut chars = leaf_text.chars(); |
1509 | 29.6k | if let Some(ch) = chars.next() && chars.next().is_none() { |
1510 | 29.5k | return pred(ch); |
1511 | 87 | } |
1512 | 87 | return false; |
1513 | 29.6k | } |
1514 | ||
1515 | 17.2k | fn likely_chem_formula_operator(mathml: Element) -> i32 { |
1516 | // mostly from chenzhijin.com/en/article/Useful%20Unicode%20for%20Chemists (Arrows and Other) | |
1517 | // also en.wikipedia.org/wiki/Chemical_formula#Condensed_formula | |
1518 | #[derive(PartialEq, Eq)] | |
1519 | enum BondType {DoubleBond, TripleBond} // options for is_legal_bond() | |
1520 | // "⋅" is used in GTM 16.2 and en.wikipedia.org/wiki/Cement_chemist_notation -- may want to add some similar chars | |
1521 | static CHEM_FORMULA_OPERATORS: phf::Set<&str> = phf_set! { | |
1522 | "-", "\u{2212}", "⋅", ":", "=", "∷", "≡", ":::", "≣", "::::", // bond symbols (need both 2212 and minus because maybe not canonicalized) | |
1523 | "⋮", // lewis dots, part of "⋮⋮" - triple bond (see Nemeth chem guide 2.5.4) | |
1524 | }; | |
1525 | 16.1k | fn is_chem_formula_ok(ch: char) -> bool { |
1526 | 16.1k |
|
1527 | 16.1k | } |
1528 | ||
1529 | 17.2k | assert_eq!(name(mathml), "mo"); |
1530 | 17.2k | let leaf_text = as_text(mathml); |
1531 | 17.2k | if CHEM_FORMULA_OPERATORS.contains(leaf_text) && |
1532 | 1.85k | (has_inherited_property(mathml, "chemical-formula") || |
1533 | 1.85k | ( !(leaf_text == "=" || |
1534 | 1.05k | ( !(leaf_text == "≡" || |
1535 | ) { | |
1536 | 1.04k | mathml.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1537 | 1.04k | mathml.set_attribute_value(CHEM_FORMULA_OPERATOR, "1"); |
1538 | 1.04k | return 1; |
1539 | 16.1k | } else if is_single_char_matching(leaf_text, is_chem_formula_ok) { |
1540 | 6.49k | return 0; // not much info |
1541 | } else { | |
1542 | 9.67k | return -3; // still a small chance; |
1543 | } | |
1544 | ||
1545 | 874 | fn is_legal_bond(mathml: Element, bond_type: BondType) -> bool { |
1546 | 874 | let preceding = mathml.preceding_siblings(); |
1547 | 874 | let following = mathml.following_siblings(); |
1548 | 874 | if preceding.is_empty() || |
1549 | 115 | return false; |
1550 | 759 | } |
1551 | ||
1552 | 759 | let mut preceding_element = as_element(preceding[preceding.len()-1]); |
1553 | // special check for CH_2 -- double bond is really with C | |
1554 | 759 | if bond_type == BondType::DoubleBond && |
1555 | 31 | preceding.len() > 1 && |
1556 | 2 | preceding_element = as_element(preceding[preceding.len()-2]); |
1557 | 2 | if !is_leaf(preceding_element) || as_text(preceding_element) != "C" { |
1558 | 0 | return false; |
1559 | 2 | } |
1560 | 757 | } else if name(preceding_element) != "mi" && |
1561 | 320 | return false; |
1562 | 437 | } |
1563 | 439 | let following_element = get_possible_embellished_node(as_element(following[0])); |
1564 | 439 | if name(following_element) != "mi" && |
1565 | 313 | return false; |
1566 | 126 | } |
1567 | 126 | let preceding_text = as_text(preceding_element); |
1568 | 126 | let following_text = as_text(following_element); |
1569 | 126 | return match bond_type { |
1570 | 105 | BondType::DoubleBond => is_legal_double_bond(preceding_text, following_text), |
1571 | 21 | BondType::TripleBond => is_legal_triple_bond(preceding_text, following_text), |
1572 | }; | |
1573 | ||
1574 | 105 | fn is_legal_double_bond(left: &str, right: &str) -> bool { |
1575 | // this is based on table in en.wikipedia.org/wiki/Double_bond#Types_of_double_bonds_between_atoms | |
1576 | static DOUBLE_BOND_TO_SELF: phf::Set<&str> = phf_set! { | |
1577 | "C", "O", "N", "S", "Si", "Ge", "Sn", "Pb" | |
1578 | }; | |
1579 | // "C" => &["O", "N", "S"], | |
1580 | // "O" => &["N", "S"], | |
1581 | 105 | if left == right && |
1582 | 44 | return true; |
1583 | 61 | } |
1584 | 61 | return match left { |
1585 | 61 | "C" => |
1586 | 58 | "O" => |
1587 | 57 | "Si" => |
1588 | 57 | _ => false, |
1589 | } | |
1590 | 105 | } |
1591 | ||
1592 | 21 | fn is_legal_triple_bond(left: &str, right: &str) -> bool { |
1593 | // According to https://tinyurl.com/rkynhwj3 (from physics.org) | |
1594 | // triple bonds can be formed between any of B, C, N, and O | |
1595 | // Apparently they can also be forced in other cases, but they are rare. | |
1596 | // 'B' is from studiousguy.com/triple-bond-examples/ | |
1597 | 21 | return (left == "B" || left == "C" || |
1598 | 18 | (right == "B" || right == "C" || |
1599 | 21 | } |
1600 | 874 | } |
1601 | 17.2k | } |
1602 | ||
1603 | /// This assumes canonicalization of characters has happened | |
1604 | 6.85k | fn likely_chem_equation_operator(mathml: Element) -> i32 { |
1605 | ||
1606 | 6.73k | fn is_chem_equation_operator(ch: char) -> bool { |
1607 | 6.73k |
|
1608 | 6.73k | } |
1609 | ||
1610 | 6.85k | let elem_name = name(mathml); |
1611 | 6.85k | if elem_name == "munder" || |
1612 | 86 | let base = as_element(mathml.children()[0]); |
1613 | 86 | if name(base) == "mo" && |
1614 | 1 | base.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1615 | 1 | base.set_attribute_value(CHEM_EQUATION_OPERATOR, "1"); |
1616 | 1 | return 1; |
1617 | 85 | } else if elem_name == "mover" && |
1618 | 9 | return 2; |
1619 | } else { | |
1620 | 76 | return NOT_CHEMISTRY; |
1621 | } | |
1622 | 6.76k | } |
1623 | ||
1624 | 6.76k | if name(mathml) == "mo" { |
1625 | 6.76k | let text = as_text(mathml); |
1626 | 6.76k | if is_single_char_matching(text, is_chem_equation_operator) || |
1627 | 1.96k | mathml.set_attribute_value(MAYBE_CHEMISTRY, "1"); |
1628 | 1.96k | mathml.set_attribute_value(CHEM_EQUATION_OPERATOR, "1"); |
1629 | 1.96k | return 1; |
1630 | 4.79k | } else if text == "\u{2062}" || text == "\u{2063}" { |
1631 | // FIX: the invisible operator between elements should be well-defined, but this likely needs work, so both accepted for now | |
1632 | 0 | return 0; |
1633 | 4.79k | } |
1634 | 0 | } |
1635 | 4.79k | return -3; // there is still a chance |
1636 | ||
1637 | /// Detects output of mhchem for some equilibrium arrows that currently (11/22) don't have Unicode points | |
1638 | /// See github.com/NSoiffer/MathCAT/issues/60 for the patterns being matched | |
1639 | 20 | fn is_hack_for_missing_arrows(mover: Element) -> bool { |
1640 | 20 | assert_eq!(name(mover), "mover"); |
1641 | 20 | let children = mover.children(); |
1642 | 20 | let base = as_element(children[0]); |
1643 | 20 | let mo_base = if name(base) == "mrow" && |
1644 | 9 | as_element(base.children()[0]) |
1645 | } else { | |
1646 | 11 | base |
1647 | }; | |
1648 | 20 | let upper = as_element(children[1]); |
1649 | 20 | let mo_upper = if name(upper) == "mrow" && |
1650 | 9 | as_element(upper.children()[1]) |
1651 | } else { | |
1652 | 11 | upper |
1653 | }; | |
1654 | // slightly sloppy match, but almost certainly good enough | |
1655 | 20 | return name(mo_base) == "mo" && |
1656 | 9 | as_text(mo_base) == "↽" && as_text(mo_upper) == "⇀"; |
1657 | 20 | } |
1658 | 6.85k | } |
1659 | ||
1660 | 38 | fn is_equilibrium_constant(mut mathml: Element) -> bool { |
1661 | 38 | if name(mathml) == "msub" { |
1662 | 27 | mathml = as_element(mathml.children()[0]); |
1663 | 27 |
|
1664 | ||
1665 | 38 | return name(mathml) == "mi" && |
1666 | 38 | } |
1667 | ||
1668 | // Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. | |
1669 | // All instances seem to be upper case that I've seen. | |
1670 | 3 | static SMALL_UPPER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap()); |
1671 | ||
1672 | /// look for "(s), "(l)", "(g)", "(aq)" (could also use [...]) | |
1673 | /// this might be called before canonicalization, but in clean_chemistry_mrow, we made sure "( xxx )" is grouped properly | |
1674 | 3.68k | pub fn likely_chem_state(mathml: Element) -> i32 { |
1675 | ||
1676 | 3.68k | if IsBracketed::is_bracketed(mathml, "(", ")", false, false) || |
1677 | 3.30k | IsBracketed::is_bracketed(mathml, "[", "]", false, false) { |
1678 | 438 | let contents = as_element(mathml.children()[1]); |
1679 | 438 | let contents_name = name(contents); |
1680 | 438 | if contents_name == "mi" || |
1681 | 109 | let text = as_text(contents); |
1682 | 109 | if text == "s" || |
1683 | 67 | return text.len() as i32 + 1; // hack to count chars -- works because all are ASCII |
1684 | 42 | }; |
1685 | 329 | } |
1686 | 3.24k | } |
1687 | 3.61k | return NOT_CHEMISTRY; |
1688 | 3.68k | } |
1689 | ||
1690 | /// Returns the likelihood that the arg is an element | |
1691 | 16.4k | pub fn likely_chem_element(mathml: Element) -> i32 { |
1692 | static NUCLEAR_SYMBOLS: [&str; 6] = ["e", "p", "n", "α", "β","γ"]; | |
1693 | ||
1694 | 16.4k | assert!(name(mathml) == "mi" || |
1695 | 16.4k | let text = as_text(mathml); |
1696 | 16.4k | if as_text(mathml).trim().is_empty() { |
1697 | 782 | return 0; // whitespace |
1698 | 15.6k | } else if is_chemical_element(mathml) { |
1699 | // single letter = 1; single letter with mathvariant="normal" = 2; double = 3 -- all elements are ASCII | |
1700 | 2.21k | return if text.len() == 1 { |
1701 | 1.90k | if mathml.attribute_value("mathvariant").unwrap_or_default() == "normal" { |
1702 | } else { | |
1703 | 311 | 3 |
1704 | }; | |
1705 | 13.4k | } else if NUCLEAR_SYMBOLS.contains(&text) { |
1706 | 659 | return 0; |
1707 | // not much special about them; | |
1708 | } else { | |
1709 | 12.7k | return NOT_CHEMISTRY; |
1710 | } | |
1711 | 16.4k | } |
1712 | ||
1713 | static SHORT_SINGLE_LETTER_ELEMENT_FORMULAE: phf::Set<&str> = phf_set! { | |
1714 | // from en.wikipedia.org/wiki/Glossary_of_chemical_formulae (via chem_formula_from_wikipedia.py) | |
1715 | "BF_3", "BI_3", "BN", "BP", "B_2F_4", "B_2H_6", "B_2O_3", "B_2S_3", "B_4C", | |
1716 | "CB_4", "CF_4", "CH_2", "CH_4", "CO", "CO_2", "CO_3", "CS_2", "CW", "C_2F_4", | |
1717 | "C_2H_4", "C_2H_6", "C_2U", "C_2Y", "C_3H_4", "C_3H_6", "C_3H_8", "C_4H_2", | |
1718 | "C_4H_8", "C_4I_2", "C_6H_6", "C_6N_4", "C_7H_8", "C_8H_8", "DI", "D_2O", | |
1719 | "FI", "FI_2", "FK", "FN", "FO", "FO_2", "FP", "FS", "FW", "FY", "F_2", | |
1720 | "F_2N", "F_2O", "F_2O_2", "F_2P", "F_2S", "F_2S_2", "F_2W", "F_2Y", "F_3B", | |
1721 | "F_3P", "F_3S", "F_3W", "F_3Y", "F_4B_2", "F_4C", "F_4C_2", "F_4N_2", | |
1722 | "F_4S", "F_4U", "F_4W", "F_5I", "F_5P", "F_5S", "F_5U", "F_5W", "F_6S", | |
1723 | "F_6W", "F_7I", "HF", "HI", "HK", "HN_3", "H_2", "H_2C", "H_2C_2", "H_2C_4", | |
1724 | "H_2O", "H_2O_2", "H_2S", "H_3N", "H_3P", "H_4C", "H_4C_2", "H_4C_3", | |
1725 | "H_4N_2", "H_4N_4", "H_6B_2", "H_6C_2", "H_6C_3", "H_6C_6", "H_8C_3", | |
1726 | "H_8C_7", "H_8C_8", "ID", "IF", "IF_5", "IF_7", "IH", "IK", "IO_3", "I_2", | |
1727 | "I_2F", "I_2O_5", "I_2W", "I_3B", "I_3N", "I_3U", "I_3V", "I_4P_2", "I_4W", | |
1728 | "KH", "KI", "K_2F_2", "K_2O", "K_2O_2", "K_2S", "NB", "NF", "NF_2", "NF_3", | |
1729 | "NI_3", "NO", "NO_2", "NU", "NV", "N_2", "N_2F_4", "N_2H_2", "N_2H_4", | |
1730 | "N_2O_3", "N_2O_4", "N_2O_5", "N_3H", "N_4C_6", "N_4H_4", "N_5P_3", "O", | |
1731 | "OD_2", "OF", "OF_2", "OH_2", "OK_2", "ON", "ON_2", "OT_2", "O_2", "O_2C", | |
1732 | "O_2F_2", "O_2H_2", "O_2K_2", "O_2N", "O_2S", "O_2U", "O_2W", "O_3", | |
1733 | "O_3C", "O_3I", "O_3N_2", "O_3S", "O_3U", "O_3V_2", "O_3W", "O_3Y_2", | |
1734 | "O_5I_2", "O_5N_2", "O_5P_2", "O_5V_2", "O_8U_3", "PB", "PF", "PF_2", "PF_3", | |
1735 | "PH_3", "PY", "P_2F_4", "P_2I_4", "P_2O_5", "P_2S_3", "P_3N_5", "SF", "SF_2", | |
1736 | "SF_4", "SF_5", "SF_6", "SH_2", "SK_2", "SO_2", "SO_3", "S_2C", "S_2F_2", | |
1737 | "S_2W", "S_3B_2", "S_3P_2", "S_3W", "S_3Y_2", "T_2O", "UC_2", "UF_4", "UF_5", | |
1738 | "UI_3", "UN", "UO_2", "UO_3", "US_2", "U_3O_8", "VI_3", "VN", "V_2O_3", | |
1739 | "WC", "WF", "WF_2", "WF_3", "WF_4", "WF_5", "WF_6", "WI_2", "WI_4", "WO_2", | |
1740 | "WS_2", "WS_3", "YB_6", "YC_2", "YF", "YF_2", "YF_3", "YP", "Y_2O_3", | |
1741 | ||
1742 | // from en.wikipedia.org/wiki/Ion#Common_ions (via chem_formula_from_wikipedia.py) | |
1743 | "CH_3COO^−", "CN^−", "CO_3^2−", "C^−", "C_2O_4^2−", "F^−", "HCOO^−", | |
1744 | "HPO_4^2−", "HSO_3^−", "HSO_4^−", "H^+", "H^−", "H_2PO_4^−", "H_3O^+", "I^−", | |
1745 | "NH_4^+", "NO_2^−", "NO_3^−", "N^3−", "N_3^−", "OH^−", "O^2−", "O_2^2−", | |
1746 | "PO_4^3−", "P^3−", "SO_3^2−", "SO_4^2−", "S^2−", "S_2O_3^2−", | |
1747 | ||
1748 | // from gchem.cm.utexas.edu/canvas.php?target=bonding/ionic/polyatomic-ions.html | |
1749 | "PO_3^3−", "IO_3^−", | |
1750 | ||
1751 | // others | |
1752 | "CH_3", /* methyl */ | |
1753 | "NH_3", // ammonium | |
1754 | }; | |
1755 | ||
1756 | /// Returns true if the formula is composed of 1 or 2 single letter elements and it matches a known compound/ion | |
1757 | /// This might be called (via likely_adorned_chem_formula) unparsed | |
1758 | 387 | fn is_short_formula(mrow: Element) -> bool { |
1759 | 387 | assert_eq!(name(mrow), "mrow"); |
1760 | 387 | let children = mrow.children(); |
1761 | 387 | let n_children = children.len(); |
1762 | 387 | if n_children == 0 || n_children > 3 || ( |
1763 | 12 | return false; |
1764 | 375 | } |
1765 | ||
1766 | 375 | let first_element = convert_to_short_form( as_element(children[0]) ); |
1767 | 375 | if n_children == 1 { |
1768 | 2 | return first_element.is_ok(); |
1769 | 373 | } |
1770 | 373 | let second_element = convert_to_short_form( as_element(children[if n_children == 2 { |
1771 | 373 | return match (first_element, second_element) { |
1772 | 365 | (Ok(first), Ok(second)) => { |
1773 | 365 | let short_form = first + second.as_str(); |
1774 | // debug!("short_form: {}", short_form); | |
1775 | 365 | return SHORT_SINGLE_LETTER_ELEMENT_FORMULAE.contains(&short_form); |
1776 | }, | |
1777 | 8 | _ => false, |
1778 | } | |
1779 | 387 | } |
1780 | ||
1781 | 931 | fn convert_to_short_form(mathml: Element) -> Result<String> { |
1782 | 931 | let mathml_name = name(mathml); |
1783 | 931 | return match mathml_name { |
1784 | 931 | "mi" | |
1785 | 95 | "none" => |
1786 | 95 | "msub" | |
1787 | 86 | let is_mmultiscripts = mathml_name == "mmultiscripts"; |
1788 | 86 | let children = mathml.children(); |
1789 | 86 | let mut result = convert_to_short_form(as_element(children[0])) |
1790 | 86 | if is_mmultiscripts && |
1791 | 0 | bail!("mmultiscripts found with {} children -- not part of chemical formula", children.len()); |
1792 | 86 | } |
1793 | 86 | if mathml_name == "msub" || |
1794 | 83 | result += "_"; |
1795 | 83 | result += &convert_to_short_form(as_element(children[1])) |
1796 | 3 | } |
1797 | 85 | if mathml_name == "msup" || |
1798 | 3 | result += "^"; |
1799 | 3 | result += &convert_to_short_form(as_element(children[if mathml_name=="msup" {1} else { |
1800 | 82 | } |
1801 | 85 | Ok( result ) |
1802 | }, | |
1803 | 9 | "mrow" => { |
1804 | // the only time this is valid is if the superscript is something like "+" or "2+", so we do a few checks and short circuit false now | |
1805 | 9 | let mrow_children = mathml.children(); |
1806 | 9 | if mrow_children.len() == 1 || mrow_children.len() == 2 { |
1807 | 0 | let mut result = convert_to_short_form(as_element(mrow_children[0]))?; |
1808 | 0 | if mrow_children.len() == 2 { |
1809 | 0 | result += &convert_to_short_form(as_element(mrow_children[1]))?; |
1810 | 0 | } |
1811 | 0 | return Ok(result) |
1812 | } else { | |
1813 | 9 | bail!("mrow found with {} children -- not part of chemical formula", mrow_children.len()); |
1814 | } | |
1815 | } | |
1816 | 0 | _ => bail!("{} found -- not part of chemical formula", mathml_name), |
1817 | } | |
1818 | 931 | } |
1819 | ||
1820 | /// A map of chemical elements and their relative IUPAC electronegativity (https://i.stack.imgur.com/VCSzW.png) | |
1821 | /// That list uses a horizontal line for the Lanthanide and Actinide Series. | |
1822 | /// Because I had already ordered the elements before realizing that, I opened a gap and started the higher ones again with a '1' in front. | |
1823 | /// The list is missing recent (unstable) elements -- I added them with the same value as the element above them in the periodic table. | |
1824 | static CHEMICAL_ELEMENT_ELECTRONEGATIVITY: phf::Map<&str, u32> = phf_map! { | |
1825 | "Ac" => 40, "Ag" => 155, "Al" => 163, "Am" => 29, "Ar" => 4, "As" => 172, "At" => 181, "Au" => 154, | |
1826 | "B" => 164, "Ba" => 14, "Be" => 18, "Bh" => 137, "Bi" => 170, "Bk" => 27, "Br" => 183, | |
1827 | "C" => 169, "Ca" => 16, "Cd" => 158, "Ce" => 56, "Cf" => 26, "Cl" => 184, "Cm" => 28, "Cn" => 157, "Co" => 148, "Cr" => 136, "Cs" => 8, "Cu" => 156, | |
1828 | "Db" => 129, "Ds" => 149, "Dy" => 48, | |
1829 | "Er" => 46, "Es" => 25, "Eu" => 51, "F" => 185, "Fe" => 144, "Fl" => 165, "Fm" => 24, "Fr" => 7, "Ga" => 162, "Gd" => 50, "Ge" => 167, | |
1830 | "H" => 175, "He" => 6, "Hf" => 126, "Hg" => 157, "Ho" => 47, "Hs" => 141, "I" => 182, "In" => 161, "Ir" => 146, "K" => 10, "Kr" => 3, | |
1831 | "La" => 62, "Li" => 12, "Lr" => 19, "Lu" => 41, "Lv" => 176, "Mc" => 170, "Md" => 23, "Mg" => 17, "Mn" => 140, "Mo" => 135, "Mt" => 145, | |
1832 | "N" => 174, "Na" => 11, "Nb" => 131, "Nd" => 54, "Ne" => 5, "Nh" => 160, "Ni" => 152, "No" => 22, "Np" => 31, "O" => 180, "Og" => 1, "Os" => 142, | |
1833 | "P" => 173, "Pa" => 33, "Pb" => 165, "Pd" => 151, "Pm" => 53, "Po" => 176, "Pr" => 55, "Pt" => 150, "Pu" => 30, | |
1834 | "Ra" => 13, "Rb" => 9, "Re" => 138, "Rf" => 125, "Rg" => 153, "Rh" => 147, "Rn" => 1, "Ru" => 143, | |
1835 | "S" => 179, "Sb" => 171, "Sc" => 124, "Se" => 178, "Sg" => 133, "Si" => 168, "Sm" => 52, "Sn" => 166, "Sr" => 15, | |
1836 | "Ta" => 130, "Tb" => 49, "Tc" => 139, "Te" => 177, "Th" => 34, "Ti" => 128, "Tl" => 160, "Tm" => 45, "Ts" => 181, | |
1837 | "U" => 32, "V" => 132, "W" => 134, "Xe" => 2, "Y" => 123, "Yb" => 44, "Zn" => 159, "Zr" => 127, | |
1838 | // The following come from E.A. Moore who said to treat them like chemicals | |
1839 | // These stand for methyl, ethyl, alkyl, acetyl and phenyl and apparently are quite commonly used ("Ac" is already a chemical) | |
1840 | // A full(er?) list is at en.wikipedia.org/wiki/Skeletal_formula#Alkyl_groups and in following sections | |
1841 | "Me" => 0, "Et" => 0, "R" => 0, /* "Ac" => 0, */ "Ph" => 0, | |
1842 | "X" => 0, /* treated as an unknown */ | |
1843 | }; | |
1844 | ||
1845 | // A map of the chemical elements and their atomic numbers | |
1846 | static CHEMICAL_ELEMENT_ATOMIC_NUMBER: phf::Map<&str, u32> = phf_map! { | |
1847 | "H" => 1, "He" => 2, "Li" => 3, "Be" => 4, "B" => 5, "C" => 6, "N" => 7, "O" => 8, "F" => 9, "Ne" => 10, | |
1848 | "Na" => 11, "Mg" => 12, "Al" => 13, "Si" => 14, "P" => 15, "S" => 16, "Cl" => 17, "Ar" => 18, "K" => 19, "Ca" => 20, | |
1849 | "Sc" => 21, "Ti" => 22, "V" => 23, "Cr" => 24, "Mn" => 25, "Fe" => 26, "Co" => 27, "Ni" => 28, "Cu" => 29, "Zn" => 30, | |
1850 | "Ga" => 31, "Ge" => 32, "As" => 33, "Se" => 34, "Br" => 35, "Kr" => 36, "Rb" => 37, "Sr" => 38, "Y" => 39, "Zr" => 40, | |
1851 | "Nb" => 41, "Mo" => 42, "Tc" => 43, "Ru" => 44, "Rh" => 45, "Pd" => 46, "Ag" => 47, "Cd" => 48, "In" => 49, "Sn" => 50, | |
1852 | "Sb" => 51, "Te" => 52, "I" => 53, "Xe" => 54, "Cs" => 55, "Ba" => 56, "La" => 57, "Ce" => 58, "Pr" => 59, "Nd" => 60, | |
1853 | "Pm" => 61, "Sm" => 62, "Eu" => 63, "Gd" => 64, "Tb" => 65, "Dy" => 66, "Ho" => 67, "Er" => 68, "Tm" => 69, "Yb" => 70, | |
1854 | "Lu" => 71, "Hf" => 72, "Ta" => 73, "W" => 74, "Re" => 75, "Os" => 76, "Ir" => 77, "Pt" => 78, "Au" => 79, "Hg" => 80, | |
1855 | "Tl" => 81, "Pb" => 82, "Bi" => 83, "Po" => 84, "At" => 85, "Rn" => 86, "Fr" => 87, "Ra" => 88, "Ac" => 89, "Th" => 90, | |
1856 | "Pa" => 91, "U" => 92, "Np" => 93, "Pu" => 94, "Am" => 95, "Cm" => 96, "Bk" => 97, "Cf" => 98, "Es" => 99, "Fm" => 100, | |
1857 | "Md" => 101, "No" => 102, "Lr" => 103, "Rf" => 104, "Db" => 105, "Sg" => 106, "Bh" => 107, "Hs" => 108, "Mt" => 109, "Ds" => 110, | |
1858 | "Rg" => 111, "Cn" => 112, "Nh" => 113, "Fl" => 114, "Mc" => 115, "Lv" => 116, "Ts" => 117, "Og" => 118, | |
1859 | }; | |
1860 | ||
1861 | 26.9k | pub fn is_chemical_element(node: Element) -> bool { |
1862 | // FIX: allow name to be in an mrow (e.g., <mi>N</mi><mi>a</mi> | |
1863 | 26.9k | let name = name(node); |
1864 | 26.9k | if name != "mi" && |
1865 | 71 | return false; |
1866 | 26.9k | } |
1867 | ||
1868 | 26.9k | let text = as_text(node); |
1869 | 26.9k | return CHEMICAL_ELEMENT_ELECTRONEGATIVITY.contains_key(text) || |
1870 | 23.0k | has_chem_intent(node, "chemical-element") || |
1871 | 23.0k | has_inherited_property(node, "chemical-formula"); |
1872 | 26.9k | } |
1873 | ||
1874 | ||
1875 | #[cfg(test)] | |
1876 | mod chem_tests { | |
1877 | ||
1878 | ||
1879 | #[allow(unused_imports)] | |
1880 | use super::super::init_logger; | |
1881 | use super::super::are_strs_canonically_equal; | |
1882 | use super::*; | |
1883 | ||
1884 | 40 | fn parse_mathml_string<F>(test: &str, test_mathml: F) -> bool |
1885 | 40 | where F: Fn(Element) -> bool { |
1886 | use sxd_document::parser; | |
1887 | use crate::interface::{get_element, trim_element}; | |
1888 | ||
1889 | ||
1890 | 40 | let test = if test.starts_with("<math") { |
1891 | 40 | let new_package = parser::parse(test); |
1892 | 40 | if let Err( |
1893 | 0 | panic!("Invalid MathML input:\n{}\nError is: {}", &test, &e.to_string()); |
1894 | 40 | } |
1895 | ||
1896 | 40 | let new_package = new_package.unwrap(); |
1897 | 40 | let mut mathml = get_element(&new_package); |
1898 | 40 | trim_element(mathml, false); |
1899 | 40 | mathml = as_element(mathml.children()[0]); |
1900 | 40 | return test_mathml(mathml); |
1901 | 40 | } |
1902 | ||
1903 | #[test] | |
1904 | 1 | fn test_noble_element() { |
1905 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1906 | 1 | let test = "<mrow> <mi>Na</mi> <mo>⁣</mo> <mi>Cl</mi> </mrow>"; // |
1907 | 1 | assert!( !parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1908 | 1 | let test = "<mrow> <mi>Ar</mi> <mo>⁣</mo> <mi>Cl</mi> </mrow>"; // |
1909 | 1 | assert!( parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1910 | 1 | let test = "<mrow> <mi>Ne</mi> </mrow>"; // |
1911 | 1 | assert!( parse_mathml_string(test, |mathml| has_noble_element( &collect_elements(mathml).unwrap() )) ); |
1912 | 1 | } |
1913 | ||
1914 | #[test] | |
1915 | 1 | fn test_alphabetical_order() { |
1916 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1917 | 1 | let test = r#"<mrow> |
1918 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1919 | 1 | <msub><mi>H</mi><mn>14</mn></msub> |
1920 | 1 | </mrow>"#; |
1921 | 1 | assert!( parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1922 | 1 | let test = r#"<mrow> |
1923 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1924 | 1 | <msub><mi>H</mi><mn>12</mn></msub><mo>⁣</mo> |
1925 | 1 | <msub><mi>O</mi><mn>6</mn></msub> |
1926 | 1 | </mrow>"#; |
1927 | 1 | assert!( parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1928 | 1 | let test = "<mrow> <mi>B</mi> <mo>⁣</mo> <mi>C</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; // "C" should be first |
1929 | 1 | assert!( !parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1930 | 1 | let test = "<mrow> <mi>P</mi> <mo>⁣</mo> <mi>B</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; // not alphabetical |
1931 | 1 | assert!( !parse_mathml_string(test, |mathml| is_alphabetical( &collect_elements(mathml).unwrap() )) ); |
1932 | 1 | } |
1933 | ||
1934 | #[test] | |
1935 | 1 | fn test_is_structural() { |
1936 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1937 | 1 | let test = r#"<mrow> |
1938 | 1 | <msub><mi>C</mi><mn>6</mn></msub><mo>⁣</mo> |
1939 | 1 | <msub><mi>H</mi><mn>14</mn></msub> |
1940 | 1 | </mrow>"#; |
1941 | 1 | assert!( !parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1942 | 1 | let test = "<mrow> <mi>B</mi> <mo>⁣</mo> <mi>C</mi> <mo>⁣</mo> <mi>O</mi></mrow>"; |
1943 | 1 | assert!( !parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1944 | 1 | let test = "<mrow> <mi>H</mi> <mo>⁣</mo> <mi>O</mi> <mo>⁣</mo> <mi>H</mi></mrow>"; |
1945 | 1 | assert!( parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1946 | 1 | let test = "<mrow data-chem-formula='9'> |
1947 | 1 | <mmultiscripts data-chem-formula='1'> |
1948 | 1 | <mi mathvariant='normal' data-chem-element='1'>H</mi> |
1949 | 1 | <mn>2</mn> |
1950 | 1 | <none></none> |
1951 | 1 | </mmultiscripts> |
1952 | 1 | <mo data-changed='added'>⁣</mo> |
1953 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
1954 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>=</mo> |
1955 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
1956 | 1 | <mo data-changed='added'>⁣</mo> |
1957 | 1 | <mmultiscripts data-chem-formula='1'> |
1958 | 1 | <mi mathvariant='normal' data-chem-element='1'>H</mi> |
1959 | 1 | <mn>2</mn> |
1960 | 1 | <none></none> |
1961 | 1 | </mmultiscripts> |
1962 | 1 | </mrow>"; |
1963 | 1 | assert!( parse_mathml_string(test, |mathml| is_structural( &collect_elements(mathml).unwrap() )) ); |
1964 | 1 | } |
1965 | ||
1966 | ||
1967 | #[test] | |
1968 | 1 | fn test_electronegativity_order() { |
1969 | // mathml test strings need to be canonical MathML since we aren't testing canonicalize() | |
1970 | 1 | let test = r#"<mrow> |
1971 | 1 | <mi>N</mi><mo>⁣</mo> |
1972 | 1 | <msub><mi>H</mi><mn>3</mn></msub> |
1973 | 1 | </mrow>"#; |
1974 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1975 | 1 | let test = r#"<mrow> |
1976 | 1 | <mi>O</mi><mo>⁣</mo> |
1977 | 1 | <msub><mi>F</mi><mn>2</mn></msub> |
1978 | 1 | </mrow>"#; |
1979 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1980 | 1 | let test = r#"<mrow> |
1981 | 1 | <msub><mi>Rb</mi><mn>15</mn></msub><mo>⁣</mo> |
1982 | 1 | <msub><mi>Hg</mi><mn>16</mn></msub> |
1983 | 1 | </mrow>"#; |
1984 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
1985 | 1 | let test = r#" |
1986 | 1 | <mrow><msup> |
1987 | 1 | <mo>[</mo> |
1988 | 1 | <mi>Si</mi><mo>⁣</mo> |
1989 | 1 | <msub><mi>As</mi><mn>4</mn></msub> |
1990 | 1 | <mo>]</mo> |
1991 | 1 | <mrow><mn>8</mn><mo>-</mo></mrow> |
1992 | 1 | </msup></mrow>"#; |
1993 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(as_element(mathml.children()[0])).unwrap() )) ); |
1994 | 1 | let test = r#"<mrow> |
1995 | 1 | <mi>Si</mi><mo>⁣</mo> |
1996 | 1 | <msub><mi>H</mi><mn>2</mn></msub> |
1997 | 1 | <mi>Br</mi><mo>⁣</mo> |
1998 | 1 | <mi>Cl</mi> |
1999 | 1 | </mrow>"#; |
2000 | 1 | assert!( parse_mathml_string(test, |mathml| is_ordered_by_electronegativity( &collect_elements(mathml).unwrap() )) ); |
2001 | 1 | } |
2002 | ||
2003 | #[test] | |
2004 | 1 | fn test_order() { |
2005 | 1 | let test = r#"<mrow> |
2006 | 1 | <msub><mi>C</mi><mn>2</mn></msub><mo>⁣</mo> |
2007 | 1 | <msub><mi>H</mi><mn>4</mn></msub><mo>⁣</mo> |
2008 | 1 | <msub><mrow> <mo>(</mo><mi>N</mi> <mo>⁣</mo> <msub> <mi>H</mi> <mn>2</mn> </msub><mo>)</mo> </mrow><mn>2</mn></msub> |
2009 | 1 | </mrow>"#; |
2010 | 1 | assert!( parse_mathml_string(test, is_order_ok) ); |
2011 | 1 | let test = r#"<mrow> |
2012 | 1 | <mi>Fe</mi><mo>⁣</mo> |
2013 | 1 | <mi>O</mi><mo>⁣</mo> |
2014 | 1 | <mrow> <mo>(</mo><mrow><mi>O</mi> <mo>⁣</mo><mi>H</mi> </mrow><mo>)</mo> </mrow> |
2015 | 1 | </mrow>"#; |
2016 | 1 | assert!( parse_mathml_string(test, is_order_ok) ); |
2017 | 1 | let test = r#"<mrow> // R-4.4.3.3 -- Chain compound doesn't fit rules but should be accepted |
2018 | 1 | <mi>Br</mi><mo>⁣</mo> |
2019 | 1 | <mi>S</mi><mo>⁣</mo> |
2020 | 1 | <mi>C</mi><mo>⁣</mo> |
2021 | 1 | <mi>N</mi> |
2022 | 1 | </mrow>"#; |
2023 | 1 | assert!( parse_mathml_string(test, |mathml| likely_chem_formula(mathml)==5) ); |
2024 | 1 | } |
2025 | ||
2026 | #[test] | |
2027 | 1 | fn test_simple_double_bond() { |
2028 | 1 | let test1 = r#"<mrow><mi>C</mi><mo>=</mo><mi>C</mi></mrow>"#; |
2029 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2030 | 1 | let test2 = r#"<mrow><mi>C</mi><mo>∷</mo><mi>O</mi></mrow>"#; |
2031 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==CHEMISTRY_THRESHOLD) ); |
2032 | 1 | let test3 = r#"<mrow><mi>N</mi><mo>=</mo><mi>N</mi></mrow>"#; |
2033 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2034 | 1 | let test4 = r#"<mrow><mi>Sn</mi><mo>=</mo><mi>Sn</mi></mrow>"#; |
2035 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml) == 8) ); |
2036 | 1 | let test5 = r#"<mrow><mi>O</mi><mo>=</mo><mi>S</mi></mrow>"#; |
2037 | 1 | assert!( parse_mathml_string(test5, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); // just under threshold |
2038 | 1 | let test10 = r#"<mrow><mi>K</mi><mo>=</mo><mi>K</mi></mrow>"#; |
2039 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml) == NOT_CHEMISTRY) ); |
2040 | 1 | let test11 = r#"<mrow><mi>C</mi><mo>=</mo><mi>K</mi></mrow>"#; |
2041 | 1 | assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml) == NOT_CHEMISTRY) ); |
2042 | 1 | } |
2043 | ||
2044 | #[test] | |
2045 | 1 | fn test_double_bond() { |
2046 | 1 | let test1 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mo>=</mo><mi>C</mi></mrow>"#; |
2047 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml)==8) ); |
2048 | 1 | let test2 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mo>=</mo> |
2049 | 1 | <mi>C</mi><mi>H</mi><mi>R</mi></mrow>"#; |
2050 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==12) ); |
2051 | 1 | let test3 = r#"<mrow><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>C</mi><mo>=</mo> |
2052 | 1 | <mi>C</mi><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub></mrow>"#; |
2053 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml)==11) ); |
2054 | 1 | let test4 = r#"<mrow><mi>H</mi><mo>-</mo><mi>N</mi><mo>=</mo><mi>N</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2055 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml)==10) ); |
2056 | 1 | let test10 = r#"<mrow><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>3</mn></msub><mo>=</mo><mi>C</mi></mrow>"#; |
2057 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2058 | 1 | } |
2059 | ||
2060 | #[test] | |
2061 | #[ignore] // It would be good to say "not chemistry" for this, but there aren't rules for that at the moment | |
2062 | 0 | fn test_water_bond() { |
2063 | 0 | let test11 = r#"<mrow><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>O</mi><mo>=</mo><mi>O</mi></mrow>"#; |
2064 | 0 | assert!( parse_mathml_string(test11, |mathml| {println!("val={}", likely_chem_formula(mathml)); likely_chem_formula(mathml)==8}) ); |
2065 | // assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); | |
2066 | 0 | } |
2067 | ||
2068 | ||
2069 | #[test] | |
2070 | 1 | fn test_triple_bond() { |
2071 | 1 | let test1 = r#"<mrow><mi>C</mi><mo>≡</mo><mi>C</mi></mrow>"#; |
2072 | 1 | assert!( parse_mathml_string(test1, |mathml| likely_chem_formula(mathml) < CHEMISTRY_THRESHOLD) ); |
2073 | 1 | let test2 = r#"<mrow><mi>C</mi><mo>:::</mo><mi>O</mi></mrow>"#; |
2074 | 1 | assert!( parse_mathml_string(test2, |mathml| likely_chem_formula(mathml)==CHEMISTRY_THRESHOLD) ); |
2075 | 1 | let test3 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2076 | 1 | assert!( parse_mathml_string(test3, |mathml| likely_chem_formula(mathml)==10) ); |
2077 | 1 | let test4 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>H</mi></mrow>"#; |
2078 | 1 | assert!( parse_mathml_string(test4, |mathml| likely_chem_formula(mathml)==10) ); |
2079 | 1 | let test5 = r#"<mrow><mi>N</mi><mo>-</mo><mi>C</mi><mo>≡</mo><mi>C</mi><mo>-</mo><mi>N</mi></mrow>"#; |
2080 | 1 | assert!( parse_mathml_string(test5, |mathml| likely_chem_formula(mathml)==10) ); |
2081 | 1 | let test6 = r#"<mrow><mi>H</mi><mo>-</mo><mi>C</mi><mo>≡</mo> |
2082 | 1 | <mi>C</mi><mo>-</mo><mi mathvariant='normal'>C</mi><msub><mi mathvariant='normal'>H</mi><mn>3</mn></msub></mrow>"#; // 1-Propyne |
2083 | 1 | assert!( parse_mathml_string(test6, |mathml| likely_chem_formula(mathml)==14) ); |
2084 | // assert!( parse_mathml_string(test6, |mathml| {println!("val={}", likely_chem_formula(mathml)); likely_chem_formula(mathml)==10}) ); | |
2085 | 1 | let test10 = r#"<mrow><mi>O</mi><mo>:::</mo><mi>S</mi></mrow>"#; |
2086 | 1 | assert!( parse_mathml_string(test10, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2087 | 1 | let test11 = r#"<mrow><mi>Pb</mi><mo>≡</mo><mi>Pb</mi></mrow>"#; |
2088 | 1 | assert!( parse_mathml_string(test11, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2089 | 1 | let test12 = r#"<mrow><mi>C</mi><mo>≡</mo><mi>K</mi></mrow>"#; |
2090 | 1 | assert!( parse_mathml_string(test12, |mathml| likely_chem_formula(mathml)==NOT_CHEMISTRY) ); |
2091 | 1 | } |
2092 | ||
2093 | #[test] | |
2094 | 1 | fn split_mi() { |
2095 | 1 | let test = "<math><mi>LiF</mi></math>"; |
2096 | 1 | let target = "<math> |
2097 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2098 | 1 | <mi data-chem-element='3'>Li</mi> |
2099 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2100 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>F</mi> |
2101 | 1 | </mrow> |
2102 | 1 | </math>"; |
2103 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2104 | 1 | } |
2105 | ||
2106 | #[test] | |
2107 | 1 | fn no_split_mi() { |
2108 | 1 | let test = "<math><mi>HC</mi></math>"; |
2109 | 1 | let target = "<math> |
2110 | 1 | <mi>HC</mi> |
2111 | 1 | </math>"; |
2112 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2113 | 1 | } |
2114 | ||
2115 | #[test] | |
2116 | 1 | fn combine_mi() { |
2117 | 1 | let test = "<math><mi>H</mi><mi>C</mi><mi>l</mi></math>"; |
2118 | 1 | let target = " <math> |
2119 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2120 | 1 | <mi data-chem-element='1'>H</mi> |
2121 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2122 | 1 | <mi data-merged='true' data-chem-element='3'>Cl</mi> |
2123 | 1 | </mrow> |
2124 | 1 | </math>"; |
2125 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2126 | 1 | } |
2127 | ||
2128 | #[test] | |
2129 | 1 | fn no_combine() { |
2130 | 1 | let test = "<math><mi>C</mi><mi>l</mi></math>"; |
2131 | 1 | let target = "<math> |
2132 | 1 | <mrow data-changed='added'> |
2133 | 1 | <mi>C</mi> |
2134 | 1 | <mo data-changed='added'>⁢</mo> |
2135 | 1 | <mi>l</mi> |
2136 | 1 | </mrow> |
2137 | 1 | </math>"; |
2138 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2139 | 1 | } |
2140 | ||
2141 | #[test] | |
2142 | 1 | fn add_script() { |
2143 | 1 | let test = "<math> <mi>SO</mi> <msub> <mrow></mrow> <mn>2</mn> </msub> </math>"; |
2144 | 1 | let target = "<math> |
2145 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2146 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2147 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2148 | 1 | <mmultiscripts data-chem-formula='2'> |
2149 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2150 | 1 | <mn>2</mn> |
2151 | 1 | <none></none> |
2152 | 1 | </mmultiscripts> |
2153 | 1 | </mrow> |
2154 | 1 | </math>"; |
2155 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2156 | 1 | } |
2157 | ||
2158 | #[test] | |
2159 | 1 | fn add_script_bug_287() { |
2160 | 1 | let test = r#"<math><mrow> |
2161 | 1 | <msubsup> |
2162 | 1 | <mrow><mi mathvariant="normal">SO</mi></mrow> |
2163 | 1 | <mn>4</mn> |
2164 | 1 | <mrow><mn>2</mn><mo>−</mo></mrow> |
2165 | 1 | </msubsup> |
2166 | 1 | </mrow></math>"#; |
2167 | 1 | let target = r#"<math> |
2168 | 1 | <mrow data-changed='added' data-chem-formula='7'> |
2169 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2170 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2171 | 1 | <msubsup data-chem-formula='5'> |
2172 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2173 | 1 | <mn>4</mn> |
2174 | 1 | <mrow data-chem-formula='3'><mn>2</mn><mo>-</mo></mrow> |
2175 | 1 | </msubsup> |
2176 | 1 | </mrow> |
2177 | 1 | </math>"#; |
2178 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2179 | 1 | } |
2180 | ||
2181 | #[test] | |
2182 | 1 | fn salt() { |
2183 | 1 | let test = "<math><mi>Na</mi><mi>Cl</mi></math>"; |
2184 | 1 | let target = "<math> |
2185 | 1 | <mrow data-changed='added' data-chem-formula='7'> |
2186 | 1 | <mi data-chem-element='3'>Na</mi> |
2187 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2188 | 1 | <mi data-chem-element='3'>Cl</mi> |
2189 | 1 | </mrow> |
2190 | 1 | </math>"; |
2191 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2192 | 1 | } |
2193 | ||
2194 | #[test] | |
2195 | 1 | fn water() { |
2196 | 1 | let test = "<math><msub><mi mathvariant='normal'>H</mi><mn>2</mn></msub><mi mathvariant='normal'>O</mi></math>"; |
2197 | 1 | let target = "<math> |
2198 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2199 | 1 | <msub data-chem-formula='2'> |
2200 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2201 | 1 | <mn>2</mn> |
2202 | 1 | </msub> |
2203 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2204 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2205 | 1 | </mrow> |
2206 | 1 | </math>"; |
2207 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2208 | 1 | } |
2209 | ||
2210 | #[test] | |
2211 | 1 | fn mhchem_water() { |
2212 | 1 | let test = "<math> |
2213 | 1 | <mrow> |
2214 | 1 | <mrow> |
2215 | 1 | <mi mathvariant='normal'>H</mi> |
2216 | 1 | </mrow> |
2217 | 1 | <msub> |
2218 | 1 | <mrow> |
2219 | 1 | <mrow> |
2220 | 1 | <mpadded width='0'> |
2221 | 1 | <mphantom> |
2222 | 1 | <mi>A</mi> |
2223 | 1 | </mphantom> |
2224 | 1 | </mpadded> |
2225 | 1 | </mrow> |
2226 | 1 | </mrow> |
2227 | 1 | <mrow> |
2228 | 1 | <mrow> |
2229 | 1 | <mpadded height='0'> |
2230 | 1 | <mn>2</mn> |
2231 | 1 | </mpadded> |
2232 | 1 | </mrow> |
2233 | 1 | </mrow> |
2234 | 1 | </msub> |
2235 | 1 | <mrow> |
2236 | 1 | <mi mathvariant='normal'>O</mi> |
2237 | 1 | </mrow> |
2238 | 1 | </mrow> |
2239 | 1 | </math>"; |
2240 | 1 | let target = "<math> |
2241 | 1 | <mrow data-chem-formula='5'> |
2242 | 1 | <mmultiscripts data-chem-formula='2'> |
2243 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2244 | 1 | <mn>2</mn> |
2245 | 1 | <none></none> |
2246 | 1 | </mmultiscripts> |
2247 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2248 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2249 | 1 | </mrow> |
2250 | 1 | </math>"; |
2251 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2252 | 1 | } |
2253 | ||
2254 | #[test] | |
2255 | 1 | fn carbon() { |
2256 | 1 | let test = "<math><mi>C</mi></math>"; // not enough to trigger recognition |
2257 | 1 | let target = " <math> |
2258 | 1 | <mi>C</mi> |
2259 | 1 | </math>"; |
2260 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2261 | 1 | } |
2262 | ||
2263 | #[test] | |
2264 | 1 | fn sulfate() { |
2265 | 1 | let test = "<math><mrow><msup> |
2266 | 1 | <mrow><mo>[</mo><mi>S</mi><msub><mi>O</mi><mn>4</mn></msub><mo>]</mo></mrow> |
2267 | 1 | <mrow><mn>2</mn><mo>−</mo></mrow> |
2268 | 1 | </msup></mrow></math>"; |
2269 | 1 | let target = "<math> |
2270 | 1 | <msup data-chem-formula='9'> |
2271 | 1 | <mrow data-chem-formula='6'> |
2272 | 1 | <mo>[</mo> |
2273 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2274 | 1 | <mi data-chem-element='1'>S</mi> |
2275 | 1 | <mo data-changed='added'>⁣</mo> |
2276 | 1 | <msub data-chem-formula='1'> |
2277 | 1 | <mi data-chem-element='1'>O</mi> |
2278 | 1 | <mn>4</mn> |
2279 | 1 | </msub> |
2280 | 1 | </mrow> |
2281 | 1 | <mo>]</mo> |
2282 | 1 | </mrow> |
2283 | 1 | <mrow data-chem-formula='3'> |
2284 | 1 | <mn>2</mn> |
2285 | 1 | <mo>-</mo> |
2286 | 1 | </mrow> |
2287 | 1 | </msup> |
2288 | 1 | </math>"; |
2289 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2290 | 1 | } |
2291 | ||
2292 | #[test] | |
2293 | 1 | fn aluminum_sulfate() { |
2294 | 1 | let test = "<math><mrow><msub><mi>Al</mi><mn>2</mn></msub> |
2295 | 1 | <msub><mrow><mo>(</mo><mi>S</mi><msub><mi>O</mi><mn>4</mn></msub><mo>)</mo></mrow><mn>3</mn></msub></mrow></math>"; |
2296 | 1 | let target = " <math> |
2297 | 1 | <mrow data-chem-formula='10'> |
2298 | 1 | <msub data-chem-formula='3'> |
2299 | 1 | <mi data-chem-element='3'>Al</mi> |
2300 | 1 | <mn>2</mn> |
2301 | 1 | </msub> |
2302 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2303 | 1 | <msub data-chem-formula='6'> |
2304 | 1 | <mrow data-chem-formula='6'> |
2305 | 1 | <mo>(</mo> |
2306 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2307 | 1 | <mi data-chem-element='1'>S</mi> |
2308 | 1 | <mo data-changed='added'>⁣</mo> |
2309 | 1 | <msub data-chem-formula='1'> |
2310 | 1 | <mi data-chem-element='1'>O</mi> |
2311 | 1 | <mn>4</mn> |
2312 | 1 | </msub> |
2313 | 1 | </mrow> |
2314 | 1 | <mo>)</mo> |
2315 | 1 | </mrow> |
2316 | 1 | <mn>3</mn> |
2317 | 1 | </msub> |
2318 | 1 | </mrow> |
2319 | 1 | </math>"; |
2320 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2321 | 1 | } |
2322 | ||
2323 | #[test] | |
2324 | 1 | fn ethanol_bonds() { |
2325 | 1 | let test = "<math> |
2326 | 1 | <mrow> |
2327 | 1 | <mi>C</mi> |
2328 | 1 | <msub> <mi>H</mi> <mn>3</mn> </msub> |
2329 | 1 | <mo>−</mo> |
2330 | 1 | <mi>C</mi> |
2331 | 1 | <msub> <mi>H</mi> <mn>2</mn> </msub> |
2332 | 1 | <mo>−</mo> |
2333 | 1 | <mi>O</mi> |
2334 | 1 | <mi>H</mi> |
2335 | 1 | </mrow> |
2336 | 1 | </math>"; |
2337 | 1 | let target = "<math> |
2338 | 1 | <mrow data-chem-formula='13'> |
2339 | 1 | <mi data-chem-element='1'>C</mi> |
2340 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2341 | 1 | <msub data-chem-formula='1'> |
2342 | 1 | <mi data-chem-element='1'>H</mi> |
2343 | 1 | <mn>3</mn> |
2344 | 1 | </msub> |
2345 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>-</mo> |
2346 | 1 | <mi data-chem-element='1'>C</mi> |
2347 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2348 | 1 | <msub data-chem-formula='1'> |
2349 | 1 | <mi data-chem-element='1'>H</mi> |
2350 | 1 | <mn>2</mn> |
2351 | 1 | </msub> |
2352 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>-</mo> |
2353 | 1 | <mi data-chem-element='1'>O</mi> |
2354 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2355 | 1 | <mi data-chem-element='1'>H</mi> |
2356 | 1 | </mrow> |
2357 | 1 | </math>"; |
2358 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2359 | 1 | } |
2360 | ||
2361 | #[test] | |
2362 | 1 | fn dichlorine_hexoxide() { |
2363 | // init_logger(); | |
2364 | 1 | let test = "<math><mrow> |
2365 | 1 | <msup> |
2366 | 1 | <mrow><mo>[</mo><mi>Cl</mi><msub><mi>O</mi><mn>2</mn></msub><mo>]</mo></mrow> |
2367 | 1 | <mo>+</mo> |
2368 | 1 | </msup> |
2369 | 1 | <msup> |
2370 | 1 | <mrow><mo>[</mo><mi>Cl</mi><msub><mi>O</mi><mn>4</mn></msub><mo>]</mo></mrow> |
2371 | 1 | <mo>-</mo> |
2372 | 1 | </msup> |
2373 | 1 | </mrow></math>"; |
2374 | 1 | let target = "<math> |
2375 | 1 | <mrow data-chem-formula='19'> |
2376 | 1 | <msup data-chem-formula='9'> |
2377 | 1 | <mrow data-chem-formula='8'> |
2378 | 1 | <mo>[</mo> |
2379 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2380 | 1 | <mi data-chem-element='3'>Cl</mi> |
2381 | 1 | <mo data-changed='added'>⁣</mo> |
2382 | 1 | <msub data-chem-formula='1'> |
2383 | 1 | <mi data-chem-element='1'>O</mi> |
2384 | 1 | <mn>2</mn> |
2385 | 1 | </msub> |
2386 | 1 | </mrow> |
2387 | 1 | <mo>]</mo> |
2388 | 1 | </mrow> |
2389 | 1 | <mo>+</mo> |
2390 | 1 | </msup> |
2391 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2392 | 1 | <msup data-chem-formula='9'> |
2393 | 1 | <mrow data-chem-formula='8'> |
2394 | 1 | <mo>[</mo> |
2395 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2396 | 1 | <mi data-chem-element='3'>Cl</mi> |
2397 | 1 | <mo data-changed='added'>⁣</mo> |
2398 | 1 | <msub data-chem-formula='1'> |
2399 | 1 | <mi data-chem-element='1'>O</mi> |
2400 | 1 | <mn>4</mn> |
2401 | 1 | </msub> |
2402 | 1 | </mrow> |
2403 | 1 | <mo>]</mo> |
2404 | 1 | </mrow> |
2405 | 1 | <mo>-</mo> |
2406 | 1 | </msup> |
2407 | 1 | </mrow> |
2408 | 1 | </math>"; |
2409 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2410 | 1 | } |
2411 | ||
2412 | #[test] | |
2413 | 1 | fn ethylene_with_bond() { |
2414 | 1 | let test = "<math><mrow> |
2415 | 1 | <msub><mi>H</mi><mn>2</mn></msub><mi>C</mi> |
2416 | 1 | <mo>=</mo> |
2417 | 1 | <mi>C</mi><msub><mi>H</mi><mn>2</mn></msub> |
2418 | 1 | </mrow></math>"; |
2419 | 1 | let target = "<math> |
2420 | 1 | <mrow data-chem-formula='8'> |
2421 | 1 | <msub data-chem-formula='1'> |
2422 | 1 | <mi data-chem-element='1'>H</mi> |
2423 | 1 | <mn>2</mn> |
2424 | 1 | </msub> |
2425 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2426 | 1 | <mi data-chem-element='1'>C</mi> |
2427 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>=</mo> |
2428 | 1 | <mi data-chem-element='1'>C</mi> |
2429 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2430 | 1 | <msub data-chem-formula='1'> |
2431 | 1 | <mi data-chem-element='1'>H</mi> |
2432 | 1 | <mn>2</mn> |
2433 | 1 | </msub> |
2434 | 1 | </mrow> |
2435 | 1 | </math>"; |
2436 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2437 | 1 | } |
2438 | ||
2439 | #[test] | |
2440 | 1 | fn ferric_chloride_aq() { |
2441 | 1 | let test = "<math><mrow> |
2442 | 1 | <mi>Fe</mi> |
2443 | 1 | <msub><mi>Cl</mi><mn>3</mn></msub> |
2444 | 1 | <mrow><mo>(</mo><mrow><mi>aq</mi></mrow><mo>)</mo></mrow> |
2445 | 1 | </mrow></math>"; |
2446 | 1 | let target = "<math> |
2447 | 1 | <mrow data-chem-formula='11'> |
2448 | 1 | <mi data-chem-element='3'>Fe</mi> |
2449 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2450 | 1 | <msub data-chem-formula='3'> |
2451 | 1 | <mi data-chem-element='3'>Cl</mi> |
2452 | 1 | <mn>3</mn> |
2453 | 1 | </msub> |
2454 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2455 | 1 | <mrow data-chem-formula='3'> |
2456 | 1 | <mo>(</mo> |
2457 | 1 | <mi>aq</mi> |
2458 | 1 | <mo>)</mo> |
2459 | 1 | </mrow> |
2460 | 1 | </mrow> |
2461 | 1 | </math>"; |
2462 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2463 | 1 | } |
2464 | ||
2465 | #[test] | |
2466 | 1 | fn ferric_chloride_aq_as_mi() { |
2467 | 1 | let test = "<math><mrow> |
2468 | 1 | <mi>Fe</mi> |
2469 | 1 | <msub><mi>Cl</mi><mn>3</mn></msub> |
2470 | 1 | <mi>(aq)</mi> |
2471 | 1 | </mrow></math>"; |
2472 | 1 | let target = "<math> |
2473 | 1 | <mrow data-chem-formula='11'> |
2474 | 1 | <mi data-chem-element='3'>Fe</mi> |
2475 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2476 | 1 | <msub data-chem-formula='3'> |
2477 | 1 | <mi data-chem-element='3'>Cl</mi> |
2478 | 1 | <mn>3</mn> |
2479 | 1 | </msub> |
2480 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2481 | 1 | <mrow data-chem-formula='3'> |
2482 | 1 | <mo>(</mo> |
2483 | 1 | <mi>aq</mi> |
2484 | 1 | <mo>)</mo> |
2485 | 1 | </mrow> |
2486 | 1 | </mrow> |
2487 | 1 | </math>"; |
2488 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2489 | 1 | } |
2490 | ||
2491 | #[test] | |
2492 | 1 | fn chemtype_ammonia() { |
2493 | 1 | let test = r#"<math><msub><mi>NH</mi><mn>3</mn></msub></math>"#; |
2494 | 1 | let target = " <math> |
2495 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2496 | 1 | <mi mathvariant='normal' data-chem-element='1'>N</mi> |
2497 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2498 | 1 | <msub data-chem-formula='2'> |
2499 | 1 | <mi mathvariant='normal' data-chem-element='1' data-split='true'>H</mi> |
2500 | 1 | <mn>3</mn> |
2501 | 1 | </msub> |
2502 | 1 | </mrow> |
2503 | 1 | </math>"; |
2504 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2505 | 1 | } |
2506 | ||
2507 | #[test] | |
2508 | 1 | fn mhchem_ammonia() { |
2509 | 1 | let test = r#"<math> |
2510 | 1 | <mrow> |
2511 | 1 | <mi data-mjx-auto-op="false">NH</mi> |
2512 | 1 | <msub> |
2513 | 1 | <mpadded width="0"> |
2514 | 1 | <mphantom> |
2515 | 1 | <mi>A</mi> |
2516 | 1 | </mphantom> |
2517 | 1 | </mpadded> |
2518 | 1 | <mpadded height="0"> |
2519 | 1 | <mn>3</mn> |
2520 | 1 | </mpadded> |
2521 | 1 | </msub> |
2522 | 1 | </mrow> |
2523 | 1 | </math>"#; |
2524 | 1 | let target = "<math> |
2525 | 1 | <mrow data-chem-formula='5'> |
2526 | 1 | <mi mathvariant='normal' data-chem-element='1'>N</mi> |
2527 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2528 | 1 | <mmultiscripts data-mjx-auto-op='false' data-chem-formula='2'> |
2529 | 1 | <mi mathvariant='normal' data-mjx-auto-op='false' data-split='true' data-chem-element='1'>H</mi> |
2530 | 1 | <mn>3</mn> |
2531 | 1 | <none></none> |
2532 | 1 | </mmultiscripts> |
2533 | 1 | </mrow> |
2534 | 1 | </math>"; |
2535 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2536 | 1 | } |
2537 | ||
2538 | #[test] | |
2539 | 1 | fn mhchem_so4() { |
2540 | 1 | let test = "<math> |
2541 | 1 | <mrow> |
2542 | 1 | <mi>SO</mi> |
2543 | 1 | <msub> |
2544 | 1 | <mpadded width='0'> |
2545 | 1 | <mphantom> |
2546 | 1 | <mi>A</mi> |
2547 | 1 | </mphantom> |
2548 | 1 | </mpadded> |
2549 | 1 | <mpadded height='0'> |
2550 | 1 | <mn>4</mn> |
2551 | 1 | </mpadded> |
2552 | 1 | </msub> |
2553 | 1 | <msup> |
2554 | 1 | <mpadded width='0'> |
2555 | 1 | <mphantom> |
2556 | 1 | <mi>A</mi> |
2557 | 1 | </mphantom> |
2558 | 1 | </mpadded> |
2559 | 1 | <mrow> |
2560 | 1 | <mn>2</mn> |
2561 | 1 | <mo>−</mo> |
2562 | 1 | </mrow> |
2563 | 1 | </msup> |
2564 | 1 | </mrow> |
2565 | 1 | </math>"; |
2566 | 1 | let target = "<math> |
2567 | 1 | <mrow data-chem-formula='7'> |
2568 | 1 | <mi mathvariant='normal' data-chem-element='1'>S</mi> |
2569 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2570 | 1 | <mmultiscripts data-chem-formula='5'> |
2571 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>O</mi> |
2572 | 1 | <mn>4</mn> |
2573 | 1 | <none/> |
2574 | 1 | <none/> |
2575 | 1 | <mrow data-chem-formula='3'> |
2576 | 1 | <mn>2</mn> |
2577 | 1 | <mo>-</mo> |
2578 | 1 | </mrow> |
2579 | 1 | </mmultiscripts> |
2580 | 1 | </mrow> |
2581 | 1 | </math>"; |
2582 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2583 | 1 | } |
2584 | ||
2585 | #[test] | |
2586 | 1 | fn mhchem_short_ion() { |
2587 | 1 | let test = " <math> |
2588 | 1 | <mrow> |
2589 | 1 | <mi mathvariant='normal'>H</mi> |
2590 | 1 | <msub> |
2591 | 1 | <mpadded width='0'> <mphantom> <mi>A</mi> </mphantom> </mpadded> |
2592 | 1 | <mpadded height='0'> <mn>3</mn></mpadded> |
2593 | 1 | </msub> |
2594 | 1 | <mi mathvariant='normal'>O</mi> |
2595 | 1 | <msup> |
2596 | 1 | <mpadded width='0'> <mphantom> <mi>A</mi> </mphantom> </mpadded> |
2597 | 1 | <mo>+</mo> |
2598 | 1 | </msup> |
2599 | 1 | </mrow> |
2600 | 1 | </math>"; |
2601 | 1 | let target = "<math> |
2602 | 1 | <mrow data-chem-formula='6'> |
2603 | 1 | <mmultiscripts data-chem-formula='2'> |
2604 | 1 | <mi mathvariant='normal' data-chem-element='2'>H</mi> |
2605 | 1 | <mn>3</mn> |
2606 | 1 | <none></none> |
2607 | 1 | </mmultiscripts> |
2608 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2609 | 1 | <mmultiscripts data-chem-formula='3'> |
2610 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
2611 | 1 | <none></none> |
2612 | 1 | <mo>+</mo> |
2613 | 1 | </mmultiscripts> |
2614 | 1 | </mrow> |
2615 | 1 | </math>"; |
2616 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2617 | 1 | } |
2618 | ||
2619 | #[test] | |
2620 | 1 | fn mhchem_ions_and_state() { |
2621 | 1 | let test = "<math> |
2622 | 1 | <mrow> |
2623 | 1 | <mrow> |
2624 | 1 | <mi>Na</mi> |
2625 | 1 | </mrow> |
2626 | 1 | <msup> |
2627 | 1 | <mrow> |
2628 | 1 | <mrow> |
2629 | 1 | <mpadded width='0'> |
2630 | 1 | <mphantom> |
2631 | 1 | <mi>A</mi> |
2632 | 1 | </mphantom> |
2633 | 1 | </mpadded> |
2634 | 1 | </mrow> |
2635 | 1 | </mrow> |
2636 | 1 | <mrow> |
2637 | 1 | <mo>+</mo> |
2638 | 1 | </mrow> |
2639 | 1 | </msup> |
2640 | 1 | <mo stretchy='false'>(</mo> |
2641 | 1 | <mrow> |
2642 | 1 | <mi>aq</mi> |
2643 | 1 | </mrow> |
2644 | 1 | <mo stretchy='false'>)</mo> |
2645 | 1 | <mrow> |
2646 | 1 | <mi>Cl</mi> |
2647 | 1 | </mrow> |
2648 | 1 | <msup> |
2649 | 1 | <mrow> |
2650 | 1 | <mrow> |
2651 | 1 | <mpadded width='0'> |
2652 | 1 | <mphantom> |
2653 | 1 | <mi>A</mi> |
2654 | 1 | </mphantom> |
2655 | 1 | </mpadded> |
2656 | 1 | </mrow> |
2657 | 1 | </mrow> |
2658 | 1 | <mrow> |
2659 | 1 | <mo>−</mo> |
2660 | 1 | </mrow> |
2661 | 1 | </msup> |
2662 | 1 | <mspace width='0.111em'></mspace> |
2663 | 1 | <mo stretchy='false'>(</mo> |
2664 | 1 | <mrow> |
2665 | 1 | <mi>aq</mi> |
2666 | 1 | </mrow> |
2667 | 1 | <mo stretchy='false'>)</mo> |
2668 | 1 | </mrow> |
2669 | 1 | </math>"; |
2670 | 1 | let target = "<math> |
2671 | 1 | <mrow data-chem-formula='18'> |
2672 | 1 | <mmultiscripts data-chem-formula='4'> |
2673 | 1 | <mi data-chem-element='3'>Na</mi> |
2674 | 1 | <none></none> |
2675 | 1 | <mo>+</mo> |
2676 | 1 | </mmultiscripts> |
2677 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2678 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2679 | 1 | <mo stretchy='false'>(</mo> |
2680 | 1 | <mi>aq</mi> |
2681 | 1 | <mo stretchy='false'>)</mo> |
2682 | 1 | </mrow> |
2683 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2684 | 1 | <mmultiscripts data-chem-formula='5'> |
2685 | 1 | <mi data-chem-element='3'>Cl</mi> |
2686 | 1 | <none></none> |
2687 | 1 | <mo>-</mo> |
2688 | 1 | </mmultiscripts> |
2689 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2690 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2691 | 1 | <mo stretchy='false' data-previous-space-width='0.111'>(</mo> |
2692 | 1 | <mi>aq</mi> |
2693 | 1 | <mo stretchy='false'>)</mo> |
2694 | 1 | </mrow> |
2695 | 1 | </mrow> |
2696 | 1 | </math>"; |
2697 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2698 | 1 | } |
2699 | ||
2700 | #[test] | |
2701 | 1 | fn ethylene_with_colon_bond() { |
2702 | 1 | let test = "<math><mrow> |
2703 | 1 | <msub><mi>H</mi><mn>2</mn></msub><mi>C</mi> |
2704 | 1 | <mo>::</mo> |
2705 | 1 | <mi>C</mi><msub><mi>H</mi><mn>2</mn></msub> |
2706 | 1 | </mrow></math>"; |
2707 | 1 | let target = "<math> |
2708 | 1 | <mrow data-chem-formula='8'> |
2709 | 1 | <msub data-chem-formula='1'> |
2710 | 1 | <mi data-chem-element='1'>H</mi> |
2711 | 1 | <mn>2</mn> |
2712 | 1 | </msub> |
2713 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2714 | 1 | <mi data-chem-element='1'>C</mi> |
2715 | 1 | <mo data-chemical-bond='true' data-chem-formula-op='1'>∷</mo> |
2716 | 1 | <mi data-chem-element='1'>C</mi> |
2717 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2718 | 1 | <msub data-chem-formula='1'> |
2719 | 1 | <mi data-chem-element='1'>H</mi> |
2720 | 1 | <mn>2</mn> |
2721 | 1 | </msub> |
2722 | 1 | </mrow> |
2723 | 1 | </math>"; |
2724 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2725 | 1 | } |
2726 | ||
2727 | #[test] | |
2728 | 1 | fn mhchem_u238() { |
2729 | 1 | let test = "<math> |
2730 | 1 | <mrow> |
2731 | 1 | <msubsup> |
2732 | 1 | <mrow> |
2733 | 1 | <mrow> |
2734 | 1 | <mpadded width='0'> |
2735 | 1 | <mphantom> |
2736 | 1 | <mi>A</mi> |
2737 | 1 | </mphantom> |
2738 | 1 | </mpadded> |
2739 | 1 | </mrow> |
2740 | 1 | </mrow> |
2741 | 1 | <mrow> |
2742 | 1 | <mrow> |
2743 | 1 | <mpadded height='0' depth='0'> |
2744 | 1 | <mphantom></mphantom> |
2745 | 1 | </mpadded> |
2746 | 1 | </mrow> |
2747 | 1 | </mrow> |
2748 | 1 | <mrow> |
2749 | 1 | <mrow> |
2750 | 1 | <mpadded height='0' depth='0'> |
2751 | 1 | <mphantom> |
2752 | 1 | <mn>238</mn> |
2753 | 1 | </mphantom> |
2754 | 1 | </mpadded> |
2755 | 1 | </mrow> |
2756 | 1 | </mrow> |
2757 | 1 | </msubsup> |
2758 | 1 | <mspace width='-0.083em' linebreak='nobreak'></mspace> |
2759 | 1 | <msubsup> |
2760 | 1 | <mrow> |
2761 | 1 | <mrow> |
2762 | 1 | <mpadded width='0'> |
2763 | 1 | <mphantom> |
2764 | 1 | <mi>A</mi> |
2765 | 1 | </mphantom> |
2766 | 1 | </mpadded> |
2767 | 1 | </mrow> |
2768 | 1 | </mrow> |
2769 | 1 | <mrow> |
2770 | 1 | <mrow> |
2771 | 1 | <mpadded width='0'> |
2772 | 1 | <mphantom> |
2773 | 1 | <mn>2</mn> |
2774 | 1 | </mphantom> |
2775 | 1 | </mpadded> |
2776 | 1 | </mrow> |
2777 | 1 | <mrow> |
2778 | 1 | <mpadded width='0' lspace='-1width'> |
2779 | 1 | <mrow> |
2780 | 1 | <mpadded height='0'></mpadded> |
2781 | 1 | </mrow> |
2782 | 1 | </mpadded> |
2783 | 1 | </mrow> |
2784 | 1 | </mrow> |
2785 | 1 | <mrow> |
2786 | 1 | <mrow> |
2787 | 1 | <mpadded height='0'> |
2788 | 1 | <mrow> |
2789 | 1 | <mpadded width='0'> |
2790 | 1 | <mphantom> |
2791 | 1 | <mn>2</mn> |
2792 | 1 | </mphantom> |
2793 | 1 | </mpadded> |
2794 | 1 | </mrow> |
2795 | 1 | </mpadded> |
2796 | 1 | </mrow> |
2797 | 1 | <mrow> |
2798 | 1 | <mpadded width='0' lspace='-1width'> |
2799 | 1 | <mn>238</mn> |
2800 | 1 | </mpadded> |
2801 | 1 | </mrow> |
2802 | 1 | </mrow> |
2803 | 1 | </msubsup> |
2804 | 1 | <mrow> |
2805 | 1 | <mi mathvariant='normal'>U</mi> |
2806 | 1 | </mrow> |
2807 | 1 | </mrow> |
2808 | 1 | </math>"; |
2809 | 1 | let target = " <math> |
2810 | 1 | <mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'> |
2811 | 1 | <mi mathvariant='normal' data-chem-element='2'>U</mi> |
2812 | 1 | <mprescripts></mprescripts> |
2813 | 1 | <none></none> |
2814 | 1 | <mn>238</mn> |
2815 | 1 | </mmultiscripts> |
2816 | 1 | </math>"; |
2817 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2818 | 1 | } |
2819 | ||
2820 | #[test] | |
2821 | 1 | fn mhchem_hcl_aq() { |
2822 | 1 | let test = "<math> |
2823 | 1 | <mrow> |
2824 | 1 | <mn>2</mn> |
2825 | 1 | <mstyle scriptlevel='0'> |
2826 | 1 | <mspace width='0.167em'></mspace> |
2827 | 1 | </mstyle> |
2828 | 1 | <mrow> |
2829 | 1 | <mi>HCl</mi> |
2830 | 1 | </mrow> |
2831 | 1 | <mspace width='0.111em'></mspace> |
2832 | 1 | <mo stretchy='false'>(</mo> |
2833 | 1 | <mrow> |
2834 | 1 | <mi>aq</mi> |
2835 | 1 | </mrow> |
2836 | 1 | <mo stretchy='false'>)</mo> |
2837 | 1 | </mrow> |
2838 | 1 | </math>"; |
2839 | 1 | let target = "<math> |
2840 | 1 | <mrow data-chem-formula='9'> |
2841 | 1 | <mn>2</mn> |
2842 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁢</mo> |
2843 | 1 | <mrow data-changed='added' data-chem-formula='9'> |
2844 | 1 | <mi mathvariant='normal' data-previous-space-width='0.167' data-chem-element='1'>H</mi> |
2845 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2846 | 1 | <mi data-split='true' data-chem-element='3'>Cl</mi> |
2847 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
2848 | 1 | <mrow data-changed='added' data-chem-formula='3'> |
2849 | 1 | <mo stretchy='false' data-previous-space-width='0.111'>(</mo> |
2850 | 1 | <mi>aq</mi> |
2851 | 1 | <mo stretchy='false'>)</mo> |
2852 | 1 | </mrow> |
2853 | 1 | </mrow> |
2854 | 1 | </mrow> |
2855 | 1 | </math>"; |
2856 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2857 | 1 | } |
2858 | ||
2859 | #[test] | |
2860 | 1 | fn mhchem_nested_sub() { |
2861 | // from \ce{(CH3)3} | |
2862 | 1 | let test = "<math> |
2863 | 1 | <mrow> |
2864 | 1 | <mo stretchy='false'>(</mo> |
2865 | 1 | <mrow> |
2866 | 1 | <mi>CH</mi> |
2867 | 1 | </mrow> |
2868 | 1 | <msub> |
2869 | 1 | <mrow> |
2870 | 1 | <mrow> |
2871 | 1 | <mpadded width='0'> |
2872 | 1 | <mphantom> |
2873 | 1 | <mi>A</mi> |
2874 | 1 | </mphantom> |
2875 | 1 | </mpadded> |
2876 | 1 | </mrow> |
2877 | 1 | </mrow> |
2878 | 1 | <mrow> |
2879 | 1 | <mrow> |
2880 | 1 | <mpadded height='0'> |
2881 | 1 | <mn>3</mn> |
2882 | 1 | </mpadded> |
2883 | 1 | </mrow> |
2884 | 1 | </mrow> |
2885 | 1 | </msub> |
2886 | 1 | <mo stretchy='false'>)</mo> |
2887 | 1 | <msub> |
2888 | 1 | <mrow> |
2889 | 1 | <mrow> |
2890 | 1 | <mpadded width='0'> |
2891 | 1 | <mphantom> |
2892 | 1 | <mi>A</mi> |
2893 | 1 | </mphantom> |
2894 | 1 | </mpadded> |
2895 | 1 | </mrow> |
2896 | 1 | </mrow> |
2897 | 1 | <mrow> |
2898 | 1 | <mrow> |
2899 | 1 | <mpadded height='0'> |
2900 | 1 | <mn>3</mn> |
2901 | 1 | </mpadded> |
2902 | 1 | </mrow> |
2903 | 1 | </mrow> |
2904 | 1 | </msub> |
2905 | 1 | </mrow> |
2906 | 1 | </math>"; |
2907 | 1 | let target = "<math> |
2908 | 1 | <mmultiscripts data-chem-formula='8'> |
2909 | 1 | <mrow data-changed='added' data-chem-formula='8'> |
2910 | 1 | <mo stretchy='false'>(</mo> |
2911 | 1 | <mrow data-changed='added' data-chem-formula='5'> |
2912 | 1 | <mi mathvariant='normal' data-chem-element='1'>C</mi> |
2913 | 1 | <mo data-changed='added'>⁣</mo> |
2914 | 1 | <mmultiscripts data-chem-formula='2'> |
2915 | 1 | <mi mathvariant='normal' data-split='true' data-chem-element='1'>H</mi> |
2916 | 1 | <mn>3</mn> |
2917 | 1 | <none></none> |
2918 | 1 | </mmultiscripts> |
2919 | 1 | </mrow> |
2920 | 1 | <mo stretchy='false'>)</mo> |
2921 | 1 | </mrow> |
2922 | 1 | <mn>3</mn> |
2923 | 1 | <none></none> |
2924 | 1 | </mmultiscripts> |
2925 | 1 | </math>"; |
2926 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
2927 | 1 | } |
2928 | ||
2929 | #[test] | |
2930 | 1 | fn mhchem_isotopes() { |
2931 | // from \ce{^{18}O{}^{16}O} | |
2932 | 1 | let test = "<math> |
2933 | 1 | <mrow> |
2934 | 1 | <msubsup> |
2935 | 1 | <mpadded width='0'> |
2936 | 1 | <mphantom> |
2937 | 1 | <mi>A</mi> |
2938 | 1 | </mphantom> |
2939 | 1 | </mpadded> |
2940 | 1 | <mpadded height='0' depth='0'> |
2941 | 1 | <mphantom></mphantom> |
2942 | 1 | </mpadded> |
2943 | 1 | <mpadded height='0' depth='0'> |
2944 | 1 | <mphantom> |
2945 | 1 | <mn>18</mn> |
2946 | 1 | </mphantom> |
2947 | 1 | </mpadded> |
2948 | 1 | </msubsup> |
2949 | 1 | <mspace width='-0.083em'></mspace> |
2950 | 1 | <msubsup> |
2951 | 1 | <mpadded width='0'> |
2952 | 1 | <mphantom> |
2953 | 1 | <mi>A</mi> |
2954 | 1 | </mphantom> |
2955 | 1 | </mpadded> |
2956 | 1 | <mrow> |
2957 | 1 | <mpadded width='0'> |
2958 | 1 | <mphantom> |
2959 | 1 | <mn>2</mn> |
2960 | 1 | </mphantom> |
2961 | 1 | </mpadded> |
2962 | 1 | <mpadded width='0' lspace='-1width'> |
2963 | 1 | <mpadded height='0'></mpadded> |
2964 | 1 | </mpadded> |
2965 | 1 | </mrow> |
2966 | 1 | <mrow> |
2967 | 1 | <mpadded height='0'> |
2968 | 1 | <mpadded width='0'> |
2969 | 1 | <mphantom> |
2970 | 1 | <mn>2</mn> |
2971 | 1 | </mphantom> |
2972 | 1 | </mpadded> |
2973 | 1 | </mpadded> |
2974 | 1 | <mpadded width='0' lspace='-1width'> |
2975 | 1 | <mn>18</mn> |
2976 | 1 | </mpadded> |
2977 | 1 | </mrow> |
2978 | 1 | </msubsup> |
2979 | 1 | <mi mathvariant='normal'>O</mi> |
2980 | 1 | <mspace width='0.111em'></mspace> |
2981 | 1 | <msubsup> |
2982 | 1 | <mpadded width='0'> |
2983 | 1 | <mphantom> |
2984 | 1 | <mi>A</mi> |
2985 | 1 | </mphantom> |
2986 | 1 | </mpadded> |
2987 | 1 | <mpadded height='0' depth='0'> |
2988 | 1 | <mphantom></mphantom> |
2989 | 1 | </mpadded> |
2990 | 1 | <mpadded height='0' depth='0'> |
2991 | 1 | <mphantom> |
2992 | 1 | <mn>16</mn> |
2993 | 1 | </mphantom> |
2994 | 1 | </mpadded> |
2995 | 1 | </msubsup> |
2996 | 1 | <mspace width='-0.083em'></mspace> |
2997 | 1 | <msubsup> |
2998 | 1 | <mpadded width='0'> |
2999 | 1 | <mphantom> |
3000 | 1 | <mi>A</mi> |
3001 | 1 | </mphantom> |
3002 | 1 | </mpadded> |
3003 | 1 | <mrow> |
3004 | 1 | <mpadded width='0'> |
3005 | 1 | <mphantom> |
3006 | 1 | <mn>2</mn> |
3007 | 1 | </mphantom> |
3008 | 1 | </mpadded> |
3009 | 1 | <mpadded width='0' lspace='-1width'> |
3010 | 1 | <mpadded height='0'></mpadded> |
3011 | 1 | </mpadded> |
3012 | 1 | </mrow> |
3013 | 1 | <mrow> |
3014 | 1 | <mpadded height='0'> |
3015 | 1 | <mpadded width='0'> |
3016 | 1 | <mphantom> |
3017 | 1 | <mn>2</mn> |
3018 | 1 | </mphantom> |
3019 | 1 | </mpadded> |
3020 | 1 | </mpadded> |
3021 | 1 | <mpadded width='0' lspace='-1width'> |
3022 | 1 | <mn>16</mn> |
3023 | 1 | </mpadded> |
3024 | 1 | </mrow> |
3025 | 1 | </msubsup> |
3026 | 1 | <mi mathvariant='normal'>O</mi> |
3027 | 1 | </mrow> |
3028 | 1 | </math>"; |
3029 | 1 | let target = "<math> |
3030 | 1 | <mrow data-chem-formula='11'> |
3031 | 1 | <mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'> |
3032 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
3033 | 1 | <mprescripts></mprescripts> |
3034 | 1 | <none></none> |
3035 | 1 | <mn>18</mn> |
3036 | 1 | </mmultiscripts> |
3037 | 1 | <mo data-changed='added' data-chem-formula-op='0'>⁣</mo> |
3038 | 1 | <mmultiscripts data-previous-space-width='0.027999999999999997' data-chem-formula='5'> |
3039 | 1 | <mi mathvariant='normal' data-chem-element='2'>O</mi> |
3040 | 1 | <mprescripts></mprescripts> |
3041 | 1 | <none></none> |
3042 | 1 | <mn>16</mn> |
3043 | 1 | </mmultiscripts> |
3044 | 1 | </mrow> |
3045 | 1 | </math>"; |
3046 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3047 | 1 | } |
3048 | ||
3049 | ||
3050 | #[test] | |
3051 | 1 | fn merge_bug_274() { |
3052 | 1 | let test = r#" |
3053 | 1 | <math> |
3054 | 1 | <mrow> |
3055 | 1 | <mtable> |
3056 | 1 | <mtr> |
3057 | 1 | <mtd> |
3058 | 1 | <mrow> |
3059 | 1 | <msub><mtext>H</mtext><mn>2</mn></msub> |
3060 | 1 | <mtext>g</mtext> |
3061 | 1 | <mtext/> |
3062 | 1 | <mtext>+</mtext> |
3063 | 1 | <mtext/> |
3064 | 1 | <msub><mrow><mtext>Cl</mtext></mrow><mn>2</mn></msub> |
3065 | 1 | <mo stretchy="false">(</mo> |
3066 | 1 | <mtext>g</mtext> |
3067 | 1 | <mo stretchy="false">)</mo> |
3068 | 1 | <mo>→</mo> |
3069 | 1 | <mn>2</mn> |
3070 | 1 | <mtext>HCl(g)</mtext> |
3071 | 1 | </mrow> |
3072 | 1 | </mtd> |
3073 | 1 | </mtr> |
3074 | 1 | <mtr> |
3075 | 1 | <mtd> |
3076 | 1 | <mrow> |
3077 | 1 | <mn>1</mn> |
3078 | 1 | <mo>:</mo> |
3079 | 1 | <mn>1</mn> |
3080 | 1 | <mo>:</mo> |
3081 | 1 | <mn>2</mn> |
3082 | 1 | </mrow> |
3083 | 1 | </mtd> |
3084 | 1 | </mtr> |
3085 | 1 | <mtr> |
3086 | 1 | <mtd> |
3087 | 1 | <mrow> |
3088 | 1 | <mn>1</mn> |
3089 | 1 | <mtext/> |
3090 | 1 | <msub><mtext>H</mtext><mn>2</mn></msub> |
3091 | 1 | <mtext/> |
3092 | 1 | <mtext>to</mtext> |
3093 | 1 | <mtext/> |
3094 | 1 | <mn>1</mn> |
3095 | 1 | <mtext/> |
3096 | 1 | <msub><mrow><mtext>Cl</mtext></mrow><mn>2</mn></msub> |
3097 | 1 | <mtext/> |
3098 | 1 | <mtext>to</mtext> |
3099 | 1 | <mtext/> |
3100 | 1 | <mtext>2</mtext> |
3101 | 1 | <mtext/> |
3102 | 1 | <mtext>HCl</mtext> |
3103 | 1 | </mrow> |
3104 | 1 | </mtd> |
3105 | 1 | </mtr> |
3106 | 1 | </mtable> |
3107 | 1 | </mrow> |
3108 | 1 | </math> |
3109 | 1 | "#; |
3110 | 1 | let target = " |
3111 | 1 | <math> |
3112 | 1 | <mtable> |
3113 | 1 | <mtr> |
3114 | 1 | <mtd data-maybe-chemistry='9'> |
3115 | 1 | <mrow data-maybe-chemistry='9'> |
3116 | 1 | <mrow data-changed='added' data-maybe-chemistry='8'> |
3117 | 1 | <mrow data-changed='added' data-maybe-chemistry='1'> |
3118 | 1 | <msub data-maybe-chemistry='1'> |
3119 | 1 | <mtext data-maybe-chemistry='1'>H</mtext> |
3120 | 1 | <mn>2</mn> |
3121 | 1 | </msub> |
3122 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3123 | 1 | <mtext data-maybe-chemistry='0'>g</mtext> |
3124 | 1 | </mrow> |
3125 | 1 | <mo data-chem-equation-op='1' data-maybe-chemistry='1'>+</mo> |
3126 | 1 | <mrow data-changed='added' data-maybe-chemistry='6'> |
3127 | 1 | <msub data-maybe-chemistry='3'> |
3128 | 1 | <mtext data-maybe-chemistry='3'>Cl</mtext> |
3129 | 1 | <mn>2</mn> |
3130 | 1 | </msub> |
3131 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁣</mo> |
3132 | 1 | <mrow data-changed='added' data-maybe-chemistry='2'> |
3133 | 1 | <mo stretchy='false'>(</mo> |
3134 | 1 | <mtext>g</mtext> |
3135 | 1 | <mo stretchy='false'>)</mo> |
3136 | 1 | </mrow> |
3137 | 1 | </mrow> |
3138 | 1 | </mrow> |
3139 | 1 | <mo data-chem-equation-op='1' data-maybe-chemistry='1'>→</mo> |
3140 | 1 | <mrow data-changed='added' data-maybe-chemistry='0'> |
3141 | 1 | <mn data-maybe-chemistry='0'>2</mn> |
3142 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3143 | 1 | <mtext data-maybe-chemistry='0'>HCl(g)</mtext> |
3144 | 1 | </mrow> |
3145 | 1 | </mrow> |
3146 | 1 | </mtd> |
3147 | 1 | </mtr> |
3148 | 1 | <mtr> |
3149 | 1 | <mtd> |
3150 | 1 | <mrow> |
3151 | 1 | <mn>1</mn> |
3152 | 1 | <mo>:</mo> |
3153 | 1 | <mn>1</mn> |
3154 | 1 | <mo>:</mo> |
3155 | 1 | <mn>2</mn> |
3156 | 1 | </mrow> |
3157 | 1 | </mtd> |
3158 | 1 | </mtr> |
3159 | 1 | <mtr> |
3160 | 1 | <mtd data-maybe-chemistry='7'> |
3161 | 1 | <mrow data-maybe-chemistry='7'> |
3162 | 1 | <mn data-maybe-chemistry='0'>1</mn> |
3163 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3164 | 1 | <msub data-maybe-chemistry='1'> |
3165 | 1 | <mtext data-maybe-chemistry='1'>H</mtext> |
3166 | 1 | <mn>2</mn> |
3167 | 1 | </msub> |
3168 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3169 | 1 | <mtext data-maybe-chemistry='0'>to</mtext> |
3170 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3171 | 1 | <mn data-maybe-chemistry='0'>1</mn> |
3172 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3173 | 1 | <msub data-maybe-chemistry='3'> |
3174 | 1 | <mtext data-maybe-chemistry='3'>Cl</mtext> |
3175 | 1 | <mn>2</mn> |
3176 | 1 | </msub> |
3177 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3178 | 1 | <mtext data-maybe-chemistry='0'>to</mtext> |
3179 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3180 | 1 | <mn data-maybe-chemistry='0'>2</mn> |
3181 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3182 | 1 | <mi data-maybe-chemistry='1' mathvariant='normal'>H</mi> |
3183 | 1 | <mo data-changed='added' data-maybe-chemistry='0'>⁢</mo> |
3184 | 1 | <mi data-maybe-chemistry='3' data-split='true'>Cl</mi> |
3185 | 1 | </mrow> |
3186 | 1 | </mtd> |
3187 | 1 | </mtr> |
3188 | 1 | </mtable> |
3189 | 1 | </math> |
3190 | 1 | "; |
3191 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3192 | 1 | } |
3193 | ||
3194 | #[test] | |
3195 | 1 | fn merge_bug_303() { |
3196 | 1 | let test = r#" |
3197 | 1 | <math> |
3198 | 1 | <mn>2</mn> |
3199 | 1 | <msup><mtext>OH</mtext><mo>−</mo></msup> |
3200 | 1 | <mo stretchy="false">(</mo> |
3201 | 1 | <mtext>aq</mtext> |
3202 | 1 | <mo stretchy="false">)</mo> |
3203 | 1 | <mo>+</mo> |
3204 | 1 | <mtext>C</mtext> |
3205 | 1 | <msup><mtext>u</mtext><mrow><mn>2</mn><mo>+</mo></mrow></msup> |
3206 | 1 | </math> |
3207 | 1 | "#; |
3208 | 1 | let target = " |
3209 | 1 | <math> |
3210 | 1 | <mrow data-changed='added'> |
3211 | 1 | <mrow data-changed='added'> |
3212 | 1 | <mn>2</mn> |
3213 | 1 | <mo data-changed='added'>⁢</mo> |
3214 | 1 | <mrow data-changed='added'> |
3215 | 1 | <msup><mi>OH</mi><mo>-</mo></msup> |
3216 | 1 | <mo data-changed='added'>⁡</mo> |
3217 | 1 | <mrow data-changed='added'> |
3218 | 1 | <mo stretchy='false'>(</mo> |
3219 | 1 | <mtext>aq</mtext> |
3220 | 1 | <mo stretchy='false'>)</mo> |
3221 | 1 | </mrow> |
3222 | 1 | </mrow> |
3223 | 1 | </mrow> |
3224 | 1 | <mo>+</mo> |
3225 | 1 | <mrow data-changed='added'> |
3226 | 1 | <mtext>C</mtext> |
3227 | 1 | <mo data-changed='added'>⁢</mo> |
3228 | 1 | <msup> <mtext>u</mtext> <mrow><mn>2</mn><mo>+</mo></mrow> </msup> |
3229 | 1 | </mrow> |
3230 | 1 | </mrow> |
3231 | 1 | </math> |
3232 | 1 | "; |
3233 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3234 | 1 | } |
3235 | ||
3236 | #[test] | |
3237 | 1 | fn mtd_assert_bug_393() { |
3238 | 1 | let test = r#" |
3239 | 1 | <math display="block"> |
3240 | 1 | <mtable> |
3241 | 1 | <mtr> |
3242 | 1 | <mtd> |
3243 | 1 | <mrow> |
3244 | 1 | <mi>A</mi> |
3245 | 1 | <mi>c</mi> |
3246 | 1 | </mrow> |
3247 | 1 | </mtd> |
3248 | 1 | <mtd> |
3249 | 1 | <mi>A</mi> |
3250 | 1 | <mfenced> |
3251 | 1 | <mtable> |
3252 | 1 | <mtr> |
3253 | 1 | <mtd> |
3254 | 1 | <mrow> |
3255 | 1 | <mi>c</mi> |
3256 | 1 | <mi>n</mi> |
3257 | 1 | </mrow> |
3258 | 1 | </mtd> |
3259 | 1 | </mtr> |
3260 | 1 | </mtable> |
3261 | 1 | </mfenced> |
3262 | 1 | </mtd> |
3263 | 1 | </mtr> |
3264 | 1 | </mtable> |
3265 | 1 | </math>"#; |
3266 | 1 | let target = " |
3267 | 1 | <math display='block'> |
3268 | 1 | <mtable> |
3269 | 1 | <mtr> |
3270 | 1 | <mtd> |
3271 | 1 | <mi>A</mi> |
3272 | 1 | <mi>c</mi> |
3273 | 1 | </mtd> |
3274 | 1 | <mtd> |
3275 | 1 | <mrow data-changed='added'> |
3276 | 1 | <mi>A</mi> |
3277 | 1 | <mrow> |
3278 | 1 | <mo data-changed='from_mfenced'>(</mo> |
3279 | 1 | <mtable> |
3280 | 1 | <mtr> |
3281 | 1 | <mtd> |
3282 | 1 | <mrow> |
3283 | 1 | <mi>c</mi> |
3284 | 1 | <mi>n</mi> |
3285 | 1 | </mrow> |
3286 | 1 | </mtd> |
3287 | 1 | </mtr> |
3288 | 1 | </mtable> |
3289 | 1 | <mo data-changed='from_mfenced'>)</mo> |
3290 | 1 | </mrow> |
3291 | 1 | </mrow> |
3292 | 1 | </mtd> |
3293 | 1 | </mtr> |
3294 | 1 | </mtable> |
3295 | 1 | </math>"; |
3296 | 1 | assert!(are_strs_canonically_equal(test, target, &[])); |
3297 | 1 | } |
3298 | ||
3299 | } |
Line | Count | Source |
1 | //! # Definitions module | |
2 | //! This module is responsible for reading in the definitions files and converting them to either vectors or hashmaps so that | |
3 | //! the definitions can be used by the program. | |
4 | //! | |
5 | //! ## Leaked Implementation Details | |
6 | //! There is no escaping some implementation details. | |
7 | //! Because these definitions are stored in global variables, the variables need to be protected | |
8 | //! in some way so they can be written at runtime when the files are read. | |
9 | //! This is done by putting them inside of a lock (`thread_local`). | |
10 | //! | |
11 | //! Furthermore, it was necessary to use `RefCell` and `Rc` to deal with interior mutability. | |
12 | //! All of this means that a lock needs to be obtained _and_ the contents borrowed to access a definition. | |
13 | //! | |
14 | //! To minimize the global variable footprint, all of the definitions are put inside of a single global variable [`DEFINITIONS`]. | |
15 | //! | |
16 | //! //! Note: some of the variables are `vec`s and some are `hashset`s. | |
17 | //! Numbers are typically vectors so that indexing a digit is easy. | |
18 | //! Others such as `functions_names` are a hashset because you just want to know if an `mi` is a known name or not. | |
19 | //! The functions `as_vec` and `as_hashset` should be used on the appropriate variable. | |
20 | //! ## Names | |
21 | //! The names of "variables" in the definition files use camel case (e.g., "FunctionNames"). In the code, to fit with rust | |
22 | //! naming conventions, snake case is used (e.g, "function_names"). | |
23 | //! | |
24 | //! See the struct [`Definitions`] for the variables that are read in. | |
25 | #![allow(clippy::needless_return)] | |
26 | ||
27 | use yaml_rust::yaml::Hash; | |
28 | use yaml_rust::Yaml; | |
29 | use crate::errors::*; | |
30 | use crate::prefs::*; | |
31 | use std::{cell::RefCell, cell::Ref, cell::RefMut, rc::Rc}; | |
32 | use std::path::{Path, PathBuf}; | |
33 | use std::collections::{HashMap, HashSet}; | |
34 | use crate::shim_filesystem::read_to_string_shim; | |
35 | ||
36 | /// An enum to paper over the different types of data access needed. | |
37 | /// | |
38 | /// Having a Rc<RefCell<FromFileVariable>> seems a bit complicated in terms of types but... | |
39 | /// 1. The rust book seems to endorse the Rc<RefCell<...>>> approach when there are multiple owners of mutable date. | |
40 | /// See <https://doc.rust-lang.org/book/ch15-05-interior-mutability.html> towards the end | |
41 | /// 2. When a file is read, we need to clear and add data to the structure being read (reassigning could work for clearing). | |
42 | /// When we use the data, we either want to index into it or test if an item is there. | |
43 | /// The structures we use are either a Vec or a HashMap, so we need to abstract that away in `FromFileVariable`. | |
44 | /// Unfortunately, traits don't quite work as an option here: | |
45 | /// * Vec implements extends (`add`), but there is no test/contains | |
46 | /// * Hashmap implements `index`, but panics if the item isn't there | |
47 | /// | |
48 | /// Because of the above limitations, we introduce the enum [`Contains`] which dispatches appropriately to Vec/Hashmap | |
49 | #[derive(Debug, Clone)] | |
50 | pub enum Contains { | |
51 | Vec(Rc<RefCell<Vec<String>>>), | |
52 | Set(Rc<RefCell<HashSet<String>>>), | |
53 | Map(Rc<RefCell<HashMap<String, String>>>), | |
54 | } | |
55 | ||
56 | impl Contains { | |
57 | // fn add(&mut self, item: String) { | |
58 | // match self { | |
59 | // Contains::Vec(v) => { v.borrow_mut().push(item); }, | |
60 | // Contains::Set(s) => { s.borrow_mut().insert(item); } | |
61 | // } | |
62 | // } | |
63 | ||
64 | // fn clear(&mut self) { | |
65 | // match self { | |
66 | // Contains::Vec(v) => { v.borrow_mut().clear(); }, | |
67 | // Contains::Set(s) => { s.borrow_mut().clear(); } | |
68 | // } | |
69 | // } | |
70 | } | |
71 | pub type CollectionFromFile = Contains; | |
72 | type VariableDefHashMap = HashMap<String, CollectionFromFile>; | |
73 | ||
74 | /// Global structure containing all of the definitions. | |
75 | /// Each field in the structure corresponds to a named value read in from the `definitions.yaml` files. | |
76 | /// | |
77 | /// The names of "variables" in the definition files use camel case (e.g., "FunctionNames"). In the code, to fit with rust | |
78 | /// naming conventions, snake case is used (e.g, "function_names"). | |
79 | /// | |
80 | /// There should only be one instance of this structure ([`DEFINITIONS`]) | |
81 | // FIX: this probably can done with a macro to remove all the repetition | |
82 | pub struct Definitions { | |
83 | pub name_to_var_mapping: VariableDefHashMap, | |
84 | } | |
85 | ||
86 | impl Default for Definitions { | |
87 | 0 | fn default() -> Self { |
88 | 0 | Definitions { |
89 | 0 | name_to_var_mapping: HashMap::with_capacity(30), |
90 | 0 | } |
91 | 0 | } |
92 | } | |
93 | ||
94 | impl Definitions { | |
95 | 5.48k | fn new() -> Self { |
96 | 5.48k | Definitions { |
97 | 5.48k | name_to_var_mapping: HashMap::with_capacity(30), |
98 | 5.48k | } |
99 | 5.48k | } |
100 | ||
101 | 158k | pub fn get_hashset(&self, name: &str) -> Option<Ref<'_, HashSet<String>>> { |
102 | 158k | let names = self.name_to_var_mapping.get(name); |
103 | 151k | if let Some(Contains::Set( |
104 | 151k | return Some(set.borrow()); |
105 | 7.29k | } |
106 | 7.29k | return None; |
107 | 158k | } |
108 | ||
109 | 15.9k | pub fn get_hashmap(&self, name: &str) -> Option<Ref<'_, HashMap<String, String>>> { |
110 | 15.9k | let names = self.name_to_var_mapping.get(name); |
111 | 15.9k | if let Some(Contains::Map(map)) = names { |
112 | 15.9k | return Some(map.borrow()); |
113 | 0 | } |
114 | 0 | return None; |
115 | 15.9k | } |
116 | ||
117 | 1.31k | pub fn get_vec(&self, name: &str) -> Option<Ref<'_, Vec<String>>> { |
118 | 1.31k | let names = self.name_to_var_mapping.get(name); |
119 | 1.31k | if let Some(Contains::Vec(vec)) = names { |
120 | 1.31k | return Some(vec.borrow()); |
121 | 0 | } |
122 | 0 | return None; |
123 | 1.31k | } |
124 | } | |
125 | ||
126 | thread_local!{ | |
127 | /// Global variable containing all of the definitions. | |
128 | /// See [`Definitions`] for more details. | |
129 | pub static SPEECH_DEFINITIONS: RefCell<Definitions> = RefCell::new( Definitions::new() ); | |
130 | pub static BRAILLE_DEFINITIONS: RefCell<Definitions> = RefCell::new( Definitions::new() ); | |
131 | pub static DEFINITIONS: &'static std::thread::LocalKey<RefCell<Definitions>> = const { &SPEECH_DEFINITIONS }; | |
132 | } | |
133 | ||
134 | /// Reads the `definitions.yaml` files specified by current_files -- these are presumed to need updating. | |
135 | /// | |
136 | /// If there is a failure during read, the error is propagated to the caller | |
137 | 5.50k | pub fn read_definitions_file(use_speech_defs: bool) -> Result<Vec<PathBuf>> { |
138 | // for each file in `locations`, read the contents and process them | |
139 | 5.50k | let pref_manager = PreferenceManager::get(); |
140 | 5.50k | let pref_manager = pref_manager.borrow(); |
141 | 5.50k | let file_path = pref_manager.get_definitions_file(use_speech_defs); |
142 | 5.50k | let definitions = if use_speech_defs { |
143 | 5.50k | definitions.with( |defs| defs.borrow_mut().name_to_var_mapping.clear() ); |
144 | 5.50k | let mut new_files = vec![file_path.to_path_buf()]; |
145 | 5.50k | let mut files_read = read_one_definitions_file(use_speech_defs, file_path).with_context(|| |
146 | 5.50k | new_files.append(&mut files_read); |
147 | ||
148 | // merge the contents of `TrigFunctions` into a set that contains all the function names (from `AdditionalFunctionNames`). | |
149 | 5.50k | return definitions.with(|defs| { |
150 | 5.50k | let mut defs = defs.borrow_mut(); |
151 | 5.50k | make_all_set_references_valid(&mut defs); |
152 | 5.50k | return Ok(new_files); |
153 | 5.50k | }); |
154 | ||
155 | ||
156 | /// Make references to all used set be valid by creating empty sets if they weren't defined | |
157 | 5.50k | fn make_all_set_references_valid(defs: &mut RefMut<Definitions>) { |
158 | // FIX: this list is created by hand -- it would be better if there was a way to create the list Automatically | |
159 | // Note: "FunctionNames" is created in build_all_functions_set() if not already set | |
160 | 5.50k | let used_set_names = ["GeometryPrefixOperators", "LikelyFunctionNames", "TrigFunctionNames", "AdditionalFunctionNames", "Arrows", "GeometryShapes"]; |
161 | // let name_to_mapping = defs.name_to_var_mapping.borrow_mut(); | |
162 | 33.0k | for set_name in |
163 | 33.0k | if defs.get_hashset(set_name).is_none() { |
164 | 1.74k | defs.name_to_var_mapping.insert(set_name.to_string(), Contains::Set( Rc::new( RefCell::new( HashSet::with_capacity(0) ) ) )); |
165 | 31.2k | } |
166 | } | |
167 | 5.50k | if defs.get_hashset("FunctionNames").is_none() { |
168 | 5.46k | let all_functions = build_all_functions_set(defs); |
169 | 5.46k | defs.name_to_var_mapping.insert("FunctionNames".to_string(), Contains::Set( Rc::new( RefCell::new( all_functions ) ) )); |
170 | 5.46k |
|
171 | 5.50k | } |
172 | ||
173 | /// merge "TrigFunctions" and "AdditionalFunctionNames" into a new set named "FunctionNames" | |
174 | 5.46k | fn build_all_functions_set(defs: &mut RefMut<Definitions>) -> HashSet<String> { |
175 | 5.46k | let trig_functions = defs.get_hashset("TrigFunctionNames").unwrap(); |
176 | 5.46k | let mut all_functions = defs.get_hashset("AdditionalFunctionNames").unwrap().clone(); |
177 | 109k | for trig_name in |
178 | 109k | all_functions.insert(trig_name.clone()); |
179 | 109k | } |
180 | 5.46k | return all_functions; |
181 | 5.46k | } |
182 | 5.50k | } |
183 | ||
184 | use crate::speech::*; | |
185 | 11.7k | fn read_one_definitions_file(use_speech_defs: bool, path: &Path) -> Result<Vec<PathBuf>> { |
186 | // read in the file contents | |
187 | 11.7k | let definition_file_contents = read_to_string_shim(path) |
188 | 11.7k | .with_context(|| |
189 | ||
190 | // callback to do the work of building up the defined vectors/hashmaps (in 'build_values') from YAML | |
191 | 11.7k | let defs_build_fn = |variable_def_list: &Yaml| { |
192 | // Rule::DefinitionList | |
193 | // debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list)); | |
194 | 11.7k | let mut files_read = vec![path.to_path_buf()]; |
195 | 11.7k | let vec = crate::speech::as_vec_checked(variable_def_list) |
196 | 11.7k | .with_context(|| |
197 | 175k | for variable_def in |
198 | 175k | if let Some( |
199 | 6.27k | files_read.append(&mut added_files); |
200 | 168k | } |
201 | } | |
202 | 11.7k | return Ok(files_read); |
203 | 11.7k | }; |
204 | ||
205 | // Convert the file contents to YAML and call the callback | |
206 | 11.7k | return crate::speech::compile_rule(&definition_file_contents, defs_build_fn) |
207 | 11.7k | .with_context(|| |
208 | 11.7k | } |
209 | ||
210 | /// Do the work of converting a single YAML def into the vec/hashset/hashmap | |
211 | /// name: [a, b, c] -- assume an indexed vector | |
212 | /// name: {a, b, c} -- assume a hash set | |
213 | /// name: {a: A, b: B, c: C} -- assume a hashmap | |
214 | /// Returns all the files that were read | |
215 | 175k | fn build_values(definition: &Yaml, use_speech_defs: bool, path: &Path) -> Result<Option<Vec<PathBuf>>> { |
216 | // Rule::Definition | |
217 | 175k | let dictionary = crate::speech::as_hash_checked(definition) |
218 | 175k | if dictionary.len()!=1 { |
219 | 0 | bail!("Should only be one definition rule: {}", yaml_to_type(definition)); |
220 | 175k | } |
221 | 175k | let (key, value) = dictionary.iter().next().unwrap(); |
222 | 175k | let def_name = key.as_str().ok_or_else(|| |
223 | 175k | if def_name == "include" { |
224 | 6.27k | let do_include_fn = |new_file: &Path| { |
225 | 6.27k | read_one_definitions_file(use_speech_defs, new_file) |
226 | 6.27k | }; |
227 | 6.27k | let include_file_name = value.as_str().ok_or_else(|| |
228 | 6.27k | return Ok( Some(crate::speech::process_include(path, include_file_name, do_include_fn) |
229 | 168k | } |
230 | ||
231 | let result; | |
232 | 168k | if def_name.starts_with("Numbers") || |
233 | 58.0k | result = Contains::Vec( Rc::new( RefCell::new( get_vec_values(value.as_vec().unwrap()) |
234 | } else { | |
235 | // match value.as_vec() { | |
236 | // Some(vec) => { | |
237 | // result = Contains::Set( Rc::new( RefCell::new( get_set_values(vec)? ) ) ); }, | |
238 | // None => { | |
239 | // let dict = value.as_hash().ok_or_else(|| anyhow!("definition list value '{}' is not an array or dictionary", yaml_to_type(value)))?; | |
240 | // result = Contains::Map( Rc::new( RefCell::new( get_map_values(dict) | |
241 | // .chain_err(||format!("while reading value '{}'", def_name))? ) ) ); | |
242 | ||
243 | // }, | |
244 | // } | |
245 | 110k | let dict = value.as_hash().ok_or_else(|| |
246 | 110k | if dict.is_empty() { |
247 | 15.6k | result = Contains::Set( Rc::new( RefCell::new( HashSet::with_capacity(0) ) ) ); |
248 | 15.6k | } else { |
249 | // peak and see if this is a set or a map | |
250 | 95.3k | let (_, entry_value) = dict.iter().next().unwrap(); |
251 | 95.3k | if entry_value.is_null() { |
252 | 63.4k | result = Contains::Set( Rc::new( RefCell::new( get_set_values(dict) |
253 | 63.4k | .with_context(|| |
254 | } else { | |
255 | // peak and see if this is a set or a map | |
256 | 31.8k | let (_, entry_value) = dict.iter().next().unwrap(); |
257 | 31.8k | if entry_value.is_null() { |
258 | 0 | result = Contains::Set( Rc::new( RefCell::new( get_set_values(dict) |
259 | 0 | .with_context(||format!("while reading value '{def_name}'"))? ) ) ); |
260 | } else { | |
261 | 31.8k | result = Contains::Map( Rc::new( RefCell::new( get_map_values(dict) |
262 | 31.8k | .with_context(|| |
263 | } | |
264 | } | |
265 | } | |
266 | }; | |
267 | ||
268 | 168k | let definitions = if use_speech_defs { |
269 | 168k | return definitions.with(|definitions| { |
270 | 168k | let name_definition_map = &mut definitions.borrow_mut().name_to_var_mapping; |
271 | 168k | name_definition_map.insert(def_name.to_string(), result); |
272 | 168k | return Ok(None); |
273 | 168k | }); |
274 | ||
275 | 58.0k | fn get_vec_values(values: &Vec<Yaml>) -> Result<Vec<String>> { |
276 | 58.0k | let mut result = Vec::with_capacity(values.len()); |
277 | 787k | for yaml_value in |
278 | 787k | let value = yaml_value.as_str() |
279 | 787k | .ok_or_else(|| |
280 | 787k | .to_string(); |
281 | 787k | result.push(value); |
282 | } | |
283 | 58.0k | return Ok(result); |
284 | 58.0k | } |
285 | ||
286 | 63.4k | fn get_set_values(values: &Hash) -> Result<HashSet<String>> { |
287 | 63.4k | let mut result = HashSet::with_capacity(2*values.len()); |
288 | 5.81M | for (key, value) in |
289 | 5.81M | let key = key.as_str() |
290 | 5.81M | .ok_or_else(|| |
291 | 5.81M | .to_string(); |
292 | 5.81M | if let Yaml::Null = value { |
293 | 5.81M | } else { |
294 | 0 | bail!("list entry '{}' is not a string", yaml_to_type(value)); |
295 | } | |
296 | 5.81M | result.insert(key); |
297 | } | |
298 | 63.4k | return Ok(result); |
299 | 63.4k | } |
300 | ||
301 | 31.8k | fn get_map_values(values: &Hash) -> Result<HashMap<String, String>> { |
302 | 31.8k | let mut result = HashMap::with_capacity(2*values.len()); |
303 | 1.13M | for (key, value) in |
304 | 1.13M | let key = key.as_str() |
305 | 1.13M | .ok_or_else(|| |
306 | 1.13M | .to_string(); |
307 | 1.13M | let value = value.as_str() |
308 | 1.13M | .ok_or_else(|| |
309 | 1.13M | .to_string(); |
310 | 1.13M | result.insert(key, value); |
311 | } | |
312 | 31.8k | return Ok(result); |
313 | 31.8k | } |
314 | 175k | } |
315 | ||
316 | ||
317 | #[cfg(test)] | |
318 | mod tests { | |
319 | use super::*; | |
320 | ||
321 | #[test] | |
322 | 1 | fn test_vec() { |
323 | 1 | let numbers = r#"[NumbersTens: ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]]"#; |
324 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
325 | // Rule::DefinitionList | |
326 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
327 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
328 | 1 | if let Err( |
329 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", numbers)))); |
330 | 1 | } |
331 | } | |
332 | 1 | return Ok(vec![]); |
333 | 1 | }; |
334 | 1 | compile_rule(numbers, defs_build_fn).unwrap(); |
335 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
336 | 1 | let defs = defs.borrow(); |
337 | 1 | let names = defs.get_vec("NumbersTens"); |
338 | 1 | assert!(names.is_some()); |
339 | 1 | let names = names.unwrap(); |
340 | 1 | assert_eq!(names.len(), 10); |
341 | 1 | assert_eq!(names[0], ""); |
342 | 1 | assert_eq!(names[9], "ninety"); |
343 | 1 | }); |
344 | 1 | } |
345 | ||
346 | ||
347 | #[test] | |
348 | 1 | fn test_set() { |
349 | 1 | let likely_function_names = r#"[LikelyFunctionNames: {"f", "g", "h", "F", "G", "H", "[A-Za-z]+"}]"#; |
350 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
351 | // Rule::DefinitionList | |
352 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
353 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
354 | 1 | if let Err( |
355 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", likely_function_names)))); |
356 | 1 | } |
357 | } | |
358 | 1 | return Ok(vec![]); |
359 | 1 | }; |
360 | 1 | compile_rule(likely_function_names, defs_build_fn).unwrap(); |
361 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
362 | 1 | let defs = defs.borrow(); |
363 | 1 | let names = defs.get_hashset("LikelyFunctionNames"); |
364 | 1 | assert!(names.is_some()); |
365 | 1 | let names = names.unwrap(); |
366 | 1 | assert_eq!(names.len(), 7); |
367 | 1 | assert!(names.contains("f")); |
368 | 1 | assert!(!names.contains("a")); |
369 | 1 | }); |
370 | 1 | } |
371 | ||
372 | #[test] | |
373 | 1 | fn test_hashmap() { |
374 | 1 | let units = r#"[Units: {"A": "amp", "g": "gram", "m": "meter", "sec": "second"}]"#; |
375 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
376 | // Rule::DefinitionList | |
377 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
378 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
379 | 1 | if let Err( |
380 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", units)))); |
381 | 1 | } |
382 | } | |
383 | 1 | return Ok(vec![]); |
384 | 1 | }; |
385 | 1 | compile_rule(units, defs_build_fn).unwrap(); |
386 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
387 | 1 | let defs = defs.borrow(); |
388 | 1 | let names = defs.get_hashmap("Units"); |
389 | 1 | assert!(names.is_some()); |
390 | 1 | let names = names.unwrap(); |
391 | 1 | assert_eq!(names.len(), 4); |
392 | 1 | assert_eq!(names.get("A").unwrap(), "amp"); |
393 | 1 | assert_eq!(names.get("sec").unwrap(), "second"); |
394 | 1 | assert_eq!(names.get("xxx"), None); |
395 | 1 | }); |
396 | 1 | } |
397 | } |
Line | Count | Source |
1 | //! # Definitions module | |
2 | //! This module is responsible for reading in the definitions files and converting them to either vectors or hashmaps so that | |
3 | //! the definitions can be used by the program. | |
4 | //! | |
5 | //! ## Leaked Implementation Details | |
6 | //! There is no escaping some implementation details. | |
7 | //! Because these definitions are stored in global variables, the variables need to be protected | |
8 | //! in some way so they can be written at runtime when the files are read. | |
9 | //! This is done by putting them inside of a lock (`thread_local`). | |
10 | //! | |
11 | //! Furthermore, it was necessary to use `RefCell` and `Rc` to deal with interior mutability. | |
12 | //! All of this means that a lock needs to be obtained _and_ the contents borrowed to access a definition. | |
13 | //! | |
14 | //! To minimize the global variable footprint, all of the definitions are put inside of a single global variable [`DEFINITIONS`]. | |
15 | //! | |
16 | //! //! Note: some of the variables are `vec`s and some are `hashset`s. | |
17 | //! Numbers are typically vectors so that indexing a digit is easy. | |
18 | //! Others such as `functions_names` are a hashset because you just want to know if an `mi` is a known name or not. | |
19 | //! The functions `as_vec` and `as_hashset` should be used on the appropriate variable. | |
20 | //! ## Names | |
21 | //! The names of "variables" in the definition files use camel case (e.g., "FunctionNames"). In the code, to fit with rust | |
22 | //! naming conventions, snake case is used (e.g, "function_names"). | |
23 | //! | |
24 | //! See the struct [`Definitions`] for the variables that are read in. | |
25 | #![allow(clippy::needless_return)] | |
26 | ||
27 | use yaml_rust::yaml::Hash; | |
28 | use yaml_rust::Yaml; | |
29 | use crate::errors::*; | |
30 | use crate::prefs::*; | |
31 | use std::{cell::RefCell, cell::Ref, cell::RefMut, rc::Rc}; | |
32 | use std::path::{Path, PathBuf}; | |
33 | use std::collections::{HashMap, HashSet}; | |
34 | use crate::shim_filesystem::read_to_string_shim; | |
35 | ||
36 | /// An enum to paper over the different types of data access needed. | |
37 | /// | |
38 | /// Having a Rc<RefCell<FromFileVariable>> seems a bit complicated in terms of types but... | |
39 | /// 1. The rust book seems to endorse the Rc<RefCell<...>>> approach when there are multiple owners of mutable date. | |
40 | /// See <https://doc.rust-lang.org/book/ch15-05-interior-mutability.html> towards the end | |
41 | /// 2. When a file is read, we need to clear and add data to the structure being read (reassigning could work for clearing). | |
42 | /// When we use the data, we either want to index into it or test if an item is there. | |
43 | /// The structures we use are either a Vec or a HashMap, so we need to abstract that away in `FromFileVariable`. | |
44 | /// Unfortunately, traits don't quite work as an option here: | |
45 | /// * Vec implements extends (`add`), but there is no test/contains | |
46 | /// * Hashmap implements `index`, but panics if the item isn't there | |
47 | /// | |
48 | /// Because of the above limitations, we introduce the enum [`Contains`] which dispatches appropriately to Vec/Hashmap | |
49 | #[derive(Debug, Clone)] | |
50 | pub enum Contains { | |
51 | Vec(Rc<RefCell<Vec<String>>>), | |
52 | Set(Rc<RefCell<HashSet<String>>>), | |
53 | Map(Rc<RefCell<HashMap<String, String>>>), | |
54 | } | |
55 | ||
56 | impl Contains { | |
57 | // fn add(&mut self, item: String) { | |
58 | // match self { | |
59 | // Contains::Vec(v) => { v.borrow_mut().push(item); }, | |
60 | // Contains::Set(s) => { s.borrow_mut().insert(item); } | |
61 | // } | |
62 | // } | |
63 | ||
64 | // fn clear(&mut self) { | |
65 | // match self { | |
66 | // Contains::Vec(v) => { v.borrow_mut().clear(); }, | |
67 | // Contains::Set(s) => { s.borrow_mut().clear(); } | |
68 | // } | |
69 | // } | |
70 | } | |
71 | pub type CollectionFromFile = Contains; | |
72 | type VariableDefHashMap = HashMap<String, CollectionFromFile>; | |
73 | ||
74 | /// Global structure containing all of the definitions. | |
75 | /// Each field in the structure corresponds to a named value read in from the `definitions.yaml` files. | |
76 | /// | |
77 | /// The names of "variables" in the definition files use camel case (e.g., "FunctionNames"). In the code, to fit with rust | |
78 | /// naming conventions, snake case is used (e.g, "function_names"). | |
79 | /// | |
80 | /// There should only be one instance of this structure ([`DEFINITIONS`]) | |
81 | // FIX: this probably can done with a macro to remove all the repetition | |
82 | pub struct Definitions { | |
83 | pub name_to_var_mapping: VariableDefHashMap, | |
84 | } | |
85 | ||
86 | impl Default for Definitions { | |
87 | 0 | fn default() -> Self { |
88 | 0 | Definitions { |
89 | 0 | name_to_var_mapping: HashMap::with_capacity(30), |
90 | 0 | } |
91 | 0 | } |
92 | } | |
93 | ||
94 | impl Definitions { | |
95 | 5.48k | fn new() -> Self { |
96 | 5.48k | Definitions { |
97 | 5.48k | name_to_var_mapping: HashMap::with_capacity(30), |
98 | 5.48k | } |
99 | 5.48k | } |
100 | ||
101 | 158k | pub fn get_hashset(&self, name: &str) -> Option<Ref<'_, HashSet<String>>> { |
102 | 158k | let names = self.name_to_var_mapping.get(name); |
103 | 151k | if let Some(Contains::Set( |
104 | 151k | return Some(set.borrow()); |
105 | 7.29k | } |
106 | 7.29k | return None; |
107 | 158k | } |
108 | ||
109 | 15.9k | pub fn get_hashmap(&self, name: &str) -> Option<Ref<'_, HashMap<String, String>>> { |
110 | 15.9k | let names = self.name_to_var_mapping.get(name); |
111 | 15.9k | if let Some(Contains::Map(map)) = names { |
112 | 15.9k | return Some(map.borrow()); |
113 | 0 | } |
114 | 0 | return None; |
115 | 15.9k | } |
116 | ||
117 | 1.31k | pub fn get_vec(&self, name: &str) -> Option<Ref<'_, Vec<String>>> { |
118 | 1.31k | let names = self.name_to_var_mapping.get(name); |
119 | 1.31k | if let Some(Contains::Vec(vec)) = names { |
120 | 1.31k | return Some(vec.borrow()); |
121 | 0 | } |
122 | 0 | return None; |
123 | 1.31k | } |
124 | } | |
125 | ||
126 | thread_local!{ | |
127 | /// Global variable containing all of the definitions. | |
128 | /// See [`Definitions`] for more details. | |
129 | pub static SPEECH_DEFINITIONS: RefCell<Definitions> = RefCell::new( Definitions::new() ); | |
130 | pub static BRAILLE_DEFINITIONS: RefCell<Definitions> = RefCell::new( Definitions::new() ); | |
131 | pub static DEFINITIONS: &'static std::thread::LocalKey<RefCell<Definitions>> = const { &SPEECH_DEFINITIONS }; | |
132 | } | |
133 | ||
134 | /// Reads the `definitions.yaml` files specified by current_files -- these are presumed to need updating. | |
135 | /// | |
136 | /// If there is a failure during read, the error is propagated to the caller | |
137 | 5.50k | pub fn read_definitions_file(use_speech_defs: bool) -> Result<Vec<PathBuf>> { |
138 | // for each file in `locations`, read the contents and process them | |
139 | 5.50k | let pref_manager = PreferenceManager::get(); |
140 | 5.50k | let pref_manager = pref_manager.borrow(); |
141 | 5.50k | let file_path = pref_manager.get_definitions_file(use_speech_defs); |
142 | 5.50k | let definitions = if use_speech_defs { |
143 | 5.50k | definitions.with( |defs| defs.borrow_mut().name_to_var_mapping.clear() ); |
144 | 5.50k | let mut new_files = vec![file_path.to_path_buf()]; |
145 | 5.50k | let mut files_read = read_one_definitions_file(use_speech_defs, file_path).with_context(|| |
146 | 5.50k | new_files.append(&mut files_read); |
147 | ||
148 | // merge the contents of `TrigFunctions` into a set that contains all the function names (from `AdditionalFunctionNames`). | |
149 | 5.50k | return definitions.with(|defs| { |
150 | 5.50k | let mut defs = defs.borrow_mut(); |
151 | 5.50k | make_all_set_references_valid(&mut defs); |
152 | 5.50k | return Ok(new_files); |
153 | 5.50k | }); |
154 | ||
155 | ||
156 | /// Make references to all used set be valid by creating empty sets if they weren't defined | |
157 | 5.50k | fn make_all_set_references_valid(defs: &mut RefMut<Definitions>) { |
158 | // FIX: this list is created by hand -- it would be better if there was a way to create the list Automatically | |
159 | // Note: "FunctionNames" is created in build_all_functions_set() if not already set | |
160 | 5.50k | let used_set_names = ["GeometryPrefixOperators", "LikelyFunctionNames", "TrigFunctionNames", "AdditionalFunctionNames", "Arrows", "GeometryShapes"]; |
161 | // let name_to_mapping = defs.name_to_var_mapping.borrow_mut(); | |
162 | 33.0k | for set_name in |
163 | 33.0k | if defs.get_hashset(set_name).is_none() { |
164 | 1.74k | defs.name_to_var_mapping.insert(set_name.to_string(), Contains::Set( Rc::new( RefCell::new( HashSet::with_capacity(0) ) ) )); |
165 | 31.2k | } |
166 | } | |
167 | 5.50k | if defs.get_hashset("FunctionNames").is_none() { |
168 | 5.46k | let all_functions = build_all_functions_set(defs); |
169 | 5.46k | defs.name_to_var_mapping.insert("FunctionNames".to_string(), Contains::Set( Rc::new( RefCell::new( all_functions ) ) )); |
170 | 5.46k |
|
171 | 5.50k | } |
172 | ||
173 | /// merge "TrigFunctions" and "AdditionalFunctionNames" into a new set named "FunctionNames" | |
174 | 5.46k | fn build_all_functions_set(defs: &mut RefMut<Definitions>) -> HashSet<String> { |
175 | 5.46k | let trig_functions = defs.get_hashset("TrigFunctionNames").unwrap(); |
176 | 5.46k | let mut all_functions = defs.get_hashset("AdditionalFunctionNames").unwrap().clone(); |
177 | 109k | for trig_name in |
178 | 109k | all_functions.insert(trig_name.clone()); |
179 | 109k | } |
180 | 5.46k | return all_functions; |
181 | 5.46k | } |
182 | 5.50k | } |
183 | ||
184 | use crate::speech::*; | |
185 | 11.7k | fn read_one_definitions_file(use_speech_defs: bool, path: &Path) -> Result<Vec<PathBuf>> { |
186 | // read in the file contents | |
187 | 11.7k | let definition_file_contents = read_to_string_shim(path) |
188 | 11.7k | .with_context(|| |
189 | ||
190 | // callback to do the work of building up the defined vectors/hashmaps (in 'build_values') from YAML | |
191 | 11.7k | let defs_build_fn = |variable_def_list: &Yaml| { |
192 | // Rule::DefinitionList | |
193 | // debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list)); | |
194 | 11.7k | let mut files_read = vec![path.to_path_buf()]; |
195 | 11.7k | let vec = crate::speech::as_vec_checked(variable_def_list) |
196 | 11.7k | .with_context(|| |
197 | 175k | for variable_def in |
198 | 175k | if let Some( |
199 | 6.27k | files_read.append(&mut added_files); |
200 | 168k | } |
201 | } | |
202 | 11.7k | return Ok(files_read); |
203 | 11.7k | }; |
204 | ||
205 | // Convert the file contents to YAML and call the callback | |
206 | 11.7k | return crate::speech::compile_rule(&definition_file_contents, defs_build_fn) |
207 | 11.7k | .with_context(|| |
208 | 11.7k | } |
209 | ||
210 | /// Do the work of converting a single YAML def into the vec/hashset/hashmap | |
211 | /// name: [a, b, c] -- assume an indexed vector | |
212 | /// name: {a, b, c} -- assume a hash set | |
213 | /// name: {a: A, b: B, c: C} -- assume a hashmap | |
214 | /// Returns all the files that were read | |
215 | 175k | fn build_values(definition: &Yaml, use_speech_defs: bool, path: &Path) -> Result<Option<Vec<PathBuf>>> { |
216 | // Rule::Definition | |
217 | 175k | let dictionary = crate::speech::as_hash_checked(definition) |
218 | 175k | if dictionary.len()!=1 { |
219 | 0 | bail!("Should only be one definition rule: {}", yaml_to_type(definition)); |
220 | 175k | } |
221 | 175k | let (key, value) = dictionary.iter().next().unwrap(); |
222 | 175k | let def_name = key.as_str().ok_or_else(|| |
223 | 175k | if def_name == "include" { |
224 | 6.27k | let do_include_fn = |new_file: &Path| { |
225 | 6.27k | read_one_definitions_file(use_speech_defs, new_file) |
226 | 6.27k | }; |
227 | 6.27k | let include_file_name = value.as_str().ok_or_else(|| |
228 | 6.27k | return Ok( Some(crate::speech::process_include(path, include_file_name, do_include_fn) |
229 | 168k | } |
230 | ||
231 | let result; | |
232 | 168k | if def_name.starts_with("Numbers") || |
233 | 58.0k | result = Contains::Vec( Rc::new( RefCell::new( get_vec_values(value.as_vec().unwrap()) |
234 | } else { | |
235 | // match value.as_vec() { | |
236 | // Some(vec) => { | |
237 | // result = Contains::Set( Rc::new( RefCell::new( get_set_values(vec)? ) ) ); }, | |
238 | // None => { | |
239 | // let dict = value.as_hash().ok_or_else(|| anyhow!("definition list value '{}' is not an array or dictionary", yaml_to_type(value)))?; | |
240 | // result = Contains::Map( Rc::new( RefCell::new( get_map_values(dict) | |
241 | // .chain_err(||format!("while reading value '{}'", def_name))? ) ) ); | |
242 | ||
243 | // }, | |
244 | // } | |
245 | 110k | let dict = value.as_hash().ok_or_else(|| |
246 | 110k | if dict.is_empty() { |
247 | 15.6k | result = Contains::Set( Rc::new( RefCell::new( HashSet::with_capacity(0) ) ) ); |
248 | 15.6k | } else { |
249 | // peak and see if this is a set or a map | |
250 | 95.3k | let (_, entry_value) = dict.iter().next().unwrap(); |
251 | 95.3k | if entry_value.is_null() { |
252 | 63.4k | result = Contains::Set( Rc::new( RefCell::new( get_set_values(dict) |
253 | 63.4k | .with_context(|| |
254 | } else { | |
255 | // peak and see if this is a set or a map | |
256 | 31.8k | let (_, entry_value) = dict.iter().next().unwrap(); |
257 | 31.8k | if entry_value.is_null() { |
258 | 0 | result = Contains::Set( Rc::new( RefCell::new( get_set_values(dict) |
259 | 0 | .with_context(||format!("while reading value '{def_name}'"))? ) ) ); |
260 | } else { | |
261 | 31.8k | result = Contains::Map( Rc::new( RefCell::new( get_map_values(dict) |
262 | 31.8k | .with_context(|| |
263 | } | |
264 | } | |
265 | } | |
266 | }; | |
267 | ||
268 | 168k | let definitions = if use_speech_defs { |
269 | 168k | return definitions.with(|definitions| { |
270 | 168k | let name_definition_map = &mut definitions.borrow_mut().name_to_var_mapping; |
271 | 168k | name_definition_map.insert(def_name.to_string(), result); |
272 | 168k | return Ok(None); |
273 | 168k | }); |
274 | ||
275 | 58.0k | fn get_vec_values(values: &Vec<Yaml>) -> Result<Vec<String>> { |
276 | 58.0k | let mut result = Vec::with_capacity(values.len()); |
277 | 787k | for yaml_value in |
278 | 787k | let value = yaml_value.as_str() |
279 | 787k | .ok_or_else(|| |
280 | 787k | .to_string(); |
281 | 787k | result.push(value); |
282 | } | |
283 | 58.0k | return Ok(result); |
284 | 58.0k | } |
285 | ||
286 | 63.4k | fn get_set_values(values: &Hash) -> Result<HashSet<String>> { |
287 | 63.4k | let mut result = HashSet::with_capacity(2*values.len()); |
288 | 5.81M | for (key, value) in |
289 | 5.81M | let key = key.as_str() |
290 | 5.81M | .ok_or_else(|| |
291 | 5.81M | .to_string(); |
292 | 5.81M | if let Yaml::Null = value { |
293 | 5.81M | } else { |
294 | 0 | bail!("list entry '{}' is not a string", yaml_to_type(value)); |
295 | } | |
296 | 5.81M | result.insert(key); |
297 | } | |
298 | 63.4k | return Ok(result); |
299 | 63.4k | } |
300 | ||
301 | 31.8k | fn get_map_values(values: &Hash) -> Result<HashMap<String, String>> { |
302 | 31.8k | let mut result = HashMap::with_capacity(2*values.len()); |
303 | 1.13M | for (key, value) in |
304 | 1.13M | let key = key.as_str() |
305 | 1.13M | .ok_or_else(|| |
306 | 1.13M | .to_string(); |
307 | 1.13M | let value = value.as_str() |
308 | 1.13M | .ok_or_else(|| |
309 | 1.13M | .to_string(); |
310 | 1.13M | result.insert(key, value); |
311 | } | |
312 | 31.8k | return Ok(result); |
313 | 31.8k | } |
314 | 175k | } |
315 | ||
316 | ||
317 | #[cfg(test)] | |
318 | mod tests { | |
319 | use super::*; | |
320 | ||
321 | #[test] | |
322 | 1 | fn test_vec() { |
323 | 1 | let numbers = r#"[NumbersTens: ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]]"#; |
324 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
325 | // Rule::DefinitionList | |
326 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
327 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
328 | 1 | if let Err( |
329 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", numbers)))); |
330 | 1 | } |
331 | } | |
332 | 1 | return Ok(vec![]); |
333 | 1 | }; |
334 | 1 | compile_rule(numbers, defs_build_fn).unwrap(); |
335 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
336 | 1 | let defs = defs.borrow(); |
337 | 1 | let names = defs.get_vec("NumbersTens"); |
338 | 1 | assert!(names.is_some()); |
339 | 1 | let names = names.unwrap(); |
340 | 1 | assert_eq!(names.len(), 10); |
341 | 1 | assert_eq!(names[0], ""); |
342 | 1 | assert_eq!(names[9], "ninety"); |
343 | 1 | }); |
344 | 1 | } |
345 | ||
346 | ||
347 | #[test] | |
348 | 1 | fn test_set() { |
349 | 1 | let likely_function_names = r#"[LikelyFunctionNames: {"f", "g", "h", "F", "G", "H", "[A-Za-z]+"}]"#; |
350 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
351 | // Rule::DefinitionList | |
352 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
353 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
354 | 1 | if let Err( |
355 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", likely_function_names)))); |
356 | 1 | } |
357 | } | |
358 | 1 | return Ok(vec![]); |
359 | 1 | }; |
360 | 1 | compile_rule(likely_function_names, defs_build_fn).unwrap(); |
361 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
362 | 1 | let defs = defs.borrow(); |
363 | 1 | let names = defs.get_hashset("LikelyFunctionNames"); |
364 | 1 | assert!(names.is_some()); |
365 | 1 | let names = names.unwrap(); |
366 | 1 | assert_eq!(names.len(), 7); |
367 | 1 | assert!(names.contains("f")); |
368 | 1 | assert!(!names.contains("a")); |
369 | 1 | }); |
370 | 1 | } |
371 | ||
372 | #[test] | |
373 | 1 | fn test_hashmap() { |
374 | 1 | let units = r#"[Units: {"A": "amp", "g": "gram", "m": "meter", "sec": "second"}]"#; |
375 | 1 | let defs_build_fn = |variable_def_list: &Yaml| { |
376 | // Rule::DefinitionList | |
377 | //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); | |
378 | 1 | for variable_def in variable_def_list.as_vec().unwrap() { |
379 | 1 | if let Err( |
380 | 0 | bail!("{}", crate::interface::errors_to_string(&e.context(format!("in file {:?}", units)))); |
381 | 1 | } |
382 | } | |
383 | 1 | return Ok(vec![]); |
384 | 1 | }; |
385 | 1 | compile_rule(units, defs_build_fn).unwrap(); |
386 | 1 | SPEECH_DEFINITIONS.with(|defs| { |
387 | 1 | let defs = defs.borrow(); |
388 | 1 | let names = defs.get_hashmap("Units"); |
389 | 1 | assert!(names.is_some()); |
390 | 1 | let names = names.unwrap(); |
391 | 1 | assert_eq!(names.len(), 4); |
392 | 1 | assert_eq!(names.get("A").unwrap(), "amp"); |
393 | 1 | assert_eq!(names.get("sec").unwrap(), "second"); |
394 | 1 | assert_eq!(names.get("xxx"), None); |
395 | 1 | }); |
396 | 1 | } |
397 | } |
Line | Count | Source |
1 | //! Use heuristics to infer the intent. | |
2 | //! For example, an `mfrac` with `linethickness=0` would be a binomial | |
3 | //! The inference is added to the MathML | |
4 | //! | |
5 | //! The implementation of the module is on hold until the MathML committee figures out how it wants to do this. | |
6 | #![allow(clippy::needless_return)] | |
7 | ||
8 | use sxd_document::dom::{Element, Document, ChildOfElement}; | |
9 | use crate::prefs::PreferenceManager; | |
10 | use crate::speech::SpeechRulesWithContext; | |
11 | use crate::canonicalize::{as_element, as_text, name, create_mathml_element, set_mathml_name, INTENT_ATTR, MATHML_FROM_NAME_ATTR}; | |
12 | use crate::errors::*; | |
13 | use std::fmt; | |
14 | use std::sync::LazyLock; | |
15 | use crate::pretty_print::mml_to_string; | |
16 | use crate::xpath_functions::is_leaf; | |
17 | use regex::Regex; | |
18 | use phf::phf_set; | |
19 | use log::{debug, error, warn}; | |
20 | ||
21 | const IMPLICIT_FUNCTION_NAME: &str = "apply-function"; | |
22 | ||
23 | 2.47k | pub fn infer_intent<'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
24 | 2.47k | match catch_errors_building_intent(rules_with_context, mathml) { |
25 | 2.45k | Ok(intent) => return Ok(intent), |
26 | 19 | Err(e) => { |
27 | // lookup what we should do for error recovery | |
28 | 19 | let intent_preference = rules_with_context.get_rules().pref_manager.borrow().pref_to_string("IntentErrorRecovery"); |
29 | 19 | if intent_preference == "Error" { |
30 | 9 | return Err(e); |
31 | } else { | |
32 | 10 | let saved_intent_attr = mathml.attribute_value(INTENT_ATTR).unwrap(); |
33 | 10 | mathml.remove_attribute(INTENT_ATTR); |
34 | // can't call intent_from_mathml() because we have already borrowed_mut -- we call a more internal version | |
35 | 10 | let intent_tree = match rules_with_context.match_pattern::<Element<'m>>(mathml) |
36 | 10 | .context("Pattern match/replacement failure!") { |
37 | 0 | Err(e) => Err(e), |
38 | 10 | Ok(intent) => { |
39 | 10 | intent.set_attribute_value(INTENT_ATTR, saved_intent_attr); // so attr can be potentially be viewed later |
40 | 10 | Ok(intent) |
41 | }, | |
42 | }; | |
43 | 10 | mathml.set_attribute_value(INTENT_ATTR, saved_intent_attr); |
44 | 10 | return intent_tree; |
45 | } | |
46 | } | |
47 | } | |
48 | ||
49 | 2.47k | fn catch_errors_building_intent<'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
50 | 2.47k | if let Some(intent_str) = mathml.attribute_value(INTENT_ATTR) { |
51 | // debug!("Before intent: {}", crate::pretty_print::mml_to_string(mathml)); | |
52 | 2.47k | let mut lex_state = LexState::init(intent_str.trim()) |
53 | 2.47k | let mut intent_offset = 0; |
54 | 2.47k | let |
55 | 2.47k | .with_context(|| |
56 | 2.46k | if lex_state.token != Token::None { |
57 | 5 | bail!("Error in intent value: extra unparsed intent '{}' in intent attribute value '{}'", lex_state.remaining_str, intent_str); |
58 | 2.45k | } |
59 | 2.45k | assert!(lex_state.remaining_str.is_empty()); |
60 | // debug!("Resulting intent:\n{}", crate::pretty_print::mml_to_string(result)); | |
61 | 2.45k | return Ok(result); |
62 | 0 | } |
63 | 0 | bail!("Internal error: infer_intent() called on MathML with no intent arg:\n{}", mml_to_string(mathml)); |
64 | 2.47k | } |
65 | 2.47k | } |
66 | ||
67 | ||
68 | static FIXITIES: phf::Set<&str> = phf_set! { | |
69 | "function", "infix", "prefix", "postfix", "silent", "other", | |
70 | }; | |
71 | ||
72 | /// Eliminate all but the last fixity property | |
73 | 7.63k | pub fn simplify_fixity_properties(properties: &str) -> String { |
74 | 7.63k | let parts: Vec<&str> = properties.split(':').collect(); |
75 | // debug!("simplify_fixity_properties {} parts from input: '{}'", parts.len(), properties); | |
76 | 7.63k | let mut fixity_property = ""; |
77 | 7.63k | let mut answer = ":".to_string(); |
78 | 19.2k | for part in |
79 | 19.2k | if FIXITIES.contains(part) { |
80 | 1.12k | fixity_property = part; |
81 | 18.1k | } else if !part.is_empty() { |
82 | 4.71k | answer.push_str(part); |
83 | 4.71k | answer.push(':'); |
84 | 13.4k | } |
85 | } | |
86 | 7.63k | if !fixity_property.is_empty() { |
87 | 1.12k | answer.push_str(fixity_property); |
88 | 1.12k | answer.push(':'); |
89 | 6.51k | } |
90 | 7.63k | return answer; |
91 | 7.63k | } |
92 | ||
93 | /// Given the intent add the fixity property for the intent if it isn't given (and one exists) | |
94 | 2.72k | fn add_fixity(intent: Element) { |
95 | 2.72k | let properties = intent.attribute_value(INTENT_PROPERTY).unwrap_or_default(); |
96 | 7.47k | if |
97 | 2.63k | let intent_name = name(intent); |
98 | 2.63k | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
99 | 2.63k | let definitions = definitions.borrow(); |
100 | 2.63k | if let Some( |
101 | 12 | let Some((fixity, _)) = definition.split_once("=") { |
102 | 12 | let new_properties = (if properties.is_empty() {":"} else { |
103 | 12 | intent.set_attribute_value(INTENT_PROPERTY, &new_properties); |
104 | // debug!("Added fixity: new value '{}'", intent.attribute_value(INTENT_PROPERTY).unwrap()); | |
105 | 2.62k | }; |
106 | 2.63k | }); |
107 | 90 | } |
108 | 2.72k | } |
109 | ||
110 | ||
111 | /// Given some MathML, expand out any intents taking into account their fixity property | |
112 | /// This is recursive | |
113 | 363 | pub fn add_fixity_children(intent: Element) -> Element { |
114 | 363 | let children = intent.children(); |
115 | 363 | if children.is_empty() || (children.len() == 1 && children[0].element().is_none()) { |
116 | 0 | return intent; |
117 | 363 | } |
118 | ||
119 | 363 | for child in children { |
120 | 363 | let child = as_element(child); |
121 | 363 | if child.attribute_value(INTENT_ATTR).is_some() { |
122 | 0 | add_fixity_child(child); |
123 | 363 | } |
124 | } | |
125 | 363 | return intent; |
126 | ||
127 | 0 | fn add_fixity_child(mathml: Element) -> Element { |
128 | 0 | let mut children = mathml.children(); |
129 | 0 | if children.is_empty() { |
130 | 0 | return mathml; |
131 | 0 | } |
132 | // we also exclude fixity on mtable because they mess up the counts (see 'en::mtable::unknown_mtable_property') | |
133 | 0 | if mathml.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or_default() == "mtable" { |
134 | 0 | return mathml; |
135 | 0 | } |
136 | 0 | let doc = mathml.document(); |
137 | 0 | let properties = mathml.attribute_value(INTENT_PROPERTY).unwrap_or_default(); |
138 | 0 | let fixity = properties.rsplit(':').find(|&property| FIXITIES.contains(property)).unwrap_or_default(); |
139 | 0 | let intent_name = name(mathml); |
140 | ||
141 | 0 | let op_name_id = mathml.attribute_value("id").unwrap_or("new-id"); |
142 | 0 | match fixity { |
143 | 0 | "infix" => { |
144 | 0 | let mut new_children = Vec::with_capacity(2*children.len()-1); |
145 | 0 | new_children.push(children[0]); |
146 | 0 | for (i, &child) in children.iter().enumerate().skip(1) { |
147 | 0 | new_children.push(create_operator_element(intent_name, fixity, op_name_id, i, &doc)); |
148 | 0 | new_children.push(child); |
149 | 0 | } |
150 | 0 | mathml.replace_children(new_children); |
151 | }, | |
152 | 0 | "prefix" => { |
153 | 0 | children.insert(0, create_operator_element(intent_name, fixity, op_name_id, 1, &doc)); |
154 | 0 | mathml.replace_children(children); |
155 | 0 | }, |
156 | 0 | "postfix" => { |
157 | 0 | children.push( create_operator_element(intent_name, fixity, op_name_id, 1, &doc)); |
158 | 0 | mathml.replace_children(children); |
159 | 0 | }, |
160 | 0 | "silent" => { |
161 | 0 | // children remain the same -- nothing to do |
162 | 0 | }, |
163 | 0 | "other" => { |
164 | 0 | // a special case -- will be handled with specific rules (e.g., intervals need to add "from" and "to", not a single word) |
165 | 0 | }, |
166 | _ => { // "function" is the default | |
167 | // build a function like notation function-name U+2061 <mrow> children </mrow> | |
168 | 0 | let mut new_children = Vec::with_capacity(3); |
169 | 0 | let function_name = create_operator_element(intent_name, "function", op_name_id, 1, &doc); |
170 | 0 | new_children.push(function_name); |
171 | 0 | let invisible_apply_function = create_operator_element("mo", "infix", op_name_id, 2, &doc); |
172 | 0 | invisible_apply_function.element().unwrap().set_text("\u{2061}"); |
173 | 0 | new_children.push(invisible_apply_function); |
174 | 0 | let mrow_wrapper = create_mathml_element(&doc, "mrow"); |
175 | 0 | mrow_wrapper.set_attribute_value("id", (op_name_id.to_string() + "3").as_str()); |
176 | 0 | mrow_wrapper.append_children(children); |
177 | 0 | new_children.push(ChildOfElement::Element(mrow_wrapper)); |
178 | 0 | mathml.replace_children(new_children); |
179 | 0 | if fixity.is_empty() { |
180 | 0 | mathml.set_attribute_value(INTENT_PROPERTY, ":function:"); |
181 | 0 | } |
182 | }, | |
183 | } | |
184 | 0 | return mathml; |
185 | ||
186 | 0 | fn create_operator_element<'a>(intent_name: &str, fixity: &str, id: &str, id_inc: usize, doc: &Document<'a>) -> ChildOfElement<'a> { |
187 | 0 | let intent_name = intent_speech_for_name(intent_name, &PreferenceManager::get().borrow().pref_to_string("NavMode"), fixity); |
188 | 0 | let element = create_mathml_element(doc, &intent_name); |
189 | 0 | element.set_attribute_value("id", &format!("{id}-fixity-{id_inc}")); |
190 | 0 | element.set_attribute_value(MATHML_FROM_NAME_ATTR, "mo"); |
191 | 0 | return ChildOfElement::Element(element); |
192 | 0 | } |
193 | 0 | } |
194 | 363 | } |
195 | ||
196 | 340 | pub fn intent_speech_for_name(intent_name: &str, verbosity: &str, fixity: &str) -> String { |
197 | 340 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
198 | 340 | let definitions = definitions.borrow(); |
199 | 340 | if let Some( |
200 | // Split the pattern is: | |
201 | // fixity-def [|| fixity-def]* | |
202 | // fixity-def := fixity=[open;] verbosity[; close] | |
203 | // verbosity := terse | medium | verbose | |
204 | 396 | if let Some( |
205 | 294 | let (_, matched_intent) = matched_intent.split_once("=").unwrap_or_default(); |
206 | 294 | let parts = matched_intent.trim().split(";").collect::<Vec<&str>>(); |
207 | 294 | let mut operator_names = (if parts.len() > 1 { |
208 | 294 | match operator_names.len() { |
209 | 236 | 1 => return operator_names[0].trim().to_string(), |
210 | 2 | 3 => { | |
211 | 58 | if operator_names.len() == 2 { |
212 | 0 | warn!("Intent '{intent_name}' has only two operator names, but should have three"); |
213 | 0 | operator_names.push(operator_names[1]); |
214 | 58 | } |
215 | 58 | let intent_word = match verbosity { |
216 | 58 | "Terse" => |
217 | 56 | "Medium" => |
218 | 2 | _ => operator_names[2], |
219 | }; | |
220 | 58 | return intent_word.trim().to_string(); |
221 | }, | |
222 | _ => { | |
223 | 0 | error!("Intent '{}' has too many ({}) operator names, should only have 2", intent_name, operator_names.len()); |
224 | 0 | return intent_name.to_string(); |
225 | }, | |
226 | } | |
227 | 0 | } |
228 | 46 | }; |
229 | 46 | return intent_name.replace(['_', '-'], " ").trim().to_string(); |
230 | 340 | }) |
231 | 340 | } |
232 | ||
233 | ||
234 | ||
235 | // intent := self-property-list | expression | |
236 | // self-property-list := property+ S | |
237 | // expression := S ( term property* | application ) S | |
238 | // term := concept-or-literal | number | reference | |
239 | // concept-or-literal := NCName | |
240 | // number := '-'? \d+ ( '.' \d+ )? | |
241 | // reference := '$' NCName | |
242 | // application := expression '(' arguments? S ')' | |
243 | // arguments := expression ( ',' expression )* | |
244 | // property := S ':' NCName | |
245 | // S := [ \t\n\r]* | |
246 | ||
247 | // The practical restrictions of NCName are that it cannot contain several symbol characters like | |
248 | // !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters | |
249 | // Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. | |
250 | // NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated | |
251 | // We follow NC_NAME for the basic latin block, but then allow everything | |
252 | 2 | static CONCEPT_OR_LITERAL: LazyLock<Regex> = LazyLock::new(|| { |
253 | 2 | Regex::new(r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler |
254 | 2 | ).unwrap() |
255 | 2 | }); |
256 | 2 | static PROPERTY: LazyLock<Regex> = LazyLock::new(|| { |
257 | 2 | Regex::new(r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME |
258 | 2 | ).unwrap() |
259 | 2 | }); |
260 | 2 | static ARG_REF: LazyLock<Regex> = LazyLock::new(|| { |
261 | 2 | Regex::new(r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME |
262 | 2 | ).unwrap() |
263 | 2 | }); |
264 | 2 | static NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap()); |
265 | ||
266 | static TERMINALS_AS_U8: [u8; 3] = [b'(', b',', b')']; | |
267 | // static TERMINALS: [char; 3] = ['(', ',',')']; | |
268 | ||
269 | // 'i -- "i" for the lifetime of the INTENT_ATTR string | |
270 | #[derive(Debug, PartialEq, Eq, Clone)] | |
271 | enum Token<'i> { | |
272 | Terminal(&'i str), // "(", ",", ")" | |
273 | Property(&'i str), | |
274 | ArgRef(&'i str), | |
275 | ConceptOrLiteral(&'i str), | |
276 | Number(&'i str), | |
277 | None, // out of characters | |
278 | } | |
279 | ||
280 | impl fmt::Display for Token<'_> { | |
281 | 3 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
282 | 3 | return write!(f, "{}", |
283 | 3 | match self { |
284 | 3 | Token::Terminal(str) => format!("Terminal('{str}')"), |
285 | 0 | Token::Property(str) => format!("Property({str})"), |
286 | 0 | Token::ArgRef(str) => format!("ArgRef({str})"), |
287 | 0 | Token::ConceptOrLiteral(str) => format!("Literal({str})"), |
288 | 0 | Token::Number(str) => format!("Number({str})"), |
289 | 0 | Token::None => "None".to_string(), |
290 | } | |
291 | ); | |
292 | 3 | } |
293 | } | |
294 | ||
295 | impl Token<'_> { | |
296 | 3.64k | fn is_terminal(&self, terminal: &str) -> bool { |
297 | 3.64k | if let Token::Terminal( |
298 | 1.02k | return value == terminal; |
299 | } else { | |
300 | 2.61k | return false; |
301 | } | |
302 | 3.64k | } |
303 | ||
304 | 5.21k | fn as_str(&self) -> &str { |
305 | 5.21k | return match self { |
306 | 0 | Token::Terminal(str) => str, |
307 | 4.79k | Token::Property(str) => str, |
308 | 226 | Token::ArgRef(str) => str, |
309 | 161 | Token::ConceptOrLiteral(str) => str, |
310 | 29 | Token::Number(str) => str, |
311 | 0 | Token::None => "", |
312 | } | |
313 | 5.21k | } |
314 | } | |
315 | ||
316 | struct LexState<'i> { | |
317 | token: Token<'i>, | |
318 | remaining_str: &'i str, // always trimmed | |
319 | } | |
320 | ||
321 | impl fmt::Display for LexState<'_> { | |
322 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
323 | 0 | return writeln!(f, "token: {}, remaining: '{}'", self.token, self.remaining_str); |
324 | 0 | } |
325 | } | |
326 | ||
327 | impl<'i> LexState<'i> { | |
328 | 2.50k | fn init(str: &'i str) -> Result<LexState<'i>> { |
329 | 2.50k | let mut lex_state = LexState { token: Token::None, remaining_str: str.trim() }; |
330 | 2.50k | lex_state.get_next() |
331 | 2.50k | return Ok(lex_state); |
332 | 2.50k | } |
333 | ||
334 | // helper function for LexState -- do not call outside of the impl | |
335 | 2.82k | fn set_token(&mut self, str: &'i str) -> Result<()> { |
336 | // Note: 'str' is already trimmed | |
337 | 2.82k | if str.is_empty() { |
338 | 0 | self.token = Token::None; |
339 | 2.82k | } else if TERMINALS_AS_U8.contains(&str.as_bytes()[0]) { |
340 | 0 | self.token = Token::Terminal(str); |
341 | 2.82k | } else if let Some( |
342 | 2.40k | self.token = Token::Property(matched_property.as_str()); |
343 | 2.40k | } else if let Some( |
344 | 226 | self.token = Token::ArgRef(matched_arg_ref.as_str()); |
345 | 226 | } else if let Some( |
346 | 161 | self.token = Token::ConceptOrLiteral(matched_literal.as_str()); |
347 | 161 | } else if let Some( |
348 | 29 | self.token = Token::Number(matched_number.as_str()); |
349 | 29 | } else { |
350 | 0 | bail!("Illegal 'intent' syntax: {}", str); |
351 | } | |
352 | 2.82k | return Ok( () ); |
353 | 2.82k | } |
354 | ||
355 | 5.69k | fn get_next(&mut self) -> Result<&Token<'_>> { |
356 | 5.69k | if self.remaining_str.is_empty() { |
357 | 2.48k | self.token = Token::None; |
358 | 3.21k | } else if TERMINALS_AS_U8.contains(&self.remaining_str.as_bytes()[0]) { |
359 | 391 | self.token = Token::Terminal(&self.remaining_str[..1]); |
360 | 391 | self.remaining_str = self.remaining_str[1..].trim_start(); |
361 | 391 | } else { |
362 | 2.82k | self.set_token(self.remaining_str) |
363 | 2.82k | self.remaining_str = self.remaining_str[self.token.as_str().len()..].trim_start(); |
364 | } | |
365 | 5.69k | return Ok(&self.token); |
366 | 5.69k | } |
367 | ||
368 | 3.64k | fn is_terminal(&self, terminal: &str) -> bool { |
369 | 3.64k | return self.token.is_terminal(terminal); |
370 | 3.64k | } |
371 | } | |
372 | ||
373 | 2.74k | fn build_intent<'b, 'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
374 | 2.74k | lex_state: &mut LexState<'b>, |
375 | 2.74k | mathml: Element<'c>, |
376 | 2.74k | intent_offset: &mut u32) -> Result<Element<'m>> { |
377 | // intent := self-property-list | expression | |
378 | // self-property-list := property+ S | |
379 | // expression := S ( term property* | application ) S | |
380 | // term := concept-or-literal | number | reference | |
381 | // concept-or-literal := NCName | |
382 | // number := '-'? \d+ ( '.' \d+ )? | |
383 | // reference := '$' NCName | |
384 | // application := expression '(' arguments? S ')' | |
385 | // | |
386 | // When we flatten intent we have this implementation looking for Tokens or '(' [for application] | |
387 | // Essentially, the grammar we deal with here is: | |
388 | // intent := property+ | (concept-or-literal | number | reference) property* '('? | |
389 | // debug!(" start build_intent: state: {}", lex_state); | |
390 | 2.74k | let doc = rules_with_context.get_document(); |
391 | let mut intent; | |
392 | 2.74k | debug!(" build_intent: start mathml name={}, intent_offset={}", |
393 | 2.74k | match lex_state.token { |
394 | Token::Property(_) => { | |
395 | // We only have a property -- we want to keep this tag/element | |
396 | // There are two paths: | |
397 | // 1. If there is a function call, then the children are dealt with there | |
398 | // 2. If there is *no* function call, then the children are kept, which means we return to pattern matching | |
399 | // Note: to avoid infinite loop, we need to remove the 'intent' so we don't end up back here; we put it back later | |
400 | 2.33k | let properties = get_properties(lex_state) |
401 | 2.33k | if lex_state.is_terminal("(") { |
402 | 2 | intent = create_mathml_element(&doc, name(mathml)); |
403 | 2 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
404 | 2 | intent.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
405 | 2 | intent.set_attribute_value("id", mathml.attribute_value("id") |
406 | 2 | .ok_or_else(|| |
407 | } else { | |
408 | 2.32k | let saved_intent = mathml.attribute_value(INTENT_ATTR).unwrap(); |
409 | 2.32k | mathml.remove_attribute(INTENT_ATTR); |
410 | 2.32k | mathml.set_attribute_value(INTENT_PROPERTY, &properties); // needs to be set before the pattern match |
411 | 2.32k | intent = rules_with_context.match_pattern::<Element<'m>>(mathml) |
412 | // debug!("Intent after pattern match:\n{}", mml_to_string(intent)); | |
413 | 2.32k | mathml.set_attribute_value(INTENT_ATTR, saved_intent); |
414 | } | |
415 | 2.33k | add_fixity(intent); |
416 | 2.33k | return Ok(intent); // if we start with properties, then there can only be properties |
417 | }, | |
418 | 161 | Token::ConceptOrLiteral(word) | Token::Number( |
419 | 189 | let leaf_name = if let Token::Number(_) = lex_state.token { |
420 | 189 | intent = create_mathml_element(&doc, leaf_name); |
421 | // if the str is part of a larger intent and not the head (e.g., "a" in "f($x, a)", but not the "f" in it), then it is "made up" | |
422 | // debug!(" Token::ConceptOrLiteral, word={}, leaf_name={}", word, leaf_name); | |
423 | 189 | intent.set_attribute_value(MATHML_FROM_NAME_ATTR, |
424 | 189 | if word == mathml.attribute_value(INTENT_ATTR).unwrap_or_default() { |
425 | 189 | intent.set_text(word); // '-' and '_' get removed by the rules. |
426 | 189 | if let Some( |
427 | 136 | intent.set_attribute_value("id", &format!("{}-literal-{}", id, intent_offset)); |
428 | 136 | *intent_offset += 1; |
429 | 136 |
|
430 | 189 | lex_state.get_next() |
431 | 189 | if let Token::Property(_) = lex_state.token { |
432 | 60 | let properties = get_properties(lex_state) |
433 | 60 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
434 | 129 | } |
435 | }, | |
436 | 223 | Token::ArgRef(word) => { |
437 | 223 | intent = match find_arg(rules_with_context, &word[1..], mathml, intent_offset, true, false) |
438 | 221 | Some(e) => { |
439 | 221 | lex_state.get_next() |
440 | 221 | e |
441 | }, | |
442 | 1 | None => bail!("intent arg '{}' not found", word), |
443 | }; | |
444 | 221 | if let Token::Property(_) = lex_state.token { |
445 | 3 | let properties = get_properties(lex_state) |
446 | 3 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
447 | 218 | } |
448 | }, | |
449 | 3 | _ => bail!("Illegal 'intent' syntax: found {}", lex_state.token), |
450 | }; | |
451 | 410 | if lex_state.is_terminal("(") { |
452 | 136 | intent = build_function(intent, rules_with_context, lex_state, mathml, intent_offset) |
453 | 274 | } |
454 | // debug!(" end build_intent: state: {} piece: {}", lex_state, mml_to_string(intent)); | |
455 | 395 | add_fixity(intent); |
456 | 395 | return Ok(intent); |
457 | 2.74k | } |
458 | ||
459 | pub const INTENT_PROPERTY: &str = "data-intent-property"; | |
460 | ||
461 | /// Get all the properties, stopping we don't have any more | |
462 | /// Returns the string of the properties terminated with an additional ":" | |
463 | 2.39k | fn get_properties(lex_state: &mut LexState) -> Result<String> { |
464 | // return the 'hint' leaving the state | |
465 | 2.39k | assert!(matches!(lex_state.token, Token::Property(str) if str.starts_with(':'))); |
466 | 2.39k | let mut properties = String::with_capacity(60); |
467 | 2.39k | properties.push_str(lex_state.token.as_str()); |
468 | loop { | |
469 | 2.40k | let token = lex_state.get_next() |
470 | 2.40k | if let Token::Property( |
471 | 11 | properties.push_str(property); |
472 | 11 | } else { |
473 | 2.39k | properties.push(':'); |
474 | // debug!(" get_properties: returns {}", properties); | |
475 | 2.39k | return Ok(simplify_fixity_properties(&properties)); |
476 | } | |
477 | } | |
478 | 2.39k | } |
479 | ||
480 | /// Build a function 'f(...)' where '...' can be empty | |
481 | /// | |
482 | /// Also handles nested functions like f(...)(...) | |
483 | /// | |
484 | /// Start state: at '(' | |
485 | /// | |
486 | /// End state: after ')' | |
487 | 136 | fn build_function<'b, 'r, 'c, 's:'c, 'm:'c>( |
488 | 136 | function_name: Element<'m>, |
489 | 136 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
490 | 136 | lex_state: &mut LexState<'b>, |
491 | 136 | mathml: Element<'c>, |
492 | 136 | intent_offset: &mut u32) -> Result<Element<'m>> { |
493 | // debug!(" start build_function: name: {}, state: {}", name(function_name), lex_state); | |
494 | // application := intent '(' arguments? S ')' where 'function_name' is 'intent' | |
495 | 136 | assert!(lex_state.is_terminal("(")); |
496 | 136 | let mut function = function_name; |
497 | 136 | function.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
498 | 260 | while lex_state.is_terminal("(") { |
499 | 139 | lex_state.get_next() |
500 | 139 | if lex_state.is_terminal(")") { |
501 | // grammar requires at least one argument | |
502 | 9 | bail!("Illegal 'intent' syntax: missing argument for intent name '{}'", name(function_name)); |
503 | 130 | } |
504 | 130 | let |
505 | 125 | function = lift_function_name(rules_with_context.get_document(), function, children); |
506 | ||
507 | 125 | if !lex_state.is_terminal(")") { |
508 | 1 | bail!("Illegal 'intent' syntax: missing ')' for intent name '{}'", name(function_name)); |
509 | 124 | } |
510 | 124 | lex_state.get_next() |
511 | } | |
512 | ||
513 | // debug!(" end build_function/# children: {}, #state: {} ..[bfa] function name: {}", | |
514 | // function.children().len(), lex_state, mml_to_string(function)); | |
515 | 121 | return Ok(function); |
516 | 136 | } |
517 | ||
518 | // process all the args of a function | |
519 | // Start state: after '(' | |
520 | // End state: on ')' | |
521 | 130 | fn build_arguments<'b, 'r, 'c, 's:'c, 'm:'c>( |
522 | 130 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
523 | 130 | lex_state: &mut LexState<'b>, |
524 | 130 | mathml: Element<'c>, |
525 | 130 | intent_offset: &mut u32) -> Result<Vec<Element<'m>>> { |
526 | // arguments := intent ( ',' intent )*' | |
527 | // debug!(" start build_args state: {}", lex_state); | |
528 | ||
529 | // there is at least one arg | |
530 | 130 | let mut children = Vec::with_capacity(lex_state.remaining_str.len()/3 + 1); // conservative estimate ('3' - "$x,"); |
531 | 130 | children. |
532 | // debug!(" build_args: # children {}; state: {}", children.len(), lex_state); | |
533 | ||
534 | 239 | while lex_state.is_terminal(",") { |
535 | 114 | lex_state.get_next() |
536 | 114 | children. |
537 | // debug!(" build_args, # children {}; state: {}", children.len(), lex_state); | |
538 | } | |
539 | ||
540 | // debug!(" end build_args, # children {}; state: {}", children.len(), lex_state); | |
541 | 125 | return Ok(children); |
542 | 130 | } |
543 | ||
544 | /// lift the children up to LITERAL_NAME | |
545 | 125 | fn lift_function_name<'m>(doc: Document<'m>, function_name: Element<'m>, children: Vec<Element<'m>>) -> Element<'m> { |
546 | // debug!(" lift_function_name: {}", name(function_name)); | |
547 | // debug!(" lift_function_name: {}", mml_to_string(function_name)); | |
548 | 125 | if name(function_name) == "mi" || |
549 | // simple/normal case of f(x,y) | |
550 | // don't want to say that this is a leaf -- doing so messes up because it potentially has children | |
551 | 121 | set_mathml_name(function_name, as_text(function_name)); |
552 | 121 | function_name.set_text(""); |
553 | 121 | function_name.replace_children(children); |
554 | 129 | if |
555 | 14 | let properties = function_name.attribute_value(INTENT_PROPERTY).unwrap_or(":").to_owned(); |
556 | 14 | function_name.set_attribute_value(INTENT_PROPERTY, &(properties + "silent:")); |
557 | 107 | } |
558 | 121 | return function_name; |
559 | 4 | } else if function_name.children().is_empty() { |
560 | // "... :property(...)" -- no function name | |
561 | 0 | function_name.replace_children(children); |
562 | 0 | return function_name; |
563 | } else { | |
564 | // more complicated case of nested name: f(x)(y,z) | |
565 | // create an apply_function(f(x), y, z) | |
566 | 4 | let result = create_mathml_element(&doc, IMPLICIT_FUNCTION_NAME); |
567 | 4 | result.set_attribute_value(MATHML_FROM_NAME_ATTR, "mrow"); |
568 | 4 | result.append_child(function_name); |
569 | 4 | result.append_children(children); |
570 | 4 | return result; |
571 | } | |
572 | 125 | } |
573 | ||
574 | ||
575 | /// look for @arg=name in mathml | |
576 | /// if 'check_intent', then look at an @intent for this element (typically false for non-recursive calls) | |
577 | 946 | fn find_arg<'r, 'c, 's:'c, 'm:'c>( |
578 | 946 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
579 | 946 | name: &str, |
580 | 946 | mathml: Element<'c>, |
581 | 946 | intent_offset: &mut u32, |
582 | 946 | skip_self: bool, |
583 | 946 | no_check_inside: bool) -> Result<Option<Element<'m>>> { |
584 | // debug!("Looking for '{}' in\n{}", name, mml_to_string(mathml)); | |
585 | 946 | if !skip_self && |
586 | 723 | let Some( |
587 | // debug!("looking for '{}', found arg='{}'", name, arg_val); | |
588 | 411 | if name == arg_val { |
589 | // check to see if this mathml has an intent value -- if so the value is the value of its intent value | |
590 | 222 | if let Some( |
591 | 28 | let mut lex_state = LexState::init(intent_str.trim()) |
592 | 28 | return Ok( Some( build_intent(rules_with_context, &mut lex_state, mathml, intent_offset) |
593 | } else { | |
594 | 194 | return Ok( Some( rules_with_context.match_pattern::<Element<'m>>(mathml) |
595 | } | |
596 | 189 | } else if no_check_inside { |
597 | 189 | return Ok(None); // don't look inside 'arg' |
598 | 0 | } |
599 | 535 | } |
600 | ||
601 | 535 | if no_check_inside && |
602 | 2 | return Ok(None); // don't look inside 'intent' |
603 | 533 | } |
604 | ||
605 | 533 | if is_leaf(mathml){ |
606 | 121 | return Ok(None); |
607 | 412 | } |
608 | ||
609 | 723 | for child in |
610 | 723 | let child = as_element(child); |
611 | 723 | if let Some( |
612 | 396 | return Ok( Some(element) ); |
613 | 326 | } |
614 | } | |
615 | ||
616 | 15 | return Ok(None); // not present |
617 | 946 | } |
618 | ||
619 | #[cfg(test)] | |
620 | mod tests { | |
621 | #[allow(unused_imports)] | |
622 | use crate::init_logger; | |
623 | use log::debug; | |
624 | use sxd_document::parser; | |
625 | ||
626 | ||
627 | 27 | fn test_intent(mathml: &str, target: &str, intent_error_recovery: &str) -> bool { |
628 | use crate::interface::*; | |
629 | use crate::pretty_print::mml_to_string; | |
630 | // this forces initialization | |
631 | 27 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
632 | // crate::speech::SpeechRules::initialize_all_rules().unwrap(); | |
633 | 27 | set_preference("IntentErrorRecovery", intent_error_recovery).unwrap(); |
634 | 27 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); // avoids possibility of "LiteralSpeak" |
635 | 27 | let package1 = &parser::parse(mathml).expect("Failed to parse test input"); |
636 | 27 | let mathml = get_element(package1); |
637 | 27 | trim_element(mathml, false); |
638 | 27 | debug!("test:\n{}", |
639 | ||
640 | 27 | let package2 = &parser::parse(target).expect("Failed to parse target input"); |
641 | 27 | let target = get_element(package2); |
642 | 27 | trim_element(target,true); |
643 | 27 | debug!("target:\n{}", |
644 | ||
645 | 27 | let |
646 | 18 | Ok(e) => e, |
647 | 9 | Err(e) => { |
648 | 9 | debug!("{}", |
649 | 9 | return false; // could be intentional failure |
650 | } | |
651 | }; | |
652 | 18 | debug!("result:\n{}", |
653 | 18 | match is_same_element(result, target, &[]) { |
654 | 18 | Ok(_) => return true, |
655 | 0 | Err(e) => panic!("{}:\nresult: {}target: {}", e, mml_to_string(result), mml_to_string(target)), |
656 | } | |
657 | 27 | } |
658 | ||
659 | #[test] | |
660 | 1 | fn infer_binomial() { |
661 | 1 | let mathml = "<mrow intent='binomial($n, $m)'> |
662 | 1 | <mo>(</mo> |
663 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
664 | 1 | <mo>)</mo> |
665 | 1 | </mrow>"; |
666 | 1 | let intent = "<binomial data-from-mathml='mrow' data-intent-property=':infix:'> <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </binomial>"; |
667 | 1 | assert!(test_intent(mathml, intent, "Error")); |
668 | 1 | } |
669 | ||
670 | #[test] | |
671 | 1 | fn infer_binomial_intent_arg() { |
672 | 1 | let mathml = "<msubsup intent='$op($n,$m)'> |
673 | 1 | <mi arg='op' intent='binomial'>C</mi> |
674 | 1 | <mi arg='n'>n</mi> |
675 | 1 | <mi arg='m'>m</mi> |
676 | 1 | </msubsup>"; |
677 | 1 | let intent = "<binomial data-from-mathml='msubsup' data-intent-property=':infix:'> <mi data-from-mathml='mi' arg='n'>n</mi> <mi data-from-mathml='mi' arg='m'>m</mi></binomial>"; |
678 | 1 | assert!(test_intent(mathml, intent, "Error")); |
679 | 1 | } |
680 | ||
681 | #[test] | |
682 | 1 | fn silent_underscore() { |
683 | 1 | let mathml = "<mrow><mi intent='__-'>silent</mi><mo>+</mo><mi>e</mi></mrow>"; |
684 | 1 | let intent = "<mrow data-from-mathml='mrow'> |
685 | 1 | <mi data-from-mathml='mi'>__-</mi> |
686 | 1 | <mo data-from-mathml='mo'>+</mo> |
687 | 1 | <mi data-from-mathml='mi'>e</mi> |
688 | 1 | </mrow>"; |
689 | 1 | assert!(test_intent(mathml, intent, "Error")); |
690 | 1 | } |
691 | ||
692 | ||
693 | #[test] | |
694 | 1 | fn silent_underscore_function() { |
695 | 1 | let mathml = "<mrow intent='__-_(speak, this)'></mrow>"; |
696 | 1 | let intent = "<__-_ data-from-mathml='mrow' data-intent-property=':silent:'> |
697 | 1 | <mi data-from-mathml='mi'>speak</mi> |
698 | 1 | <mi data-from-mathml='mi'>this</mi> |
699 | 1 | </__-_>"; |
700 | 1 | assert!(test_intent(mathml, intent, "Error")); |
701 | 1 | } |
702 | ||
703 | #[test] | |
704 | 1 | fn intent_multiple_properties() { |
705 | 1 | let mathml = "<mrow intent='foo:silent:int(bar:positive-int:int, $a:foo:bar:foo-bar, $b:number)'> |
706 | 1 | <mi arg='a'>a</mi> |
707 | 1 | <mo arg='p' intent='plus'>+</mo> |
708 | 1 | <mi arg='b' intent=':negative-int:int'>b</mi> |
709 | 1 | </mrow>"; |
710 | 1 | let intent = "<foo data-intent-property=':int:silent:' data-from-mathml='mrow'> |
711 | 1 | <mi data-from-mathml='mi' data-intent-property=':positive-int:int:'>bar</mi> |
712 | 1 | <mi data-from-mathml='mi' arg='a' data-intent-property=':foo:bar:foo-bar:'>a</mi> |
713 | 1 | <mi data-from-mathml='mi' arg='b' data-intent-property=':number:'>b</mi> |
714 | 1 | </foo>"; |
715 | 1 | assert!(test_intent(mathml, intent, "Error")); |
716 | 1 | } |
717 | #[test] | |
718 | 1 | fn intent_nest_no_arg_call() { |
719 | 1 | let mathml = "<mrow intent='foo(bar())'> |
720 | 1 | <mi arg='a'>a</mi> |
721 | 1 | <mo arg='p' intent='plus'>+</mo> |
722 | 1 | <mi arg='b'>b</mi> |
723 | 1 | <mo arg='f' intent='factorial'>!</mo> |
724 | 1 | </mrow>"; |
725 | 1 | let intent = "<foo><bar></bar></foo>"; |
726 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
727 | 1 | } |
728 | ||
729 | #[test] | |
730 | 1 | fn intent_hints() { |
731 | 1 | let mathml = "<mrow intent='foo:silent(bar:postfix(3))'> |
732 | 1 | <mi arg='a'>a</mi> |
733 | 1 | <mo arg='p' intent='plus'>+</mo> |
734 | 1 | <mi arg='b'>b</mi> |
735 | 1 | <mo arg='f' intent='factorial'>!</mo> |
736 | 1 | </mrow>"; |
737 | 1 | let intent = "<foo data-intent-property=':silent:' data-from-mathml='mrow'> |
738 | 1 | <bar data-intent-property=':postfix:' data-from-mathml='mrow'> |
739 | 1 | <mn data-from-mathml='mn'>3</mn> |
740 | 1 | </bar> |
741 | 1 | </foo>"; |
742 | 1 | assert!(test_intent(mathml, intent, "Error")); |
743 | 1 | } |
744 | ||
745 | #[test] | |
746 | 1 | fn intent_hints_and_type() { |
747 | 1 | let mathml = "<mrow intent='foo:is-foolish:function($b)'> |
748 | 1 | <mi arg='a'>a</mi> |
749 | 1 | <mo arg='p' intent='plus'>+</mo> |
750 | 1 | <mi intent='b:int' arg='b'>b</mi> |
751 | 1 | <mo arg='f' intent='factorial'>!</mo> |
752 | 1 | </mrow>"; |
753 | 1 | let intent = "<foo data-intent-property=':is-foolish:function:' data-from-mathml='mrow'> |
754 | 1 | <mi data-intent-property=':int:' data-from-mathml='mi'>b</mi> |
755 | 1 | </foo>"; |
756 | 1 | assert!(test_intent(mathml, intent, "Error")); |
757 | 1 | } |
758 | ||
759 | #[test] | |
760 | 1 | fn intent_in_intent_first_arg() { |
761 | 1 | let mathml = "<mrow intent='p(f(b), a)'> |
762 | 1 | <mi arg='a'>a</mi> |
763 | 1 | <mo arg='p' intent='plus'>+</mo> |
764 | 1 | <mi arg='b'>b</mi> |
765 | 1 | <mo arg='f' intent='factorial'>!</mo> |
766 | 1 | </mrow>"; |
767 | 1 | let intent = "<p data-from-mathml='mrow'> |
768 | 1 | <f data-from-mathml='mrow'> |
769 | 1 | <mi data-from-mathml='mi'>b</mi> |
770 | 1 | </f> |
771 | 1 | <mi data-from-mathml='mi'>a</mi> |
772 | 1 | </p>"; |
773 | 1 | assert!(test_intent(mathml, intent, "Error")); |
774 | 1 | } |
775 | ||
776 | #[test] | |
777 | 1 | fn intent_in_intent_second_arg() { |
778 | 1 | let mathml = "<mrow intent='$p(a,$f(b))'> |
779 | 1 | <mi arg='a'>a</mi> |
780 | 1 | <mo arg='p' intent='plus'>+</mo> |
781 | 1 | <mi arg='b'>b</mi> |
782 | 1 | <mo arg='f' intent='factorial'>!</mo> |
783 | 1 | </mrow>"; |
784 | 1 | let intent = "<plus data-from-mathml='mrow' data-intent-property=':infix:'> |
785 | 1 | <mi data-from-mathml='mi'>a</mi> |
786 | 1 | <factorial data-from-mathml='mrow'> |
787 | 1 | <mi data-from-mathml='mi'>b</mi> |
788 | 1 | </factorial> |
789 | 1 | </plus>"; |
790 | 1 | assert!(test_intent(mathml, intent, "Error")); |
791 | 1 | } |
792 | ||
793 | #[test] | |
794 | 1 | fn intent_with_whitespace() { |
795 | 1 | let mathml = "<mrow intent=' $arrow ( $a , $b,$c ) '> |
796 | 1 | <mi arg='a'>A</mi> |
797 | 1 | <mover> |
798 | 1 | <mo movablelimits='false' arg='arrow' intent='map'>⟶</mo> |
799 | 1 | <mo arg='U2245' intent='congruence'>≅</mo> |
800 | 1 | </mover> |
801 | 1 | <mi arg='b'>B</mi> |
802 | 1 | <mi arg='c'>C</mi> |
803 | 1 | </mrow>"; |
804 | 1 | let intent = "<map data-from-mathml='mrow'> <mi data-from-mathml='mi' arg='a'>A</mi> <mi data-from-mathml='mi' arg='b'>B</mi> <mi data-from-mathml='mi' arg='c'>C</mi> </map>"; |
805 | 1 | assert!(test_intent(mathml, intent, "Error")); |
806 | 1 | } |
807 | ||
808 | #[test] | |
809 | 1 | fn intent_template_at_toplevel() { |
810 | 1 | let mathml = "<msup intent='$H $n'> |
811 | 1 | <mi arg='H' mathvariant='normal'>H</mi> |
812 | 1 | <mn arg='n'>2</mn> |
813 | 1 | </msup>"; |
814 | 1 | let intent = "<mrow><mi arg='H' mathvariant='normal'>H</mi><mn arg='n'>2</mn></mrow>"; |
815 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
816 | 1 | } |
817 | ||
818 | #[test] | |
819 | 1 | fn intent_with_nested_indirect_head() { |
820 | 1 | let mathml = "<mrow intent='$op($a,$b)'> |
821 | 1 | <mi arg='a'>A</mi> |
822 | 1 | <mover arg='op' intent='$ra($cong)'> |
823 | 1 | <mo movablelimits='false' arg='ra' intent='map'>⟶</mo> |
824 | 1 | <mo arg='cong' intent='congruence'>≅</mo> |
825 | 1 | </mover> |
826 | 1 | <mi arg='b'>B</mi> |
827 | 1 | </mrow>"; |
828 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
829 | 1 | <map data-from-mathml='mrow'> |
830 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
831 | 1 | </map> |
832 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
833 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
834 | 1 | </apply-function>"; |
835 | 1 | assert!(test_intent(mathml, intent, "Error")); |
836 | 1 | } |
837 | ||
838 | #[test] | |
839 | 1 | fn intent_with_literals() { |
840 | 1 | let mathml = "<mrow intent='vector(1, 0.0, 0.1, -23, -0.1234, last)'> |
841 | 1 | <mi>x</mi> |
842 | 1 | </mrow>"; |
843 | 1 | let intent = "<vector data-from-mathml='mrow' data-intent-property=':function:'> |
844 | 1 | <mn data-from-mathml='mn'>1</mn> |
845 | 1 | <mn data-from-mathml='mn'>0.0</mn> |
846 | 1 | <mn data-from-mathml='mn'>0.1</mn> |
847 | 1 | <mn data-from-mathml='mn'>-23</mn> |
848 | 1 | <mn data-from-mathml='mn'>-0.1234</mn> |
849 | 1 | <mi data-from-mathml='mi'>last</mi> |
850 | 1 | </vector>"; |
851 | 1 | assert!(test_intent(mathml, intent, "Error")); |
852 | 1 | } |
853 | ||
854 | #[test] | |
855 | 1 | fn intent_with_template_literals() { |
856 | 1 | let mathml = "<mrow intent='1 0.0 0.1 -23 -0.1234 last'> |
857 | 1 | <mi>x</mi> |
858 | 1 | </mrow>"; |
859 | 1 | let intent = "<mrow><mn>1</mn><mn>0.</mn><mn>.1</mn><mn>-23</mn><mn>-.1234</mn><mi>last</mi></mrow>"; |
860 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
861 | 1 | } |
862 | ||
863 | #[test] | |
864 | 1 | fn intent_with_nested_head() { |
865 | 1 | let mathml = "<mrow intent='$ra($cong)($a,$b)'> |
866 | 1 | <mi arg='a'>A</mi> |
867 | 1 | <mover> |
868 | 1 | <mo movablelimits='false' arg='ra' intent='map'>⟶</mo> |
869 | 1 | <mo arg='cong' intent='congruence'>≅</mo> |
870 | 1 | </mover> |
871 | 1 | <mi arg='b'>B</mi> |
872 | 1 | </mrow>"; |
873 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
874 | 1 | <map data-from-mathml='mrow'> |
875 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
876 | 1 | </map> |
877 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
878 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
879 | 1 | </apply-function>"; |
880 | 1 | assert!(test_intent(mathml, intent, "Error")); |
881 | 1 | } |
882 | ||
883 | ||
884 | #[test] | |
885 | 1 | fn intent_with_nested_head_and_hints() { |
886 | 1 | let mathml = "<mrow intent='pre:prefix(in:infix($a, x))(post:postfix($b))'> |
887 | 1 | <mi arg='a'>A</mi> |
888 | 1 | <mover> |
889 | 1 | <mo intent='map'>⟶</mo> |
890 | 1 | <mo intent='congruence'>≅</mo> |
891 | 1 | </mover> |
892 | 1 | <mi arg='b'>B</mi> |
893 | 1 | </mrow>"; |
894 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
895 | 1 | <pre data-intent-property=':prefix:' data-from-mathml='mrow'> |
896 | 1 | <in data-intent-property=':infix:' data-from-mathml='mrow'> |
897 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
898 | 1 | <mi data-from-mathml='mi'>x</mi> |
899 | 1 | </in> |
900 | 1 | </pre> |
901 | 1 | <post data-intent-property=':postfix:' data-from-mathml='mrow'> |
902 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
903 | 1 | </post> |
904 | 1 | </apply-function>"; |
905 | 1 | assert!(test_intent(mathml, intent, "Error")); |
906 | 1 | } |
907 | ||
908 | ||
909 | #[test] | |
910 | 1 | fn intent_double_indirect_head() { |
911 | 1 | let mathml = "<mrow intent='$m:prefix($c)($a,$b)'> |
912 | 1 | <mi arg='a'>A</mi> |
913 | 1 | <mover> |
914 | 1 | <mo movablelimits='false' arg='m' intent='map'>⟶</mo> |
915 | 1 | <mo arg='c' intent='congruence'>≅</mo> |
916 | 1 | </mover> |
917 | 1 | <mi arg='b'>B</mi> |
918 | 1 | </mrow>"; |
919 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
920 | 1 | <map data-intent-property=':prefix:' data-from-mathml='mrow'> |
921 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
922 | 1 | </map> |
923 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
924 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
925 | 1 | </apply-function>"; |
926 | 1 | assert!(test_intent(mathml, intent, "Error")); |
927 | 1 | } |
928 | ||
929 | #[test] | |
930 | 1 | fn intent_missing_open() { |
931 | 1 | let mathml = "<mrow intent='$p $a,$f($b))'> |
932 | 1 | <mi arg='a'>a</mi> |
933 | 1 | <mo arg='p' intent='plus'>+</mo> |
934 | 1 | <mi arg='b'>b</mi> |
935 | 1 | <mo arg='f' intent='factorial'>!</mo> |
936 | 1 | </mrow>"; |
937 | 1 | let intent = "<plus> <mi arg='a'>a</mi> <factorial><mi arg='b'>b</mi></factorial> </plus>"; |
938 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
939 | 1 | } |
940 | ||
941 | #[test] | |
942 | 1 | fn intent_no_comma() { |
943 | 1 | let mathml = "<mrow intent='$p($a $f($b))'> |
944 | 1 | <mi arg='a'>a</mi> |
945 | 1 | <mo arg='p' intent='plus'>+</mo> |
946 | 1 | <mi arg='b'>b</mi> |
947 | 1 | <mo arg='f' intent='factorial'>!</mo> |
948 | 1 | </mrow>"; |
949 | 1 | let intent = "<plus> |
950 | 1 | <mrow> |
951 | 1 | <mi arg='a'>a</mi> |
952 | 1 | <factorial> <mi arg='b'>b</mi> </factorial> |
953 | 1 | </mrow> |
954 | 1 | </plus>"; |
955 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
956 | 1 | } |
957 | ||
958 | #[test] | |
959 | 1 | fn intent_no_arg() { |
960 | 1 | let mathml = "<mrow intent='factorial()'> |
961 | 1 | <mi arg='a'>a</mi> |
962 | 1 | <mo arg='p' intent='plus'>+</mo> |
963 | 1 | <mi arg='b'>b</mi> |
964 | 1 | <mo arg='f' intent='factorial'>!</mo> |
965 | 1 | </mrow>"; |
966 | 1 | let target = "<factorial></factorial>"; |
967 | 1 | assert!(!test_intent(mathml, target, "Error")); |
968 | 1 | } |
969 | ||
970 | #[test] | |
971 | 1 | fn intent_illegal_no_arg() { |
972 | 1 | let mathml = "<mrow intent='factorial(()))'> |
973 | 1 | <mi arg='a'>a</mi> |
974 | 1 | <mo arg='p' intent='plus'>+</mo> |
975 | 1 | <mi arg='b'>b</mi> |
976 | 1 | <mo arg='f' intent='factorial'>!</mo> |
977 | 1 | </mrow>"; |
978 | 1 | let target = "<factorial></factorial>"; |
979 | 1 | assert!(!test_intent(mathml, target, "Error")); |
980 | 1 | } |
981 | ||
982 | #[test] | |
983 | 1 | fn intent_illegal_no_arg_ignore() { |
984 | 1 | let mathml = "<mrow intent='factorial()'> |
985 | 1 | <mi arg='a'>a</mi> |
986 | 1 | <mo arg='p' intent='plus'>+</mo> |
987 | 1 | <mi arg='b'>b</mi> |
988 | 1 | <mo arg='f' intent='factorial'>!</mo> |
989 | 1 | </mrow>"; |
990 | 1 | let target = "<mrow data-from-mathml='mrow' intent='factorial()'> |
991 | 1 | <mi data-from-mathml='mi' arg='a'>a</mi> |
992 | 1 | <mi data-from-mathml='mo'>plus</mi> |
993 | 1 | <mi data-from-mathml='mi' arg='b'>b</mi> |
994 | 1 | <mi data-from-mathml='mo'>factorial</mi> |
995 | 1 | </mrow>"; |
996 | 1 | assert!(test_intent(mathml, target, "IgnoreIntent")); |
997 | 1 | } |
998 | ||
999 | #[test] | |
1000 | 1 | fn intent_illegal_self_ref() { |
1001 | 1 | let mathml = "<mrow intent='foo:is-foolish:function($b)'> |
1002 | 1 | <mi intent='$b:int' arg='b'>b</mi> |
1003 | 1 | </mrow>"; |
1004 | 1 | let target = "<foo data-intent-property=':function:' data-intent-type='is-foolish'><mi data-intent-type='int'>b</mi></foo>"; |
1005 | 1 | assert!(!test_intent(mathml, target, "Error")); |
1006 | 1 | } |
1007 | ||
1008 | #[test] | |
1009 | 1 | fn infer_missing_second_arg() { |
1010 | 1 | let mathml = "<mrow intent='binomial($n,)'> |
1011 | 1 | <mo>(</mo> |
1012 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
1013 | 1 | <mo>)</mo> |
1014 | 1 | </mrow>"; |
1015 | 1 | let target = "<binomial data-intent-property='binomial($n,)'> \n |
1016 | 1 | <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </binomial>"; |
1017 | 1 | assert!(!test_intent(mathml, target, "Error")); |
1018 | 1 | } |
1019 | ||
1020 | #[test] | |
1021 | 1 | fn infer_missing_second_arg_ignore() { |
1022 | 1 | let mathml = "<mrow intent='binomial($n,)'> |
1023 | 1 | <mo>(</mo> |
1024 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
1025 | 1 | <mo>)</mo> |
1026 | 1 | </mrow>"; |
1027 | 1 | let target = "<mrow data-from-mathml='mrow' intent='binomial($n,)'> |
1028 | 1 | <mo data-from-mathml='mo'>(</mo> |
1029 | 1 | <fraction data-from-mathml='mfrac' linethickness='0'> <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </fraction> |
1030 | 1 | <mo data-from-mathml='mo'>)</mo> |
1031 | 1 | </mrow>"; |
1032 | 1 | assert!(test_intent(mathml, target, "IgnoreIntent")); |
1033 | 1 | } |
1034 | ||
1035 | #[test] | |
1036 | 1 | fn plane1_char_in_concept_name() { |
1037 | 1 | let mathml = "<math><mrow><mo intent='🐇'>🐇</mo><mi>X</mi></mrow></math>"; |
1038 | 1 | let intent = "<math data-from-mathml='math'> |
1039 | 1 | <mrow data-from-mathml='mrow'> |
1040 | 1 | <mi data-from-mathml='mo'>🐇</mi> |
1041 | 1 | <mi data-from-mathml='mi'>X</mi> |
1042 | 1 | </mrow> |
1043 | 1 | </math>"; |
1044 | 1 | assert!(test_intent(mathml, intent, "Error")); |
1045 | 1 | } |
1046 | } |
Line | Count | Source |
1 | //! Use heuristics to infer the intent. | |
2 | //! For example, an `mfrac` with `linethickness=0` would be a binomial | |
3 | //! The inference is added to the MathML | |
4 | //! | |
5 | //! The implementation of the module is on hold until the MathML committee figures out how it wants to do this. | |
6 | #![allow(clippy::needless_return)] | |
7 | ||
8 | use sxd_document::dom::{Element, Document, ChildOfElement}; | |
9 | use crate::prefs::PreferenceManager; | |
10 | use crate::speech::SpeechRulesWithContext; | |
11 | use crate::canonicalize::{as_element, as_text, name, create_mathml_element, set_mathml_name, INTENT_ATTR, MATHML_FROM_NAME_ATTR}; | |
12 | use crate::errors::*; | |
13 | use std::fmt; | |
14 | use std::sync::LazyLock; | |
15 | use crate::pretty_print::mml_to_string; | |
16 | use crate::xpath_functions::is_leaf; | |
17 | use regex::Regex; | |
18 | use phf::phf_set; | |
19 | use log::{debug, error, warn}; | |
20 | ||
21 | const IMPLICIT_FUNCTION_NAME: &str = "apply-function"; | |
22 | ||
23 | 2.47k | pub fn infer_intent<'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
24 | 2.47k | match catch_errors_building_intent(rules_with_context, mathml) { |
25 | 2.45k | Ok(intent) => return Ok(intent), |
26 | 19 | Err(e) => { |
27 | // lookup what we should do for error recovery | |
28 | 19 | let intent_preference = rules_with_context.get_rules().pref_manager.borrow().pref_to_string("IntentErrorRecovery"); |
29 | 19 | if intent_preference == "Error" { |
30 | 9 | return Err(e); |
31 | } else { | |
32 | 10 | let saved_intent_attr = mathml.attribute_value(INTENT_ATTR).unwrap(); |
33 | 10 | mathml.remove_attribute(INTENT_ATTR); |
34 | // can't call intent_from_mathml() because we have already borrowed_mut -- we call a more internal version | |
35 | 10 | let intent_tree = match rules_with_context.match_pattern::<Element<'m>>(mathml) |
36 | 10 | .context("Pattern match/replacement failure!") { |
37 | 0 | Err(e) => Err(e), |
38 | 10 | Ok(intent) => { |
39 | 10 | intent.set_attribute_value(INTENT_ATTR, saved_intent_attr); // so attr can be potentially be viewed later |
40 | 10 | Ok(intent) |
41 | }, | |
42 | }; | |
43 | 10 | mathml.set_attribute_value(INTENT_ATTR, saved_intent_attr); |
44 | 10 | return intent_tree; |
45 | } | |
46 | } | |
47 | } | |
48 | ||
49 | 2.47k | fn catch_errors_building_intent<'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
50 | 2.47k | if let Some(intent_str) = mathml.attribute_value(INTENT_ATTR) { |
51 | // debug!("Before intent: {}", crate::pretty_print::mml_to_string(mathml)); | |
52 | 2.47k | let mut lex_state = LexState::init(intent_str.trim()) |
53 | 2.47k | let mut intent_offset = 0; |
54 | 2.47k | let |
55 | 2.47k | .with_context(|| |
56 | 2.46k | if lex_state.token != Token::None { |
57 | 5 | bail!("Error in intent value: extra unparsed intent '{}' in intent attribute value '{}'", lex_state.remaining_str, intent_str); |
58 | 2.45k | } |
59 | 2.45k | assert!(lex_state.remaining_str.is_empty()); |
60 | // debug!("Resulting intent:\n{}", crate::pretty_print::mml_to_string(result)); | |
61 | 2.45k | return Ok(result); |
62 | 0 | } |
63 | 0 | bail!("Internal error: infer_intent() called on MathML with no intent arg:\n{}", mml_to_string(mathml)); |
64 | 2.47k | } |
65 | 2.47k | } |
66 | ||
67 | ||
68 | static FIXITIES: phf::Set<&str> = phf_set! { | |
69 | "function", "infix", "prefix", "postfix", "silent", "other", | |
70 | }; | |
71 | ||
72 | /// Eliminate all but the last fixity property | |
73 | 7.63k | pub fn simplify_fixity_properties(properties: &str) -> String { |
74 | 7.63k | let parts: Vec<&str> = properties.split(':').collect(); |
75 | // debug!("simplify_fixity_properties {} parts from input: '{}'", parts.len(), properties); | |
76 | 7.63k | let mut fixity_property = ""; |
77 | 7.63k | let mut answer = ":".to_string(); |
78 | 19.2k | for part in |
79 | 19.2k | if FIXITIES.contains(part) { |
80 | 1.12k | fixity_property = part; |
81 | 18.1k | } else if !part.is_empty() { |
82 | 4.71k | answer.push_str(part); |
83 | 4.71k | answer.push(':'); |
84 | 13.4k | } |
85 | } | |
86 | 7.63k | if !fixity_property.is_empty() { |
87 | 1.12k | answer.push_str(fixity_property); |
88 | 1.12k | answer.push(':'); |
89 | 6.51k | } |
90 | 7.63k | return answer; |
91 | 7.63k | } |
92 | ||
93 | /// Given the intent add the fixity property for the intent if it isn't given (and one exists) | |
94 | 2.72k | fn add_fixity(intent: Element) { |
95 | 2.72k | let properties = intent.attribute_value(INTENT_PROPERTY).unwrap_or_default(); |
96 | 7.47k | if |
97 | 2.63k | let intent_name = name(intent); |
98 | 2.63k | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
99 | 2.63k | let definitions = definitions.borrow(); |
100 | 2.63k | if let Some( |
101 | 12 | let Some((fixity, _)) = definition.split_once("=") { |
102 | 12 | let new_properties = (if properties.is_empty() {":"} else { |
103 | 12 | intent.set_attribute_value(INTENT_PROPERTY, &new_properties); |
104 | // debug!("Added fixity: new value '{}'", intent.attribute_value(INTENT_PROPERTY).unwrap()); | |
105 | 2.62k | }; |
106 | 2.63k | }); |
107 | 90 | } |
108 | 2.72k | } |
109 | ||
110 | ||
111 | /// Given some MathML, expand out any intents taking into account their fixity property | |
112 | /// This is recursive | |
113 | 363 | pub fn add_fixity_children(intent: Element) -> Element { |
114 | 363 | let children = intent.children(); |
115 | 363 | if children.is_empty() || (children.len() == 1 && children[0].element().is_none()) { |
116 | 0 | return intent; |
117 | 363 | } |
118 | ||
119 | 363 | for child in children { |
120 | 363 | let child = as_element(child); |
121 | 363 | if child.attribute_value(INTENT_ATTR).is_some() { |
122 | 0 | add_fixity_child(child); |
123 | 363 | } |
124 | } | |
125 | 363 | return intent; |
126 | ||
127 | 0 | fn add_fixity_child(mathml: Element) -> Element { |
128 | 0 | let mut children = mathml.children(); |
129 | 0 | if children.is_empty() { |
130 | 0 | return mathml; |
131 | 0 | } |
132 | // we also exclude fixity on mtable because they mess up the counts (see 'en::mtable::unknown_mtable_property') | |
133 | 0 | if mathml.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or_default() == "mtable" { |
134 | 0 | return mathml; |
135 | 0 | } |
136 | 0 | let doc = mathml.document(); |
137 | 0 | let properties = mathml.attribute_value(INTENT_PROPERTY).unwrap_or_default(); |
138 | 0 | let fixity = properties.rsplit(':').find(|&property| FIXITIES.contains(property)).unwrap_or_default(); |
139 | 0 | let intent_name = name(mathml); |
140 | ||
141 | 0 | let op_name_id = mathml.attribute_value("id").unwrap_or("new-id"); |
142 | 0 | match fixity { |
143 | 0 | "infix" => { |
144 | 0 | let mut new_children = Vec::with_capacity(2*children.len()-1); |
145 | 0 | new_children.push(children[0]); |
146 | 0 | for (i, &child) in children.iter().enumerate().skip(1) { |
147 | 0 | new_children.push(create_operator_element(intent_name, fixity, op_name_id, i, &doc)); |
148 | 0 | new_children.push(child); |
149 | 0 | } |
150 | 0 | mathml.replace_children(new_children); |
151 | }, | |
152 | 0 | "prefix" => { |
153 | 0 | children.insert(0, create_operator_element(intent_name, fixity, op_name_id, 1, &doc)); |
154 | 0 | mathml.replace_children(children); |
155 | 0 | }, |
156 | 0 | "postfix" => { |
157 | 0 | children.push( create_operator_element(intent_name, fixity, op_name_id, 1, &doc)); |
158 | 0 | mathml.replace_children(children); |
159 | 0 | }, |
160 | 0 | "silent" => { |
161 | 0 | // children remain the same -- nothing to do |
162 | 0 | }, |
163 | 0 | "other" => { |
164 | 0 | // a special case -- will be handled with specific rules (e.g., intervals need to add "from" and "to", not a single word) |
165 | 0 | }, |
166 | _ => { // "function" is the default | |
167 | // build a function like notation function-name U+2061 <mrow> children </mrow> | |
168 | 0 | let mut new_children = Vec::with_capacity(3); |
169 | 0 | let function_name = create_operator_element(intent_name, "function", op_name_id, 1, &doc); |
170 | 0 | new_children.push(function_name); |
171 | 0 | let invisible_apply_function = create_operator_element("mo", "infix", op_name_id, 2, &doc); |
172 | 0 | invisible_apply_function.element().unwrap().set_text("\u{2061}"); |
173 | 0 | new_children.push(invisible_apply_function); |
174 | 0 | let mrow_wrapper = create_mathml_element(&doc, "mrow"); |
175 | 0 | mrow_wrapper.set_attribute_value("id", (op_name_id.to_string() + "3").as_str()); |
176 | 0 | mrow_wrapper.append_children(children); |
177 | 0 | new_children.push(ChildOfElement::Element(mrow_wrapper)); |
178 | 0 | mathml.replace_children(new_children); |
179 | 0 | if fixity.is_empty() { |
180 | 0 | mathml.set_attribute_value(INTENT_PROPERTY, ":function:"); |
181 | 0 | } |
182 | }, | |
183 | } | |
184 | 0 | return mathml; |
185 | ||
186 | 0 | fn create_operator_element<'a>(intent_name: &str, fixity: &str, id: &str, id_inc: usize, doc: &Document<'a>) -> ChildOfElement<'a> { |
187 | 0 | let intent_name = intent_speech_for_name(intent_name, &PreferenceManager::get().borrow().pref_to_string("NavMode"), fixity); |
188 | 0 | let element = create_mathml_element(doc, &intent_name); |
189 | 0 | element.set_attribute_value("id", &format!("{id}-fixity-{id_inc}")); |
190 | 0 | element.set_attribute_value(MATHML_FROM_NAME_ATTR, "mo"); |
191 | 0 | return ChildOfElement::Element(element); |
192 | 0 | } |
193 | 0 | } |
194 | 363 | } |
195 | ||
196 | 340 | pub fn intent_speech_for_name(intent_name: &str, verbosity: &str, fixity: &str) -> String { |
197 | 340 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
198 | 340 | let definitions = definitions.borrow(); |
199 | 340 | if let Some( |
200 | // Split the pattern is: | |
201 | // fixity-def [|| fixity-def]* | |
202 | // fixity-def := fixity=[open;] verbosity[; close] | |
203 | // verbosity := terse | medium | verbose | |
204 | 396 | if let Some( |
205 | 294 | let (_, matched_intent) = matched_intent.split_once("=").unwrap_or_default(); |
206 | 294 | let parts = matched_intent.trim().split(";").collect::<Vec<&str>>(); |
207 | 294 | let mut operator_names = (if parts.len() > 1 { |
208 | 294 | match operator_names.len() { |
209 | 236 | 1 => return operator_names[0].trim().to_string(), |
210 | 2 | 3 => { | |
211 | 58 | if operator_names.len() == 2 { |
212 | 0 | warn!("Intent '{intent_name}' has only two operator names, but should have three"); |
213 | 0 | operator_names.push(operator_names[1]); |
214 | 58 | } |
215 | 58 | let intent_word = match verbosity { |
216 | 58 | "Terse" => |
217 | 56 | "Medium" => |
218 | 2 | _ => operator_names[2], |
219 | }; | |
220 | 58 | return intent_word.trim().to_string(); |
221 | }, | |
222 | _ => { | |
223 | 0 | error!("Intent '{}' has too many ({}) operator names, should only have 2", intent_name, operator_names.len()); |
224 | 0 | return intent_name.to_string(); |
225 | }, | |
226 | } | |
227 | 0 | } |
228 | 46 | }; |
229 | 46 | return intent_name.replace(['_', '-'], " ").trim().to_string(); |
230 | 340 | }) |
231 | 340 | } |
232 | ||
233 | ||
234 | ||
235 | // intent := self-property-list | expression | |
236 | // self-property-list := property+ S | |
237 | // expression := S ( term property* | application ) S | |
238 | // term := concept-or-literal | number | reference | |
239 | // concept-or-literal := NCName | |
240 | // number := '-'? \d+ ( '.' \d+ )? | |
241 | // reference := '$' NCName | |
242 | // application := expression '(' arguments? S ')' | |
243 | // arguments := expression ( ',' expression )* | |
244 | // property := S ':' NCName | |
245 | // S := [ \t\n\r]* | |
246 | ||
247 | // The practical restrictions of NCName are that it cannot contain several symbol characters like | |
248 | // !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters | |
249 | // Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. | |
250 | // NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated | |
251 | // We follow NC_NAME for the basic latin block, but then allow everything | |
252 | 2 | static CONCEPT_OR_LITERAL: LazyLock<Regex> = LazyLock::new(|| { |
253 | 2 | Regex::new(r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler |
254 | 2 | ).unwrap() |
255 | 2 | }); |
256 | 2 | static PROPERTY: LazyLock<Regex> = LazyLock::new(|| { |
257 | 2 | Regex::new(r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME |
258 | 2 | ).unwrap() |
259 | 2 | }); |
260 | 2 | static ARG_REF: LazyLock<Regex> = LazyLock::new(|| { |
261 | 2 | Regex::new(r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME |
262 | 2 | ).unwrap() |
263 | 2 | }); |
264 | 2 | static NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap()); |
265 | ||
266 | static TERMINALS_AS_U8: [u8; 3] = [b'(', b',', b')']; | |
267 | // static TERMINALS: [char; 3] = ['(', ',',')']; | |
268 | ||
269 | // 'i -- "i" for the lifetime of the INTENT_ATTR string | |
270 | #[derive(Debug, PartialEq, Eq, Clone)] | |
271 | enum Token<'i> { | |
272 | Terminal(&'i str), // "(", ",", ")" | |
273 | Property(&'i str), | |
274 | ArgRef(&'i str), | |
275 | ConceptOrLiteral(&'i str), | |
276 | Number(&'i str), | |
277 | None, // out of characters | |
278 | } | |
279 | ||
280 | impl fmt::Display for Token<'_> { | |
281 | 3 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
282 | 3 | return write!(f, "{}", |
283 | 3 | match self { |
284 | 3 | Token::Terminal(str) => format!("Terminal('{str}')"), |
285 | 0 | Token::Property(str) => format!("Property({str})"), |
286 | 0 | Token::ArgRef(str) => format!("ArgRef({str})"), |
287 | 0 | Token::ConceptOrLiteral(str) => format!("Literal({str})"), |
288 | 0 | Token::Number(str) => format!("Number({str})"), |
289 | 0 | Token::None => "None".to_string(), |
290 | } | |
291 | ); | |
292 | 3 | } |
293 | } | |
294 | ||
295 | impl Token<'_> { | |
296 | 3.64k | fn is_terminal(&self, terminal: &str) -> bool { |
297 | 3.64k | if let Token::Terminal( |
298 | 1.02k | return value == terminal; |
299 | } else { | |
300 | 2.61k | return false; |
301 | } | |
302 | 3.64k | } |
303 | ||
304 | 5.21k | fn as_str(&self) -> &str { |
305 | 5.21k | return match self { |
306 | 0 | Token::Terminal(str) => str, |
307 | 4.79k | Token::Property(str) => str, |
308 | 226 | Token::ArgRef(str) => str, |
309 | 161 | Token::ConceptOrLiteral(str) => str, |
310 | 29 | Token::Number(str) => str, |
311 | 0 | Token::None => "", |
312 | } | |
313 | 5.21k | } |
314 | } | |
315 | ||
316 | struct LexState<'i> { | |
317 | token: Token<'i>, | |
318 | remaining_str: &'i str, // always trimmed | |
319 | } | |
320 | ||
321 | impl fmt::Display for LexState<'_> { | |
322 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
323 | 0 | return writeln!(f, "token: {}, remaining: '{}'", self.token, self.remaining_str); |
324 | 0 | } |
325 | } | |
326 | ||
327 | impl<'i> LexState<'i> { | |
328 | 2.50k | fn init(str: &'i str) -> Result<LexState<'i>> { |
329 | 2.50k | let mut lex_state = LexState { token: Token::None, remaining_str: str.trim() }; |
330 | 2.50k | lex_state.get_next() |
331 | 2.50k | return Ok(lex_state); |
332 | 2.50k | } |
333 | ||
334 | // helper function for LexState -- do not call outside of the impl | |
335 | 2.82k | fn set_token(&mut self, str: &'i str) -> Result<()> { |
336 | // Note: 'str' is already trimmed | |
337 | 2.82k | if str.is_empty() { |
338 | 0 | self.token = Token::None; |
339 | 2.82k | } else if TERMINALS_AS_U8.contains(&str.as_bytes()[0]) { |
340 | 0 | self.token = Token::Terminal(str); |
341 | 2.82k | } else if let Some( |
342 | 2.40k | self.token = Token::Property(matched_property.as_str()); |
343 | 2.40k | } else if let Some( |
344 | 226 | self.token = Token::ArgRef(matched_arg_ref.as_str()); |
345 | 226 | } else if let Some( |
346 | 161 | self.token = Token::ConceptOrLiteral(matched_literal.as_str()); |
347 | 161 | } else if let Some( |
348 | 29 | self.token = Token::Number(matched_number.as_str()); |
349 | 29 | } else { |
350 | 0 | bail!("Illegal 'intent' syntax: {}", str); |
351 | } | |
352 | 2.82k | return Ok( () ); |
353 | 2.82k | } |
354 | ||
355 | 5.69k | fn get_next(&mut self) -> Result<&Token<'_>> { |
356 | 5.69k | if self.remaining_str.is_empty() { |
357 | 2.48k | self.token = Token::None; |
358 | 3.21k | } else if TERMINALS_AS_U8.contains(&self.remaining_str.as_bytes()[0]) { |
359 | 391 | self.token = Token::Terminal(&self.remaining_str[..1]); |
360 | 391 | self.remaining_str = self.remaining_str[1..].trim_start(); |
361 | 391 | } else { |
362 | 2.82k | self.set_token(self.remaining_str) |
363 | 2.82k | self.remaining_str = self.remaining_str[self.token.as_str().len()..].trim_start(); |
364 | } | |
365 | 5.69k | return Ok(&self.token); |
366 | 5.69k | } |
367 | ||
368 | 3.64k | fn is_terminal(&self, terminal: &str) -> bool { |
369 | 3.64k | return self.token.is_terminal(terminal); |
370 | 3.64k | } |
371 | } | |
372 | ||
373 | 2.74k | fn build_intent<'b, 'r, 'c, 's:'c, 'm:'c>(rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
374 | 2.74k | lex_state: &mut LexState<'b>, |
375 | 2.74k | mathml: Element<'c>, |
376 | 2.74k | intent_offset: &mut u32) -> Result<Element<'m>> { |
377 | // intent := self-property-list | expression | |
378 | // self-property-list := property+ S | |
379 | // expression := S ( term property* | application ) S | |
380 | // term := concept-or-literal | number | reference | |
381 | // concept-or-literal := NCName | |
382 | // number := '-'? \d+ ( '.' \d+ )? | |
383 | // reference := '$' NCName | |
384 | // application := expression '(' arguments? S ')' | |
385 | // | |
386 | // When we flatten intent we have this implementation looking for Tokens or '(' [for application] | |
387 | // Essentially, the grammar we deal with here is: | |
388 | // intent := property+ | (concept-or-literal | number | reference) property* '('? | |
389 | // debug!(" start build_intent: state: {}", lex_state); | |
390 | 2.74k | let doc = rules_with_context.get_document(); |
391 | let mut intent; | |
392 | 2.74k | debug!(" build_intent: start mathml name={}, intent_offset={}", |
393 | 2.74k | match lex_state.token { |
394 | Token::Property(_) => { | |
395 | // We only have a property -- we want to keep this tag/element | |
396 | // There are two paths: | |
397 | // 1. If there is a function call, then the children are dealt with there | |
398 | // 2. If there is *no* function call, then the children are kept, which means we return to pattern matching | |
399 | // Note: to avoid infinite loop, we need to remove the 'intent' so we don't end up back here; we put it back later | |
400 | 2.33k | let properties = get_properties(lex_state) |
401 | 2.33k | if lex_state.is_terminal("(") { |
402 | 2 | intent = create_mathml_element(&doc, name(mathml)); |
403 | 2 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
404 | 2 | intent.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
405 | 2 | intent.set_attribute_value("id", mathml.attribute_value("id") |
406 | 2 | .ok_or_else(|| |
407 | } else { | |
408 | 2.32k | let saved_intent = mathml.attribute_value(INTENT_ATTR).unwrap(); |
409 | 2.32k | mathml.remove_attribute(INTENT_ATTR); |
410 | 2.32k | mathml.set_attribute_value(INTENT_PROPERTY, &properties); // needs to be set before the pattern match |
411 | 2.32k | intent = rules_with_context.match_pattern::<Element<'m>>(mathml) |
412 | // debug!("Intent after pattern match:\n{}", mml_to_string(intent)); | |
413 | 2.32k | mathml.set_attribute_value(INTENT_ATTR, saved_intent); |
414 | } | |
415 | 2.33k | add_fixity(intent); |
416 | 2.33k | return Ok(intent); // if we start with properties, then there can only be properties |
417 | }, | |
418 | 161 | Token::ConceptOrLiteral(word) | Token::Number( |
419 | 189 | let leaf_name = if let Token::Number(_) = lex_state.token { |
420 | 189 | intent = create_mathml_element(&doc, leaf_name); |
421 | // if the str is part of a larger intent and not the head (e.g., "a" in "f($x, a)", but not the "f" in it), then it is "made up" | |
422 | // debug!(" Token::ConceptOrLiteral, word={}, leaf_name={}", word, leaf_name); | |
423 | 189 | intent.set_attribute_value(MATHML_FROM_NAME_ATTR, |
424 | 189 | if word == mathml.attribute_value(INTENT_ATTR).unwrap_or_default() { |
425 | 189 | intent.set_text(word); // '-' and '_' get removed by the rules. |
426 | 189 | if let Some( |
427 | 136 | intent.set_attribute_value("id", &format!("{}-literal-{}", id, intent_offset)); |
428 | 136 | *intent_offset += 1; |
429 | 136 |
|
430 | 189 | lex_state.get_next() |
431 | 189 | if let Token::Property(_) = lex_state.token { |
432 | 60 | let properties = get_properties(lex_state) |
433 | 60 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
434 | 129 | } |
435 | }, | |
436 | 223 | Token::ArgRef(word) => { |
437 | 223 | intent = match find_arg(rules_with_context, &word[1..], mathml, intent_offset, true, false) |
438 | 221 | Some(e) => { |
439 | 221 | lex_state.get_next() |
440 | 221 | e |
441 | }, | |
442 | 1 | None => bail!("intent arg '{}' not found", word), |
443 | }; | |
444 | 221 | if let Token::Property(_) = lex_state.token { |
445 | 3 | let properties = get_properties(lex_state) |
446 | 3 | intent.set_attribute_value(INTENT_PROPERTY, &properties); |
447 | 218 | } |
448 | }, | |
449 | 3 | _ => bail!("Illegal 'intent' syntax: found {}", lex_state.token), |
450 | }; | |
451 | 410 | if lex_state.is_terminal("(") { |
452 | 136 | intent = build_function(intent, rules_with_context, lex_state, mathml, intent_offset) |
453 | 274 | } |
454 | // debug!(" end build_intent: state: {} piece: {}", lex_state, mml_to_string(intent)); | |
455 | 395 | add_fixity(intent); |
456 | 395 | return Ok(intent); |
457 | 2.74k | } |
458 | ||
459 | pub const INTENT_PROPERTY: &str = "data-intent-property"; | |
460 | ||
461 | /// Get all the properties, stopping we don't have any more | |
462 | /// Returns the string of the properties terminated with an additional ":" | |
463 | 2.39k | fn get_properties(lex_state: &mut LexState) -> Result<String> { |
464 | // return the 'hint' leaving the state | |
465 | 2.39k | assert!(matches!(lex_state.token, Token::Property(str) if str.starts_with(':'))); |
466 | 2.39k | let mut properties = String::with_capacity(60); |
467 | 2.39k | properties.push_str(lex_state.token.as_str()); |
468 | loop { | |
469 | 2.40k | let token = lex_state.get_next() |
470 | 2.40k | if let Token::Property( |
471 | 11 | properties.push_str(property); |
472 | 11 | } else { |
473 | 2.39k | properties.push(':'); |
474 | // debug!(" get_properties: returns {}", properties); | |
475 | 2.39k | return Ok(simplify_fixity_properties(&properties)); |
476 | } | |
477 | } | |
478 | 2.39k | } |
479 | ||
480 | /// Build a function 'f(...)' where '...' can be empty | |
481 | /// | |
482 | /// Also handles nested functions like f(...)(...) | |
483 | /// | |
484 | /// Start state: at '(' | |
485 | /// | |
486 | /// End state: after ')' | |
487 | 136 | fn build_function<'b, 'r, 'c, 's:'c, 'm:'c>( |
488 | 136 | function_name: Element<'m>, |
489 | 136 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
490 | 136 | lex_state: &mut LexState<'b>, |
491 | 136 | mathml: Element<'c>, |
492 | 136 | intent_offset: &mut u32) -> Result<Element<'m>> { |
493 | // debug!(" start build_function: name: {}, state: {}", name(function_name), lex_state); | |
494 | // application := intent '(' arguments? S ')' where 'function_name' is 'intent' | |
495 | 136 | assert!(lex_state.is_terminal("(")); |
496 | 136 | let mut function = function_name; |
497 | 136 | function.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
498 | 260 | while lex_state.is_terminal("(") { |
499 | 139 | lex_state.get_next() |
500 | 139 | if lex_state.is_terminal(")") { |
501 | // grammar requires at least one argument | |
502 | 9 | bail!("Illegal 'intent' syntax: missing argument for intent name '{}'", name(function_name)); |
503 | 130 | } |
504 | 130 | let |
505 | 125 | function = lift_function_name(rules_with_context.get_document(), function, children); |
506 | ||
507 | 125 | if !lex_state.is_terminal(")") { |
508 | 1 | bail!("Illegal 'intent' syntax: missing ')' for intent name '{}'", name(function_name)); |
509 | 124 | } |
510 | 124 | lex_state.get_next() |
511 | } | |
512 | ||
513 | // debug!(" end build_function/# children: {}, #state: {} ..[bfa] function name: {}", | |
514 | // function.children().len(), lex_state, mml_to_string(function)); | |
515 | 121 | return Ok(function); |
516 | 136 | } |
517 | ||
518 | // process all the args of a function | |
519 | // Start state: after '(' | |
520 | // End state: on ')' | |
521 | 130 | fn build_arguments<'b, 'r, 'c, 's:'c, 'm:'c>( |
522 | 130 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
523 | 130 | lex_state: &mut LexState<'b>, |
524 | 130 | mathml: Element<'c>, |
525 | 130 | intent_offset: &mut u32) -> Result<Vec<Element<'m>>> { |
526 | // arguments := intent ( ',' intent )*' | |
527 | // debug!(" start build_args state: {}", lex_state); | |
528 | ||
529 | // there is at least one arg | |
530 | 130 | let mut children = Vec::with_capacity(lex_state.remaining_str.len()/3 + 1); // conservative estimate ('3' - "$x,"); |
531 | 130 | children. |
532 | // debug!(" build_args: # children {}; state: {}", children.len(), lex_state); | |
533 | ||
534 | 239 | while lex_state.is_terminal(",") { |
535 | 114 | lex_state.get_next() |
536 | 114 | children. |
537 | // debug!(" build_args, # children {}; state: {}", children.len(), lex_state); | |
538 | } | |
539 | ||
540 | // debug!(" end build_args, # children {}; state: {}", children.len(), lex_state); | |
541 | 125 | return Ok(children); |
542 | 130 | } |
543 | ||
544 | /// lift the children up to LITERAL_NAME | |
545 | 125 | fn lift_function_name<'m>(doc: Document<'m>, function_name: Element<'m>, children: Vec<Element<'m>>) -> Element<'m> { |
546 | // debug!(" lift_function_name: {}", name(function_name)); | |
547 | // debug!(" lift_function_name: {}", mml_to_string(function_name)); | |
548 | 125 | if name(function_name) == "mi" || |
549 | // simple/normal case of f(x,y) | |
550 | // don't want to say that this is a leaf -- doing so messes up because it potentially has children | |
551 | 121 | set_mathml_name(function_name, as_text(function_name)); |
552 | 121 | function_name.set_text(""); |
553 | 121 | function_name.replace_children(children); |
554 | 129 | if |
555 | 14 | let properties = function_name.attribute_value(INTENT_PROPERTY).unwrap_or(":").to_owned(); |
556 | 14 | function_name.set_attribute_value(INTENT_PROPERTY, &(properties + "silent:")); |
557 | 107 | } |
558 | 121 | return function_name; |
559 | 4 | } else if function_name.children().is_empty() { |
560 | // "... :property(...)" -- no function name | |
561 | 0 | function_name.replace_children(children); |
562 | 0 | return function_name; |
563 | } else { | |
564 | // more complicated case of nested name: f(x)(y,z) | |
565 | // create an apply_function(f(x), y, z) | |
566 | 4 | let result = create_mathml_element(&doc, IMPLICIT_FUNCTION_NAME); |
567 | 4 | result.set_attribute_value(MATHML_FROM_NAME_ATTR, "mrow"); |
568 | 4 | result.append_child(function_name); |
569 | 4 | result.append_children(children); |
570 | 4 | return result; |
571 | } | |
572 | 125 | } |
573 | ||
574 | ||
575 | /// look for @arg=name in mathml | |
576 | /// if 'check_intent', then look at an @intent for this element (typically false for non-recursive calls) | |
577 | 946 | fn find_arg<'r, 'c, 's:'c, 'm:'c>( |
578 | 946 | rules_with_context: &'r mut SpeechRulesWithContext<'c,'s,'m>, |
579 | 946 | name: &str, |
580 | 946 | mathml: Element<'c>, |
581 | 946 | intent_offset: &mut u32, |
582 | 946 | skip_self: bool, |
583 | 946 | no_check_inside: bool) -> Result<Option<Element<'m>>> { |
584 | // debug!("Looking for '{}' in\n{}", name, mml_to_string(mathml)); | |
585 | 946 | if !skip_self && |
586 | 723 | let Some( |
587 | // debug!("looking for '{}', found arg='{}'", name, arg_val); | |
588 | 411 | if name == arg_val { |
589 | // check to see if this mathml has an intent value -- if so the value is the value of its intent value | |
590 | 222 | if let Some( |
591 | 28 | let mut lex_state = LexState::init(intent_str.trim()) |
592 | 28 | return Ok( Some( build_intent(rules_with_context, &mut lex_state, mathml, intent_offset) |
593 | } else { | |
594 | 194 | return Ok( Some( rules_with_context.match_pattern::<Element<'m>>(mathml) |
595 | } | |
596 | 189 | } else if no_check_inside { |
597 | 189 | return Ok(None); // don't look inside 'arg' |
598 | 0 | } |
599 | 535 | } |
600 | ||
601 | 535 | if no_check_inside && |
602 | 2 | return Ok(None); // don't look inside 'intent' |
603 | 533 | } |
604 | ||
605 | 533 | if is_leaf(mathml){ |
606 | 121 | return Ok(None); |
607 | 412 | } |
608 | ||
609 | 723 | for child in |
610 | 723 | let child = as_element(child); |
611 | 723 | if let Some( |
612 | 396 | return Ok( Some(element) ); |
613 | 326 | } |
614 | } | |
615 | ||
616 | 15 | return Ok(None); // not present |
617 | 946 | } |
618 | ||
619 | #[cfg(test)] | |
620 | mod tests { | |
621 | #[allow(unused_imports)] | |
622 | use crate::init_logger; | |
623 | use log::debug; | |
624 | use sxd_document::parser; | |
625 | ||
626 | ||
627 | 27 | fn test_intent(mathml: &str, target: &str, intent_error_recovery: &str) -> bool { |
628 | use crate::interface::*; | |
629 | use crate::pretty_print::mml_to_string; | |
630 | // this forces initialization | |
631 | 27 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
632 | // crate::speech::SpeechRules::initialize_all_rules().unwrap(); | |
633 | 27 | set_preference("IntentErrorRecovery", intent_error_recovery).unwrap(); |
634 | 27 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); // avoids possibility of "LiteralSpeak" |
635 | 27 | let package1 = &parser::parse(mathml).expect("Failed to parse test input"); |
636 | 27 | let mathml = get_element(package1); |
637 | 27 | trim_element(mathml, false); |
638 | 27 | debug!("test:\n{}", |
639 | ||
640 | 27 | let package2 = &parser::parse(target).expect("Failed to parse target input"); |
641 | 27 | let target = get_element(package2); |
642 | 27 | trim_element(target,true); |
643 | 27 | debug!("target:\n{}", |
644 | ||
645 | 27 | let |
646 | 18 | Ok(e) => e, |
647 | 9 | Err(e) => { |
648 | 9 | debug!("{}", |
649 | 9 | return false; // could be intentional failure |
650 | } | |
651 | }; | |
652 | 18 | debug!("result:\n{}", |
653 | 18 | match is_same_element(result, target, &[]) { |
654 | 18 | Ok(_) => return true, |
655 | 0 | Err(e) => panic!("{}:\nresult: {}target: {}", e, mml_to_string(result), mml_to_string(target)), |
656 | } | |
657 | 27 | } |
658 | ||
659 | #[test] | |
660 | 1 | fn infer_binomial() { |
661 | 1 | let mathml = "<mrow intent='binomial($n, $m)'> |
662 | 1 | <mo>(</mo> |
663 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
664 | 1 | <mo>)</mo> |
665 | 1 | </mrow>"; |
666 | 1 | let intent = "<binomial data-from-mathml='mrow' data-intent-property=':infix:'> <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </binomial>"; |
667 | 1 | assert!(test_intent(mathml, intent, "Error")); |
668 | 1 | } |
669 | ||
670 | #[test] | |
671 | 1 | fn infer_binomial_intent_arg() { |
672 | 1 | let mathml = "<msubsup intent='$op($n,$m)'> |
673 | 1 | <mi arg='op' intent='binomial'>C</mi> |
674 | 1 | <mi arg='n'>n</mi> |
675 | 1 | <mi arg='m'>m</mi> |
676 | 1 | </msubsup>"; |
677 | 1 | let intent = "<binomial data-from-mathml='msubsup' data-intent-property=':infix:'> <mi data-from-mathml='mi' arg='n'>n</mi> <mi data-from-mathml='mi' arg='m'>m</mi></binomial>"; |
678 | 1 | assert!(test_intent(mathml, intent, "Error")); |
679 | 1 | } |
680 | ||
681 | #[test] | |
682 | 1 | fn silent_underscore() { |
683 | 1 | let mathml = "<mrow><mi intent='__-'>silent</mi><mo>+</mo><mi>e</mi></mrow>"; |
684 | 1 | let intent = "<mrow data-from-mathml='mrow'> |
685 | 1 | <mi data-from-mathml='mi'>__-</mi> |
686 | 1 | <mo data-from-mathml='mo'>+</mo> |
687 | 1 | <mi data-from-mathml='mi'>e</mi> |
688 | 1 | </mrow>"; |
689 | 1 | assert!(test_intent(mathml, intent, "Error")); |
690 | 1 | } |
691 | ||
692 | ||
693 | #[test] | |
694 | 1 | fn silent_underscore_function() { |
695 | 1 | let mathml = "<mrow intent='__-_(speak, this)'></mrow>"; |
696 | 1 | let intent = "<__-_ data-from-mathml='mrow' data-intent-property=':silent:'> |
697 | 1 | <mi data-from-mathml='mi'>speak</mi> |
698 | 1 | <mi data-from-mathml='mi'>this</mi> |
699 | 1 | </__-_>"; |
700 | 1 | assert!(test_intent(mathml, intent, "Error")); |
701 | 1 | } |
702 | ||
703 | #[test] | |
704 | 1 | fn intent_multiple_properties() { |
705 | 1 | let mathml = "<mrow intent='foo:silent:int(bar:positive-int:int, $a:foo:bar:foo-bar, $b:number)'> |
706 | 1 | <mi arg='a'>a</mi> |
707 | 1 | <mo arg='p' intent='plus'>+</mo> |
708 | 1 | <mi arg='b' intent=':negative-int:int'>b</mi> |
709 | 1 | </mrow>"; |
710 | 1 | let intent = "<foo data-intent-property=':int:silent:' data-from-mathml='mrow'> |
711 | 1 | <mi data-from-mathml='mi' data-intent-property=':positive-int:int:'>bar</mi> |
712 | 1 | <mi data-from-mathml='mi' arg='a' data-intent-property=':foo:bar:foo-bar:'>a</mi> |
713 | 1 | <mi data-from-mathml='mi' arg='b' data-intent-property=':number:'>b</mi> |
714 | 1 | </foo>"; |
715 | 1 | assert!(test_intent(mathml, intent, "Error")); |
716 | 1 | } |
717 | #[test] | |
718 | 1 | fn intent_nest_no_arg_call() { |
719 | 1 | let mathml = "<mrow intent='foo(bar())'> |
720 | 1 | <mi arg='a'>a</mi> |
721 | 1 | <mo arg='p' intent='plus'>+</mo> |
722 | 1 | <mi arg='b'>b</mi> |
723 | 1 | <mo arg='f' intent='factorial'>!</mo> |
724 | 1 | </mrow>"; |
725 | 1 | let intent = "<foo><bar></bar></foo>"; |
726 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
727 | 1 | } |
728 | ||
729 | #[test] | |
730 | 1 | fn intent_hints() { |
731 | 1 | let mathml = "<mrow intent='foo:silent(bar:postfix(3))'> |
732 | 1 | <mi arg='a'>a</mi> |
733 | 1 | <mo arg='p' intent='plus'>+</mo> |
734 | 1 | <mi arg='b'>b</mi> |
735 | 1 | <mo arg='f' intent='factorial'>!</mo> |
736 | 1 | </mrow>"; |
737 | 1 | let intent = "<foo data-intent-property=':silent:' data-from-mathml='mrow'> |
738 | 1 | <bar data-intent-property=':postfix:' data-from-mathml='mrow'> |
739 | 1 | <mn data-from-mathml='mn'>3</mn> |
740 | 1 | </bar> |
741 | 1 | </foo>"; |
742 | 1 | assert!(test_intent(mathml, intent, "Error")); |
743 | 1 | } |
744 | ||
745 | #[test] | |
746 | 1 | fn intent_hints_and_type() { |
747 | 1 | let mathml = "<mrow intent='foo:is-foolish:function($b)'> |
748 | 1 | <mi arg='a'>a</mi> |
749 | 1 | <mo arg='p' intent='plus'>+</mo> |
750 | 1 | <mi intent='b:int' arg='b'>b</mi> |
751 | 1 | <mo arg='f' intent='factorial'>!</mo> |
752 | 1 | </mrow>"; |
753 | 1 | let intent = "<foo data-intent-property=':is-foolish:function:' data-from-mathml='mrow'> |
754 | 1 | <mi data-intent-property=':int:' data-from-mathml='mi'>b</mi> |
755 | 1 | </foo>"; |
756 | 1 | assert!(test_intent(mathml, intent, "Error")); |
757 | 1 | } |
758 | ||
759 | #[test] | |
760 | 1 | fn intent_in_intent_first_arg() { |
761 | 1 | let mathml = "<mrow intent='p(f(b), a)'> |
762 | 1 | <mi arg='a'>a</mi> |
763 | 1 | <mo arg='p' intent='plus'>+</mo> |
764 | 1 | <mi arg='b'>b</mi> |
765 | 1 | <mo arg='f' intent='factorial'>!</mo> |
766 | 1 | </mrow>"; |
767 | 1 | let intent = "<p data-from-mathml='mrow'> |
768 | 1 | <f data-from-mathml='mrow'> |
769 | 1 | <mi data-from-mathml='mi'>b</mi> |
770 | 1 | </f> |
771 | 1 | <mi data-from-mathml='mi'>a</mi> |
772 | 1 | </p>"; |
773 | 1 | assert!(test_intent(mathml, intent, "Error")); |
774 | 1 | } |
775 | ||
776 | #[test] | |
777 | 1 | fn intent_in_intent_second_arg() { |
778 | 1 | let mathml = "<mrow intent='$p(a,$f(b))'> |
779 | 1 | <mi arg='a'>a</mi> |
780 | 1 | <mo arg='p' intent='plus'>+</mo> |
781 | 1 | <mi arg='b'>b</mi> |
782 | 1 | <mo arg='f' intent='factorial'>!</mo> |
783 | 1 | </mrow>"; |
784 | 1 | let intent = "<plus data-from-mathml='mrow' data-intent-property=':infix:'> |
785 | 1 | <mi data-from-mathml='mi'>a</mi> |
786 | 1 | <factorial data-from-mathml='mrow'> |
787 | 1 | <mi data-from-mathml='mi'>b</mi> |
788 | 1 | </factorial> |
789 | 1 | </plus>"; |
790 | 1 | assert!(test_intent(mathml, intent, "Error")); |
791 | 1 | } |
792 | ||
793 | #[test] | |
794 | 1 | fn intent_with_whitespace() { |
795 | 1 | let mathml = "<mrow intent=' $arrow ( $a , $b,$c ) '> |
796 | 1 | <mi arg='a'>A</mi> |
797 | 1 | <mover> |
798 | 1 | <mo movablelimits='false' arg='arrow' intent='map'>⟶</mo> |
799 | 1 | <mo arg='U2245' intent='congruence'>≅</mo> |
800 | 1 | </mover> |
801 | 1 | <mi arg='b'>B</mi> |
802 | 1 | <mi arg='c'>C</mi> |
803 | 1 | </mrow>"; |
804 | 1 | let intent = "<map data-from-mathml='mrow'> <mi data-from-mathml='mi' arg='a'>A</mi> <mi data-from-mathml='mi' arg='b'>B</mi> <mi data-from-mathml='mi' arg='c'>C</mi> </map>"; |
805 | 1 | assert!(test_intent(mathml, intent, "Error")); |
806 | 1 | } |
807 | ||
808 | #[test] | |
809 | 1 | fn intent_template_at_toplevel() { |
810 | 1 | let mathml = "<msup intent='$H $n'> |
811 | 1 | <mi arg='H' mathvariant='normal'>H</mi> |
812 | 1 | <mn arg='n'>2</mn> |
813 | 1 | </msup>"; |
814 | 1 | let intent = "<mrow><mi arg='H' mathvariant='normal'>H</mi><mn arg='n'>2</mn></mrow>"; |
815 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
816 | 1 | } |
817 | ||
818 | #[test] | |
819 | 1 | fn intent_with_nested_indirect_head() { |
820 | 1 | let mathml = "<mrow intent='$op($a,$b)'> |
821 | 1 | <mi arg='a'>A</mi> |
822 | 1 | <mover arg='op' intent='$ra($cong)'> |
823 | 1 | <mo movablelimits='false' arg='ra' intent='map'>⟶</mo> |
824 | 1 | <mo arg='cong' intent='congruence'>≅</mo> |
825 | 1 | </mover> |
826 | 1 | <mi arg='b'>B</mi> |
827 | 1 | </mrow>"; |
828 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
829 | 1 | <map data-from-mathml='mrow'> |
830 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
831 | 1 | </map> |
832 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
833 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
834 | 1 | </apply-function>"; |
835 | 1 | assert!(test_intent(mathml, intent, "Error")); |
836 | 1 | } |
837 | ||
838 | #[test] | |
839 | 1 | fn intent_with_literals() { |
840 | 1 | let mathml = "<mrow intent='vector(1, 0.0, 0.1, -23, -0.1234, last)'> |
841 | 1 | <mi>x</mi> |
842 | 1 | </mrow>"; |
843 | 1 | let intent = "<vector data-from-mathml='mrow' data-intent-property=':function:'> |
844 | 1 | <mn data-from-mathml='mn'>1</mn> |
845 | 1 | <mn data-from-mathml='mn'>0.0</mn> |
846 | 1 | <mn data-from-mathml='mn'>0.1</mn> |
847 | 1 | <mn data-from-mathml='mn'>-23</mn> |
848 | 1 | <mn data-from-mathml='mn'>-0.1234</mn> |
849 | 1 | <mi data-from-mathml='mi'>last</mi> |
850 | 1 | </vector>"; |
851 | 1 | assert!(test_intent(mathml, intent, "Error")); |
852 | 1 | } |
853 | ||
854 | #[test] | |
855 | 1 | fn intent_with_template_literals() { |
856 | 1 | let mathml = "<mrow intent='1 0.0 0.1 -23 -0.1234 last'> |
857 | 1 | <mi>x</mi> |
858 | 1 | </mrow>"; |
859 | 1 | let intent = "<mrow><mn>1</mn><mn>0.</mn><mn>.1</mn><mn>-23</mn><mn>-.1234</mn><mi>last</mi></mrow>"; |
860 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
861 | 1 | } |
862 | ||
863 | #[test] | |
864 | 1 | fn intent_with_nested_head() { |
865 | 1 | let mathml = "<mrow intent='$ra($cong)($a,$b)'> |
866 | 1 | <mi arg='a'>A</mi> |
867 | 1 | <mover> |
868 | 1 | <mo movablelimits='false' arg='ra' intent='map'>⟶</mo> |
869 | 1 | <mo arg='cong' intent='congruence'>≅</mo> |
870 | 1 | </mover> |
871 | 1 | <mi arg='b'>B</mi> |
872 | 1 | </mrow>"; |
873 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
874 | 1 | <map data-from-mathml='mrow'> |
875 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
876 | 1 | </map> |
877 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
878 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
879 | 1 | </apply-function>"; |
880 | 1 | assert!(test_intent(mathml, intent, "Error")); |
881 | 1 | } |
882 | ||
883 | ||
884 | #[test] | |
885 | 1 | fn intent_with_nested_head_and_hints() { |
886 | 1 | let mathml = "<mrow intent='pre:prefix(in:infix($a, x))(post:postfix($b))'> |
887 | 1 | <mi arg='a'>A</mi> |
888 | 1 | <mover> |
889 | 1 | <mo intent='map'>⟶</mo> |
890 | 1 | <mo intent='congruence'>≅</mo> |
891 | 1 | </mover> |
892 | 1 | <mi arg='b'>B</mi> |
893 | 1 | </mrow>"; |
894 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
895 | 1 | <pre data-intent-property=':prefix:' data-from-mathml='mrow'> |
896 | 1 | <in data-intent-property=':infix:' data-from-mathml='mrow'> |
897 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
898 | 1 | <mi data-from-mathml='mi'>x</mi> |
899 | 1 | </in> |
900 | 1 | </pre> |
901 | 1 | <post data-intent-property=':postfix:' data-from-mathml='mrow'> |
902 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
903 | 1 | </post> |
904 | 1 | </apply-function>"; |
905 | 1 | assert!(test_intent(mathml, intent, "Error")); |
906 | 1 | } |
907 | ||
908 | ||
909 | #[test] | |
910 | 1 | fn intent_double_indirect_head() { |
911 | 1 | let mathml = "<mrow intent='$m:prefix($c)($a,$b)'> |
912 | 1 | <mi arg='a'>A</mi> |
913 | 1 | <mover> |
914 | 1 | <mo movablelimits='false' arg='m' intent='map'>⟶</mo> |
915 | 1 | <mo arg='c' intent='congruence'>≅</mo> |
916 | 1 | </mover> |
917 | 1 | <mi arg='b'>B</mi> |
918 | 1 | </mrow>"; |
919 | 1 | let intent = "<apply-function data-from-mathml='mrow'> |
920 | 1 | <map data-intent-property=':prefix:' data-from-mathml='mrow'> |
921 | 1 | <mi data-from-mathml='mo'>congruence</mi> |
922 | 1 | </map> |
923 | 1 | <mi data-from-mathml='mi' arg='a'>A</mi> |
924 | 1 | <mi data-from-mathml='mi' arg='b'>B</mi> |
925 | 1 | </apply-function>"; |
926 | 1 | assert!(test_intent(mathml, intent, "Error")); |
927 | 1 | } |
928 | ||
929 | #[test] | |
930 | 1 | fn intent_missing_open() { |
931 | 1 | let mathml = "<mrow intent='$p $a,$f($b))'> |
932 | 1 | <mi arg='a'>a</mi> |
933 | 1 | <mo arg='p' intent='plus'>+</mo> |
934 | 1 | <mi arg='b'>b</mi> |
935 | 1 | <mo arg='f' intent='factorial'>!</mo> |
936 | 1 | </mrow>"; |
937 | 1 | let intent = "<plus> <mi arg='a'>a</mi> <factorial><mi arg='b'>b</mi></factorial> </plus>"; |
938 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
939 | 1 | } |
940 | ||
941 | #[test] | |
942 | 1 | fn intent_no_comma() { |
943 | 1 | let mathml = "<mrow intent='$p($a $f($b))'> |
944 | 1 | <mi arg='a'>a</mi> |
945 | 1 | <mo arg='p' intent='plus'>+</mo> |
946 | 1 | <mi arg='b'>b</mi> |
947 | 1 | <mo arg='f' intent='factorial'>!</mo> |
948 | 1 | </mrow>"; |
949 | 1 | let intent = "<plus> |
950 | 1 | <mrow> |
951 | 1 | <mi arg='a'>a</mi> |
952 | 1 | <factorial> <mi arg='b'>b</mi> </factorial> |
953 | 1 | </mrow> |
954 | 1 | </plus>"; |
955 | 1 | assert!(!test_intent(mathml, intent, "Error")); |
956 | 1 | } |
957 | ||
958 | #[test] | |
959 | 1 | fn intent_no_arg() { |
960 | 1 | let mathml = "<mrow intent='factorial()'> |
961 | 1 | <mi arg='a'>a</mi> |
962 | 1 | <mo arg='p' intent='plus'>+</mo> |
963 | 1 | <mi arg='b'>b</mi> |
964 | 1 | <mo arg='f' intent='factorial'>!</mo> |
965 | 1 | </mrow>"; |
966 | 1 | let target = "<factorial></factorial>"; |
967 | 1 | assert!(!test_intent(mathml, target, "Error")); |
968 | 1 | } |
969 | ||
970 | #[test] | |
971 | 1 | fn intent_illegal_no_arg() { |
972 | 1 | let mathml = "<mrow intent='factorial(()))'> |
973 | 1 | <mi arg='a'>a</mi> |
974 | 1 | <mo arg='p' intent='plus'>+</mo> |
975 | 1 | <mi arg='b'>b</mi> |
976 | 1 | <mo arg='f' intent='factorial'>!</mo> |
977 | 1 | </mrow>"; |
978 | 1 | let target = "<factorial></factorial>"; |
979 | 1 | assert!(!test_intent(mathml, target, "Error")); |
980 | 1 | } |
981 | ||
982 | #[test] | |
983 | 1 | fn intent_illegal_no_arg_ignore() { |
984 | 1 | let mathml = "<mrow intent='factorial()'> |
985 | 1 | <mi arg='a'>a</mi> |
986 | 1 | <mo arg='p' intent='plus'>+</mo> |
987 | 1 | <mi arg='b'>b</mi> |
988 | 1 | <mo arg='f' intent='factorial'>!</mo> |
989 | 1 | </mrow>"; |
990 | 1 | let target = "<mrow data-from-mathml='mrow' intent='factorial()'> |
991 | 1 | <mi data-from-mathml='mi' arg='a'>a</mi> |
992 | 1 | <mi data-from-mathml='mo'>plus</mi> |
993 | 1 | <mi data-from-mathml='mi' arg='b'>b</mi> |
994 | 1 | <mi data-from-mathml='mo'>factorial</mi> |
995 | 1 | </mrow>"; |
996 | 1 | assert!(test_intent(mathml, target, "IgnoreIntent")); |
997 | 1 | } |
998 | ||
999 | #[test] | |
1000 | 1 | fn intent_illegal_self_ref() { |
1001 | 1 | let mathml = "<mrow intent='foo:is-foolish:function($b)'> |
1002 | 1 | <mi intent='$b:int' arg='b'>b</mi> |
1003 | 1 | </mrow>"; |
1004 | 1 | let target = "<foo data-intent-property=':function:' data-intent-type='is-foolish'><mi data-intent-type='int'>b</mi></foo>"; |
1005 | 1 | assert!(!test_intent(mathml, target, "Error")); |
1006 | 1 | } |
1007 | ||
1008 | #[test] | |
1009 | 1 | fn infer_missing_second_arg() { |
1010 | 1 | let mathml = "<mrow intent='binomial($n,)'> |
1011 | 1 | <mo>(</mo> |
1012 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
1013 | 1 | <mo>)</mo> |
1014 | 1 | </mrow>"; |
1015 | 1 | let target = "<binomial data-intent-property='binomial($n,)'> \n |
1016 | 1 | <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </binomial>"; |
1017 | 1 | assert!(!test_intent(mathml, target, "Error")); |
1018 | 1 | } |
1019 | ||
1020 | #[test] | |
1021 | 1 | fn infer_missing_second_arg_ignore() { |
1022 | 1 | let mathml = "<mrow intent='binomial($n,)'> |
1023 | 1 | <mo>(</mo> |
1024 | 1 | <mfrac linethickness='0'> <mn arg='n'>7</mn> <mn arg='m'>3</mn> </mfrac> |
1025 | 1 | <mo>)</mo> |
1026 | 1 | </mrow>"; |
1027 | 1 | let target = "<mrow data-from-mathml='mrow' intent='binomial($n,)'> |
1028 | 1 | <mo data-from-mathml='mo'>(</mo> |
1029 | 1 | <fraction data-from-mathml='mfrac' linethickness='0'> <mn data-from-mathml='mn' arg='n'>7</mn> <mn data-from-mathml='mn' arg='m'>3</mn> </fraction> |
1030 | 1 | <mo data-from-mathml='mo'>)</mo> |
1031 | 1 | </mrow>"; |
1032 | 1 | assert!(test_intent(mathml, target, "IgnoreIntent")); |
1033 | 1 | } |
1034 | ||
1035 | #[test] | |
1036 | 1 | fn plane1_char_in_concept_name() { |
1037 | 1 | let mathml = "<math><mrow><mo intent='🐇'>🐇</mo><mi>X</mi></mrow></math>"; |
1038 | 1 | let intent = "<math data-from-mathml='math'> |
1039 | 1 | <mrow data-from-mathml='mrow'> |
1040 | 1 | <mi data-from-mathml='mo'>🐇</mi> |
1041 | 1 | <mi data-from-mathml='mi'>X</mi> |
1042 | 1 | </mrow> |
1043 | 1 | </math>"; |
1044 | 1 | assert!(test_intent(mathml, intent, "Error")); |
1045 | 1 | } |
1046 | } |
Line | Count | Source |
1 | //! The interface module provides functionality both for calling from an API and also running the code from `main`. | |
2 | //! | |
3 | #![allow(non_snake_case)] | |
4 | #![allow(clippy::needless_return)] | |
5 | use std::cell::RefCell; | |
6 | use std::sync::LazyLock; | |
7 | ||
8 | use crate::canonicalize::{as_text, create_mathml_element}; | |
9 | use crate::errors::*; | |
10 | use phf::phf_map; | |
11 | use regex::{Captures, Regex}; | |
12 | use sxd_document::dom::{Element, Document, ChildOfRoot, ChildOfElement, Attribute}; | |
13 | use sxd_document::parser; | |
14 | use sxd_document::Package; | |
15 | ||
16 | use crate::canonicalize::{as_element, name}; | |
17 | use crate::shim_filesystem::{find_all_dirs_shim, find_files_in_dir_that_ends_with_shim}; | |
18 | use log::{debug, error}; | |
19 | ||
20 | use crate::navigate::*; | |
21 | use crate::pretty_print::mml_to_string; | |
22 | use crate::xpath_functions::{is_leaf, IsNode}; | |
23 | use std::panic::{catch_unwind, AssertUnwindSafe}; | |
24 | ||
25 | /// Maximum depth to prevent stack overflow on deeply nested MathML | |
26 | pub const MAX_DEPTH: usize = 512; | |
27 | ||
28 | #[cfg(feature = "enable-logs")] | |
29 | use std::sync::Once; | |
30 | #[cfg(feature = "enable-logs")] | |
31 | static INIT: Once = Once::new(); | |
32 | ||
33 | 45.0k | fn enable_logs() { |
34 | #[cfg(feature = "enable-logs")] | |
35 | INIT.call_once(||{ | |
36 | #[cfg(target_os = "android")] | |
37 | { | |
38 | use log::*; | |
39 | use android_logger::*; | |
40 | ||
41 | android_logger::init_once( | |
42 | Config::default() | |
43 | .with_max_level(LevelFilter::Trace) | |
44 | .with_tag("MathCat") | |
45 | ); | |
46 | trace!("Activated Android logger!"); | |
47 | } | |
48 | }); | |
49 | 45.0k | } |
50 | ||
51 | // For getting a message from a panic | |
52 | thread_local! { | |
53 | // Stores (Message, File, Line) | |
54 | static PANIC_INFO: RefCell<Option<(String, String, u32)>> = const { RefCell::new(None) }; | |
55 | } | |
56 | ||
57 | /// Initialize the panic handler to catch panics and store the message, file, and line number in `PANIC_INFO`. | |
58 | 13.5k | pub fn init_panic_handler() { |
59 | use std::panic; | |
60 | ||
61 | 13.5k | panic::set_hook(Box::new(|info| |
62 | 1 | let location = info.location() |
63 | 1 | .map(|l| format!("{}:{}", l.file(), l.line())) |
64 | 1 | .unwrap_or_else(|| |
65 | ||
66 | 1 | let payload = info.payload(); |
67 | 1 | let msg = if let Some( |
68 | 0 | s.to_string() |
69 | 1 | } else if let Some(s) = payload.downcast_ref::<String>() { |
70 | 1 | s.clone() |
71 | } else { | |
72 | 0 | "Unknown panic payload".to_string() |
73 | }; | |
74 | ||
75 | // Use try_with/try_borrow_mut to ensure the hook never panics itself | |
76 | 1 | let _ = PANIC_INFO.try_with(|cell| { |
77 | 1 | if let Ok(mut slot) = cell.try_borrow_mut() { |
78 | 1 | *slot = Some((msg, location, 0)); |
79 | 1 |
|
80 | 1 | }); |
81 | 1 | })); |
82 | 13.5k | } |
83 | ||
84 | 41.1k | pub fn report_any_panic<T>(result: Result<Result<T, Error>, Box<dyn std::any::Any + Send>>) -> Result<T, Error> { |
85 | 41.1k | match result { |
86 | 41.1k | Ok(val) => val, |
87 | Err(_) => { | |
88 | // Retrieve the smuggled info | |
89 | 1 | let details = PANIC_INFO.with(|cell| cell.borrow_mut().take()); |
90 | ||
91 | 1 | if let Some((msg, file, line)) = details { |
92 | 1 | Err(anyhow::anyhow!( |
93 | 1 | "MathCAT crash! Please report the following information: '{}' at {}:{}", |
94 | 1 | msg, file, line |
95 | 1 | )) |
96 | } else { | |
97 | 0 | Err(anyhow::anyhow!("MathCAT crash! -- please report")) |
98 | } | |
99 | } | |
100 | } | |
101 | 41.1k | } |
102 | ||
103 | // wrap up some common functionality between the call from 'main' and AT | |
104 | 4.91k | fn cleanup_mathml(mathml: Element) -> Result<Element> { |
105 | 4.91k | trim_element(mathml, false); |
106 | 4.91k | let |
107 | 4.91k | let mathml = add_ids(mathml); |
108 | 4.91k | return Ok(mathml); |
109 | 4.91k | } |
110 | ||
111 | thread_local! { | |
112 | /// The current node being navigated (also spoken and brailled) is stored in `MATHML_INSTANCE`. | |
113 | pub static MATHML_INSTANCE: RefCell<Package> = init_mathml_instance(); | |
114 | } | |
115 | ||
116 | 3.92k | fn init_mathml_instance() -> RefCell<Package> { |
117 | 3.92k | let package = parser::parse("<math></math>") |
118 | 3.92k | .expect("Internal error in 'init_mathml_instance;: didn't parse initializer string"); |
119 | 3.92k | return RefCell::new(package); |
120 | 3.92k | } |
121 | ||
122 | /// Set the Rules directory | |
123 | /// IMPORTANT: this should be the very first call to MathCAT. If 'dir' is an empty string, the environment var 'MathCATRulesDir' is tried. | |
124 | 5.08k | pub fn set_rules_dir(dir: impl AsRef<str>) -> Result<()> { |
125 | 5.08k | enable_logs(); |
126 | 5.08k | init_panic_handler(); |
127 | 5.08k | let dir = dir.as_ref().to_string(); |
128 | 5.08k | let result = catch_unwind(AssertUnwindSafe(|| { |
129 | use std::path::PathBuf; | |
130 | 5.08k | let dir_os = if dir.is_empty() { |
131 | 0 | std::env::var_os("MathCATRulesDir").unwrap_or_default() |
132 | } else { | |
133 | 5.08k | std::ffi::OsString::from(&dir) |
134 | }; | |
135 | 5.08k | let pref_manager = crate::prefs::PreferenceManager::get(); |
136 | 5.08k | pref_manager.borrow_mut().initialize(PathBuf::from(dir_os)) |
137 | 5.08k | })); |
138 | 5.08k | return report_any_panic(result); |
139 | 5.08k | } |
140 | ||
141 | /// Returns the version number (from Cargo.toml) of the build | |
142 | 0 | pub fn get_version() -> String { |
143 | 0 | enable_logs(); |
144 | const VERSION: &str = env!("CARGO_PKG_VERSION"); | |
145 | 0 | return VERSION.to_string(); |
146 | 0 | } |
147 | ||
148 | /// This will override any previous MathML that was set. | |
149 | /// This returns canonical MathML with 'id's set on any node that doesn't have an id. | |
150 | /// The ids can be used for sync highlighting if the `Bookmark` API preference is true. | |
151 | 4.88k | pub fn set_mathml(mathml_str: impl AsRef<str>) -> Result<String> { |
152 | 4.88k | enable_logs(); |
153 | // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) | |
154 | 3 | static MATHJAX_V2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap()); |
155 | 3 | static MATHJAX_V3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap()); |
156 | ||
157 | // Strip out processing instructions and comments -- these are not MathML and can cause DOS problems in the parser | |
158 | 3 | static PROCESSING_INSTRUCTION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"<\?[\s\S]{1,2048}\?>"#).unwrap()); |
159 | 3 | static XML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"(?s)"#).unwrap()); |
160 | ||
161 | // These have some length limits to avoid DOS attacks via long strings | |
162 | 3 | static NAMESPACE_DECL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"xmlns:[[:alpha:]]{1,32}"#).unwrap()); |
163 | 3 | static PREFIX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"(</?)[[:alpha:]]{1,32}:"#).unwrap()); |
164 | 3 | static HTML_ENTITIES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"&([a-zA-Z]{2,10});"#).unwrap()); |
165 | 4.88k | let result = catch_unwind(AssertUnwindSafe(|| { |
166 | 4.88k | NAVIGATION_STATE.with(|nav_stack| { |
167 | 4.88k | nav_stack.borrow_mut().reset(); |
168 | 4.88k | }); |
169 | ||
170 | // We need the main definitions files to be read in so canonicalize can work. | |
171 | // This call reads all of them for the current preferences, but that's ok since they will likely be used | |
172 | 4.88k | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files()) |
173 | ||
174 | 4.88k | let mathml_str = mathml_str.as_ref(); |
175 | // Safety guard: Reject strings > 1MB to prevent DoS/Stack issues | |
176 | 4.88k | if mathml_str.len() > 1024 * 1024 { |
177 | 0 | bail!("MathML string of size {} bytes exceeds length limit of 1MB", mathml_str.len()); |
178 | 4.88k | } |
179 | ||
180 | 4.88k | return MATHML_INSTANCE.with(|old_package| { |
181 | static HTML_ENTITIES_MAPPING: phf::Map<&str, &str> = include!("entities.in"); | |
182 | ||
183 | 4.88k | let mut error_message = "".to_string(); // can't return a result inside the replace_all, so we do this hack of setting the message and then returning the error |
184 | ||
185 | 4.88k | let mathml_str = XML_COMMENT.replace_all(mathml_str, ""); |
186 | 4.88k | let mathml_str = PROCESSING_INSTRUCTION.replace_all(&mathml_str, ""); |
187 | // FIX: need to deal with character data and convert to something the parser knows | |
188 | 4.88k | let mathml_str = HTML_ENTITIES.replace_all(&mathml_str, |cap: &Captures| match |
189 | None => { | |
190 | 1 | error_message = format!("No entity named '{}'", &cap[0]); |
191 | 1 | cap[0].to_string() |
192 | } | |
193 | 95 | Some(&ch) => ch.to_string(), |
194 | 96 | }); |
195 | ||
196 | 4.88k | if !error_message.is_empty() { |
197 | // Clear stale state so subsequent API calls do not return previous user's data (security issue) | |
198 | 1 | old_package.replace(parser::parse("<math></math>").unwrap()); |
199 | 1 | bail!(error_message); |
200 | 4.88k | } |
201 | 4.88k | let mathml_str = MATHJAX_V2.replace_all(&mathml_str, ""); |
202 | 4.88k | let mathml_str = MATHJAX_V3.replace_all(&mathml_str, ""); |
203 | ||
204 | // the speech rules use the xpath "name" function and that includes the prefix | |
205 | // getting rid of the prefix properly probably involves a recursive replacement in the tree | |
206 | // if the prefix is used, it is almost certainly something like "m" or "mml", so this cheat will work. | |
207 | 4.88k | let mathml_str = NAMESPACE_DECL.replace(&mathml_str, "xmlns"); // do this before the PREFIX replace! |
208 | 4.88k | let mathml_str = PREFIX.replace_all(&mathml_str, "$1"); |
209 | ||
210 | 4.88k | let new_package = parser::parse(&mathml_str); |
211 | 4.88k | if let Err( |
212 | // Clear stale state so subsequent API calls do not return previous user's data (security issue) | |
213 | 1 | old_package.replace(parser::parse("<math></math>").unwrap()); |
214 | 1 | bail!("Invalid MathML input:\n{}\nError is: {}", &mathml_str, &e.to_string()); |
215 | 4.88k | } |
216 | ||
217 | 4.88k | let new_package = new_package.unwrap(); |
218 | 4.88k | let mathml = get_element(&new_package); |
219 | 4.88k | let |
220 | 4.88k | let mathml_string = mml_to_string(mathml); |
221 | 4.88k | old_package.replace(new_package); |
222 | ||
223 | 4.88k | return Ok(mathml_string); |
224 | 4.88k | }); |
225 | 4.88k | })); |
226 | ||
227 | 4.88k | return report_any_panic(result); |
228 | 4.88k | } |
229 | ||
230 | /// Get the spoken text of the MathML that was set. | |
231 | /// The speech takes into account any AT or user preferences. | |
232 | 3.46k | pub fn get_spoken_text() -> Result<String> { |
233 | 3.46k | enable_logs(); |
234 | 3.46k | let result = catch_unwind(AssertUnwindSafe(|| { |
235 | 3.46k | MATHML_INSTANCE.with(|package_instance| { |
236 | 3.46k | let package_instance = package_instance.borrow(); |
237 | 3.46k | let mathml = get_element(&package_instance); |
238 | 3.46k | let new_package = Package::new(); |
239 | 3.46k | let intent = crate::speech::intent_from_mathml(mathml, new_package.as_document()) |
240 | 3.46k | debug!("Intent tree:\n{}", |
241 | 3.46k | let speech = crate::speech::speak_mathml(intent, "", 0) |
242 | 3.46k | return Ok(speech); |
243 | 3.46k | }) |
244 | 3.46k | })); |
245 | 3.46k | return report_any_panic(result); |
246 | 3.46k | } |
247 | ||
248 | /// Get the spoken text for an overview of the MathML that was set. | |
249 | /// The speech takes into account any AT or user preferences. | |
250 | /// Note: this implementation for is currently minimal and should not be used. | |
251 | 0 | pub fn get_overview_text() -> Result<String> { |
252 | 0 | enable_logs(); |
253 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
254 | 0 | MATHML_INSTANCE.with(|package_instance| { |
255 | 0 | let package_instance = package_instance.borrow(); |
256 | 0 | let mathml = get_element(&package_instance); |
257 | 0 | let speech = crate::speech::overview_mathml(mathml, "", 0)?; |
258 | 0 | return Ok(speech); |
259 | 0 | }) |
260 | 0 | })); |
261 | 0 | return report_any_panic(result); |
262 | 0 | } |
263 | ||
264 | /// Get the value of the named preference. | |
265 | /// None is returned if `name` is not a known preference. | |
266 | 100 | pub fn get_preference(name: impl AsRef<str>) -> Result<String> { |
267 | 100 | enable_logs(); |
268 | 100 | let name = name.as_ref().to_string(); |
269 | 100 | let result = catch_unwind(AssertUnwindSafe(|| { |
270 | use crate::prefs::NO_PREFERENCE; | |
271 | 100 | crate::speech::SPEECH_RULES.with(|rules| { |
272 | 100 | let rules = rules.borrow(); |
273 | 100 | let pref_manager = rules.pref_manager.borrow(); |
274 | 100 | let mut value = pref_manager.pref_to_string(&name); |
275 | 100 | if value == NO_PREFERENCE { |
276 | 1 | value = pref_manager.pref_to_string(&name); |
277 | 99 | } |
278 | 100 | if value == NO_PREFERENCE { |
279 | 1 | bail!("No preference named '{}'", name); |
280 | } else { | |
281 | 99 | return Ok(value); |
282 | } | |
283 | 100 | }) |
284 | 100 | })); |
285 | 100 | return report_any_panic(result); |
286 | 100 | } |
287 | ||
288 | /// Set a MathCAT preference. The preference name should be a known preference name. | |
289 | /// The value should either be a string or a number (depending upon the preference being set) | |
290 | /// The list of known user preferences is in the MathCAT user documentation. | |
291 | /// Here are common preferences set by programs (not settable by the user): | |
292 | /// * TTS -- SSML, SAPI5, None | |
293 | /// * Pitch -- normalized at '1.0' | |
294 | /// * Rate -- words per minute (should match current speech rate). | |
295 | /// There is a separate "MathRate" that is user settable that causes a relative percentage change from this rate. | |
296 | /// * Volume -- default 100 | |
297 | /// * Voice -- set a voice to use (not implemented) | |
298 | /// * Gender -- set pick any voice of the given gender (not implemented) | |
299 | /// * Bookmark -- set to `true` if a `mark`/`bookmark` should be part of the returned speech (used for sync highlighting) | |
300 | /// | |
301 | /// Important: both the preference name and value are case-sensitive | |
302 | /// | |
303 | /// This function can be called multiple times to set different values. | |
304 | /// The values are persistent and extend beyond calls to [`set_mathml`]. | |
305 | /// A value can be overwritten by calling this function again with a different value. | |
306 | /// | |
307 | /// Be careful setting preferences -- these potentially override user settings, so only preferences that really need setting should be set. | |
308 | 17.7k | pub fn set_preference(name: impl AsRef<str>, value: impl AsRef<str>) -> Result<()> { |
309 | 17.7k | enable_logs(); |
310 | 17.7k | let name = name.as_ref().to_string(); |
311 | 17.7k | let value = value.as_ref().to_string(); |
312 | 17.7k | let result = catch_unwind(AssertUnwindSafe(|| { |
313 | 17.7k | set_preference_impl(&name, &value) |
314 | 17.7k | })); |
315 | 17.7k | return report_any_panic(result); |
316 | 17.7k | } |
317 | ||
318 | 17.7k | fn set_preference_impl(name: &str, value: &str) -> Result<()> { |
319 | 17.7k | let mut value = value.to_string(); |
320 | 17.7k | if name == "Language" || |
321 | // check the format | |
322 | 5.02k | if value != "Auto" { |
323 | // could get es, es-419, or en-us-nyc ... we only care about the first two parts so we clean it up a little | |
324 | 5.02k | let mut lang_country_split = value.split('-'); |
325 | 5.02k | let language = lang_country_split.next().unwrap_or(""); |
326 | 5.02k | let country = lang_country_split.next().unwrap_or(""); |
327 | 5.02k | if language.len() != 2 { |
328 | 0 | bail!( |
329 | "Improper format for 'Language' preference '{}'. Should be of form 'en' or 'en-gb'", | |
330 | value | |
331 | ); | |
332 | 5.02k | } |
333 | 5.02k | let mut new_lang_country = language.to_string(); // need a temp value because 'country' is borrowed from 'value' above |
334 | 5.02k | if !country.is_empty() { |
335 | 321 | new_lang_country.push('-'); |
336 | 321 | new_lang_country.push_str(country); |
337 | 4.70k | } |
338 | 5.02k | value = new_lang_country; |
339 | 0 | } |
340 | 5.02k | if name == "LanguageAuto" && |
341 | 0 | bail!("'LanguageAuto' can not have the value 'Auto'"); |
342 | 5.02k | } |
343 | 12.7k | } |
344 | ||
345 | 17.7k | crate::speech::SPEECH_RULES.with(|rules| { |
346 | 17.7k | let rules = rules.borrow_mut(); |
347 | 17.7k | if let Some( |
348 | 0 | bail!("{}", error_string); |
349 | 17.7k | } |
350 | ||
351 | // we set the value even if it was the same as the old value because this might override a potentially changed future user value | |
352 | 17.7k | let mut pref_manager = rules.pref_manager.borrow_mut(); |
353 | 17.7k | if name == "LanguageAuto" { |
354 | 0 | let language_pref = pref_manager.pref_to_string("Language"); |
355 | 0 | if language_pref != "Auto" { |
356 | 0 | bail!( |
357 | "'LanguageAuto' can only be used when 'Language' has the value 'Auto'; Language={}", | |
358 | language_pref | |
359 | ); | |
360 | 0 | } |
361 | 17.7k | } |
362 | 17.7k | let lower_case_value = value.to_lowercase(); |
363 | 17.7k | if lower_case_value == "true" || |
364 | 1.50k | pref_manager.set_api_boolean_pref(name, value.to_lowercase() == "true"); |
365 | 1.50k | } else { |
366 | 16.2k | match name { |
367 | 16.2k | "Pitch" | "Rate" | "Volume" | "CapitalLetters_Pitch" | "MathRate" | "PauseFactor" => { |
368 | 0 | pref_manager.set_api_float_pref(name, to_float(name, &value)?) |
369 | } | |
370 | _ => { | |
371 | 16.2k | pref_manager.set_string_pref(name, &value) |
372 | } | |
373 | } | |
374 | }; | |
375 | 17.7k | return Ok::<(), Error>(()); |
376 | 17.7k | }) |
377 | ||
378 | 17.7k | return Ok(()); |
379 | 17.7k | } |
380 | ||
381 | 0 | fn to_float(name: &str, value: &str) -> Result<f64> { |
382 | 0 | return match value.parse::<f64>() { |
383 | 0 | Ok(val) => Ok(val), |
384 | 0 | Err(_) => bail!("SetPreference: preference'{}'s value '{}' must be a float", name, value), |
385 | }; | |
386 | 0 | } |
387 | ||
388 | /// Get the braille associated with the MathML that was set by [`set_mathml`]. | |
389 | /// The braille returned depends upon the preference for the `code` preference (default `Nemeth`). | |
390 | /// If 'nav_node_id' is given, it is highlighted based on the value of `BrailleNavHighlight` (default: `EndPoints`) | |
391 | 1.36k | pub fn get_braille(nav_node_id: impl AsRef<str>) -> Result<String> { |
392 | 1.36k | enable_logs(); |
393 | 1.36k | let nav_node_id = nav_node_id.as_ref().to_string(); |
394 | 1.36k | let result = catch_unwind(AssertUnwindSafe(|| { |
395 | 1.36k | MATHML_INSTANCE.with(|package_instance| { |
396 | 1.36k | let package_instance = package_instance.borrow(); |
397 | 1.36k | let mathml = get_element(&package_instance); |
398 | 1.36k | let braille = crate::braille::braille_mathml(mathml, &nav_node_id) |
399 | 1.36k | return Ok(braille); |
400 | 1.36k | }) |
401 | 1.36k | })); |
402 | 1.36k | return report_any_panic(result); |
403 | 1.36k | } |
404 | ||
405 | /// Get the braille associated with the current navigation focus of the MathML that was set by [`set_mathml`]. | |
406 | /// The braille returned depends upon the preference for the `code` preference (default `Nemeth`). | |
407 | /// The returned braille is brailled as if the current navigation focus is the entire expression to be brailled. | |
408 | 0 | pub fn get_navigation_braille() -> Result<String> { |
409 | 0 | enable_logs(); |
410 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
411 | 0 | MATHML_INSTANCE.with(|package_instance| { |
412 | 0 | let package_instance = package_instance.borrow(); |
413 | 0 | let mathml = get_element(&package_instance); |
414 | 0 | let new_package = Package::new(); // used if we need to create a new tree |
415 | 0 | let new_doc = new_package.as_document(); |
416 | 0 | let nav_mathml = NAVIGATION_STATE.with(|nav_stack| { |
417 | 0 | return match nav_stack.borrow_mut().get_navigation_mathml(mathml) { |
418 | 0 | Err(e) => Err(e), |
419 | 0 | Ok((found, offset)) => { |
420 | // get the MathML node and wrap it inside of a <math> element | |
421 | // if the offset is given, we need to get the character it references | |
422 | 0 | if offset == 0 { |
423 | 0 | if name(found) == "math" { |
424 | 0 | Ok(found) |
425 | } else { | |
426 | 0 | let new_mathml = create_mathml_element(&new_doc, "math"); |
427 | 0 | new_mathml.append_child(copy_mathml(found)); |
428 | 0 | new_doc.root().append_child(new_mathml); |
429 | 0 | Ok(new_mathml) |
430 | } | |
431 | 0 | } else if !is_leaf(found) { |
432 | 0 | bail!( |
433 | "Internal error: non-zero offset '{}' on a non-leaf element '{}'", | |
434 | offset, | |
435 | 0 | name(found) |
436 | ); | |
437 | 0 | } else if let Some(ch) = as_text(found).chars().nth(offset) { |
438 | 0 | let internal_mathml = create_mathml_element(&new_doc, name(found)); |
439 | 0 | internal_mathml.set_text(&ch.to_string()); |
440 | 0 | let new_mathml = create_mathml_element(&new_doc, "math"); |
441 | 0 | new_mathml.append_child(internal_mathml); |
442 | 0 | new_doc.root().append_child(new_mathml); |
443 | 0 | Ok(new_mathml) |
444 | } else { | |
445 | 0 | bail!( |
446 | "Internal error: offset '{}' on leaf element '{}' doesn't exist", | |
447 | offset, | |
448 | 0 | mml_to_string(found) |
449 | ); | |
450 | } | |
451 | } | |
452 | }; | |
453 | 0 | })?; |
454 | ||
455 | 0 | let braille = crate::braille::braille_mathml(nav_mathml, "")?.0; |
456 | 0 | return Ok(braille); |
457 | 0 | }) |
458 | 0 | })); |
459 | 0 | return report_any_panic(result); |
460 | 0 | } |
461 | ||
462 | /// Given a key code along with the modifier keys, the current node is moved accordingly (or value reported in some cases). | |
463 | /// `key` is the [keycode](https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent/keyCode#constants_for_keycode_value) for the key (in JavaScript, `ev.key_code`) | |
464 | /// The spoken text for the new current node is returned. | |
465 | 0 | pub fn do_navigate_keypress( |
466 | 0 | key: usize, |
467 | 0 | shift_key: bool, |
468 | 0 | control_key: bool, |
469 | 0 | alt_key: bool, |
470 | 0 | meta_key: bool, |
471 | 0 | ) -> Result<String> { |
472 | 0 | enable_logs(); |
473 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
474 | 0 | MATHML_INSTANCE.with(|package_instance| { |
475 | 0 | let package_instance = package_instance.borrow(); |
476 | 0 | let mathml = get_element(&package_instance); |
477 | 0 | return do_mathml_navigate_key_press(mathml, key, shift_key, control_key, alt_key, meta_key); |
478 | 0 | }) |
479 | 0 | })); |
480 | 0 | return report_any_panic(result); |
481 | 0 | } |
482 | ||
483 | /// Given a navigation command, the current node is moved accordingly. | |
484 | /// This is a higher level interface than `do_navigate_keypress` for applications that want to interpret the keys themselves. | |
485 | /// The valid commands are: | |
486 | /// * Standard move commands: | |
487 | /// `MovePrevious`, `MoveNext`, `MoveStart`, `MoveEnd`, `MoveLineStart`, `MoveLineEnd` | |
488 | /// * Movement in a table or elementary math: | |
489 | /// `MoveCellPrevious`, `MoveCellNext`, `MoveCellUp`, `MoveCellDown`, `MoveColumnStart`, `MoveColumnEnd` | |
490 | /// * Moving into children or out to parents: | |
491 | /// `ZoomIn`, `ZoomOut`, `ZoomOutAll`, `ZoomInAll` | |
492 | /// * Undo the last movement command: | |
493 | /// `MoveLastLocation` | |
494 | /// * Read commands (standard speech): | |
495 | /// `ReadPrevious`, `ReadNext`, `ReadCurrent`, `ReadCellCurrent`, `ReadStart`, `ReadEnd`, `ReadLineStart`, `ReadLineEnd` | |
496 | /// * Describe commands (overview): | |
497 | /// `DescribePrevious`, `DescribeNext`, `DescribeCurrent` | |
498 | /// * Location information: | |
499 | /// `WhereAmI`, `WhereAmIAll` | |
500 | /// * Change navigation modes (circle up/down): | |
501 | /// `ToggleZoomLockUp`, `ToggleZoomLockDown` | |
502 | /// * Speak the current navigation mode | |
503 | /// `ToggleSpeakMode` | |
504 | /// | |
505 | /// There are 10 place markers that can be set/read/described or moved to. | |
506 | /// * Setting: | |
507 | /// `SetPlacemarker0`, `SetPlacemarker1`, `SetPlacemarker2`, `SetPlacemarker3`, `SetPlacemarker4`, `SetPlacemarker5`, `SetPlacemarker6`, `SetPlacemarker7`, `SetPlacemarker8`, `SetPlacemarker9` | |
508 | /// * Reading: | |
509 | /// `Read0`, `Read1`, `Read2`, `Read3`, `Read4`, `Read5`, `Read6`, `Read7`, `Read8`, `Read9` | |
510 | /// * Describing: | |
511 | /// `Describe0`, `Describe1`, `Describe2`, `Describe3`, `Describe4`, `Describe5`, `Describe6`, `Describe7`, `Describe8`, `Describe9` | |
512 | /// * Moving: | |
513 | /// `MoveTo0`, `MoveTo1`, `MoveTo2`, `MoveTo3`, `MoveTo4`, `MoveTo5`, `MoveTo6`, `MoveTo7`, `MoveTo8`, `MoveTo9` | |
514 | /// | |
515 | /// When done with Navigation, call with `Exit` | |
516 | 0 | pub fn do_navigate_command(command: impl AsRef<str>) -> Result<String> { |
517 | 0 | enable_logs(); |
518 | 0 | let command = command.as_ref().to_string(); |
519 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
520 | 0 | let cmd = NAV_COMMANDS.get_key(&command); // gets a &'static version of the command |
521 | 0 | if cmd.is_none() { |
522 | 0 | bail!("Unknown command in call to DoNavigateCommand()"); |
523 | 0 | }; |
524 | 0 | let cmd = *cmd.unwrap(); |
525 | 0 | MATHML_INSTANCE.with(|package_instance| { |
526 | 0 | let package_instance = package_instance.borrow(); |
527 | 0 | let mathml = get_element(&package_instance); |
528 | 0 | return do_navigate_command_string(mathml, cmd); |
529 | 0 | }) |
530 | 0 | })); |
531 | 0 | return report_any_panic(result); |
532 | 0 | } |
533 | ||
534 | /// Given an 'id' and an offset (for tokens), set the navigation node to that id. | |
535 | /// An error is returned if the 'id' doesn't exist | |
536 | 2 | pub fn set_navigation_node(id: impl AsRef<str>, offset: usize) -> Result<()> { |
537 | 2 | enable_logs(); |
538 | 2 | let id = id.as_ref().to_string(); |
539 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
540 | 2 | MATHML_INSTANCE.with(|package_instance| { |
541 | 2 | let package_instance = package_instance.borrow(); |
542 | 2 | let mathml = get_element(&package_instance); |
543 | 2 | return set_navigation_node_from_id(mathml, &id, offset); |
544 | 2 | }) |
545 | 2 | })); |
546 | 2 | return report_any_panic(result); |
547 | 2 | } |
548 | ||
549 | /// Return the MathML associated with the current (navigation) node and the offset (0-based) from that mathml (not yet implemented) | |
550 | /// The offset is needed for token elements that have multiple characters. | |
551 | 0 | pub fn get_navigation_mathml() -> Result<(String, usize)> { |
552 | 0 | enable_logs(); |
553 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
554 | 0 | MATHML_INSTANCE.with(|package_instance| { |
555 | 0 | let package_instance = package_instance.borrow(); |
556 | 0 | let mathml = get_element(&package_instance); |
557 | 0 | return NAVIGATION_STATE.with(|nav_stack| { |
558 | 0 | return match nav_stack.borrow_mut().get_navigation_mathml(mathml) { |
559 | 0 | Err(e) => Err(e), |
560 | 0 | Ok((found, offset)) => Ok((mml_to_string(found), offset)), |
561 | }; | |
562 | 0 | }); |
563 | 0 | }) |
564 | 0 | })); |
565 | 0 | return report_any_panic(result); |
566 | 0 | } |
567 | ||
568 | /// Return the `id` and `offset` (0-based) associated with the current (navigation) node. | |
569 | /// `offset` (not yet implemented) | |
570 | /// The offset is needed for token elements that have multiple characters. | |
571 | 2 | pub fn get_navigation_mathml_id() -> Result<(String, usize)> { |
572 | 2 | enable_logs(); |
573 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
574 | 2 | MATHML_INSTANCE.with(|package_instance| { |
575 | 2 | let package_instance = package_instance.borrow(); |
576 | 2 | let mathml = get_element(&package_instance); |
577 | 2 | return Ok(NAVIGATION_STATE.with(|nav_stack| { |
578 | 2 | return nav_stack.borrow().get_navigation_mathml_id(mathml); |
579 | 2 | })); |
580 | 2 | }) |
581 | 2 | })); |
582 | 2 | return report_any_panic(result); |
583 | 2 | } |
584 | ||
585 | /// Return the start and end braille character positions associated with the current (navigation) node. | |
586 | 2 | pub fn get_braille_position() -> Result<(usize, usize)> { |
587 | 2 | enable_logs(); |
588 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
589 | 2 | MATHML_INSTANCE.with(|package_instance| { |
590 | 2 | let package_instance = package_instance.borrow(); |
591 | 2 | let mathml = get_element(&package_instance); |
592 | 2 | let nav_node = get_navigation_mathml_id() |
593 | 2 | let (_, start, end) = crate::braille::braille_mathml(mathml, &nav_node.0) |
594 | 2 | return Ok((start, end)); |
595 | 2 | }) |
596 | 2 | })); |
597 | 2 | return report_any_panic(result); |
598 | 2 | } |
599 | ||
600 | /// Given a 0-based braille position, return the smallest MathML node enclosing it. | |
601 | /// This node might be a leaf with an offset. | |
602 | 91 | pub fn get_navigation_node_from_braille_position(position: usize) -> Result<(String, usize)> { |
603 | 91 | enable_logs(); |
604 | 91 | let result = catch_unwind(AssertUnwindSafe(|| { |
605 | 91 | MATHML_INSTANCE.with(|package_instance| { |
606 | 91 | let package_instance = package_instance.borrow(); |
607 | 91 | let mathml = get_element(&package_instance); |
608 | 91 | return crate::braille::get_navigation_node_from_braille_position(mathml, position); |
609 | 91 | }) |
610 | 91 | })); |
611 | 91 | return report_any_panic(result); |
612 | 91 | } |
613 | ||
614 | 0 | pub fn get_supported_braille_codes() -> Result<Vec<String>> { |
615 | 0 | enable_logs(); |
616 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
617 | 0 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
618 | 0 | let braille_dir = rules_dir.join("Braille"); |
619 | 0 | let mut braille_code_paths = Vec::new(); |
620 | ||
621 | 0 | find_all_dirs_shim(&braille_dir, &mut braille_code_paths); |
622 | 0 | let mut braille_code_paths = braille_code_paths.iter() |
623 | 0 | .map(|path| path.strip_prefix(&braille_dir).unwrap().to_string_lossy().to_string()) |
624 | 0 | .filter(|string_path| !string_path.is_empty() ) |
625 | 0 | .collect::<Vec<String>>(); |
626 | 0 | braille_code_paths.sort(); |
627 | ||
628 | 0 | Ok(braille_code_paths) |
629 | 0 | })); |
630 | 0 | return report_any_panic(result); |
631 | 0 | } |
632 | ||
633 | /// Returns a Vec of all supported languages ("en", "es", ...) | |
634 | 1 | pub fn get_supported_languages() -> Result<Vec<String>> { |
635 | 1 | enable_logs(); |
636 | 1 | let result = catch_unwind(AssertUnwindSafe(|| { |
637 | 1 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
638 | 1 | let lang_dir = rules_dir.join("Languages"); |
639 | 1 | let mut lang_paths = Vec::new(); |
640 | ||
641 | 1 | find_all_dirs_shim(&lang_dir, &mut lang_paths); |
642 | 1 | let mut language_paths = lang_paths.iter() |
643 | 13 | . |
644 | 13 | .to_string_lossy() |
645 | 13 | .replace(std::path::MAIN_SEPARATOR, "-") |
646 | 13 | .to_string()) |
647 | 13 | . |
648 | 1 | .collect::<Vec<String>>(); |
649 | ||
650 | // make sure the 'zz' test dir isn't included (build.rs removes it, but for debugging is there) | |
651 | 13 |
|
652 | 1 | language_paths.sort(); |
653 | 1 | Ok(language_paths) |
654 | 1 | })); |
655 | 1 | return report_any_panic(result); |
656 | 1 | } |
657 | ||
658 | 0 | pub fn get_supported_speech_styles(lang: impl AsRef<str>) -> Result<Vec<String>> { |
659 | 0 | enable_logs(); |
660 | 0 | let lang = lang.as_ref().to_string(); |
661 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
662 | 0 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
663 | 0 | let lang_dir = rules_dir.join("Languages").join(&lang); |
664 | 0 | let mut speech_styles = find_files_in_dir_that_ends_with_shim(&lang_dir, "_Rules.yaml"); |
665 | 0 | for file_name in &mut speech_styles { |
666 | 0 | file_name.truncate(file_name.len() - "_Rules.yaml".len()) |
667 | } | |
668 | 0 | speech_styles.sort(); |
669 | 0 | speech_styles.dedup(); // remove duplicates -- shouldn't be any, but just in case |
670 | 0 | Ok(speech_styles) |
671 | 0 | })); |
672 | 0 | return report_any_panic(result); |
673 | 0 | } |
674 | ||
675 | // utility functions | |
676 | ||
677 | /// Copy (recursively) the (MathML) element and return the new one. | |
678 | /// The Element type does not copy and modifying the structure of an element's child will modify the element, so we need a copy | |
679 | /// Convert the returned error from set_mathml, etc., to a useful string for display | |
680 | 363 | pub fn copy_mathml(mathml: Element) -> Element { |
681 | 363 | return copy_mathml_recursive(mathml, 0); |
682 | 363 | } |
683 | ||
684 | 4.53k | fn copy_mathml_recursive(mathml: Element, depth: usize) -> Element { |
685 | // Safety: Prevent stack overflow on deeply nested MathML | |
686 | 4.53k | if depth > MAX_DEPTH { |
687 | // Return the element as a leaf if it's too deep to prevent crash | |
688 | 0 | return create_mathml_element(&mathml.document(), name(mathml)); |
689 | 4.53k | } |
690 | ||
691 | // If it represents MathML, the 'Element' can only have Text and Element children along with attributes | |
692 | 4.53k | let children = mathml.children(); |
693 | 4.53k | let new_mathml = create_mathml_element(&mathml.document(), name(mathml)); |
694 | 9.52k |
|
695 | 9.52k | new_mathml.set_attribute_value(attr.name(), attr.value()); |
696 | 9.52k | }); |
697 | ||
698 | // can't use is_leaf/as_text because this is also used with the intent tree | |
699 | 4.53k | if children.len() == 1 && |
700 | 3.26k | let Some( |
701 | 2.59k | new_mathml.set_text(text.text()); |
702 | 2.59k | return new_mathml; |
703 | 1.93k | } |
704 | ||
705 | 1.93k | let mut new_children = Vec::with_capacity(children.len()); |
706 | 4.17k | for child in |
707 | 4.17k | let child = as_element(child); |
708 | 4.17k | let new_child = copy_mathml_recursive(child, depth + 1); |
709 | 4.17k | new_children.push(new_child); |
710 | 4.17k | } |
711 | 1.93k | new_mathml.append_children(new_children); |
712 | 1.93k | return new_mathml; |
713 | 4.53k | } |
714 | ||
715 | 0 | pub fn errors_to_string(e: &Error) -> String { |
716 | 0 | enable_logs(); |
717 | 0 | let mut result = format!("{e}\n"); |
718 | 0 | for cause in e.chain().skip(1) { // skips original error |
719 | 0 | result += &format!("caused by: {cause}\n"); |
720 | 0 | } |
721 | 0 | result |
722 | 0 | } |
723 | ||
724 | 4.91k | fn add_ids(mathml: Element) -> Element { |
725 | use std::time::SystemTime; | |
726 | 4.91k | let time = if cfg!(target_family = "wasm") { |
727 | 0 | fastrand::usize(..) |
728 | } else { | |
729 | 4.91k | SystemTime::now() |
730 | 4.91k | .duration_since(SystemTime::UNIX_EPOCH) |
731 | 4.91k | .unwrap() |
732 | 4.91k | .as_millis() as usize |
733 | }; | |
734 | 4.91k | let mut time_part = radix_fmt::radix(time, 36).to_string(); |
735 | 4.91k | if time_part.len() < 3 { |
736 | 0 | time_part.push_str("a2c"); // needs to be at least three chars |
737 | 4.91k | } |
738 | 4.91k | let mut random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string(); |
739 | 4.91k | if random_part.len() < 4 { |
740 | 0 | random_part.push_str("a1b2"); // needs to be at least four chars |
741 | 4.91k | } |
742 | 4.91k | let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] + "-"; // begin with letter |
743 | 4.91k | add_ids_to_all(mathml, &prefix, 0); |
744 | 4.91k | return mathml; |
745 | ||
746 | 57.8k | fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize { |
747 | 57.8k | let mut count = count; |
748 | 57.8k | if mathml.attribute("id").is_none() { |
749 | 57.1k | mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str()); |
750 | 57.1k | mathml.set_attribute_value("data-id-added", "true"); |
751 | 57.1k | count += 1; |
752 | 57.1k |
|
753 | ||
754 | 57.8k | if crate::xpath_functions::is_leaf(mathml) { |
755 | 35.8k | return count; |
756 | 22.0k | } |
757 | ||
758 | 52.9k | for child in |
759 | 52.9k | let child = as_element(child); |
760 | 52.9k | count = add_ids_to_all(child, id_prefix, count); |
761 | 52.9k | } |
762 | 22.0k | return count; |
763 | 57.8k | } |
764 | 4.91k | } |
765 | ||
766 | 10.3k | pub fn get_element(package: &Package) -> Element<'_> { |
767 | 10.3k | enable_logs(); |
768 | 10.3k | let doc = package.as_document(); |
769 | 10.3k | let mut result = None; |
770 | 10.3k | for root_child in doc.root().children() { |
771 | 10.3k | if let ChildOfRoot::Element(e) = root_child { |
772 | 10.3k | assert!(result.is_none()); |
773 | 10.3k | result = Some(e); |
774 | 0 | } |
775 | } | |
776 | 10.3k | return result.unwrap(); |
777 | 10.3k | } |
778 | ||
779 | /// Get the intent after setting the MathML | |
780 | /// Used in testing | |
781 | #[allow(dead_code)] | |
782 | 32 | pub fn get_intent<'a>(mathml: Element<'a>, doc: Document<'a>) -> Result<Element<'a>> { |
783 | 32 | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); |
784 | 32 | let mathml = cleanup_mathml(mathml) |
785 | 32 | return crate::speech::intent_from_mathml(mathml, doc); |
786 | 32 | } |
787 | ||
788 | #[allow(dead_code)] | |
789 | 22 | fn trim_doc(doc: &Document) { |
790 | 22 | for root_child in doc.root().children() { |
791 | 22 | if let ChildOfRoot::Element(e) = root_child { |
792 | 22 | trim_element(e, false); |
793 | 22 | } else { |
794 | 0 | doc.root().remove_child(root_child); // comment or processing instruction |
795 | 0 | } |
796 | } | |
797 | 22 | } |
798 | ||
799 | /// Not really meant to be public -- used by tests in some packages | |
800 | 55.5k | pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { |
801 | // "<mtext>this is text</mtext" results in 3 text children | |
802 | // these are combined into one child as it makes code downstream simpler | |
803 | ||
804 | // space, tab, newline, carriage return all get collapsed to a single space | |
805 | const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}','\u{000C}', '\u{000D}']; | |
806 | 3 | static WHITESPACE_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap()); |
807 | ||
808 | 55.5k | if is_leaf(e) && ( |
809 | // Assume it is HTML inside of the leaf -- turn the HTML into a string | |
810 | 34.7k | make_leaf_element(e); |
811 | 34.7k | return; |
812 | 20.7k | } |
813 | ||
814 | 20.7k | let mut single_text = "".to_string(); |
815 | 87.6k | for child in |
816 | 87.6k | match child { |
817 | 50.1k | ChildOfElement::Element(c) => { |
818 | 50.1k | trim_element(c, allow_structure_in_leaves); |
819 | 50.1k | } |
820 | 37.4k | ChildOfElement::Text(t) => { |
821 | 37.4k | single_text += t.text(); |
822 | 37.4k | e.remove_child(child); |
823 | 37.4k | } |
824 | 21 | _ => { |
825 | 21 | e.remove_child(child); |
826 | 21 | } |
827 | } | |
828 | } | |
829 | ||
830 | // CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace | |
831 | 20.7k | if !(is_leaf(e) || name(e) == "intent-literal" || single_text.is_empty()) { |
832 | // intent-literal comes from testing intent | |
833 | // FIX: we have a problem -- what should happen??? | |
834 | // FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash | |
835 | 14.3k | if !single_text.trim_matches(WHITESPACE).is_empty() { |
836 | 20 | error!( |
837 | "trim_element: both element and textual children which shouldn't happen -- ignoring text '{single_text}'" | |
838 | ); | |
839 | 14.2k | } |
840 | 14.3k | return; |
841 | 6.44k | } |
842 | 6.44k | if e.children().is_empty() && |
843 | 0 | // debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text); |
844 | 0 | e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " ")); |
845 | 6.44k | } |
846 | ||
847 | 34.7k | fn make_leaf_element(mathml_leaf: Element) { |
848 | // MathML leaves like <mn> really shouldn't have non-textual content, but you could have embedded HTML | |
849 | // Here, we convert them to leaves by grabbing up all the text and making that the content | |
850 | // Potentially, we leave them and let (default) rules do something, but it makes other parts of the code | |
851 | // messier because checking the text of a leaf becomes Option<&str> rather than just &str | |
852 | 34.7k | let children = mathml_leaf.children(); |
853 | 34.7k | if children.is_empty() { |
854 | 503 | return; |
855 | 34.2k | } |
856 | ||
857 | 34.2k | if rewrite_and_flatten_embedded_mathml(mathml_leaf) { |
858 | 1 | return; |
859 | 34.2k | } |
860 | ||
861 | // gather up the text | |
862 | 34.2k | let mut text = "".to_string(); |
863 | 34.6k | for child in |
864 | 34.6k | let child_text = match child { |
865 | 8 | ChildOfElement::Element(child) => { |
866 | 8 | if name(child) == "mglyph" { |
867 | 3 | child.attribute_value("alt").unwrap_or("").to_string() |
868 | } else { | |
869 | 5 | gather_text(child) |
870 | } | |
871 | } | |
872 | 34.4k | ChildOfElement::Text(t) => { |
873 | // debug!("ChildOfElement::Text: '{}'", t.text()); | |
874 | 34.4k | t.text().to_string() |
875 | } | |
876 | 222 | _ => "".to_string(), |
877 | }; | |
878 | 34.6k | if !child_text.is_empty() { |
879 | 34.4k | text += &child_text; |
880 | 34.4k |
|
881 | } | |
882 | ||
883 | // get rid of the old children and replace with the text we just built | |
884 | 34.2k | mathml_leaf.clear_children(); |
885 | 34.2k | mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE)); |
886 | // debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf)); | |
887 | ||
888 | /// gather up all the contents of the element and return them with a leading space | |
889 | 7 | fn gather_text(html: Element) -> String { |
890 | 7 | let mut text = "".to_string(); // since we are throwing out the element tag, add a space between the contents |
891 | 7 | for child in html.children() { |
892 | 7 | match child { |
893 | 2 | ChildOfElement::Element(child) => { |
894 | 2 | text += &gather_text(child); |
895 | 2 | } |
896 | 5 | ChildOfElement::Text(t) => text += t.text(), |
897 | 0 | _ => (), |
898 | } | |
899 | } | |
900 | // debug!("gather_text: '{}'", text); | |
901 | 7 | return text; |
902 | 7 | } |
903 | 34.7k | } |
904 | ||
905 | 34.2k | fn rewrite_and_flatten_embedded_mathml(mathml_leaf: Element) -> bool { |
906 | // first see if it can or needs to be rewritten | |
907 | // this is likely rare, so we do a check and if true, to a second pass building the result | |
908 | 34.2k | let mut needs_rewrite = false; |
909 | 34.6k | for child in |
910 | 34.6k | if let Some( |
911 | 8 | if name(element) != "math" { |
912 | 7 | return false; // something other than MathML as a child -- can't rewrite |
913 | 1 | } |
914 | 1 | needs_rewrite = true; |
915 | 34.6k | } |
916 | }; | |
917 | ||
918 | 34.2k | if !needs_rewrite { |
919 | 34.2k | return false; |
920 | 1 | } |
921 | ||
922 | // now do the rewrite, flatting out the mathml and returning an mrow with the children | |
923 | 1 | let leaf_name = name(mathml_leaf); |
924 | 1 | let doc = mathml_leaf.document(); |
925 | 1 | let mut new_children = Vec::new(); |
926 | 1 | let mut is_last_mtext = false; |
927 | 5 | for child in |
928 | 5 | if let Some( |
929 | 1 | trim_element(element, true); |
930 | 1 | new_children.append(&mut element.children()); // don't want 'math' wrapper |
931 | 1 | is_last_mtext = false; |
932 | 4 | } else if let Some(text) = child.text() { |
933 | // combine adjacent text nodes into single nodes | |
934 | 4 | if is_last_mtext { |
935 | 2 | let last_child = new_children.last_mut().unwrap().element().unwrap(); |
936 | 2 | let new_text = as_text(last_child).to_string() + text.text(); |
937 | 2 | last_child.set_text(&new_text); |
938 | 2 | } else { |
939 | 2 | let new_leaf_node = create_mathml_element(&doc, leaf_name); |
940 | 2 | new_leaf_node.set_text(text.text()); |
941 | 2 | new_children.push(ChildOfElement::Element(new_leaf_node)); |
942 | 2 | is_last_mtext = true; |
943 | 2 | } |
944 | 0 | } |
945 | }; | |
946 | ||
947 | // clean up whitespace in text nodes | |
948 | 3 | for child in |
949 | 3 | if let Some(element) = child.element() && is_leaf(element) { |
950 | 2 | let text = as_text(element); |
951 | 2 | let cleaned_text = WHITESPACE_MATCH.replace_all(text, " ").trim_matches(WHITESPACE).to_string(); |
952 | 2 | element.set_text(&cleaned_text); |
953 | 2 |
|
954 | } | |
955 | ||
956 | 1 | crate::canonicalize::set_mathml_name(mathml_leaf, "mrow"); |
957 | 1 | mathml_leaf.clear_children(); |
958 | 1 | mathml_leaf.append_children(new_children); |
959 | ||
960 | // debug!("rewrite_and_flatten_embedded_mathml: flattened\n'{}'", mml_to_string(mathml_leaf)); | |
961 | 1 | return true; |
962 | 34.2k | } |
963 | 55.5k | } |
964 | ||
965 | // used for testing trim | |
966 | /// returns Ok() if two Documents are equal or some info where they differ in the Err | |
967 | #[allow(dead_code)] | |
968 | 11 | fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> { |
969 | // assume 'e' doesn't have element children until proven otherwise | |
970 | // this means we keep Text children until we are proven they aren't needed | |
971 | 11 | if doc1.root().children().len() != doc2.root().children().len() { |
972 | 0 | bail!( |
973 | "Children of docs have {} != {} children", | |
974 | 0 | doc1.root().children().len(), |
975 | 0 | doc2.root().children().len() |
976 | ); | |
977 | 11 | } |
978 | ||
979 | 11 | for (i, (c1, c2)) in doc1 |
980 | 11 | .root() |
981 | 11 | .children() |
982 | 11 | .iter() |
983 | 11 | .zip(doc2.root().children().iter()) |
984 | 11 | .enumerate() |
985 | { | |
986 | 11 | match c1 { |
987 | 11 | ChildOfRoot::Element(e1) => { |
988 | 11 | if let ChildOfRoot::Element(e2) = c2 { |
989 | 11 | is_same_element(*e1, *e2, &[]) |
990 | } else { | |
991 | 0 | bail!("child #{}, first is element, second is something else", i); |
992 | } | |
993 | } | |
994 | 0 | ChildOfRoot::Comment(com1) => { |
995 | 0 | if let ChildOfRoot::Comment(com2) = c2 { |
996 | 0 | if com1.text() != com2.text() { |
997 | 0 | bail!("child #{} -- comment text differs", i); |
998 | 0 | } |
999 | } else { | |
1000 | 0 | bail!("child #{}, first is comment, second is something else", i); |
1001 | } | |
1002 | } | |
1003 | 0 | ChildOfRoot::ProcessingInstruction(p1) => { |
1004 | 0 | if let ChildOfRoot::ProcessingInstruction(p2) = c2 { |
1005 | 0 | if p1.target() != p2.target() || p1.value() != p2.value() { |
1006 | 0 | bail!("child #{} -- processing instruction differs", i); |
1007 | 0 | } |
1008 | } else { | |
1009 | 0 | bail!( |
1010 | "child #{}, first is processing instruction, second is something else", | |
1011 | i | |
1012 | ); | |
1013 | } | |
1014 | } | |
1015 | } | |
1016 | } | |
1017 | 10 | return Ok(()); |
1018 | 11 | } |
1019 | ||
1020 | /// returns Ok() if two Documents are equal or some info where they differ in the Err | |
1021 | // Not really meant to be public -- used by tests in some packages | |
1022 | #[allow(dead_code)] | |
1023 | 1.92k | pub fn is_same_element(e1: Element, e2: Element, ignore_attrs: &[&str]) -> Result<()> { |
1024 | 1.92k | enable_logs(); |
1025 | 1.92k | if name(e1) != name(e2) { |
1026 | 0 | bail!("Names not the same: {}, {}", name(e1), name(e2)); |
1027 | 1.92k | } |
1028 | ||
1029 | // assume 'e' doesn't have element children until proven otherwise | |
1030 | // this means we keep Text children until we are proven they aren't needed | |
1031 | 1.92k | if e1.children().len() != e2.children().len() { |
1032 | 0 | bail!( |
1033 | "Children of {} have {} != {} children", | |
1034 | 0 | name(e1), |
1035 | 0 | e1.children().len(), |
1036 | 0 | e2.children().len() |
1037 | ); | |
1038 | 1.92k | } |
1039 | ||
1040 | 1.92k | if let Err( |
1041 | 0 | bail!("In element {}, {}", name(e1), e); |
1042 | 1.92k | } |
1043 | ||
1044 | 2.86k | for (i, (c1, c2)) in |
1045 | 2.86k | match c1 { |
1046 | 1.72k | ChildOfElement::Element(child1) => { |
1047 | 1.72k | if let ChildOfElement::Element(child2) = c2 { |
1048 | 1.72k | is_same_element(*child1, *child2, ignore_attrs) |
1049 | } else { | |
1050 | 0 | bail!("{} child #{}, first is element, second is something else", name(e1), i); |
1051 | } | |
1052 | } | |
1053 | 0 | ChildOfElement::Comment(com1) => { |
1054 | 0 | if let ChildOfElement::Comment(com2) = c2 { |
1055 | 0 | if com1.text() != com2.text() { |
1056 | 0 | bail!("{} child #{} -- comment text differs", name(e1), i); |
1057 | 0 | } |
1058 | } else { | |
1059 | 0 | bail!("{} child #{}, first is comment, second is something else", name(e1), i); |
1060 | } | |
1061 | } | |
1062 | 0 | ChildOfElement::ProcessingInstruction(p1) => { |
1063 | 0 | if let ChildOfElement::ProcessingInstruction(p2) = c2 { |
1064 | 0 | if p1.target() != p2.target() || p1.value() != p2.value() { |
1065 | 0 | bail!("{} child #{} -- processing instruction differs", name(e1), i); |
1066 | 0 | } |
1067 | } else { | |
1068 | 0 | bail!( |
1069 | "{} child #{}, first is processing instruction, second is something else", | |
1070 | 0 | name(e1), |
1071 | i | |
1072 | ); | |
1073 | } | |
1074 | } | |
1075 | 1.14k | ChildOfElement::Text(t1) => { |
1076 | 1.14k | if let ChildOfElement::Text(t2) = c2 { |
1077 | 1.14k | if t1.text() != t2.text() { |
1078 | 1 | bail!("{} child #{} -- text differs", name(e1), i); |
1079 | 1.14k | } |
1080 | } else { | |
1081 | 0 | bail!("{} child #{}, first is text, second is something else", name(e1), i); |
1082 | } | |
1083 | } | |
1084 | } | |
1085 | } | |
1086 | 1.91k | return Ok(()); |
1087 | ||
1088 | /// compares attributes -- '==' didn't seems to work | |
1089 | 1.92k | fn attrs_are_same(attrs1: Vec<Attribute>, attrs2: Vec<Attribute>, ignore: &[&str]) -> Result<()> { |
1090 | 1.92k | let attrs1 = attrs1.iter() |
1091 | 1.92k | .filter(|a| ! |
1092 | 1.92k | .collect::<Vec<Attribute>>(); |
1093 | 1.92k | let attrs2 = attrs2.iter() |
1094 | 1.92k | .filter(|a| ! |
1095 | 1.92k | .collect::<Vec<Attribute>>(); |
1096 | 1.92k | if attrs1.len() != attrs2.len() { |
1097 | 0 | bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2); |
1098 | 1.92k | } |
1099 | // can't guarantee attrs are in the same order | |
1100 | 1.92k | for |
1101 | 1.40k | if let Some(found_attr2) = attrs2 |
1102 | 1.40k | .iter() |
1103 | 1.88k | . |
1104 | { | |
1105 | 1.40k | if attr1.value() == found_attr2.value() { |
1106 | 1.40k | continue; |
1107 | } else { | |
1108 | 0 | bail!( |
1109 | "Attribute named {} has differing values:\n '{}'\n '{}'", | |
1110 | 0 | attr1.name().local_part(), |
1111 | 0 | attr1.value(), |
1112 | 0 | found_attr2.value() |
1113 | ); | |
1114 | } | |
1115 | } else { | |
1116 | 0 | bail!( |
1117 | "Attribute name {} not in [{}]", | |
1118 | 0 | print_attr(&attr1), |
1119 | 0 | print_attrs(&attrs2) |
1120 | ); | |
1121 | } | |
1122 | } | |
1123 | 1.92k | return Ok(()); |
1124 | ||
1125 | 0 | fn print_attr(attr: &Attribute) -> String { |
1126 | 0 | return format!("@{}='{}'", attr.name().local_part(), attr.value()); |
1127 | 0 | } |
1128 | 0 | fn print_attrs(attrs: &[Attribute]) -> String { |
1129 | 0 | return attrs.iter().map(print_attr).collect::<Vec<String>>().join(", "); |
1130 | 0 | } |
1131 | 1.92k | } |
1132 | 1.92k | } |
1133 | ||
1134 | #[cfg(test)] | |
1135 | mod tests { | |
1136 | #[allow(unused_imports)] | |
1137 | use super::super::init_logger; | |
1138 | use super::*; | |
1139 | ||
1140 | 10 | fn are_parsed_strs_equal(test: &str, target: &str) -> bool { |
1141 | 10 | let test_package = &parser::parse(test).expect("Failed to parse input"); |
1142 | 10 | let test_doc = test_package.as_document(); |
1143 | 10 | trim_doc(&test_doc); |
1144 | 10 | debug!("test:\n{}", |
1145 | ||
1146 | 10 | let target_package = &parser::parse(target).expect("Failed to parse input"); |
1147 | 10 | let target_doc = target_package.as_document(); |
1148 | 10 | trim_doc(&target_doc); |
1149 | 10 | debug!("target:\n{}", |
1150 | ||
1151 | 10 | match is_same_doc(&test_doc, &target_doc) { |
1152 | 10 | Ok(_) => return true, |
1153 | 0 | Err(e) => panic!("{}", e), |
1154 | } | |
1155 | 10 | } |
1156 | ||
1157 | #[test] | |
1158 | 1 | fn trim_same() { |
1159 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
1160 | 1 | assert!(are_parsed_strs_equal(trimmed_str, trimmed_str)); |
1161 | 1 | } |
1162 | ||
1163 | #[test] | |
1164 | 1 | fn trim_whitespace() { |
1165 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mi> a </mi></mrow></math>"; |
1166 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1167 | 1 | assert!(are_parsed_strs_equal(trimmed_str, whitespace_str)); |
1168 | 1 | } |
1169 | ||
1170 | #[test] | |
1171 | 1 | fn no_trim_whitespace_nbsp() { |
1172 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mtext>  a </mtext></mrow></math>"; |
1173 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mtext>  a </mtext></mrow ></math>"; |
1174 | 1 | assert!(are_parsed_strs_equal(trimmed_str, whitespace_str)); |
1175 | 1 | } |
1176 | ||
1177 | #[test] | |
1178 | 1 | fn trim_comment() { |
1179 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1180 | 1 | let comment_str = "<math><mrow><mo>-</mo><!--a comment --><mi> a </mi></mrow></math>"; |
1181 | 1 | assert!(are_parsed_strs_equal(comment_str, whitespace_str)); |
1182 | 1 | } |
1183 | ||
1184 | #[test] | |
1185 | 1 | fn replace_mglyph() { |
1186 | 1 | let mglyph_str = "<math> |
1187 | 1 | <mrow> |
1188 | 1 | <mi>X<mglyph fontfamily='my-braid-font' index='2' alt='23braid' /></mi> |
1189 | 1 | <mo>+</mo> |
1190 | 1 | <mi> |
1191 | 1 | <mglyph fontfamily='my-braid-font' index='5' alt='132braid' />Y |
1192 | 1 | </mi> |
1193 | 1 | <mo>=</mo> |
1194 | 1 | <mi> |
1195 | 1 | <mglyph fontfamily='my-braid-font' index='3' alt='13braid' /> |
1196 | 1 | </mi> |
1197 | 1 | </mrow> |
1198 | 1 | </math>"; |
1199 | 1 | let result_str = "<math> |
1200 | 1 | <mrow> |
1201 | 1 | <mi>X23braid</mi> |
1202 | 1 | <mo>+</mo> |
1203 | 1 | <mi>132braidY</mi> |
1204 | 1 | <mo>=</mo> |
1205 | 1 | <mi>13braid</mi> |
1206 | 1 | </mrow> |
1207 | 1 | </math>"; |
1208 | 1 | assert!(are_parsed_strs_equal(mglyph_str, result_str)); |
1209 | 1 | } |
1210 | ||
1211 | #[test] | |
1212 | 1 | fn trim_differs() { |
1213 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1214 | 1 | let different_str = "<math> <mrow ><mo>-</mo><mi> b </mi></mrow ></math>"; |
1215 | ||
1216 | // need to manually do this since failure shouldn't be a panic | |
1217 | 1 | let package1 = &parser::parse(whitespace_str).expect("Failed to parse input"); |
1218 | 1 | let doc1 = package1.as_document(); |
1219 | 1 | trim_doc(&doc1); |
1220 | 1 | debug!("doc1:\n{}", |
1221 | ||
1222 | 1 | let package2 = parser::parse(different_str).expect("Failed to parse input"); |
1223 | 1 | let doc2 = package2.as_document(); |
1224 | 1 | trim_doc(&doc2); |
1225 | 1 | debug!("doc2:\n{}", |
1226 | ||
1227 | 1 | assert!(is_same_doc(&doc1, &doc2).is_err()); |
1228 | 1 | } |
1229 | ||
1230 | #[test] | |
1231 | 1 | fn test_entities() { |
1232 | // this forces initialization | |
1233 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1234 | ||
1235 | 1 | let entity_str = set_mathml("<math><mrow><mo>−</mo><mi>𝕞</mi></mrow></math>").unwrap(); |
1236 | 1 | let converted_str = |
1237 | 1 | set_mathml("<math><mrow><mo>−</mo><mi>𝕞</mi></mrow></math>").unwrap(); |
1238 | ||
1239 | // need to remove unique ids | |
1240 | 1 | static ID_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"id='.+?' "#).unwrap()); |
1241 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1242 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1243 | 1 | assert_eq!(entity_str, converted_str, "normal entity test failed"); |
1244 | ||
1245 | 1 | let entity_str = set_mathml( |
1246 | "<math data-quot=\""value"\" data-apos=''value''><mi>XXX</mi></math>", | |
1247 | ) | |
1248 | 1 | .unwrap(); |
1249 | 1 | let converted_str = |
1250 | 1 | set_mathml("<math data-quot='\"value\"' data-apos=\"'value'\"><mi>XXX</mi></math>").unwrap(); |
1251 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1252 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1253 | 1 | assert_eq!(entity_str, converted_str, "special entities quote test failed"); |
1254 | ||
1255 | 1 | let entity_str = |
1256 | 1 | set_mathml("<math><mo><</mo><mo>></mo><mtext>&lt;</mtext></math>").unwrap(); |
1257 | 1 | let converted_str = |
1258 | 1 | set_mathml("<math><mo><</mo><mo>></mo><mtext>&lt;</mtext></math>") |
1259 | 1 | .unwrap(); |
1260 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1261 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1262 | 1 | assert_eq!(entity_str, converted_str, "special entities <,>,& test failed"); |
1263 | 1 | } |
1264 | ||
1265 | #[test] | |
1266 | 1 | fn can_recover_from_invalid_set_rules_dir() { |
1267 | use std::env; | |
1268 | // MathCAT will check the env var "MathCATRulesDir" as an override, so the following test might succeed if we don't override the env var | |
1269 | 1 | unsafe { env::set_var("MathCATRulesDir", "MathCATRulesDir"); } // safe because we are single threaded |
1270 | 1 | assert!(set_rules_dir("someInvalidRulesDir").is_err()); |
1271 | 1 | assert!( |
1272 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).is_ok(), |
1273 | "\nset_rules_dir to '{}' failed", | |
1274 | 0 | super::super::abs_rules_dir_path() |
1275 | ); | |
1276 | 1 | assert!(set_mathml("<math><mn>1</mn></math>").is_ok()); |
1277 | 1 | } |
1278 | ||
1279 | #[test] | |
1280 | 1 | fn single_html_in_mtext() { |
1281 | 1 | let test = "<math><mn>1</mn> <mtext>a<p> para 1</p>bc</mtext> <mi>y</mi></math>"; |
1282 | 1 | let target = "<math><mn>1</mn> <mtext>a para 1bc</mtext> <mi>y</mi></math>"; |
1283 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1284 | 1 | } |
1285 | ||
1286 | #[test] | |
1287 | 1 | fn multiple_html_in_mtext() { |
1288 | 1 | let test = "<math><mn>1</mn> <mtext>a<p>para 1</p> <p>para 2</p>bc </mtext> <mi>y</mi></math>"; |
1289 | 1 | let target = "<math><mn>1</mn> <mtext>apara 1 para 2bc</mtext> <mi>y</mi></math>"; |
1290 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1291 | 1 | } |
1292 | ||
1293 | #[test] | |
1294 | 1 | fn nested_html_in_mtext() { |
1295 | 1 | let test = "<math><mn>1</mn> <mtext>a <ol><li>first</li><li>second</li></ol> bc</mtext> <mi>y</mi></math>"; |
1296 | 1 | let target = "<math><mn>1</mn> <mtext>a firstsecond bc</mtext> <mi>y</mi></math>"; |
1297 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1298 | 1 | } |
1299 | ||
1300 | #[test] | |
1301 | 1 | fn empty_html_in_mtext() { |
1302 | 1 | let test = "<math><mn>1</mn> <mtext>a<br/>bc</mtext> <mi>y</mi></math>"; |
1303 | 1 | let target = "<math><mn>1</mn> <mtext>abc</mtext> <mi>y</mi></math>"; |
1304 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1305 | 1 | } |
1306 | ||
1307 | #[test] | |
1308 | 1 | fn mathml_in_mtext() { |
1309 | 1 | let test = "<math><mtext>if <math> <msup><mi>n</mi><mn>2</mn></msup></math> is real</mtext></math>"; |
1310 | 1 | let target = "<math><mrow><mtext>if </mtext><msup><mi>n</mi><mn>2</mn></msup><mtext> is real</mtext></mrow></math>"; |
1311 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1312 | 1 | } |
1313 | ||
1314 | #[test] | |
1315 | 1 | fn stack_overflow_protection() { |
1316 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1317 | 1 | let mut bad_mathml = String::from("<math>"); |
1318 | 513 | for _ in |
1319 | 513 | bad_mathml.push_str("<msqrt><mi>n</mi>"); |
1320 | 513 | } |
1321 | 513 | for _ in |
1322 | 513 | bad_mathml.push_str("</msqrt>"); |
1323 | 513 | } |
1324 | 1 | bad_mathml.push_str("</math>"); |
1325 | 1 | assert_eq!(set_mathml(bad_mathml).unwrap_err().to_string(), "MathML is too deeply nested to process"); |
1326 | 1 | } |
1327 | ||
1328 | #[test] | |
1329 | 1 | fn old_mathml_cleared_on_error() { |
1330 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1331 | 1 | let good_mathml = "<math><mn>3</mn></math>"; |
1332 | 1 | set_mathml(good_mathml).unwrap(); |
1333 | 1 | let bad_mathml = "<math><mi>&xabc;</mi></math>"; |
1334 | 1 | assert!(set_mathml(bad_mathml).is_err()); |
1335 | 1 | assert!(get_spoken_text().unwrap() == ""); |
1336 | 1 | set_mathml(good_mathml).unwrap(); |
1337 | 1 | let bad_mathml = "<math>garbage"; |
1338 | 1 | assert!(set_mathml(bad_mathml).is_err()); |
1339 | 1 | assert!(get_spoken_text().unwrap() == ""); |
1340 | 1 | } |
1341 | } |
Line | Count | Source |
1 | //! The interface module provides functionality both for calling from an API and also running the code from `main`. | |
2 | //! | |
3 | #![allow(non_snake_case)] | |
4 | #![allow(clippy::needless_return)] | |
5 | use std::cell::RefCell; | |
6 | use std::sync::LazyLock; | |
7 | ||
8 | use crate::canonicalize::{as_text, create_mathml_element}; | |
9 | use crate::errors::*; | |
10 | use phf::phf_map; | |
11 | use regex::{Captures, Regex}; | |
12 | use sxd_document::dom::{Element, Document, ChildOfRoot, ChildOfElement, Attribute}; | |
13 | use sxd_document::parser; | |
14 | use sxd_document::Package; | |
15 | ||
16 | use crate::canonicalize::{as_element, name}; | |
17 | use crate::shim_filesystem::{find_all_dirs_shim, find_files_in_dir_that_ends_with_shim}; | |
18 | use log::{debug, error}; | |
19 | ||
20 | use crate::navigate::*; | |
21 | use crate::pretty_print::mml_to_string; | |
22 | use crate::xpath_functions::{is_leaf, IsNode}; | |
23 | use std::panic::{catch_unwind, AssertUnwindSafe}; | |
24 | ||
25 | /// Maximum depth to prevent stack overflow on deeply nested MathML | |
26 | pub const MAX_DEPTH: usize = 512; | |
27 | ||
28 | #[cfg(feature = "enable-logs")] | |
29 | use std::sync::Once; | |
30 | #[cfg(feature = "enable-logs")] | |
31 | static INIT: Once = Once::new(); | |
32 | ||
33 | 45.0k | fn enable_logs() { |
34 | #[cfg(feature = "enable-logs")] | |
35 | INIT.call_once(||{ | |
36 | #[cfg(target_os = "android")] | |
37 | { | |
38 | use log::*; | |
39 | use android_logger::*; | |
40 | ||
41 | android_logger::init_once( | |
42 | Config::default() | |
43 | .with_max_level(LevelFilter::Trace) | |
44 | .with_tag("MathCat") | |
45 | ); | |
46 | trace!("Activated Android logger!"); | |
47 | } | |
48 | }); | |
49 | 45.0k | } |
50 | ||
51 | // For getting a message from a panic | |
52 | thread_local! { | |
53 | // Stores (Message, File, Line) | |
54 | static PANIC_INFO: RefCell<Option<(String, String, u32)>> = const { RefCell::new(None) }; | |
55 | } | |
56 | ||
57 | /// Initialize the panic handler to catch panics and store the message, file, and line number in `PANIC_INFO`. | |
58 | 13.5k | pub fn init_panic_handler() { |
59 | use std::panic; | |
60 | ||
61 | 13.5k | panic::set_hook(Box::new(|info| |
62 | 1 | let location = info.location() |
63 | 1 | .map(|l| format!("{}:{}", l.file(), l.line())) |
64 | 1 | .unwrap_or_else(|| |
65 | ||
66 | 1 | let payload = info.payload(); |
67 | 1 | let msg = if let Some( |
68 | 0 | s.to_string() |
69 | 1 | } else if let Some(s) = payload.downcast_ref::<String>() { |
70 | 1 | s.clone() |
71 | } else { | |
72 | 0 | "Unknown panic payload".to_string() |
73 | }; | |
74 | ||
75 | // Use try_with/try_borrow_mut to ensure the hook never panics itself | |
76 | 1 | let _ = PANIC_INFO.try_with(|cell| { |
77 | 1 | if let Ok(mut slot) = cell.try_borrow_mut() { |
78 | 1 | *slot = Some((msg, location, 0)); |
79 | 1 |
|
80 | 1 | }); |
81 | 1 | })); |
82 | 13.5k | } |
83 | ||
84 | 41.1k | pub fn report_any_panic<T>(result: Result<Result<T, Error>, Box<dyn std::any::Any + Send>>) -> Result<T, Error> { |
85 | 41.1k | match result { |
86 | 41.1k | Ok(val) => val, |
87 | Err(_) => { | |
88 | // Retrieve the smuggled info | |
89 | 1 | let details = PANIC_INFO.with(|cell| cell.borrow_mut().take()); |
90 | ||
91 | 1 | if let Some((msg, file, line)) = details { |
92 | 1 | Err(anyhow::anyhow!( |
93 | 1 | "MathCAT crash! Please report the following information: '{}' at {}:{}", |
94 | 1 | msg, file, line |
95 | 1 | )) |
96 | } else { | |
97 | 0 | Err(anyhow::anyhow!("MathCAT crash! -- please report")) |
98 | } | |
99 | } | |
100 | } | |
101 | 41.1k | } |
102 | ||
103 | // wrap up some common functionality between the call from 'main' and AT | |
104 | 4.91k | fn cleanup_mathml(mathml: Element) -> Result<Element> { |
105 | 4.91k | trim_element(mathml, false); |
106 | 4.91k | let |
107 | 4.91k | let mathml = add_ids(mathml); |
108 | 4.91k | return Ok(mathml); |
109 | 4.91k | } |
110 | ||
111 | thread_local! { | |
112 | /// The current node being navigated (also spoken and brailled) is stored in `MATHML_INSTANCE`. | |
113 | pub static MATHML_INSTANCE: RefCell<Package> = init_mathml_instance(); | |
114 | } | |
115 | ||
116 | 3.92k | fn init_mathml_instance() -> RefCell<Package> { |
117 | 3.92k | let package = parser::parse("<math></math>") |
118 | 3.92k | .expect("Internal error in 'init_mathml_instance;: didn't parse initializer string"); |
119 | 3.92k | return RefCell::new(package); |
120 | 3.92k | } |
121 | ||
122 | /// Set the Rules directory | |
123 | /// IMPORTANT: this should be the very first call to MathCAT. If 'dir' is an empty string, the environment var 'MathCATRulesDir' is tried. | |
124 | 5.08k | pub fn set_rules_dir(dir: impl AsRef<str>) -> Result<()> { |
125 | 5.08k | enable_logs(); |
126 | 5.08k | init_panic_handler(); |
127 | 5.08k | let dir = dir.as_ref().to_string(); |
128 | 5.08k | let result = catch_unwind(AssertUnwindSafe(|| { |
129 | use std::path::PathBuf; | |
130 | 5.08k | let dir_os = if dir.is_empty() { |
131 | 0 | std::env::var_os("MathCATRulesDir").unwrap_or_default() |
132 | } else { | |
133 | 5.08k | std::ffi::OsString::from(&dir) |
134 | }; | |
135 | 5.08k | let pref_manager = crate::prefs::PreferenceManager::get(); |
136 | 5.08k | pref_manager.borrow_mut().initialize(PathBuf::from(dir_os)) |
137 | 5.08k | })); |
138 | 5.08k | return report_any_panic(result); |
139 | 5.08k | } |
140 | ||
141 | /// Returns the version number (from Cargo.toml) of the build | |
142 | 0 | pub fn get_version() -> String { |
143 | 0 | enable_logs(); |
144 | const VERSION: &str = env!("CARGO_PKG_VERSION"); | |
145 | 0 | return VERSION.to_string(); |
146 | 0 | } |
147 | ||
148 | /// This will override any previous MathML that was set. | |
149 | /// This returns canonical MathML with 'id's set on any node that doesn't have an id. | |
150 | /// The ids can be used for sync highlighting if the `Bookmark` API preference is true. | |
151 | 4.88k | pub fn set_mathml(mathml_str: impl AsRef<str>) -> Result<String> { |
152 | 4.88k | enable_logs(); |
153 | // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) | |
154 | 3 | static MATHJAX_V2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap()); |
155 | 3 | static MATHJAX_V3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap()); |
156 | ||
157 | // Strip out processing instructions and comments -- these are not MathML and can cause DOS problems in the parser | |
158 | 3 | static PROCESSING_INSTRUCTION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"<\?[\s\S]{1,2048}\?>"#).unwrap()); |
159 | 3 | static XML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"(?s)"#).unwrap()); |
160 | ||
161 | // These have some length limits to avoid DOS attacks via long strings | |
162 | 3 | static NAMESPACE_DECL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"xmlns:[[:alpha:]]{1,32}"#).unwrap()); |
163 | 3 | static PREFIX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"(</?)[[:alpha:]]{1,32}:"#).unwrap()); |
164 | 3 | static HTML_ENTITIES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"&([a-zA-Z]{2,10});"#).unwrap()); |
165 | 4.88k | let result = catch_unwind(AssertUnwindSafe(|| { |
166 | 4.88k | NAVIGATION_STATE.with(|nav_stack| { |
167 | 4.88k | nav_stack.borrow_mut().reset(); |
168 | 4.88k | }); |
169 | ||
170 | // We need the main definitions files to be read in so canonicalize can work. | |
171 | // This call reads all of them for the current preferences, but that's ok since they will likely be used | |
172 | 4.88k | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files()) |
173 | ||
174 | 4.88k | let mathml_str = mathml_str.as_ref(); |
175 | // Safety guard: Reject strings > 1MB to prevent DoS/Stack issues | |
176 | 4.88k | if mathml_str.len() > 1024 * 1024 { |
177 | 0 | bail!("MathML string of size {} bytes exceeds length limit of 1MB", mathml_str.len()); |
178 | 4.88k | } |
179 | ||
180 | 4.88k | return MATHML_INSTANCE.with(|old_package| { |
181 | static HTML_ENTITIES_MAPPING: phf::Map<&str, &str> = include!("entities.in"); | |
182 | ||
183 | 4.88k | let mut error_message = "".to_string(); // can't return a result inside the replace_all, so we do this hack of setting the message and then returning the error |
184 | ||
185 | 4.88k | let mathml_str = XML_COMMENT.replace_all(mathml_str, ""); |
186 | 4.88k | let mathml_str = PROCESSING_INSTRUCTION.replace_all(&mathml_str, ""); |
187 | // FIX: need to deal with character data and convert to something the parser knows | |
188 | 4.88k | let mathml_str = HTML_ENTITIES.replace_all(&mathml_str, |cap: &Captures| match |
189 | None => { | |
190 | 1 | error_message = format!("No entity named '{}'", &cap[0]); |
191 | 1 | cap[0].to_string() |
192 | } | |
193 | 95 | Some(&ch) => ch.to_string(), |
194 | 96 | }); |
195 | ||
196 | 4.88k | if !error_message.is_empty() { |
197 | // Clear stale state so subsequent API calls do not return previous user's data (security issue) | |
198 | 1 | old_package.replace(parser::parse("<math></math>").unwrap()); |
199 | 1 | bail!(error_message); |
200 | 4.88k | } |
201 | 4.88k | let mathml_str = MATHJAX_V2.replace_all(&mathml_str, ""); |
202 | 4.88k | let mathml_str = MATHJAX_V3.replace_all(&mathml_str, ""); |
203 | ||
204 | // the speech rules use the xpath "name" function and that includes the prefix | |
205 | // getting rid of the prefix properly probably involves a recursive replacement in the tree | |
206 | // if the prefix is used, it is almost certainly something like "m" or "mml", so this cheat will work. | |
207 | 4.88k | let mathml_str = NAMESPACE_DECL.replace(&mathml_str, "xmlns"); // do this before the PREFIX replace! |
208 | 4.88k | let mathml_str = PREFIX.replace_all(&mathml_str, "$1"); |
209 | ||
210 | 4.88k | let new_package = parser::parse(&mathml_str); |
211 | 4.88k | if let Err( |
212 | // Clear stale state so subsequent API calls do not return previous user's data (security issue) | |
213 | 1 | old_package.replace(parser::parse("<math></math>").unwrap()); |
214 | 1 | bail!("Invalid MathML input:\n{}\nError is: {}", &mathml_str, &e.to_string()); |
215 | 4.88k | } |
216 | ||
217 | 4.88k | let new_package = new_package.unwrap(); |
218 | 4.88k | let mathml = get_element(&new_package); |
219 | 4.88k | let |
220 | 4.88k | let mathml_string = mml_to_string(mathml); |
221 | 4.88k | old_package.replace(new_package); |
222 | ||
223 | 4.88k | return Ok(mathml_string); |
224 | 4.88k | }); |
225 | 4.88k | })); |
226 | ||
227 | 4.88k | return report_any_panic(result); |
228 | 4.88k | } |
229 | ||
230 | /// Get the spoken text of the MathML that was set. | |
231 | /// The speech takes into account any AT or user preferences. | |
232 | 3.46k | pub fn get_spoken_text() -> Result<String> { |
233 | 3.46k | enable_logs(); |
234 | 3.46k | let result = catch_unwind(AssertUnwindSafe(|| { |
235 | 3.46k | MATHML_INSTANCE.with(|package_instance| { |
236 | 3.46k | let package_instance = package_instance.borrow(); |
237 | 3.46k | let mathml = get_element(&package_instance); |
238 | 3.46k | let new_package = Package::new(); |
239 | 3.46k | let intent = crate::speech::intent_from_mathml(mathml, new_package.as_document()) |
240 | 3.46k | debug!("Intent tree:\n{}", |
241 | 3.46k | let speech = crate::speech::speak_mathml(intent, "", 0) |
242 | 3.46k | return Ok(speech); |
243 | 3.46k | }) |
244 | 3.46k | })); |
245 | 3.46k | return report_any_panic(result); |
246 | 3.46k | } |
247 | ||
248 | /// Get the spoken text for an overview of the MathML that was set. | |
249 | /// The speech takes into account any AT or user preferences. | |
250 | /// Note: this implementation for is currently minimal and should not be used. | |
251 | 0 | pub fn get_overview_text() -> Result<String> { |
252 | 0 | enable_logs(); |
253 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
254 | 0 | MATHML_INSTANCE.with(|package_instance| { |
255 | 0 | let package_instance = package_instance.borrow(); |
256 | 0 | let mathml = get_element(&package_instance); |
257 | 0 | let speech = crate::speech::overview_mathml(mathml, "", 0)?; |
258 | 0 | return Ok(speech); |
259 | 0 | }) |
260 | 0 | })); |
261 | 0 | return report_any_panic(result); |
262 | 0 | } |
263 | ||
264 | /// Get the value of the named preference. | |
265 | /// None is returned if `name` is not a known preference. | |
266 | 100 | pub fn get_preference(name: impl AsRef<str>) -> Result<String> { |
267 | 100 | enable_logs(); |
268 | 100 | let name = name.as_ref().to_string(); |
269 | 100 | let result = catch_unwind(AssertUnwindSafe(|| { |
270 | use crate::prefs::NO_PREFERENCE; | |
271 | 100 | crate::speech::SPEECH_RULES.with(|rules| { |
272 | 100 | let rules = rules.borrow(); |
273 | 100 | let pref_manager = rules.pref_manager.borrow(); |
274 | 100 | let mut value = pref_manager.pref_to_string(&name); |
275 | 100 | if value == NO_PREFERENCE { |
276 | 1 | value = pref_manager.pref_to_string(&name); |
277 | 99 | } |
278 | 100 | if value == NO_PREFERENCE { |
279 | 1 | bail!("No preference named '{}'", name); |
280 | } else { | |
281 | 99 | return Ok(value); |
282 | } | |
283 | 100 | }) |
284 | 100 | })); |
285 | 100 | return report_any_panic(result); |
286 | 100 | } |
287 | ||
288 | /// Set a MathCAT preference. The preference name should be a known preference name. | |
289 | /// The value should either be a string or a number (depending upon the preference being set) | |
290 | /// The list of known user preferences is in the MathCAT user documentation. | |
291 | /// Here are common preferences set by programs (not settable by the user): | |
292 | /// * TTS -- SSML, SAPI5, None | |
293 | /// * Pitch -- normalized at '1.0' | |
294 | /// * Rate -- words per minute (should match current speech rate). | |
295 | /// There is a separate "MathRate" that is user settable that causes a relative percentage change from this rate. | |
296 | /// * Volume -- default 100 | |
297 | /// * Voice -- set a voice to use (not implemented) | |
298 | /// * Gender -- set pick any voice of the given gender (not implemented) | |
299 | /// * Bookmark -- set to `true` if a `mark`/`bookmark` should be part of the returned speech (used for sync highlighting) | |
300 | /// | |
301 | /// Important: both the preference name and value are case-sensitive | |
302 | /// | |
303 | /// This function can be called multiple times to set different values. | |
304 | /// The values are persistent and extend beyond calls to [`set_mathml`]. | |
305 | /// A value can be overwritten by calling this function again with a different value. | |
306 | /// | |
307 | /// Be careful setting preferences -- these potentially override user settings, so only preferences that really need setting should be set. | |
308 | 17.7k | pub fn set_preference(name: impl AsRef<str>, value: impl AsRef<str>) -> Result<()> { |
309 | 17.7k | enable_logs(); |
310 | 17.7k | let name = name.as_ref().to_string(); |
311 | 17.7k | let value = value.as_ref().to_string(); |
312 | 17.7k | let result = catch_unwind(AssertUnwindSafe(|| { |
313 | 17.7k | set_preference_impl(&name, &value) |
314 | 17.7k | })); |
315 | 17.7k | return report_any_panic(result); |
316 | 17.7k | } |
317 | ||
318 | 17.7k | fn set_preference_impl(name: &str, value: &str) -> Result<()> { |
319 | 17.7k | let mut value = value.to_string(); |
320 | 17.7k | if name == "Language" || |
321 | // check the format | |
322 | 5.02k | if value != "Auto" { |
323 | // could get es, es-419, or en-us-nyc ... we only care about the first two parts so we clean it up a little | |
324 | 5.02k | let mut lang_country_split = value.split('-'); |
325 | 5.02k | let language = lang_country_split.next().unwrap_or(""); |
326 | 5.02k | let country = lang_country_split.next().unwrap_or(""); |
327 | 5.02k | if language.len() != 2 { |
328 | 0 | bail!( |
329 | "Improper format for 'Language' preference '{}'. Should be of form 'en' or 'en-gb'", | |
330 | value | |
331 | ); | |
332 | 5.02k | } |
333 | 5.02k | let mut new_lang_country = language.to_string(); // need a temp value because 'country' is borrowed from 'value' above |
334 | 5.02k | if !country.is_empty() { |
335 | 321 | new_lang_country.push('-'); |
336 | 321 | new_lang_country.push_str(country); |
337 | 4.70k | } |
338 | 5.02k | value = new_lang_country; |
339 | 0 | } |
340 | 5.02k | if name == "LanguageAuto" && |
341 | 0 | bail!("'LanguageAuto' can not have the value 'Auto'"); |
342 | 5.02k | } |
343 | 12.7k | } |
344 | ||
345 | 17.7k | crate::speech::SPEECH_RULES.with(|rules| { |
346 | 17.7k | let rules = rules.borrow_mut(); |
347 | 17.7k | if let Some( |
348 | 0 | bail!("{}", error_string); |
349 | 17.7k | } |
350 | ||
351 | // we set the value even if it was the same as the old value because this might override a potentially changed future user value | |
352 | 17.7k | let mut pref_manager = rules.pref_manager.borrow_mut(); |
353 | 17.7k | if name == "LanguageAuto" { |
354 | 0 | let language_pref = pref_manager.pref_to_string("Language"); |
355 | 0 | if language_pref != "Auto" { |
356 | 0 | bail!( |
357 | "'LanguageAuto' can only be used when 'Language' has the value 'Auto'; Language={}", | |
358 | language_pref | |
359 | ); | |
360 | 0 | } |
361 | 17.7k | } |
362 | 17.7k | let lower_case_value = value.to_lowercase(); |
363 | 17.7k | if lower_case_value == "true" || |
364 | 1.50k | pref_manager.set_api_boolean_pref(name, value.to_lowercase() == "true"); |
365 | 1.50k | } else { |
366 | 16.2k | match name { |
367 | 16.2k | "Pitch" | "Rate" | "Volume" | "CapitalLetters_Pitch" | "MathRate" | "PauseFactor" => { |
368 | 0 | pref_manager.set_api_float_pref(name, to_float(name, &value)?) |
369 | } | |
370 | _ => { | |
371 | 16.2k | pref_manager.set_string_pref(name, &value) |
372 | } | |
373 | } | |
374 | }; | |
375 | 17.7k | return Ok::<(), Error>(()); |
376 | 17.7k | }) |
377 | ||
378 | 17.7k | return Ok(()); |
379 | 17.7k | } |
380 | ||
381 | 0 | fn to_float(name: &str, value: &str) -> Result<f64> { |
382 | 0 | return match value.parse::<f64>() { |
383 | 0 | Ok(val) => Ok(val), |
384 | 0 | Err(_) => bail!("SetPreference: preference'{}'s value '{}' must be a float", name, value), |
385 | }; | |
386 | 0 | } |
387 | ||
388 | /// Get the braille associated with the MathML that was set by [`set_mathml`]. | |
389 | /// The braille returned depends upon the preference for the `code` preference (default `Nemeth`). | |
390 | /// If 'nav_node_id' is given, it is highlighted based on the value of `BrailleNavHighlight` (default: `EndPoints`) | |
391 | 1.36k | pub fn get_braille(nav_node_id: impl AsRef<str>) -> Result<String> { |
392 | 1.36k | enable_logs(); |
393 | 1.36k | let nav_node_id = nav_node_id.as_ref().to_string(); |
394 | 1.36k | let result = catch_unwind(AssertUnwindSafe(|| { |
395 | 1.36k | MATHML_INSTANCE.with(|package_instance| { |
396 | 1.36k | let package_instance = package_instance.borrow(); |
397 | 1.36k | let mathml = get_element(&package_instance); |
398 | 1.36k | let braille = crate::braille::braille_mathml(mathml, &nav_node_id) |
399 | 1.36k | return Ok(braille); |
400 | 1.36k | }) |
401 | 1.36k | })); |
402 | 1.36k | return report_any_panic(result); |
403 | 1.36k | } |
404 | ||
405 | /// Get the braille associated with the current navigation focus of the MathML that was set by [`set_mathml`]. | |
406 | /// The braille returned depends upon the preference for the `code` preference (default `Nemeth`). | |
407 | /// The returned braille is brailled as if the current navigation focus is the entire expression to be brailled. | |
408 | 0 | pub fn get_navigation_braille() -> Result<String> { |
409 | 0 | enable_logs(); |
410 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
411 | 0 | MATHML_INSTANCE.with(|package_instance| { |
412 | 0 | let package_instance = package_instance.borrow(); |
413 | 0 | let mathml = get_element(&package_instance); |
414 | 0 | let new_package = Package::new(); // used if we need to create a new tree |
415 | 0 | let new_doc = new_package.as_document(); |
416 | 0 | let nav_mathml = NAVIGATION_STATE.with(|nav_stack| { |
417 | 0 | return match nav_stack.borrow_mut().get_navigation_mathml(mathml) { |
418 | 0 | Err(e) => Err(e), |
419 | 0 | Ok((found, offset)) => { |
420 | // get the MathML node and wrap it inside of a <math> element | |
421 | // if the offset is given, we need to get the character it references | |
422 | 0 | if offset == 0 { |
423 | 0 | if name(found) == "math" { |
424 | 0 | Ok(found) |
425 | } else { | |
426 | 0 | let new_mathml = create_mathml_element(&new_doc, "math"); |
427 | 0 | new_mathml.append_child(copy_mathml(found)); |
428 | 0 | new_doc.root().append_child(new_mathml); |
429 | 0 | Ok(new_mathml) |
430 | } | |
431 | 0 | } else if !is_leaf(found) { |
432 | 0 | bail!( |
433 | "Internal error: non-zero offset '{}' on a non-leaf element '{}'", | |
434 | offset, | |
435 | 0 | name(found) |
436 | ); | |
437 | 0 | } else if let Some(ch) = as_text(found).chars().nth(offset) { |
438 | 0 | let internal_mathml = create_mathml_element(&new_doc, name(found)); |
439 | 0 | internal_mathml.set_text(&ch.to_string()); |
440 | 0 | let new_mathml = create_mathml_element(&new_doc, "math"); |
441 | 0 | new_mathml.append_child(internal_mathml); |
442 | 0 | new_doc.root().append_child(new_mathml); |
443 | 0 | Ok(new_mathml) |
444 | } else { | |
445 | 0 | bail!( |
446 | "Internal error: offset '{}' on leaf element '{}' doesn't exist", | |
447 | offset, | |
448 | 0 | mml_to_string(found) |
449 | ); | |
450 | } | |
451 | } | |
452 | }; | |
453 | 0 | })?; |
454 | ||
455 | 0 | let braille = crate::braille::braille_mathml(nav_mathml, "")?.0; |
456 | 0 | return Ok(braille); |
457 | 0 | }) |
458 | 0 | })); |
459 | 0 | return report_any_panic(result); |
460 | 0 | } |
461 | ||
462 | /// Given a key code along with the modifier keys, the current node is moved accordingly (or value reported in some cases). | |
463 | /// `key` is the [keycode](https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent/keyCode#constants_for_keycode_value) for the key (in JavaScript, `ev.key_code`) | |
464 | /// The spoken text for the new current node is returned. | |
465 | 0 | pub fn do_navigate_keypress( |
466 | 0 | key: usize, |
467 | 0 | shift_key: bool, |
468 | 0 | control_key: bool, |
469 | 0 | alt_key: bool, |
470 | 0 | meta_key: bool, |
471 | 0 | ) -> Result<String> { |
472 | 0 | enable_logs(); |
473 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
474 | 0 | MATHML_INSTANCE.with(|package_instance| { |
475 | 0 | let package_instance = package_instance.borrow(); |
476 | 0 | let mathml = get_element(&package_instance); |
477 | 0 | return do_mathml_navigate_key_press(mathml, key, shift_key, control_key, alt_key, meta_key); |
478 | 0 | }) |
479 | 0 | })); |
480 | 0 | return report_any_panic(result); |
481 | 0 | } |
482 | ||
483 | /// Given a navigation command, the current node is moved accordingly. | |
484 | /// This is a higher level interface than `do_navigate_keypress` for applications that want to interpret the keys themselves. | |
485 | /// The valid commands are: | |
486 | /// * Standard move commands: | |
487 | /// `MovePrevious`, `MoveNext`, `MoveStart`, `MoveEnd`, `MoveLineStart`, `MoveLineEnd` | |
488 | /// * Movement in a table or elementary math: | |
489 | /// `MoveCellPrevious`, `MoveCellNext`, `MoveCellUp`, `MoveCellDown`, `MoveColumnStart`, `MoveColumnEnd` | |
490 | /// * Moving into children or out to parents: | |
491 | /// `ZoomIn`, `ZoomOut`, `ZoomOutAll`, `ZoomInAll` | |
492 | /// * Undo the last movement command: | |
493 | /// `MoveLastLocation` | |
494 | /// * Read commands (standard speech): | |
495 | /// `ReadPrevious`, `ReadNext`, `ReadCurrent`, `ReadCellCurrent`, `ReadStart`, `ReadEnd`, `ReadLineStart`, `ReadLineEnd` | |
496 | /// * Describe commands (overview): | |
497 | /// `DescribePrevious`, `DescribeNext`, `DescribeCurrent` | |
498 | /// * Location information: | |
499 | /// `WhereAmI`, `WhereAmIAll` | |
500 | /// * Change navigation modes (circle up/down): | |
501 | /// `ToggleZoomLockUp`, `ToggleZoomLockDown` | |
502 | /// * Speak the current navigation mode | |
503 | /// `ToggleSpeakMode` | |
504 | /// | |
505 | /// There are 10 place markers that can be set/read/described or moved to. | |
506 | /// * Setting: | |
507 | /// `SetPlacemarker0`, `SetPlacemarker1`, `SetPlacemarker2`, `SetPlacemarker3`, `SetPlacemarker4`, `SetPlacemarker5`, `SetPlacemarker6`, `SetPlacemarker7`, `SetPlacemarker8`, `SetPlacemarker9` | |
508 | /// * Reading: | |
509 | /// `Read0`, `Read1`, `Read2`, `Read3`, `Read4`, `Read5`, `Read6`, `Read7`, `Read8`, `Read9` | |
510 | /// * Describing: | |
511 | /// `Describe0`, `Describe1`, `Describe2`, `Describe3`, `Describe4`, `Describe5`, `Describe6`, `Describe7`, `Describe8`, `Describe9` | |
512 | /// * Moving: | |
513 | /// `MoveTo0`, `MoveTo1`, `MoveTo2`, `MoveTo3`, `MoveTo4`, `MoveTo5`, `MoveTo6`, `MoveTo7`, `MoveTo8`, `MoveTo9` | |
514 | /// | |
515 | /// When done with Navigation, call with `Exit` | |
516 | 0 | pub fn do_navigate_command(command: impl AsRef<str>) -> Result<String> { |
517 | 0 | enable_logs(); |
518 | 0 | let command = command.as_ref().to_string(); |
519 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
520 | 0 | let cmd = NAV_COMMANDS.get_key(&command); // gets a &'static version of the command |
521 | 0 | if cmd.is_none() { |
522 | 0 | bail!("Unknown command in call to DoNavigateCommand()"); |
523 | 0 | }; |
524 | 0 | let cmd = *cmd.unwrap(); |
525 | 0 | MATHML_INSTANCE.with(|package_instance| { |
526 | 0 | let package_instance = package_instance.borrow(); |
527 | 0 | let mathml = get_element(&package_instance); |
528 | 0 | return do_navigate_command_string(mathml, cmd); |
529 | 0 | }) |
530 | 0 | })); |
531 | 0 | return report_any_panic(result); |
532 | 0 | } |
533 | ||
534 | /// Given an 'id' and an offset (for tokens), set the navigation node to that id. | |
535 | /// An error is returned if the 'id' doesn't exist | |
536 | 2 | pub fn set_navigation_node(id: impl AsRef<str>, offset: usize) -> Result<()> { |
537 | 2 | enable_logs(); |
538 | 2 | let id = id.as_ref().to_string(); |
539 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
540 | 2 | MATHML_INSTANCE.with(|package_instance| { |
541 | 2 | let package_instance = package_instance.borrow(); |
542 | 2 | let mathml = get_element(&package_instance); |
543 | 2 | return set_navigation_node_from_id(mathml, &id, offset); |
544 | 2 | }) |
545 | 2 | })); |
546 | 2 | return report_any_panic(result); |
547 | 2 | } |
548 | ||
549 | /// Return the MathML associated with the current (navigation) node and the offset (0-based) from that mathml (not yet implemented) | |
550 | /// The offset is needed for token elements that have multiple characters. | |
551 | 0 | pub fn get_navigation_mathml() -> Result<(String, usize)> { |
552 | 0 | enable_logs(); |
553 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
554 | 0 | MATHML_INSTANCE.with(|package_instance| { |
555 | 0 | let package_instance = package_instance.borrow(); |
556 | 0 | let mathml = get_element(&package_instance); |
557 | 0 | return NAVIGATION_STATE.with(|nav_stack| { |
558 | 0 | return match nav_stack.borrow_mut().get_navigation_mathml(mathml) { |
559 | 0 | Err(e) => Err(e), |
560 | 0 | Ok((found, offset)) => Ok((mml_to_string(found), offset)), |
561 | }; | |
562 | 0 | }); |
563 | 0 | }) |
564 | 0 | })); |
565 | 0 | return report_any_panic(result); |
566 | 0 | } |
567 | ||
568 | /// Return the `id` and `offset` (0-based) associated with the current (navigation) node. | |
569 | /// `offset` (not yet implemented) | |
570 | /// The offset is needed for token elements that have multiple characters. | |
571 | 2 | pub fn get_navigation_mathml_id() -> Result<(String, usize)> { |
572 | 2 | enable_logs(); |
573 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
574 | 2 | MATHML_INSTANCE.with(|package_instance| { |
575 | 2 | let package_instance = package_instance.borrow(); |
576 | 2 | let mathml = get_element(&package_instance); |
577 | 2 | return Ok(NAVIGATION_STATE.with(|nav_stack| { |
578 | 2 | return nav_stack.borrow().get_navigation_mathml_id(mathml); |
579 | 2 | })); |
580 | 2 | }) |
581 | 2 | })); |
582 | 2 | return report_any_panic(result); |
583 | 2 | } |
584 | ||
585 | /// Return the start and end braille character positions associated with the current (navigation) node. | |
586 | 2 | pub fn get_braille_position() -> Result<(usize, usize)> { |
587 | 2 | enable_logs(); |
588 | 2 | let result = catch_unwind(AssertUnwindSafe(|| { |
589 | 2 | MATHML_INSTANCE.with(|package_instance| { |
590 | 2 | let package_instance = package_instance.borrow(); |
591 | 2 | let mathml = get_element(&package_instance); |
592 | 2 | let nav_node = get_navigation_mathml_id() |
593 | 2 | let (_, start, end) = crate::braille::braille_mathml(mathml, &nav_node.0) |
594 | 2 | return Ok((start, end)); |
595 | 2 | }) |
596 | 2 | })); |
597 | 2 | return report_any_panic(result); |
598 | 2 | } |
599 | ||
600 | /// Given a 0-based braille position, return the smallest MathML node enclosing it. | |
601 | /// This node might be a leaf with an offset. | |
602 | 91 | pub fn get_navigation_node_from_braille_position(position: usize) -> Result<(String, usize)> { |
603 | 91 | enable_logs(); |
604 | 91 | let result = catch_unwind(AssertUnwindSafe(|| { |
605 | 91 | MATHML_INSTANCE.with(|package_instance| { |
606 | 91 | let package_instance = package_instance.borrow(); |
607 | 91 | let mathml = get_element(&package_instance); |
608 | 91 | return crate::braille::get_navigation_node_from_braille_position(mathml, position); |
609 | 91 | }) |
610 | 91 | })); |
611 | 91 | return report_any_panic(result); |
612 | 91 | } |
613 | ||
614 | 0 | pub fn get_supported_braille_codes() -> Result<Vec<String>> { |
615 | 0 | enable_logs(); |
616 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
617 | 0 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
618 | 0 | let braille_dir = rules_dir.join("Braille"); |
619 | 0 | let mut braille_code_paths = Vec::new(); |
620 | ||
621 | 0 | find_all_dirs_shim(&braille_dir, &mut braille_code_paths); |
622 | 0 | let mut braille_code_paths = braille_code_paths.iter() |
623 | 0 | .map(|path| path.strip_prefix(&braille_dir).unwrap().to_string_lossy().to_string()) |
624 | 0 | .filter(|string_path| !string_path.is_empty() ) |
625 | 0 | .collect::<Vec<String>>(); |
626 | 0 | braille_code_paths.sort(); |
627 | ||
628 | 0 | Ok(braille_code_paths) |
629 | 0 | })); |
630 | 0 | return report_any_panic(result); |
631 | 0 | } |
632 | ||
633 | /// Returns a Vec of all supported languages ("en", "es", ...) | |
634 | 1 | pub fn get_supported_languages() -> Result<Vec<String>> { |
635 | 1 | enable_logs(); |
636 | 1 | let result = catch_unwind(AssertUnwindSafe(|| { |
637 | 1 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
638 | 1 | let lang_dir = rules_dir.join("Languages"); |
639 | 1 | let mut lang_paths = Vec::new(); |
640 | ||
641 | 1 | find_all_dirs_shim(&lang_dir, &mut lang_paths); |
642 | 1 | let mut language_paths = lang_paths.iter() |
643 | 13 | . |
644 | 13 | .to_string_lossy() |
645 | 13 | .replace(std::path::MAIN_SEPARATOR, "-") |
646 | 13 | .to_string()) |
647 | 13 | . |
648 | 1 | .collect::<Vec<String>>(); |
649 | ||
650 | // make sure the 'zz' test dir isn't included (build.rs removes it, but for debugging is there) | |
651 | 13 |
|
652 | 1 | language_paths.sort(); |
653 | 1 | Ok(language_paths) |
654 | 1 | })); |
655 | 1 | return report_any_panic(result); |
656 | 1 | } |
657 | ||
658 | 0 | pub fn get_supported_speech_styles(lang: impl AsRef<str>) -> Result<Vec<String>> { |
659 | 0 | enable_logs(); |
660 | 0 | let lang = lang.as_ref().to_string(); |
661 | 0 | let result = catch_unwind(AssertUnwindSafe(|| { |
662 | 0 | let rules_dir = crate::prefs::PreferenceManager::get().borrow().get_rules_dir(); |
663 | 0 | let lang_dir = rules_dir.join("Languages").join(&lang); |
664 | 0 | let mut speech_styles = find_files_in_dir_that_ends_with_shim(&lang_dir, "_Rules.yaml"); |
665 | 0 | for file_name in &mut speech_styles { |
666 | 0 | file_name.truncate(file_name.len() - "_Rules.yaml".len()) |
667 | } | |
668 | 0 | speech_styles.sort(); |
669 | 0 | speech_styles.dedup(); // remove duplicates -- shouldn't be any, but just in case |
670 | 0 | Ok(speech_styles) |
671 | 0 | })); |
672 | 0 | return report_any_panic(result); |
673 | 0 | } |
674 | ||
675 | // utility functions | |
676 | ||
677 | /// Copy (recursively) the (MathML) element and return the new one. | |
678 | /// The Element type does not copy and modifying the structure of an element's child will modify the element, so we need a copy | |
679 | /// Convert the returned error from set_mathml, etc., to a useful string for display | |
680 | 363 | pub fn copy_mathml(mathml: Element) -> Element { |
681 | 363 | return copy_mathml_recursive(mathml, 0); |
682 | 363 | } |
683 | ||
684 | 4.53k | fn copy_mathml_recursive(mathml: Element, depth: usize) -> Element { |
685 | // Safety: Prevent stack overflow on deeply nested MathML | |
686 | 4.53k | if depth > MAX_DEPTH { |
687 | // Return the element as a leaf if it's too deep to prevent crash | |
688 | 0 | return create_mathml_element(&mathml.document(), name(mathml)); |
689 | 4.53k | } |
690 | ||
691 | // If it represents MathML, the 'Element' can only have Text and Element children along with attributes | |
692 | 4.53k | let children = mathml.children(); |
693 | 4.53k | let new_mathml = create_mathml_element(&mathml.document(), name(mathml)); |
694 | 9.52k |
|
695 | 9.52k | new_mathml.set_attribute_value(attr.name(), attr.value()); |
696 | 9.52k | }); |
697 | ||
698 | // can't use is_leaf/as_text because this is also used with the intent tree | |
699 | 4.53k | if children.len() == 1 && |
700 | 3.26k | let Some( |
701 | 2.59k | new_mathml.set_text(text.text()); |
702 | 2.59k | return new_mathml; |
703 | 1.93k | } |
704 | ||
705 | 1.93k | let mut new_children = Vec::with_capacity(children.len()); |
706 | 4.17k | for child in |
707 | 4.17k | let child = as_element(child); |
708 | 4.17k | let new_child = copy_mathml_recursive(child, depth + 1); |
709 | 4.17k | new_children.push(new_child); |
710 | 4.17k | } |
711 | 1.93k | new_mathml.append_children(new_children); |
712 | 1.93k | return new_mathml; |
713 | 4.53k | } |
714 | ||
715 | 0 | pub fn errors_to_string(e: &Error) -> String { |
716 | 0 | enable_logs(); |
717 | 0 | let mut result = format!("{e}\n"); |
718 | 0 | for cause in e.chain().skip(1) { // skips original error |
719 | 0 | result += &format!("caused by: {cause}\n"); |
720 | 0 | } |
721 | 0 | result |
722 | 0 | } |
723 | ||
724 | 4.91k | fn add_ids(mathml: Element) -> Element { |
725 | use std::time::SystemTime; | |
726 | 4.91k | let time = if cfg!(target_family = "wasm") { |
727 | 0 | fastrand::usize(..) |
728 | } else { | |
729 | 4.91k | SystemTime::now() |
730 | 4.91k | .duration_since(SystemTime::UNIX_EPOCH) |
731 | 4.91k | .unwrap() |
732 | 4.91k | .as_millis() as usize |
733 | }; | |
734 | 4.91k | let mut time_part = radix_fmt::radix(time, 36).to_string(); |
735 | 4.91k | if time_part.len() < 3 { |
736 | 0 | time_part.push_str("a2c"); // needs to be at least three chars |
737 | 4.91k | } |
738 | 4.91k | let mut random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string(); |
739 | 4.91k | if random_part.len() < 4 { |
740 | 0 | random_part.push_str("a1b2"); // needs to be at least four chars |
741 | 4.91k | } |
742 | 4.91k | let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] + "-"; // begin with letter |
743 | 4.91k | add_ids_to_all(mathml, &prefix, 0); |
744 | 4.91k | return mathml; |
745 | ||
746 | 57.8k | fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize { |
747 | 57.8k | let mut count = count; |
748 | 57.8k | if mathml.attribute("id").is_none() { |
749 | 57.1k | mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str()); |
750 | 57.1k | mathml.set_attribute_value("data-id-added", "true"); |
751 | 57.1k | count += 1; |
752 | 57.1k |
|
753 | ||
754 | 57.8k | if crate::xpath_functions::is_leaf(mathml) { |
755 | 35.8k | return count; |
756 | 22.0k | } |
757 | ||
758 | 52.9k | for child in |
759 | 52.9k | let child = as_element(child); |
760 | 52.9k | count = add_ids_to_all(child, id_prefix, count); |
761 | 52.9k | } |
762 | 22.0k | return count; |
763 | 57.8k | } |
764 | 4.91k | } |
765 | ||
766 | 10.3k | pub fn get_element(package: &Package) -> Element<'_> { |
767 | 10.3k | enable_logs(); |
768 | 10.3k | let doc = package.as_document(); |
769 | 10.3k | let mut result = None; |
770 | 10.3k | for root_child in doc.root().children() { |
771 | 10.3k | if let ChildOfRoot::Element(e) = root_child { |
772 | 10.3k | assert!(result.is_none()); |
773 | 10.3k | result = Some(e); |
774 | 0 | } |
775 | } | |
776 | 10.3k | return result.unwrap(); |
777 | 10.3k | } |
778 | ||
779 | /// Get the intent after setting the MathML | |
780 | /// Used in testing | |
781 | #[allow(dead_code)] | |
782 | 32 | pub fn get_intent<'a>(mathml: Element<'a>, doc: Document<'a>) -> Result<Element<'a>> { |
783 | 32 | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); |
784 | 32 | let mathml = cleanup_mathml(mathml) |
785 | 32 | return crate::speech::intent_from_mathml(mathml, doc); |
786 | 32 | } |
787 | ||
788 | #[allow(dead_code)] | |
789 | 22 | fn trim_doc(doc: &Document) { |
790 | 22 | for root_child in doc.root().children() { |
791 | 22 | if let ChildOfRoot::Element(e) = root_child { |
792 | 22 | trim_element(e, false); |
793 | 22 | } else { |
794 | 0 | doc.root().remove_child(root_child); // comment or processing instruction |
795 | 0 | } |
796 | } | |
797 | 22 | } |
798 | ||
799 | /// Not really meant to be public -- used by tests in some packages | |
800 | 55.5k | pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { |
801 | // "<mtext>this is text</mtext" results in 3 text children | |
802 | // these are combined into one child as it makes code downstream simpler | |
803 | ||
804 | // space, tab, newline, carriage return all get collapsed to a single space | |
805 | const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}','\u{000C}', '\u{000D}']; | |
806 | 3 | static WHITESPACE_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap()); |
807 | ||
808 | 55.5k | if is_leaf(e) && ( |
809 | // Assume it is HTML inside of the leaf -- turn the HTML into a string | |
810 | 34.7k | make_leaf_element(e); |
811 | 34.7k | return; |
812 | 20.7k | } |
813 | ||
814 | 20.7k | let mut single_text = "".to_string(); |
815 | 87.6k | for child in |
816 | 87.6k | match child { |
817 | 50.1k | ChildOfElement::Element(c) => { |
818 | 50.1k | trim_element(c, allow_structure_in_leaves); |
819 | 50.1k | } |
820 | 37.4k | ChildOfElement::Text(t) => { |
821 | 37.4k | single_text += t.text(); |
822 | 37.4k | e.remove_child(child); |
823 | 37.4k | } |
824 | 21 | _ => { |
825 | 21 | e.remove_child(child); |
826 | 21 | } |
827 | } | |
828 | } | |
829 | ||
830 | // CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace | |
831 | 20.7k | if !(is_leaf(e) || name(e) == "intent-literal" || single_text.is_empty()) { |
832 | // intent-literal comes from testing intent | |
833 | // FIX: we have a problem -- what should happen??? | |
834 | // FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash | |
835 | 14.3k | if !single_text.trim_matches(WHITESPACE).is_empty() { |
836 | 20 | error!( |
837 | "trim_element: both element and textual children which shouldn't happen -- ignoring text '{single_text}'" | |
838 | ); | |
839 | 14.2k | } |
840 | 14.3k | return; |
841 | 6.44k | } |
842 | 6.44k | if e.children().is_empty() && |
843 | 0 | // debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text); |
844 | 0 | e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " ")); |
845 | 6.44k | } |
846 | ||
847 | 34.7k | fn make_leaf_element(mathml_leaf: Element) { |
848 | // MathML leaves like <mn> really shouldn't have non-textual content, but you could have embedded HTML | |
849 | // Here, we convert them to leaves by grabbing up all the text and making that the content | |
850 | // Potentially, we leave them and let (default) rules do something, but it makes other parts of the code | |
851 | // messier because checking the text of a leaf becomes Option<&str> rather than just &str | |
852 | 34.7k | let children = mathml_leaf.children(); |
853 | 34.7k | if children.is_empty() { |
854 | 503 | return; |
855 | 34.2k | } |
856 | ||
857 | 34.2k | if rewrite_and_flatten_embedded_mathml(mathml_leaf) { |
858 | 1 | return; |
859 | 34.2k | } |
860 | ||
861 | // gather up the text | |
862 | 34.2k | let mut text = "".to_string(); |
863 | 34.6k | for child in |
864 | 34.6k | let child_text = match child { |
865 | 8 | ChildOfElement::Element(child) => { |
866 | 8 | if name(child) == "mglyph" { |
867 | 3 | child.attribute_value("alt").unwrap_or("").to_string() |
868 | } else { | |
869 | 5 | gather_text(child) |
870 | } | |
871 | } | |
872 | 34.4k | ChildOfElement::Text(t) => { |
873 | // debug!("ChildOfElement::Text: '{}'", t.text()); | |
874 | 34.4k | t.text().to_string() |
875 | } | |
876 | 222 | _ => "".to_string(), |
877 | }; | |
878 | 34.6k | if !child_text.is_empty() { |
879 | 34.4k | text += &child_text; |
880 | 34.4k |
|
881 | } | |
882 | ||
883 | // get rid of the old children and replace with the text we just built | |
884 | 34.2k | mathml_leaf.clear_children(); |
885 | 34.2k | mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE)); |
886 | // debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf)); | |
887 | ||
888 | /// gather up all the contents of the element and return them with a leading space | |
889 | 7 | fn gather_text(html: Element) -> String { |
890 | 7 | let mut text = "".to_string(); // since we are throwing out the element tag, add a space between the contents |
891 | 7 | for child in html.children() { |
892 | 7 | match child { |
893 | 2 | ChildOfElement::Element(child) => { |
894 | 2 | text += &gather_text(child); |
895 | 2 | } |
896 | 5 | ChildOfElement::Text(t) => text += t.text(), |
897 | 0 | _ => (), |
898 | } | |
899 | } | |
900 | // debug!("gather_text: '{}'", text); | |
901 | 7 | return text; |
902 | 7 | } |
903 | 34.7k | } |
904 | ||
905 | 34.2k | fn rewrite_and_flatten_embedded_mathml(mathml_leaf: Element) -> bool { |
906 | // first see if it can or needs to be rewritten | |
907 | // this is likely rare, so we do a check and if true, to a second pass building the result | |
908 | 34.2k | let mut needs_rewrite = false; |
909 | 34.6k | for child in |
910 | 34.6k | if let Some( |
911 | 8 | if name(element) != "math" { |
912 | 7 | return false; // something other than MathML as a child -- can't rewrite |
913 | 1 | } |
914 | 1 | needs_rewrite = true; |
915 | 34.6k | } |
916 | }; | |
917 | ||
918 | 34.2k | if !needs_rewrite { |
919 | 34.2k | return false; |
920 | 1 | } |
921 | ||
922 | // now do the rewrite, flatting out the mathml and returning an mrow with the children | |
923 | 1 | let leaf_name = name(mathml_leaf); |
924 | 1 | let doc = mathml_leaf.document(); |
925 | 1 | let mut new_children = Vec::new(); |
926 | 1 | let mut is_last_mtext = false; |
927 | 5 | for child in |
928 | 5 | if let Some( |
929 | 1 | trim_element(element, true); |
930 | 1 | new_children.append(&mut element.children()); // don't want 'math' wrapper |
931 | 1 | is_last_mtext = false; |
932 | 4 | } else if let Some(text) = child.text() { |
933 | // combine adjacent text nodes into single nodes | |
934 | 4 | if is_last_mtext { |
935 | 2 | let last_child = new_children.last_mut().unwrap().element().unwrap(); |
936 | 2 | let new_text = as_text(last_child).to_string() + text.text(); |
937 | 2 | last_child.set_text(&new_text); |
938 | 2 | } else { |
939 | 2 | let new_leaf_node = create_mathml_element(&doc, leaf_name); |
940 | 2 | new_leaf_node.set_text(text.text()); |
941 | 2 | new_children.push(ChildOfElement::Element(new_leaf_node)); |
942 | 2 | is_last_mtext = true; |
943 | 2 | } |
944 | 0 | } |
945 | }; | |
946 | ||
947 | // clean up whitespace in text nodes | |
948 | 3 | for child in |
949 | 3 | if let Some(element) = child.element() && is_leaf(element) { |
950 | 2 | let text = as_text(element); |
951 | 2 | let cleaned_text = WHITESPACE_MATCH.replace_all(text, " ").trim_matches(WHITESPACE).to_string(); |
952 | 2 | element.set_text(&cleaned_text); |
953 | 2 |
|
954 | } | |
955 | ||
956 | 1 | crate::canonicalize::set_mathml_name(mathml_leaf, "mrow"); |
957 | 1 | mathml_leaf.clear_children(); |
958 | 1 | mathml_leaf.append_children(new_children); |
959 | ||
960 | // debug!("rewrite_and_flatten_embedded_mathml: flattened\n'{}'", mml_to_string(mathml_leaf)); | |
961 | 1 | return true; |
962 | 34.2k | } |
963 | 55.5k | } |
964 | ||
965 | // used for testing trim | |
966 | /// returns Ok() if two Documents are equal or some info where they differ in the Err | |
967 | #[allow(dead_code)] | |
968 | 11 | fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> { |
969 | // assume 'e' doesn't have element children until proven otherwise | |
970 | // this means we keep Text children until we are proven they aren't needed | |
971 | 11 | if doc1.root().children().len() != doc2.root().children().len() { |
972 | 0 | bail!( |
973 | "Children of docs have {} != {} children", | |
974 | 0 | doc1.root().children().len(), |
975 | 0 | doc2.root().children().len() |
976 | ); | |
977 | 11 | } |
978 | ||
979 | 11 | for (i, (c1, c2)) in doc1 |
980 | 11 | .root() |
981 | 11 | .children() |
982 | 11 | .iter() |
983 | 11 | .zip(doc2.root().children().iter()) |
984 | 11 | .enumerate() |
985 | { | |
986 | 11 | match c1 { |
987 | 11 | ChildOfRoot::Element(e1) => { |
988 | 11 | if let ChildOfRoot::Element(e2) = c2 { |
989 | 11 | is_same_element(*e1, *e2, &[]) |
990 | } else { | |
991 | 0 | bail!("child #{}, first is element, second is something else", i); |
992 | } | |
993 | } | |
994 | 0 | ChildOfRoot::Comment(com1) => { |
995 | 0 | if let ChildOfRoot::Comment(com2) = c2 { |
996 | 0 | if com1.text() != com2.text() { |
997 | 0 | bail!("child #{} -- comment text differs", i); |
998 | 0 | } |
999 | } else { | |
1000 | 0 | bail!("child #{}, first is comment, second is something else", i); |
1001 | } | |
1002 | } | |
1003 | 0 | ChildOfRoot::ProcessingInstruction(p1) => { |
1004 | 0 | if let ChildOfRoot::ProcessingInstruction(p2) = c2 { |
1005 | 0 | if p1.target() != p2.target() || p1.value() != p2.value() { |
1006 | 0 | bail!("child #{} -- processing instruction differs", i); |
1007 | 0 | } |
1008 | } else { | |
1009 | 0 | bail!( |
1010 | "child #{}, first is processing instruction, second is something else", | |
1011 | i | |
1012 | ); | |
1013 | } | |
1014 | } | |
1015 | } | |
1016 | } | |
1017 | 10 | return Ok(()); |
1018 | 11 | } |
1019 | ||
1020 | /// returns Ok() if two Documents are equal or some info where they differ in the Err | |
1021 | // Not really meant to be public -- used by tests in some packages | |
1022 | #[allow(dead_code)] | |
1023 | 1.92k | pub fn is_same_element(e1: Element, e2: Element, ignore_attrs: &[&str]) -> Result<()> { |
1024 | 1.92k | enable_logs(); |
1025 | 1.92k | if name(e1) != name(e2) { |
1026 | 0 | bail!("Names not the same: {}, {}", name(e1), name(e2)); |
1027 | 1.92k | } |
1028 | ||
1029 | // assume 'e' doesn't have element children until proven otherwise | |
1030 | // this means we keep Text children until we are proven they aren't needed | |
1031 | 1.92k | if e1.children().len() != e2.children().len() { |
1032 | 0 | bail!( |
1033 | "Children of {} have {} != {} children", | |
1034 | 0 | name(e1), |
1035 | 0 | e1.children().len(), |
1036 | 0 | e2.children().len() |
1037 | ); | |
1038 | 1.92k | } |
1039 | ||
1040 | 1.92k | if let Err( |
1041 | 0 | bail!("In element {}, {}", name(e1), e); |
1042 | 1.92k | } |
1043 | ||
1044 | 2.86k | for (i, (c1, c2)) in |
1045 | 2.86k | match c1 { |
1046 | 1.72k | ChildOfElement::Element(child1) => { |
1047 | 1.72k | if let ChildOfElement::Element(child2) = c2 { |
1048 | 1.72k | is_same_element(*child1, *child2, ignore_attrs) |
1049 | } else { | |
1050 | 0 | bail!("{} child #{}, first is element, second is something else", name(e1), i); |
1051 | } | |
1052 | } | |
1053 | 0 | ChildOfElement::Comment(com1) => { |
1054 | 0 | if let ChildOfElement::Comment(com2) = c2 { |
1055 | 0 | if com1.text() != com2.text() { |
1056 | 0 | bail!("{} child #{} -- comment text differs", name(e1), i); |
1057 | 0 | } |
1058 | } else { | |
1059 | 0 | bail!("{} child #{}, first is comment, second is something else", name(e1), i); |
1060 | } | |
1061 | } | |
1062 | 0 | ChildOfElement::ProcessingInstruction(p1) => { |
1063 | 0 | if let ChildOfElement::ProcessingInstruction(p2) = c2 { |
1064 | 0 | if p1.target() != p2.target() || p1.value() != p2.value() { |
1065 | 0 | bail!("{} child #{} -- processing instruction differs", name(e1), i); |
1066 | 0 | } |
1067 | } else { | |
1068 | 0 | bail!( |
1069 | "{} child #{}, first is processing instruction, second is something else", | |
1070 | 0 | name(e1), |
1071 | i | |
1072 | ); | |
1073 | } | |
1074 | } | |
1075 | 1.14k | ChildOfElement::Text(t1) => { |
1076 | 1.14k | if let ChildOfElement::Text(t2) = c2 { |
1077 | 1.14k | if t1.text() != t2.text() { |
1078 | 1 | bail!("{} child #{} -- text differs", name(e1), i); |
1079 | 1.14k | } |
1080 | } else { | |
1081 | 0 | bail!("{} child #{}, first is text, second is something else", name(e1), i); |
1082 | } | |
1083 | } | |
1084 | } | |
1085 | } | |
1086 | 1.91k | return Ok(()); |
1087 | ||
1088 | /// compares attributes -- '==' didn't seems to work | |
1089 | 1.92k | fn attrs_are_same(attrs1: Vec<Attribute>, attrs2: Vec<Attribute>, ignore: &[&str]) -> Result<()> { |
1090 | 1.92k | let attrs1 = attrs1.iter() |
1091 | 1.92k | .filter(|a| ! |
1092 | 1.92k | .collect::<Vec<Attribute>>(); |
1093 | 1.92k | let attrs2 = attrs2.iter() |
1094 | 1.92k | .filter(|a| ! |
1095 | 1.92k | .collect::<Vec<Attribute>>(); |
1096 | 1.92k | if attrs1.len() != attrs2.len() { |
1097 | 0 | bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2); |
1098 | 1.92k | } |
1099 | // can't guarantee attrs are in the same order | |
1100 | 1.92k | for |
1101 | 1.40k | if let Some(found_attr2) = attrs2 |
1102 | 1.40k | .iter() |
1103 | 1.88k | . |
1104 | { | |
1105 | 1.40k | if attr1.value() == found_attr2.value() { |
1106 | 1.40k | continue; |
1107 | } else { | |
1108 | 0 | bail!( |
1109 | "Attribute named {} has differing values:\n '{}'\n '{}'", | |
1110 | 0 | attr1.name().local_part(), |
1111 | 0 | attr1.value(), |
1112 | 0 | found_attr2.value() |
1113 | ); | |
1114 | } | |
1115 | } else { | |
1116 | 0 | bail!( |
1117 | "Attribute name {} not in [{}]", | |
1118 | 0 | print_attr(&attr1), |
1119 | 0 | print_attrs(&attrs2) |
1120 | ); | |
1121 | } | |
1122 | } | |
1123 | 1.92k | return Ok(()); |
1124 | ||
1125 | 0 | fn print_attr(attr: &Attribute) -> String { |
1126 | 0 | return format!("@{}='{}'", attr.name().local_part(), attr.value()); |
1127 | 0 | } |
1128 | 0 | fn print_attrs(attrs: &[Attribute]) -> String { |
1129 | 0 | return attrs.iter().map(print_attr).collect::<Vec<String>>().join(", "); |
1130 | 0 | } |
1131 | 1.92k | } |
1132 | 1.92k | } |
1133 | ||
1134 | #[cfg(test)] | |
1135 | mod tests { | |
1136 | #[allow(unused_imports)] | |
1137 | use super::super::init_logger; | |
1138 | use super::*; | |
1139 | ||
1140 | 10 | fn are_parsed_strs_equal(test: &str, target: &str) -> bool { |
1141 | 10 | let test_package = &parser::parse(test).expect("Failed to parse input"); |
1142 | 10 | let test_doc = test_package.as_document(); |
1143 | 10 | trim_doc(&test_doc); |
1144 | 10 | debug!("test:\n{}", |
1145 | ||
1146 | 10 | let target_package = &parser::parse(target).expect("Failed to parse input"); |
1147 | 10 | let target_doc = target_package.as_document(); |
1148 | 10 | trim_doc(&target_doc); |
1149 | 10 | debug!("target:\n{}", |
1150 | ||
1151 | 10 | match is_same_doc(&test_doc, &target_doc) { |
1152 | 10 | Ok(_) => return true, |
1153 | 0 | Err(e) => panic!("{}", e), |
1154 | } | |
1155 | 10 | } |
1156 | ||
1157 | #[test] | |
1158 | 1 | fn trim_same() { |
1159 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mi>a</mi></mrow></math>"; |
1160 | 1 | assert!(are_parsed_strs_equal(trimmed_str, trimmed_str)); |
1161 | 1 | } |
1162 | ||
1163 | #[test] | |
1164 | 1 | fn trim_whitespace() { |
1165 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mi> a </mi></mrow></math>"; |
1166 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1167 | 1 | assert!(are_parsed_strs_equal(trimmed_str, whitespace_str)); |
1168 | 1 | } |
1169 | ||
1170 | #[test] | |
1171 | 1 | fn no_trim_whitespace_nbsp() { |
1172 | 1 | let trimmed_str = "<math><mrow><mo>-</mo><mtext>  a </mtext></mrow></math>"; |
1173 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mtext>  a </mtext></mrow ></math>"; |
1174 | 1 | assert!(are_parsed_strs_equal(trimmed_str, whitespace_str)); |
1175 | 1 | } |
1176 | ||
1177 | #[test] | |
1178 | 1 | fn trim_comment() { |
1179 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1180 | 1 | let comment_str = "<math><mrow><mo>-</mo><!--a comment --><mi> a </mi></mrow></math>"; |
1181 | 1 | assert!(are_parsed_strs_equal(comment_str, whitespace_str)); |
1182 | 1 | } |
1183 | ||
1184 | #[test] | |
1185 | 1 | fn replace_mglyph() { |
1186 | 1 | let mglyph_str = "<math> |
1187 | 1 | <mrow> |
1188 | 1 | <mi>X<mglyph fontfamily='my-braid-font' index='2' alt='23braid' /></mi> |
1189 | 1 | <mo>+</mo> |
1190 | 1 | <mi> |
1191 | 1 | <mglyph fontfamily='my-braid-font' index='5' alt='132braid' />Y |
1192 | 1 | </mi> |
1193 | 1 | <mo>=</mo> |
1194 | 1 | <mi> |
1195 | 1 | <mglyph fontfamily='my-braid-font' index='3' alt='13braid' /> |
1196 | 1 | </mi> |
1197 | 1 | </mrow> |
1198 | 1 | </math>"; |
1199 | 1 | let result_str = "<math> |
1200 | 1 | <mrow> |
1201 | 1 | <mi>X23braid</mi> |
1202 | 1 | <mo>+</mo> |
1203 | 1 | <mi>132braidY</mi> |
1204 | 1 | <mo>=</mo> |
1205 | 1 | <mi>13braid</mi> |
1206 | 1 | </mrow> |
1207 | 1 | </math>"; |
1208 | 1 | assert!(are_parsed_strs_equal(mglyph_str, result_str)); |
1209 | 1 | } |
1210 | ||
1211 | #[test] | |
1212 | 1 | fn trim_differs() { |
1213 | 1 | let whitespace_str = "<math> <mrow ><mo>-</mo><mi> a </mi></mrow ></math>"; |
1214 | 1 | let different_str = "<math> <mrow ><mo>-</mo><mi> b </mi></mrow ></math>"; |
1215 | ||
1216 | // need to manually do this since failure shouldn't be a panic | |
1217 | 1 | let package1 = &parser::parse(whitespace_str).expect("Failed to parse input"); |
1218 | 1 | let doc1 = package1.as_document(); |
1219 | 1 | trim_doc(&doc1); |
1220 | 1 | debug!("doc1:\n{}", |
1221 | ||
1222 | 1 | let package2 = parser::parse(different_str).expect("Failed to parse input"); |
1223 | 1 | let doc2 = package2.as_document(); |
1224 | 1 | trim_doc(&doc2); |
1225 | 1 | debug!("doc2:\n{}", |
1226 | ||
1227 | 1 | assert!(is_same_doc(&doc1, &doc2).is_err()); |
1228 | 1 | } |
1229 | ||
1230 | #[test] | |
1231 | 1 | fn test_entities() { |
1232 | // this forces initialization | |
1233 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1234 | ||
1235 | 1 | let entity_str = set_mathml("<math><mrow><mo>−</mo><mi>𝕞</mi></mrow></math>").unwrap(); |
1236 | 1 | let converted_str = |
1237 | 1 | set_mathml("<math><mrow><mo>−</mo><mi>𝕞</mi></mrow></math>").unwrap(); |
1238 | ||
1239 | // need to remove unique ids | |
1240 | 1 | static ID_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"id='.+?' "#).unwrap()); |
1241 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1242 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1243 | 1 | assert_eq!(entity_str, converted_str, "normal entity test failed"); |
1244 | ||
1245 | 1 | let entity_str = set_mathml( |
1246 | "<math data-quot=\""value"\" data-apos=''value''><mi>XXX</mi></math>", | |
1247 | ) | |
1248 | 1 | .unwrap(); |
1249 | 1 | let converted_str = |
1250 | 1 | set_mathml("<math data-quot='\"value\"' data-apos=\"'value'\"><mi>XXX</mi></math>").unwrap(); |
1251 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1252 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1253 | 1 | assert_eq!(entity_str, converted_str, "special entities quote test failed"); |
1254 | ||
1255 | 1 | let entity_str = |
1256 | 1 | set_mathml("<math><mo><</mo><mo>></mo><mtext>&lt;</mtext></math>").unwrap(); |
1257 | 1 | let converted_str = |
1258 | 1 | set_mathml("<math><mo><</mo><mo>></mo><mtext>&lt;</mtext></math>") |
1259 | 1 | .unwrap(); |
1260 | 1 | let entity_str = ID_MATCH.replace_all(&entity_str, ""); |
1261 | 1 | let converted_str = ID_MATCH.replace_all(&converted_str, ""); |
1262 | 1 | assert_eq!(entity_str, converted_str, "special entities <,>,& test failed"); |
1263 | 1 | } |
1264 | ||
1265 | #[test] | |
1266 | 1 | fn can_recover_from_invalid_set_rules_dir() { |
1267 | use std::env; | |
1268 | // MathCAT will check the env var "MathCATRulesDir" as an override, so the following test might succeed if we don't override the env var | |
1269 | 1 | unsafe { env::set_var("MathCATRulesDir", "MathCATRulesDir"); } // safe because we are single threaded |
1270 | 1 | assert!(set_rules_dir("someInvalidRulesDir").is_err()); |
1271 | 1 | assert!( |
1272 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).is_ok(), |
1273 | "\nset_rules_dir to '{}' failed", | |
1274 | 0 | super::super::abs_rules_dir_path() |
1275 | ); | |
1276 | 1 | assert!(set_mathml("<math><mn>1</mn></math>").is_ok()); |
1277 | 1 | } |
1278 | ||
1279 | #[test] | |
1280 | 1 | fn single_html_in_mtext() { |
1281 | 1 | let test = "<math><mn>1</mn> <mtext>a<p> para 1</p>bc</mtext> <mi>y</mi></math>"; |
1282 | 1 | let target = "<math><mn>1</mn> <mtext>a para 1bc</mtext> <mi>y</mi></math>"; |
1283 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1284 | 1 | } |
1285 | ||
1286 | #[test] | |
1287 | 1 | fn multiple_html_in_mtext() { |
1288 | 1 | let test = "<math><mn>1</mn> <mtext>a<p>para 1</p> <p>para 2</p>bc </mtext> <mi>y</mi></math>"; |
1289 | 1 | let target = "<math><mn>1</mn> <mtext>apara 1 para 2bc</mtext> <mi>y</mi></math>"; |
1290 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1291 | 1 | } |
1292 | ||
1293 | #[test] | |
1294 | 1 | fn nested_html_in_mtext() { |
1295 | 1 | let test = "<math><mn>1</mn> <mtext>a <ol><li>first</li><li>second</li></ol> bc</mtext> <mi>y</mi></math>"; |
1296 | 1 | let target = "<math><mn>1</mn> <mtext>a firstsecond bc</mtext> <mi>y</mi></math>"; |
1297 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1298 | 1 | } |
1299 | ||
1300 | #[test] | |
1301 | 1 | fn empty_html_in_mtext() { |
1302 | 1 | let test = "<math><mn>1</mn> <mtext>a<br/>bc</mtext> <mi>y</mi></math>"; |
1303 | 1 | let target = "<math><mn>1</mn> <mtext>abc</mtext> <mi>y</mi></math>"; |
1304 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1305 | 1 | } |
1306 | ||
1307 | #[test] | |
1308 | 1 | fn mathml_in_mtext() { |
1309 | 1 | let test = "<math><mtext>if <math> <msup><mi>n</mi><mn>2</mn></msup></math> is real</mtext></math>"; |
1310 | 1 | let target = "<math><mrow><mtext>if </mtext><msup><mi>n</mi><mn>2</mn></msup><mtext> is real</mtext></mrow></math>"; |
1311 | 1 | assert!(are_parsed_strs_equal(test, target)); |
1312 | 1 | } |
1313 | ||
1314 | #[test] | |
1315 | 1 | fn stack_overflow_protection() { |
1316 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1317 | 1 | let mut bad_mathml = String::from("<math>"); |
1318 | 513 | for _ in |
1319 | 513 | bad_mathml.push_str("<msqrt><mi>n</mi>"); |
1320 | 513 | } |
1321 | 513 | for _ in |
1322 | 513 | bad_mathml.push_str("</msqrt>"); |
1323 | 513 | } |
1324 | 1 | bad_mathml.push_str("</math>"); |
1325 | 1 | assert_eq!(set_mathml(bad_mathml).unwrap_err().to_string(), "MathML is too deeply nested to process"); |
1326 | 1 | } |
1327 | ||
1328 | #[test] | |
1329 | 1 | fn old_mathml_cleared_on_error() { |
1330 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1331 | 1 | let good_mathml = "<math><mn>3</mn></math>"; |
1332 | 1 | set_mathml(good_mathml).unwrap(); |
1333 | 1 | let bad_mathml = "<math><mi>&xabc;</mi></math>"; |
1334 | 1 | assert!(set_mathml(bad_mathml).is_err()); |
1335 | 1 | assert!(get_spoken_text().unwrap() == ""); |
1336 | 1 | set_mathml(good_mathml).unwrap(); |
1337 | 1 | let bad_mathml = "<math>garbage"; |
1338 | 1 | assert!(set_mathml(bad_mathml).is_err()); |
1339 | 1 | assert!(get_spoken_text().unwrap() == ""); |
1340 | 1 | } |
1341 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | ||
3 | //! A library for generating speech and braille from MathML | |
4 | //! | |
5 | //! Typical usage is: | |
6 | //! 1. Set the rules directory [`set_rules_dir`] | |
7 | //! 2. Set whatever preferences are need with repeated calls to [`set_preference`]. | |
8 | //! 3. Set MathML via [`set_mathml`] | |
9 | //! A string representing the cleaned up MathML along with `id`s on each node is returned for highlighting if desired | |
10 | //! 4. Get the speech [`get_spoken_text`] or (Unicode) braille [`get_braille`]. | |
11 | //! | |
12 | //! The expression can be navigated also. | |
13 | //! This is done in one of two ways: | |
14 | //! 1. Pass key strokes to allow a user to navigate the MathML by calling [`do_navigate_keypress`]; the speech is returned. | |
15 | //! 2. Pass the MathCAT navigation command directory by called [`do_navigate_command`]; the speech is return returned. | |
16 | //! | |
17 | //! To get the MathML associated with the current navigation node, call [`get_navigation_mathml`]. | |
18 | //! To just get the `id` and offset from the id of the current navigation node, call [`get_navigation_mathml_id`]. | |
19 | /// | |
20 | /// This module re-exports anyhow types. Use `bail!` for early returns and | |
21 | /// `context()`/`with_context()` on Result to add context (replacing old `chain_err()`). | |
22 | pub mod errors { | |
23 | pub use anyhow::{anyhow, bail, Error, Result, Context}; | |
24 | } | |
25 | ||
26 | pub mod interface; | |
27 | #[cfg(feature = "include-zip")] | |
28 | pub use shim_filesystem::ZIPPED_RULE_FILES; | |
29 | ||
30 | mod canonicalize; | |
31 | mod infer_intent; | |
32 | pub mod speech; | |
33 | mod braille; | |
34 | mod navigate; | |
35 | mod prefs; | |
36 | mod tts; | |
37 | mod xpath_functions; | |
38 | mod definitions; | |
39 | pub mod pretty_print; | |
40 | mod chemistry; | |
41 | ||
42 | pub mod shim_filesystem; // really just for override_file_for_debugging_rules, but the config seems to throw it off | |
43 | pub use interface::*; | |
44 | use crate::errors::{bail, Result}; | |
45 | ||
46 | #[cfg(test)] | |
47 | 0 | pub fn init_logger() { |
48 | 0 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")) |
49 | 0 | .is_test(true) |
50 | 0 | .format_timestamp(None) |
51 | 0 | .format_module_path(false) |
52 | 0 | .format_indent(None) |
53 | 0 | .format_level(false) |
54 | 0 | .init(); |
55 | 0 | } |
56 | ||
57 | /// Build Absolute path to rules dir for testing | |
58 | 250 | pub fn abs_rules_dir_path() -> String { |
59 | cfg_if::cfg_if! { | |
60 | if #[cfg(feature = "include-zip")] { | |
61 | return "Rules".to_string(); | |
62 | } else { | |
63 | // Package root (see tests/common/mod.rs `abs_rules_dir_path` for rationale). | |
64 | 250 | return std::path::Path::new(env!("CARGO_MANIFEST_DIR")) |
65 | 250 | .join("Rules") |
66 | 250 | .to_str() |
67 | 250 | .expect("CARGO_MANIFEST_DIR and Rules path must be UTF-8") |
68 | 250 | .to_string(); |
69 | } | |
70 | } | |
71 | 250 | } |
72 | ||
73 | 141 | pub fn are_strs_canonically_equal_with_locale(test: &str, target: &str, ignore_attrs: &[&str], block_separators: &str, decimal_separators: &str) -> Result<()> { |
74 | use crate::{interface::*, pretty_print::mml_to_string}; | |
75 | use sxd_document::parser; | |
76 | use crate::canonicalize::canonicalize; | |
77 | use std::panic::{catch_unwind, AssertUnwindSafe}; | |
78 | ||
79 | 141 | crate::interface::init_panic_handler(); |
80 | 141 | let result = catch_unwind(AssertUnwindSafe(|| { |
81 | // this forces initialization | |
82 | 141 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
83 | 141 | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); |
84 | 141 | set_preference("Language", "en").unwrap(); |
85 | 141 | set_preference("BlockSeparators", block_separators).unwrap(); |
86 | 141 | set_preference("DecimalSeparators", decimal_separators).unwrap(); |
87 | ||
88 | 141 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
89 | 141 | let mathml = get_element(package1); |
90 | 141 | trim_element(mathml, false); |
91 | 141 | let mathml_test = canonicalize(mathml).unwrap(); |
92 | ||
93 | 141 | let package2 = &parser::parse(target).expect("Failed to parse target input"); |
94 | 141 | let mathml_target = get_element(package2); |
95 | 141 | trim_element(mathml_target, false); |
96 | ||
97 | 141 | match is_same_element(mathml_test, mathml_target, ignore_attrs) { |
98 | 141 | Ok(_) => Ok( () ), |
99 | 0 | Err(e) => { |
100 | 0 | bail!("{}\nResult:\n{}\nTarget:\n{}", e, mml_to_string(mathml_test), mml_to_string(mathml_target)); |
101 | }, | |
102 | } | |
103 | 141 | })); |
104 | 141 | match crate::interface::report_any_panic(result) { |
105 | 140 | Ok(()) => Ok(()), |
106 | 1 | Err(e) => { |
107 | 1 | eprintln!("{}", e); |
108 | 1 | Err(e) |
109 | } | |
110 | } | |
111 | 141 | } |
112 | ||
113 | /// sets locale to be US standard | |
114 | 30 | pub fn are_strs_canonically_equal(test: &str, target: &str, ignore_attrs: &[&str]) -> bool { |
115 | 30 | are_strs_canonically_equal_with_locale(test, target, ignore_attrs, ", \u{00A0}\u{202F}", ".").is_ok() |
116 | 30 | } |
117 | ||
118 | /// Like `are_strs_canonically_equal` but returns `Result` for use in `#[test]` functions that return `Result<()>`. | |
119 | 102 | pub fn are_strs_canonically_equal_result(test: &str, target: &str, ignore_attrs: &[&str]) -> Result<()> { |
120 | 102 | are_strs_canonically_equal_with_locale(test, target, ignore_attrs, ", \u{00A0}\u{202F}", ".") |
121 | 102 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | ||
3 | //! A library for generating speech and braille from MathML | |
4 | //! | |
5 | //! Typical usage is: | |
6 | //! 1. Set the rules directory [`set_rules_dir`] | |
7 | //! 2. Set whatever preferences are need with repeated calls to [`set_preference`]. | |
8 | //! 3. Set MathML via [`set_mathml`] | |
9 | //! A string representing the cleaned up MathML along with `id`s on each node is returned for highlighting if desired | |
10 | //! 4. Get the speech [`get_spoken_text`] or (Unicode) braille [`get_braille`]. | |
11 | //! | |
12 | //! The expression can be navigated also. | |
13 | //! This is done in one of two ways: | |
14 | //! 1. Pass key strokes to allow a user to navigate the MathML by calling [`do_navigate_keypress`]; the speech is returned. | |
15 | //! 2. Pass the MathCAT navigation command directory by called [`do_navigate_command`]; the speech is return returned. | |
16 | //! | |
17 | //! To get the MathML associated with the current navigation node, call [`get_navigation_mathml`]. | |
18 | //! To just get the `id` and offset from the id of the current navigation node, call [`get_navigation_mathml_id`]. | |
19 | /// | |
20 | /// This module re-exports anyhow types. Use `bail!` for early returns and | |
21 | /// `context()`/`with_context()` on Result to add context (replacing old `chain_err()`). | |
22 | pub mod errors { | |
23 | pub use anyhow::{anyhow, bail, Error, Result, Context}; | |
24 | } | |
25 | ||
26 | pub mod interface; | |
27 | #[cfg(feature = "include-zip")] | |
28 | pub use shim_filesystem::ZIPPED_RULE_FILES; | |
29 | ||
30 | mod canonicalize; | |
31 | mod infer_intent; | |
32 | pub mod speech; | |
33 | mod braille; | |
34 | mod navigate; | |
35 | mod prefs; | |
36 | mod tts; | |
37 | mod xpath_functions; | |
38 | mod definitions; | |
39 | pub mod pretty_print; | |
40 | mod chemistry; | |
41 | ||
42 | pub mod shim_filesystem; // really just for override_file_for_debugging_rules, but the config seems to throw it off | |
43 | pub use interface::*; | |
44 | use crate::errors::{bail, Result}; | |
45 | ||
46 | #[cfg(test)] | |
47 | 0 | pub fn init_logger() { |
48 | 0 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")) |
49 | 0 | .is_test(true) |
50 | 0 | .format_timestamp(None) |
51 | 0 | .format_module_path(false) |
52 | 0 | .format_indent(None) |
53 | 0 | .format_level(false) |
54 | 0 | .init(); |
55 | 0 | } |
56 | ||
57 | /// Build Absolute path to rules dir for testing | |
58 | 250 | pub fn abs_rules_dir_path() -> String { |
59 | cfg_if::cfg_if! { | |
60 | if #[cfg(feature = "include-zip")] { | |
61 | return "Rules".to_string(); | |
62 | } else { | |
63 | // Package root (see tests/common/mod.rs `abs_rules_dir_path` for rationale). | |
64 | 250 | return std::path::Path::new(env!("CARGO_MANIFEST_DIR")) |
65 | 250 | .join("Rules") |
66 | 250 | .to_str() |
67 | 250 | .expect("CARGO_MANIFEST_DIR and Rules path must be UTF-8") |
68 | 250 | .to_string(); |
69 | } | |
70 | } | |
71 | 250 | } |
72 | ||
73 | 141 | pub fn are_strs_canonically_equal_with_locale(test: &str, target: &str, ignore_attrs: &[&str], block_separators: &str, decimal_separators: &str) -> Result<()> { |
74 | use crate::{interface::*, pretty_print::mml_to_string}; | |
75 | use sxd_document::parser; | |
76 | use crate::canonicalize::canonicalize; | |
77 | use std::panic::{catch_unwind, AssertUnwindSafe}; | |
78 | ||
79 | 141 | crate::interface::init_panic_handler(); |
80 | 141 | let result = catch_unwind(AssertUnwindSafe(|| { |
81 | // this forces initialization | |
82 | 141 | crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); |
83 | 141 | crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); |
84 | 141 | set_preference("Language", "en").unwrap(); |
85 | 141 | set_preference("BlockSeparators", block_separators).unwrap(); |
86 | 141 | set_preference("DecimalSeparators", decimal_separators).unwrap(); |
87 | ||
88 | 141 | let package1 = &parser::parse(test).expect("Failed to parse test input"); |
89 | 141 | let mathml = get_element(package1); |
90 | 141 | trim_element(mathml, false); |
91 | 141 | let mathml_test = canonicalize(mathml).unwrap(); |
92 | ||
93 | 141 | let package2 = &parser::parse(target).expect("Failed to parse target input"); |
94 | 141 | let mathml_target = get_element(package2); |
95 | 141 | trim_element(mathml_target, false); |
96 | ||
97 | 141 | match is_same_element(mathml_test, mathml_target, ignore_attrs) { |
98 | 141 | Ok(_) => Ok( () ), |
99 | 0 | Err(e) => { |
100 | 0 | bail!("{}\nResult:\n{}\nTarget:\n{}", e, mml_to_string(mathml_test), mml_to_string(mathml_target)); |
101 | }, | |
102 | } | |
103 | 141 | })); |
104 | 141 | match crate::interface::report_any_panic(result) { |
105 | 140 | Ok(()) => Ok(()), |
106 | 1 | Err(e) => { |
107 | 1 | eprintln!("{}", e); |
108 | 1 | Err(e) |
109 | } | |
110 | } | |
111 | 141 | } |
112 | ||
113 | /// sets locale to be US standard | |
114 | 30 | pub fn are_strs_canonically_equal(test: &str, target: &str, ignore_attrs: &[&str]) -> bool { |
115 | 30 | are_strs_canonically_equal_with_locale(test, target, ignore_attrs, ", \u{00A0}\u{202F}", ".").is_ok() |
116 | 30 | } |
117 | ||
118 | /// Like `are_strs_canonically_equal` but returns `Result` for use in `#[test]` functions that return `Result<()>`. | |
119 | 102 | pub fn are_strs_canonically_equal_result(test: &str, target: &str, ignore_attrs: &[&str]) -> Result<()> { |
120 | 102 | are_strs_canonically_equal_with_locale(test, target, ignore_attrs, ", \u{00A0}\u{202F}", ".") |
121 | 102 | } |
Line | Count | Source |
1 | // *** MathCAT doesn't normally want to build a binary *** | |
2 | // *** This file is here because it is useful for trying out things *** | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use libmathcat::interface::*; | |
6 | use log::{debug, info}; | |
7 | use std::time::Instant; | |
8 | use std::process::exit; | |
9 | ||
10 | ||
11 | // Maybe also have this speak to test the TTS generation. | |
12 | // There is a rust winapi crate that mirrors the WinPAI and has "Speak(...)" in it | |
13 | ||
14 | // env RUST_LOG=DEBUG cargo run --features "include-zip" | |
15 | cfg_if::cfg_if! { | |
16 | if #[cfg(feature = "include-zip")] { | |
17 | fn get_rules_dir() -> String { | |
18 | return "Rules".to_string(); | |
19 | } | |
20 | } else { | |
21 | 0 | fn get_rules_dir() -> String { |
22 | // for testing with zipped rules dir | |
23 | // let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../../MathCATForPython/addon/globalPlugins/MathCAT/Rules"); | |
24 | 0 | let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../Rules"); |
25 | 0 | return rules_path.as_os_str().to_str().unwrap().to_string(); |
26 | 0 | } |
27 | } | |
28 | } | |
29 | ||
30 | 0 | fn main() { |
31 | 0 | env_logger::builder() |
32 | 0 | .format_timestamp(None) |
33 | 0 | .format_module_path(false) |
34 | 0 | .format_indent(Some(2)) |
35 | 0 | .format_level(false) |
36 | 0 | .init(); |
37 | ||
38 | // let expr = r#" | |
39 | // <math xmlns="http://www.w3.org/1998/Math/MathML" display="block"> | |
40 | // <mrow> | |
41 | // <msup> | |
42 | // <mi>e</mi> | |
43 | // <mrow> | |
44 | // <mo>−</mo> | |
45 | // <mfrac> | |
46 | // <mn>1</mn> | |
47 | // <mn>2</mn> | |
48 | // </mfrac> | |
49 | // <msup> | |
50 | // <mrow> | |
51 | // <mrow> | |
52 | // <mo>(</mo> | |
53 | // <mrow> | |
54 | // <mfrac> | |
55 | // <mrow> | |
56 | // <mi>x</mi> | |
57 | // <mo>−</mo> | |
58 | // <mi>μ</mi> | |
59 | // </mrow> | |
60 | // <mi>σ</mi> | |
61 | // </mfrac> | |
62 | // </mrow> | |
63 | // <mo>)</mo> | |
64 | // </mrow> | |
65 | // </mrow> | |
66 | // <mn>2</mn> | |
67 | // </msup> | |
68 | // </mrow> | |
69 | // </msup> | |
70 | // </mrow> | |
71 | // </math> | |
72 | // "#; | |
73 | // let expr = "<math display='inline' xmlns='http://www.w3.org/1998/Math/MathML'> | |
74 | // <msup intent='power($base(2, $base),silly($exp,-1.))'> | |
75 | // <mi arg='base'>x</mi> | |
76 | // <mi arg='exp'>n</mi> | |
77 | // </msup> | |
78 | // </math> | |
79 | // "; | |
80 | // let expr = "<mrow intent='pre@prefix(in@infix($a, x))(post@postfix($b))'> | |
81 | // <mi arg='a'>A</mi> | |
82 | // <mover> | |
83 | // <mo intent='map'>⟶</mo> | |
84 | // <mo intent='congruence'>≅</mo> | |
85 | // </mover> | |
86 | // <mi arg='b'>B</mi> | |
87 | // </mrow>"; | |
88 | // let expr = "<math><mi>Na</mi><mi>S</mi><mo>(</mo><mi>l</mi><mo>)</mo></math>"; | |
89 | ||
90 | ||
91 | // let expr = "<math xmlns='http://www.w3.org/1998/Math/MathML' display='block'> | |
92 | // <mrow> | |
93 | // <mo stretchy='false'>[</mo> | |
94 | // <mrow> | |
95 | // <mi>Co</mi> | |
96 | // </mrow> | |
97 | // <mo stretchy='false'>(</mo> | |
98 | // <mrow> | |
99 | // <mi>NH</mi> | |
100 | // </mrow> | |
101 | // <msub> | |
102 | // <mrow> | |
103 | // <mrow> | |
104 | // <mpadded width='0'> | |
105 | // <mphantom> | |
106 | // <mi>A</mi> | |
107 | // </mphantom> | |
108 | // </mpadded> | |
109 | // </mrow> | |
110 | // </mrow> | |
111 | // <mrow> | |
112 | // <mrow> | |
113 | // <mpadded height='0'> | |
114 | // <mn>3</mn> | |
115 | // </mpadded> | |
116 | // </mrow> | |
117 | // </mrow> | |
118 | // </msub> | |
119 | // <mo stretchy='false'>)</mo> | |
120 | // <msub> | |
121 | // <mrow> | |
122 | // <mrow> | |
123 | // <mpadded width='0'> | |
124 | // <mphantom> | |
125 | // <mi>A</mi> | |
126 | // </mphantom> | |
127 | // </mpadded> | |
128 | // </mrow> | |
129 | // </mrow> | |
130 | // <mrow> | |
131 | // <mrow> | |
132 | // <mpadded height='0'> | |
133 | // <mn>6</mn> | |
134 | // </mpadded> | |
135 | // </mrow> | |
136 | // </mrow> | |
137 | // </msub> | |
138 | // <mo stretchy='false'>]</mo> | |
139 | // <msup> | |
140 | // <mrow> | |
141 | // <mrow> | |
142 | // <mpadded width='0'> | |
143 | // <mphantom> | |
144 | // <mi>A</mi> | |
145 | // </mphantom> | |
146 | // </mpadded> | |
147 | // </mrow> | |
148 | // </mrow> | |
149 | // <mrow> | |
150 | // <mn>3</mn> | |
151 | // <mo>+</mo> | |
152 | // </mrow> | |
153 | // </msup> | |
154 | // <mtext> </mtext> | |
155 | // <mo stretchy='false'>(</mo> | |
156 | // <mrow> | |
157 | // <mi>Cl</mi> | |
158 | // </mrow> | |
159 | // <msub> | |
160 | // <mrow> | |
161 | // <mrow> | |
162 | // <mpadded width='0'> | |
163 | // <mphantom> | |
164 | // <mi>A</mi> | |
165 | // </mphantom> | |
166 | // </mpadded> | |
167 | // </mrow> | |
168 | // </mrow> | |
169 | // <mrow> | |
170 | // <mrow> | |
171 | // <mpadded height='0'> | |
172 | // <mn>3</mn> | |
173 | // </mpadded> | |
174 | // </mrow> | |
175 | // </mrow> | |
176 | // </msub> | |
177 | // <mo stretchy='false'>)</mo> | |
178 | // <msup> | |
179 | // <mrow> | |
180 | // <mrow> | |
181 | // <mpadded width='0'> | |
182 | // <mphantom> | |
183 | // <mi>A</mi> | |
184 | // </mphantom> | |
185 | // </mpadded> | |
186 | // </mrow> | |
187 | // </mrow> | |
188 | // <mrow> | |
189 | // <mo>−</mo>, | |
190 | // </mrow> | |
191 | // </msup> | |
192 | // </mrow> | |
193 | // </math>"; | |
194 | ||
195 | 0 | let expr = r#" |
196 | 0 | <math> |
197 | 0 | <msub><mi mathvariant="normal">N</mi><mn>2</mn></msub> |
198 | 0 | <munderover><mo>→</mo><mtext>Haber process</mtext><msub><mi mathvariant="normal">H</mi><mn>2</mn></msub></munderover> |
199 | 0 | <mi mathvariant="normal">N</mi> |
200 | 0 | <msub><mi mathvariant="normal">H</mi><mn>3</mn></msub> |
201 | 0 | </math> |
202 | 0 | "#; |
203 | // let instant = Instant::now(); | |
204 | ||
205 | // let rules_dir = "".to_string(); // Use MathCATRulesDir, potentially pointing to a zipped version | |
206 | 0 | if let Err(e) = set_rules_dir(get_rules_dir()) { |
207 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1); |
208 | 0 | } |
209 | 0 | debug!("Languages: {}", libmathcat::interface::get_supported_languages().unwrap_or_default().join(", ")); |
210 | ||
211 | #[cfg(feature = "include-zip")] | |
212 | info!("***********include-zip is present**********"); | |
213 | 0 | info!("Version = '{}' using Rules dir {}", get_version(), get_rules_dir()); |
214 | 0 | set_preference("Language", "en").unwrap(); |
215 | 0 | set_preference("DecimalSeparator", "Auto").unwrap(); |
216 | 0 | set_preference("BrailleCode", "UEB").unwrap(); |
217 | 0 | set_preference("BrailleNavHighlight", "On").unwrap(); |
218 | 0 | set_preference("TTS", "None").unwrap(); |
219 | 0 | set_preference("Verbosity", "Verbose").unwrap(); |
220 | 0 | set_preference("NavVerbosity", "Verbose").unwrap(); |
221 | 0 | set_preference("NavMode", "Enhanced").unwrap(); |
222 | 0 | set_preference("Impairment", "Blindness").unwrap(); |
223 | 0 | set_preference("SpeechOverrides_CapitalLetters", "").unwrap(); |
224 | 0 | set_preference("MathRate", "80").unwrap(); |
225 | // set_preference("CapitalLetters_UseWord", "true").unwrap(); | |
226 | // set_preference("CapitalLetters_Pitch", "30").unwrap(); | |
227 | 0 | set_preference("CapitalLetters_Beep", "true").unwrap(); |
228 | 0 | set_preference("IntentErrorRecovery", "Error").unwrap(); |
229 | // set_preference("MathRate", "77").unwrap(); | |
230 | ||
231 | 0 | set_preference("Bookmark", "false").unwrap(); |
232 | 0 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
233 | 0 | info!("Languages: {}", libmathcat::interface::get_supported_languages().unwrap_or_default().join(", ")); |
234 | 0 | info!("Speech styles: {}", libmathcat::interface::get_supported_speech_styles("ClearSpeak").unwrap_or_default().join(", ")); |
235 | 0 | info!("BrailleCodes: {}", libmathcat::interface::get_supported_braille_codes().unwrap_or_default().join(", ")); |
236 | // set_preference("DecimalSeparators", ",").unwrap(); | |
237 | // set_preference("BlockSeparators", ". ").unwrap(); | |
238 | 0 | if let Err(e) = set_mathml(expr) { |
239 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1); |
240 | 0 | }; |
241 | ||
242 | // match do_navigate_command("ZoomIn".to_string()) { | |
243 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
244 | // Ok(speech) => info!("\nZoomIn speech: '{speech}'"), | |
245 | // } | |
246 | // match do_navigate_command("ToggleZoomLockUp".to_string()) { | |
247 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
248 | // Ok(speech) => info!("ToggleZoomLockUp speech: '{speech}'"), | |
249 | // } | |
250 | // match do_navigate_command("MovePrevious".to_string()) { | |
251 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
252 | // Ok(speech) => info!("MovePrevious speech: '{speech}'"), | |
253 | // } | |
254 | // match do_navigate_command("MovePrevious".to_string()) { | |
255 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
256 | // Ok(speech) => info!("MovePrevious speech: '{}'", speech), | |
257 | // } | |
258 | // match do_navigate_command("MovePrevious".to_string()) { | |
259 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
260 | // Ok(speech) => info!("MovePrevious speech: '{}'", speech), | |
261 | // } | |
262 | // match do_navigate_command("MoveNext".to_string()) { | |
263 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
264 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
265 | // } | |
266 | // match do_navigate_command("MoveNext".to_string()) { | |
267 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
268 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
269 | // } | |
270 | // match do_navigate_command("MoveNext".to_string()) { | |
271 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
272 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
273 | // } | |
274 | // match do_navigate_command("MoveNext".to_string()) { | |
275 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
276 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
277 | // } | |
278 | // match get_spoken_text() { | |
279 | // Ok(speech) => info!("Computed speech string:\n '{speech}'"), | |
280 | // Err(e) => eprintln!("{}", errors_to_string(&e)); exit(1);, | |
281 | // } | |
282 | 0 | debug!("Speech language is {}", get_preference("Language").unwrap()); |
283 | 0 | debug!("DecimalSeparator: {:?}", get_preference("DecimalSeparator").unwrap()); |
284 | 0 | debug!("DecimalSeparators: {:?}, BlockSeparators: {:?}", get_preference("DecimalSeparators").unwrap(), get_preference("BlockSeparators").unwrap()); |
285 | 0 | debug!("SpeechStyle: {:?}", get_preference("SpeechStyle").unwrap()); |
286 | 0 | debug!("Verbosity: {:?}", get_preference("Verbosity").unwrap()); |
287 | ||
288 | // info!("Time taken for loading+speech+braille: {}ms", instant.elapsed().as_millis()); | |
289 | // let instant = Instant::now(); | |
290 | 0 | match get_spoken_text() { |
291 | 0 | Ok(speech) => info!("Computed speech string:\n '{}'", speech), |
292 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
293 | } | |
294 | // info!("Time taken (second time for speech): {}ms", instant.elapsed().as_millis()); | |
295 | // info!("SpeechStyle: {:?}", get_preference("SpeechStyle")); | |
296 | ||
297 | 0 | match get_braille("") { |
298 | 0 | Ok(braille) => info!("Computed braille string:\n '{braille}'"), |
299 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
300 | } | |
301 | 0 | debug!("...using BrailleCode: {:?}", get_preference("BrailleCode").unwrap()); |
302 | // let xpath_counts = libmathcat::speech::xpath_count(); | |
303 | // info!("#xpath = {}; duplicates = {}", xpath_counts.0, xpath_counts.1); | |
304 | // info!("Time taken (second time for speech + braille): {}ms", instant.elapsed().as_millis()); | |
305 | // debug!("Hashmap sizes:\n{}", libmathcat::speech::SpeechRules::print_sizes()); | |
306 | 0 | timing_test(expr, 000); |
307 | ||
308 | 0 | } |
309 | ||
310 | 0 | fn timing_test(expr: &str, n_loops: usize) { |
311 | 0 | if n_loops == 0 { |
312 | 0 | return; |
313 | 0 | } |
314 | ||
315 | 0 | let n_loops_float = n_loops as f64; |
316 | 0 | let instant = Instant::now(); |
317 | 0 | for _ in 0..n_loops { |
318 | 0 | if let Err(e) = set_mathml(expr) { |
319 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
320 | 0 | }; |
321 | 0 | match get_spoken_text() { |
322 | 0 | Ok(_) =>( ), |
323 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
324 | } | |
325 | 0 | match get_braille("") { |
326 | 0 | Ok(_) => (), |
327 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
328 | } | |
329 | } | |
330 | 0 | info!("Time taken (time for set, speech, {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
331 | ||
332 | 0 | let instant = Instant::now(); |
333 | 0 | for _ in 0..n_loops { |
334 | 0 | if let Err(e) = set_mathml(expr) { |
335 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
336 | 0 | }; |
337 | } | |
338 | 0 | info!("Time taken (time for set averaged over {} loops): {}ms", n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
339 | ||
340 | 0 | let instant = Instant::now(); |
341 | 0 | for _ in 0..n_loops { |
342 | 0 | match get_spoken_text() { |
343 | 0 | Ok(_) =>( ), |
344 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
345 | } | |
346 | } | |
347 | 0 | info!("Time taken (time for get_spoken_text() averaged over {} loops): {}ms", n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
348 | ||
349 | 0 | set_preference("BrailleCode", "UEB").unwrap(); |
350 | 0 | get_braille("").unwrap(); |
351 | 0 | let instant = Instant::now(); |
352 | 0 | for _ in 0..n_loops { |
353 | 0 | match get_braille("") { |
354 | 0 | Ok(_) => (), |
355 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
356 | } | |
357 | } | |
358 | 0 | info!("Time taken (time for {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
359 | ||
360 | 0 | if let Err(e) = set_mathml(expr) { |
361 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
362 | 0 | }; |
363 | 0 | set_preference("BrailleCode", "Nemeth").unwrap(); |
364 | 0 | get_braille("").unwrap(); |
365 | 0 | let instant = Instant::now(); |
366 | 0 | for _ in 0..n_loops { |
367 | 0 | match get_braille("") { |
368 | 0 | Ok(_) => (), |
369 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);} , |
370 | } | |
371 | } | |
372 | 0 | info!("Time taken (time for {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
373 | 0 | } |
Line | Count | Source |
1 | // *** MathCAT doesn't normally want to build a binary *** | |
2 | // *** This file is here because it is useful for trying out things *** | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use libmathcat::interface::*; | |
6 | use log::{debug, info}; | |
7 | use std::time::Instant; | |
8 | use std::process::exit; | |
9 | ||
10 | ||
11 | // Maybe also have this speak to test the TTS generation. | |
12 | // There is a rust winapi crate that mirrors the WinPAI and has "Speak(...)" in it | |
13 | ||
14 | // env RUST_LOG=DEBUG cargo run --features "include-zip" | |
15 | cfg_if::cfg_if! { | |
16 | if #[cfg(feature = "include-zip")] { | |
17 | fn get_rules_dir() -> String { | |
18 | return "Rules".to_string(); | |
19 | } | |
20 | } else { | |
21 | 0 | fn get_rules_dir() -> String { |
22 | // for testing with zipped rules dir | |
23 | // let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../../MathCATForPython/addon/globalPlugins/MathCAT/Rules"); | |
24 | 0 | let rules_path = std::env::current_exe().unwrap().parent().unwrap().join("../../Rules"); |
25 | 0 | return rules_path.as_os_str().to_str().unwrap().to_string(); |
26 | 0 | } |
27 | } | |
28 | } | |
29 | ||
30 | 0 | fn main() { |
31 | 0 | env_logger::builder() |
32 | 0 | .format_timestamp(None) |
33 | 0 | .format_module_path(false) |
34 | 0 | .format_indent(Some(2)) |
35 | 0 | .format_level(false) |
36 | 0 | .init(); |
37 | ||
38 | // let expr = r#" | |
39 | // <math xmlns="http://www.w3.org/1998/Math/MathML" display="block"> | |
40 | // <mrow> | |
41 | // <msup> | |
42 | // <mi>e</mi> | |
43 | // <mrow> | |
44 | // <mo>−</mo> | |
45 | // <mfrac> | |
46 | // <mn>1</mn> | |
47 | // <mn>2</mn> | |
48 | // </mfrac> | |
49 | // <msup> | |
50 | // <mrow> | |
51 | // <mrow> | |
52 | // <mo>(</mo> | |
53 | // <mrow> | |
54 | // <mfrac> | |
55 | // <mrow> | |
56 | // <mi>x</mi> | |
57 | // <mo>−</mo> | |
58 | // <mi>μ</mi> | |
59 | // </mrow> | |
60 | // <mi>σ</mi> | |
61 | // </mfrac> | |
62 | // </mrow> | |
63 | // <mo>)</mo> | |
64 | // </mrow> | |
65 | // </mrow> | |
66 | // <mn>2</mn> | |
67 | // </msup> | |
68 | // </mrow> | |
69 | // </msup> | |
70 | // </mrow> | |
71 | // </math> | |
72 | // "#; | |
73 | // let expr = "<math display='inline' xmlns='http://www.w3.org/1998/Math/MathML'> | |
74 | // <msup intent='power($base(2, $base),silly($exp,-1.))'> | |
75 | // <mi arg='base'>x</mi> | |
76 | // <mi arg='exp'>n</mi> | |
77 | // </msup> | |
78 | // </math> | |
79 | // "; | |
80 | // let expr = "<mrow intent='pre@prefix(in@infix($a, x))(post@postfix($b))'> | |
81 | // <mi arg='a'>A</mi> | |
82 | // <mover> | |
83 | // <mo intent='map'>⟶</mo> | |
84 | // <mo intent='congruence'>≅</mo> | |
85 | // </mover> | |
86 | // <mi arg='b'>B</mi> | |
87 | // </mrow>"; | |
88 | // let expr = "<math><mi>Na</mi><mi>S</mi><mo>(</mo><mi>l</mi><mo>)</mo></math>"; | |
89 | ||
90 | ||
91 | // let expr = "<math xmlns='http://www.w3.org/1998/Math/MathML' display='block'> | |
92 | // <mrow> | |
93 | // <mo stretchy='false'>[</mo> | |
94 | // <mrow> | |
95 | // <mi>Co</mi> | |
96 | // </mrow> | |
97 | // <mo stretchy='false'>(</mo> | |
98 | // <mrow> | |
99 | // <mi>NH</mi> | |
100 | // </mrow> | |
101 | // <msub> | |
102 | // <mrow> | |
103 | // <mrow> | |
104 | // <mpadded width='0'> | |
105 | // <mphantom> | |
106 | // <mi>A</mi> | |
107 | // </mphantom> | |
108 | // </mpadded> | |
109 | // </mrow> | |
110 | // </mrow> | |
111 | // <mrow> | |
112 | // <mrow> | |
113 | // <mpadded height='0'> | |
114 | // <mn>3</mn> | |
115 | // </mpadded> | |
116 | // </mrow> | |
117 | // </mrow> | |
118 | // </msub> | |
119 | // <mo stretchy='false'>)</mo> | |
120 | // <msub> | |
121 | // <mrow> | |
122 | // <mrow> | |
123 | // <mpadded width='0'> | |
124 | // <mphantom> | |
125 | // <mi>A</mi> | |
126 | // </mphantom> | |
127 | // </mpadded> | |
128 | // </mrow> | |
129 | // </mrow> | |
130 | // <mrow> | |
131 | // <mrow> | |
132 | // <mpadded height='0'> | |
133 | // <mn>6</mn> | |
134 | // </mpadded> | |
135 | // </mrow> | |
136 | // </mrow> | |
137 | // </msub> | |
138 | // <mo stretchy='false'>]</mo> | |
139 | // <msup> | |
140 | // <mrow> | |
141 | // <mrow> | |
142 | // <mpadded width='0'> | |
143 | // <mphantom> | |
144 | // <mi>A</mi> | |
145 | // </mphantom> | |
146 | // </mpadded> | |
147 | // </mrow> | |
148 | // </mrow> | |
149 | // <mrow> | |
150 | // <mn>3</mn> | |
151 | // <mo>+</mo> | |
152 | // </mrow> | |
153 | // </msup> | |
154 | // <mtext> </mtext> | |
155 | // <mo stretchy='false'>(</mo> | |
156 | // <mrow> | |
157 | // <mi>Cl</mi> | |
158 | // </mrow> | |
159 | // <msub> | |
160 | // <mrow> | |
161 | // <mrow> | |
162 | // <mpadded width='0'> | |
163 | // <mphantom> | |
164 | // <mi>A</mi> | |
165 | // </mphantom> | |
166 | // </mpadded> | |
167 | // </mrow> | |
168 | // </mrow> | |
169 | // <mrow> | |
170 | // <mrow> | |
171 | // <mpadded height='0'> | |
172 | // <mn>3</mn> | |
173 | // </mpadded> | |
174 | // </mrow> | |
175 | // </mrow> | |
176 | // </msub> | |
177 | // <mo stretchy='false'>)</mo> | |
178 | // <msup> | |
179 | // <mrow> | |
180 | // <mrow> | |
181 | // <mpadded width='0'> | |
182 | // <mphantom> | |
183 | // <mi>A</mi> | |
184 | // </mphantom> | |
185 | // </mpadded> | |
186 | // </mrow> | |
187 | // </mrow> | |
188 | // <mrow> | |
189 | // <mo>−</mo>, | |
190 | // </mrow> | |
191 | // </msup> | |
192 | // </mrow> | |
193 | // </math>"; | |
194 | ||
195 | 0 | let expr = r#" |
196 | 0 | <math> |
197 | 0 | <msub><mi mathvariant="normal">N</mi><mn>2</mn></msub> |
198 | 0 | <munderover><mo>→</mo><mtext>Haber process</mtext><msub><mi mathvariant="normal">H</mi><mn>2</mn></msub></munderover> |
199 | 0 | <mi mathvariant="normal">N</mi> |
200 | 0 | <msub><mi mathvariant="normal">H</mi><mn>3</mn></msub> |
201 | 0 | </math> |
202 | 0 | "#; |
203 | // let instant = Instant::now(); | |
204 | ||
205 | // let rules_dir = "".to_string(); // Use MathCATRulesDir, potentially pointing to a zipped version | |
206 | 0 | if let Err(e) = set_rules_dir(get_rules_dir()) { |
207 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1); |
208 | 0 | } |
209 | 0 | debug!("Languages: {}", libmathcat::interface::get_supported_languages().unwrap_or_default().join(", ")); |
210 | ||
211 | #[cfg(feature = "include-zip")] | |
212 | info!("***********include-zip is present**********"); | |
213 | 0 | info!("Version = '{}' using Rules dir {}", get_version(), get_rules_dir()); |
214 | 0 | set_preference("Language", "en").unwrap(); |
215 | 0 | set_preference("DecimalSeparator", "Auto").unwrap(); |
216 | 0 | set_preference("BrailleCode", "UEB").unwrap(); |
217 | 0 | set_preference("BrailleNavHighlight", "On").unwrap(); |
218 | 0 | set_preference("TTS", "None").unwrap(); |
219 | 0 | set_preference("Verbosity", "Verbose").unwrap(); |
220 | 0 | set_preference("NavVerbosity", "Verbose").unwrap(); |
221 | 0 | set_preference("NavMode", "Enhanced").unwrap(); |
222 | 0 | set_preference("Impairment", "Blindness").unwrap(); |
223 | 0 | set_preference("SpeechOverrides_CapitalLetters", "").unwrap(); |
224 | 0 | set_preference("MathRate", "80").unwrap(); |
225 | // set_preference("CapitalLetters_UseWord", "true").unwrap(); | |
226 | // set_preference("CapitalLetters_Pitch", "30").unwrap(); | |
227 | 0 | set_preference("CapitalLetters_Beep", "true").unwrap(); |
228 | 0 | set_preference("IntentErrorRecovery", "Error").unwrap(); |
229 | // set_preference("MathRate", "77").unwrap(); | |
230 | ||
231 | 0 | set_preference("Bookmark", "false").unwrap(); |
232 | 0 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
233 | 0 | info!("Languages: {}", libmathcat::interface::get_supported_languages().unwrap_or_default().join(", ")); |
234 | 0 | info!("Speech styles: {}", libmathcat::interface::get_supported_speech_styles("ClearSpeak").unwrap_or_default().join(", ")); |
235 | 0 | info!("BrailleCodes: {}", libmathcat::interface::get_supported_braille_codes().unwrap_or_default().join(", ")); |
236 | // set_preference("DecimalSeparators", ",").unwrap(); | |
237 | // set_preference("BlockSeparators", ". ").unwrap(); | |
238 | 0 | if let Err(e) = set_mathml(expr) { |
239 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1); |
240 | 0 | }; |
241 | ||
242 | // match do_navigate_command("ZoomIn".to_string()) { | |
243 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
244 | // Ok(speech) => info!("\nZoomIn speech: '{speech}'"), | |
245 | // } | |
246 | // match do_navigate_command("ToggleZoomLockUp".to_string()) { | |
247 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
248 | // Ok(speech) => info!("ToggleZoomLockUp speech: '{speech}'"), | |
249 | // } | |
250 | // match do_navigate_command("MovePrevious".to_string()) { | |
251 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
252 | // Ok(speech) => info!("MovePrevious speech: '{speech}'"), | |
253 | // } | |
254 | // match do_navigate_command("MovePrevious".to_string()) { | |
255 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
256 | // Ok(speech) => info!("MovePrevious speech: '{}'", speech), | |
257 | // } | |
258 | // match do_navigate_command("MovePrevious".to_string()) { | |
259 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
260 | // Ok(speech) => info!("MovePrevious speech: '{}'", speech), | |
261 | // } | |
262 | // match do_navigate_command("MoveNext".to_string()) { | |
263 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
264 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
265 | // } | |
266 | // match do_navigate_command("MoveNext".to_string()) { | |
267 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
268 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
269 | // } | |
270 | // match do_navigate_command("MoveNext".to_string()) { | |
271 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
272 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
273 | // } | |
274 | // match do_navigate_command("MoveNext".to_string()) { | |
275 | // Err(e) => eprintln!("Error: exiting -- {}", errors_to_string(&e)); exit(1);, | |
276 | // Ok(speech) => info!("MoveNext speech: '{}'", speech), | |
277 | // } | |
278 | // match get_spoken_text() { | |
279 | // Ok(speech) => info!("Computed speech string:\n '{speech}'"), | |
280 | // Err(e) => eprintln!("{}", errors_to_string(&e)); exit(1);, | |
281 | // } | |
282 | 0 | debug!("Speech language is {}", get_preference("Language").unwrap()); |
283 | 0 | debug!("DecimalSeparator: {:?}", get_preference("DecimalSeparator").unwrap()); |
284 | 0 | debug!("DecimalSeparators: {:?}, BlockSeparators: {:?}", get_preference("DecimalSeparators").unwrap(), get_preference("BlockSeparators").unwrap()); |
285 | 0 | debug!("SpeechStyle: {:?}", get_preference("SpeechStyle").unwrap()); |
286 | 0 | debug!("Verbosity: {:?}", get_preference("Verbosity").unwrap()); |
287 | ||
288 | // info!("Time taken for loading+speech+braille: {}ms", instant.elapsed().as_millis()); | |
289 | // let instant = Instant::now(); | |
290 | 0 | match get_spoken_text() { |
291 | 0 | Ok(speech) => info!("Computed speech string:\n '{}'", speech), |
292 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
293 | } | |
294 | // info!("Time taken (second time for speech): {}ms", instant.elapsed().as_millis()); | |
295 | // info!("SpeechStyle: {:?}", get_preference("SpeechStyle")); | |
296 | ||
297 | 0 | match get_braille("") { |
298 | 0 | Ok(braille) => info!("Computed braille string:\n '{braille}'"), |
299 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
300 | } | |
301 | 0 | debug!("...using BrailleCode: {:?}", get_preference("BrailleCode").unwrap()); |
302 | // let xpath_counts = libmathcat::speech::xpath_count(); | |
303 | // info!("#xpath = {}; duplicates = {}", xpath_counts.0, xpath_counts.1); | |
304 | // info!("Time taken (second time for speech + braille): {}ms", instant.elapsed().as_millis()); | |
305 | // debug!("Hashmap sizes:\n{}", libmathcat::speech::SpeechRules::print_sizes()); | |
306 | 0 | timing_test(expr, 000); |
307 | ||
308 | 0 | } |
309 | ||
310 | 0 | fn timing_test(expr: &str, n_loops: usize) { |
311 | 0 | if n_loops == 0 { |
312 | 0 | return; |
313 | 0 | } |
314 | ||
315 | 0 | let n_loops_float = n_loops as f64; |
316 | 0 | let instant = Instant::now(); |
317 | 0 | for _ in 0..n_loops { |
318 | 0 | if let Err(e) = set_mathml(expr) { |
319 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
320 | 0 | }; |
321 | 0 | match get_spoken_text() { |
322 | 0 | Ok(_) =>( ), |
323 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
324 | } | |
325 | 0 | match get_braille("") { |
326 | 0 | Ok(_) => (), |
327 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
328 | } | |
329 | } | |
330 | 0 | info!("Time taken (time for set, speech, {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
331 | ||
332 | 0 | let instant = Instant::now(); |
333 | 0 | for _ in 0..n_loops { |
334 | 0 | if let Err(e) = set_mathml(expr) { |
335 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
336 | 0 | }; |
337 | } | |
338 | 0 | info!("Time taken (time for set averaged over {} loops): {}ms", n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
339 | ||
340 | 0 | let instant = Instant::now(); |
341 | 0 | for _ in 0..n_loops { |
342 | 0 | match get_spoken_text() { |
343 | 0 | Ok(_) =>( ), |
344 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
345 | } | |
346 | } | |
347 | 0 | info!("Time taken (time for get_spoken_text() averaged over {} loops): {}ms", n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
348 | ||
349 | 0 | set_preference("BrailleCode", "UEB").unwrap(); |
350 | 0 | get_braille("").unwrap(); |
351 | 0 | let instant = Instant::now(); |
352 | 0 | for _ in 0..n_loops { |
353 | 0 | match get_braille("") { |
354 | 0 | Ok(_) => (), |
355 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);}, |
356 | } | |
357 | } | |
358 | 0 | info!("Time taken (time for {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
359 | ||
360 | 0 | if let Err(e) = set_mathml(expr) { |
361 | 0 | eprintln!("Error: exiting -- {}", errors_to_string(&e)); |
362 | 0 | }; |
363 | 0 | set_preference("BrailleCode", "Nemeth").unwrap(); |
364 | 0 | get_braille("").unwrap(); |
365 | 0 | let instant = Instant::now(); |
366 | 0 | for _ in 0..n_loops { |
367 | 0 | match get_braille("") { |
368 | 0 | Ok(_) => (), |
369 | 0 | Err(e) => {eprintln!("{}", errors_to_string(&e)); exit(1);} , |
370 | } | |
371 | } | |
372 | 0 | info!("Time taken (time for {} braille averaged over {} loops): {}ms", get_preference("BrailleCode").unwrap(), n_loops, instant.elapsed().as_millis() as f64/n_loops_float); |
373 | 0 | } |
Line | Count | Source |
1 | //! Navigation is controlled by a `Navigation_Rules.yaml` file in conjunction with preferences. | |
2 | //! See preference documentation for more info on navigation preferences. | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use std::cell::{Ref, RefCell, RefMut}; | |
6 | use sxd_xpath::context::Evaluation; | |
7 | use sxd_xpath::Value; | |
8 | use sxd_document::dom::Element; | |
9 | use sxd_document::Package; | |
10 | ||
11 | use std::fmt; | |
12 | use crate::canonicalize::{name, get_parent}; | |
13 | use crate::pretty_print::mml_to_string; | |
14 | use crate::speech::{NAVIGATION_RULES, CONCAT_INDICATOR, CONCAT_STRING, SpeechRules, SpeechRulesWithContext}; | |
15 | use crate::infer_intent::add_fixity_children; | |
16 | use crate::interface::copy_mathml; | |
17 | #[cfg(not(target_family = "wasm"))] | |
18 | use std::time::Instant; | |
19 | use crate::errors::*; | |
20 | use phf::phf_set; | |
21 | use log::{debug}; | |
22 | ||
23 | pub const ID_OFFSET: &str = "data-id-offset"; | |
24 | ||
25 | const MAX_PLACE_MARKERS: usize = 10; | |
26 | ||
27 | thread_local!{ | |
28 | /// The current set of navigation rules | |
29 | pub static NAVIGATION_STATE: RefCell<NavigationState> = | |
30 | RefCell::new( NavigationState::new() ); | |
31 | } | |
32 | ||
33 | pub static NAV_COMMANDS: phf::Set<&str> = phf_set! { | |
34 | "MovePrevious", "MoveNext", "MoveStart", "MoveEnd", "MoveLineStart", "MoveLineEnd", | |
35 | "MoveCellPrevious", "MoveCellNext", "MoveCellUp", "MoveCellDown", "MoveColumnStart", "MoveColumnEnd", | |
36 | "ZoomIn", "ZoomOut", "ZoomOutAll", "ZoomInAll", | |
37 | "MoveLastLocation", | |
38 | "ReadPrevious", "ReadNext", "ReadCurrent", "ReadCellCurrent", "ReadStart", "ReadEnd", "ReadLineStart", "ReadLineEnd", | |
39 | "DescribePrevious", "DescribeNext", "DescribeCurrent", | |
40 | "WhereAmI", "WhereAmIAll", | |
41 | "ToggleZoomLockUp", "ToggleZoomLockDown", "ToggleSpeakMode", | |
42 | "Exit", | |
43 | "MoveTo0","MoveTo1","MoveTo2","MoveTo3","MoveTo4","MoveTo5","MoveTo6","MoveTo7","MoveTo8","MoveTo9", | |
44 | "Read0","Read1","Read2","Read3","Read4","Read5","Read6","Read7","Read8","Read9", | |
45 | "Describe0","Describe1","Describe2","Describe3","Describe4","Describe5","Describe6","Describe7","Describe8","Describe9", | |
46 | "SetPlacemarker0","SetPlacemarker1","SetPlacemarker2","SetPlacemarker3","SetPlacemarker4","SetPlacemarker5","SetPlacemarker6","SetPlacemarker7","SetPlacemarker8","SetPlacemarker9", | |
47 | }; | |
48 | ||
49 | #[derive(Clone, PartialEq, Debug)] | |
50 | struct NavigationPosition { | |
51 | current_node: String, // id of current node | |
52 | current_node_offset: usize, // for leaves, char offset in leaf (default = 0), otherwise id for artificial intent node | |
53 | } | |
54 | ||
55 | impl fmt::Display for NavigationPosition { | |
56 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
57 | 0 | return write!(f, "{}[+{}]", self.current_node, self.current_node_offset); |
58 | 0 | } |
59 | } | |
60 | ||
61 | const ILLEGAL_NODE_ID: &str = "!not set"; // an illegal 'id' value | |
62 | impl Default for NavigationPosition { | |
63 | 48.6k | fn default() -> Self { |
64 | 48.6k | NavigationPosition { |
65 | 48.6k | current_node: ILLEGAL_NODE_ID.to_string(), |
66 | 48.6k | current_node_offset: 0 |
67 | 48.6k | } |
68 | 48.6k | } |
69 | } | |
70 | ||
71 | ||
72 | #[derive(Debug, Clone)] | |
73 | pub struct NavigationState { | |
74 | // it might be better to use a linked for the stacks, with the first node being the top | |
75 | // these two stacks should be kept in sync. | |
76 | position_stack: Vec<NavigationPosition>, // all positions, so we can go back to them | |
77 | command_stack: Vec<&'static str>, // all commands, so we can undo them | |
78 | place_markers: [NavigationPosition; MAX_PLACE_MARKERS], | |
79 | where_am_i: NavigationPosition, // current 'where am i' location | |
80 | ||
81 | #[cfg(target_family = "wasm")] | |
82 | where_am_i_start_time: usize, // FIX: for web | |
83 | #[cfg(not(target_family = "wasm"))] | |
84 | where_am_i_start_time: Instant, | |
85 | mode: String, // one of "Character", "Simple", or "Enhanced" | |
86 | speak_overview: bool, // true => describe after move; false => (standard) speech rules | |
87 | } | |
88 | ||
89 | impl fmt::Display for NavigationState { | |
90 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
91 | 0 | writeln!(f, "NavigationState{{")?; |
92 | 0 | write!(f, " Position Stack: ")?; |
93 | 0 | for (i, nav_state) in self.position_stack.iter().enumerate() { |
94 | 0 | write!(f, "{}{}", if i==0 {""} else {", "}, nav_state)?; |
95 | } | |
96 | 0 | writeln!(f)?; |
97 | 0 | write!(f, " Command Stack: ")?; |
98 | 0 | for (i, nav_state) in self.command_stack.iter().enumerate() { |
99 | 0 | write!(f, "{}{}", if i==0 {""} else {", "}, *nav_state)?; |
100 | } | |
101 | 0 | writeln!(f)?; |
102 | 0 | writeln!(f, " where_am_i: {}, start_time: {:?}", self.where_am_i, self.where_am_i_start_time)?; |
103 | 0 | writeln!(f, " mode: {}, speak_overview: {}", self.mode, self.speak_overview)?; |
104 | 0 | writeln!(f, "}}")?; |
105 | 0 | return Ok( () ); |
106 | 0 | } |
107 | } | |
108 | ||
109 | impl NavigationState { | |
110 | 3.92k | fn new() -> NavigationState { |
111 | 3.92k | return NavigationState { |
112 | 3.92k | position_stack: Vec::with_capacity(1024), |
113 | 3.92k | command_stack: Vec::with_capacity(1024), |
114 | 3.92k | place_markers: Default::default(), |
115 | 3.92k | where_am_i: NavigationPosition::default(), |
116 | 3.92k | // FIX: figure this out for the web |
117 | 3.92k | #[cfg(target_family = "wasm")] |
118 | 3.92k | where_am_i_start_time: 0, // FIX: for web |
119 | 3.92k | #[cfg(not(target_family = "wasm"))] |
120 | 3.92k | where_am_i_start_time: Instant::now(), // need to give it some value, and "default()" isn't an option |
121 | 3.92k | mode: "".to_string(), // set latter when we have some context |
122 | 3.92k | speak_overview: false, // set latter when we have some context |
123 | 3.92k | }; |
124 | 3.92k | } |
125 | ||
126 | 4.88k | pub fn reset(&mut self) { |
127 | 4.88k | self.position_stack.clear(); |
128 | 4.88k | self.command_stack.clear(); |
129 | 4.88k | self.where_am_i = NavigationPosition::default(); |
130 | 4.88k | self.reset_start_time() |
131 | 4.88k | } |
132 | ||
133 | ||
134 | // defining reset_start_time because of the following message if done inline | |
135 | // attributes on expressions are experimental | |
136 | // see issue #15701 <https://github.com/rust-lang/rust/issues/15701> for more information | |
137 | #[cfg(target_family = "wasm")] | |
138 | fn reset_start_time(&mut self) { | |
139 | self.where_am_i_start_time = 0; | |
140 | } | |
141 | ||
142 | #[cfg(not(target_family = "wasm"))] | |
143 | 4.88k | fn reset_start_time(&mut self) { |
144 | 4.88k | self.where_am_i_start_time = Instant::now(); // need to give it some value, and "default()" isn't an option |
145 | 4.88k | } |
146 | ||
147 | ||
148 | 563 | fn push(&mut self, position: NavigationPosition, command: &'static str) { |
149 | 563 | self.position_stack.push(position); |
150 | 563 | self.command_stack.push(command); |
151 | 563 | } |
152 | ||
153 | 46 | fn pop(&mut self) -> Option<(NavigationPosition, &'static str)> { |
154 | 46 | assert_eq!(self.position_stack.len(), self.command_stack.len()); |
155 | 46 | if self.position_stack.is_empty() { |
156 | 0 | return None; |
157 | } else { | |
158 | 46 | return Some( (self.position_stack.pop().unwrap(), self.command_stack.pop().unwrap()) ); |
159 | } | |
160 | 46 | } |
161 | ||
162 | 2.75k | fn top(&self) -> Option<(&NavigationPosition, &'static str)> { |
163 | 2.75k | if self.position_stack.is_empty() { |
164 | 0 | return None; |
165 | 2.75k | } |
166 | 2.75k | let last = self.position_stack.len()-1; |
167 | 2.75k | return Some( (&self.position_stack[last], self.command_stack[last]) ); |
168 | 2.75k | } |
169 | ||
170 | 0 | pub fn get_navigation_mathml<'a>(&self, mathml: Element<'a>) -> Result<(Element<'a>, usize)> { |
171 | 0 | if self.position_stack.is_empty() { |
172 | 0 | return Ok( (mathml, 0) ); |
173 | } else { | |
174 | 0 | let (position, _) = self.top().unwrap(); |
175 | 0 | return match get_node_by_id(mathml, position) { |
176 | 0 | None => bail!("internal error: id '{}' was not found in mathml:\n{}", |
177 | 0 | position.current_node, mml_to_string(mathml)), |
178 | 0 | Some(found) => Ok( (found, position.current_node_offset) ) |
179 | }; | |
180 | } | |
181 | 0 | } |
182 | ||
183 | 1.09k | pub fn get_navigation_mathml_id(&self, mathml: Element) -> (String, usize) { |
184 | 1.09k | if self.position_stack.is_empty() { |
185 | 47 | return (mathml.attribute_value("id").unwrap().to_string(), 0); |
186 | } else { | |
187 | 1.05k | let (position, _) = self.top().unwrap(); |
188 | 1.05k | return (position.current_node.clone(), position.current_node_offset); |
189 | } | |
190 | 1.09k | } |
191 | ||
192 | 549 | fn init_navigation_context(&self, context: &mut sxd_xpath::Context, command: &'static str, |
193 | 549 | nav_state_top: Option<(&NavigationPosition, &'static str)>) { |
194 | 549 | context.set_variable("NavCommand", command); |
195 | ||
196 | 549 | if command == "WhereAmI" && |
197 | 0 | context.set_variable("NavNode", self.where_am_i.current_node.as_str()); |
198 | 0 | context.set_variable("NavNodeOffset", self.where_am_i.current_node_offset as f64); |
199 | 549 | } else { |
200 | 549 | let position = &self.position_stack[self.position_stack.len()-1]; |
201 | 549 | context.set_variable("NavNode", position.current_node.as_str()); |
202 | 549 | context.set_variable("NavNodeOffset", position.current_node_offset as f64); |
203 | 549 | } |
204 | ||
205 | // get the index from command (e.g., '3' in 'SetPlacemarker3 or MoveTo3' and set 'PlaceMarker' to it's position) | |
206 | 549 | if command.ends_with(|ch: char| ch.is_ascii_digit()) { |
207 | 6 | let index = convert_last_char_to_number(command); |
208 | 6 | let position = &self.place_markers[index]; |
209 | 6 | context.set_variable("PlaceMarkerIndex", index as f64); |
210 | 6 | context.set_variable("PlaceMarker", position.current_node.as_str()); |
211 | 6 | context.set_variable("PlaceMarkerOffset", position.current_node_offset as f64); |
212 | 543 | } |
213 | ||
214 | 549 | context.set_variable("Overview", self.speak_overview); |
215 | 549 | context.set_variable("ReadZoomLevel", (if self.mode == "Enhanced" { |
216 | 549 | context.set_variable("MatchCounter", 0 as f64); |
217 | ||
218 | 549 | if command == "MoveLastLocation" { |
219 | 3 | let previous_command = match nav_state_top { |
220 | 0 | None => "None", |
221 | 3 | Some( (_, previous_command) ) => previous_command, |
222 | }; | |
223 | 3 | context.set_variable("PreviousNavCommand", previous_command); |
224 | 546 | } |
225 | ||
226 | // used by nav rules for speech -- needs an initial value so tests don't fail | |
227 | 549 | context.set_variable("SayCommand", "" ); |
228 | 549 | context.set_variable("Move2D", "" ); |
229 | 549 | context.set_variable("SpeakExpression", true ); // default is to speak the expr after navigation |
230 | 549 | return; |
231 | ||
232 | 6 | fn convert_last_char_to_number(str: &str) -> usize { |
233 | 6 | let last_char = str.as_bytes()[str.len()-1]; |
234 | 6 | assert!( last_char.is_ascii_digit() ); |
235 | 6 | return (last_char - b'0') as usize; |
236 | 6 | } |
237 | 549 | } |
238 | } | |
239 | ||
240 | // convert the last digit of a Placemarker command to an integer | |
241 | 2 | fn convert_last_char_to_number(str: &str) -> usize { |
242 | 2 | let last_char = str.as_bytes()[str.len()-1]; |
243 | 2 | assert!( last_char.is_ascii_digit() ); |
244 | 2 | return (last_char - b'0') as usize; |
245 | 2 | } |
246 | ||
247 | /// Get the node associated with a `NavigationPosition`. | |
248 | /// This can be called on an intent tree | |
249 | 9.18k | fn get_node_by_id<'a>(mathml: Element<'a>, pos: &NavigationPosition) -> Option<Element<'a>> { |
250 | 9.18k | if let Some( |
251 | 9.17k | mathml_id == pos.current_node.as_str() && |
252 | 1.46k | (crate::xpath_functions::is_leaf(mathml) || |
253 | 537 | mathml.attribute_value(ID_OFFSET).unwrap_or("0") == pos.current_node_offset.to_string()) { |
254 | 1.46k | return Some(mathml); |
255 | 7.71k | } |
256 | ||
257 | 10.0k | for child in |
258 | 10.0k | if let Some( |
259 | 7.71k | let Some( |
260 | 4.41k | return Some(found); |
261 | 5.60k | } |
262 | } | |
263 | 3.29k | return None; |
264 | 9.18k | } |
265 | ||
266 | /// Search the mathml for the id and set the navigation node to that id | |
267 | /// Resets the navigation stack | |
268 | 2 | pub fn set_navigation_node_from_id(mathml: Element, id: &str, offset: usize) -> Result<()> { |
269 | 2 | let current_node = id.to_string(); |
270 | 2 | let pos = NavigationPosition { current_node: current_node.clone(), current_node_offset: offset }; |
271 | 2 | let node = get_node_by_id(mathml, &pos); |
272 | 2 | if node.is_some() { |
273 | 2 | return NAVIGATION_STATE.with(|nav_state| { |
274 | 2 | let mut nav_state = nav_state.borrow_mut(); |
275 | 2 | nav_state.reset(); |
276 | 2 | nav_state.push(NavigationPosition{ |
277 | 2 | current_node, |
278 | 2 | current_node_offset: offset |
279 | 2 | }, "None"); |
280 | 2 | return Ok( () ); |
281 | 2 | }) |
282 | } else { | |
283 | 0 | bail!("Id {} not found in MathML {}", id, mml_to_string(mathml)); |
284 | } | |
285 | 2 | } |
286 | ||
287 | /// Get's the Nav Node from the context, with some exceptions such as Toggle commands where it isn't set. | |
288 | /// Note: mathml can be any node. It isn't really used but some Element needs to be part of Evaluate(). | |
289 | 571 | pub fn get_nav_node<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>, start_node: Element<'c>, command: &str, nav_mode: &str) -> Result<String> { |
290 | 571 | let start_id = start_node.attribute_value("id").unwrap_or_default(); |
291 | 571 | if command.starts_with("Toggle") { |
292 | 1 | return Ok( start_id.to_string() ); |
293 | } else { | |
294 | 570 | return context_get_variable(context, var_name, mathml) |
295 | 570 | .with_context(|| |
296 | 0 | command, start_node.attribute_value("id").unwrap_or_default(), nav_mode)); |
297 | } | |
298 | 571 | } |
299 | ||
300 | // FIX: think of a better place to put this, and maybe a better interface | |
301 | /// Note: mathml can be any node. It isn't really used but some Element needs to be part of Evaluate(). | |
302 | /// If the context variable has String, Number, or Boolean xpath value, return it as a string. Otherwise it is an error | |
303 | 4.55k | pub fn context_get_variable<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>) -> Result<String> { |
304 | // This is slightly roundabout because Context doesn't expose a way to get the values. | |
305 | // Instead, we create an "Evaluation", which is just one level of indirection. | |
306 | use sxd_xpath::nodeset::Node; | |
307 | 4.55k | let evaluation = Evaluation::new(context, Node::Element(mathml)); |
308 | 4.55k | return match evaluation.value_of(var_name.into()) { |
309 | 4.55k | Some(value) => match value { |
310 | 1.74k | Value::String(s) => Ok(s.clone()), |
311 | 1.20k | Value::Number(f) => Ok(f.to_string()), |
312 | 1.09k | Value::Boolean(b) => Ok(format!("{b}")), // "true" or "false" |
313 | 509 | Value::Nodeset(nodes) => { |
314 | 509 | if nodes.size() == 1 && |
315 | 509 | let Some(attr) = nodes.document_order_first().unwrap().attribute() { |
316 | 509 | return Ok(attr.value().to_string()); |
317 | 0 | }; |
318 | 0 | let mut error_message = format!("Variable '{var_name}' set somewhere in navigate.yaml is nodeset and not an attribute: "); |
319 | 0 | if nodes.size() == 0 { |
320 | 0 | error_message += &format!("0 nodes (false) -- {} set to non-existent node in\n{}", |
321 | 0 | var_name, mml_to_string(mathml)); |
322 | 0 | } else { |
323 | 0 | let singular = nodes.size()==1; |
324 | 0 | error_message += &format!("{} node{}. {}:", |
325 | 0 | nodes.size(), |
326 | 0 | if singular {""} else {"s"}, |
327 | 0 | if singular {"Node is"} else {"Nodes are"}); |
328 | 0 | nodes.document_order() |
329 | 0 | .iter() |
330 | 0 | .enumerate() |
331 | 0 | .for_each(|(i, node)| { |
332 | 0 | match node { |
333 | 0 | sxd_xpath::nodeset::Node::Element(mathml) => |
334 | 0 | error_message += &format!("#{}:\n{}",i, mml_to_string(*mathml)), |
335 | 0 | _ => error_message += &format!("'{node:?}'"), |
336 | } | |
337 | 0 | }) |
338 | }; | |
339 | 0 | bail!(error_message); |
340 | }, | |
341 | }, | |
342 | 0 | None => bail!("Could not find value for navigation variable '{}'", var_name), |
343 | } | |
344 | 4.55k | } |
345 | ||
346 | /// Wrapper around context_get_variable to get an integer variable | |
347 | 1.70k | fn context_get_int_variable<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>) -> Result<usize> { |
348 | 1.70k | let value = context_get_variable(context, var_name, mathml) |
349 | 1.70k | return match value.parse::<usize>() { |
350 | 1.70k | Ok(i) => Ok(i), |
351 | 0 | Err(e) => bail!("Could not parse navigation variable '{}' with value '{}' as integer: {}", var_name, value, e), |
352 | } | |
353 | 1.70k | } |
354 | ||
355 | /// Given a key code along with the modifier keys, the current node is moved accordingly (or value reported in some cases).] | |
356 | /// The spoken text for the new current node is returned. | |
357 | 0 | pub fn do_mathml_navigate_key_press(mathml: Element, |
358 | 0 | key: usize, shift_key: bool, control_key: bool, alt_key: bool, meta_key: bool) -> Result<String> { |
359 | 0 | let (command, param) = key_press_to_command_and_param(key, shift_key, control_key, alt_key, meta_key)?; |
360 | 0 | return do_navigate_command_and_param(mathml, command, param); |
361 | 0 | } |
362 | ||
363 | 2 | fn do_navigate_command_and_param(mathml: Element, command: NavigationCommand, param: NavigationParam) -> Result<String> { |
364 | 2 | return do_navigate_command_string(mathml, navigation_command_string(command, param)); |
365 | 2 | } |
366 | ||
367 | 549 | pub fn do_navigate_command_string(mathml: Element, nav_command: &'static str) -> Result<String> { |
368 | // first check to see if nav file has been changed -- don't bother checking in loop below | |
369 | 549 | NAVIGATION_RULES.with(|rules| { |
370 | 549 | rules.borrow_mut().read_files() |
371 | 549 | }) |
372 | ||
373 | 549 | if mathml.children().is_empty() { |
374 | 0 | bail!("MathML has not been set -- can't navigate"); |
375 | 549 | }; |
376 | ||
377 | 549 | return NAVIGATION_STATE.with(|nav_state| { |
378 | 549 | let mut nav_state = nav_state.borrow_mut(); |
379 | // debug!("MathML: {}", mml_to_string(mathml)); | |
380 | 549 | if nav_state.position_stack.is_empty() { |
381 | // initialize to root node | |
382 | 47 | nav_state.push(NavigationPosition{ |
383 | 47 | current_node: mathml.attribute_value("id").unwrap().to_string(), |
384 | 47 | current_node_offset: 0 |
385 | 47 | }, "None") |
386 | 502 | }; |
387 | ||
388 | 549 | return NAVIGATION_RULES.with(|rules| { |
389 | 549 | let rules = rules.borrow(); |
390 | 549 | let new_package = Package::new(); |
391 | 549 | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), "", 0); |
392 | ||
393 | 549 | nav_state.mode = rules.pref_manager.as_ref().borrow().pref_to_string("NavMode"); |
394 | 549 | nav_state.speak_overview = rules.pref_manager.as_ref().borrow().pref_to_string("Overview") == "true"; |
395 | ||
396 | 549 | nav_state.init_navigation_context(rules_with_context.get_context(), nav_command, nav_state.top()); |
397 | ||
398 | // start navigation off at the right node | |
399 | 549 | if nav_command == "MoveLastLocation" { |
400 | 3 | nav_state.pop(); |
401 | 546 | } |
402 | ||
403 | // If no speech happened for some calls, we try the call again (e.g, no speech for invisible times). | |
404 | // To prevent to infinite loop, we limit the number of tries | |
405 | const LOOP_LIMIT: usize = 3; | |
406 | 549 | let mut cumulative_speech = String::with_capacity(120); |
407 | 569 | for loop_count in |
408 | 569 | match apply_navigation_rules(mathml, nav_command, &rules, &mut rules_with_context, &mut nav_state, loop_count) { |
409 | 569 | Ok( (speech, done)) => { |
410 | 569 | cumulative_speech = cumulative_speech + if loop_count==0 { |
411 | 569 | if done { |
412 | 549 | let (tts, rate) = { |
413 | 549 | let prefs = rules.pref_manager.borrow(); |
414 | 549 | (prefs.pref_to_string("TTS"), prefs.pref_to_string("MathRate")) |
415 | 549 | }; |
416 | 549 | if rate != "100" { |
417 | 0 | match tts.as_str() { |
418 | 0 | "SSML" |
419 | 0 | if !cumulative_speech.starts_with("<prosody rate") => { |
420 | 0 | cumulative_speech = format!("<prosody rate='{}%'>{}</prosody>", &rate, &cumulative_speech); |
421 | 0 | } |
422 | 0 | "SAPI5" |
423 | 0 | if !cumulative_speech.starts_with("<rate speed") => { |
424 | 0 | cumulative_speech = format!( |
425 | 0 | "<rate speed='{:.1}'>{}</rate>", |
426 | 0 | 10.0 * (0.01 * rate.parse::<f32>().unwrap_or(100.0)).log(3.0), |
427 | 0 | cumulative_speech |
428 | 0 | ); |
429 | 0 | } |
430 | 0 | _ => (), // do nothing |
431 | } | |
432 | 549 | } |
433 | 549 | return Ok( rules.pref_manager.borrow().get_tts() |
434 | 549 | .merge_pauses(crate::speech::remove_optional_indicators( |
435 | 549 | &cumulative_speech.replace(CONCAT_STRING, "") |
436 | 549 | .replace(CONCAT_INDICATOR, "") |
437 | 549 | ) |
438 | 549 | .trim_start().trim_end_matches([' ', ',', ';'])) ); |
439 | 20 | } |
440 | }, | |
441 | 0 | Err(e) => { |
442 | 0 | return Err(e); |
443 | } | |
444 | } | |
445 | } | |
446 | 0 | bail!("Internal error: Navigation exceeded limit of number of times no speech generated |
447 | when attempting to {} in {} mode start at id={} in this MathML:\n{}.", | |
448 | 0 | nav_command, nav_state.mode, nav_state.top().unwrap().0.current_node, mml_to_string(mathml)); |
449 | 549 | }); |
450 | 549 | }); |
451 | ||
452 | 570 | fn get_start_node<'m>(mathml: Element<'m>, nav_state: &RefMut<NavigationState>) -> Result<Element<'m>> { |
453 | 570 | let element = match nav_state.top() { |
454 | None => { | |
455 | 0 | let nav_position = NavigationPosition { current_node: mathml.attribute_value("id").unwrap().to_string(), current_node_offset: 0 }; |
456 | 0 | get_node_by_id(mathml, &nav_position) |
457 | }, | |
458 | 570 | Some( (position, _) ) => get_node_by_id(mathml, position), |
459 | }; | |
460 | ||
461 | 570 | return match element { |
462 | 569 | Some(node) => Ok(node), |
463 | None => { | |
464 | 1 | bail!("Internal Error: didn't find id/offset '{:?}' while attempting to start navigation. MathML is\n{}", |
465 | 1 | nav_state.top().map(|t| t.0), mml_to_string(mathml)); |
466 | } | |
467 | }; | |
468 | 570 | } |
469 | ||
470 | ||
471 | ||
472 | 569 | fn apply_navigation_rules<'c, 'm:'c>(mathml: Element<'m>, nav_command: &'static str, |
473 | 569 | rules: &Ref<SpeechRules>, rules_with_context: &mut SpeechRulesWithContext<'c, '_, 'm>, nav_state: &mut RefMut<NavigationState>, |
474 | 569 | loop_count: usize) -> Result<(String, bool)> { |
475 | { | |
476 | 569 | let context = rules_with_context.get_context(); |
477 | 569 | context.set_variable("MatchCounter", loop_count as f64); |
478 | 569 | nav_state.mode = context_get_variable(context, "NavMode", mathml) |
479 | } | |
480 | ||
481 | 569 | let mut add_literal = nav_state.mode == "Character"; |
482 | 569 | let (intent, nav_intent) = if add_literal { |
483 | 206 | (mathml, mathml) |
484 | } else { | |
485 | 363 | let intent = crate::speech::intent_from_mathml(mathml, rules_with_context.get_document()) |
486 | 363 | (intent, add_fixity_children(copy_mathml(intent))) |
487 | }; | |
488 | ||
489 | 569 | let mut properties = ""; |
490 | 569 | if add_literal { |
491 | 206 | properties = mathml.attribute_value("data-intent-property").unwrap_or_default(); |
492 | 206 | if properties.contains(":literal:") { |
493 | 0 | add_literal = false; |
494 | 206 | } else { |
495 | 206 | mathml.set_attribute_value("data-intent-property", (":literal:".to_string() + properties).as_str()); |
496 | 206 | }; |
497 | 363 | } |
498 | // we should always find the start node. | |
499 | // however, if we were navigating by character, then switched the NavMode, the intent tree might not have that node in it | |
500 | 569 | let start_node = match get_start_node(nav_intent, nav_state) { |
501 | 568 | Ok(node) => node, |
502 | Err(_) => { | |
503 | // find the node in the other tree (probably mathml) and walk up to find a parent that has an id in both | |
504 | 1 | debug!("Could not find start_node in nav_intent -- trying other_tree"); |
505 | 1 | let other_tree = if nav_state.mode == "Character" { |
506 | 1 | let mut found_node = get_start_node(other_tree, nav_state) |
507 | 2 | while name(found_node) != "math" { |
508 | 2 | found_node = get_parent(found_node); |
509 | // debug!("found_node:\n{}", mml_to_string(found_node)); | |
510 | 2 | let temp_pos = NavigationPosition { |
511 | 2 | current_node: found_node.attribute_value("id").unwrap_or_default().to_string().clone(), |
512 | 2 | current_node_offset: found_node.attribute_value(ID_OFFSET).unwrap_or_default().parse::<usize>().unwrap_or_default(), |
513 | 2 | }; |
514 | 2 | if let Some( |
515 | 1 | found_node = intent_node; |
516 | 1 | break; |
517 | 1 | } |
518 | } | |
519 | 1 | found_node |
520 | } | |
521 | }; | |
522 | ||
523 | // debug!("intent=\n{}", mml_to_string(intent)); | |
524 | // debug!("nav intent=\n{}", mml_to_string(nav_intent)); | |
525 | // debug!("start_node id={}\n{}", nav_state.top().unwrap().0.current_node.as_str(), mml_to_string(start_node)); | |
526 | // if name(start_node) != "math" { | |
527 | // let mut parent= get_parent(start_node); | |
528 | // if name(parent) != "math" { | |
529 | // parent = get_parent(parent); | |
530 | // } | |
531 | // debug!("parent or grandparent of start_node:\n{}", mml_to_string(parent)); | |
532 | // } | |
533 | 569 | let offset = context_get_int_variable(rules_with_context.get_context(), "NavNodeOffset", intent) |
534 | 569 | rules_with_context.set_nav_node_offset(offset); |
535 | 569 | debug!("starting nav_position: {}, start node ={}", |
536 | ||
537 | 569 | let raw_speech_string = rules_with_context.match_pattern::<String>(start_node) |
538 | 569 | .context("Pattern match/replacement failure during math navigation!") |
539 | 569 | let speech = rules.pref_manager.borrow().get_tts() |
540 | 569 | .merge_pauses(crate::speech::remove_optional_indicators( |
541 | 569 | &raw_speech_string.replace(CONCAT_STRING, "") |
542 | 569 | .replace(CONCAT_INDICATOR, "") |
543 | 569 | ) |
544 | 569 | .trim()); |
545 | // debug!("Nav Speech: {}", speech); | |
546 | ||
547 | // FIX: add things that need to do a speech replacement based on some marker for "where am i" and others that loop ([Speak: id])??? | |
548 | // what else needs to be done/set??? | |
549 | ||
550 | // transfer some values that might have been set into the prefs | |
551 | 569 | let offset = context_get_int_variable(rules_with_context.get_context(), "NavNodeOffset", intent) |
552 | 569 | rules_with_context.set_nav_node_offset(offset); |
553 | 569 | let context = rules_with_context.get_context(); |
554 | 569 | nav_state.speak_overview = context_get_variable(context, "Overview", intent) |
555 | 569 | nav_state.mode = context_get_variable(context, "NavMode", intent) |
556 | 569 | rules.pref_manager.as_ref().borrow_mut().set_user_prefs("NavMode", &nav_state.mode) |
557 | ||
558 | 569 | debug!("context value of NavNodeOffset: {:?}", |
559 | 569 | let nav_position = NavigationPosition { |
560 | 569 | current_node: get_nav_node(context, "NavNode", intent, start_node, nav_command, &nav_state.mode) |
561 | 569 | current_node_offset: context_get_int_variable(context, "NavNodeOffset", intent) |
562 | }; | |
563 | ||
564 | // after a command, we either read or describe the new location (part of state) | |
565 | // also some commands are DescribeXXX/ReadXXX, so we need to look at the commands also | |
566 | 569 | let use_read_rules = if nav_command.starts_with("Read") { |
567 | 5 | true |
568 | 564 | } else if nav_command.starts_with("Describe") { |
569 | 3 | false |
570 | } else { | |
571 | 561 | !nav_state.speak_overview |
572 | }; | |
573 | ||
574 | 569 | debug!("after match nav_position: {}", nav_position); |
575 | // push the new location on the stack | |
576 | 569 | if nav_position != NavigationPosition::default() && &nav_position != nav_state.top().unwrap().0 { |
577 | 483 | nav_state.push(nav_position.clone(), nav_command); |
578 | 483 |
|
579 | ||
580 | 569 | if nav_command.starts_with("SetPlacemarker") { |
581 | 2 | let new_node_id = get_nav_node(context, "NavNode", intent, start_node, nav_command, &nav_state.mode) |
582 | 2 | nav_state.place_markers[convert_last_char_to_number(nav_command)] = NavigationPosition{ |
583 | 2 | current_node: new_node_id, |
584 | 2 | current_node_offset: context_get_int_variable(context, "NavNodeOffset", intent) |
585 | } | |
586 | 567 | } |
587 | ||
588 | 569 | let nav_mathml = get_node_by_id(intent, &nav_position); |
589 | 569 | if nav_mathml.is_some() && context_get_variable(context, "SpeakExpression", intent) |
590 | // Speak/Overview of where we landed (if we are supposed to speak it) -- use intent, not nav_intent | |
591 | // Note: NavMode might have changed, so we need to recheck the mode to see if we use LiteralSpeak | |
592 | 519 | let literal_speak = nav_state.mode == "Character"; |
593 | 519 | let node_speech_result = speak(mathml, intent, &nav_position, literal_speak, use_read_rules); |
594 | 519 | remove_literal_property(mathml, add_literal, properties); |
595 | 519 | let node_speech = match node_speech_result { |
596 | 519 | Ok(speech) => speech, |
597 | 0 | Err(e) => { |
598 | 0 | if e.to_string() == crate::speech::NAV_NODE_SPEECH_NOT_FOUND { |
599 | 0 | bail!("Internal error: With {}/{} in {} mode, can't {} from expression with id '{}' inside:\n{}", |
600 | 0 | rules.pref_manager.as_ref().borrow().pref_to_string("Language"), |
601 | 0 | rules.pref_manager.as_ref().borrow().pref_to_string("SpeechStyle"), |
602 | 0 | &nav_state.mode, nav_command, &nav_position.current_node, mml_to_string(if literal_speak {mathml} else {intent})); |
603 | 0 | } |
604 | 0 | return Err(e); |
605 | } | |
606 | }; | |
607 | ||
608 | // debug!("node_speech: '{}', speech: '{}'\n", node_speech, speech); | |
609 | 519 | if node_speech.is_empty() { |
610 | // try again in loop | |
611 | 20 | return Ok( (speech, false)); |
612 | } else { | |
613 | 499 | pop_stack(nav_state, loop_count, nav_command); |
614 | // debug!("returning: '{}'", speech.clone() + " " + &node_speech); | |
615 | 499 | return Ok( (speech + " " + &node_speech, true) ); |
616 | } | |
617 | } else { | |
618 | 50 | remove_literal_property(mathml, add_literal, properties); |
619 | 50 | pop_stack(nav_state, loop_count, nav_command); |
620 | 50 | return Ok( (speech, true) ); |
621 | }; | |
622 | ||
623 | 569 | fn remove_literal_property(mathml: Element, add_literal: bool, properties: &str) { |
624 | 569 | if add_literal { |
625 | 206 | if properties.is_empty() { |
626 | 206 | mathml.remove_attribute("data-intent-property"); |
627 | 206 | } else { |
628 | 0 | mathml.set_attribute_value("data-intent-property", properties); |
629 | 0 | } |
630 | 363 | } |
631 | 569 | } |
632 | ||
633 | 569 | } |
634 | ||
635 | ||
636 | 549 | fn pop_stack(nav_state: &mut NavigationState, count: usize, nav_command: &'static str) { |
637 | // save the final state and pop the intermediate states that did nothing | |
638 | 549 | let push_command_on_stack = (nav_command.starts_with("Move") && |
639 | // debug!("pop_stack: nav_command={}, count={}, push? {} stack=\n{}", nav_command, count, push_command_on_stack, nav_state); | |
640 | 549 | if count == 0 { |
641 | 529 | if !push_command_on_stack && |
642 | 3 | nav_state.pop(); // remove ReadXXX, SetPlacemarker, etc. commands that don't change the state |
643 | 526 | } |
644 | 529 | return; |
645 | 20 | } |
646 | 20 | let (top_position, top_command) = nav_state.pop().unwrap(); |
647 | 20 | let mut count = count - 1; |
648 | loop { | |
649 | // debug!(" ... loop count={}", count); | |
650 | 20 | nav_state.pop(); |
651 | 20 | if count == 0 { |
652 | 20 | break; |
653 | 0 | }; |
654 | 0 | count -= 1; |
655 | }; | |
656 | 20 | if push_command_on_stack { |
657 | 19 | nav_state.push(top_position, top_command); |
658 | 19 |
|
659 | // debug!("END pop_stack: stack=\n{}", nav_state); | |
660 | 549 | } |
661 | 549 | } |
662 | ||
663 | /// Speak the intent tree at the nav_node_id if that id exists in the intent tree; otherwise use the mathml tree. | |
664 | /// If full_read is true, we speak the tree, otherwise we use the overview rules. | |
665 | /// If literal_speak is true, we use the literal speak rules (and use the mathml tree). | |
666 | 519 | fn speak(mathml: Element, intent: Element, nav_position: &NavigationPosition, literal_speak: bool, full_read: bool) -> Result<String> { |
667 | 519 | if full_read { |
668 | // In something like x^3, we might be looking for the '3', but it will be "cubed", so we don't find it. | |
669 | // Or we might be on a "(" surrounding a matrix and that isn't part of the intent | |
670 | // We are probably safer in terms of getting the same speech if we retry intent starting at the nav node, | |
671 | // but the node to speak is almost certainly trivial. | |
672 | // By speaking the non-intent tree, we are certain to speak on the next try | |
673 | 505 | if !literal_speak && |
674 | // debug!("speak: nav_node_id={}, intent=\n{}", nav_node_id, mml_to_string(intent)); | |
675 | 327 | match crate::speech::speak_mathml(intent, &nav_position.current_node, nav_position.current_node_offset) { |
676 | 326 | Ok(speech) => return Ok(speech), |
677 | 1 | Err(e) => { |
678 | 1 | if e.to_string() != crate::speech::NAV_NODE_SPEECH_NOT_FOUND { |
679 | 0 | return Err(e); |
680 | 1 | } |
681 | // else could be something like '3' in 'x^3' ("cubed") | |
682 | }, | |
683 | } | |
684 | 178 | } |
685 | // debug!("speak (literal): nav_node_id={}, mathml=\n{}", nav_node_id, mml_to_string(mathml)); | |
686 | 179 | let speech = crate::speech::speak_mathml(mathml, |
687 | 179 | &nav_position.current_node, nav_position.current_node_offset); |
688 | // debug!("speech from speak: {:?}", speech); | |
689 | 179 | return speech; |
690 | } else { | |
691 | 14 | return crate::speech::overview_mathml(mathml, &nav_position.current_node, nav_position.current_node_offset); |
692 | } | |
693 | 519 | } |
694 | ||
695 | ||
696 | // MathPlayer's interface mentions these, so we keep them. | |
697 | // These (KeyboardEvent.keyCode) are consistent across platforms (mostly?) but are deprecated. | |
698 | // KeyboardEvent.code is recommended instead (a string) | |
699 | const VK_LEFT: usize = 0x25; | |
700 | const VK_RIGHT: usize = 0x27; | |
701 | const VK_UP: usize = 0x26; | |
702 | const VK_DOWN: usize = 0x28; | |
703 | const VK_RETURN: usize = 0x0D; | |
704 | const VK_SPACE: usize = 0x20; | |
705 | const VK_HOME: usize = 0x24; | |
706 | const VK_END: usize = 0x23; | |
707 | const VK_BACK: usize = 0x08; | |
708 | const VK_ESCAPE: usize = 0x1B; | |
709 | ||
710 | // Utilities that returns one of four commands/params based on shift/control key combinations | |
711 | ||
712 | enum NavigationCommand { | |
713 | Move, | |
714 | Zoom, | |
715 | MoveLastLocation, | |
716 | Read, | |
717 | Describe, | |
718 | ReadTo, | |
719 | Locate, | |
720 | ChangeNavMode, | |
721 | ToggleSpeakMode, | |
722 | SetPlacemarker, | |
723 | Exit, | |
724 | Last, | |
725 | } | |
726 | ||
727 | #[derive(PartialEq, PartialOrd, Clone, Copy)] | |
728 | enum NavigationParam { | |
729 | Placemarker0, | |
730 | Placemarker1, | |
731 | Placemarker2, | |
732 | Placemarker3, | |
733 | Placemarker4, | |
734 | Placemarker5, | |
735 | Placemarker6, | |
736 | Placemarker7, | |
737 | Placemarker8, | |
738 | Placemarker9, | |
739 | Previous, | |
740 | Current, | |
741 | Next, | |
742 | Start, | |
743 | End, | |
744 | LineStart, | |
745 | LineEnd, | |
746 | CellPrevious, | |
747 | CellCurrent, | |
748 | CellNext, | |
749 | ColStart, | |
750 | ColEnd, | |
751 | CellUp, | |
752 | CellDown, | |
753 | Last | |
754 | } | |
755 | ||
756 | ||
757 | 0 | fn choose_command( |
758 | 0 | shift_key: bool, |
759 | 0 | control_key: bool, |
760 | 0 | none: NavigationCommand, |
761 | 0 | shift: NavigationCommand, |
762 | 0 | control: NavigationCommand, |
763 | 0 | shift_control: NavigationCommand |
764 | 0 | ) -> NavigationCommand { |
765 | 0 | if shift_key && control_key { |
766 | 0 | return shift_control; |
767 | 0 | } else if control_key { |
768 | 0 | return control; |
769 | 0 | } else if shift_key { |
770 | 0 | return shift; |
771 | } else { | |
772 | 0 | return none; |
773 | } | |
774 | 0 | } |
775 | ||
776 | 0 | fn choose_param( |
777 | 0 | shift_key: bool, |
778 | 0 | control_key: bool, |
779 | 0 | none: NavigationParam, |
780 | 0 | shift: NavigationParam, |
781 | 0 | control: NavigationParam, |
782 | 0 | shift_control: NavigationParam |
783 | 0 | ) -> NavigationParam { |
784 | 0 | if shift_key && control_key { |
785 | 0 | return shift_control; |
786 | 0 | } else if control_key { |
787 | 0 | return control; |
788 | 0 | } else if shift_key { |
789 | 0 | return shift; |
790 | } else { | |
791 | 0 | return none; |
792 | } | |
793 | 0 | } |
794 | ||
795 | 0 | fn key_press_to_command_and_param( |
796 | 0 | key: usize, |
797 | 0 | shift_key: bool, |
798 | 0 | control_key: bool, |
799 | 0 | alt_key: bool, |
800 | 0 | meta_key: bool, |
801 | 0 | ) -> Result<(NavigationCommand, NavigationParam)> { |
802 | // key press mapping should probably be stored externally (registry) with an app that allows changes | |
803 | // for now, we build in the defaults | |
804 | ||
805 | // this is a hack to map alt+ctl+arrow to ctl+arrow to change table mappings (github.com/NSoiffer/MathCAT/issues/105) | |
806 | // if this change sticks, choose_command() needs to be changed and this hack should go away | |
807 | 0 | let mut alt_key = alt_key; |
808 | 0 | if alt_key && control_key && [VK_LEFT, VK_RIGHT, VK_UP, VK_DOWN].contains(&key) { |
809 | 0 | alt_key = false; |
810 | 0 | } |
811 | 0 | if alt_key || meta_key { |
812 | 0 | bail!("Invalid argument to key_press_to_command_and_param"); |
813 | 0 | } |
814 | ||
815 | let command; | |
816 | let param; | |
817 | 0 | match key { |
818 | 0 | VK_LEFT => { |
819 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::Move, NavigationCommand::Describe); |
820 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous, NavigationParam::Previous, NavigationParam::CellPrevious, NavigationParam::Previous); |
821 | 0 | }, |
822 | 0 | VK_RIGHT => { |
823 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::Move, NavigationCommand::Describe); |
824 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Next, NavigationParam::Next, NavigationParam::CellNext, NavigationParam::Next); |
825 | 0 | }, |
826 | 0 | VK_UP => { |
827 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Zoom, NavigationCommand::ChangeNavMode, NavigationCommand::Move, NavigationCommand::Zoom); |
828 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous, NavigationParam::Previous, NavigationParam::CellUp, NavigationParam::Start); |
829 | 0 | }, |
830 | 0 | VK_DOWN => { |
831 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Zoom, NavigationCommand::ChangeNavMode, NavigationCommand::Move, NavigationCommand::Zoom); |
832 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Next, NavigationParam::Next, NavigationParam::CellDown, NavigationParam::End); |
833 | 0 | }, |
834 | 0 | VK_RETURN => { |
835 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Locate, NavigationCommand::Last, NavigationCommand::Locate, NavigationCommand::Last); |
836 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous,NavigationParam::Last, NavigationParam::Last, NavigationParam::Last); |
837 | 0 | }, |
838 | 0 | VK_SPACE => { |
839 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Read, NavigationCommand::ToggleSpeakMode, NavigationCommand::Read, NavigationCommand::Describe); |
840 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Current, NavigationParam::Last, NavigationParam::CellCurrent, NavigationParam::Current); |
841 | 0 | }, |
842 | ||
843 | 0 | VK_HOME => { |
844 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::ReadTo); |
845 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Start,NavigationParam::ColStart, NavigationParam::LineStart, NavigationParam::Start); |
846 | 0 | }, |
847 | 0 | VK_END => { |
848 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::ReadTo); |
849 | 0 | param = choose_param( shift_key, control_key, NavigationParam::End, NavigationParam::ColEnd, NavigationParam::LineEnd, NavigationParam::End); |
850 | 0 | }, |
851 | 0 | VK_BACK => { |
852 | 0 | command = NavigationCommand::MoveLastLocation; |
853 | 0 | param = NavigationParam::Last; |
854 | 0 | }, |
855 | 0 | VK_ESCAPE => { |
856 | 0 | command = NavigationCommand::Exit; |
857 | 0 | param = NavigationParam::Last; |
858 | 0 | }, |
859 | 0 | 0x30..=0x39 => { // '0' ... '9' |
860 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::SetPlacemarker, NavigationCommand::Describe); |
861 | static PLACE_MARKER: &[NavigationParam] = &[ | |
862 | NavigationParam::Placemarker0, | |
863 | NavigationParam::Placemarker1, | |
864 | NavigationParam::Placemarker2, | |
865 | NavigationParam::Placemarker3, | |
866 | NavigationParam::Placemarker4, | |
867 | NavigationParam::Placemarker5, | |
868 | NavigationParam::Placemarker6, | |
869 | NavigationParam::Placemarker7, | |
870 | NavigationParam::Placemarker8, | |
871 | NavigationParam::Placemarker9, | |
872 | ]; | |
873 | 0 | param = PLACE_MARKER[key-0x30]; |
874 | }, | |
875 | 0 | _ => bail!("Unknown key press/command"), |
876 | }; | |
877 | ||
878 | 0 | return Ok( (command, param) ); |
879 | 0 | } |
880 | ||
881 | // translate the key presses into commands | |
882 | ||
883 | ||
884 | 2 | fn navigation_command_string(command: NavigationCommand, param: NavigationParam) -> &'static str { |
885 | 2 | match command { |
886 | NavigationCommand::Move => { | |
887 | 1 | return match param { |
888 | 0 | NavigationParam::Previous => "MovePrevious", |
889 | 0 | NavigationParam::Next => "MoveNext", |
890 | 1 | NavigationParam::Start => "MoveStart", |
891 | 0 | NavigationParam::End => "MoveEnd", |
892 | 0 | NavigationParam::LineStart => "MoveLineStart", |
893 | 0 | NavigationParam::LineEnd => "MoveLineEnd", |
894 | 0 | NavigationParam::CellPrevious => "MoveCellPrevious", |
895 | 0 | NavigationParam::CellNext => "MoveCellNext", |
896 | 0 | NavigationParam::CellUp => "MoveCellUp", |
897 | 0 | NavigationParam::CellDown => "MoveCellDown", |
898 | 0 | NavigationParam::ColStart => "MoveColumnStart", |
899 | 0 | NavigationParam::ColEnd => "MoveColumnEnd", |
900 | _ => { | |
901 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
902 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Move"); |
903 | 0 | } |
904 | static MOVE_TO: &[&str] = &["MoveTo0","MoveTo1","MoveTo2","MoveTo3","MoveTo4","MoveTo5","MoveTo6","MoveTo7","MoveTo8","MoveTo9"]; | |
905 | 0 | return MOVE_TO[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
906 | } | |
907 | } | |
908 | }, | |
909 | NavigationCommand::Zoom => { | |
910 | 1 | return match param { |
911 | 0 | NavigationParam::Next => "ZoomIn", |
912 | 1 | NavigationParam::Previous => "ZoomOut", |
913 | 0 | NavigationParam::Start => "ZoomOutAll", |
914 | 0 | NavigationParam::End => "ZoomInAll", |
915 | 0 | _ => panic!("Illegal param for NavigationCommand::Zoom"), |
916 | } | |
917 | }, | |
918 | NavigationCommand::MoveLastLocation => { | |
919 | 0 | return "MoveLastLocation"; |
920 | }, | |
921 | NavigationCommand::Read => { | |
922 | 0 | return match param { |
923 | 0 | NavigationParam::Previous => "ReadPrevious", |
924 | 0 | NavigationParam::Next => "ReadNext", |
925 | 0 | NavigationParam::Current => "ReadCurrent", |
926 | 0 | NavigationParam::CellCurrent => "ReadCellCurrent", |
927 | 0 | NavigationParam::Start => "ReadStart", |
928 | 0 | NavigationParam::End => "ReadEnd", |
929 | 0 | NavigationParam::LineStart => "ReadLineStart", |
930 | 0 | NavigationParam::LineEnd => "ReadLineEnd", |
931 | _ => { | |
932 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
933 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Move"); |
934 | 0 | } |
935 | static READ_PLACE_MARKERS: &[&str] = &["Read0","Read1","Read2","Read3","Read4","Read5","Read6","Read7","Read8","Read9"]; | |
936 | 0 | return READ_PLACE_MARKERS[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
937 | }, | |
938 | } | |
939 | }, | |
940 | NavigationCommand::Describe => { | |
941 | 0 | return match param { |
942 | 0 | NavigationParam::Previous => "DescribePrevious", |
943 | 0 | NavigationParam::Next => "DescribeNext", |
944 | 0 | NavigationParam::Current => "DescribeCurrent", |
945 | _ => { | |
946 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
947 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Describe"); |
948 | 0 | } |
949 | static DESCRIBE_PLACE_MARKERS: &[&str] = &["Describe0","Describe1","Describe2","Describe3","Describe4","Describe5","Describe6","Describe7","Describe8","Describe9"]; | |
950 | 0 | return DESCRIBE_PLACE_MARKERS[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
951 | } | |
952 | } | |
953 | }, | |
954 | NavigationCommand::ReadTo => { | |
955 | 0 | todo!("ReadTo navigation command") |
956 | }, | |
957 | NavigationCommand::Locate => { | |
958 | 0 | if param ==NavigationParam::Previous { |
959 | 0 | return "WhereAmI"; |
960 | 0 | } else if param ==NavigationParam::Last { |
961 | 0 | return "WhereAmIAll"; |
962 | 0 | } |
963 | }, | |
964 | NavigationCommand::ChangeNavMode => { | |
965 | 0 | if param ==NavigationParam::Previous { |
966 | 0 | return "ToggleZoomLockUp"; |
967 | 0 | } else if param ==NavigationParam::Next { |
968 | 0 | return "ToggleZoomLockDown"; |
969 | 0 | } |
970 | }, | |
971 | NavigationCommand::ToggleSpeakMode => { | |
972 | 0 | return "ToggleSpeakMode"; |
973 | }, | |
974 | NavigationCommand::SetPlacemarker => { | |
975 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
976 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::SetPlacemarker"); |
977 | 0 | } |
978 | static SET_PLACE_MARKER: &[&str] = &["SetPlacemarker0","SetPlacemarker1","SetPlacemarker2","SetPlacemarker3","SetPlacemarker4","SetPlacemarker5","SetPlacemarker6","SetPlacemarker7","SetPlacemarker8","SetPlacemarker9"]; | |
979 | 0 | return SET_PLACE_MARKER[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
980 | }, | |
981 | NavigationCommand::Exit => { | |
982 | 0 | return "Exit"; |
983 | }, | |
984 | NavigationCommand::Last => { | |
985 | 0 | return "Error"; |
986 | } | |
987 | }; | |
988 | 0 | return "Error"; |
989 | 2 | } |
990 | ||
991 | #[cfg(test)] | |
992 | mod tests { | |
993 | use super::*; | |
994 | #[allow(unused_imports)] | |
995 | use crate::init_logger; | |
996 | use crate::interface::*; | |
997 | ||
998 | #[cfg(test)] | |
999 | /// Assert if result_id != '' and it doesn't match the id of the result of the move | |
1000 | /// Returns the speech from the command | |
1001 | 547 | fn test_command(command: &'static str, mathml: Element, result_id: &str) -> String { |
1002 | // debug!("\nCommand: {}", command); | |
1003 | 547 | NAVIGATION_STATE.with(|nav_stack| { |
1004 | 547 | let (start_id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1005 | 547 | match do_navigate_command_string(mathml, command) { |
1006 | 0 | Err(e) => { |
1007 | 0 | panic!("\nStarting at '{}', '{} failed.\n{}", |
1008 | 0 | start_id, command, &crate::interface::errors_to_string(&e)) |
1009 | }, | |
1010 | 547 | Ok(nav_speech) => { |
1011 | 547 | let nav_speech = nav_speech.trim_end_matches(&[' ', ',', ';']); |
1012 | // debug!("Full speech: {}", nav_speech); | |
1013 | 547 | if !result_id.is_empty() { |
1014 | 547 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1015 | 547 | assert_eq!(result_id, id, "\nStarting at '{}', '{} failed.", start_id, command); |
1016 | 0 | } |
1017 | 547 | return nav_speech.to_string(); |
1018 | } | |
1019 | }; | |
1020 | 547 | }) |
1021 | 547 | } |
1022 | ||
1023 | 56 | fn init_default_prefs(mathml: &str, nav_mode_default: &str) { |
1024 | 56 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1025 | 56 | set_preference("NavMode", nav_mode_default).unwrap(); |
1026 | 56 | set_preference("NavVerbosity", "Verbose").unwrap(); |
1027 | 56 | set_preference("AutoZoomOut", "True").unwrap(); |
1028 | 56 | set_preference("Language", "en").unwrap(); |
1029 | 56 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
1030 | 56 | set_preference("Verbosity", "Medium").unwrap(); |
1031 | 56 | set_preference("Overview", "False").unwrap(); |
1032 | 56 | set_mathml(mathml).unwrap(); |
1033 | 56 | } |
1034 | ||
1035 | #[test] | |
1036 | 1 | fn zoom_in() -> Result<()> { |
1037 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1038 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1039 | 1 | <mi id='denom'>d</mi> |
1040 | 1 | </mfrac></math>"; |
1041 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1042 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1043 | 1 | let package_instance = package_instance.borrow(); |
1044 | 1 | let mathml = get_element(&package_instance); |
1045 | 1 | test_command("ZoomIn", mathml, "msup"); |
1046 | 1 | test_command("ZoomIn", mathml, "base"); |
1047 | 1 | test_command("ZoomIn", mathml, "base"); |
1048 | 1 | return Ok( () ); |
1049 | 1 | }); |
1050 | 1 | } |
1051 | ||
1052 | #[test] | |
1053 | 1 | fn test_init_navigate_move_right() -> Result<()> { |
1054 | // this is how navigation typically starts up | |
1055 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1056 | 1 | <mrow id='id-1'> |
1057 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1058 | 1 | <mo id='id-3'>=</mo> |
1059 | 1 | <mrow id='id-4'> |
1060 | 1 | <mi id='id-5'>a</mi> |
1061 | 1 | <mo id='id-6'>-</mo> |
1062 | 1 | <mn id='id-7'>2</mn> |
1063 | 1 | </mrow> |
1064 | 1 | </mrow> |
1065 | 1 | </math>"; |
1066 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1067 | 1 | debug!("--- Enhanced ---"); |
1068 | 1 | MATHML_INSTANCE.with(|package_instance| { |
1069 | 1 | let package_instance = package_instance.borrow(); |
1070 | 1 | let mathml = get_element(&package_instance); |
1071 | 1 | test_command("ZoomIn", mathml, "msup"); |
1072 | 1 | test_command("MoveNext", mathml, "id-3"); |
1073 | 1 | }); |
1074 | ||
1075 | 1 | init_default_prefs(mathml_str, "Simple"); |
1076 | 1 | debug!("--- Simple ---"); |
1077 | 1 | MATHML_INSTANCE.with(|package_instance: &RefCell<Package>| { |
1078 | 1 | let package_instance = package_instance.borrow(); |
1079 | 1 | let mathml = get_element(&package_instance); |
1080 | 1 | test_command("ZoomIn", mathml, "msup"); |
1081 | 1 | test_command("MoveNext", mathml, "id-3"); |
1082 | 1 | }); |
1083 | ||
1084 | 1 | init_default_prefs(mathml_str, "Character"); |
1085 | 1 | debug!("--- Character ---"); |
1086 | 1 | MATHML_INSTANCE.with(|package_instance| { |
1087 | 1 | let package_instance = package_instance.borrow(); |
1088 | 1 | let mathml = get_element(&package_instance); |
1089 | 1 | test_command("ZoomIn", mathml, "base"); |
1090 | 1 | test_command("MoveNext", mathml, "exp"); |
1091 | 1 | }); |
1092 | 1 | return Ok( () ); |
1093 | 1 | } |
1094 | ||
1095 | #[test] | |
1096 | 1 | fn zoom_in_parens() -> Result<()> { |
1097 | // (a+b)(c+d) + 1 | |
1098 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1099 | 1 | <mrow id='id-1'> |
1100 | 1 | <mrow id='id-2'> |
1101 | 1 | <mrow id='id-3'> |
1102 | 1 | <mo stretchy='false' id='id-4'>(</mo> |
1103 | 1 | <mrow id='id-5'> |
1104 | 1 | <mi id='id-6'>a</mi> |
1105 | 1 | <mo id='id-7'>+</mo> |
1106 | 1 | <mi id='id-8'>b</mi> |
1107 | 1 | </mrow> |
1108 | 1 | <mo stretchy='false' id='id-9'>)</mo> |
1109 | 1 | </mrow> |
1110 | 1 | <mo id='id-10'>⁢</mo> |
1111 | 1 | <mrow id='id-11'> |
1112 | 1 | <mo stretchy='false' id='id-12'>(</mo> |
1113 | 1 | <mrow id='id-13'> |
1114 | 1 | <mi id='id-14'>c</mi> |
1115 | 1 | <mo id='id-15'>+</mo> |
1116 | 1 | <mi id='id-16'>d</mi> |
1117 | 1 | </mrow> |
1118 | 1 | <mo stretchy='false' id='id-17'>)</mo> |
1119 | 1 | </mrow> |
1120 | 1 | </mrow> |
1121 | 1 | <mo id='id-18'>+</mo> |
1122 | 1 | <mn id='id-19'>1</mn> |
1123 | 1 | </mrow> |
1124 | 1 | </math>"; |
1125 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1126 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1127 | 1 | let package_instance = package_instance.borrow(); |
1128 | 1 | let mathml = get_element(&package_instance); |
1129 | 1 | set_preference("NavMode", "Enhanced") |
1130 | 1 | debug!("\n------EnhancedMode----------"); |
1131 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1132 | 1 | test_command("ZoomIn", mathml, "id-5"); |
1133 | 1 | test_command("ZoomIn", mathml, "id-6"); |
1134 | ||
1135 | // repeat, but this time with "Simple | |
1136 | 1 | set_preference("NavMode", "Simple") |
1137 | 1 | debug!("\n------SimpleMode----------"); |
1138 | 1 | test_command("ZoomOutAll", mathml, "id-1"); |
1139 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1140 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1141 | 1 | return Ok( () ); |
1142 | 1 | }); |
1143 | 1 | } |
1144 | ||
1145 | #[test] | |
1146 | 1 | fn zoom_in_all() -> Result<()> { |
1147 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1148 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1149 | 1 | <mi id='denom'>d</mi> |
1150 | 1 | </mfrac></math>"; |
1151 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1152 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1153 | 1 | let package_instance = package_instance.borrow(); |
1154 | 1 | let mathml = get_element(&package_instance); |
1155 | 1 | test_command("ZoomInAll", mathml, "base"); |
1156 | 1 | return Ok( () ); |
1157 | 1 | }); |
1158 | 1 | } |
1159 | ||
1160 | ||
1161 | #[test] | |
1162 | 1 | fn zoom_out() -> Result<()> { |
1163 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1164 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1165 | 1 | <mi id='denom'>d</mi> |
1166 | 1 | </mfrac></math>"; |
1167 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1168 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1169 | 1 | let package_instance = package_instance.borrow(); |
1170 | 1 | let mathml = get_element(&package_instance); |
1171 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1172 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1173 | 1 | current_node: "base".to_string(), |
1174 | 1 | current_node_offset: 0 |
1175 | 1 | }, "None") |
1176 | 1 | }); |
1177 | 1 | test_command("ZoomOut", mathml, "msup"); |
1178 | ||
1179 | 1 | let _nav_speech = do_navigate_command_and_param(mathml, NavigationCommand::Zoom, NavigationParam::Previous) |
1180 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1181 | 1 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1182 | 1 | assert_eq!(id, "mfrac"); |
1183 | 1 | }); |
1184 | 1 | return Ok( () ); |
1185 | 1 | }); |
1186 | 1 | } |
1187 | ||
1188 | #[test] | |
1189 | 1 | fn zoom_out_all() -> Result<()> { |
1190 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1191 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1192 | 1 | <mi id='denom'>d</mi> |
1193 | 1 | </mfrac></math>"; |
1194 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1195 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1196 | 1 | let package_instance = package_instance.borrow(); |
1197 | 1 | let mathml = get_element(&package_instance); |
1198 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1199 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1200 | 1 | current_node: "base".to_string(), |
1201 | 1 | current_node_offset: 0 |
1202 | 1 | }, "None") |
1203 | 1 | }); |
1204 | ||
1205 | 1 | test_command("ZoomOutAll", mathml, "mfrac"); |
1206 | 1 | return Ok( () ); |
1207 | 1 | }); |
1208 | 1 | } |
1209 | ||
1210 | #[test] | |
1211 | 1 | fn move_start_end() -> Result<()> { |
1212 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1213 | 1 | <mrow id='id-1'> |
1214 | 1 | <mi id='id-2'>x</mi> |
1215 | 1 | <mo id='id-3'>=</mo> |
1216 | 1 | <mrow id='id-4'> |
1217 | 1 | <mi id='id-5'>a</mi> |
1218 | 1 | <mo id='id-6'>-</mo> |
1219 | 1 | <mn id='id-7'>2</mn> |
1220 | 1 | </mrow> |
1221 | 1 | </mrow> |
1222 | 1 | </math>"; |
1223 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1224 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1225 | 1 | let package_instance = package_instance.borrow(); |
1226 | 1 | let mathml = get_element(&package_instance); |
1227 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1228 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1229 | 1 | current_node: "id-4".to_string(), |
1230 | 1 | current_node_offset: 0 |
1231 | 1 | }, "None") |
1232 | 1 | }); |
1233 | ||
1234 | 1 | set_preference("NavMode", "Character") |
1235 | 1 | test_command("MoveStart", mathml, "id-2"); |
1236 | 1 | test_command("MoveEnd", mathml, "id-7"); |
1237 | 1 | set_preference("NavMode", "Simple") |
1238 | 1 | test_command("MoveStart", mathml, "id-2"); |
1239 | 1 | test_command("MoveEnd", mathml, "id-7"); |
1240 | 1 | set_preference("NavMode", "Enhanced") |
1241 | 1 | test_command("MoveStart", mathml, "id-2"); |
1242 | 1 | test_command("MovePrevious", mathml, "id-2"); |
1243 | 1 | test_command("MoveEnd", mathml, "id-4"); |
1244 | 1 | test_command("MoveNext", mathml, "id-4"); |
1245 | 1 | return Ok( () ); |
1246 | 1 | }); |
1247 | 1 | } |
1248 | ||
1249 | #[test] | |
1250 | 1 | fn move_line_start_end() -> Result<()> { |
1251 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1252 | 1 | <mfrac displaystyle='true' id='id-1'> |
1253 | 1 | <mi id='id-2'>x</mi> |
1254 | 1 | <mrow id='id-3'> |
1255 | 1 | <msup id='id-4'> |
1256 | 1 | <mi id='id-5'>y</mi> |
1257 | 1 | <mn id='id-6'>2</mn> |
1258 | 1 | </msup> |
1259 | 1 | <mo id='id-7'>+</mo> |
1260 | 1 | <mn id='id-8'>1</mn> |
1261 | 1 | </mrow> |
1262 | 1 | </mfrac> |
1263 | 1 | </math>"; |
1264 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1265 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1266 | 1 | let package_instance = package_instance.borrow(); |
1267 | 1 | let mathml = get_element(&package_instance); |
1268 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1269 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1270 | 1 | current_node: "id-7".to_string(), |
1271 | 1 | current_node_offset: 0 |
1272 | 1 | }, "None") |
1273 | 1 | }); |
1274 | ||
1275 | 1 | set_preference("NavMode", "Character") |
1276 | 1 | test_command("MoveLineStart", mathml, "id-5"); |
1277 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1278 | 1 | set_preference("NavMode", "Simple") |
1279 | 1 | test_command("MoveLineStart", mathml, "id-4"); |
1280 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1281 | 1 | set_preference("NavMode", "Enhanced") |
1282 | 1 | test_command("MoveLineStart", mathml, "id-4"); |
1283 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1284 | 1 | test_command("MoveEnd", mathml, "id-3"); |
1285 | 1 | return Ok( () ); |
1286 | 1 | }); |
1287 | 1 | } |
1288 | ||
1289 | #[test] | |
1290 | 1 | fn text_extremes_and_move_last_location() -> Result<()> { |
1291 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1292 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1293 | 1 | <mi id='denom'>d</mi> |
1294 | 1 | </mfrac></math>"; |
1295 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1296 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1297 | 1 | let package_instance = package_instance.borrow(); |
1298 | 1 | let mathml = get_element(&package_instance); |
1299 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1300 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1301 | 1 | current_node: "base".to_string(), |
1302 | 1 | current_node_offset: 0 |
1303 | 1 | }, "None") |
1304 | 1 | }); |
1305 | ||
1306 | 1 | test_command("ZoomOutAll", mathml, "mfrac"); |
1307 | 1 | test_command("ZoomOut", mathml, "mfrac"); |
1308 | 1 | test_command("MoveLastLocation", mathml, "base"); // second zoom out should do nothing |
1309 | ||
1310 | 1 | test_command("ZoomOut", mathml, "msup"); |
1311 | 1 | test_command("ZoomInAll", mathml, "base"); |
1312 | 1 | test_command("ZoomIn", mathml, "base"); |
1313 | 1 | test_command("MoveLastLocation", mathml, "msup"); // second zoom in should do nothing |
1314 | ||
1315 | 1 | return Ok( () ); |
1316 | 1 | }); |
1317 | 1 | } |
1318 | ||
1319 | #[test] | |
1320 | 1 | fn move_to_start() -> Result<()> { |
1321 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1322 | 1 | <mrow id='num'><msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup><mo id='factorial'>!</mo></mrow> |
1323 | 1 | <mi id='denom'>d</mi> |
1324 | 1 | </mfrac></math>"; |
1325 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1326 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1327 | 1 | let package_instance = package_instance.borrow(); |
1328 | 1 | let mathml = get_element(&package_instance); |
1329 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1330 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1331 | 1 | current_node: "denom".to_string(), |
1332 | 1 | current_node_offset: 0 |
1333 | 1 | }, "None") |
1334 | 1 | }); |
1335 | 1 | test_command("MoveLineStart", mathml, "denom"); |
1336 | ||
1337 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1338 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1339 | 1 | current_node: "factorial".to_string(), |
1340 | 1 | current_node_offset: 0 |
1341 | 1 | }, "None") |
1342 | 1 | }); |
1343 | 1 | test_command("MoveLineStart", mathml, "msup"); |
1344 | ||
1345 | 1 | let _nav_speech = do_navigate_command_and_param(mathml, NavigationCommand::Move, NavigationParam::Start) |
1346 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1347 | 1 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1348 | 1 | assert_eq!(id, "num"); |
1349 | 1 | }); |
1350 | 1 | return Ok( () ); |
1351 | 1 | }); |
1352 | 1 | } |
1353 | ||
1354 | #[test] | |
1355 | 1 | fn move_right_sup() -> Result<()> { |
1356 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1357 | 1 | <mrow id='id-1'> |
1358 | 1 | <msup id='id-2'> |
1359 | 1 | <mn id='id-3'>2</mn> |
1360 | 1 | <mi id='id-4'>q</mi> |
1361 | 1 | </msup> |
1362 | 1 | <mo id='id-5'>-</mo> |
1363 | 1 | <mi id='id-6'>x</mi> |
1364 | 1 | </mrow> |
1365 | 1 | </math>"; |
1366 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1367 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1368 | 1 | let package_instance = package_instance.borrow(); |
1369 | 1 | let mathml = get_element(&package_instance); |
1370 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1371 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1372 | 1 | current_node: "id-2".to_string(), |
1373 | 1 | current_node_offset: 0 |
1374 | 1 | }, "None") |
1375 | 1 | }); |
1376 | 1 | set_preference("NavMode", "Enhanced") |
1377 | 1 | test_command("MoveNext", mathml, "id-5"); |
1378 | ||
1379 | // reset start and test Simple | |
1380 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1381 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1382 | 1 | current_node: "id-2".to_string(), |
1383 | 1 | current_node_offset: 0 |
1384 | 1 | }, "None") |
1385 | 1 | }); |
1386 | 1 | set_preference("NavMode", "Simple") |
1387 | 1 | test_command("MoveNext", mathml, "id-5"); |
1388 | ||
1389 | // reset start and test Character | |
1390 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1391 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1392 | 1 | current_node: "id-3".to_string(), |
1393 | 1 | current_node_offset: 0 |
1394 | 1 | }, "None") |
1395 | 1 | }); |
1396 | 1 | set_preference("NavMode", "Character") |
1397 | 1 | test_command("MoveNext", mathml, "id-4"); |
1398 | 1 | test_command("MoveNext", mathml, "id-5"); |
1399 | 1 | return Ok( () ); |
1400 | 1 | }); |
1401 | 1 | } |
1402 | ||
1403 | ||
1404 | #[test] | |
1405 | 1 | fn move_msubsup_char() -> Result<()> { |
1406 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1407 | 1 | <mrow id='id-1'> |
1408 | 1 | <mn id='id-2'>1</mn> |
1409 | 1 | <mo id='id-3'>+</mo> |
1410 | 1 | <msubsup id='id-4'> |
1411 | 1 | <mi id='id-5'>x</mi> |
1412 | 1 | <mn id='id-6'>2</mn> |
1413 | 1 | <mn id='id-7'>3</mn> |
1414 | 1 | </msubsup> |
1415 | 1 | <mo id='id-8'>+</mo> |
1416 | 1 | <mn id='id-9'>4</mn> |
1417 | 1 | </mrow> |
1418 | 1 | </math>"; |
1419 | 1 | init_default_prefs(mathml_str, "Character"); |
1420 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1421 | 1 | let package_instance = package_instance.borrow(); |
1422 | 1 | let mathml = get_element(&package_instance); |
1423 | 1 | assert_eq!("zoomed in all of the way; 1", test_command("ZoomInAll", mathml, "id-2")); |
1424 | 1 | assert_eq!("move right; plus", test_command("MoveNext", mathml, "id-3")); |
1425 | 1 | assert_eq!("move right; in base; x", test_command("MoveNext", mathml, "id-5")); |
1426 | 1 | assert_eq!("move right; in subscript; 2", test_command("MoveNext", mathml, "id-6")); |
1427 | 1 | assert_eq!("move right; in superscript; 3", test_command("MoveNext", mathml, "id-7")); |
1428 | 1 | assert_eq!("move right; out of superscript; plus", test_command("MoveNext", mathml, "id-8")); |
1429 | 1 | assert_eq!("move left; in superscript; 3", test_command("MovePrevious", mathml, "id-7")); |
1430 | 1 | assert_eq!("move left; in subscript; 2", test_command("MovePrevious", mathml, "id-6")); |
1431 | 1 | assert_eq!("move left; in base; x", test_command("MovePrevious", mathml, "id-5")); |
1432 | 1 | assert_eq!("move left; out of base; plus", test_command("MovePrevious", mathml, "id-3")); |
1433 | ||
1434 | 1 | return Ok( () ); |
1435 | 1 | }); |
1436 | 1 | } |
1437 | ||
1438 | #[test] | |
1439 | 1 | fn zoom_logbase() -> Result<()> { |
1440 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1441 | 1 | <mrow displaystyle='true' id='id-1'> |
1442 | 1 | <msub id='id-2'> |
1443 | 1 | <mi id='id-3'>log</mi> |
1444 | 1 | <mn id='id-4'>2</mn> |
1445 | 1 | </msub> |
1446 | 1 | <mo data-changed='added' id='id-5'>⁡</mo> |
1447 | 1 | <mi id='id-6'>x</mi>a |
1448 | 1 | </mrow> |
1449 | 1 | </math>"; |
1450 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1451 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1452 | 1 | let package_instance = package_instance.borrow(); |
1453 | 1 | let mathml = get_element(&package_instance); |
1454 | 1 | assert_eq!("zoom in; the log base 2", test_command("ZoomIn", mathml, "id-2")); |
1455 | 1 | assert_eq!("zoom in; in base; 2", test_command("ZoomIn", mathml, "id-4")); |
1456 | 1 | assert_eq!("zoomed in all of the way; 2", test_command("ZoomIn", mathml, "id-4")); |
1457 | 1 | debug!("Now zooming out"); |
1458 | 1 | assert_eq!("zoom out; out of base; the log base 2", test_command("ZoomOut", mathml, "id-2")); |
1459 | 1 | assert_eq!("zoom out; the log base 2, of x", test_command("ZoomOut", mathml, "id-1")); |
1460 | 1 | assert_eq!("zoomed out all of the way; the log base 2, of x", test_command("ZoomOut", mathml, "id-1")); |
1461 | 1 | return Ok( () ); |
1462 | 1 | }); |
1463 | 1 | } |
1464 | ||
1465 | #[test] | |
1466 | 1 | fn zoom_logbase_power() -> Result<()> { |
1467 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1468 | 1 | <mrow displaystyle='true' id='id-1'> |
1469 | 1 | <msubsup id='id-2'> |
1470 | 1 | <mi id='id-3'>log</mi> |
1471 | 1 | <mn id='id-4'>2</mn> |
1472 | 1 | <mn id='id-5'>3</mn> |
1473 | 1 | </msubsup> |
1474 | 1 | <mo data-changed='added' id='id-6'>⁡</mo> |
1475 | 1 | <mi id='id-7'>x</mi> |
1476 | 1 | </mrow> |
1477 | 1 | </math>"; |
1478 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1479 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1480 | 1 | let package_instance = package_instance.borrow(); |
1481 | 1 | let mathml = get_element(&package_instance); |
1482 | 1 | assert_eq!("zoom in; the log base 2, cubed", test_command("ZoomIn", mathml, "id-2")); |
1483 | 1 | assert_eq!("zoom in; in base; the log base 2", test_command("ZoomIn", mathml, "id-2-log-base")); |
1484 | 1 | assert_eq!("zoom in; in base; 2", test_command("ZoomIn", mathml, "id-4")); |
1485 | 1 | assert_eq!("zoomed in all of the way; 2", test_command("ZoomIn", mathml, "id-4")); |
1486 | 1 | debug!("Now zooming out"); |
1487 | 1 | assert_eq!("zoom out; out of base; the log base 2", test_command("ZoomOut", mathml, "id-2-log-base")); |
1488 | 1 | assert_eq!("zoom out; out of base; the log base 2, cubed", test_command("ZoomOut", mathml, "id-2")); |
1489 | 1 | assert_eq!("zoom out; the log base 2, cubed of x", test_command("ZoomOut", mathml, "id-1")); |
1490 | 1 | assert_eq!("zoomed out all of the way; the log base 2, cubed of x", test_command("ZoomOut", mathml, "id-1")); |
1491 | 1 | return Ok( () ); |
1492 | 1 | }); |
1493 | 1 | } |
1494 | ||
1495 | #[test] | |
1496 | 1 | fn zoom_msubsup() -> Result<()> { |
1497 | // msubsup is trickier because it creates an intent within an intent, so offsets need to be handled properly | |
1498 | 1 | let mathml_str = "<math id='math'><msubsup id='msubsup'><mi id='base'>𝑥</mi><mn id='sub'>1</mn><mn id='sup'>2</mn></msubsup></math>"; |
1499 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1500 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1501 | 1 | let package_instance = package_instance.borrow(); |
1502 | 1 | let mathml = get_element(&package_instance); |
1503 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1504 | 1 | debug!("Enhanced mode"); |
1505 | 1 | do_commands(mathml) |
1506 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1507 | 1 | debug!("Simple mode"); |
1508 | 1 | do_commands(mathml) |
1509 | 1 | set_preference("NavMode", "Character").unwrap(); |
1510 | 1 | debug!("Character mode"); |
1511 | 1 | assert_eq!("zoom in; in base; x", test_command("ZoomIn", mathml, "base")); |
1512 | 1 | assert_eq!("zoom out; out of base; x sub 1 super 2 end super", test_command("ZoomOut", mathml, "msubsup")); |
1513 | 1 | return Ok( () ); |
1514 | ||
1515 | /// Enhanced and Simple mode should behave the same | |
1516 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1517 | 2 | assert_eq!("zoom in; in base; x sub 1", test_command("ZoomIn", mathml, "msubsup-indexed-by")); |
1518 | 2 | assert_eq!("zoom in; in base; x", test_command("ZoomIn", mathml, "base")); |
1519 | 2 | assert_eq!("zoomed in all of the way; x", test_command("ZoomIn", mathml, "base")); |
1520 | 2 | debug!("Now zooming out"); |
1521 | 2 | assert_eq!("zoom out; out of base; x sub 1", test_command("ZoomOut", mathml, "msubsup-indexed-by")); |
1522 | 2 | assert_eq!("zoom out; out of base; x sub 1, squared", test_command("ZoomOut", mathml, "msubsup")); |
1523 | 2 | assert_eq!("zoomed out all of the way; x sub 1, squared", test_command("ZoomOut", mathml, "msubsup")); |
1524 | 2 | return Ok( () ); |
1525 | 2 | } |
1526 | 1 | }); |
1527 | 1 | } |
1528 | ||
1529 | #[test] | |
1530 | 1 | fn move_mmultiscripts_char() -> Result<()> { |
1531 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1532 | 1 | <mmultiscripts data-mjx-texclass='ORD' data-chem-formula='5' id='id-1'> |
1533 | 1 | <mrow data-chem-formula='3' id='id-2'> |
1534 | 1 | <mo stretchy='false' id='id-3'>[</mo> |
1535 | 1 | <mmultiscripts data-chem-formula='3' id='id-4'> |
1536 | 1 | <mi data-chem-element='3' id='id-5'>Co</mi> |
1537 | 1 | <mn id='id-6'>6</mn> |
1538 | 1 | <none id='id-7'></none> |
1539 | 1 | </mmultiscripts> |
1540 | 1 | <mo stretchy='false' id='id-8'>]</mo> |
1541 | 1 | </mrow> |
1542 | 1 | <none id='id-9'></none> |
1543 | 1 | <mrow id='id-10'> |
1544 | 1 | <mn id='id-11'>3</mn> |
1545 | 1 | <mo id='id-12'>+</mo> |
1546 | 1 | </mrow> |
1547 | 1 | </mmultiscripts> |
1548 | 1 | </math>"; |
1549 | 1 | init_default_prefs(mathml_str, "Character"); |
1550 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1551 | 1 | let package_instance = package_instance.borrow(); |
1552 | 1 | let mathml = get_element(&package_instance); |
1553 | 1 | assert_eq!("zoomed in all of the way; in base; open bracket", test_command("ZoomInAll", mathml, "id-3")); |
1554 | 1 | assert_eq!("move right; in base; cap c o", test_command("MoveNext", mathml, "id-5")); |
1555 | 1 | assert_eq!("move right; in subscript; 6", test_command("MoveNext", mathml, "id-6")); |
1556 | 1 | assert_eq!("move right; out of subscript; close bracket", test_command("MoveNext", mathml, "id-8")); |
1557 | 1 | assert_eq!("move right; in superscript; 3", test_command("MoveNext", mathml, "id-11")); |
1558 | 1 | assert_eq!("move right; plus", test_command("MoveNext", mathml, "id-12")); |
1559 | 1 | assert_eq!("cannot move right, end of math", test_command("MoveNext", mathml, "id-12")); |
1560 | 1 | assert_eq!("move left; 3", test_command("MovePrevious", mathml, "id-11")); |
1561 | 1 | assert_eq!("move left; in base; close bracket", test_command("MovePrevious", mathml, "id-8")); |
1562 | 1 | assert_eq!("move left; in subscript; 6", test_command("MovePrevious", mathml, "id-6")); |
1563 | 1 | assert_eq!("move left; in base; cap c o", test_command("MovePrevious", mathml, "id-5")); |
1564 | 1 | assert_eq!("move left; out of base; open bracket", test_command("MovePrevious", mathml, "id-3")); |
1565 | ||
1566 | 1 | return Ok( () ); |
1567 | 1 | }); |
1568 | 1 | } |
1569 | ||
1570 | #[test] | |
1571 | 1 | fn move_right_char() -> Result<()> { |
1572 | 1 | let mathml_str = "<math id='id-0'> |
1573 | 1 | <mrow displaystyle='true' id='id-1'> |
1574 | 1 | <mi id='id-2'>x</mi> |
1575 | 1 | <mo id='id-3'>=</mo> |
1576 | 1 | <mrow id='id-4'> |
1577 | 1 | <mfrac id='id-5'> |
1578 | 1 | <mn id='id-6'>1</mn> |
1579 | 1 | <mrow id='id-7'> |
1580 | 1 | <mi id='id-8'>a</mi> |
1581 | 1 | <mo id='id-9'>+</mo> |
1582 | 1 | <mn id='id-10'>2</mn> |
1583 | 1 | </mrow> |
1584 | 1 | </mfrac> |
1585 | 1 | <mo id='id-11'>+</mo> |
1586 | 1 | <mrow id='id-12'> |
1587 | 1 | <mn id='id-13'>3</mn> |
1588 | 1 | <mo id='id-14'>⁢</mo> |
1589 | 1 | <mi id='id-15'>b</mi> |
1590 | 1 | </mrow> |
1591 | 1 | </mrow> |
1592 | 1 | </mrow> |
1593 | 1 | </math>"; |
1594 | 1 | init_default_prefs(mathml_str, "Character"); |
1595 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1596 | 1 | let package_instance = package_instance.borrow(); |
1597 | 1 | let mathml = get_element(&package_instance); |
1598 | 1 | test_command("ZoomInAll", mathml, "id-2"); |
1599 | 1 | test_command("MoveNext", mathml, "id-3"); |
1600 | 1 | test_command("MoveNext", mathml, "id-6"); |
1601 | 1 | test_command("MoveNext", mathml, "id-8"); |
1602 | 1 | test_command("MoveNext", mathml, "id-9"); |
1603 | 1 | test_command("MoveNext", mathml, "id-10"); |
1604 | 1 | test_command("MoveNext", mathml, "id-11"); |
1605 | 1 | test_command("MoveNext", mathml, "id-13"); |
1606 | 1 | test_command("MoveNext", mathml, "id-15"); |
1607 | 1 | test_command("MoveNext", mathml, "id-15"); |
1608 | ||
1609 | 1 | return Ok( () ); |
1610 | 1 | }); |
1611 | 1 | } |
1612 | ||
1613 | #[test] | |
1614 | 1 | fn char_mode_paren_test() -> Result<()> { |
1615 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1616 | 1 | <mrow displaystyle='true' id='id-1'> |
1617 | 1 | <mrow id='id-2'> |
1618 | 1 | <mo id='id-3'>(</mo> |
1619 | 1 | <mi id='id-4'>a</mi> |
1620 | 1 | <mo id='id-5'>)</mo> |
1621 | 1 | </mrow> |
1622 | 1 | <mo id='id-6'>⁢</mo> |
1623 | 1 | <mrow id='id-7'> |
1624 | 1 | <mo id='id-8'>(</mo> |
1625 | 1 | <mi id='id-9'>b</mi> |
1626 | 1 | <mo id='id-10'>)</mo> |
1627 | 1 | </mrow> |
1628 | 1 | </mrow> |
1629 | 1 | </math>"; |
1630 | 1 | init_default_prefs(mathml_str, "Character"); |
1631 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1632 | 1 | let package_instance = package_instance.borrow(); |
1633 | 1 | let mathml = get_element(&package_instance); |
1634 | 1 | debug!("Character mode"); |
1635 | 1 | do_commands(mathml) |
1636 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1637 | 1 | debug!("Simple mode"); |
1638 | 1 | test_command("ZoomIn", mathml, "id-3"); // zooms to the first parenthesis |
1639 | 1 | do_commands(mathml) |
1640 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1641 | 1 | debug!("Enhanced mode"); |
1642 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1643 | 1 | test_command("MoveNext", mathml, "id-6"); |
1644 | 1 | test_command("MoveNext", mathml, "id-9"); |
1645 | 1 | test_command("MovePrevious", mathml, "id-6"); |
1646 | 1 | test_command("MovePrevious", mathml, "id-4"); |
1647 | ||
1648 | 1 | return Ok( () ); |
1649 | 1 | }); |
1650 | ||
1651 | /// Simple and Character mode should behave the same | |
1652 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1653 | 2 | test_command("ZoomIn", mathml, "id-3"); |
1654 | 2 | test_command("MoveNext", mathml, "id-4"); |
1655 | 2 | test_command("MoveNext", mathml, "id-5"); |
1656 | 2 | test_command("MoveNext", mathml, "id-8"); |
1657 | 2 | test_command("MoveNext", mathml, "id-9"); |
1658 | 2 | test_command("MoveNext", mathml, "id-10"); |
1659 | 2 | test_command("MovePrevious", mathml, "id-9"); |
1660 | 2 | test_command("MovePrevious", mathml, "id-8"); |
1661 | 2 | test_command("MovePrevious", mathml, "id-5"); |
1662 | 2 | test_command("ZoomOutAll", mathml, "id-1"); |
1663 | 2 | return Ok( () ); |
1664 | 2 | } |
1665 | 1 | } |
1666 | ||
1667 | #[test] | |
1668 | 1 | fn char_mode_trig_test() -> Result<()> { |
1669 | 1 | let mathml_str = "<math id='id-0'> |
1670 | 1 | <mrow id='id-1'> |
1671 | 1 | <mi id='id-2'>sin</mi> |
1672 | 1 | <mo id='id-3'>⁡</mo> |
1673 | 1 | <mrow id='id-4'> |
1674 | 1 | <mo id='id-5'>(</mo> |
1675 | 1 | <mi id='id-6'>x</mi> |
1676 | 1 | <mo id='id-7'>)</mo> |
1677 | 1 | </mrow> |
1678 | 1 | </mrow> |
1679 | 1 | </math>"; |
1680 | 1 | init_default_prefs(mathml_str, "Simple"); |
1681 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1682 | 1 | let package_instance = package_instance.borrow(); |
1683 | 1 | let mathml = get_element(&package_instance); |
1684 | 1 | do_commands(mathml) |
1685 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1686 | 1 | do_commands(mathml) |
1687 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1688 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1689 | 1 | test_command("MoveNext", mathml, "id-6"); |
1690 | 1 | test_command("MovePrevious", mathml, "id-2"); |
1691 | ||
1692 | 1 | return Ok( () ); |
1693 | 1 | }); |
1694 | ||
1695 | ||
1696 | /// Simple and Character mode should behave the same | |
1697 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1698 | 2 | test_command("ZoomIn", mathml, "id-2"); |
1699 | 2 | test_command("MoveNext", mathml, "id-5"); |
1700 | 2 | test_command("MoveNext", mathml, "id-6"); |
1701 | 2 | test_command("MoveNext", mathml, "id-7"); |
1702 | 2 | test_command("MovePrevious", mathml, "id-6"); |
1703 | 2 | test_command("MovePrevious", mathml, "id-5"); |
1704 | 2 | test_command("MovePrevious", mathml, "id-2"); |
1705 | 2 | test_command("ZoomOutAll", mathml, "id-1"); |
1706 | 2 | return Ok( () ); |
1707 | 2 | } |
1708 | 1 | } |
1709 | ||
1710 | #[test] | |
1711 | 1 | fn move_char_speech() -> Result<()> { |
1712 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1713 | 1 | <mrow id='id-1'> |
1714 | 1 | <mfrac id='id-2'> |
1715 | 1 | <mi id='id-3'>x</mi> |
1716 | 1 | <mi id='id-4'>y</mi> |
1717 | 1 | </mfrac> |
1718 | 1 | <mo id='id-5'>⁢</mo> |
1719 | 1 | <mi id='id-6'>z</mi> |
1720 | 1 | </mrow> |
1721 | 1 | </math>"; |
1722 | 1 | init_default_prefs(mathml_str, "Character"); |
1723 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1724 | 1 | let package_instance = package_instance.borrow(); |
1725 | 1 | let mathml = get_element(&package_instance); |
1726 | 1 | test_command("ZoomInAll", mathml, "id-3"); |
1727 | 1 | assert_eq!("move right; in denominator; y", test_command("MoveNext", mathml, "id-4")); |
1728 | 1 | assert_eq!("move right; out of denominator; z", test_command("MoveNext", mathml, "id-6")); |
1729 | 1 | assert_eq!("move left; in denominator; y", test_command("MovePrevious", mathml, "id-4")); |
1730 | 1 | assert_eq!("move left; in numerator; x", test_command("MovePrevious", mathml, "id-3")); |
1731 | ||
1732 | 1 | return Ok( () ); |
1733 | 1 | }); |
1734 | 1 | } |
1735 | ||
1736 | #[test] | |
1737 | 1 | fn move_inside_leaves() -> Result<()> { |
1738 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1739 | 1 | <mrow id='id-1'> |
1740 | 1 | <mfrac id='id-2'> |
1741 | 1 | <mi id='id-3'>top</mi> |
1742 | 1 | <mi id='id-4'>αβγ</mi> |
1743 | 1 | </mfrac> |
1744 | 1 | </mrow> |
1745 | 1 | </math>"; |
1746 | 1 | init_default_prefs(mathml_str, "Character"); |
1747 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1748 | 1 | let package_instance = package_instance.borrow(); |
1749 | 1 | let mathml = get_element(&package_instance); |
1750 | 1 | test_command("ZoomInAll", mathml, "id-3"); |
1751 | 1 | assert_eq!("zoomed in to first character; t", test_command("ZoomIn", mathml, "id-3")); |
1752 | 1 | assert_eq!("move right; o", test_command("MoveNext", mathml, "id-3")); |
1753 | 1 | assert_eq!("move right; p", test_command("MoveNext", mathml, "id-3")); |
1754 | 1 | assert_eq!("move right; in denominator; αβγ", test_command("MoveNext", mathml, "id-4")); |
1755 | 1 | assert_eq!("zoomed in to first character; alpha", test_command("ZoomIn", mathml, "id-4")); |
1756 | 1 | assert_eq!("move right; beta", test_command("MoveNext", mathml, "id-4")); |
1757 | 1 | assert_eq!("move right; gamma", test_command("MoveNext", mathml, "id-4")); |
1758 | 1 | assert_eq!("cannot move right, end of math", test_command("MoveNext", mathml, "id-4")); |
1759 | 1 | assert_eq!("move left; beta", test_command("MovePrevious", mathml, "id-4")); |
1760 | 1 | assert_eq!("zoom out; αβγ", test_command("ZoomOut", mathml, "id-4")); |
1761 | ||
1762 | 1 | return Ok( () ); |
1763 | 1 | }); |
1764 | 1 | } |
1765 | ||
1766 | #[test] | |
1767 | 1 | fn move_enhanced_times() -> Result<()> { |
1768 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1769 | 1 | <mrow displaystyle='true' id='id-1'> |
1770 | 1 | <mn id='id-2'>2</mn> |
1771 | 1 | <mo id='id-3'>⁢</mo> |
1772 | 1 | <mrow id='id-4'> |
1773 | 1 | <mo id='id-5'>(</mo> |
1774 | 1 | <mrow id='id-6'> |
1775 | 1 | <mn id='id-7'>1</mn> |
1776 | 1 | <mo id='id-8'>-</mo> |
1777 | 1 | <mi id='id-9'>x</mi> |
1778 | 1 | </mrow> |
1779 | 1 | <mo id='id-10'>)</mo> |
1780 | 1 | </mrow> |
1781 | 1 | </mrow> |
1782 | 1 | </math>"; |
1783 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1784 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1785 | 1 | let package_instance = package_instance.borrow(); |
1786 | 1 | let mathml = get_element(&package_instance); |
1787 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1788 | 1 | assert_eq!("move right; times", test_command("MoveNext", mathml, "id-3")); |
1789 | 1 | assert_eq!("move right; 1 minus x", test_command("MoveNext", mathml, "id-6")); |
1790 | 1 | assert_eq!("move left; times", test_command("MovePrevious", mathml, "id-3")); |
1791 | 1 | assert_eq!("move left; 2", test_command("MovePrevious", mathml, "id-2")); |
1792 | ||
1793 | 1 | return Ok( () ); |
1794 | 1 | }); |
1795 | 1 | } |
1796 | ||
1797 | #[test] | |
1798 | 1 | fn move_simple_no_times() -> Result<()> { |
1799 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1800 | 1 | <mrow displaystyle='true' id='id-1'> |
1801 | 1 | <mn id='id-2'>2</mn> |
1802 | 1 | <mo id='id-3'>⁢</mo> |
1803 | 1 | <mrow id='id-4'> |
1804 | 1 | <mo id='id-5'>(</mo> |
1805 | 1 | <mrow id='id-6'> |
1806 | 1 | <mn id='id-7'>1</mn> |
1807 | 1 | <mo id='id-8'>-</mo> |
1808 | 1 | <mi id='id-9'>x</mi> |
1809 | 1 | </mrow> |
1810 | 1 | <mo id='id-10'>)</mo> |
1811 | 1 | </mrow> |
1812 | 1 | </mrow> |
1813 | 1 | </math>"; |
1814 | 1 | init_default_prefs(mathml_str, "Simple"); |
1815 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
1816 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1817 | 1 | let package_instance = package_instance.borrow(); |
1818 | 1 | let mathml = get_element(&package_instance); |
1819 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1820 | 1 | assert_eq!("move right; open paren", test_command("MoveNext", mathml, "id-5")); |
1821 | 1 | assert_eq!("move right; 1", test_command("MoveNext", mathml, "id-7")); |
1822 | 1 | assert_eq!("move left; open paren", test_command("MovePrevious", mathml, "id-5")); |
1823 | 1 | assert_eq!("move left; 2", test_command("MovePrevious", mathml, "id-2")); |
1824 | ||
1825 | 1 | return Ok( () ); |
1826 | 1 | }); |
1827 | 1 | } |
1828 | ||
1829 | ||
1830 | #[test] | |
1831 | 1 | fn move_cell() -> Result<()> { |
1832 | 1 | let mathml_str = "<math id='nav-0'> |
1833 | 1 | <mtable id='nav-1'> |
1834 | 1 | <mtr id='nav-2'> |
1835 | 1 | <mtd id='nav-3'> <mn id='nav-4'>1</mn></mtd> |
1836 | 1 | <mtd id='nav-5'> <mn id='nav-6'>2</mn></mtd> |
1837 | 1 | <mtd id='nav-7'><mn id='nav-8'>3</mn> </mtd> |
1838 | 1 | </mtr> |
1839 | 1 | <mtr id='nav-9'> |
1840 | 1 | <mtd id='nav-10'> |
1841 | 1 | <mrow id='nav-11'> |
1842 | 1 | <mi id='nav-12'>x</mi> |
1843 | 1 | <mo id='nav-13'>-</mo> |
1844 | 1 | <mi id='nav-14'>y</mi> |
1845 | 1 | </mrow> |
1846 | 1 | </mtd> |
1847 | 1 | <mtd id='nav-15'> |
1848 | 1 | <mfrac id='nav-16'> |
1849 | 1 | <mn id='nav-17'>1</mn> |
1850 | 1 | <mn id='nav-18'>2</mn> |
1851 | 1 | </mfrac> |
1852 | 1 | </mtd> |
1853 | 1 | <mtd id='nav-19'> |
1854 | 1 | <mi id='nav-20'>z</mi> |
1855 | 1 | </mtd> |
1856 | 1 | </mtr> |
1857 | 1 | <mtr id='nav-21'> |
1858 | 1 | <mtd id='nav-22'><mn id='nav-23'>7</mn> </mtd> |
1859 | 1 | <mtd id='nav-24'><mn id='nav-25'>8</mn> </mtd> |
1860 | 1 | <mtd id='nav-26'> <mn id='nav-27'>9</mn></mtd> |
1861 | 1 | </mtr> |
1862 | 1 | <mtr id='nav-28'> |
1863 | 1 | <mtd id='nav-29'> |
1864 | 1 | <mrow id='nav-30'> |
1865 | 1 | <mi id='nav-31'>sin</mi> |
1866 | 1 | <mo id='nav-32'>⁡</mo> |
1867 | 1 | <mi id='nav-33'>x</mi> |
1868 | 1 | </mrow> |
1869 | 1 | </mtd> |
1870 | 1 | <mtd id='nav-34'> |
1871 | 1 | <msup id='nav-35'> |
1872 | 1 | <mi id='nav-36'>e</mi> |
1873 | 1 | <mi id='nav-37'>x</mi> |
1874 | 1 | </msup> |
1875 | 1 | </mtd> |
1876 | 1 | <mtd id='nav-38'> |
1877 | 1 | <mrow id='nav-39'> |
1878 | 1 | <mn id='nav-40'>2</mn> |
1879 | 1 | <mo id='nav-41'>-</mo> |
1880 | 1 | <mi id='nav-42'>y</mi> |
1881 | 1 | </mrow> |
1882 | 1 | </mtd> |
1883 | 1 | </mtr> |
1884 | 1 | </mtable> |
1885 | 1 | </math>"; |
1886 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1887 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1888 | 1 | let package_instance = package_instance.borrow(); |
1889 | 1 | let mathml = get_element(&package_instance); |
1890 | 1 | test_command("ZoomInAll", mathml, "nav-4"); |
1891 | 1 | test_command("MoveCellNext", mathml, "nav-6"); |
1892 | 1 | test_command("MoveCellNext", mathml, "nav-8"); |
1893 | 1 | test_command("MoveCellNext", mathml, "nav-8"); |
1894 | 1 | test_command("MoveCellDown", mathml, "nav-20"); |
1895 | 1 | test_command("MoveCellDown", mathml, "nav-27"); |
1896 | 1 | let speech = test_command("MoveCellDown", mathml, "nav-39"); |
1897 | 1 | assert_eq!(speech, "move down, row 4, column 3; 2 minus y"); |
1898 | 1 | let speech = test_command("MoveCellDown", mathml, "nav-39"); |
1899 | 1 | assert_eq!(speech, "no next row"); |
1900 | 1 | test_command("MoveCellPrevious", mathml, "nav-35"); |
1901 | 1 | test_command("ZoomIn", mathml, "nav-36"); |
1902 | 1 | test_command("MoveCellUp", mathml, "nav-25"); |
1903 | 1 | test_command("MoveCellUp", mathml, "nav-16"); |
1904 | 1 | test_command("MoveCellUp", mathml, "nav-6"); |
1905 | 1 | test_command("MoveCellUp", mathml, "nav-6"); |
1906 | ||
1907 | 1 | return Ok( () ); |
1908 | 1 | }); |
1909 | 1 | } |
1910 | ||
1911 | #[test] | |
1912 | 1 | fn move_cell_char_mode() -> Result<()> { |
1913 | 1 | let mathml_str = "<math id='nav-0'> |
1914 | 1 | <mtable id='nav-1'> |
1915 | 1 | <mtr id='nav-2'> |
1916 | 1 | <mtd id='nav-3'> <mn id='nav-4'>1</mn></mtd> |
1917 | 1 | <mtd id='nav-5'> <mn id='nav-6'>2</mn></mtd> |
1918 | 1 | <mtd id='nav-7'><mn id='nav-8'>3</mn> </mtd> |
1919 | 1 | </mtr> |
1920 | 1 | <mtr id='nav-9'> |
1921 | 1 | <mtd id='nav-10'> |
1922 | 1 | <mrow id='nav-11'> |
1923 | 1 | <mi id='nav-12'>x</mi> |
1924 | 1 | <mo id='nav-13'>-</mo> |
1925 | 1 | <mi id='nav-14'>y</mi> |
1926 | 1 | </mrow> |
1927 | 1 | </mtd> |
1928 | 1 | <mtd id='nav-15'> |
1929 | 1 | <mfrac id='nav-16'> |
1930 | 1 | <mn id='nav-17'>1</mn> |
1931 | 1 | <mn id='nav-18'>2</mn> |
1932 | 1 | </mfrac> |
1933 | 1 | </mtd> |
1934 | 1 | <mtd id='nav-19'> |
1935 | 1 | <mi id='nav-20'>z</mi> |
1936 | 1 | </mtd> |
1937 | 1 | </mtr> |
1938 | 1 | <mtr id='nav-21'> |
1939 | 1 | <mtd id='nav-22'><mn id='nav-23'>7</mn> </mtd> |
1940 | 1 | <mtd id='nav-24'><mn id='nav-25'>8</mn> </mtd> |
1941 | 1 | <mtd id='nav-26'> <mn id='nav-27'>9</mn></mtd> |
1942 | 1 | </mtr> |
1943 | 1 | <mtr id='nav-28'> |
1944 | 1 | <mtd id='nav-29'> |
1945 | 1 | <mrow id='nav-30'> |
1946 | 1 | <mi id='nav-31'>sin</mi> |
1947 | 1 | <mo id='nav-32'>⁡</mo> |
1948 | 1 | <mi id='nav-33'>x</mi> |
1949 | 1 | </mrow> |
1950 | 1 | </mtd> |
1951 | 1 | <mtd id='nav-34'> |
1952 | 1 | <msup id='nav-35'> |
1953 | 1 | <mi id='nav-36'>e</mi> |
1954 | 1 | <mi id='nav-37'>x</mi> |
1955 | 1 | </msup> |
1956 | 1 | </mtd> |
1957 | 1 | <mtd id='nav-38'> |
1958 | 1 | <mrow id='nav-39'> |
1959 | 1 | <mn id='nav-40'>2</mn> |
1960 | 1 | <mo id='nav-41'>-</mo> |
1961 | 1 | <mi id='nav-42'>y</mi> |
1962 | 1 | </mrow> |
1963 | 1 | </mtd> |
1964 | 1 | </mtr> |
1965 | 1 | </mtable> |
1966 | 1 | </math>"; |
1967 | 1 | init_default_prefs(mathml_str, "Character"); |
1968 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1969 | 1 | let package_instance = package_instance.borrow(); |
1970 | 1 | let mathml = get_element(&package_instance); |
1971 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1972 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1973 | 1 | current_node: "nav-8".to_string(), |
1974 | 1 | current_node_offset: 0 |
1975 | 1 | }, "None") |
1976 | 1 | }); |
1977 | 1 | test_command("MoveNext", mathml, "nav-12"); |
1978 | 1 | test_command("MoveNext", mathml, "nav-13"); |
1979 | 1 | test_command("MoveNext", mathml, "nav-14"); |
1980 | 1 | test_command("MoveNext", mathml, "nav-17"); |
1981 | 1 | test_command("MovePrevious", mathml, "nav-14"); |
1982 | 1 | test_command("MoveCellNext", mathml, "nav-17"); |
1983 | 1 | test_command("MoveCellPrevious", mathml, "nav-14"); |
1984 | 1 | test_command("MovePrevious", mathml, "nav-13"); |
1985 | 1 | test_command("MovePrevious", mathml, "nav-12"); |
1986 | 1 | test_command("MoveCellPrevious", mathml, "nav-12"); |
1987 | 1 | test_command("MovePrevious", mathml, "nav-8"); |
1988 | 1 | test_command("MoveCellDown", mathml, "nav-20"); |
1989 | 1 | test_command("MoveCellDown", mathml, "nav-27"); |
1990 | 1 | test_command("MoveCellDown", mathml, "nav-40"); |
1991 | 1 | test_command("MoveCellDown", mathml, "nav-40"); |
1992 | 1 | test_command("MoveCellPrevious", mathml, "nav-37"); |
1993 | 1 | test_command("MoveCellUp", mathml, "nav-25"); |
1994 | ||
1995 | 1 | return Ok( () ); |
1996 | 1 | }); |
1997 | 1 | } |
1998 | ||
1999 | #[test] | |
2000 | 1 | fn placemarker() -> Result<()> { |
2001 | 1 | let mathml_str = "<math display='block' id='math'> |
2002 | 1 | <mrow displaystyle='true' id='mrow'> |
2003 | 1 | <mi id='a'>a</mi> |
2004 | 1 | <mo id='plus-1'>+</mo> |
2005 | 1 | <mi id='b'>b</mi> |
2006 | 1 | <mo id='plus-2'>+</mo> |
2007 | 1 | <mi id='c'>c</mi> |
2008 | 1 | </mrow> |
2009 | 1 | </math>"; |
2010 | 1 | init_default_prefs(mathml_str, "Character"); |
2011 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2012 | 1 | let package_instance = package_instance.borrow(); |
2013 | 1 | let mathml = get_element(&package_instance); |
2014 | 1 | test_command("MoveStart", mathml, "a"); |
2015 | 1 | test_command("SetPlacemarker0", mathml, "a"); |
2016 | 1 | test_command("MoveEnd", mathml, "c"); |
2017 | 1 | test_command("Read0", mathml, "c"); |
2018 | 1 | test_command("Describe0", mathml, "c"); |
2019 | 1 | test_command("SetPlacemarker1", mathml, "c"); |
2020 | 1 | test_command("MoveTo0", mathml, "a"); |
2021 | 1 | test_command("MoveTo1", mathml, "c"); |
2022 | 1 | test_command("MoveLastLocation", mathml, "a"); |
2023 | ||
2024 | 1 | return Ok( () ); |
2025 | 1 | }); |
2026 | 1 | } |
2027 | ||
2028 | #[test] | |
2029 | 1 | fn where_am_i_all() -> Result<()> { |
2030 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
2031 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
2032 | 1 | <mi id='denom'>d</mi> |
2033 | 1 | </mfrac></math>"; |
2034 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2035 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2036 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2037 | 1 | let package_instance = package_instance.borrow(); |
2038 | 1 | let mathml = get_element(&package_instance); |
2039 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
2040 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
2041 | 1 | current_node: "exp".to_string(), |
2042 | 1 | current_node_offset: 0 |
2043 | 1 | }, "None") |
2044 | 1 | }); |
2045 | // WhereAmIAll doesn't change the stack | |
2046 | 1 | let speech =test_command("WhereAmIAll", mathml, "exp"); |
2047 | // should be 2 "inside" strings corresponding to steps to the root | |
2048 | 1 | assert_eq!(speech, "2; inside; b squared; inside; the fraction with numerator; b squared; and denominator d"); |
2049 | 1 | return Ok( () ); |
2050 | 1 | }); |
2051 | 1 | } |
2052 | ||
2053 | #[test] | |
2054 | 1 | fn auto_zoom_out_mrow() -> Result<()> { |
2055 | 1 | let mathml_str = "<math id='math'> |
2056 | 1 | <mrow id='id-1'> |
2057 | 1 | <mrow id='id-2'> |
2058 | 1 | <mrow id='2ax'> |
2059 | 1 | <mn id='2'>2</mn> |
2060 | 1 | <mo id='id-5'>⁢</mo> |
2061 | 1 | <mi id='a'>a</mi> |
2062 | 1 | <mo id='id-7'>⁢</mo> |
2063 | 1 | <mi id='x'>x</mi> |
2064 | 1 | </mrow> |
2065 | 1 | <mo id='plus'>+</mo> |
2066 | 1 | <mi id='b'>b</mi> |
2067 | 1 | </mrow> |
2068 | 1 | <mo id='equal'>=</mo> |
2069 | 1 | <mn id='10'>10</mn> |
2070 | 1 | </mrow> |
2071 | 1 | </math>"; |
2072 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2073 | 1 | set_preference("AutoZoomOut", "False") |
2074 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2075 | 1 | let package_instance = package_instance.borrow(); |
2076 | 1 | let mathml = get_element(&package_instance); |
2077 | 1 | test_command("ZoomInAll", mathml, "2"); |
2078 | 1 | test_command("MoveNext", mathml, "a"); |
2079 | 1 | test_command("MoveNext", mathml, "x"); |
2080 | 1 | test_command("MoveNext", mathml, "plus"); |
2081 | 1 | test_command("MovePrevious", mathml, "2ax"); |
2082 | 1 | return Ok( () ); |
2083 | 1 | }); |
2084 | 1 | } |
2085 | ||
2086 | #[test] | |
2087 | 1 | fn auto_zoom_out_fraction() -> Result<()> { |
2088 | 1 | let mathml_str = "<math id='math'> |
2089 | 1 | <mrow id='mrow'> |
2090 | 1 | <mfrac id='frac'> |
2091 | 1 | <mrow id='num'><mi id='a'>a</mi><mo id='plus'>+</mo><mn id='1'>1</mn></mrow> |
2092 | 1 | <mrow id='denom'><mn id='2'>2</mn><mo id='invisible-times'>⁢</mo><mi id='b'>b</mi></mrow> |
2093 | 1 | </mfrac> |
2094 | 1 | <mo id='minus'>-</mo> |
2095 | 1 | <mn id='3'>3</mn> |
2096 | 1 | </mrow> |
2097 | 1 | </math>"; |
2098 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2099 | 1 | set_preference("AutoZoomOut", "False") |
2100 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2101 | 1 | let package_instance = package_instance.borrow(); |
2102 | 1 | let mathml = get_element(&package_instance); |
2103 | 1 | test_command("ZoomIn", mathml, "frac"); |
2104 | 1 | test_command("ZoomIn", mathml, "num"); |
2105 | 1 | test_command("MoveNext", mathml, "denom"); |
2106 | 1 | test_command("MoveNext", mathml, "denom"); |
2107 | 1 | test_command("MovePrevious", mathml, "num"); |
2108 | 1 | test_command("MovePrevious", mathml, "num"); |
2109 | 1 | test_command("ZoomOut", mathml, "frac"); |
2110 | 1 | test_command("MoveNext", mathml, "minus"); |
2111 | 1 | return Ok( () ); |
2112 | 1 | }); |
2113 | 1 | } |
2114 | ||
2115 | #[test] | |
2116 | 1 | fn zoom_root() -> Result<()> { |
2117 | 1 | let mathml_str = r#"<math display='block' id='id-0'> |
2118 | 1 | <mrow id='id-1'> |
2119 | 1 | <mo id='id-9'>±</mo> |
2120 | 1 | <msqrt id='id-10'> |
2121 | 1 | <mrow id='id-11'> |
2122 | 1 | <msup id='id-12'> <mi id='id-13'>b</mi> <mn id='id-14'>2</mn> </msup> |
2123 | 1 | <mo id='id-15'>-</mo> |
2124 | 1 | <mn id='id-17'>4</mn> |
2125 | 1 | </mrow> |
2126 | 1 | </msqrt> |
2127 | 1 | </mrow> |
2128 | 1 | </math>"#; |
2129 | ||
2130 | 1 | test_mode(mathml_str, "Enhanced") |
2131 | 1 | test_mode(mathml_str, "Simple") |
2132 | 1 | test_mode(mathml_str, "Character") |
2133 | 1 | return Ok( () ); |
2134 | ||
2135 | 3 | fn test_mode(mathml_str: &str, mode: &str) -> Result<()> { |
2136 | 3 | init_default_prefs(mathml_str, mode); |
2137 | 3 | set_preference("AutoZoomOut", "False") |
2138 | 3 | return MATHML_INSTANCE.with(|package_instance| { |
2139 | 3 | debug!("--- Testing mode {mode} ---"); |
2140 | 3 | let package_instance = package_instance.borrow(); |
2141 | 3 | let mathml = get_element(&package_instance); |
2142 | 3 | test_command("ZoomIn", mathml, "id-9"); |
2143 | 3 | debug!("\nStart zoom in"); |
2144 | 3 | match mode { |
2145 | 3 | "Enhanced" => { |
2146 | 1 | test_command("MoveNext", mathml, "id-10"); |
2147 | 1 | let speech = test_command("ZoomIn", mathml, "id-11"); |
2148 | 1 | assert_eq!(speech, "zoom in; in root; b squared minus 4"); // only one arg, so don't say "in root" |
2149 | 1 | let speech = test_command("ZoomIn", mathml, "id-12"); |
2150 | 1 | assert_eq!(speech, "zoom in; b squared"); // only one arg, so don't say "in root" |
2151 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2152 | 1 | assert_eq!(speech, "zoom in; in base; b"); |
2153 | }, | |
2154 | 2 | "Simple" => { |
2155 | 1 | test_command("MoveNext", mathml, "id-10"); |
2156 | 1 | let speech = test_command("ZoomIn", mathml, "id-12"); |
2157 | 1 | assert_eq!(speech, "zoom in; in root; b squared"); |
2158 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2159 | 1 | assert_eq!(speech, "zoom in; in base; b"); |
2160 | }, | |
2161 | _ => { // "Character" | |
2162 | 1 | let speech = test_command("MoveNext", mathml, "id-13"); |
2163 | 1 | assert_eq!(speech, "move right; in root; in base; b"); |
2164 | } | |
2165 | } | |
2166 | 3 | let squared_speech = if mode == "Character" { |
2167 | 3 | let sqrt_speech = if mode == "Character" { |
2168 | 3 | let speech = test_command("ZoomOut", mathml, "id-12"); |
2169 | 3 | assert_eq!(speech, format!("zoom out; out of base; {squared_speech}")); |
2170 | 3 | let speech = test_command("ZoomOut", mathml, "id-11"); |
2171 | 3 | assert_eq!(speech, format!("zoom out; {squared_speech} minus 4")); |
2172 | 3 | let speech = test_command("ZoomOut", mathml, "id-10"); |
2173 | 3 | assert_eq!(speech, format!("zoom out; out of root; the {sqrt_speech} of {squared_speech} minus 4, end root",)); |
2174 | 3 | return Ok( () ); |
2175 | 3 | }); |
2176 | 3 | } |
2177 | 1 | } |
2178 | ||
2179 | #[test] | |
2180 | 1 | fn matrix_speech() -> Result<()> { |
2181 | 1 | let mathml_str = r#"<math id='math'> |
2182 | 1 | <mrow id='mrow'> |
2183 | 1 | <mo id='open'>[</mo> |
2184 | 1 | <mtable columnspacing='1em' rowspacing='4pt' id='table'> |
2185 | 1 | <mtr id='row-1'> |
2186 | 1 | <mtd id='1-1'><mn id='id-6'>9</mn></mtd> |
2187 | 1 | <mtd id='1-2'><mrow id='id-8'><mo id='id-9'>-</mo><mn id='id-10'>13</mn></mrow></mtd> |
2188 | 1 | </mtr> |
2189 | 1 | <mtr id='row-2'> |
2190 | 1 | <mtd id='2-1'><mn id='id-13'>5</mn></mtd> |
2191 | 1 | <mtd id='2-2'><mo id='id-16'>-</mo><mn id='id-17'>6</mn></mtd> |
2192 | 1 | </mtr> |
2193 | 1 | </mtable> |
2194 | 1 | <mo id='close'>]</mo> |
2195 | 1 | </mrow> |
2196 | 1 | </math>"#; |
2197 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2198 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2199 | 1 | let package_instance = package_instance.borrow(); |
2200 | 1 | let mathml = get_element(&package_instance); |
2201 | 1 | test_command("ZoomIn", mathml, "row-1"); |
2202 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2203 | 1 | assert_eq!(speech, "move right; row 2; 5, negative 6"); |
2204 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2205 | 1 | assert_eq!(speech, "zoom in; column 1; 5"); |
2206 | 1 | let speech = test_command("ZoomOut", mathml, "row-2"); |
2207 | 1 | assert_eq!(speech, "zoom out; row 2; 5, negative 6"); |
2208 | 1 | let speech = test_command("ZoomOut", mathml, "table"); |
2209 | 1 | assert_eq!(speech, "zoom out; the 2 by 2 matrix; row 1; 9, negative 13; row 2; 5, negative 6"); |
2210 | 1 | return Ok( () ); |
2211 | 1 | }); |
2212 | 1 | } |
2213 | ||
2214 | #[test] | |
2215 | 1 | fn chem_speech() -> Result<()> { |
2216 | // this comes from bug 218 | |
2217 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2218 | 1 | <mrow data-chem-formula='5' id='id-1'> |
2219 | 1 | <msub data-chem-formula='1' id='id-2'> |
2220 | 1 | <mi data-chem-element='1' id='id-3'>H</mi> |
2221 | 1 | <mn id='id-4'>2</mn> |
2222 | 1 | </msub> |
2223 | 1 | <mo data-chem-formula-op='0' id='id-5'>⁣</mo> |
2224 | 1 | <mi data-chem-element='1' id='id-6'>S</mi> |
2225 | 1 | <mo data-chem-formula-op='0' id='id-7'>⁣</mo> |
2226 | 1 | <msub data-chem-formula='1' id='id-8'> |
2227 | 1 | <mi data-chem-element='1' id='id-9'>O</mi> |
2228 | 1 | <mn id='id-10'>4</mn> |
2229 | 1 | </msub> |
2230 | 1 | </mrow> |
2231 | 1 | </math>"; |
2232 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2233 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2234 | 1 | let package_instance = package_instance.borrow(); |
2235 | 1 | let mathml = get_element(&package_instance); |
2236 | 1 | test_command("ZoomIn", mathml, "id-2"); |
2237 | 1 | let speech = test_command("MoveNext", mathml, "id-6"); |
2238 | // tables need to check their parent for proper speech | |
2239 | 1 | assert_eq!(speech, "move right; cap s"); |
2240 | 1 | return Ok( () ); |
2241 | 1 | }); |
2242 | 1 | } |
2243 | ||
2244 | #[test] | |
2245 | 1 | fn determinant_speech() -> Result<()> { |
2246 | 1 | let mathml_str = "<math id='math'> |
2247 | 1 | <mrow id='mrow'> |
2248 | 1 | <mo id='open'>|</mo> |
2249 | 1 | <mtable columnspacing='1em' rowspacing='4pt' id='table'> |
2250 | 1 | <mtr id='row-1'> |
2251 | 1 | <mtd id='1-1'><mn id='id-6'>9</mn></mtd> |
2252 | 1 | <mtd id='1-2'><mrow id='id-8'><mo id='id-9'>-</mo><mn id='id-10'>13</mn></mrow></mtd> |
2253 | 1 | </mtr> |
2254 | 1 | <mtr id='row-2'> |
2255 | 1 | <mtd id='2-1'><mn id='id-13'>5</mn></mtd> |
2256 | 1 | <mtd id='2-2'><mrow id='row2-negative'><mo id='id-16'>-</mo><mn id='id-17'>6</mn></mrow></mtd> |
2257 | 1 | </mtr> |
2258 | 1 | </mtable> |
2259 | 1 | <mo id='close'>|</mo> |
2260 | 1 | </mrow> |
2261 | 1 | </math>"; |
2262 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2263 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2264 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2265 | 1 | let package_instance = package_instance.borrow(); |
2266 | 1 | let mathml = get_element(&package_instance); |
2267 | 1 | let speech = test_command("ZoomIn", mathml, "row-1"); |
2268 | 1 | assert_eq!(speech, "zoom in; row 1; 9, negative 13"); |
2269 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2270 | 1 | assert_eq!(speech, "move right; row 2; 5, negative 6"); |
2271 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2272 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2273 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2274 | 1 | assert_eq!(speech, "zoom in; column 1; 5"); |
2275 | 1 | let speech = test_command("MoveNext", mathml, "row2-negative"); |
2276 | 1 | assert_eq!(speech, "move right; column 2, negative 6"); |
2277 | 1 | let speech = test_command("ZoomOutAll", mathml, "table"); |
2278 | 1 | assert_eq!(speech, "zoomed out all of the way; the 2 by 2 determinant; row 1; 9, negative 13; row 2; 5, negative 6"); |
2279 | 1 | return Ok( () ); |
2280 | 1 | }); |
2281 | 1 | } |
2282 | ||
2283 | #[test] | |
2284 | 1 | fn cases_speech() -> Result<()> { |
2285 | 1 | let mathml_str = "<math id='id-0'> |
2286 | 1 | <mrow id='id-1'> |
2287 | 1 | <mo id='open'>{</mo> |
2288 | 1 | <mtable columnalign='left left' columnspacing='1em' displaystyle='false' rowspacing='.2em' id='table'> |
2289 | 1 | <mtr id='row-1'> |
2290 | 1 | <mtd id='id-5'><mrow id='id-6'><mrow id='id-7'><mo id='id-8'>-</mo><mi id='id-9'>x</mi></mrow><mo id='id-10'>,</mo></mrow></mtd> |
2291 | 1 | <mtd id='id-11'><mrow id='id-12'><mrow id='id-13'><mtext id='id-14'>if</mtext><mo id='id-15'>⁢</mo><mi id='id-16'>x</mi></mrow><mo id='id-17'><</mo><mn id='id-18'>0</mn></mrow></mtd> |
2292 | 1 | </mtr> |
2293 | 1 | <mtr id='row-2'> |
2294 | 1 | <mtd id='id-20'><mrow id='id-21'><mrow id='id-22'><mo id='id-23'>+</mo><mi id='id-24'>x</mi></mrow><mo id='id-25'>,</mo></mrow></mtd> |
2295 | 1 | <mtd id='id-26'><mrow id='id-27'><mrow id='id-28'><mtext id='id-29'>if</mtext><mo id='id-30'>⁢</mo><mi id='id-31'>x</mi></mrow><mo id='id-32'>≥</mo><mn id='id-33'>0</mn></mrow></mtd> |
2296 | 1 | </mtr> |
2297 | 1 | </mtable> |
2298 | 1 | </mrow> |
2299 | 1 | </math>"; |
2300 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2301 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2302 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2303 | 1 | let package_instance = package_instance.borrow(); |
2304 | 1 | let mathml = get_element(&package_instance); |
2305 | 1 | test_command("ZoomIn", mathml, "row-1"); |
2306 | 1 | let speech = test_command("MovePrevious", mathml, "row-1"); |
2307 | 1 | assert_eq!(speech, "move left; start of math"); |
2308 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2309 | 1 | assert_eq!(speech, "move right; case 2; positive x comma; if x, is greater than or equal to 0"); |
2310 | 1 | let speech = test_command("ZoomOut", mathml, "table"); |
2311 | 1 | assert_eq!(speech, "zoom out; 2 cases; case 1; negative x comma; if x is less than 0; case 2; positive x comma; if x, is greater than or equal to 0"); |
2312 | 1 | let speech = test_command("ZoomIn", mathml, "row-1"); |
2313 | 1 | assert_eq!(speech, "zoom in; case 1; negative x comma; if x is less than 0"); |
2314 | 1 | set_preference("NavMode", "Character").unwrap(); |
2315 | 1 | let speech = test_command("MovePrevious", mathml, "open"); |
2316 | 1 | assert_eq!(speech, "move left; open brace"); |
2317 | 1 | return Ok( () ); |
2318 | 1 | }); |
2319 | 1 | } |
2320 | ||
2321 | #[test] | |
2322 | 1 | fn base_superscript() -> Result<()> { |
2323 | // bug #217 -- zoom into base of parenthesized script | |
2324 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2325 | 1 | <msup id='id-1'> |
2326 | 1 | <mrow id='id-2'> |
2327 | 1 | <mo stretchy='false' id='id-3'>(</mo> |
2328 | 1 | <mrow id='id-4'> |
2329 | 1 | <mn id='id-5'>2</mn> |
2330 | 1 | <mo id='id-6'>⁢</mo> |
2331 | 1 | <mi id='id-7'>x</mi> |
2332 | 1 | </mrow> |
2333 | 1 | <mo stretchy='false' id='id-8'>)</mo> |
2334 | 1 | </mrow> |
2335 | 1 | <mn id='id-9'>2</mn> |
2336 | 1 | </msup> |
2337 | 1 | </math>"; |
2338 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2339 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2340 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2341 | 1 | let package_instance = package_instance.borrow(); |
2342 | 1 | let mathml = get_element(&package_instance); |
2343 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2344 | 1 | assert_eq!(speech, "zoom in; in base; 2 x"); |
2345 | 1 | let speech = test_command("MoveNext", mathml, "id-9"); |
2346 | 1 | assert_eq!(speech, "move right; in exponent; 2"); |
2347 | 1 | return Ok( () ); |
2348 | 1 | }); |
2349 | 1 | } |
2350 | ||
2351 | #[test] | |
2352 | 1 | fn binomial_intent() -> Result<()> { |
2353 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2354 | 1 | <mrow intent='binomial($n,$k)' id='id-1'> |
2355 | 1 | <mo id='id-2'>(</mo> |
2356 | 1 | <mfrac linethickness='0pt' id='id-3'> |
2357 | 1 | <mi arg='n' id='id-4'>n</mi> |
2358 | 1 | <mi arg='k' id='id-5'>k</mi> |
2359 | 1 | </mfrac> |
2360 | 1 | <mo id='id-6'>)</mo> |
2361 | 1 | </mrow> |
2362 | 1 | </math>"; |
2363 | 1 | init_default_prefs(mathml_str, "Character"); |
2364 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2365 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2366 | 1 | let package_instance = package_instance.borrow(); |
2367 | 1 | let mathml = get_element(&package_instance); |
2368 | 1 | debug!("Character mode"); |
2369 | 1 | let speech = test_command("MoveStart", mathml, "id-2"); |
2370 | 1 | assert_eq!(speech, "move to start of math; open paren"); |
2371 | 1 | let speech = test_command("MoveNext", mathml, "id-4"); |
2372 | // I'm not keen on the use of numerator/denominator here, but character mode turns off intent | |
2373 | 1 | assert_eq!(speech, "move right; in numerator; n"); |
2374 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2375 | 1 | assert_eq!(speech, "move right; in denominator; k"); |
2376 | 1 | debug!("before zoom out"); |
2377 | 1 | let speech = test_command("ZoomOut", mathml, "id-3"); |
2378 | 1 | assert_eq!(speech, "zoom out; out of denominator; n over k"); |
2379 | // let speech = test_command("ZoomOut", mathml, "id-1"); | |
2380 | // assert_eq!(speech, "zoom out; open paren n over k, close paren"); | |
2381 | ||
2382 | 1 | set_preference("NavMode", "Simple").unwrap(); |
2383 | 1 | debug!("Simple mode"); |
2384 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2385 | 1 | assert_eq!(speech, "zoom in; in part 1; n"); |
2386 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2387 | 1 | assert_eq!(speech, "move right; in part 2; k"); |
2388 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2389 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2390 | 1 | let speech = test_command("ZoomOut", mathml, "id-1-literal-0"); |
2391 | 1 | assert_eq!(speech, "zoom out; out of part 2; n choose k"); |
2392 | ||
2393 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
2394 | 1 | debug!("Enhanced mode"); |
2395 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2396 | 1 | assert_eq!(speech, "zoom in; in part 1; n"); |
2397 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2398 | 1 | assert_eq!(speech, "move right; in part 2; k"); |
2399 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2400 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2401 | 1 | let speech = test_command("ZoomOut", mathml, "id-1-literal-0"); |
2402 | 1 | assert_eq!(speech, "zoom out; out of part 2; n choose k"); |
2403 | ||
2404 | 1 | return Ok( () ); |
2405 | 1 | }); |
2406 | 1 | } |
2407 | ||
2408 | #[test] | |
2409 | 1 | fn matrix_literal_intent() -> Result<()> { |
2410 | 1 | let mathml_str = r#"<math display='block' id='id-0'> |
2411 | 1 | <mrow intent='$m' id='id-1'> |
2412 | 1 | <mo id='id-2'>(</mo> |
2413 | 1 | <mtable arg='m' intent='_diagonal:prefix(1,2,3)' id='id-3'> |
2414 | 1 | <mtr id='id-4'> |
2415 | 1 | <mtd id='id-5'><mn id='id-6'>1</mn></mtd> |
2416 | 1 | <mtd id='id-7'><mn id='id-8'>0</mn></mtd> |
2417 | 1 | <mtd id='id-9'><mn id='id-10'>0</mn></mtd> |
2418 | 1 | </mtr> |
2419 | 1 | <mtr id='id-11'> |
2420 | 1 | <mtd id='id-12'><mn id='id-13'>0</mn></mtd> |
2421 | 1 | <mtd id='id-14'><mn id='id-15'>2</mn></mtd> |
2422 | 1 | <mtd id='id-16'><mn id='id-17'>0</mn></mtd> |
2423 | 1 | </mtr> |
2424 | 1 | <mtr id='id-18'> |
2425 | 1 | <mtd id='id-19'><mn id='id-20'>0</mn></mtd> |
2426 | 1 | <mtd id='id-21'><mn id='id-22'>0</mn></mtd> |
2427 | 1 | <mtd id='id-23'><mn id='id-24'>3</mn></mtd> |
2428 | 1 | </mtr> |
2429 | 1 | </mtable> |
2430 | 1 | <mo id='id-25'>)</mo> |
2431 | 1 | </mrow> |
2432 | 1 | </math>"#; |
2433 | 1 | init_default_prefs(mathml_str, "Simple"); |
2434 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2435 | 1 | let package_instance = package_instance.borrow(); |
2436 | 1 | let mathml = get_element(&package_instance); |
2437 | 1 | let speech = test_command("ZoomIn", mathml, "id-3-literal-1"); |
2438 | 1 | assert_eq!(speech, "zoom in; 1"); |
2439 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-2"); |
2440 | 1 | assert_eq!(speech, "move right; 2"); |
2441 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-3"); |
2442 | 1 | assert_eq!(speech, "move right; 3"); |
2443 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-3"); |
2444 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2445 | 1 | let speech = test_command("ZoomOut", mathml, "id-3-literal-0"); |
2446 | 1 | assert_eq!(speech, "zoom out; diagonal 1 2 3"); |
2447 | ||
2448 | 1 | return Ok( () ); |
2449 | 1 | }); |
2450 | 1 | } |
2451 | ||
2452 | #[test] | |
2453 | 1 | fn absolute_value() -> Result<()> { |
2454 | 1 | let mathml_str = "<math id='math'> |
2455 | 1 | <mrow id='expr'> |
2456 | 1 | <mn id='2'>2</mn> |
2457 | 1 | <mrow id='abs'> |
2458 | 1 | <mo id='start'>|</mo> |
2459 | 1 | <mi id='x'>x</mi> |
2460 | 1 | <mo id='end'>|</mo> |
2461 | 1 | </mrow> |
2462 | 1 | </mrow> |
2463 | 1 | </math>"; |
2464 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2465 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2466 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2467 | 1 | let package_instance = package_instance.borrow(); |
2468 | 1 | let mathml = get_element(&package_instance); |
2469 | 1 | let speech = test_command("ZoomIn", mathml, "2"); |
2470 | 1 | assert_eq!(speech, "zoom in; 2"); |
2471 | 1 | let speech = test_command("MoveNext", mathml, "abs"); |
2472 | 1 | assert_eq!(speech, "move right; the absolute value of x"); |
2473 | 1 | let speech = test_command("ZoomIn", mathml, "x"); |
2474 | 1 | assert_eq!(speech, "zoom in; in absolute value; x"); |
2475 | 1 | let speech = test_command("MoveNext", mathml, "x"); |
2476 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2477 | 1 | set_preference("NavMode", "Character").unwrap(); |
2478 | 1 | let speech = test_command("MoveNext", mathml, "end"); |
2479 | 1 | assert_eq!(speech, "move right; vertical line"); |
2480 | 1 | let speech = test_command("MoveLineStart", mathml, "2"); |
2481 | 1 | assert_eq!(speech, "move to start of line; 2"); |
2482 | 1 | let speech = test_command("MoveNext", mathml, "start"); |
2483 | 1 | assert_eq!(speech, "move right; vertical line"); |
2484 | 1 | return Ok( () ); |
2485 | 1 | }); |
2486 | 1 | } |
2487 | ||
2488 | #[test] | |
2489 | 1 | fn read_and_describe_fraction() -> Result<()> { |
2490 | 1 | let mathml_str = "<math id='math'> |
2491 | 1 | <mrow id='mrow'> |
2492 | 1 | <mfrac id='frac'> |
2493 | 1 | <mrow id='numerator'><mi>b</mi><mo>+</mo><mn>1</mn></mrow> |
2494 | 1 | <mn id='denom'>3</mn> |
2495 | 1 | </mfrac> |
2496 | 1 | <mo id='minus'>-</mo> |
2497 | 1 | <mn id='3'>3</mn> |
2498 | 1 | </mrow> |
2499 | 1 | </math>"; |
2500 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2501 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2502 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2503 | 1 | let package_instance = package_instance.borrow(); |
2504 | 1 | let mathml = get_element(&package_instance); |
2505 | 1 | test_command("ZoomIn", mathml, "frac"); |
2506 | 1 | let speech = test_command("ReadCurrent", mathml, "frac"); |
2507 | 1 | assert_eq!(speech, "read current; fraction, b plus 1, over 3, end fraction"); |
2508 | 1 | let speech = test_command("DescribeCurrent", mathml, "frac"); |
2509 | 1 | assert_eq!(speech, "describe current; fraction"); |
2510 | 1 | return Ok( () ); |
2511 | 1 | }); |
2512 | 1 | } |
2513 | ||
2514 | ||
2515 | #[test] | |
2516 | 1 | fn read_and_describe_mrow() -> Result<()> { |
2517 | 1 | let mathml_str = "<math id='math'> |
2518 | 1 | <mrow id='mrow'> |
2519 | 1 | <mn>1</mn><mo>+</mo> |
2520 | 1 | <mn>2</mn><mo>+</mo> |
2521 | 1 | <mn>3</mn><mo>+</mo> |
2522 | 1 | <mn>4</mn><mo>+</mo> |
2523 | 1 | <mn>5</mn><mo>+</mo> |
2524 | 1 | <mn>6</mn><mo>+</mo> |
2525 | 1 | <mn>7</mn> |
2526 | 1 | </mrow> |
2527 | 1 | </math>"; |
2528 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2529 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2530 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2531 | 1 | let package_instance = package_instance.borrow(); |
2532 | 1 | let mathml = get_element(&package_instance); |
2533 | 1 | let speech = test_command("ZoomOutAll", mathml, "mrow"); |
2534 | 1 | assert_eq!(speech, "zoomed out all of the way; 1 plus 2 plus 3 plus 4 plus 5 plus 6 plus 7"); |
2535 | 1 | let speech = test_command("ReadCurrent", mathml, "mrow"); |
2536 | 1 | assert_eq!(speech, "read current; 1 plus 2 plus 3 plus 4 plus 5 plus 6 plus 7"); |
2537 | 1 | let speech = test_command("DescribeCurrent", mathml, "mrow"); |
2538 | 1 | assert_eq!(speech, "describe current; 1 plus 2 plus 3 and so on"); |
2539 | 1 | return Ok( () ); |
2540 | 1 | }); |
2541 | 1 | } |
2542 | ||
2543 | ||
2544 | #[test] | |
2545 | 1 | fn read_next_invisible_char() -> Result<()> { |
2546 | 1 | let mathml_str = "<math id='id-0'> |
2547 | 1 | <mrow id='id-1'> |
2548 | 1 | <mi id='id-2'>x</mi> |
2549 | 1 | <mo id='id-3'>⁢</mo> |
2550 | 1 | <mi id='id-4'>y</mi> |
2551 | 1 | </mrow> |
2552 | 1 | </math>"; |
2553 | 1 | init_default_prefs(mathml_str, "Simple"); |
2554 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2555 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2556 | 1 | let package_instance = package_instance.borrow(); |
2557 | 1 | let mathml = get_element(&package_instance); |
2558 | 1 | let speech = test_command("ZoomIn", mathml, "id-2"); |
2559 | 1 | assert_eq!(speech, "zoom in; x"); |
2560 | 1 | let speech = test_command("ToggleZoomLockUp", mathml, "id-2"); |
2561 | 1 | assert_eq!(speech, "enhanced mode; x"); |
2562 | 1 | let speech = test_command("ReadNext", mathml, "id-2"); |
2563 | 1 | assert_eq!(speech, "read right; y"); |
2564 | 1 | return Ok( () ); |
2565 | 1 | }); |
2566 | 1 | } |
2567 | ||
2568 | ||
2569 | #[test] | |
2570 | 1 | fn basic_language_test() -> Result<()> { |
2571 | // this is basically a sanity check that all the language's navigation.yaml files are at least syntactically correct | |
2572 | // FIX: should look through the Languages dir and figure this is out | |
2573 | 1 | let mathml_str = "<math id='math'> |
2574 | 1 | <mrow id='contents'> |
2575 | 1 | <mrow id='lhs'> |
2576 | 1 | <mrow id='term'> |
2577 | 1 | <mn id='2'>2</mn> |
2578 | 1 | <mo id='invisible-times'>⁢</mo> |
2579 | 1 | <msup id='msup'> |
2580 | 1 | <mi id='x'>x</mi> |
2581 | 1 | <mn id='3'>3</mn> |
2582 | 1 | </msup> |
2583 | 1 | </mrow> |
2584 | 1 | <mo id='plus'>+</mo> |
2585 | 1 | <mn id='1'>1</mn> |
2586 | 1 | </mrow> |
2587 | 1 | <mo id='id-11'>=</mo> |
2588 | 1 | <mi id='id-12'>y</mi> |
2589 | 1 | </mrow> |
2590 | 1 | </math>"; |
2591 | ||
2592 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
2593 | 11 | for lang in |
2594 | 11 | test_language(&lang, mathml_str); |
2595 | 11 | } |
2596 | 1 | return Ok( () ); |
2597 | ||
2598 | 11 | fn test_language(lang: &str, mathml_str: &str) { |
2599 | 11 | init_default_prefs(mathml_str, "Enhanced"); |
2600 | 11 | set_preference("Language", lang).unwrap(); |
2601 | ||
2602 | 11 | set_preference("NavMode", "Enhanced").unwrap(); |
2603 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2604 | 11 | let package_instance = package_instance.borrow(); |
2605 | 11 | let mathml = get_element(&package_instance); |
2606 | 11 | test_command("ZoomInAll", mathml, "2"); |
2607 | 11 | test_command("MoveNext", mathml, "msup"); |
2608 | 11 | test_command("MoveNext", mathml, "plus"); |
2609 | 11 | test_command("MovePrevious", mathml, "term"); |
2610 | 11 | test_command("MovePrevious", mathml, "term"); |
2611 | 11 | test_command("ZoomOutAll", mathml, "contents"); |
2612 | 11 | }); |
2613 | ||
2614 | 11 | set_preference("NavMode", "Simple").unwrap(); |
2615 | 11 | MATHML_INSTANCE.with(|package_instance: &RefCell<Package>| { |
2616 | 11 | let package_instance = package_instance.borrow(); |
2617 | 11 | let mathml = get_element(&package_instance); |
2618 | 11 | test_command("ZoomInAll", mathml, "2"); |
2619 | 11 | test_command("MoveNext", mathml, "msup"); |
2620 | 11 | test_command("MoveNext", mathml, "plus"); |
2621 | 11 | test_command("MovePrevious", mathml, "msup"); |
2622 | 11 | test_command("MovePrevious", mathml, "2"); |
2623 | 11 | test_command("MovePrevious", mathml, "2"); |
2624 | 11 | test_command("ZoomOutAll", mathml, "contents"); |
2625 | 11 | }); |
2626 | ||
2627 | 11 | set_preference("NavMode", "Character").unwrap(); |
2628 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2629 | 11 | let package_instance = package_instance.borrow(); |
2630 | 11 | let mathml = get_element(&package_instance); |
2631 | 11 | test_command("ZoomIn", mathml, "2"); |
2632 | 11 | test_command("MoveNext", mathml, "x"); |
2633 | 11 | test_command("MoveNext", mathml, "3"); |
2634 | 11 | test_command("MoveNext", mathml, "plus"); |
2635 | 11 | test_command("MovePrevious", mathml, "3"); |
2636 | 11 | test_command("MovePrevious", mathml, "x"); |
2637 | 11 | test_command("MovePrevious", mathml, "2"); |
2638 | 11 | test_command("MovePrevious", mathml, "2"); |
2639 | 11 | }); |
2640 | ||
2641 | // simple sanity check that "overview.yaml" doesn't have a syntax error | |
2642 | 11 | set_preference("Overview", "True").unwrap(); |
2643 | 11 | set_preference("NavMode", "Character").unwrap(); |
2644 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2645 | 11 | let package_instance = package_instance.borrow(); |
2646 | 11 | let mathml = get_element(&package_instance); |
2647 | 11 | test_command("ZoomIn", mathml, "2"); |
2648 | 11 | }); |
2649 | 11 | } |
2650 | 1 | } |
2651 | } |
Line | Count | Source |
1 | //! Navigation is controlled by a `Navigation_Rules.yaml` file in conjunction with preferences. | |
2 | //! See preference documentation for more info on navigation preferences. | |
3 | #![allow(clippy::needless_return)] | |
4 | ||
5 | use std::cell::{Ref, RefCell, RefMut}; | |
6 | use sxd_xpath::context::Evaluation; | |
7 | use sxd_xpath::Value; | |
8 | use sxd_document::dom::Element; | |
9 | use sxd_document::Package; | |
10 | ||
11 | use std::fmt; | |
12 | use crate::canonicalize::{name, get_parent}; | |
13 | use crate::pretty_print::mml_to_string; | |
14 | use crate::speech::{NAVIGATION_RULES, CONCAT_INDICATOR, CONCAT_STRING, SpeechRules, SpeechRulesWithContext}; | |
15 | use crate::infer_intent::add_fixity_children; | |
16 | use crate::interface::copy_mathml; | |
17 | #[cfg(not(target_family = "wasm"))] | |
18 | use std::time::Instant; | |
19 | use crate::errors::*; | |
20 | use phf::phf_set; | |
21 | use log::{debug}; | |
22 | ||
23 | pub const ID_OFFSET: &str = "data-id-offset"; | |
24 | ||
25 | const MAX_PLACE_MARKERS: usize = 10; | |
26 | ||
27 | thread_local!{ | |
28 | /// The current set of navigation rules | |
29 | pub static NAVIGATION_STATE: RefCell<NavigationState> = | |
30 | RefCell::new( NavigationState::new() ); | |
31 | } | |
32 | ||
33 | pub static NAV_COMMANDS: phf::Set<&str> = phf_set! { | |
34 | "MovePrevious", "MoveNext", "MoveStart", "MoveEnd", "MoveLineStart", "MoveLineEnd", | |
35 | "MoveCellPrevious", "MoveCellNext", "MoveCellUp", "MoveCellDown", "MoveColumnStart", "MoveColumnEnd", | |
36 | "ZoomIn", "ZoomOut", "ZoomOutAll", "ZoomInAll", | |
37 | "MoveLastLocation", | |
38 | "ReadPrevious", "ReadNext", "ReadCurrent", "ReadCellCurrent", "ReadStart", "ReadEnd", "ReadLineStart", "ReadLineEnd", | |
39 | "DescribePrevious", "DescribeNext", "DescribeCurrent", | |
40 | "WhereAmI", "WhereAmIAll", | |
41 | "ToggleZoomLockUp", "ToggleZoomLockDown", "ToggleSpeakMode", | |
42 | "Exit", | |
43 | "MoveTo0","MoveTo1","MoveTo2","MoveTo3","MoveTo4","MoveTo5","MoveTo6","MoveTo7","MoveTo8","MoveTo9", | |
44 | "Read0","Read1","Read2","Read3","Read4","Read5","Read6","Read7","Read8","Read9", | |
45 | "Describe0","Describe1","Describe2","Describe3","Describe4","Describe5","Describe6","Describe7","Describe8","Describe9", | |
46 | "SetPlacemarker0","SetPlacemarker1","SetPlacemarker2","SetPlacemarker3","SetPlacemarker4","SetPlacemarker5","SetPlacemarker6","SetPlacemarker7","SetPlacemarker8","SetPlacemarker9", | |
47 | }; | |
48 | ||
49 | #[derive(Clone, PartialEq, Debug)] | |
50 | struct NavigationPosition { | |
51 | current_node: String, // id of current node | |
52 | current_node_offset: usize, // for leaves, char offset in leaf (default = 0), otherwise id for artificial intent node | |
53 | } | |
54 | ||
55 | impl fmt::Display for NavigationPosition { | |
56 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
57 | 0 | return write!(f, "{}[+{}]", self.current_node, self.current_node_offset); |
58 | 0 | } |
59 | } | |
60 | ||
61 | const ILLEGAL_NODE_ID: &str = "!not set"; // an illegal 'id' value | |
62 | impl Default for NavigationPosition { | |
63 | 48.6k | fn default() -> Self { |
64 | 48.6k | NavigationPosition { |
65 | 48.6k | current_node: ILLEGAL_NODE_ID.to_string(), |
66 | 48.6k | current_node_offset: 0 |
67 | 48.6k | } |
68 | 48.6k | } |
69 | } | |
70 | ||
71 | ||
72 | #[derive(Debug, Clone)] | |
73 | pub struct NavigationState { | |
74 | // it might be better to use a linked for the stacks, with the first node being the top | |
75 | // these two stacks should be kept in sync. | |
76 | position_stack: Vec<NavigationPosition>, // all positions, so we can go back to them | |
77 | command_stack: Vec<&'static str>, // all commands, so we can undo them | |
78 | place_markers: [NavigationPosition; MAX_PLACE_MARKERS], | |
79 | where_am_i: NavigationPosition, // current 'where am i' location | |
80 | ||
81 | #[cfg(target_family = "wasm")] | |
82 | where_am_i_start_time: usize, // FIX: for web | |
83 | #[cfg(not(target_family = "wasm"))] | |
84 | where_am_i_start_time: Instant, | |
85 | mode: String, // one of "Character", "Simple", or "Enhanced" | |
86 | speak_overview: bool, // true => describe after move; false => (standard) speech rules | |
87 | } | |
88 | ||
89 | impl fmt::Display for NavigationState { | |
90 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
91 | 0 | writeln!(f, "NavigationState{{")?; |
92 | 0 | write!(f, " Position Stack: ")?; |
93 | 0 | for (i, nav_state) in self.position_stack.iter().enumerate() { |
94 | 0 | write!(f, "{}{}", if i==0 {""} else {", "}, nav_state)?; |
95 | } | |
96 | 0 | writeln!(f)?; |
97 | 0 | write!(f, " Command Stack: ")?; |
98 | 0 | for (i, nav_state) in self.command_stack.iter().enumerate() { |
99 | 0 | write!(f, "{}{}", if i==0 {""} else {", "}, *nav_state)?; |
100 | } | |
101 | 0 | writeln!(f)?; |
102 | 0 | writeln!(f, " where_am_i: {}, start_time: {:?}", self.where_am_i, self.where_am_i_start_time)?; |
103 | 0 | writeln!(f, " mode: {}, speak_overview: {}", self.mode, self.speak_overview)?; |
104 | 0 | writeln!(f, "}}")?; |
105 | 0 | return Ok( () ); |
106 | 0 | } |
107 | } | |
108 | ||
109 | impl NavigationState { | |
110 | 3.92k | fn new() -> NavigationState { |
111 | 3.92k | return NavigationState { |
112 | 3.92k | position_stack: Vec::with_capacity(1024), |
113 | 3.92k | command_stack: Vec::with_capacity(1024), |
114 | 3.92k | place_markers: Default::default(), |
115 | 3.92k | where_am_i: NavigationPosition::default(), |
116 | 3.92k | // FIX: figure this out for the web |
117 | 3.92k | #[cfg(target_family = "wasm")] |
118 | 3.92k | where_am_i_start_time: 0, // FIX: for web |
119 | 3.92k | #[cfg(not(target_family = "wasm"))] |
120 | 3.92k | where_am_i_start_time: Instant::now(), // need to give it some value, and "default()" isn't an option |
121 | 3.92k | mode: "".to_string(), // set latter when we have some context |
122 | 3.92k | speak_overview: false, // set latter when we have some context |
123 | 3.92k | }; |
124 | 3.92k | } |
125 | ||
126 | 4.88k | pub fn reset(&mut self) { |
127 | 4.88k | self.position_stack.clear(); |
128 | 4.88k | self.command_stack.clear(); |
129 | 4.88k | self.where_am_i = NavigationPosition::default(); |
130 | 4.88k | self.reset_start_time() |
131 | 4.88k | } |
132 | ||
133 | ||
134 | // defining reset_start_time because of the following message if done inline | |
135 | // attributes on expressions are experimental | |
136 | // see issue #15701 <https://github.com/rust-lang/rust/issues/15701> for more information | |
137 | #[cfg(target_family = "wasm")] | |
138 | fn reset_start_time(&mut self) { | |
139 | self.where_am_i_start_time = 0; | |
140 | } | |
141 | ||
142 | #[cfg(not(target_family = "wasm"))] | |
143 | 4.88k | fn reset_start_time(&mut self) { |
144 | 4.88k | self.where_am_i_start_time = Instant::now(); // need to give it some value, and "default()" isn't an option |
145 | 4.88k | } |
146 | ||
147 | ||
148 | 563 | fn push(&mut self, position: NavigationPosition, command: &'static str) { |
149 | 563 | self.position_stack.push(position); |
150 | 563 | self.command_stack.push(command); |
151 | 563 | } |
152 | ||
153 | 46 | fn pop(&mut self) -> Option<(NavigationPosition, &'static str)> { |
154 | 46 | assert_eq!(self.position_stack.len(), self.command_stack.len()); |
155 | 46 | if self.position_stack.is_empty() { |
156 | 0 | return None; |
157 | } else { | |
158 | 46 | return Some( (self.position_stack.pop().unwrap(), self.command_stack.pop().unwrap()) ); |
159 | } | |
160 | 46 | } |
161 | ||
162 | 2.75k | fn top(&self) -> Option<(&NavigationPosition, &'static str)> { |
163 | 2.75k | if self.position_stack.is_empty() { |
164 | 0 | return None; |
165 | 2.75k | } |
166 | 2.75k | let last = self.position_stack.len()-1; |
167 | 2.75k | return Some( (&self.position_stack[last], self.command_stack[last]) ); |
168 | 2.75k | } |
169 | ||
170 | 0 | pub fn get_navigation_mathml<'a>(&self, mathml: Element<'a>) -> Result<(Element<'a>, usize)> { |
171 | 0 | if self.position_stack.is_empty() { |
172 | 0 | return Ok( (mathml, 0) ); |
173 | } else { | |
174 | 0 | let (position, _) = self.top().unwrap(); |
175 | 0 | return match get_node_by_id(mathml, position) { |
176 | 0 | None => bail!("internal error: id '{}' was not found in mathml:\n{}", |
177 | 0 | position.current_node, mml_to_string(mathml)), |
178 | 0 | Some(found) => Ok( (found, position.current_node_offset) ) |
179 | }; | |
180 | } | |
181 | 0 | } |
182 | ||
183 | 1.09k | pub fn get_navigation_mathml_id(&self, mathml: Element) -> (String, usize) { |
184 | 1.09k | if self.position_stack.is_empty() { |
185 | 47 | return (mathml.attribute_value("id").unwrap().to_string(), 0); |
186 | } else { | |
187 | 1.05k | let (position, _) = self.top().unwrap(); |
188 | 1.05k | return (position.current_node.clone(), position.current_node_offset); |
189 | } | |
190 | 1.09k | } |
191 | ||
192 | 549 | fn init_navigation_context(&self, context: &mut sxd_xpath::Context, command: &'static str, |
193 | 549 | nav_state_top: Option<(&NavigationPosition, &'static str)>) { |
194 | 549 | context.set_variable("NavCommand", command); |
195 | ||
196 | 549 | if command == "WhereAmI" && |
197 | 0 | context.set_variable("NavNode", self.where_am_i.current_node.as_str()); |
198 | 0 | context.set_variable("NavNodeOffset", self.where_am_i.current_node_offset as f64); |
199 | 549 | } else { |
200 | 549 | let position = &self.position_stack[self.position_stack.len()-1]; |
201 | 549 | context.set_variable("NavNode", position.current_node.as_str()); |
202 | 549 | context.set_variable("NavNodeOffset", position.current_node_offset as f64); |
203 | 549 | } |
204 | ||
205 | // get the index from command (e.g., '3' in 'SetPlacemarker3 or MoveTo3' and set 'PlaceMarker' to it's position) | |
206 | 549 | if command.ends_with(|ch: char| ch.is_ascii_digit()) { |
207 | 6 | let index = convert_last_char_to_number(command); |
208 | 6 | let position = &self.place_markers[index]; |
209 | 6 | context.set_variable("PlaceMarkerIndex", index as f64); |
210 | 6 | context.set_variable("PlaceMarker", position.current_node.as_str()); |
211 | 6 | context.set_variable("PlaceMarkerOffset", position.current_node_offset as f64); |
212 | 543 | } |
213 | ||
214 | 549 | context.set_variable("Overview", self.speak_overview); |
215 | 549 | context.set_variable("ReadZoomLevel", (if self.mode == "Enhanced" { |
216 | 549 | context.set_variable("MatchCounter", 0 as f64); |
217 | ||
218 | 549 | if command == "MoveLastLocation" { |
219 | 3 | let previous_command = match nav_state_top { |
220 | 0 | None => "None", |
221 | 3 | Some( (_, previous_command) ) => previous_command, |
222 | }; | |
223 | 3 | context.set_variable("PreviousNavCommand", previous_command); |
224 | 546 | } |
225 | ||
226 | // used by nav rules for speech -- needs an initial value so tests don't fail | |
227 | 549 | context.set_variable("SayCommand", "" ); |
228 | 549 | context.set_variable("Move2D", "" ); |
229 | 549 | context.set_variable("SpeakExpression", true ); // default is to speak the expr after navigation |
230 | 549 | return; |
231 | ||
232 | 6 | fn convert_last_char_to_number(str: &str) -> usize { |
233 | 6 | let last_char = str.as_bytes()[str.len()-1]; |
234 | 6 | assert!( last_char.is_ascii_digit() ); |
235 | 6 | return (last_char - b'0') as usize; |
236 | 6 | } |
237 | 549 | } |
238 | } | |
239 | ||
240 | // convert the last digit of a Placemarker command to an integer | |
241 | 2 | fn convert_last_char_to_number(str: &str) -> usize { |
242 | 2 | let last_char = str.as_bytes()[str.len()-1]; |
243 | 2 | assert!( last_char.is_ascii_digit() ); |
244 | 2 | return (last_char - b'0') as usize; |
245 | 2 | } |
246 | ||
247 | /// Get the node associated with a `NavigationPosition`. | |
248 | /// This can be called on an intent tree | |
249 | 9.18k | fn get_node_by_id<'a>(mathml: Element<'a>, pos: &NavigationPosition) -> Option<Element<'a>> { |
250 | 9.18k | if let Some( |
251 | 9.17k | mathml_id == pos.current_node.as_str() && |
252 | 1.46k | (crate::xpath_functions::is_leaf(mathml) || |
253 | 537 | mathml.attribute_value(ID_OFFSET).unwrap_or("0") == pos.current_node_offset.to_string()) { |
254 | 1.46k | return Some(mathml); |
255 | 7.71k | } |
256 | ||
257 | 10.0k | for child in |
258 | 10.0k | if let Some( |
259 | 7.71k | let Some( |
260 | 4.41k | return Some(found); |
261 | 5.60k | } |
262 | } | |
263 | 3.29k | return None; |
264 | 9.18k | } |
265 | ||
266 | /// Search the mathml for the id and set the navigation node to that id | |
267 | /// Resets the navigation stack | |
268 | 2 | pub fn set_navigation_node_from_id(mathml: Element, id: &str, offset: usize) -> Result<()> { |
269 | 2 | let current_node = id.to_string(); |
270 | 2 | let pos = NavigationPosition { current_node: current_node.clone(), current_node_offset: offset }; |
271 | 2 | let node = get_node_by_id(mathml, &pos); |
272 | 2 | if node.is_some() { |
273 | 2 | return NAVIGATION_STATE.with(|nav_state| { |
274 | 2 | let mut nav_state = nav_state.borrow_mut(); |
275 | 2 | nav_state.reset(); |
276 | 2 | nav_state.push(NavigationPosition{ |
277 | 2 | current_node, |
278 | 2 | current_node_offset: offset |
279 | 2 | }, "None"); |
280 | 2 | return Ok( () ); |
281 | 2 | }) |
282 | } else { | |
283 | 0 | bail!("Id {} not found in MathML {}", id, mml_to_string(mathml)); |
284 | } | |
285 | 2 | } |
286 | ||
287 | /// Get's the Nav Node from the context, with some exceptions such as Toggle commands where it isn't set. | |
288 | /// Note: mathml can be any node. It isn't really used but some Element needs to be part of Evaluate(). | |
289 | 571 | pub fn get_nav_node<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>, start_node: Element<'c>, command: &str, nav_mode: &str) -> Result<String> { |
290 | 571 | let start_id = start_node.attribute_value("id").unwrap_or_default(); |
291 | 571 | if command.starts_with("Toggle") { |
292 | 1 | return Ok( start_id.to_string() ); |
293 | } else { | |
294 | 570 | return context_get_variable(context, var_name, mathml) |
295 | 570 | .with_context(|| |
296 | 0 | command, start_node.attribute_value("id").unwrap_or_default(), nav_mode)); |
297 | } | |
298 | 571 | } |
299 | ||
300 | // FIX: think of a better place to put this, and maybe a better interface | |
301 | /// Note: mathml can be any node. It isn't really used but some Element needs to be part of Evaluate(). | |
302 | /// If the context variable has String, Number, or Boolean xpath value, return it as a string. Otherwise it is an error | |
303 | 4.55k | pub fn context_get_variable<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>) -> Result<String> { |
304 | // This is slightly roundabout because Context doesn't expose a way to get the values. | |
305 | // Instead, we create an "Evaluation", which is just one level of indirection. | |
306 | use sxd_xpath::nodeset::Node; | |
307 | 4.55k | let evaluation = Evaluation::new(context, Node::Element(mathml)); |
308 | 4.55k | return match evaluation.value_of(var_name.into()) { |
309 | 4.55k | Some(value) => match value { |
310 | 1.74k | Value::String(s) => Ok(s.clone()), |
311 | 1.20k | Value::Number(f) => Ok(f.to_string()), |
312 | 1.09k | Value::Boolean(b) => Ok(format!("{b}")), // "true" or "false" |
313 | 509 | Value::Nodeset(nodes) => { |
314 | 509 | if nodes.size() == 1 && |
315 | 509 | let Some(attr) = nodes.document_order_first().unwrap().attribute() { |
316 | 509 | return Ok(attr.value().to_string()); |
317 | 0 | }; |
318 | 0 | let mut error_message = format!("Variable '{var_name}' set somewhere in navigate.yaml is nodeset and not an attribute: "); |
319 | 0 | if nodes.size() == 0 { |
320 | 0 | error_message += &format!("0 nodes (false) -- {} set to non-existent node in\n{}", |
321 | 0 | var_name, mml_to_string(mathml)); |
322 | 0 | } else { |
323 | 0 | let singular = nodes.size()==1; |
324 | 0 | error_message += &format!("{} node{}. {}:", |
325 | 0 | nodes.size(), |
326 | 0 | if singular {""} else {"s"}, |
327 | 0 | if singular {"Node is"} else {"Nodes are"}); |
328 | 0 | nodes.document_order() |
329 | 0 | .iter() |
330 | 0 | .enumerate() |
331 | 0 | .for_each(|(i, node)| { |
332 | 0 | match node { |
333 | 0 | sxd_xpath::nodeset::Node::Element(mathml) => |
334 | 0 | error_message += &format!("#{}:\n{}",i, mml_to_string(*mathml)), |
335 | 0 | _ => error_message += &format!("'{node:?}'"), |
336 | } | |
337 | 0 | }) |
338 | }; | |
339 | 0 | bail!(error_message); |
340 | }, | |
341 | }, | |
342 | 0 | None => bail!("Could not find value for navigation variable '{}'", var_name), |
343 | } | |
344 | 4.55k | } |
345 | ||
346 | /// Wrapper around context_get_variable to get an integer variable | |
347 | 1.70k | fn context_get_int_variable<'c>(context: &sxd_xpath::Context<'c>, var_name: &str, mathml: Element<'c>) -> Result<usize> { |
348 | 1.70k | let value = context_get_variable(context, var_name, mathml) |
349 | 1.70k | return match value.parse::<usize>() { |
350 | 1.70k | Ok(i) => Ok(i), |
351 | 0 | Err(e) => bail!("Could not parse navigation variable '{}' with value '{}' as integer: {}", var_name, value, e), |
352 | } | |
353 | 1.70k | } |
354 | ||
355 | /// Given a key code along with the modifier keys, the current node is moved accordingly (or value reported in some cases).] | |
356 | /// The spoken text for the new current node is returned. | |
357 | 0 | pub fn do_mathml_navigate_key_press(mathml: Element, |
358 | 0 | key: usize, shift_key: bool, control_key: bool, alt_key: bool, meta_key: bool) -> Result<String> { |
359 | 0 | let (command, param) = key_press_to_command_and_param(key, shift_key, control_key, alt_key, meta_key)?; |
360 | 0 | return do_navigate_command_and_param(mathml, command, param); |
361 | 0 | } |
362 | ||
363 | 2 | fn do_navigate_command_and_param(mathml: Element, command: NavigationCommand, param: NavigationParam) -> Result<String> { |
364 | 2 | return do_navigate_command_string(mathml, navigation_command_string(command, param)); |
365 | 2 | } |
366 | ||
367 | 549 | pub fn do_navigate_command_string(mathml: Element, nav_command: &'static str) -> Result<String> { |
368 | // first check to see if nav file has been changed -- don't bother checking in loop below | |
369 | 549 | NAVIGATION_RULES.with(|rules| { |
370 | 549 | rules.borrow_mut().read_files() |
371 | 549 | }) |
372 | ||
373 | 549 | if mathml.children().is_empty() { |
374 | 0 | bail!("MathML has not been set -- can't navigate"); |
375 | 549 | }; |
376 | ||
377 | 549 | return NAVIGATION_STATE.with(|nav_state| { |
378 | 549 | let mut nav_state = nav_state.borrow_mut(); |
379 | // debug!("MathML: {}", mml_to_string(mathml)); | |
380 | 549 | if nav_state.position_stack.is_empty() { |
381 | // initialize to root node | |
382 | 47 | nav_state.push(NavigationPosition{ |
383 | 47 | current_node: mathml.attribute_value("id").unwrap().to_string(), |
384 | 47 | current_node_offset: 0 |
385 | 47 | }, "None") |
386 | 502 | }; |
387 | ||
388 | 549 | return NAVIGATION_RULES.with(|rules| { |
389 | 549 | let rules = rules.borrow(); |
390 | 549 | let new_package = Package::new(); |
391 | 549 | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), "", 0); |
392 | ||
393 | 549 | nav_state.mode = rules.pref_manager.as_ref().borrow().pref_to_string("NavMode"); |
394 | 549 | nav_state.speak_overview = rules.pref_manager.as_ref().borrow().pref_to_string("Overview") == "true"; |
395 | ||
396 | 549 | nav_state.init_navigation_context(rules_with_context.get_context(), nav_command, nav_state.top()); |
397 | ||
398 | // start navigation off at the right node | |
399 | 549 | if nav_command == "MoveLastLocation" { |
400 | 3 | nav_state.pop(); |
401 | 546 | } |
402 | ||
403 | // If no speech happened for some calls, we try the call again (e.g, no speech for invisible times). | |
404 | // To prevent to infinite loop, we limit the number of tries | |
405 | const LOOP_LIMIT: usize = 3; | |
406 | 549 | let mut cumulative_speech = String::with_capacity(120); |
407 | 569 | for loop_count in |
408 | 569 | match apply_navigation_rules(mathml, nav_command, &rules, &mut rules_with_context, &mut nav_state, loop_count) { |
409 | 569 | Ok( (speech, done)) => { |
410 | 569 | cumulative_speech = cumulative_speech + if loop_count==0 { |
411 | 569 | if done { |
412 | 549 | let (tts, rate) = { |
413 | 549 | let prefs = rules.pref_manager.borrow(); |
414 | 549 | (prefs.pref_to_string("TTS"), prefs.pref_to_string("MathRate")) |
415 | 549 | }; |
416 | 549 | if rate != "100" { |
417 | 0 | match tts.as_str() { |
418 | 0 | "SSML" |
419 | 0 | if !cumulative_speech.starts_with("<prosody rate") => { |
420 | 0 | cumulative_speech = format!("<prosody rate='{}%'>{}</prosody>", &rate, &cumulative_speech); |
421 | 0 | } |
422 | 0 | "SAPI5" |
423 | 0 | if !cumulative_speech.starts_with("<rate speed") => { |
424 | 0 | cumulative_speech = format!( |
425 | 0 | "<rate speed='{:.1}'>{}</rate>", |
426 | 0 | 10.0 * (0.01 * rate.parse::<f32>().unwrap_or(100.0)).log(3.0), |
427 | 0 | cumulative_speech |
428 | 0 | ); |
429 | 0 | } |
430 | 0 | _ => (), // do nothing |
431 | } | |
432 | 549 | } |
433 | 549 | return Ok( rules.pref_manager.borrow().get_tts() |
434 | 549 | .merge_pauses(crate::speech::remove_optional_indicators( |
435 | 549 | &cumulative_speech.replace(CONCAT_STRING, "") |
436 | 549 | .replace(CONCAT_INDICATOR, "") |
437 | 549 | ) |
438 | 549 | .trim_start().trim_end_matches([' ', ',', ';'])) ); |
439 | 20 | } |
440 | }, | |
441 | 0 | Err(e) => { |
442 | 0 | return Err(e); |
443 | } | |
444 | } | |
445 | } | |
446 | 0 | bail!("Internal error: Navigation exceeded limit of number of times no speech generated |
447 | when attempting to {} in {} mode start at id={} in this MathML:\n{}.", | |
448 | 0 | nav_command, nav_state.mode, nav_state.top().unwrap().0.current_node, mml_to_string(mathml)); |
449 | 549 | }); |
450 | 549 | }); |
451 | ||
452 | 570 | fn get_start_node<'m>(mathml: Element<'m>, nav_state: &RefMut<NavigationState>) -> Result<Element<'m>> { |
453 | 570 | let element = match nav_state.top() { |
454 | None => { | |
455 | 0 | let nav_position = NavigationPosition { current_node: mathml.attribute_value("id").unwrap().to_string(), current_node_offset: 0 }; |
456 | 0 | get_node_by_id(mathml, &nav_position) |
457 | }, | |
458 | 570 | Some( (position, _) ) => get_node_by_id(mathml, position), |
459 | }; | |
460 | ||
461 | 570 | return match element { |
462 | 569 | Some(node) => Ok(node), |
463 | None => { | |
464 | 1 | bail!("Internal Error: didn't find id/offset '{:?}' while attempting to start navigation. MathML is\n{}", |
465 | 1 | nav_state.top().map(|t| t.0), mml_to_string(mathml)); |
466 | } | |
467 | }; | |
468 | 570 | } |
469 | ||
470 | ||
471 | ||
472 | 569 | fn apply_navigation_rules<'c, 'm:'c>(mathml: Element<'m>, nav_command: &'static str, |
473 | 569 | rules: &Ref<SpeechRules>, rules_with_context: &mut SpeechRulesWithContext<'c, '_, 'm>, nav_state: &mut RefMut<NavigationState>, |
474 | 569 | loop_count: usize) -> Result<(String, bool)> { |
475 | { | |
476 | 569 | let context = rules_with_context.get_context(); |
477 | 569 | context.set_variable("MatchCounter", loop_count as f64); |
478 | 569 | nav_state.mode = context_get_variable(context, "NavMode", mathml) |
479 | } | |
480 | ||
481 | 569 | let mut add_literal = nav_state.mode == "Character"; |
482 | 569 | let (intent, nav_intent) = if add_literal { |
483 | 206 | (mathml, mathml) |
484 | } else { | |
485 | 363 | let intent = crate::speech::intent_from_mathml(mathml, rules_with_context.get_document()) |
486 | 363 | (intent, add_fixity_children(copy_mathml(intent))) |
487 | }; | |
488 | ||
489 | 569 | let mut properties = ""; |
490 | 569 | if add_literal { |
491 | 206 | properties = mathml.attribute_value("data-intent-property").unwrap_or_default(); |
492 | 206 | if properties.contains(":literal:") { |
493 | 0 | add_literal = false; |
494 | 206 | } else { |
495 | 206 | mathml.set_attribute_value("data-intent-property", (":literal:".to_string() + properties).as_str()); |
496 | 206 | }; |
497 | 363 | } |
498 | // we should always find the start node. | |
499 | // however, if we were navigating by character, then switched the NavMode, the intent tree might not have that node in it | |
500 | 569 | let start_node = match get_start_node(nav_intent, nav_state) { |
501 | 568 | Ok(node) => node, |
502 | Err(_) => { | |
503 | // find the node in the other tree (probably mathml) and walk up to find a parent that has an id in both | |
504 | 1 | debug!("Could not find start_node in nav_intent -- trying other_tree"); |
505 | 1 | let other_tree = if nav_state.mode == "Character" { |
506 | 1 | let mut found_node = get_start_node(other_tree, nav_state) |
507 | 2 | while name(found_node) != "math" { |
508 | 2 | found_node = get_parent(found_node); |
509 | // debug!("found_node:\n{}", mml_to_string(found_node)); | |
510 | 2 | let temp_pos = NavigationPosition { |
511 | 2 | current_node: found_node.attribute_value("id").unwrap_or_default().to_string().clone(), |
512 | 2 | current_node_offset: found_node.attribute_value(ID_OFFSET).unwrap_or_default().parse::<usize>().unwrap_or_default(), |
513 | 2 | }; |
514 | 2 | if let Some( |
515 | 1 | found_node = intent_node; |
516 | 1 | break; |
517 | 1 | } |
518 | } | |
519 | 1 | found_node |
520 | } | |
521 | }; | |
522 | ||
523 | // debug!("intent=\n{}", mml_to_string(intent)); | |
524 | // debug!("nav intent=\n{}", mml_to_string(nav_intent)); | |
525 | // debug!("start_node id={}\n{}", nav_state.top().unwrap().0.current_node.as_str(), mml_to_string(start_node)); | |
526 | // if name(start_node) != "math" { | |
527 | // let mut parent= get_parent(start_node); | |
528 | // if name(parent) != "math" { | |
529 | // parent = get_parent(parent); | |
530 | // } | |
531 | // debug!("parent or grandparent of start_node:\n{}", mml_to_string(parent)); | |
532 | // } | |
533 | 569 | let offset = context_get_int_variable(rules_with_context.get_context(), "NavNodeOffset", intent) |
534 | 569 | rules_with_context.set_nav_node_offset(offset); |
535 | 569 | debug!("starting nav_position: {}, start node ={}", |
536 | ||
537 | 569 | let raw_speech_string = rules_with_context.match_pattern::<String>(start_node) |
538 | 569 | .context("Pattern match/replacement failure during math navigation!") |
539 | 569 | let speech = rules.pref_manager.borrow().get_tts() |
540 | 569 | .merge_pauses(crate::speech::remove_optional_indicators( |
541 | 569 | &raw_speech_string.replace(CONCAT_STRING, "") |
542 | 569 | .replace(CONCAT_INDICATOR, "") |
543 | 569 | ) |
544 | 569 | .trim()); |
545 | // debug!("Nav Speech: {}", speech); | |
546 | ||
547 | // FIX: add things that need to do a speech replacement based on some marker for "where am i" and others that loop ([Speak: id])??? | |
548 | // what else needs to be done/set??? | |
549 | ||
550 | // transfer some values that might have been set into the prefs | |
551 | 569 | let offset = context_get_int_variable(rules_with_context.get_context(), "NavNodeOffset", intent) |
552 | 569 | rules_with_context.set_nav_node_offset(offset); |
553 | 569 | let context = rules_with_context.get_context(); |
554 | 569 | nav_state.speak_overview = context_get_variable(context, "Overview", intent) |
555 | 569 | nav_state.mode = context_get_variable(context, "NavMode", intent) |
556 | 569 | rules.pref_manager.as_ref().borrow_mut().set_user_prefs("NavMode", &nav_state.mode) |
557 | ||
558 | 569 | debug!("context value of NavNodeOffset: {:?}", |
559 | 569 | let nav_position = NavigationPosition { |
560 | 569 | current_node: get_nav_node(context, "NavNode", intent, start_node, nav_command, &nav_state.mode) |
561 | 569 | current_node_offset: context_get_int_variable(context, "NavNodeOffset", intent) |
562 | }; | |
563 | ||
564 | // after a command, we either read or describe the new location (part of state) | |
565 | // also some commands are DescribeXXX/ReadXXX, so we need to look at the commands also | |
566 | 569 | let use_read_rules = if nav_command.starts_with("Read") { |
567 | 5 | true |
568 | 564 | } else if nav_command.starts_with("Describe") { |
569 | 3 | false |
570 | } else { | |
571 | 561 | !nav_state.speak_overview |
572 | }; | |
573 | ||
574 | 569 | debug!("after match nav_position: {}", nav_position); |
575 | // push the new location on the stack | |
576 | 569 | if nav_position != NavigationPosition::default() && &nav_position != nav_state.top().unwrap().0 { |
577 | 483 | nav_state.push(nav_position.clone(), nav_command); |
578 | 483 |
|
579 | ||
580 | 569 | if nav_command.starts_with("SetPlacemarker") { |
581 | 2 | let new_node_id = get_nav_node(context, "NavNode", intent, start_node, nav_command, &nav_state.mode) |
582 | 2 | nav_state.place_markers[convert_last_char_to_number(nav_command)] = NavigationPosition{ |
583 | 2 | current_node: new_node_id, |
584 | 2 | current_node_offset: context_get_int_variable(context, "NavNodeOffset", intent) |
585 | } | |
586 | 567 | } |
587 | ||
588 | 569 | let nav_mathml = get_node_by_id(intent, &nav_position); |
589 | 569 | if nav_mathml.is_some() && context_get_variable(context, "SpeakExpression", intent) |
590 | // Speak/Overview of where we landed (if we are supposed to speak it) -- use intent, not nav_intent | |
591 | // Note: NavMode might have changed, so we need to recheck the mode to see if we use LiteralSpeak | |
592 | 519 | let literal_speak = nav_state.mode == "Character"; |
593 | 519 | let node_speech_result = speak(mathml, intent, &nav_position, literal_speak, use_read_rules); |
594 | 519 | remove_literal_property(mathml, add_literal, properties); |
595 | 519 | let node_speech = match node_speech_result { |
596 | 519 | Ok(speech) => speech, |
597 | 0 | Err(e) => { |
598 | 0 | if e.to_string() == crate::speech::NAV_NODE_SPEECH_NOT_FOUND { |
599 | 0 | bail!("Internal error: With {}/{} in {} mode, can't {} from expression with id '{}' inside:\n{}", |
600 | 0 | rules.pref_manager.as_ref().borrow().pref_to_string("Language"), |
601 | 0 | rules.pref_manager.as_ref().borrow().pref_to_string("SpeechStyle"), |
602 | 0 | &nav_state.mode, nav_command, &nav_position.current_node, mml_to_string(if literal_speak {mathml} else {intent})); |
603 | 0 | } |
604 | 0 | return Err(e); |
605 | } | |
606 | }; | |
607 | ||
608 | // debug!("node_speech: '{}', speech: '{}'\n", node_speech, speech); | |
609 | 519 | if node_speech.is_empty() { |
610 | // try again in loop | |
611 | 20 | return Ok( (speech, false)); |
612 | } else { | |
613 | 499 | pop_stack(nav_state, loop_count, nav_command); |
614 | // debug!("returning: '{}'", speech.clone() + " " + &node_speech); | |
615 | 499 | return Ok( (speech + " " + &node_speech, true) ); |
616 | } | |
617 | } else { | |
618 | 50 | remove_literal_property(mathml, add_literal, properties); |
619 | 50 | pop_stack(nav_state, loop_count, nav_command); |
620 | 50 | return Ok( (speech, true) ); |
621 | }; | |
622 | ||
623 | 569 | fn remove_literal_property(mathml: Element, add_literal: bool, properties: &str) { |
624 | 569 | if add_literal { |
625 | 206 | if properties.is_empty() { |
626 | 206 | mathml.remove_attribute("data-intent-property"); |
627 | 206 | } else { |
628 | 0 | mathml.set_attribute_value("data-intent-property", properties); |
629 | 0 | } |
630 | 363 | } |
631 | 569 | } |
632 | ||
633 | 569 | } |
634 | ||
635 | ||
636 | 549 | fn pop_stack(nav_state: &mut NavigationState, count: usize, nav_command: &'static str) { |
637 | // save the final state and pop the intermediate states that did nothing | |
638 | 549 | let push_command_on_stack = (nav_command.starts_with("Move") && |
639 | // debug!("pop_stack: nav_command={}, count={}, push? {} stack=\n{}", nav_command, count, push_command_on_stack, nav_state); | |
640 | 549 | if count == 0 { |
641 | 529 | if !push_command_on_stack && |
642 | 3 | nav_state.pop(); // remove ReadXXX, SetPlacemarker, etc. commands that don't change the state |
643 | 526 | } |
644 | 529 | return; |
645 | 20 | } |
646 | 20 | let (top_position, top_command) = nav_state.pop().unwrap(); |
647 | 20 | let mut count = count - 1; |
648 | loop { | |
649 | // debug!(" ... loop count={}", count); | |
650 | 20 | nav_state.pop(); |
651 | 20 | if count == 0 { |
652 | 20 | break; |
653 | 0 | }; |
654 | 0 | count -= 1; |
655 | }; | |
656 | 20 | if push_command_on_stack { |
657 | 19 | nav_state.push(top_position, top_command); |
658 | 19 |
|
659 | // debug!("END pop_stack: stack=\n{}", nav_state); | |
660 | 549 | } |
661 | 549 | } |
662 | ||
663 | /// Speak the intent tree at the nav_node_id if that id exists in the intent tree; otherwise use the mathml tree. | |
664 | /// If full_read is true, we speak the tree, otherwise we use the overview rules. | |
665 | /// If literal_speak is true, we use the literal speak rules (and use the mathml tree). | |
666 | 519 | fn speak(mathml: Element, intent: Element, nav_position: &NavigationPosition, literal_speak: bool, full_read: bool) -> Result<String> { |
667 | 519 | if full_read { |
668 | // In something like x^3, we might be looking for the '3', but it will be "cubed", so we don't find it. | |
669 | // Or we might be on a "(" surrounding a matrix and that isn't part of the intent | |
670 | // We are probably safer in terms of getting the same speech if we retry intent starting at the nav node, | |
671 | // but the node to speak is almost certainly trivial. | |
672 | // By speaking the non-intent tree, we are certain to speak on the next try | |
673 | 505 | if !literal_speak && |
674 | // debug!("speak: nav_node_id={}, intent=\n{}", nav_node_id, mml_to_string(intent)); | |
675 | 327 | match crate::speech::speak_mathml(intent, &nav_position.current_node, nav_position.current_node_offset) { |
676 | 326 | Ok(speech) => return Ok(speech), |
677 | 1 | Err(e) => { |
678 | 1 | if e.to_string() != crate::speech::NAV_NODE_SPEECH_NOT_FOUND { |
679 | 0 | return Err(e); |
680 | 1 | } |
681 | // else could be something like '3' in 'x^3' ("cubed") | |
682 | }, | |
683 | } | |
684 | 178 | } |
685 | // debug!("speak (literal): nav_node_id={}, mathml=\n{}", nav_node_id, mml_to_string(mathml)); | |
686 | 179 | let speech = crate::speech::speak_mathml(mathml, |
687 | 179 | &nav_position.current_node, nav_position.current_node_offset); |
688 | // debug!("speech from speak: {:?}", speech); | |
689 | 179 | return speech; |
690 | } else { | |
691 | 14 | return crate::speech::overview_mathml(mathml, &nav_position.current_node, nav_position.current_node_offset); |
692 | } | |
693 | 519 | } |
694 | ||
695 | ||
696 | // MathPlayer's interface mentions these, so we keep them. | |
697 | // These (KeyboardEvent.keyCode) are consistent across platforms (mostly?) but are deprecated. | |
698 | // KeyboardEvent.code is recommended instead (a string) | |
699 | const VK_LEFT: usize = 0x25; | |
700 | const VK_RIGHT: usize = 0x27; | |
701 | const VK_UP: usize = 0x26; | |
702 | const VK_DOWN: usize = 0x28; | |
703 | const VK_RETURN: usize = 0x0D; | |
704 | const VK_SPACE: usize = 0x20; | |
705 | const VK_HOME: usize = 0x24; | |
706 | const VK_END: usize = 0x23; | |
707 | const VK_BACK: usize = 0x08; | |
708 | const VK_ESCAPE: usize = 0x1B; | |
709 | ||
710 | // Utilities that returns one of four commands/params based on shift/control key combinations | |
711 | ||
712 | enum NavigationCommand { | |
713 | Move, | |
714 | Zoom, | |
715 | MoveLastLocation, | |
716 | Read, | |
717 | Describe, | |
718 | ReadTo, | |
719 | Locate, | |
720 | ChangeNavMode, | |
721 | ToggleSpeakMode, | |
722 | SetPlacemarker, | |
723 | Exit, | |
724 | Last, | |
725 | } | |
726 | ||
727 | #[derive(PartialEq, PartialOrd, Clone, Copy)] | |
728 | enum NavigationParam { | |
729 | Placemarker0, | |
730 | Placemarker1, | |
731 | Placemarker2, | |
732 | Placemarker3, | |
733 | Placemarker4, | |
734 | Placemarker5, | |
735 | Placemarker6, | |
736 | Placemarker7, | |
737 | Placemarker8, | |
738 | Placemarker9, | |
739 | Previous, | |
740 | Current, | |
741 | Next, | |
742 | Start, | |
743 | End, | |
744 | LineStart, | |
745 | LineEnd, | |
746 | CellPrevious, | |
747 | CellCurrent, | |
748 | CellNext, | |
749 | ColStart, | |
750 | ColEnd, | |
751 | CellUp, | |
752 | CellDown, | |
753 | Last | |
754 | } | |
755 | ||
756 | ||
757 | 0 | fn choose_command( |
758 | 0 | shift_key: bool, |
759 | 0 | control_key: bool, |
760 | 0 | none: NavigationCommand, |
761 | 0 | shift: NavigationCommand, |
762 | 0 | control: NavigationCommand, |
763 | 0 | shift_control: NavigationCommand |
764 | 0 | ) -> NavigationCommand { |
765 | 0 | if shift_key && control_key { |
766 | 0 | return shift_control; |
767 | 0 | } else if control_key { |
768 | 0 | return control; |
769 | 0 | } else if shift_key { |
770 | 0 | return shift; |
771 | } else { | |
772 | 0 | return none; |
773 | } | |
774 | 0 | } |
775 | ||
776 | 0 | fn choose_param( |
777 | 0 | shift_key: bool, |
778 | 0 | control_key: bool, |
779 | 0 | none: NavigationParam, |
780 | 0 | shift: NavigationParam, |
781 | 0 | control: NavigationParam, |
782 | 0 | shift_control: NavigationParam |
783 | 0 | ) -> NavigationParam { |
784 | 0 | if shift_key && control_key { |
785 | 0 | return shift_control; |
786 | 0 | } else if control_key { |
787 | 0 | return control; |
788 | 0 | } else if shift_key { |
789 | 0 | return shift; |
790 | } else { | |
791 | 0 | return none; |
792 | } | |
793 | 0 | } |
794 | ||
795 | 0 | fn key_press_to_command_and_param( |
796 | 0 | key: usize, |
797 | 0 | shift_key: bool, |
798 | 0 | control_key: bool, |
799 | 0 | alt_key: bool, |
800 | 0 | meta_key: bool, |
801 | 0 | ) -> Result<(NavigationCommand, NavigationParam)> { |
802 | // key press mapping should probably be stored externally (registry) with an app that allows changes | |
803 | // for now, we build in the defaults | |
804 | ||
805 | // this is a hack to map alt+ctl+arrow to ctl+arrow to change table mappings (github.com/NSoiffer/MathCAT/issues/105) | |
806 | // if this change sticks, choose_command() needs to be changed and this hack should go away | |
807 | 0 | let mut alt_key = alt_key; |
808 | 0 | if alt_key && control_key && [VK_LEFT, VK_RIGHT, VK_UP, VK_DOWN].contains(&key) { |
809 | 0 | alt_key = false; |
810 | 0 | } |
811 | 0 | if alt_key || meta_key { |
812 | 0 | bail!("Invalid argument to key_press_to_command_and_param"); |
813 | 0 | } |
814 | ||
815 | let command; | |
816 | let param; | |
817 | 0 | match key { |
818 | 0 | VK_LEFT => { |
819 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::Move, NavigationCommand::Describe); |
820 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous, NavigationParam::Previous, NavigationParam::CellPrevious, NavigationParam::Previous); |
821 | 0 | }, |
822 | 0 | VK_RIGHT => { |
823 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::Move, NavigationCommand::Describe); |
824 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Next, NavigationParam::Next, NavigationParam::CellNext, NavigationParam::Next); |
825 | 0 | }, |
826 | 0 | VK_UP => { |
827 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Zoom, NavigationCommand::ChangeNavMode, NavigationCommand::Move, NavigationCommand::Zoom); |
828 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous, NavigationParam::Previous, NavigationParam::CellUp, NavigationParam::Start); |
829 | 0 | }, |
830 | 0 | VK_DOWN => { |
831 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Zoom, NavigationCommand::ChangeNavMode, NavigationCommand::Move, NavigationCommand::Zoom); |
832 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Next, NavigationParam::Next, NavigationParam::CellDown, NavigationParam::End); |
833 | 0 | }, |
834 | 0 | VK_RETURN => { |
835 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Locate, NavigationCommand::Last, NavigationCommand::Locate, NavigationCommand::Last); |
836 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Previous,NavigationParam::Last, NavigationParam::Last, NavigationParam::Last); |
837 | 0 | }, |
838 | 0 | VK_SPACE => { |
839 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Read, NavigationCommand::ToggleSpeakMode, NavigationCommand::Read, NavigationCommand::Describe); |
840 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Current, NavigationParam::Last, NavigationParam::CellCurrent, NavigationParam::Current); |
841 | 0 | }, |
842 | ||
843 | 0 | VK_HOME => { |
844 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::ReadTo); |
845 | 0 | param = choose_param( shift_key, control_key, NavigationParam::Start,NavigationParam::ColStart, NavigationParam::LineStart, NavigationParam::Start); |
846 | 0 | }, |
847 | 0 | VK_END => { |
848 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::Move, NavigationCommand::ReadTo); |
849 | 0 | param = choose_param( shift_key, control_key, NavigationParam::End, NavigationParam::ColEnd, NavigationParam::LineEnd, NavigationParam::End); |
850 | 0 | }, |
851 | 0 | VK_BACK => { |
852 | 0 | command = NavigationCommand::MoveLastLocation; |
853 | 0 | param = NavigationParam::Last; |
854 | 0 | }, |
855 | 0 | VK_ESCAPE => { |
856 | 0 | command = NavigationCommand::Exit; |
857 | 0 | param = NavigationParam::Last; |
858 | 0 | }, |
859 | 0 | 0x30..=0x39 => { // '0' ... '9' |
860 | 0 | command = choose_command(shift_key, control_key, NavigationCommand::Move, NavigationCommand::Read, NavigationCommand::SetPlacemarker, NavigationCommand::Describe); |
861 | static PLACE_MARKER: &[NavigationParam] = &[ | |
862 | NavigationParam::Placemarker0, | |
863 | NavigationParam::Placemarker1, | |
864 | NavigationParam::Placemarker2, | |
865 | NavigationParam::Placemarker3, | |
866 | NavigationParam::Placemarker4, | |
867 | NavigationParam::Placemarker5, | |
868 | NavigationParam::Placemarker6, | |
869 | NavigationParam::Placemarker7, | |
870 | NavigationParam::Placemarker8, | |
871 | NavigationParam::Placemarker9, | |
872 | ]; | |
873 | 0 | param = PLACE_MARKER[key-0x30]; |
874 | }, | |
875 | 0 | _ => bail!("Unknown key press/command"), |
876 | }; | |
877 | ||
878 | 0 | return Ok( (command, param) ); |
879 | 0 | } |
880 | ||
881 | // translate the key presses into commands | |
882 | ||
883 | ||
884 | 2 | fn navigation_command_string(command: NavigationCommand, param: NavigationParam) -> &'static str { |
885 | 2 | match command { |
886 | NavigationCommand::Move => { | |
887 | 1 | return match param { |
888 | 0 | NavigationParam::Previous => "MovePrevious", |
889 | 0 | NavigationParam::Next => "MoveNext", |
890 | 1 | NavigationParam::Start => "MoveStart", |
891 | 0 | NavigationParam::End => "MoveEnd", |
892 | 0 | NavigationParam::LineStart => "MoveLineStart", |
893 | 0 | NavigationParam::LineEnd => "MoveLineEnd", |
894 | 0 | NavigationParam::CellPrevious => "MoveCellPrevious", |
895 | 0 | NavigationParam::CellNext => "MoveCellNext", |
896 | 0 | NavigationParam::CellUp => "MoveCellUp", |
897 | 0 | NavigationParam::CellDown => "MoveCellDown", |
898 | 0 | NavigationParam::ColStart => "MoveColumnStart", |
899 | 0 | NavigationParam::ColEnd => "MoveColumnEnd", |
900 | _ => { | |
901 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
902 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Move"); |
903 | 0 | } |
904 | static MOVE_TO: &[&str] = &["MoveTo0","MoveTo1","MoveTo2","MoveTo3","MoveTo4","MoveTo5","MoveTo6","MoveTo7","MoveTo8","MoveTo9"]; | |
905 | 0 | return MOVE_TO[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
906 | } | |
907 | } | |
908 | }, | |
909 | NavigationCommand::Zoom => { | |
910 | 1 | return match param { |
911 | 0 | NavigationParam::Next => "ZoomIn", |
912 | 1 | NavigationParam::Previous => "ZoomOut", |
913 | 0 | NavigationParam::Start => "ZoomOutAll", |
914 | 0 | NavigationParam::End => "ZoomInAll", |
915 | 0 | _ => panic!("Illegal param for NavigationCommand::Zoom"), |
916 | } | |
917 | }, | |
918 | NavigationCommand::MoveLastLocation => { | |
919 | 0 | return "MoveLastLocation"; |
920 | }, | |
921 | NavigationCommand::Read => { | |
922 | 0 | return match param { |
923 | 0 | NavigationParam::Previous => "ReadPrevious", |
924 | 0 | NavigationParam::Next => "ReadNext", |
925 | 0 | NavigationParam::Current => "ReadCurrent", |
926 | 0 | NavigationParam::CellCurrent => "ReadCellCurrent", |
927 | 0 | NavigationParam::Start => "ReadStart", |
928 | 0 | NavigationParam::End => "ReadEnd", |
929 | 0 | NavigationParam::LineStart => "ReadLineStart", |
930 | 0 | NavigationParam::LineEnd => "ReadLineEnd", |
931 | _ => { | |
932 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
933 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Move"); |
934 | 0 | } |
935 | static READ_PLACE_MARKERS: &[&str] = &["Read0","Read1","Read2","Read3","Read4","Read5","Read6","Read7","Read8","Read9"]; | |
936 | 0 | return READ_PLACE_MARKERS[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
937 | }, | |
938 | } | |
939 | }, | |
940 | NavigationCommand::Describe => { | |
941 | 0 | return match param { |
942 | 0 | NavigationParam::Previous => "DescribePrevious", |
943 | 0 | NavigationParam::Next => "DescribeNext", |
944 | 0 | NavigationParam::Current => "DescribeCurrent", |
945 | _ => { | |
946 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
947 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::Describe"); |
948 | 0 | } |
949 | static DESCRIBE_PLACE_MARKERS: &[&str] = &["Describe0","Describe1","Describe2","Describe3","Describe4","Describe5","Describe6","Describe7","Describe8","Describe9"]; | |
950 | 0 | return DESCRIBE_PLACE_MARKERS[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
951 | } | |
952 | } | |
953 | }, | |
954 | NavigationCommand::ReadTo => { | |
955 | 0 | todo!("ReadTo navigation command") |
956 | }, | |
957 | NavigationCommand::Locate => { | |
958 | 0 | if param ==NavigationParam::Previous { |
959 | 0 | return "WhereAmI"; |
960 | 0 | } else if param ==NavigationParam::Last { |
961 | 0 | return "WhereAmIAll"; |
962 | 0 | } |
963 | }, | |
964 | NavigationCommand::ChangeNavMode => { | |
965 | 0 | if param ==NavigationParam::Previous { |
966 | 0 | return "ToggleZoomLockUp"; |
967 | 0 | } else if param ==NavigationParam::Next { |
968 | 0 | return "ToggleZoomLockDown"; |
969 | 0 | } |
970 | }, | |
971 | NavigationCommand::ToggleSpeakMode => { | |
972 | 0 | return "ToggleSpeakMode"; |
973 | }, | |
974 | NavigationCommand::SetPlacemarker => { | |
975 | 0 | if param < NavigationParam::Placemarker0 || param > NavigationParam::Placemarker9 { |
976 | 0 | panic!("Internal Error: Found illegal value for param of NavigationCommand::SetPlacemarker"); |
977 | 0 | } |
978 | static SET_PLACE_MARKER: &[&str] = &["SetPlacemarker0","SetPlacemarker1","SetPlacemarker2","SetPlacemarker3","SetPlacemarker4","SetPlacemarker5","SetPlacemarker6","SetPlacemarker7","SetPlacemarker8","SetPlacemarker9"]; | |
979 | 0 | return SET_PLACE_MARKER[(param as usize) - (NavigationParam::Placemarker0 as usize)]; |
980 | }, | |
981 | NavigationCommand::Exit => { | |
982 | 0 | return "Exit"; |
983 | }, | |
984 | NavigationCommand::Last => { | |
985 | 0 | return "Error"; |
986 | } | |
987 | }; | |
988 | 0 | return "Error"; |
989 | 2 | } |
990 | ||
991 | #[cfg(test)] | |
992 | mod tests { | |
993 | use super::*; | |
994 | #[allow(unused_imports)] | |
995 | use crate::init_logger; | |
996 | use crate::interface::*; | |
997 | ||
998 | #[cfg(test)] | |
999 | /// Assert if result_id != '' and it doesn't match the id of the result of the move | |
1000 | /// Returns the speech from the command | |
1001 | 547 | fn test_command(command: &'static str, mathml: Element, result_id: &str) -> String { |
1002 | // debug!("\nCommand: {}", command); | |
1003 | 547 | NAVIGATION_STATE.with(|nav_stack| { |
1004 | 547 | let (start_id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1005 | 547 | match do_navigate_command_string(mathml, command) { |
1006 | 0 | Err(e) => { |
1007 | 0 | panic!("\nStarting at '{}', '{} failed.\n{}", |
1008 | 0 | start_id, command, &crate::interface::errors_to_string(&e)) |
1009 | }, | |
1010 | 547 | Ok(nav_speech) => { |
1011 | 547 | let nav_speech = nav_speech.trim_end_matches(&[' ', ',', ';']); |
1012 | // debug!("Full speech: {}", nav_speech); | |
1013 | 547 | if !result_id.is_empty() { |
1014 | 547 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1015 | 547 | assert_eq!(result_id, id, "\nStarting at '{}', '{} failed.", start_id, command); |
1016 | 0 | } |
1017 | 547 | return nav_speech.to_string(); |
1018 | } | |
1019 | }; | |
1020 | 547 | }) |
1021 | 547 | } |
1022 | ||
1023 | 56 | fn init_default_prefs(mathml: &str, nav_mode_default: &str) { |
1024 | 56 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1025 | 56 | set_preference("NavMode", nav_mode_default).unwrap(); |
1026 | 56 | set_preference("NavVerbosity", "Verbose").unwrap(); |
1027 | 56 | set_preference("AutoZoomOut", "True").unwrap(); |
1028 | 56 | set_preference("Language", "en").unwrap(); |
1029 | 56 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
1030 | 56 | set_preference("Verbosity", "Medium").unwrap(); |
1031 | 56 | set_preference("Overview", "False").unwrap(); |
1032 | 56 | set_mathml(mathml).unwrap(); |
1033 | 56 | } |
1034 | ||
1035 | #[test] | |
1036 | 1 | fn zoom_in() -> Result<()> { |
1037 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1038 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1039 | 1 | <mi id='denom'>d</mi> |
1040 | 1 | </mfrac></math>"; |
1041 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1042 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1043 | 1 | let package_instance = package_instance.borrow(); |
1044 | 1 | let mathml = get_element(&package_instance); |
1045 | 1 | test_command("ZoomIn", mathml, "msup"); |
1046 | 1 | test_command("ZoomIn", mathml, "base"); |
1047 | 1 | test_command("ZoomIn", mathml, "base"); |
1048 | 1 | return Ok( () ); |
1049 | 1 | }); |
1050 | 1 | } |
1051 | ||
1052 | #[test] | |
1053 | 1 | fn test_init_navigate_move_right() -> Result<()> { |
1054 | // this is how navigation typically starts up | |
1055 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1056 | 1 | <mrow id='id-1'> |
1057 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1058 | 1 | <mo id='id-3'>=</mo> |
1059 | 1 | <mrow id='id-4'> |
1060 | 1 | <mi id='id-5'>a</mi> |
1061 | 1 | <mo id='id-6'>-</mo> |
1062 | 1 | <mn id='id-7'>2</mn> |
1063 | 1 | </mrow> |
1064 | 1 | </mrow> |
1065 | 1 | </math>"; |
1066 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1067 | 1 | debug!("--- Enhanced ---"); |
1068 | 1 | MATHML_INSTANCE.with(|package_instance| { |
1069 | 1 | let package_instance = package_instance.borrow(); |
1070 | 1 | let mathml = get_element(&package_instance); |
1071 | 1 | test_command("ZoomIn", mathml, "msup"); |
1072 | 1 | test_command("MoveNext", mathml, "id-3"); |
1073 | 1 | }); |
1074 | ||
1075 | 1 | init_default_prefs(mathml_str, "Simple"); |
1076 | 1 | debug!("--- Simple ---"); |
1077 | 1 | MATHML_INSTANCE.with(|package_instance: &RefCell<Package>| { |
1078 | 1 | let package_instance = package_instance.borrow(); |
1079 | 1 | let mathml = get_element(&package_instance); |
1080 | 1 | test_command("ZoomIn", mathml, "msup"); |
1081 | 1 | test_command("MoveNext", mathml, "id-3"); |
1082 | 1 | }); |
1083 | ||
1084 | 1 | init_default_prefs(mathml_str, "Character"); |
1085 | 1 | debug!("--- Character ---"); |
1086 | 1 | MATHML_INSTANCE.with(|package_instance| { |
1087 | 1 | let package_instance = package_instance.borrow(); |
1088 | 1 | let mathml = get_element(&package_instance); |
1089 | 1 | test_command("ZoomIn", mathml, "base"); |
1090 | 1 | test_command("MoveNext", mathml, "exp"); |
1091 | 1 | }); |
1092 | 1 | return Ok( () ); |
1093 | 1 | } |
1094 | ||
1095 | #[test] | |
1096 | 1 | fn zoom_in_parens() -> Result<()> { |
1097 | // (a+b)(c+d) + 1 | |
1098 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1099 | 1 | <mrow id='id-1'> |
1100 | 1 | <mrow id='id-2'> |
1101 | 1 | <mrow id='id-3'> |
1102 | 1 | <mo stretchy='false' id='id-4'>(</mo> |
1103 | 1 | <mrow id='id-5'> |
1104 | 1 | <mi id='id-6'>a</mi> |
1105 | 1 | <mo id='id-7'>+</mo> |
1106 | 1 | <mi id='id-8'>b</mi> |
1107 | 1 | </mrow> |
1108 | 1 | <mo stretchy='false' id='id-9'>)</mo> |
1109 | 1 | </mrow> |
1110 | 1 | <mo id='id-10'>⁢</mo> |
1111 | 1 | <mrow id='id-11'> |
1112 | 1 | <mo stretchy='false' id='id-12'>(</mo> |
1113 | 1 | <mrow id='id-13'> |
1114 | 1 | <mi id='id-14'>c</mi> |
1115 | 1 | <mo id='id-15'>+</mo> |
1116 | 1 | <mi id='id-16'>d</mi> |
1117 | 1 | </mrow> |
1118 | 1 | <mo stretchy='false' id='id-17'>)</mo> |
1119 | 1 | </mrow> |
1120 | 1 | </mrow> |
1121 | 1 | <mo id='id-18'>+</mo> |
1122 | 1 | <mn id='id-19'>1</mn> |
1123 | 1 | </mrow> |
1124 | 1 | </math>"; |
1125 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1126 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1127 | 1 | let package_instance = package_instance.borrow(); |
1128 | 1 | let mathml = get_element(&package_instance); |
1129 | 1 | set_preference("NavMode", "Enhanced") |
1130 | 1 | debug!("\n------EnhancedMode----------"); |
1131 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1132 | 1 | test_command("ZoomIn", mathml, "id-5"); |
1133 | 1 | test_command("ZoomIn", mathml, "id-6"); |
1134 | ||
1135 | // repeat, but this time with "Simple | |
1136 | 1 | set_preference("NavMode", "Simple") |
1137 | 1 | debug!("\n------SimpleMode----------"); |
1138 | 1 | test_command("ZoomOutAll", mathml, "id-1"); |
1139 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1140 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1141 | 1 | return Ok( () ); |
1142 | 1 | }); |
1143 | 1 | } |
1144 | ||
1145 | #[test] | |
1146 | 1 | fn zoom_in_all() -> Result<()> { |
1147 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1148 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1149 | 1 | <mi id='denom'>d</mi> |
1150 | 1 | </mfrac></math>"; |
1151 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1152 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1153 | 1 | let package_instance = package_instance.borrow(); |
1154 | 1 | let mathml = get_element(&package_instance); |
1155 | 1 | test_command("ZoomInAll", mathml, "base"); |
1156 | 1 | return Ok( () ); |
1157 | 1 | }); |
1158 | 1 | } |
1159 | ||
1160 | ||
1161 | #[test] | |
1162 | 1 | fn zoom_out() -> Result<()> { |
1163 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1164 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1165 | 1 | <mi id='denom'>d</mi> |
1166 | 1 | </mfrac></math>"; |
1167 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1168 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1169 | 1 | let package_instance = package_instance.borrow(); |
1170 | 1 | let mathml = get_element(&package_instance); |
1171 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1172 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1173 | 1 | current_node: "base".to_string(), |
1174 | 1 | current_node_offset: 0 |
1175 | 1 | }, "None") |
1176 | 1 | }); |
1177 | 1 | test_command("ZoomOut", mathml, "msup"); |
1178 | ||
1179 | 1 | let _nav_speech = do_navigate_command_and_param(mathml, NavigationCommand::Zoom, NavigationParam::Previous) |
1180 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1181 | 1 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1182 | 1 | assert_eq!(id, "mfrac"); |
1183 | 1 | }); |
1184 | 1 | return Ok( () ); |
1185 | 1 | }); |
1186 | 1 | } |
1187 | ||
1188 | #[test] | |
1189 | 1 | fn zoom_out_all() -> Result<()> { |
1190 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1191 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1192 | 1 | <mi id='denom'>d</mi> |
1193 | 1 | </mfrac></math>"; |
1194 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1195 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1196 | 1 | let package_instance = package_instance.borrow(); |
1197 | 1 | let mathml = get_element(&package_instance); |
1198 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1199 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1200 | 1 | current_node: "base".to_string(), |
1201 | 1 | current_node_offset: 0 |
1202 | 1 | }, "None") |
1203 | 1 | }); |
1204 | ||
1205 | 1 | test_command("ZoomOutAll", mathml, "mfrac"); |
1206 | 1 | return Ok( () ); |
1207 | 1 | }); |
1208 | 1 | } |
1209 | ||
1210 | #[test] | |
1211 | 1 | fn move_start_end() -> Result<()> { |
1212 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1213 | 1 | <mrow id='id-1'> |
1214 | 1 | <mi id='id-2'>x</mi> |
1215 | 1 | <mo id='id-3'>=</mo> |
1216 | 1 | <mrow id='id-4'> |
1217 | 1 | <mi id='id-5'>a</mi> |
1218 | 1 | <mo id='id-6'>-</mo> |
1219 | 1 | <mn id='id-7'>2</mn> |
1220 | 1 | </mrow> |
1221 | 1 | </mrow> |
1222 | 1 | </math>"; |
1223 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1224 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1225 | 1 | let package_instance = package_instance.borrow(); |
1226 | 1 | let mathml = get_element(&package_instance); |
1227 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1228 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1229 | 1 | current_node: "id-4".to_string(), |
1230 | 1 | current_node_offset: 0 |
1231 | 1 | }, "None") |
1232 | 1 | }); |
1233 | ||
1234 | 1 | set_preference("NavMode", "Character") |
1235 | 1 | test_command("MoveStart", mathml, "id-2"); |
1236 | 1 | test_command("MoveEnd", mathml, "id-7"); |
1237 | 1 | set_preference("NavMode", "Simple") |
1238 | 1 | test_command("MoveStart", mathml, "id-2"); |
1239 | 1 | test_command("MoveEnd", mathml, "id-7"); |
1240 | 1 | set_preference("NavMode", "Enhanced") |
1241 | 1 | test_command("MoveStart", mathml, "id-2"); |
1242 | 1 | test_command("MovePrevious", mathml, "id-2"); |
1243 | 1 | test_command("MoveEnd", mathml, "id-4"); |
1244 | 1 | test_command("MoveNext", mathml, "id-4"); |
1245 | 1 | return Ok( () ); |
1246 | 1 | }); |
1247 | 1 | } |
1248 | ||
1249 | #[test] | |
1250 | 1 | fn move_line_start_end() -> Result<()> { |
1251 | 1 | let mathml_str = " <math display='block' id='id-0'> |
1252 | 1 | <mfrac displaystyle='true' id='id-1'> |
1253 | 1 | <mi id='id-2'>x</mi> |
1254 | 1 | <mrow id='id-3'> |
1255 | 1 | <msup id='id-4'> |
1256 | 1 | <mi id='id-5'>y</mi> |
1257 | 1 | <mn id='id-6'>2</mn> |
1258 | 1 | </msup> |
1259 | 1 | <mo id='id-7'>+</mo> |
1260 | 1 | <mn id='id-8'>1</mn> |
1261 | 1 | </mrow> |
1262 | 1 | </mfrac> |
1263 | 1 | </math>"; |
1264 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1265 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1266 | 1 | let package_instance = package_instance.borrow(); |
1267 | 1 | let mathml = get_element(&package_instance); |
1268 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1269 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1270 | 1 | current_node: "id-7".to_string(), |
1271 | 1 | current_node_offset: 0 |
1272 | 1 | }, "None") |
1273 | 1 | }); |
1274 | ||
1275 | 1 | set_preference("NavMode", "Character") |
1276 | 1 | test_command("MoveLineStart", mathml, "id-5"); |
1277 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1278 | 1 | set_preference("NavMode", "Simple") |
1279 | 1 | test_command("MoveLineStart", mathml, "id-4"); |
1280 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1281 | 1 | set_preference("NavMode", "Enhanced") |
1282 | 1 | test_command("MoveLineStart", mathml, "id-4"); |
1283 | 1 | test_command("MoveLineEnd", mathml, "id-8"); |
1284 | 1 | test_command("MoveEnd", mathml, "id-3"); |
1285 | 1 | return Ok( () ); |
1286 | 1 | }); |
1287 | 1 | } |
1288 | ||
1289 | #[test] | |
1290 | 1 | fn text_extremes_and_move_last_location() -> Result<()> { |
1291 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1292 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
1293 | 1 | <mi id='denom'>d</mi> |
1294 | 1 | </mfrac></math>"; |
1295 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1296 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1297 | 1 | let package_instance = package_instance.borrow(); |
1298 | 1 | let mathml = get_element(&package_instance); |
1299 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1300 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1301 | 1 | current_node: "base".to_string(), |
1302 | 1 | current_node_offset: 0 |
1303 | 1 | }, "None") |
1304 | 1 | }); |
1305 | ||
1306 | 1 | test_command("ZoomOutAll", mathml, "mfrac"); |
1307 | 1 | test_command("ZoomOut", mathml, "mfrac"); |
1308 | 1 | test_command("MoveLastLocation", mathml, "base"); // second zoom out should do nothing |
1309 | ||
1310 | 1 | test_command("ZoomOut", mathml, "msup"); |
1311 | 1 | test_command("ZoomInAll", mathml, "base"); |
1312 | 1 | test_command("ZoomIn", mathml, "base"); |
1313 | 1 | test_command("MoveLastLocation", mathml, "msup"); // second zoom in should do nothing |
1314 | ||
1315 | 1 | return Ok( () ); |
1316 | 1 | }); |
1317 | 1 | } |
1318 | ||
1319 | #[test] | |
1320 | 1 | fn move_to_start() -> Result<()> { |
1321 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
1322 | 1 | <mrow id='num'><msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup><mo id='factorial'>!</mo></mrow> |
1323 | 1 | <mi id='denom'>d</mi> |
1324 | 1 | </mfrac></math>"; |
1325 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1326 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1327 | 1 | let package_instance = package_instance.borrow(); |
1328 | 1 | let mathml = get_element(&package_instance); |
1329 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1330 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1331 | 1 | current_node: "denom".to_string(), |
1332 | 1 | current_node_offset: 0 |
1333 | 1 | }, "None") |
1334 | 1 | }); |
1335 | 1 | test_command("MoveLineStart", mathml, "denom"); |
1336 | ||
1337 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1338 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1339 | 1 | current_node: "factorial".to_string(), |
1340 | 1 | current_node_offset: 0 |
1341 | 1 | }, "None") |
1342 | 1 | }); |
1343 | 1 | test_command("MoveLineStart", mathml, "msup"); |
1344 | ||
1345 | 1 | let _nav_speech = do_navigate_command_and_param(mathml, NavigationCommand::Move, NavigationParam::Start) |
1346 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1347 | 1 | let (id, _) = nav_stack.borrow().get_navigation_mathml_id(mathml); |
1348 | 1 | assert_eq!(id, "num"); |
1349 | 1 | }); |
1350 | 1 | return Ok( () ); |
1351 | 1 | }); |
1352 | 1 | } |
1353 | ||
1354 | #[test] | |
1355 | 1 | fn move_right_sup() -> Result<()> { |
1356 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1357 | 1 | <mrow id='id-1'> |
1358 | 1 | <msup id='id-2'> |
1359 | 1 | <mn id='id-3'>2</mn> |
1360 | 1 | <mi id='id-4'>q</mi> |
1361 | 1 | </msup> |
1362 | 1 | <mo id='id-5'>-</mo> |
1363 | 1 | <mi id='id-6'>x</mi> |
1364 | 1 | </mrow> |
1365 | 1 | </math>"; |
1366 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1367 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1368 | 1 | let package_instance = package_instance.borrow(); |
1369 | 1 | let mathml = get_element(&package_instance); |
1370 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1371 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1372 | 1 | current_node: "id-2".to_string(), |
1373 | 1 | current_node_offset: 0 |
1374 | 1 | }, "None") |
1375 | 1 | }); |
1376 | 1 | set_preference("NavMode", "Enhanced") |
1377 | 1 | test_command("MoveNext", mathml, "id-5"); |
1378 | ||
1379 | // reset start and test Simple | |
1380 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1381 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1382 | 1 | current_node: "id-2".to_string(), |
1383 | 1 | current_node_offset: 0 |
1384 | 1 | }, "None") |
1385 | 1 | }); |
1386 | 1 | set_preference("NavMode", "Simple") |
1387 | 1 | test_command("MoveNext", mathml, "id-5"); |
1388 | ||
1389 | // reset start and test Character | |
1390 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1391 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1392 | 1 | current_node: "id-3".to_string(), |
1393 | 1 | current_node_offset: 0 |
1394 | 1 | }, "None") |
1395 | 1 | }); |
1396 | 1 | set_preference("NavMode", "Character") |
1397 | 1 | test_command("MoveNext", mathml, "id-4"); |
1398 | 1 | test_command("MoveNext", mathml, "id-5"); |
1399 | 1 | return Ok( () ); |
1400 | 1 | }); |
1401 | 1 | } |
1402 | ||
1403 | ||
1404 | #[test] | |
1405 | 1 | fn move_msubsup_char() -> Result<()> { |
1406 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1407 | 1 | <mrow id='id-1'> |
1408 | 1 | <mn id='id-2'>1</mn> |
1409 | 1 | <mo id='id-3'>+</mo> |
1410 | 1 | <msubsup id='id-4'> |
1411 | 1 | <mi id='id-5'>x</mi> |
1412 | 1 | <mn id='id-6'>2</mn> |
1413 | 1 | <mn id='id-7'>3</mn> |
1414 | 1 | </msubsup> |
1415 | 1 | <mo id='id-8'>+</mo> |
1416 | 1 | <mn id='id-9'>4</mn> |
1417 | 1 | </mrow> |
1418 | 1 | </math>"; |
1419 | 1 | init_default_prefs(mathml_str, "Character"); |
1420 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1421 | 1 | let package_instance = package_instance.borrow(); |
1422 | 1 | let mathml = get_element(&package_instance); |
1423 | 1 | assert_eq!("zoomed in all of the way; 1", test_command("ZoomInAll", mathml, "id-2")); |
1424 | 1 | assert_eq!("move right; plus", test_command("MoveNext", mathml, "id-3")); |
1425 | 1 | assert_eq!("move right; in base; x", test_command("MoveNext", mathml, "id-5")); |
1426 | 1 | assert_eq!("move right; in subscript; 2", test_command("MoveNext", mathml, "id-6")); |
1427 | 1 | assert_eq!("move right; in superscript; 3", test_command("MoveNext", mathml, "id-7")); |
1428 | 1 | assert_eq!("move right; out of superscript; plus", test_command("MoveNext", mathml, "id-8")); |
1429 | 1 | assert_eq!("move left; in superscript; 3", test_command("MovePrevious", mathml, "id-7")); |
1430 | 1 | assert_eq!("move left; in subscript; 2", test_command("MovePrevious", mathml, "id-6")); |
1431 | 1 | assert_eq!("move left; in base; x", test_command("MovePrevious", mathml, "id-5")); |
1432 | 1 | assert_eq!("move left; out of base; plus", test_command("MovePrevious", mathml, "id-3")); |
1433 | ||
1434 | 1 | return Ok( () ); |
1435 | 1 | }); |
1436 | 1 | } |
1437 | ||
1438 | #[test] | |
1439 | 1 | fn zoom_logbase() -> Result<()> { |
1440 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1441 | 1 | <mrow displaystyle='true' id='id-1'> |
1442 | 1 | <msub id='id-2'> |
1443 | 1 | <mi id='id-3'>log</mi> |
1444 | 1 | <mn id='id-4'>2</mn> |
1445 | 1 | </msub> |
1446 | 1 | <mo data-changed='added' id='id-5'>⁡</mo> |
1447 | 1 | <mi id='id-6'>x</mi>a |
1448 | 1 | </mrow> |
1449 | 1 | </math>"; |
1450 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1451 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1452 | 1 | let package_instance = package_instance.borrow(); |
1453 | 1 | let mathml = get_element(&package_instance); |
1454 | 1 | assert_eq!("zoom in; the log base 2", test_command("ZoomIn", mathml, "id-2")); |
1455 | 1 | assert_eq!("zoom in; in base; 2", test_command("ZoomIn", mathml, "id-4")); |
1456 | 1 | assert_eq!("zoomed in all of the way; 2", test_command("ZoomIn", mathml, "id-4")); |
1457 | 1 | debug!("Now zooming out"); |
1458 | 1 | assert_eq!("zoom out; out of base; the log base 2", test_command("ZoomOut", mathml, "id-2")); |
1459 | 1 | assert_eq!("zoom out; the log base 2, of x", test_command("ZoomOut", mathml, "id-1")); |
1460 | 1 | assert_eq!("zoomed out all of the way; the log base 2, of x", test_command("ZoomOut", mathml, "id-1")); |
1461 | 1 | return Ok( () ); |
1462 | 1 | }); |
1463 | 1 | } |
1464 | ||
1465 | #[test] | |
1466 | 1 | fn zoom_logbase_power() -> Result<()> { |
1467 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1468 | 1 | <mrow displaystyle='true' id='id-1'> |
1469 | 1 | <msubsup id='id-2'> |
1470 | 1 | <mi id='id-3'>log</mi> |
1471 | 1 | <mn id='id-4'>2</mn> |
1472 | 1 | <mn id='id-5'>3</mn> |
1473 | 1 | </msubsup> |
1474 | 1 | <mo data-changed='added' id='id-6'>⁡</mo> |
1475 | 1 | <mi id='id-7'>x</mi> |
1476 | 1 | </mrow> |
1477 | 1 | </math>"; |
1478 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1479 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1480 | 1 | let package_instance = package_instance.borrow(); |
1481 | 1 | let mathml = get_element(&package_instance); |
1482 | 1 | assert_eq!("zoom in; the log base 2, cubed", test_command("ZoomIn", mathml, "id-2")); |
1483 | 1 | assert_eq!("zoom in; in base; the log base 2", test_command("ZoomIn", mathml, "id-2-log-base")); |
1484 | 1 | assert_eq!("zoom in; in base; 2", test_command("ZoomIn", mathml, "id-4")); |
1485 | 1 | assert_eq!("zoomed in all of the way; 2", test_command("ZoomIn", mathml, "id-4")); |
1486 | 1 | debug!("Now zooming out"); |
1487 | 1 | assert_eq!("zoom out; out of base; the log base 2", test_command("ZoomOut", mathml, "id-2-log-base")); |
1488 | 1 | assert_eq!("zoom out; out of base; the log base 2, cubed", test_command("ZoomOut", mathml, "id-2")); |
1489 | 1 | assert_eq!("zoom out; the log base 2, cubed of x", test_command("ZoomOut", mathml, "id-1")); |
1490 | 1 | assert_eq!("zoomed out all of the way; the log base 2, cubed of x", test_command("ZoomOut", mathml, "id-1")); |
1491 | 1 | return Ok( () ); |
1492 | 1 | }); |
1493 | 1 | } |
1494 | ||
1495 | #[test] | |
1496 | 1 | fn zoom_msubsup() -> Result<()> { |
1497 | // msubsup is trickier because it creates an intent within an intent, so offsets need to be handled properly | |
1498 | 1 | let mathml_str = "<math id='math'><msubsup id='msubsup'><mi id='base'>𝑥</mi><mn id='sub'>1</mn><mn id='sup'>2</mn></msubsup></math>"; |
1499 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1500 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1501 | 1 | let package_instance = package_instance.borrow(); |
1502 | 1 | let mathml = get_element(&package_instance); |
1503 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1504 | 1 | debug!("Enhanced mode"); |
1505 | 1 | do_commands(mathml) |
1506 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1507 | 1 | debug!("Simple mode"); |
1508 | 1 | do_commands(mathml) |
1509 | 1 | set_preference("NavMode", "Character").unwrap(); |
1510 | 1 | debug!("Character mode"); |
1511 | 1 | assert_eq!("zoom in; in base; x", test_command("ZoomIn", mathml, "base")); |
1512 | 1 | assert_eq!("zoom out; out of base; x sub 1 super 2 end super", test_command("ZoomOut", mathml, "msubsup")); |
1513 | 1 | return Ok( () ); |
1514 | ||
1515 | /// Enhanced and Simple mode should behave the same | |
1516 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1517 | 2 | assert_eq!("zoom in; in base; x sub 1", test_command("ZoomIn", mathml, "msubsup-indexed-by")); |
1518 | 2 | assert_eq!("zoom in; in base; x", test_command("ZoomIn", mathml, "base")); |
1519 | 2 | assert_eq!("zoomed in all of the way; x", test_command("ZoomIn", mathml, "base")); |
1520 | 2 | debug!("Now zooming out"); |
1521 | 2 | assert_eq!("zoom out; out of base; x sub 1", test_command("ZoomOut", mathml, "msubsup-indexed-by")); |
1522 | 2 | assert_eq!("zoom out; out of base; x sub 1, squared", test_command("ZoomOut", mathml, "msubsup")); |
1523 | 2 | assert_eq!("zoomed out all of the way; x sub 1, squared", test_command("ZoomOut", mathml, "msubsup")); |
1524 | 2 | return Ok( () ); |
1525 | 2 | } |
1526 | 1 | }); |
1527 | 1 | } |
1528 | ||
1529 | #[test] | |
1530 | 1 | fn move_mmultiscripts_char() -> Result<()> { |
1531 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1532 | 1 | <mmultiscripts data-mjx-texclass='ORD' data-chem-formula='5' id='id-1'> |
1533 | 1 | <mrow data-chem-formula='3' id='id-2'> |
1534 | 1 | <mo stretchy='false' id='id-3'>[</mo> |
1535 | 1 | <mmultiscripts data-chem-formula='3' id='id-4'> |
1536 | 1 | <mi data-chem-element='3' id='id-5'>Co</mi> |
1537 | 1 | <mn id='id-6'>6</mn> |
1538 | 1 | <none id='id-7'></none> |
1539 | 1 | </mmultiscripts> |
1540 | 1 | <mo stretchy='false' id='id-8'>]</mo> |
1541 | 1 | </mrow> |
1542 | 1 | <none id='id-9'></none> |
1543 | 1 | <mrow id='id-10'> |
1544 | 1 | <mn id='id-11'>3</mn> |
1545 | 1 | <mo id='id-12'>+</mo> |
1546 | 1 | </mrow> |
1547 | 1 | </mmultiscripts> |
1548 | 1 | </math>"; |
1549 | 1 | init_default_prefs(mathml_str, "Character"); |
1550 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1551 | 1 | let package_instance = package_instance.borrow(); |
1552 | 1 | let mathml = get_element(&package_instance); |
1553 | 1 | assert_eq!("zoomed in all of the way; in base; open bracket", test_command("ZoomInAll", mathml, "id-3")); |
1554 | 1 | assert_eq!("move right; in base; cap c o", test_command("MoveNext", mathml, "id-5")); |
1555 | 1 | assert_eq!("move right; in subscript; 6", test_command("MoveNext", mathml, "id-6")); |
1556 | 1 | assert_eq!("move right; out of subscript; close bracket", test_command("MoveNext", mathml, "id-8")); |
1557 | 1 | assert_eq!("move right; in superscript; 3", test_command("MoveNext", mathml, "id-11")); |
1558 | 1 | assert_eq!("move right; plus", test_command("MoveNext", mathml, "id-12")); |
1559 | 1 | assert_eq!("cannot move right, end of math", test_command("MoveNext", mathml, "id-12")); |
1560 | 1 | assert_eq!("move left; 3", test_command("MovePrevious", mathml, "id-11")); |
1561 | 1 | assert_eq!("move left; in base; close bracket", test_command("MovePrevious", mathml, "id-8")); |
1562 | 1 | assert_eq!("move left; in subscript; 6", test_command("MovePrevious", mathml, "id-6")); |
1563 | 1 | assert_eq!("move left; in base; cap c o", test_command("MovePrevious", mathml, "id-5")); |
1564 | 1 | assert_eq!("move left; out of base; open bracket", test_command("MovePrevious", mathml, "id-3")); |
1565 | ||
1566 | 1 | return Ok( () ); |
1567 | 1 | }); |
1568 | 1 | } |
1569 | ||
1570 | #[test] | |
1571 | 1 | fn move_right_char() -> Result<()> { |
1572 | 1 | let mathml_str = "<math id='id-0'> |
1573 | 1 | <mrow displaystyle='true' id='id-1'> |
1574 | 1 | <mi id='id-2'>x</mi> |
1575 | 1 | <mo id='id-3'>=</mo> |
1576 | 1 | <mrow id='id-4'> |
1577 | 1 | <mfrac id='id-5'> |
1578 | 1 | <mn id='id-6'>1</mn> |
1579 | 1 | <mrow id='id-7'> |
1580 | 1 | <mi id='id-8'>a</mi> |
1581 | 1 | <mo id='id-9'>+</mo> |
1582 | 1 | <mn id='id-10'>2</mn> |
1583 | 1 | </mrow> |
1584 | 1 | </mfrac> |
1585 | 1 | <mo id='id-11'>+</mo> |
1586 | 1 | <mrow id='id-12'> |
1587 | 1 | <mn id='id-13'>3</mn> |
1588 | 1 | <mo id='id-14'>⁢</mo> |
1589 | 1 | <mi id='id-15'>b</mi> |
1590 | 1 | </mrow> |
1591 | 1 | </mrow> |
1592 | 1 | </mrow> |
1593 | 1 | </math>"; |
1594 | 1 | init_default_prefs(mathml_str, "Character"); |
1595 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1596 | 1 | let package_instance = package_instance.borrow(); |
1597 | 1 | let mathml = get_element(&package_instance); |
1598 | 1 | test_command("ZoomInAll", mathml, "id-2"); |
1599 | 1 | test_command("MoveNext", mathml, "id-3"); |
1600 | 1 | test_command("MoveNext", mathml, "id-6"); |
1601 | 1 | test_command("MoveNext", mathml, "id-8"); |
1602 | 1 | test_command("MoveNext", mathml, "id-9"); |
1603 | 1 | test_command("MoveNext", mathml, "id-10"); |
1604 | 1 | test_command("MoveNext", mathml, "id-11"); |
1605 | 1 | test_command("MoveNext", mathml, "id-13"); |
1606 | 1 | test_command("MoveNext", mathml, "id-15"); |
1607 | 1 | test_command("MoveNext", mathml, "id-15"); |
1608 | ||
1609 | 1 | return Ok( () ); |
1610 | 1 | }); |
1611 | 1 | } |
1612 | ||
1613 | #[test] | |
1614 | 1 | fn char_mode_paren_test() -> Result<()> { |
1615 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1616 | 1 | <mrow displaystyle='true' id='id-1'> |
1617 | 1 | <mrow id='id-2'> |
1618 | 1 | <mo id='id-3'>(</mo> |
1619 | 1 | <mi id='id-4'>a</mi> |
1620 | 1 | <mo id='id-5'>)</mo> |
1621 | 1 | </mrow> |
1622 | 1 | <mo id='id-6'>⁢</mo> |
1623 | 1 | <mrow id='id-7'> |
1624 | 1 | <mo id='id-8'>(</mo> |
1625 | 1 | <mi id='id-9'>b</mi> |
1626 | 1 | <mo id='id-10'>)</mo> |
1627 | 1 | </mrow> |
1628 | 1 | </mrow> |
1629 | 1 | </math>"; |
1630 | 1 | init_default_prefs(mathml_str, "Character"); |
1631 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1632 | 1 | let package_instance = package_instance.borrow(); |
1633 | 1 | let mathml = get_element(&package_instance); |
1634 | 1 | debug!("Character mode"); |
1635 | 1 | do_commands(mathml) |
1636 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1637 | 1 | debug!("Simple mode"); |
1638 | 1 | test_command("ZoomIn", mathml, "id-3"); // zooms to the first parenthesis |
1639 | 1 | do_commands(mathml) |
1640 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1641 | 1 | debug!("Enhanced mode"); |
1642 | 1 | test_command("ZoomIn", mathml, "id-4"); |
1643 | 1 | test_command("MoveNext", mathml, "id-6"); |
1644 | 1 | test_command("MoveNext", mathml, "id-9"); |
1645 | 1 | test_command("MovePrevious", mathml, "id-6"); |
1646 | 1 | test_command("MovePrevious", mathml, "id-4"); |
1647 | ||
1648 | 1 | return Ok( () ); |
1649 | 1 | }); |
1650 | ||
1651 | /// Simple and Character mode should behave the same | |
1652 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1653 | 2 | test_command("ZoomIn", mathml, "id-3"); |
1654 | 2 | test_command("MoveNext", mathml, "id-4"); |
1655 | 2 | test_command("MoveNext", mathml, "id-5"); |
1656 | 2 | test_command("MoveNext", mathml, "id-8"); |
1657 | 2 | test_command("MoveNext", mathml, "id-9"); |
1658 | 2 | test_command("MoveNext", mathml, "id-10"); |
1659 | 2 | test_command("MovePrevious", mathml, "id-9"); |
1660 | 2 | test_command("MovePrevious", mathml, "id-8"); |
1661 | 2 | test_command("MovePrevious", mathml, "id-5"); |
1662 | 2 | test_command("ZoomOutAll", mathml, "id-1"); |
1663 | 2 | return Ok( () ); |
1664 | 2 | } |
1665 | 1 | } |
1666 | ||
1667 | #[test] | |
1668 | 1 | fn char_mode_trig_test() -> Result<()> { |
1669 | 1 | let mathml_str = "<math id='id-0'> |
1670 | 1 | <mrow id='id-1'> |
1671 | 1 | <mi id='id-2'>sin</mi> |
1672 | 1 | <mo id='id-3'>⁡</mo> |
1673 | 1 | <mrow id='id-4'> |
1674 | 1 | <mo id='id-5'>(</mo> |
1675 | 1 | <mi id='id-6'>x</mi> |
1676 | 1 | <mo id='id-7'>)</mo> |
1677 | 1 | </mrow> |
1678 | 1 | </mrow> |
1679 | 1 | </math>"; |
1680 | 1 | init_default_prefs(mathml_str, "Simple"); |
1681 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1682 | 1 | let package_instance = package_instance.borrow(); |
1683 | 1 | let mathml = get_element(&package_instance); |
1684 | 1 | do_commands(mathml) |
1685 | 1 | set_preference("NavMode", "Simple").unwrap(); |
1686 | 1 | do_commands(mathml) |
1687 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
1688 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1689 | 1 | test_command("MoveNext", mathml, "id-6"); |
1690 | 1 | test_command("MovePrevious", mathml, "id-2"); |
1691 | ||
1692 | 1 | return Ok( () ); |
1693 | 1 | }); |
1694 | ||
1695 | ||
1696 | /// Simple and Character mode should behave the same | |
1697 | 2 | fn do_commands(mathml: Element) -> Result<()> { |
1698 | 2 | test_command("ZoomIn", mathml, "id-2"); |
1699 | 2 | test_command("MoveNext", mathml, "id-5"); |
1700 | 2 | test_command("MoveNext", mathml, "id-6"); |
1701 | 2 | test_command("MoveNext", mathml, "id-7"); |
1702 | 2 | test_command("MovePrevious", mathml, "id-6"); |
1703 | 2 | test_command("MovePrevious", mathml, "id-5"); |
1704 | 2 | test_command("MovePrevious", mathml, "id-2"); |
1705 | 2 | test_command("ZoomOutAll", mathml, "id-1"); |
1706 | 2 | return Ok( () ); |
1707 | 2 | } |
1708 | 1 | } |
1709 | ||
1710 | #[test] | |
1711 | 1 | fn move_char_speech() -> Result<()> { |
1712 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1713 | 1 | <mrow id='id-1'> |
1714 | 1 | <mfrac id='id-2'> |
1715 | 1 | <mi id='id-3'>x</mi> |
1716 | 1 | <mi id='id-4'>y</mi> |
1717 | 1 | </mfrac> |
1718 | 1 | <mo id='id-5'>⁢</mo> |
1719 | 1 | <mi id='id-6'>z</mi> |
1720 | 1 | </mrow> |
1721 | 1 | </math>"; |
1722 | 1 | init_default_prefs(mathml_str, "Character"); |
1723 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1724 | 1 | let package_instance = package_instance.borrow(); |
1725 | 1 | let mathml = get_element(&package_instance); |
1726 | 1 | test_command("ZoomInAll", mathml, "id-3"); |
1727 | 1 | assert_eq!("move right; in denominator; y", test_command("MoveNext", mathml, "id-4")); |
1728 | 1 | assert_eq!("move right; out of denominator; z", test_command("MoveNext", mathml, "id-6")); |
1729 | 1 | assert_eq!("move left; in denominator; y", test_command("MovePrevious", mathml, "id-4")); |
1730 | 1 | assert_eq!("move left; in numerator; x", test_command("MovePrevious", mathml, "id-3")); |
1731 | ||
1732 | 1 | return Ok( () ); |
1733 | 1 | }); |
1734 | 1 | } |
1735 | ||
1736 | #[test] | |
1737 | 1 | fn move_inside_leaves() -> Result<()> { |
1738 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1739 | 1 | <mrow id='id-1'> |
1740 | 1 | <mfrac id='id-2'> |
1741 | 1 | <mi id='id-3'>top</mi> |
1742 | 1 | <mi id='id-4'>αβγ</mi> |
1743 | 1 | </mfrac> |
1744 | 1 | </mrow> |
1745 | 1 | </math>"; |
1746 | 1 | init_default_prefs(mathml_str, "Character"); |
1747 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1748 | 1 | let package_instance = package_instance.borrow(); |
1749 | 1 | let mathml = get_element(&package_instance); |
1750 | 1 | test_command("ZoomInAll", mathml, "id-3"); |
1751 | 1 | assert_eq!("zoomed in to first character; t", test_command("ZoomIn", mathml, "id-3")); |
1752 | 1 | assert_eq!("move right; o", test_command("MoveNext", mathml, "id-3")); |
1753 | 1 | assert_eq!("move right; p", test_command("MoveNext", mathml, "id-3")); |
1754 | 1 | assert_eq!("move right; in denominator; αβγ", test_command("MoveNext", mathml, "id-4")); |
1755 | 1 | assert_eq!("zoomed in to first character; alpha", test_command("ZoomIn", mathml, "id-4")); |
1756 | 1 | assert_eq!("move right; beta", test_command("MoveNext", mathml, "id-4")); |
1757 | 1 | assert_eq!("move right; gamma", test_command("MoveNext", mathml, "id-4")); |
1758 | 1 | assert_eq!("cannot move right, end of math", test_command("MoveNext", mathml, "id-4")); |
1759 | 1 | assert_eq!("move left; beta", test_command("MovePrevious", mathml, "id-4")); |
1760 | 1 | assert_eq!("zoom out; αβγ", test_command("ZoomOut", mathml, "id-4")); |
1761 | ||
1762 | 1 | return Ok( () ); |
1763 | 1 | }); |
1764 | 1 | } |
1765 | ||
1766 | #[test] | |
1767 | 1 | fn move_enhanced_times() -> Result<()> { |
1768 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1769 | 1 | <mrow displaystyle='true' id='id-1'> |
1770 | 1 | <mn id='id-2'>2</mn> |
1771 | 1 | <mo id='id-3'>⁢</mo> |
1772 | 1 | <mrow id='id-4'> |
1773 | 1 | <mo id='id-5'>(</mo> |
1774 | 1 | <mrow id='id-6'> |
1775 | 1 | <mn id='id-7'>1</mn> |
1776 | 1 | <mo id='id-8'>-</mo> |
1777 | 1 | <mi id='id-9'>x</mi> |
1778 | 1 | </mrow> |
1779 | 1 | <mo id='id-10'>)</mo> |
1780 | 1 | </mrow> |
1781 | 1 | </mrow> |
1782 | 1 | </math>"; |
1783 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1784 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1785 | 1 | let package_instance = package_instance.borrow(); |
1786 | 1 | let mathml = get_element(&package_instance); |
1787 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1788 | 1 | assert_eq!("move right; times", test_command("MoveNext", mathml, "id-3")); |
1789 | 1 | assert_eq!("move right; 1 minus x", test_command("MoveNext", mathml, "id-6")); |
1790 | 1 | assert_eq!("move left; times", test_command("MovePrevious", mathml, "id-3")); |
1791 | 1 | assert_eq!("move left; 2", test_command("MovePrevious", mathml, "id-2")); |
1792 | ||
1793 | 1 | return Ok( () ); |
1794 | 1 | }); |
1795 | 1 | } |
1796 | ||
1797 | #[test] | |
1798 | 1 | fn move_simple_no_times() -> Result<()> { |
1799 | 1 | let mathml_str = "<math display='block' id='id-0'> |
1800 | 1 | <mrow displaystyle='true' id='id-1'> |
1801 | 1 | <mn id='id-2'>2</mn> |
1802 | 1 | <mo id='id-3'>⁢</mo> |
1803 | 1 | <mrow id='id-4'> |
1804 | 1 | <mo id='id-5'>(</mo> |
1805 | 1 | <mrow id='id-6'> |
1806 | 1 | <mn id='id-7'>1</mn> |
1807 | 1 | <mo id='id-8'>-</mo> |
1808 | 1 | <mi id='id-9'>x</mi> |
1809 | 1 | </mrow> |
1810 | 1 | <mo id='id-10'>)</mo> |
1811 | 1 | </mrow> |
1812 | 1 | </mrow> |
1813 | 1 | </math>"; |
1814 | 1 | init_default_prefs(mathml_str, "Simple"); |
1815 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
1816 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1817 | 1 | let package_instance = package_instance.borrow(); |
1818 | 1 | let mathml = get_element(&package_instance); |
1819 | 1 | test_command("ZoomIn", mathml, "id-2"); |
1820 | 1 | assert_eq!("move right; open paren", test_command("MoveNext", mathml, "id-5")); |
1821 | 1 | assert_eq!("move right; 1", test_command("MoveNext", mathml, "id-7")); |
1822 | 1 | assert_eq!("move left; open paren", test_command("MovePrevious", mathml, "id-5")); |
1823 | 1 | assert_eq!("move left; 2", test_command("MovePrevious", mathml, "id-2")); |
1824 | ||
1825 | 1 | return Ok( () ); |
1826 | 1 | }); |
1827 | 1 | } |
1828 | ||
1829 | ||
1830 | #[test] | |
1831 | 1 | fn move_cell() -> Result<()> { |
1832 | 1 | let mathml_str = "<math id='nav-0'> |
1833 | 1 | <mtable id='nav-1'> |
1834 | 1 | <mtr id='nav-2'> |
1835 | 1 | <mtd id='nav-3'> <mn id='nav-4'>1</mn></mtd> |
1836 | 1 | <mtd id='nav-5'> <mn id='nav-6'>2</mn></mtd> |
1837 | 1 | <mtd id='nav-7'><mn id='nav-8'>3</mn> </mtd> |
1838 | 1 | </mtr> |
1839 | 1 | <mtr id='nav-9'> |
1840 | 1 | <mtd id='nav-10'> |
1841 | 1 | <mrow id='nav-11'> |
1842 | 1 | <mi id='nav-12'>x</mi> |
1843 | 1 | <mo id='nav-13'>-</mo> |
1844 | 1 | <mi id='nav-14'>y</mi> |
1845 | 1 | </mrow> |
1846 | 1 | </mtd> |
1847 | 1 | <mtd id='nav-15'> |
1848 | 1 | <mfrac id='nav-16'> |
1849 | 1 | <mn id='nav-17'>1</mn> |
1850 | 1 | <mn id='nav-18'>2</mn> |
1851 | 1 | </mfrac> |
1852 | 1 | </mtd> |
1853 | 1 | <mtd id='nav-19'> |
1854 | 1 | <mi id='nav-20'>z</mi> |
1855 | 1 | </mtd> |
1856 | 1 | </mtr> |
1857 | 1 | <mtr id='nav-21'> |
1858 | 1 | <mtd id='nav-22'><mn id='nav-23'>7</mn> </mtd> |
1859 | 1 | <mtd id='nav-24'><mn id='nav-25'>8</mn> </mtd> |
1860 | 1 | <mtd id='nav-26'> <mn id='nav-27'>9</mn></mtd> |
1861 | 1 | </mtr> |
1862 | 1 | <mtr id='nav-28'> |
1863 | 1 | <mtd id='nav-29'> |
1864 | 1 | <mrow id='nav-30'> |
1865 | 1 | <mi id='nav-31'>sin</mi> |
1866 | 1 | <mo id='nav-32'>⁡</mo> |
1867 | 1 | <mi id='nav-33'>x</mi> |
1868 | 1 | </mrow> |
1869 | 1 | </mtd> |
1870 | 1 | <mtd id='nav-34'> |
1871 | 1 | <msup id='nav-35'> |
1872 | 1 | <mi id='nav-36'>e</mi> |
1873 | 1 | <mi id='nav-37'>x</mi> |
1874 | 1 | </msup> |
1875 | 1 | </mtd> |
1876 | 1 | <mtd id='nav-38'> |
1877 | 1 | <mrow id='nav-39'> |
1878 | 1 | <mn id='nav-40'>2</mn> |
1879 | 1 | <mo id='nav-41'>-</mo> |
1880 | 1 | <mi id='nav-42'>y</mi> |
1881 | 1 | </mrow> |
1882 | 1 | </mtd> |
1883 | 1 | </mtr> |
1884 | 1 | </mtable> |
1885 | 1 | </math>"; |
1886 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
1887 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1888 | 1 | let package_instance = package_instance.borrow(); |
1889 | 1 | let mathml = get_element(&package_instance); |
1890 | 1 | test_command("ZoomInAll", mathml, "nav-4"); |
1891 | 1 | test_command("MoveCellNext", mathml, "nav-6"); |
1892 | 1 | test_command("MoveCellNext", mathml, "nav-8"); |
1893 | 1 | test_command("MoveCellNext", mathml, "nav-8"); |
1894 | 1 | test_command("MoveCellDown", mathml, "nav-20"); |
1895 | 1 | test_command("MoveCellDown", mathml, "nav-27"); |
1896 | 1 | let speech = test_command("MoveCellDown", mathml, "nav-39"); |
1897 | 1 | assert_eq!(speech, "move down, row 4, column 3; 2 minus y"); |
1898 | 1 | let speech = test_command("MoveCellDown", mathml, "nav-39"); |
1899 | 1 | assert_eq!(speech, "no next row"); |
1900 | 1 | test_command("MoveCellPrevious", mathml, "nav-35"); |
1901 | 1 | test_command("ZoomIn", mathml, "nav-36"); |
1902 | 1 | test_command("MoveCellUp", mathml, "nav-25"); |
1903 | 1 | test_command("MoveCellUp", mathml, "nav-16"); |
1904 | 1 | test_command("MoveCellUp", mathml, "nav-6"); |
1905 | 1 | test_command("MoveCellUp", mathml, "nav-6"); |
1906 | ||
1907 | 1 | return Ok( () ); |
1908 | 1 | }); |
1909 | 1 | } |
1910 | ||
1911 | #[test] | |
1912 | 1 | fn move_cell_char_mode() -> Result<()> { |
1913 | 1 | let mathml_str = "<math id='nav-0'> |
1914 | 1 | <mtable id='nav-1'> |
1915 | 1 | <mtr id='nav-2'> |
1916 | 1 | <mtd id='nav-3'> <mn id='nav-4'>1</mn></mtd> |
1917 | 1 | <mtd id='nav-5'> <mn id='nav-6'>2</mn></mtd> |
1918 | 1 | <mtd id='nav-7'><mn id='nav-8'>3</mn> </mtd> |
1919 | 1 | </mtr> |
1920 | 1 | <mtr id='nav-9'> |
1921 | 1 | <mtd id='nav-10'> |
1922 | 1 | <mrow id='nav-11'> |
1923 | 1 | <mi id='nav-12'>x</mi> |
1924 | 1 | <mo id='nav-13'>-</mo> |
1925 | 1 | <mi id='nav-14'>y</mi> |
1926 | 1 | </mrow> |
1927 | 1 | </mtd> |
1928 | 1 | <mtd id='nav-15'> |
1929 | 1 | <mfrac id='nav-16'> |
1930 | 1 | <mn id='nav-17'>1</mn> |
1931 | 1 | <mn id='nav-18'>2</mn> |
1932 | 1 | </mfrac> |
1933 | 1 | </mtd> |
1934 | 1 | <mtd id='nav-19'> |
1935 | 1 | <mi id='nav-20'>z</mi> |
1936 | 1 | </mtd> |
1937 | 1 | </mtr> |
1938 | 1 | <mtr id='nav-21'> |
1939 | 1 | <mtd id='nav-22'><mn id='nav-23'>7</mn> </mtd> |
1940 | 1 | <mtd id='nav-24'><mn id='nav-25'>8</mn> </mtd> |
1941 | 1 | <mtd id='nav-26'> <mn id='nav-27'>9</mn></mtd> |
1942 | 1 | </mtr> |
1943 | 1 | <mtr id='nav-28'> |
1944 | 1 | <mtd id='nav-29'> |
1945 | 1 | <mrow id='nav-30'> |
1946 | 1 | <mi id='nav-31'>sin</mi> |
1947 | 1 | <mo id='nav-32'>⁡</mo> |
1948 | 1 | <mi id='nav-33'>x</mi> |
1949 | 1 | </mrow> |
1950 | 1 | </mtd> |
1951 | 1 | <mtd id='nav-34'> |
1952 | 1 | <msup id='nav-35'> |
1953 | 1 | <mi id='nav-36'>e</mi> |
1954 | 1 | <mi id='nav-37'>x</mi> |
1955 | 1 | </msup> |
1956 | 1 | </mtd> |
1957 | 1 | <mtd id='nav-38'> |
1958 | 1 | <mrow id='nav-39'> |
1959 | 1 | <mn id='nav-40'>2</mn> |
1960 | 1 | <mo id='nav-41'>-</mo> |
1961 | 1 | <mi id='nav-42'>y</mi> |
1962 | 1 | </mrow> |
1963 | 1 | </mtd> |
1964 | 1 | </mtr> |
1965 | 1 | </mtable> |
1966 | 1 | </math>"; |
1967 | 1 | init_default_prefs(mathml_str, "Character"); |
1968 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
1969 | 1 | let package_instance = package_instance.borrow(); |
1970 | 1 | let mathml = get_element(&package_instance); |
1971 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
1972 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
1973 | 1 | current_node: "nav-8".to_string(), |
1974 | 1 | current_node_offset: 0 |
1975 | 1 | }, "None") |
1976 | 1 | }); |
1977 | 1 | test_command("MoveNext", mathml, "nav-12"); |
1978 | 1 | test_command("MoveNext", mathml, "nav-13"); |
1979 | 1 | test_command("MoveNext", mathml, "nav-14"); |
1980 | 1 | test_command("MoveNext", mathml, "nav-17"); |
1981 | 1 | test_command("MovePrevious", mathml, "nav-14"); |
1982 | 1 | test_command("MoveCellNext", mathml, "nav-17"); |
1983 | 1 | test_command("MoveCellPrevious", mathml, "nav-14"); |
1984 | 1 | test_command("MovePrevious", mathml, "nav-13"); |
1985 | 1 | test_command("MovePrevious", mathml, "nav-12"); |
1986 | 1 | test_command("MoveCellPrevious", mathml, "nav-12"); |
1987 | 1 | test_command("MovePrevious", mathml, "nav-8"); |
1988 | 1 | test_command("MoveCellDown", mathml, "nav-20"); |
1989 | 1 | test_command("MoveCellDown", mathml, "nav-27"); |
1990 | 1 | test_command("MoveCellDown", mathml, "nav-40"); |
1991 | 1 | test_command("MoveCellDown", mathml, "nav-40"); |
1992 | 1 | test_command("MoveCellPrevious", mathml, "nav-37"); |
1993 | 1 | test_command("MoveCellUp", mathml, "nav-25"); |
1994 | ||
1995 | 1 | return Ok( () ); |
1996 | 1 | }); |
1997 | 1 | } |
1998 | ||
1999 | #[test] | |
2000 | 1 | fn placemarker() -> Result<()> { |
2001 | 1 | let mathml_str = "<math display='block' id='math'> |
2002 | 1 | <mrow displaystyle='true' id='mrow'> |
2003 | 1 | <mi id='a'>a</mi> |
2004 | 1 | <mo id='plus-1'>+</mo> |
2005 | 1 | <mi id='b'>b</mi> |
2006 | 1 | <mo id='plus-2'>+</mo> |
2007 | 1 | <mi id='c'>c</mi> |
2008 | 1 | </mrow> |
2009 | 1 | </math>"; |
2010 | 1 | init_default_prefs(mathml_str, "Character"); |
2011 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2012 | 1 | let package_instance = package_instance.borrow(); |
2013 | 1 | let mathml = get_element(&package_instance); |
2014 | 1 | test_command("MoveStart", mathml, "a"); |
2015 | 1 | test_command("SetPlacemarker0", mathml, "a"); |
2016 | 1 | test_command("MoveEnd", mathml, "c"); |
2017 | 1 | test_command("Read0", mathml, "c"); |
2018 | 1 | test_command("Describe0", mathml, "c"); |
2019 | 1 | test_command("SetPlacemarker1", mathml, "c"); |
2020 | 1 | test_command("MoveTo0", mathml, "a"); |
2021 | 1 | test_command("MoveTo1", mathml, "c"); |
2022 | 1 | test_command("MoveLastLocation", mathml, "a"); |
2023 | ||
2024 | 1 | return Ok( () ); |
2025 | 1 | }); |
2026 | 1 | } |
2027 | ||
2028 | #[test] | |
2029 | 1 | fn where_am_i_all() -> Result<()> { |
2030 | 1 | let mathml_str = "<math id='math'><mfrac id='mfrac'> |
2031 | 1 | <msup id='msup'><mi id='base'>b</mi><mn id='exp'>2</mn></msup> |
2032 | 1 | <mi id='denom'>d</mi> |
2033 | 1 | </mfrac></math>"; |
2034 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2035 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2036 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2037 | 1 | let package_instance = package_instance.borrow(); |
2038 | 1 | let mathml = get_element(&package_instance); |
2039 | 1 | NAVIGATION_STATE.with(|nav_stack| { |
2040 | 1 | nav_stack.borrow_mut().push(NavigationPosition{ |
2041 | 1 | current_node: "exp".to_string(), |
2042 | 1 | current_node_offset: 0 |
2043 | 1 | }, "None") |
2044 | 1 | }); |
2045 | // WhereAmIAll doesn't change the stack | |
2046 | 1 | let speech =test_command("WhereAmIAll", mathml, "exp"); |
2047 | // should be 2 "inside" strings corresponding to steps to the root | |
2048 | 1 | assert_eq!(speech, "2; inside; b squared; inside; the fraction with numerator; b squared; and denominator d"); |
2049 | 1 | return Ok( () ); |
2050 | 1 | }); |
2051 | 1 | } |
2052 | ||
2053 | #[test] | |
2054 | 1 | fn auto_zoom_out_mrow() -> Result<()> { |
2055 | 1 | let mathml_str = "<math id='math'> |
2056 | 1 | <mrow id='id-1'> |
2057 | 1 | <mrow id='id-2'> |
2058 | 1 | <mrow id='2ax'> |
2059 | 1 | <mn id='2'>2</mn> |
2060 | 1 | <mo id='id-5'>⁢</mo> |
2061 | 1 | <mi id='a'>a</mi> |
2062 | 1 | <mo id='id-7'>⁢</mo> |
2063 | 1 | <mi id='x'>x</mi> |
2064 | 1 | </mrow> |
2065 | 1 | <mo id='plus'>+</mo> |
2066 | 1 | <mi id='b'>b</mi> |
2067 | 1 | </mrow> |
2068 | 1 | <mo id='equal'>=</mo> |
2069 | 1 | <mn id='10'>10</mn> |
2070 | 1 | </mrow> |
2071 | 1 | </math>"; |
2072 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2073 | 1 | set_preference("AutoZoomOut", "False") |
2074 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2075 | 1 | let package_instance = package_instance.borrow(); |
2076 | 1 | let mathml = get_element(&package_instance); |
2077 | 1 | test_command("ZoomInAll", mathml, "2"); |
2078 | 1 | test_command("MoveNext", mathml, "a"); |
2079 | 1 | test_command("MoveNext", mathml, "x"); |
2080 | 1 | test_command("MoveNext", mathml, "plus"); |
2081 | 1 | test_command("MovePrevious", mathml, "2ax"); |
2082 | 1 | return Ok( () ); |
2083 | 1 | }); |
2084 | 1 | } |
2085 | ||
2086 | #[test] | |
2087 | 1 | fn auto_zoom_out_fraction() -> Result<()> { |
2088 | 1 | let mathml_str = "<math id='math'> |
2089 | 1 | <mrow id='mrow'> |
2090 | 1 | <mfrac id='frac'> |
2091 | 1 | <mrow id='num'><mi id='a'>a</mi><mo id='plus'>+</mo><mn id='1'>1</mn></mrow> |
2092 | 1 | <mrow id='denom'><mn id='2'>2</mn><mo id='invisible-times'>⁢</mo><mi id='b'>b</mi></mrow> |
2093 | 1 | </mfrac> |
2094 | 1 | <mo id='minus'>-</mo> |
2095 | 1 | <mn id='3'>3</mn> |
2096 | 1 | </mrow> |
2097 | 1 | </math>"; |
2098 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2099 | 1 | set_preference("AutoZoomOut", "False") |
2100 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2101 | 1 | let package_instance = package_instance.borrow(); |
2102 | 1 | let mathml = get_element(&package_instance); |
2103 | 1 | test_command("ZoomIn", mathml, "frac"); |
2104 | 1 | test_command("ZoomIn", mathml, "num"); |
2105 | 1 | test_command("MoveNext", mathml, "denom"); |
2106 | 1 | test_command("MoveNext", mathml, "denom"); |
2107 | 1 | test_command("MovePrevious", mathml, "num"); |
2108 | 1 | test_command("MovePrevious", mathml, "num"); |
2109 | 1 | test_command("ZoomOut", mathml, "frac"); |
2110 | 1 | test_command("MoveNext", mathml, "minus"); |
2111 | 1 | return Ok( () ); |
2112 | 1 | }); |
2113 | 1 | } |
2114 | ||
2115 | #[test] | |
2116 | 1 | fn zoom_root() -> Result<()> { |
2117 | 1 | let mathml_str = r#"<math display='block' id='id-0'> |
2118 | 1 | <mrow id='id-1'> |
2119 | 1 | <mo id='id-9'>±</mo> |
2120 | 1 | <msqrt id='id-10'> |
2121 | 1 | <mrow id='id-11'> |
2122 | 1 | <msup id='id-12'> <mi id='id-13'>b</mi> <mn id='id-14'>2</mn> </msup> |
2123 | 1 | <mo id='id-15'>-</mo> |
2124 | 1 | <mn id='id-17'>4</mn> |
2125 | 1 | </mrow> |
2126 | 1 | </msqrt> |
2127 | 1 | </mrow> |
2128 | 1 | </math>"#; |
2129 | ||
2130 | 1 | test_mode(mathml_str, "Enhanced") |
2131 | 1 | test_mode(mathml_str, "Simple") |
2132 | 1 | test_mode(mathml_str, "Character") |
2133 | 1 | return Ok( () ); |
2134 | ||
2135 | 3 | fn test_mode(mathml_str: &str, mode: &str) -> Result<()> { |
2136 | 3 | init_default_prefs(mathml_str, mode); |
2137 | 3 | set_preference("AutoZoomOut", "False") |
2138 | 3 | return MATHML_INSTANCE.with(|package_instance| { |
2139 | 3 | debug!("--- Testing mode {mode} ---"); |
2140 | 3 | let package_instance = package_instance.borrow(); |
2141 | 3 | let mathml = get_element(&package_instance); |
2142 | 3 | test_command("ZoomIn", mathml, "id-9"); |
2143 | 3 | debug!("\nStart zoom in"); |
2144 | 3 | match mode { |
2145 | 3 | "Enhanced" => { |
2146 | 1 | test_command("MoveNext", mathml, "id-10"); |
2147 | 1 | let speech = test_command("ZoomIn", mathml, "id-11"); |
2148 | 1 | assert_eq!(speech, "zoom in; in root; b squared minus 4"); // only one arg, so don't say "in root" |
2149 | 1 | let speech = test_command("ZoomIn", mathml, "id-12"); |
2150 | 1 | assert_eq!(speech, "zoom in; b squared"); // only one arg, so don't say "in root" |
2151 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2152 | 1 | assert_eq!(speech, "zoom in; in base; b"); |
2153 | }, | |
2154 | 2 | "Simple" => { |
2155 | 1 | test_command("MoveNext", mathml, "id-10"); |
2156 | 1 | let speech = test_command("ZoomIn", mathml, "id-12"); |
2157 | 1 | assert_eq!(speech, "zoom in; in root; b squared"); |
2158 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2159 | 1 | assert_eq!(speech, "zoom in; in base; b"); |
2160 | }, | |
2161 | _ => { // "Character" | |
2162 | 1 | let speech = test_command("MoveNext", mathml, "id-13"); |
2163 | 1 | assert_eq!(speech, "move right; in root; in base; b"); |
2164 | } | |
2165 | } | |
2166 | 3 | let squared_speech = if mode == "Character" { |
2167 | 3 | let sqrt_speech = if mode == "Character" { |
2168 | 3 | let speech = test_command("ZoomOut", mathml, "id-12"); |
2169 | 3 | assert_eq!(speech, format!("zoom out; out of base; {squared_speech}")); |
2170 | 3 | let speech = test_command("ZoomOut", mathml, "id-11"); |
2171 | 3 | assert_eq!(speech, format!("zoom out; {squared_speech} minus 4")); |
2172 | 3 | let speech = test_command("ZoomOut", mathml, "id-10"); |
2173 | 3 | assert_eq!(speech, format!("zoom out; out of root; the {sqrt_speech} of {squared_speech} minus 4, end root",)); |
2174 | 3 | return Ok( () ); |
2175 | 3 | }); |
2176 | 3 | } |
2177 | 1 | } |
2178 | ||
2179 | #[test] | |
2180 | 1 | fn matrix_speech() -> Result<()> { |
2181 | 1 | let mathml_str = r#"<math id='math'> |
2182 | 1 | <mrow id='mrow'> |
2183 | 1 | <mo id='open'>[</mo> |
2184 | 1 | <mtable columnspacing='1em' rowspacing='4pt' id='table'> |
2185 | 1 | <mtr id='row-1'> |
2186 | 1 | <mtd id='1-1'><mn id='id-6'>9</mn></mtd> |
2187 | 1 | <mtd id='1-2'><mrow id='id-8'><mo id='id-9'>-</mo><mn id='id-10'>13</mn></mrow></mtd> |
2188 | 1 | </mtr> |
2189 | 1 | <mtr id='row-2'> |
2190 | 1 | <mtd id='2-1'><mn id='id-13'>5</mn></mtd> |
2191 | 1 | <mtd id='2-2'><mo id='id-16'>-</mo><mn id='id-17'>6</mn></mtd> |
2192 | 1 | </mtr> |
2193 | 1 | </mtable> |
2194 | 1 | <mo id='close'>]</mo> |
2195 | 1 | </mrow> |
2196 | 1 | </math>"#; |
2197 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2198 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2199 | 1 | let package_instance = package_instance.borrow(); |
2200 | 1 | let mathml = get_element(&package_instance); |
2201 | 1 | test_command("ZoomIn", mathml, "row-1"); |
2202 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2203 | 1 | assert_eq!(speech, "move right; row 2; 5, negative 6"); |
2204 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2205 | 1 | assert_eq!(speech, "zoom in; column 1; 5"); |
2206 | 1 | let speech = test_command("ZoomOut", mathml, "row-2"); |
2207 | 1 | assert_eq!(speech, "zoom out; row 2; 5, negative 6"); |
2208 | 1 | let speech = test_command("ZoomOut", mathml, "table"); |
2209 | 1 | assert_eq!(speech, "zoom out; the 2 by 2 matrix; row 1; 9, negative 13; row 2; 5, negative 6"); |
2210 | 1 | return Ok( () ); |
2211 | 1 | }); |
2212 | 1 | } |
2213 | ||
2214 | #[test] | |
2215 | 1 | fn chem_speech() -> Result<()> { |
2216 | // this comes from bug 218 | |
2217 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2218 | 1 | <mrow data-chem-formula='5' id='id-1'> |
2219 | 1 | <msub data-chem-formula='1' id='id-2'> |
2220 | 1 | <mi data-chem-element='1' id='id-3'>H</mi> |
2221 | 1 | <mn id='id-4'>2</mn> |
2222 | 1 | </msub> |
2223 | 1 | <mo data-chem-formula-op='0' id='id-5'>⁣</mo> |
2224 | 1 | <mi data-chem-element='1' id='id-6'>S</mi> |
2225 | 1 | <mo data-chem-formula-op='0' id='id-7'>⁣</mo> |
2226 | 1 | <msub data-chem-formula='1' id='id-8'> |
2227 | 1 | <mi data-chem-element='1' id='id-9'>O</mi> |
2228 | 1 | <mn id='id-10'>4</mn> |
2229 | 1 | </msub> |
2230 | 1 | </mrow> |
2231 | 1 | </math>"; |
2232 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2233 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2234 | 1 | let package_instance = package_instance.borrow(); |
2235 | 1 | let mathml = get_element(&package_instance); |
2236 | 1 | test_command("ZoomIn", mathml, "id-2"); |
2237 | 1 | let speech = test_command("MoveNext", mathml, "id-6"); |
2238 | // tables need to check their parent for proper speech | |
2239 | 1 | assert_eq!(speech, "move right; cap s"); |
2240 | 1 | return Ok( () ); |
2241 | 1 | }); |
2242 | 1 | } |
2243 | ||
2244 | #[test] | |
2245 | 1 | fn determinant_speech() -> Result<()> { |
2246 | 1 | let mathml_str = "<math id='math'> |
2247 | 1 | <mrow id='mrow'> |
2248 | 1 | <mo id='open'>|</mo> |
2249 | 1 | <mtable columnspacing='1em' rowspacing='4pt' id='table'> |
2250 | 1 | <mtr id='row-1'> |
2251 | 1 | <mtd id='1-1'><mn id='id-6'>9</mn></mtd> |
2252 | 1 | <mtd id='1-2'><mrow id='id-8'><mo id='id-9'>-</mo><mn id='id-10'>13</mn></mrow></mtd> |
2253 | 1 | </mtr> |
2254 | 1 | <mtr id='row-2'> |
2255 | 1 | <mtd id='2-1'><mn id='id-13'>5</mn></mtd> |
2256 | 1 | <mtd id='2-2'><mrow id='row2-negative'><mo id='id-16'>-</mo><mn id='id-17'>6</mn></mrow></mtd> |
2257 | 1 | </mtr> |
2258 | 1 | </mtable> |
2259 | 1 | <mo id='close'>|</mo> |
2260 | 1 | </mrow> |
2261 | 1 | </math>"; |
2262 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2263 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2264 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2265 | 1 | let package_instance = package_instance.borrow(); |
2266 | 1 | let mathml = get_element(&package_instance); |
2267 | 1 | let speech = test_command("ZoomIn", mathml, "row-1"); |
2268 | 1 | assert_eq!(speech, "zoom in; row 1; 9, negative 13"); |
2269 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2270 | 1 | assert_eq!(speech, "move right; row 2; 5, negative 6"); |
2271 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2272 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2273 | 1 | let speech = test_command("ZoomIn", mathml, "id-13"); |
2274 | 1 | assert_eq!(speech, "zoom in; column 1; 5"); |
2275 | 1 | let speech = test_command("MoveNext", mathml, "row2-negative"); |
2276 | 1 | assert_eq!(speech, "move right; column 2, negative 6"); |
2277 | 1 | let speech = test_command("ZoomOutAll", mathml, "table"); |
2278 | 1 | assert_eq!(speech, "zoomed out all of the way; the 2 by 2 determinant; row 1; 9, negative 13; row 2; 5, negative 6"); |
2279 | 1 | return Ok( () ); |
2280 | 1 | }); |
2281 | 1 | } |
2282 | ||
2283 | #[test] | |
2284 | 1 | fn cases_speech() -> Result<()> { |
2285 | 1 | let mathml_str = "<math id='id-0'> |
2286 | 1 | <mrow id='id-1'> |
2287 | 1 | <mo id='open'>{</mo> |
2288 | 1 | <mtable columnalign='left left' columnspacing='1em' displaystyle='false' rowspacing='.2em' id='table'> |
2289 | 1 | <mtr id='row-1'> |
2290 | 1 | <mtd id='id-5'><mrow id='id-6'><mrow id='id-7'><mo id='id-8'>-</mo><mi id='id-9'>x</mi></mrow><mo id='id-10'>,</mo></mrow></mtd> |
2291 | 1 | <mtd id='id-11'><mrow id='id-12'><mrow id='id-13'><mtext id='id-14'>if</mtext><mo id='id-15'>⁢</mo><mi id='id-16'>x</mi></mrow><mo id='id-17'><</mo><mn id='id-18'>0</mn></mrow></mtd> |
2292 | 1 | </mtr> |
2293 | 1 | <mtr id='row-2'> |
2294 | 1 | <mtd id='id-20'><mrow id='id-21'><mrow id='id-22'><mo id='id-23'>+</mo><mi id='id-24'>x</mi></mrow><mo id='id-25'>,</mo></mrow></mtd> |
2295 | 1 | <mtd id='id-26'><mrow id='id-27'><mrow id='id-28'><mtext id='id-29'>if</mtext><mo id='id-30'>⁢</mo><mi id='id-31'>x</mi></mrow><mo id='id-32'>≥</mo><mn id='id-33'>0</mn></mrow></mtd> |
2296 | 1 | </mtr> |
2297 | 1 | </mtable> |
2298 | 1 | </mrow> |
2299 | 1 | </math>"; |
2300 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2301 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2302 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2303 | 1 | let package_instance = package_instance.borrow(); |
2304 | 1 | let mathml = get_element(&package_instance); |
2305 | 1 | test_command("ZoomIn", mathml, "row-1"); |
2306 | 1 | let speech = test_command("MovePrevious", mathml, "row-1"); |
2307 | 1 | assert_eq!(speech, "move left; start of math"); |
2308 | 1 | let speech = test_command("MoveNext", mathml, "row-2"); |
2309 | 1 | assert_eq!(speech, "move right; case 2; positive x comma; if x, is greater than or equal to 0"); |
2310 | 1 | let speech = test_command("ZoomOut", mathml, "table"); |
2311 | 1 | assert_eq!(speech, "zoom out; 2 cases; case 1; negative x comma; if x is less than 0; case 2; positive x comma; if x, is greater than or equal to 0"); |
2312 | 1 | let speech = test_command("ZoomIn", mathml, "row-1"); |
2313 | 1 | assert_eq!(speech, "zoom in; case 1; negative x comma; if x is less than 0"); |
2314 | 1 | set_preference("NavMode", "Character").unwrap(); |
2315 | 1 | let speech = test_command("MovePrevious", mathml, "open"); |
2316 | 1 | assert_eq!(speech, "move left; open brace"); |
2317 | 1 | return Ok( () ); |
2318 | 1 | }); |
2319 | 1 | } |
2320 | ||
2321 | #[test] | |
2322 | 1 | fn base_superscript() -> Result<()> { |
2323 | // bug #217 -- zoom into base of parenthesized script | |
2324 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2325 | 1 | <msup id='id-1'> |
2326 | 1 | <mrow id='id-2'> |
2327 | 1 | <mo stretchy='false' id='id-3'>(</mo> |
2328 | 1 | <mrow id='id-4'> |
2329 | 1 | <mn id='id-5'>2</mn> |
2330 | 1 | <mo id='id-6'>⁢</mo> |
2331 | 1 | <mi id='id-7'>x</mi> |
2332 | 1 | </mrow> |
2333 | 1 | <mo stretchy='false' id='id-8'>)</mo> |
2334 | 1 | </mrow> |
2335 | 1 | <mn id='id-9'>2</mn> |
2336 | 1 | </msup> |
2337 | 1 | </math>"; |
2338 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2339 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2340 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2341 | 1 | let package_instance = package_instance.borrow(); |
2342 | 1 | let mathml = get_element(&package_instance); |
2343 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2344 | 1 | assert_eq!(speech, "zoom in; in base; 2 x"); |
2345 | 1 | let speech = test_command("MoveNext", mathml, "id-9"); |
2346 | 1 | assert_eq!(speech, "move right; in exponent; 2"); |
2347 | 1 | return Ok( () ); |
2348 | 1 | }); |
2349 | 1 | } |
2350 | ||
2351 | #[test] | |
2352 | 1 | fn binomial_intent() -> Result<()> { |
2353 | 1 | let mathml_str = "<math display='block' id='id-0'> |
2354 | 1 | <mrow intent='binomial($n,$k)' id='id-1'> |
2355 | 1 | <mo id='id-2'>(</mo> |
2356 | 1 | <mfrac linethickness='0pt' id='id-3'> |
2357 | 1 | <mi arg='n' id='id-4'>n</mi> |
2358 | 1 | <mi arg='k' id='id-5'>k</mi> |
2359 | 1 | </mfrac> |
2360 | 1 | <mo id='id-6'>)</mo> |
2361 | 1 | </mrow> |
2362 | 1 | </math>"; |
2363 | 1 | init_default_prefs(mathml_str, "Character"); |
2364 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2365 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2366 | 1 | let package_instance = package_instance.borrow(); |
2367 | 1 | let mathml = get_element(&package_instance); |
2368 | 1 | debug!("Character mode"); |
2369 | 1 | let speech = test_command("MoveStart", mathml, "id-2"); |
2370 | 1 | assert_eq!(speech, "move to start of math; open paren"); |
2371 | 1 | let speech = test_command("MoveNext", mathml, "id-4"); |
2372 | // I'm not keen on the use of numerator/denominator here, but character mode turns off intent | |
2373 | 1 | assert_eq!(speech, "move right; in numerator; n"); |
2374 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2375 | 1 | assert_eq!(speech, "move right; in denominator; k"); |
2376 | 1 | debug!("before zoom out"); |
2377 | 1 | let speech = test_command("ZoomOut", mathml, "id-3"); |
2378 | 1 | assert_eq!(speech, "zoom out; out of denominator; n over k"); |
2379 | // let speech = test_command("ZoomOut", mathml, "id-1"); | |
2380 | // assert_eq!(speech, "zoom out; open paren n over k, close paren"); | |
2381 | ||
2382 | 1 | set_preference("NavMode", "Simple").unwrap(); |
2383 | 1 | debug!("Simple mode"); |
2384 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2385 | 1 | assert_eq!(speech, "zoom in; in part 1; n"); |
2386 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2387 | 1 | assert_eq!(speech, "move right; in part 2; k"); |
2388 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2389 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2390 | 1 | let speech = test_command("ZoomOut", mathml, "id-1-literal-0"); |
2391 | 1 | assert_eq!(speech, "zoom out; out of part 2; n choose k"); |
2392 | ||
2393 | 1 | set_preference("NavMode", "Enhanced").unwrap(); |
2394 | 1 | debug!("Enhanced mode"); |
2395 | 1 | let speech = test_command("ZoomIn", mathml, "id-4"); |
2396 | 1 | assert_eq!(speech, "zoom in; in part 1; n"); |
2397 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2398 | 1 | assert_eq!(speech, "move right; in part 2; k"); |
2399 | 1 | let speech = test_command("MoveNext", mathml, "id-5"); |
2400 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2401 | 1 | let speech = test_command("ZoomOut", mathml, "id-1-literal-0"); |
2402 | 1 | assert_eq!(speech, "zoom out; out of part 2; n choose k"); |
2403 | ||
2404 | 1 | return Ok( () ); |
2405 | 1 | }); |
2406 | 1 | } |
2407 | ||
2408 | #[test] | |
2409 | 1 | fn matrix_literal_intent() -> Result<()> { |
2410 | 1 | let mathml_str = r#"<math display='block' id='id-0'> |
2411 | 1 | <mrow intent='$m' id='id-1'> |
2412 | 1 | <mo id='id-2'>(</mo> |
2413 | 1 | <mtable arg='m' intent='_diagonal:prefix(1,2,3)' id='id-3'> |
2414 | 1 | <mtr id='id-4'> |
2415 | 1 | <mtd id='id-5'><mn id='id-6'>1</mn></mtd> |
2416 | 1 | <mtd id='id-7'><mn id='id-8'>0</mn></mtd> |
2417 | 1 | <mtd id='id-9'><mn id='id-10'>0</mn></mtd> |
2418 | 1 | </mtr> |
2419 | 1 | <mtr id='id-11'> |
2420 | 1 | <mtd id='id-12'><mn id='id-13'>0</mn></mtd> |
2421 | 1 | <mtd id='id-14'><mn id='id-15'>2</mn></mtd> |
2422 | 1 | <mtd id='id-16'><mn id='id-17'>0</mn></mtd> |
2423 | 1 | </mtr> |
2424 | 1 | <mtr id='id-18'> |
2425 | 1 | <mtd id='id-19'><mn id='id-20'>0</mn></mtd> |
2426 | 1 | <mtd id='id-21'><mn id='id-22'>0</mn></mtd> |
2427 | 1 | <mtd id='id-23'><mn id='id-24'>3</mn></mtd> |
2428 | 1 | </mtr> |
2429 | 1 | </mtable> |
2430 | 1 | <mo id='id-25'>)</mo> |
2431 | 1 | </mrow> |
2432 | 1 | </math>"#; |
2433 | 1 | init_default_prefs(mathml_str, "Simple"); |
2434 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2435 | 1 | let package_instance = package_instance.borrow(); |
2436 | 1 | let mathml = get_element(&package_instance); |
2437 | 1 | let speech = test_command("ZoomIn", mathml, "id-3-literal-1"); |
2438 | 1 | assert_eq!(speech, "zoom in; 1"); |
2439 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-2"); |
2440 | 1 | assert_eq!(speech, "move right; 2"); |
2441 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-3"); |
2442 | 1 | assert_eq!(speech, "move right; 3"); |
2443 | 1 | let speech = test_command("MoveNext", mathml, "id-3-literal-3"); |
2444 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2445 | 1 | let speech = test_command("ZoomOut", mathml, "id-3-literal-0"); |
2446 | 1 | assert_eq!(speech, "zoom out; diagonal 1 2 3"); |
2447 | ||
2448 | 1 | return Ok( () ); |
2449 | 1 | }); |
2450 | 1 | } |
2451 | ||
2452 | #[test] | |
2453 | 1 | fn absolute_value() -> Result<()> { |
2454 | 1 | let mathml_str = "<math id='math'> |
2455 | 1 | <mrow id='expr'> |
2456 | 1 | <mn id='2'>2</mn> |
2457 | 1 | <mrow id='abs'> |
2458 | 1 | <mo id='start'>|</mo> |
2459 | 1 | <mi id='x'>x</mi> |
2460 | 1 | <mo id='end'>|</mo> |
2461 | 1 | </mrow> |
2462 | 1 | </mrow> |
2463 | 1 | </math>"; |
2464 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2465 | 1 | set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
2466 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2467 | 1 | let package_instance = package_instance.borrow(); |
2468 | 1 | let mathml = get_element(&package_instance); |
2469 | 1 | let speech = test_command("ZoomIn", mathml, "2"); |
2470 | 1 | assert_eq!(speech, "zoom in; 2"); |
2471 | 1 | let speech = test_command("MoveNext", mathml, "abs"); |
2472 | 1 | assert_eq!(speech, "move right; the absolute value of x"); |
2473 | 1 | let speech = test_command("ZoomIn", mathml, "x"); |
2474 | 1 | assert_eq!(speech, "zoom in; in absolute value; x"); |
2475 | 1 | let speech = test_command("MoveNext", mathml, "x"); |
2476 | 1 | assert_eq!(speech, "cannot move right, end of math"); |
2477 | 1 | set_preference("NavMode", "Character").unwrap(); |
2478 | 1 | let speech = test_command("MoveNext", mathml, "end"); |
2479 | 1 | assert_eq!(speech, "move right; vertical line"); |
2480 | 1 | let speech = test_command("MoveLineStart", mathml, "2"); |
2481 | 1 | assert_eq!(speech, "move to start of line; 2"); |
2482 | 1 | let speech = test_command("MoveNext", mathml, "start"); |
2483 | 1 | assert_eq!(speech, "move right; vertical line"); |
2484 | 1 | return Ok( () ); |
2485 | 1 | }); |
2486 | 1 | } |
2487 | ||
2488 | #[test] | |
2489 | 1 | fn read_and_describe_fraction() -> Result<()> { |
2490 | 1 | let mathml_str = "<math id='math'> |
2491 | 1 | <mrow id='mrow'> |
2492 | 1 | <mfrac id='frac'> |
2493 | 1 | <mrow id='numerator'><mi>b</mi><mo>+</mo><mn>1</mn></mrow> |
2494 | 1 | <mn id='denom'>3</mn> |
2495 | 1 | </mfrac> |
2496 | 1 | <mo id='minus'>-</mo> |
2497 | 1 | <mn id='3'>3</mn> |
2498 | 1 | </mrow> |
2499 | 1 | </math>"; |
2500 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2501 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2502 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2503 | 1 | let package_instance = package_instance.borrow(); |
2504 | 1 | let mathml = get_element(&package_instance); |
2505 | 1 | test_command("ZoomIn", mathml, "frac"); |
2506 | 1 | let speech = test_command("ReadCurrent", mathml, "frac"); |
2507 | 1 | assert_eq!(speech, "read current; fraction, b plus 1, over 3, end fraction"); |
2508 | 1 | let speech = test_command("DescribeCurrent", mathml, "frac"); |
2509 | 1 | assert_eq!(speech, "describe current; fraction"); |
2510 | 1 | return Ok( () ); |
2511 | 1 | }); |
2512 | 1 | } |
2513 | ||
2514 | ||
2515 | #[test] | |
2516 | 1 | fn read_and_describe_mrow() -> Result<()> { |
2517 | 1 | let mathml_str = "<math id='math'> |
2518 | 1 | <mrow id='mrow'> |
2519 | 1 | <mn>1</mn><mo>+</mo> |
2520 | 1 | <mn>2</mn><mo>+</mo> |
2521 | 1 | <mn>3</mn><mo>+</mo> |
2522 | 1 | <mn>4</mn><mo>+</mo> |
2523 | 1 | <mn>5</mn><mo>+</mo> |
2524 | 1 | <mn>6</mn><mo>+</mo> |
2525 | 1 | <mn>7</mn> |
2526 | 1 | </mrow> |
2527 | 1 | </math>"; |
2528 | 1 | init_default_prefs(mathml_str, "Enhanced"); |
2529 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2530 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2531 | 1 | let package_instance = package_instance.borrow(); |
2532 | 1 | let mathml = get_element(&package_instance); |
2533 | 1 | let speech = test_command("ZoomOutAll", mathml, "mrow"); |
2534 | 1 | assert_eq!(speech, "zoomed out all of the way; 1 plus 2 plus 3 plus 4 plus 5 plus 6 plus 7"); |
2535 | 1 | let speech = test_command("ReadCurrent", mathml, "mrow"); |
2536 | 1 | assert_eq!(speech, "read current; 1 plus 2 plus 3 plus 4 plus 5 plus 6 plus 7"); |
2537 | 1 | let speech = test_command("DescribeCurrent", mathml, "mrow"); |
2538 | 1 | assert_eq!(speech, "describe current; 1 plus 2 plus 3 and so on"); |
2539 | 1 | return Ok( () ); |
2540 | 1 | }); |
2541 | 1 | } |
2542 | ||
2543 | ||
2544 | #[test] | |
2545 | 1 | fn read_next_invisible_char() -> Result<()> { |
2546 | 1 | let mathml_str = "<math id='id-0'> |
2547 | 1 | <mrow id='id-1'> |
2548 | 1 | <mi id='id-2'>x</mi> |
2549 | 1 | <mo id='id-3'>⁢</mo> |
2550 | 1 | <mi id='id-4'>y</mi> |
2551 | 1 | </mrow> |
2552 | 1 | </math>"; |
2553 | 1 | init_default_prefs(mathml_str, "Simple"); |
2554 | 1 | set_preference("SpeechStyle", "SimpleSpeak").unwrap(); |
2555 | 1 | return MATHML_INSTANCE.with(|package_instance| { |
2556 | 1 | let package_instance = package_instance.borrow(); |
2557 | 1 | let mathml = get_element(&package_instance); |
2558 | 1 | let speech = test_command("ZoomIn", mathml, "id-2"); |
2559 | 1 | assert_eq!(speech, "zoom in; x"); |
2560 | 1 | let speech = test_command("ToggleZoomLockUp", mathml, "id-2"); |
2561 | 1 | assert_eq!(speech, "enhanced mode; x"); |
2562 | 1 | let speech = test_command("ReadNext", mathml, "id-2"); |
2563 | 1 | assert_eq!(speech, "read right; y"); |
2564 | 1 | return Ok( () ); |
2565 | 1 | }); |
2566 | 1 | } |
2567 | ||
2568 | ||
2569 | #[test] | |
2570 | 1 | fn basic_language_test() -> Result<()> { |
2571 | // this is basically a sanity check that all the language's navigation.yaml files are at least syntactically correct | |
2572 | // FIX: should look through the Languages dir and figure this is out | |
2573 | 1 | let mathml_str = "<math id='math'> |
2574 | 1 | <mrow id='contents'> |
2575 | 1 | <mrow id='lhs'> |
2576 | 1 | <mrow id='term'> |
2577 | 1 | <mn id='2'>2</mn> |
2578 | 1 | <mo id='invisible-times'>⁢</mo> |
2579 | 1 | <msup id='msup'> |
2580 | 1 | <mi id='x'>x</mi> |
2581 | 1 | <mn id='3'>3</mn> |
2582 | 1 | </msup> |
2583 | 1 | </mrow> |
2584 | 1 | <mo id='plus'>+</mo> |
2585 | 1 | <mn id='1'>1</mn> |
2586 | 1 | </mrow> |
2587 | 1 | <mo id='id-11'>=</mo> |
2588 | 1 | <mi id='id-12'>y</mi> |
2589 | 1 | </mrow> |
2590 | 1 | </math>"; |
2591 | ||
2592 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
2593 | 11 | for lang in |
2594 | 11 | test_language(&lang, mathml_str); |
2595 | 11 | } |
2596 | 1 | return Ok( () ); |
2597 | ||
2598 | 11 | fn test_language(lang: &str, mathml_str: &str) { |
2599 | 11 | init_default_prefs(mathml_str, "Enhanced"); |
2600 | 11 | set_preference("Language", lang).unwrap(); |
2601 | ||
2602 | 11 | set_preference("NavMode", "Enhanced").unwrap(); |
2603 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2604 | 11 | let package_instance = package_instance.borrow(); |
2605 | 11 | let mathml = get_element(&package_instance); |
2606 | 11 | test_command("ZoomInAll", mathml, "2"); |
2607 | 11 | test_command("MoveNext", mathml, "msup"); |
2608 | 11 | test_command("MoveNext", mathml, "plus"); |
2609 | 11 | test_command("MovePrevious", mathml, "term"); |
2610 | 11 | test_command("MovePrevious", mathml, "term"); |
2611 | 11 | test_command("ZoomOutAll", mathml, "contents"); |
2612 | 11 | }); |
2613 | ||
2614 | 11 | set_preference("NavMode", "Simple").unwrap(); |
2615 | 11 | MATHML_INSTANCE.with(|package_instance: &RefCell<Package>| { |
2616 | 11 | let package_instance = package_instance.borrow(); |
2617 | 11 | let mathml = get_element(&package_instance); |
2618 | 11 | test_command("ZoomInAll", mathml, "2"); |
2619 | 11 | test_command("MoveNext", mathml, "msup"); |
2620 | 11 | test_command("MoveNext", mathml, "plus"); |
2621 | 11 | test_command("MovePrevious", mathml, "msup"); |
2622 | 11 | test_command("MovePrevious", mathml, "2"); |
2623 | 11 | test_command("MovePrevious", mathml, "2"); |
2624 | 11 | test_command("ZoomOutAll", mathml, "contents"); |
2625 | 11 | }); |
2626 | ||
2627 | 11 | set_preference("NavMode", "Character").unwrap(); |
2628 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2629 | 11 | let package_instance = package_instance.borrow(); |
2630 | 11 | let mathml = get_element(&package_instance); |
2631 | 11 | test_command("ZoomIn", mathml, "2"); |
2632 | 11 | test_command("MoveNext", mathml, "x"); |
2633 | 11 | test_command("MoveNext", mathml, "3"); |
2634 | 11 | test_command("MoveNext", mathml, "plus"); |
2635 | 11 | test_command("MovePrevious", mathml, "3"); |
2636 | 11 | test_command("MovePrevious", mathml, "x"); |
2637 | 11 | test_command("MovePrevious", mathml, "2"); |
2638 | 11 | test_command("MovePrevious", mathml, "2"); |
2639 | 11 | }); |
2640 | ||
2641 | // simple sanity check that "overview.yaml" doesn't have a syntax error | |
2642 | 11 | set_preference("Overview", "True").unwrap(); |
2643 | 11 | set_preference("NavMode", "Character").unwrap(); |
2644 | 11 | MATHML_INSTANCE.with(|package_instance| { |
2645 | 11 | let package_instance = package_instance.borrow(); |
2646 | 11 | let mathml = get_element(&package_instance); |
2647 | 11 | test_command("ZoomIn", mathml, "2"); |
2648 | 11 | }); |
2649 | 11 | } |
2650 | 1 | } |
2651 | } |
Line | Count | Source |
1 | //! Preferences come from either the user or are programmatically set by the AT. | |
2 | //! Either source can set any preference, but users and AT typically set different preferences. | |
3 | //! | |
4 | //! User prefs are read in from a YAML file (prefs.yaml). They can be written by hand. | |
5 | //! In the future, there will hopefully be a nice UI that writes out the YAML file. | |
6 | //! | |
7 | //! AT prefs are set via the API given in the [crate::interface] module. | |
8 | //! These in turn call [`PreferenceManager::set_string_pref`] and [`PreferenceManager::set_api_float_pref`]. | |
9 | //! Ultimately, user and api prefs are stored in a hashmap. | |
10 | //! | |
11 | //! Preferences can be found in a few places: | |
12 | //! 1. Language-independent prefs found in the Rules dir | |
13 | //! 2. Language-specific prefs | |
14 | //! 3. Language-region-specific prefs | |
15 | //! | |
16 | //! If there are multiple definitions, the later ones overwrite the former ones. | |
17 | //! This means that region-specific variants will overwrite more general variants. | |
18 | //! | |
19 | //! Note: there are a number of public 'get_xxx' functions that really are meant to be public only to the [crate::speech] module as speech needs access | |
20 | //! to the preferences to generate the speech. | |
21 | #![allow(clippy::needless_return)] | |
22 | use yaml_rust::{Yaml, YamlLoader}; | |
23 | use crate::pretty_print::yaml_to_string; | |
24 | use crate::tts::TTS; | |
25 | use std::cell::RefCell; | |
26 | use std::rc::Rc; | |
27 | use log::{debug, error, warn}; | |
28 | use std::path::{Path, PathBuf}; | |
29 | use std::sync::LazyLock; | |
30 | use crate::speech::{as_str_checked, RulesFor, FileAndTime}; | |
31 | use std::collections::{HashMap, HashSet}; | |
32 | use phf::phf_set; | |
33 | use crate::shim_filesystem::*; | |
34 | use crate::errors::*; | |
35 | ||
36 | /// Use to indicate preference not found with Preference::to_string() | |
37 | pub static NO_PREFERENCE: &str = "\u{FFFF}"; | |
38 | ||
39 | 3 | static DEFAULT_LANG: LazyLock<Yaml> = LazyLock::new(|| Yaml::String("en".to_string())); |
40 | ||
41 | ||
42 | // Preferences are recorded here | |
43 | /// Preferences are stored in a HashMap. It maps the name of the pref (a String) to its value (stored as YAML string/float) | |
44 | pub type PreferenceHashMap = HashMap<String, Yaml>; | |
45 | #[derive(Debug, Clone, Default)] | |
46 | pub struct Preferences { | |
47 | prefs: PreferenceHashMap // FIX: pub so can get at iterator, should add iterator to Preferences instead | |
48 | } | |
49 | ||
50 | use std::fmt; | |
51 | impl fmt::Display for Preferences { | |
52 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
53 | 0 | let mut pref_vec: Vec<(&String, &Yaml)> = self.prefs.iter().collect(); |
54 | 0 | pref_vec.sort(); |
55 | 0 | for (name, value) in pref_vec { |
56 | 0 | writeln!(f, " {}: {}", name, yaml_to_string(value, 0))?; |
57 | } | |
58 | 0 | return Ok(()); |
59 | 0 | } |
60 | } | |
61 | ||
62 | impl Preferences{ | |
63 | // default values needed in case nothing else gets set | |
64 | 4.14k | fn user_defaults() -> Preferences { |
65 | 4.14k | let mut prefs = PreferenceHashMap::with_capacity(39); |
66 | 4.14k | prefs.insert("Language".to_string(), Yaml::String("en".to_string())); |
67 | 4.14k | prefs.insert("LanguageAuto".to_string(), Yaml::String("".to_string())); // illegal value so change will be recognized |
68 | 4.14k | prefs.insert("SpeechStyle".to_string(), Yaml::String("ClearSpeak".to_string())); |
69 | 4.14k | prefs.insert("Verbosity".to_string(), Yaml::String("Medium".to_string())); |
70 | 4.14k | prefs.insert("SpeechOverrides_CapitalLetters".to_string(), Yaml::String("".to_string())); // important for testing |
71 | 4.14k | prefs.insert("Blind".to_string(), Yaml::Boolean(true)); |
72 | 4.14k | prefs.insert("MathRate".to_string(), Yaml::Real("100.0".to_string())); |
73 | 4.14k | prefs.insert("PauseFactor".to_string(), Yaml::Real("100.0".to_string())); |
74 | 4.14k | prefs.insert("NavMode".to_string(), Yaml::String("Enhanced".to_string())); |
75 | 4.14k | prefs.insert("Overview".to_string(), Yaml::Boolean(false)); |
76 | 4.14k | prefs.insert("ResetOverView".to_string(), Yaml::Boolean(true)); |
77 | 4.14k | prefs.insert("NavVerbosity".to_string(), Yaml::String("Verbose".to_string())); |
78 | 4.14k | prefs.insert("AutoZoomOut".to_string(), Yaml::Boolean(true)); |
79 | 4.14k | prefs.insert("BrailleCode".to_string(), Yaml::String("Nemeth".to_string())); |
80 | 4.14k | prefs.insert("BrailleNavHighlight".to_string(), Yaml::String("EndPoints".to_string())); |
81 | 4.14k | prefs.insert("UEB_START_MODE".to_string(), Yaml::String("Grade2".to_string())); |
82 | 4.14k | prefs.insert("DecimalSeparators".to_string(), Yaml::String(".".to_string())); |
83 | 4.14k | prefs.insert("BlockSeparators".to_string(), Yaml::String(", \u{00A0}\u{202F}".to_string())); |
84 | ||
85 | 4.14k | return Preferences{ prefs }; |
86 | 4.14k | } |
87 | ||
88 | // default values needed in case nothing else gets set | |
89 | 4.14k | fn api_defaults() -> Preferences { |
90 | 4.14k | let mut prefs = PreferenceHashMap::with_capacity(19); |
91 | 4.14k | prefs.insert("TTS".to_string(), Yaml::String("none".to_string())); |
92 | 4.14k | prefs.insert("Pitch".to_string(), Yaml::Real("0.0".to_string())); |
93 | 4.14k | prefs.insert("Rate".to_string(), Yaml::Real("180.0".to_string())); |
94 | 4.14k | prefs.insert("Volume".to_string(), Yaml::Real("100.0".to_string())); |
95 | 4.14k | prefs.insert("Voice".to_string(), Yaml::String("none".to_string())); |
96 | 4.14k | prefs.insert("Gender".to_string(), Yaml::String("none".to_string())); |
97 | 4.14k | prefs.insert("Bookmark".to_string(), Yaml::Boolean(false)); |
98 | 4.14k | prefs.insert("CapitalLetters_UseWord".to_string(), Yaml::Boolean(true)); |
99 | 4.14k | prefs.insert("CapitalLetters_Pitch".to_string(), Yaml::Real("0.0".to_string())); |
100 | 4.14k | prefs.insert("CapitalLetters_Beep".to_string(), Yaml::Boolean(false)); |
101 | 4.14k | prefs.insert("IntentErrorRecovery".to_string(), Yaml::String("IgnoreIntent".to_string())); // also Error |
102 | 4.14k | prefs.insert("CheckRuleFiles".to_string(), Yaml::String( |
103 | 4.14k | (if cfg!(target_family = "wasm") { |
104 | 4.14k | return Preferences{ prefs }; |
105 | 4.14k | } |
106 | ||
107 | 4.14k | fn read_prefs_file(file: &Path, mut base_prefs: Preferences) -> Result<Preferences> { |
108 | 4.14k | let file_name = file.to_str().unwrap(); |
109 | let docs; | |
110 | 4.14k | match read_to_string_shim(file) { |
111 | 0 | Err(e) => { |
112 | 0 | bail!("Couldn't read file {}\n{}", file_name, e); |
113 | }, | |
114 | 4.14k | Ok( file_contents) => { |
115 | 4.14k | match YamlLoader::load_from_str(&file_contents) { |
116 | 0 | Err(e) => { |
117 | 0 | bail!("Yaml parse error ('{}') in preference file {}.", e, file_name); |
118 | }, | |
119 | 4.14k | Ok(d) => docs = d, |
120 | } | |
121 | ||
122 | } | |
123 | } | |
124 | 4.14k | if docs.len() != 1 { |
125 | 0 | bail!("MathCAT: error in prefs file '{}'.\nFound {} 'documents' -- should only be 1.", file_name, docs.len()); |
126 | 4.14k | } |
127 | ||
128 | 4.14k | let doc = &docs[0]; |
129 | 4.14k | if cfg!(debug_assertions) { |
130 | 4.14k | verify_keys(doc, "Speech", file_name) |
131 | 4.14k | verify_keys(doc, "Navigation", file_name) |
132 | 4.14k | verify_keys(doc, "Braille", file_name) |
133 | 4.14k | verify_keys(doc, "Other", file_name) |
134 | 0 | } |
135 | ||
136 | 4.14k | let prefs = &mut base_prefs.prefs; |
137 | 4.14k | add_prefs(prefs, &doc["Speech"], "", file_name); |
138 | 4.14k | add_prefs(prefs, &doc["Navigation"], "", file_name); |
139 | 4.14k | add_prefs(prefs, &doc["Braille"], "", file_name); |
140 | 4.14k | add_prefs(prefs, &doc["Other"], "", file_name); |
141 | 4.14k | return Ok( Preferences{ prefs: prefs.to_owned() } ); |
142 | ||
143 | ||
144 | ||
145 | 16.5k | fn verify_keys(dict: &Yaml, key: &str, file_name: &str) -> Result<()> { |
146 | 16.5k | let prefs = &dict[key]; |
147 | 16.5k | if prefs.is_badvalue() { |
148 | 0 | bail!("Yaml error in file {}.\nDidn't find '{}' key.", file_name, key); |
149 | 16.5k | } |
150 | 16.5k | if prefs.as_hash().is_none() { |
151 | 0 | bail!("Yaml error in file {}.\n'{}' key is not a dictionary. Value found is {}.", |
152 | 0 | file_name, key, yaml_to_string(dict, 1)); |
153 | 16.5k | } |
154 | 16.5k | return Ok(()); |
155 | 16.5k | } |
156 | ||
157 | 41.4k | fn add_prefs(map: &mut PreferenceHashMap, new_prefs: &Yaml, name_prefix: &str, file_name: &str) { |
158 | 41.4k | if new_prefs.is_badvalue() || new_prefs.is_null() || new_prefs.as_hash().is_none() { |
159 | 0 | return; |
160 | 41.4k | } |
161 | 41.4k | let new_prefs = new_prefs.as_hash().unwrap(); |
162 | 302k | for (yaml_name, yaml_value) in |
163 | 302k | let name = as_str_checked(yaml_name); |
164 | 302k | if let Err( |
165 | 0 | error!("{}", e.context( |
166 | 0 | format!("name '{}' is not a string in file {}", yaml_to_string(yaml_name, 0), file_name))); |
167 | } else { | |
168 | 302k | match yaml_value { |
169 | 24.8k | Yaml::Hash(_) => add_prefs(map, yaml_value, &(name.unwrap().to_string() + "_"), file_name), |
170 | 0 | Yaml::Array(_) => error!("name '{}' has illegal array value {} in file '{}'", |
171 | 0 | yaml_to_string(yaml_name, 0), yaml_to_string(yaml_value, 0), file_name), |
172 | Yaml::String(_) | Yaml::Boolean(_) | Yaml::Integer(_) | Yaml::Real(_) => { | |
173 | 277k | let trimmed_name = name_prefix.to_string() + name.unwrap().trim(); |
174 | 277k | let mut yaml_value = yaml_value.to_owned(); |
175 | 277k | if let Some( |
176 | 236k | yaml_value = Yaml::String(value.to_string()); |
177 | 236k |
|
178 | 277k | map.insert(trimmed_name, yaml_value); |
179 | }, | |
180 | 0 | _ => error!("name '{}' has illegal {:#?} value {} in file '{}'", |
181 | 0 | yaml_to_string(yaml_name, 0), yaml_value, yaml_to_string(yaml_value, 0), file_name), |
182 | } | |
183 | } | |
184 | } | |
185 | 41.4k | } |
186 | 4.14k | } |
187 | ||
188 | #[allow(dead_code)] // used in testing | |
189 | 0 | fn set_string_value(&mut self, name: &str, value: &str) { |
190 | 0 | self.prefs.insert(name.to_string(), Yaml::String(value.trim().to_string())); |
191 | 0 | } |
192 | ||
193 | #[allow(dead_code)] // used in testing | |
194 | 0 | fn set_bool_value(&mut self, name: &str, value: bool) { |
195 | 0 | self.prefs.insert(name.to_string(), Yaml::Boolean(value)); |
196 | 0 | } |
197 | } | |
198 | ||
199 | ||
200 | thread_local!{ | |
201 | static DEFAULT_USER_PREFERENCES: Preferences = Preferences::user_defaults(); | |
202 | static DEFAULT_API_PREFERENCES: Preferences = Preferences::api_defaults(); | |
203 | static PREF_MANAGER: Rc<RefCell<PreferenceManager>> = | |
204 | Rc::new( RefCell::new( PreferenceManager::default() ) ); | |
205 | ||
206 | } | |
207 | ||
208 | /// PreferenceManager keeps track of user and api prefs along with current files | |
209 | /// | |
210 | /// If one of the `FileAndTime` files changes while the program is running, the values will auto-update | |
211 | /// Among other things, that means that a UI that changes a user pref will be reflected the next time someone gets speech, braille, etc. | |
212 | // | |
213 | // Note: I experimented with PREF_MANAGER being a Result<PreferenceManager> in the case of no rule files, | |
214 | // but it ended up being a mess (lots of unwrapping). Having a field is much cleaner. | |
215 | // Also note that if 'error' is not an empty string, SpeechRules can't work so using those requires a check. | |
216 | #[derive(Debug, Default)] | |
217 | pub struct PreferenceManager { | |
218 | rules_dir: PathBuf, // full path to rules dir | |
219 | error: String, // empty/default string if fields are set, otherwise error message | |
220 | user_prefs: Preferences, // prefs that come from reading prefs.yaml (system and user locations) | |
221 | api_prefs: Preferences, // prefs set by API calls (along with some defaults not in the user settings such as "pitch") | |
222 | sys_prefs_file: Option<FileAndTime>, // the system prefs.yaml file | |
223 | user_prefs_file: Option<FileAndTime>, // the user prefs.yaml file | |
224 | intent: PathBuf, // the intent rule style file | |
225 | speech: PathBuf, // the speech rule style file | |
226 | overview: PathBuf, // the overview rule file | |
227 | navigation: PathBuf, // the navigation rule file | |
228 | speech_unicode: PathBuf, // short unicode.yaml file | |
229 | speech_unicode_full: PathBuf, // full unicode.yaml file | |
230 | speech_defs: PathBuf, // the definition.yaml file | |
231 | braille: PathBuf, // the braille rule file | |
232 | braille_unicode: PathBuf, // short braille unicode file | |
233 | braille_unicode_full: PathBuf, // full braille unicode file | |
234 | braille_defs: PathBuf, // the definition.yaml file | |
235 | } | |
236 | ||
237 | ||
238 | impl fmt::Display for PreferenceManager { | |
239 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
240 | 0 | writeln!(f, "PreferenceManager:")?; |
241 | 0 | if self.error.is_empty() { |
242 | 0 | writeln!(f, " not initialized!!! Error is {}", &self.error)?; |
243 | } else { | |
244 | 0 | writeln!(f, " user prefs:\n{}", self.user_prefs)?; |
245 | 0 | writeln!(f, " api prefs:\n{}", self.api_prefs)?; |
246 | 0 | writeln!(f, " style files: {:?}", self.speech.as_path())?; |
247 | 0 | writeln!(f, " unicode files: {:?}", self.speech_unicode.as_path())?; |
248 | 0 | writeln!(f, " intent files: {:?}", self.intent.as_path())?; |
249 | 0 | writeln!(f, " speech definition files: {:?}", self.speech_defs)?; |
250 | 0 | writeln!(f, " braille definition files: {:?}", self.braille_defs)?; |
251 | } | |
252 | 0 | return Ok(()); |
253 | 0 | } |
254 | } | |
255 | ||
256 | impl PreferenceManager { | |
257 | /// Initialize (the) PreferenceManager (a global var). | |
258 | /// 'rules_dir' is the path to "Rules" unless the env var MathCATRulesDir is set | |
259 | /// | |
260 | /// If rules_dir is an empty PathBuf, the existing rules_dir is used (an error if it doesn't exist) | |
261 | 5.09k | pub fn initialize(&mut self, rules_dir: PathBuf) -> Result<()> { |
262 | #[cfg(not(feature = "include-zip"))] | |
263 | 5.09k | let |
264 | 1 | Err(e) => bail!("set_rules_dir: could not canonicalize path {}: {}", rules_dir.display(), e), |
265 | 5.09k | Ok(rules_dir) => rules_dir, |
266 | }; | |
267 | ||
268 | 5.09k | self.set_rules_dir(&rules_dir) |
269 | 5.09k | self.set_preference_files() |
270 | 5.09k | self.set_all_files(&rules_dir) |
271 | 5.09k | return Ok( () ); |
272 | ||
273 | 5.09k | } |
274 | ||
275 | 60.3k | pub fn get() -> Rc<RefCell<PreferenceManager>> { |
276 | 60.3k | return PREF_MANAGER.with( |pm| pm.clone() ); |
277 | 60.3k | } |
278 | ||
279 | 0 | pub fn get_error(&self) -> &str { |
280 | 0 | return &self.error; |
281 | 0 | } |
282 | ||
283 | /// Return a `PreferenceHashMap` that is the merger of the api prefs onto the user prefs. | |
284 | 22.7k | pub fn merge_prefs(&self) -> PreferenceHashMap { |
285 | 22.7k | let mut merged_prefs = self.user_prefs.prefs.clone(); |
286 | 22.7k | merged_prefs.extend(self.api_prefs.prefs.clone()); |
287 | 22.7k | return merged_prefs; |
288 | 22.7k | } |
289 | ||
290 | /// Set the rules dir and return failure if it is a bad directory (non-existent, can't find all files, ...) | |
291 | 5.09k | fn set_rules_dir(&mut self, rules_dir: &Path) -> Result<()> { |
292 | // Fix: should make sure all files exists -- fail if not true | |
293 | 5.09k | if !is_dir_shim(rules_dir) { |
294 | 0 | bail!("Unable to find MathCAT Rules directory '{}'", rules_dir.to_string_lossy()) |
295 | 5.09k | } |
296 | 5.09k | self.rules_dir = rules_dir.to_path_buf(); |
297 | 5.09k | return Ok( () ); |
298 | 5.09k | } |
299 | ||
300 | /// Set the rules dir and return failure if it is a bad directory (non-existent, can't find all files, ...) | |
301 | 1 | pub fn get_rules_dir(&self) -> PathBuf { |
302 | // Fix: should make sure rules_dir is set -- fail if not true | |
303 | 1 | return self.rules_dir.clone(); |
304 | 1 | } |
305 | ||
306 | /// Read the preferences from the files (if not up to date) and set the preferences and preference files | |
307 | /// Returns failure if the files don't exist or have errors | |
308 | 20.3k | pub fn set_preference_files(&mut self) -> Result<()> { |
309 | // first, read in the preferences -- need to determine which files to read next | |
310 | // the prefs files are in the rules dir and the user dir; differs from other files | |
311 | 20.3k | if self.api_prefs.prefs.is_empty() { |
312 | 4.14k | self.api_prefs = Preferences{ prefs: DEFAULT_API_PREFERENCES.with(|defaults| defaults.prefs.clone()) }; |
313 | 16.2k | } |
314 | ||
315 | 20.3k | let should_update_system_prefs = self.sys_prefs_file.is_none() || |
316 | 20.3k | let should_update_user_prefs = self.user_prefs_file.is_none() || |
317 | 20.3k | if !(should_update_system_prefs || |
318 | 16.2k | return Ok( () ); // no need to do anything else |
319 | 4.14k | } |
320 | ||
321 | 4.14k | let mut prefs = Preferences::default(); |
322 | ||
323 | 4.14k | let mut system_prefs_file = self.rules_dir.to_path_buf(); |
324 | 4.14k | system_prefs_file.push("prefs.yaml"); |
325 | 4.14k | if is_file_shim(&system_prefs_file) { |
326 | 4.14k | let defaults = DEFAULT_USER_PREFERENCES.with(|defaults| defaults.clone()); |
327 | 4.14k | prefs = Preferences::read_prefs_file(&system_prefs_file, defaults) |
328 | 4.14k | self.sys_prefs_file = Some( FileAndTime::new_with_time(system_prefs_file.clone()) ); |
329 | } else { | |
330 | 0 | error!("MathCAT couldn't open file system preference file '{}'.\nUsing fallback defaults which may be inappropriate.", |
331 | 0 | system_prefs_file.to_str().unwrap()); |
332 | }; | |
333 | ||
334 | 4.14k | let mut user_prefs_file = dirs::config_dir(); |
335 | 4.14k | if let Some(mut user_prefs_file_path_buf) = user_prefs_file { |
336 | 4.14k | user_prefs_file_path_buf.push("MathCAT/prefs.yaml"); |
337 | 4.14k | if is_file_shim(&user_prefs_file_path_buf) { |
338 | 0 | prefs = Preferences::read_prefs_file(&user_prefs_file_path_buf, prefs)?; |
339 | 4.14k | } |
340 | // set the time otherwise keeps needing to do updates | |
341 | 4.14k | self.user_prefs_file = Some( FileAndTime::new_with_time(user_prefs_file_path_buf.clone()) ); |
342 | 4.14k | user_prefs_file = Some(user_prefs_file_path_buf); |
343 | 0 | } |
344 | ||
345 | 4.14k | if prefs.prefs.is_empty() { |
346 | 0 | let user_prefs_file_name = match user_prefs_file { |
347 | 0 | None => "No user config directory".to_string(), |
348 | 0 | Some(file) => file.to_string_lossy().to_string(), |
349 | }; | |
350 | 0 | bail!("Didn't find preferences in rule directory ('{}') or user directory ('{}')", &system_prefs_file.to_string_lossy(), user_prefs_file_name); |
351 | 4.14k | } |
352 | 4.14k | self.set_files_based_on_changes(&prefs) |
353 | 4.14k | self.user_prefs = prefs; |
354 | ||
355 | // set computed values for BLOCK_SEPARATORS and DECIMAL_SEPARATORS (a little messy about the language due immutable and mutable borrows) | |
356 | 4.14k | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
357 | 4.14k | let language = language.as_str().unwrap(); |
358 | 4.14k | self.set_separators(language) |
359 | ||
360 | 4.14k | return Ok( () ); |
361 | 20.3k | } |
362 | ||
363 | 5.09k | fn set_all_files(&mut self, rules_dir: &Path) -> Result<()> { |
364 | // try to find ./Rules/lang/style.yaml and ./Rules/lang/style.yaml | |
365 | // we go through a series of fallbacks -- we try to maintain the language if possible | |
366 | ||
367 | 5.09k | let language = self.pref_to_string("Language"); |
368 | 5.09k | let language = if language.as_str() == "Auto" { |
369 | 5.09k | let language_dir = rules_dir.to_path_buf().join("Languages"); |
370 | 5.09k | self.set_speech_files(&language_dir, language, None) |
371 | ||
372 | 5.09k | let braille_code = self.pref_to_string("BrailleCode"); |
373 | 5.09k | let braille_dir = rules_dir.to_path_buf().join("Braille"); |
374 | 5.09k | self.set_braille_files(&braille_dir, &braille_code) |
375 | 5.09k | return Ok(()); |
376 | 5.09k | } |
377 | ||
378 | 9.19k | fn set_speech_files(&mut self, language_dir: &Path, language: &str, new_speech_style: Option<&str>) -> Result<()> { |
379 | 9.19k | PreferenceManager::unzip_files(language_dir, language, Some("en")) |
380 | 9.19k | self.intent = PreferenceManager::find_file(language_dir, language, Some("en"), "intent.yaml") |
381 | 9.19k | self.overview = PreferenceManager::find_file(language_dir, language, Some("en"), "overview.yaml") |
382 | 9.19k | self.navigation = PreferenceManager::find_file(language_dir, language, Some("en"), "navigate.yaml") |
383 | ||
384 | 9.19k | self.speech_unicode = PreferenceManager::find_file(language_dir, language, Some("en"), "unicode.yaml") |
385 | 9.19k | self.speech_unicode_full = PreferenceManager::find_file(language_dir, language, Some("en"), "unicode-full.yaml") |
386 | ||
387 | 9.19k | self.speech_defs = PreferenceManager::find_file(language_dir, language, Some("en"), "definitions.yaml") |
388 | ||
389 | 9.19k | match new_speech_style { |
390 | 0 | Some(style_name) => self.set_style_file(language_dir, language, style_name)?, |
391 | // use the old style name if one isn't given | |
392 | 9.19k | None => self.set_style_file(language_dir, language, &self.pref_to_string("SpeechStyle")) |
393 | } | |
394 | 9.19k | return Ok( () ); |
395 | 9.19k | } |
396 | ||
397 | 10.7k | fn set_style_file(&mut self, language_dir: &Path, language: &str, style_file_name: &str) -> Result<()> { |
398 | 10.7k | let style_file_name = style_file_name.to_string() + "_Rules.yaml"; |
399 | 10.7k | self.speech = PreferenceManager::find_file(language_dir, language, Some("en"), &style_file_name) |
400 | // debug!("set_style_file: language_dir: {}, language: {}, style_file_name: {}, self.speech: {}", | |
401 | // language_dir.display(), language, style_file_name, self.speech.display()); | |
402 | 10.7k | return Ok( () ); |
403 | 10.7k | } |
404 | ||
405 | 5.69k | fn set_braille_files(&mut self, braille_rules_dir: &Path, braille_code_name: &str) -> Result<()> { |
406 | // Fix: Currently the braille code and the directory it lives in have to have the same name | |
407 | 5.69k | PreferenceManager::unzip_files(braille_rules_dir, braille_code_name, Some("UEB")) |
408 | ||
409 | 5.69k | let braille_file = braille_code_name.to_string() + "_Rules.yaml"; |
410 | ||
411 | 5.69k | self.braille = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), &(braille_file)) |
412 | ||
413 | 5.69k | self.braille_unicode = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "unicode.yaml") |
414 | 5.69k | self.braille_unicode_full = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "unicode-full.yaml") |
415 | ||
416 | 5.69k | self.braille_defs = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "definitions.yaml") |
417 | 5.69k | return Ok( () ); |
418 | 5.69k | } |
419 | ||
420 | /// If some preferences have changed, we may need to recompute other ones | |
421 | /// The key prefs are Language, SpeechStyle, and BrailleCode, along with DecimalSeparator | |
422 | 4.14k | fn set_files_based_on_changes(&mut self, new_prefs: &Preferences) -> Result<()> { |
423 | 4.14k | let old_language = self.user_prefs.prefs.get("Language"); // not set if first time |
424 | 4.14k | if old_language.is_none() { |
425 | 4.14k | return Ok( () ); // if "Language" isn't set yet, nothing else is either -- first time through, so no updating needed. |
426 | 0 | } |
427 | ||
428 | 0 | let old_language = old_language.unwrap(); |
429 | 0 | let new_language = new_prefs.prefs.get("Language").unwrap(); |
430 | 0 | debug!("set_files_based_on_changes: old_language={old_language:?}, new_language={new_language:?}"); |
431 | 0 | if old_language != new_language { |
432 | 0 | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
433 | 0 | self.set_speech_files(&language_dir, new_language.as_str().unwrap(), None)?; // also sets style file |
434 | } else { | |
435 | 0 | let old_speech_style = self.user_prefs.prefs.get("SpeechStyle").unwrap(); |
436 | 0 | let new_speech_style = new_prefs.prefs.get("SpeechStyle").unwrap(); |
437 | 0 | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
438 | 0 | if old_speech_style != new_speech_style { |
439 | 0 | self.set_speech_files(&language_dir, new_language.as_str().unwrap(), new_speech_style.as_str())?; |
440 | 0 | } |
441 | } | |
442 | ||
443 | 0 | let old_braille_code = self.user_prefs.prefs.get("BrailleCode").unwrap(); |
444 | 0 | let new_braille_code = new_prefs.prefs.get("BrailleCode").unwrap(); |
445 | 0 | if old_braille_code != new_braille_code { |
446 | 0 | let braille_code_dir = self.rules_dir.to_path_buf().join("Braille"); |
447 | 0 | self.set_braille_files(&braille_code_dir, new_braille_code.as_str().unwrap())?; // also sets style file |
448 | 0 | } |
449 | ||
450 | 0 | return Ok( () ); |
451 | 4.14k | } |
452 | ||
453 | /// Unzip the files if needed | |
454 | /// Returns true if it unzipped them | |
455 | 41.3k | pub fn unzip_files(path: &Path, lang: &str, default_lang: Option<&str>) -> Result<bool> { |
456 | thread_local!{ | |
457 | /// when a language/braille code dir is unzipped, it is recorded here | |
458 | static UNZIPPED_FILES: RefCell<HashSet<String>> = RefCell::new( HashSet::with_capacity(31)); | |
459 | } | |
460 | // ignore regional subdirs | |
461 | 41.3k | let dir = PreferenceManager::get_language_dir(path, lang, default_lang) |
462 | 41.3k | let language = if dir.ends_with(lang) { |
463 | 41.3k | let zip_file_name = language.to_string() + ".zip"; |
464 | 41.3k | let zip_file_path = dir.join(&zip_file_name); |
465 | 41.3k | let zip_file_string = zip_file_path.to_string_lossy().to_string(); |
466 | // debug!("unzip_files: dir: {}, zip_file_name: {}, zip_file_path: {}", dir.display(), zip_file_name, zip_file_string); | |
467 | 41.3k | if UNZIPPED_FILES.with( |unzipped_files| unzipped_files.borrow().contains(&zip_file_string)) { |
468 | 28.6k | return Ok(false); |
469 | 12.6k | } |
470 | ||
471 | 12.6k | let |
472 | 1.44k | Err(e) => { |
473 | 1.44k | if lang.contains('-') { |
474 | // try again in parent dir of regional language | |
475 | 0 | let language = lang.split_once('-').unwrap_or((lang, "")).0; // get the parent language |
476 | // debug!("unzip_files: trying again in parent language: {}", language); | |
477 | 0 | PreferenceManager::unzip_files(path, language, default_lang) |
478 | 0 | .with_context(|| format!("Couldn't open zip file {zip_file_string} in parent {language}: {e}."))? |
479 | } else { | |
480 | // maybe just regional dialects | |
481 | 1.44k | let mut regional_dirs = Vec::new(); |
482 | 1.44k | find_all_dirs_shim(&dir, &mut regional_dirs); |
483 | 1.44k | for dir in regional_dirs { |
484 | // debug!("unzip_files: trying again in subdir: {}", dir.display()); | |
485 | 1.44k | let language = format!("{}-{}", lang, dir.file_name().unwrap().to_str().unwrap()); |
486 | 1.44k | if let Ok(result) =PreferenceManager::unzip_files(path, &language, default_lang) { |
487 | 1.44k | return Ok(result); |
488 | 0 | } |
489 | } | |
490 | 0 | bail!("Couldn't open zip file {}: {}.", zip_file_string, e) |
491 | } | |
492 | }, | |
493 | 11.2k | Ok(result) => { |
494 | 11.2k | result |
495 | }, | |
496 | }; | |
497 | ||
498 | 11.2k | UNZIPPED_FILES.with( |unzipped_files| unzipped_files.borrow_mut().insert(zip_file_string.clone()) ); |
499 | // debug!(" unzip_files: unzipped {} files from {}", result, &zip_file_string); | |
500 | // UNZIPPED_FILES.with( |unzipped_files| { | |
501 | // debug!("unzip_files: unzipped_files: {:?}", unzipped_files.borrow()); | |
502 | // }); | |
503 | ||
504 | 11.2k | return Ok(result); |
505 | 41.3k | } |
506 | ||
507 | /// Set BlockSeparators and DecimalSeparators | |
508 | /// FIX: changing these values could change the parse, so we really should reparse the original expr, but that doesn't exist anymore (store the original string???) | |
509 | /// | |
510 | /// Note: DecimalSeparator is user-facing (can be Auto), DecimalSeparators is code-facing (always a char) | |
511 | 8.24k | fn set_separators(&mut self, language_country: &str) -> Result<()> { |
512 | // This list was generated from https://en.wikipedia.org/wiki/Decimal_separator#Countries_using_decimal_point | |
513 | // The countries were then mapped to language(s) using https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory | |
514 | // When a language was used in other countries that used a "," separator, the language+country is listed | |
515 | // Sometimes there are multiple languages used in a country -- they are all listed, sometimes with a country code | |
516 | // The country code isn't used when the language is used in smaller countries (i.e, when "." is more likely correct) | |
517 | // This decision is sometimes a bit arbitrary | |
518 | // For example, Swahili (sw) is used in: Democratic Republic of the Congo, Kenya, Rwanda, Tanzania, and Uganda. | |
519 | // Of these, Kenya, Tanzania, and Uganda are listed as using "." and I include Swahili in the list below. | |
520 | static USE_DECIMAL_SEPARATOR: phf::Set<&str> = phf_set! { | |
521 | "en", "bn", "km", "el-cy", "tr-cy", "zh", "es-do", "ar", "es-sv", "es-gt", "es-hn", "hi", "as", "gu", "kn", "ks", | |
522 | "ml", "mr", "ne", "or", "pa", "sa", "sd", "ta", "te", "ur", "he", "ja", "sw", "ko", "de-li", "ms", "dv", "mt", "es-mx", "my", | |
523 | "af-na", "es-ni", "es-pa", "fil", "ms-sg", "si", "th", | |
524 | "es-419", // latin america | |
525 | }; | |
526 | ||
527 | 8.24k | let decimal_separator = self.pref_to_string("DecimalSeparator"); |
528 | 8.24k | if !["Auto", ",", "."].contains(&decimal_separator.as_str()) { |
529 | 2 | return Ok( () ); |
530 | 8.24k | } |
531 | ||
532 | 8.24k | if language_country == "Auto" && |
533 | 4.14k | return Ok( () ); // "Auto" doesn't tell us anything -- we will get called again when Language is set |
534 | 4.10k | } |
535 | ||
536 | 4.10k | let language_country = language_country.to_ascii_lowercase(); |
537 | 4.10k | let language_country = &language_country; |
538 | 4.10k | let mut lang_country_split = language_country.split('-'); |
539 | 4.10k | let language = lang_country_split.next().unwrap_or(""); |
540 | 4.10k | let country = lang_country_split.next().unwrap_or(""); |
541 | 4.10k | let mut use_period = decimal_separator == "."; |
542 | 4.10k | if decimal_separator == "Auto" { |
543 | // if we don't have a match for the lang-country, then just try lang | |
544 | 4.09k | use_period = USE_DECIMAL_SEPARATOR.contains(language_country) || |
545 | 2 | } |
546 | // debug!("set_separators: use_period: {}", use_period); | |
547 | 4.10k | self.user_prefs.prefs.insert("DecimalSeparators".to_string(), Yaml::String((if use_period { |
548 | 4.10k | let mut block_separators = (if use_period { |
549 | 4.10k | if country == "ch" || country == "li" { // Switzerland and Liechtenstein also use ` as a block separator, at least in some cases |
550 | 0 | block_separators.push('\''); |
551 | 4.10k | } |
552 | 4.10k | self.user_prefs.prefs.insert("BlockSeparators".to_string(), Yaml::String(block_separators)); |
553 | 4.10k | return Ok( () ); |
554 | 8.24k | } |
555 | ||
556 | ||
557 | /// Find a file matching `file_name` by starting in the regional directory and looking to the language. | |
558 | /// If that fails, fall back to looking for the default repeating the same process -- something needs to be found or MathCAT crashes | |
559 | 88.6k | fn find_file(rules_dir: &Path, lang: &str, default_lang: Option<&str>, file_name: &str) -> Result<PathBuf> { |
560 | // rules_dir: is the root of the search | |
561 | // to that we add the language dir(s) | |
562 | // if file_name doesn't exist in the language dir(s), we try to find it in the default dir | |
563 | // the exception to this is if it ends with _Rules.yaml, we look for other _Rules.yaml files | |
564 | // returns the location of the file_name found | |
565 | ||
566 | // start by trying to find a dir that exists | |
567 | 88.6k | let lang_dir = PreferenceManager::get_language_dir(rules_dir, lang, default_lang) |
568 | // now find the file name in the dirs | |
569 | // we start with the deepest dir and walk back to towards Rules | |
570 | 88.6k | let mut alternative_style_file = None; // back up in case we don't find the target style in lang_dir |
571 | 88.6k | let looking_for_style_file = file_name.ends_with("_Rules.yaml"); |
572 | 108k | for os_path in |
573 | 108k | let path = PathBuf::from(os_path).join(file_name); |
574 | // debug!("find_file: checking file: {}", path.to_string_lossy()); | |
575 | 108k | if is_file_shim(&path) { |
576 | // we make an exception for definitions.yaml -- there a language specific checks for Hundreds, etc | |
577 | 88.4k | if !(file_name == "definitions.yaml" && |
578 | // debug!("find_file -- found={}", path.to_string_lossy()); | |
579 | 88.4k | return Ok(path); |
580 | 2 | } |
581 | 19.7k | }; |
582 | 19.7k | if looking_for_style_file && |
583 | 257 | let Ok( |
584 | 249 | // debug!("find_file: found alternative style file '{}'", alt_file_path.display()); |
585 | 249 | alternative_style_file = Some(alt_file_path); |
586 | 19.4k | } |
587 | 19.7k | if os_path.ends_with("Rules") { |
588 | // at root of Rules directory | |
589 | 256 | break; |
590 | 19.4k | } |
591 | } | |
592 | ||
593 | ||
594 | 256 | if let Some( |
595 | // debug!("find_file: found alternative_style_file '{}'", result.to_string_lossy()); | |
596 | 248 | return Ok(result); // found an alternative style file in the same lang dir |
597 | 8 | } |
598 | ||
599 | // try a subdir (regional dialect) of the language dir | |
600 | 8 | let mut regional_dirs = Vec::new(); |
601 | 8 | find_all_dirs_shim(&lang_dir, &mut regional_dirs); |
602 | 8 | for dir in regional_dirs { |
603 | // debug!("find_file: trying again in subdir: {}", dir.display()); | |
604 | // debug!(" ... files found = {:?}", find_files_in_dir_that_ends_with_shim(&dir, file_name)); | |
605 | 8 | if find_files_in_dir_that_ends_with_shim(&dir, ".yaml").contains(&file_name.to_string()) { |
606 | 0 | let path = dir.join(file_name); |
607 | 0 | if is_file_shim(&path) { |
608 | 0 | return Ok(path); |
609 | 0 | } |
610 | 8 | } |
611 | } | |
612 | ||
613 | 8 | if let Some(default_lang) = default_lang { |
614 | // try again with the default language (we're likely in trouble) | |
615 | 8 | return PreferenceManager::find_file(rules_dir, default_lang, None, file_name); |
616 | 0 | } |
617 | ||
618 | // We are done for -- MathCAT can't do anything without the required files! | |
619 | 0 | bail!("Wasn't able to find/read MathCAT required file in directory: {}\n\ |
620 | Initially looked in there for language specific directory: {}\n\ | |
621 | Looking for file: {}", | |
622 | 0 | rules_dir.to_str().unwrap(), lang, file_name); |
623 | ||
624 | ||
625 | /// try to find a xxx_Rules.yaml file -- returns an error if none is found () | |
626 | 257 | fn find_any_style_file(path: &Path) -> Result<PathBuf> { |
627 | // try to find a xxx_Rules.yaml file | |
628 | // we find the first file because this is the deepest (most language specific) speech rule file | |
629 | 257 | let rule_files = find_files_in_dir_that_ends_with_shim(path, "_Rules.yaml"); |
630 | 257 | if rule_files.is_empty() { |
631 | 8 | bail!{"didn't find file"}; |
632 | } else { | |
633 | 249 | return Ok( path.join(rule_files[0].clone()) ); |
634 | } | |
635 | 257 | } |
636 | 88.6k | } |
637 | ||
638 | 129k | fn get_language_dir(rules_dir: &Path, lang: &str, default_lang: Option<&str>) -> Result<PathBuf> { |
639 | // return 'Rules/Language/fr', 'Rules/Language/en/gb', etc, if they exist. | |
640 | // fall back to main language, and then to default_dir if language dir doesn't exist | |
641 | 129k | let mut full_path = rules_dir.to_path_buf(); |
642 | 129k | full_path.push(lang.replace('-', std::path::MAIN_SEPARATOR_STR)); |
643 | 130k | for parent in |
644 | 130k | if parent == rules_dir { |
645 | 0 | break; |
646 | 130k | } else if is_dir_shim(parent) { |
647 | 129k | return Ok(parent.to_path_buf()); |
648 | 24 | } |
649 | } | |
650 | ||
651 | // didn't find the language -- try again with the default language | |
652 | 0 | match default_lang { |
653 | 0 | Some(default_lang) => { |
654 | 0 | warn!("Couldn't find rules for language {lang}, "); |
655 | 0 | return PreferenceManager::get_language_dir(rules_dir, default_lang, None); |
656 | }, | |
657 | None => { | |
658 | // We are done for -- MathCAT can't do anything without the required files! | |
659 | 0 | bail!("Wasn't able to find/read directory for language {}\n |
660 | Wasn't able to find/read MathCAT default language directory: {}", | |
661 | 0 | lang, rules_dir.join(default_lang.unwrap_or("")).as_os_str().to_str().unwrap()); |
662 | } | |
663 | } | |
664 | 129k | } |
665 | ||
666 | ||
667 | /// Return the speech rule style file locations. | |
668 | 15.3k | pub fn get_rule_file(&self, name: &RulesFor) -> &Path { |
669 | 15.3k | if !self.error.is_empty() { |
670 | 0 | panic!("Internal error: get_rule_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
671 | 15.3k | }; |
672 | ||
673 | 15.3k | let files = match name { |
674 | 3.88k | RulesFor::Intent => &self.intent, |
675 | 9.03k | RulesFor::Speech => &self.speech, |
676 | 14 | RulesFor::OverView => &self.overview, |
677 | 549 | RulesFor::Navigation => &self.navigation, |
678 | 1.83k | RulesFor::Braille => &self.braille, |
679 | }; | |
680 | 15.3k | return files.as_path(); |
681 | 15.3k | } |
682 | ||
683 | /// Return the unicode.yaml file locations. | |
684 | 18.9k | pub fn get_speech_unicode_file(&self) ->(&Path, &Path) { |
685 | 18.9k | if !self.error.is_empty() { |
686 | 0 | panic!("Internal error: get_speech_unicode_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
687 | 18.9k | }; |
688 | 18.9k | return (self.speech_unicode.as_path(), self.speech_unicode_full.as_path()); |
689 | 18.9k | } |
690 | ||
691 | /// Return the unicode.yaml file locations. | |
692 | 3.92k | pub fn get_braille_unicode_file(&self) -> (&Path, &Path) { |
693 | 3.92k | if !self.error.is_empty() { |
694 | 0 | panic!("Internal error: get_braille_unicode_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
695 | 3.92k | }; |
696 | ||
697 | 3.92k | return (self.braille_unicode.as_path(), self.braille_unicode_full.as_path()); |
698 | 3.92k | } |
699 | ||
700 | /// Return the definitions.yaml file locations. | |
701 | 15.3k | pub fn get_definitions_file(&self, use_speech_defs: bool) -> &Path { |
702 | 15.3k | if !self.error.is_empty() { |
703 | 0 | panic!("Internal error: get_definitions_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
704 | 15.3k | }; |
705 | ||
706 | 15.3k | let defs_file = if use_speech_defs { |
707 | 15.3k | return defs_file; |
708 | 15.3k | } |
709 | ||
710 | /// Return the TTS engine currently in use. | |
711 | 85.4k | pub fn get_tts(&self) -> TTS { |
712 | 85.4k | if !self.error.is_empty() { |
713 | 0 | panic!("Internal error: get_tts called on invalid PreferenceManager -- error message\n{}", &self.error); |
714 | 85.4k | }; |
715 | ||
716 | 85.4k | return match self.pref_to_string("TTS").as_str().to_ascii_lowercase().as_str() { |
717 | 85.4k | "none" => TTS::None, |
718 | 0 | "ssml" => TTS::SSML, |
719 | 0 | "sapi5" => TTS::SAPI5, |
720 | _ => { | |
721 | 0 | warn!("found unknown value for TTS: '{}'", self.pref_to_string("TTS").as_str()); |
722 | 0 | TTS::None |
723 | } | |
724 | } | |
725 | 85.4k | } |
726 | ||
727 | /// Set the string-valued preference. | |
728 | /// | |
729 | /// Note: changing the language, speech style, or braille code might fail if the files don't exist. | |
730 | /// If this happens, the preference is not set and an error is returned. | |
731 | /// If "LanguageAuto" is set, we assume "Language" has already be checked to be "Auto" | |
732 | 16.2k | pub fn set_string_pref(&mut self, key: &str, value: &str) -> Result<()> { |
733 | 16.2k | if !self.error.is_empty() { |
734 | 0 | panic!("Internal error: set_string_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
735 | 16.2k | }; |
736 | ||
737 | // verify language, braille, and SpeechStyle because these are used as access into the file system | |
738 | // should be an ascii string with only letters, dashes, and underscores | |
739 | 16.2k | if |
740 | 55.9k | ! |
741 | 3 | bail!("{} is an invalid value! Must contains only ascii letters, '_', or'-'", key); |
742 | 16.2k | } |
743 | ||
744 | // don't do an update if the value hasn't changed | |
745 | 16.2k | let mut is_user_pref = true; |
746 | 16.2k | if let Some( |
747 | 61 | if pref_value.as_str().unwrap() != value { |
748 | 59 | is_user_pref = false; |
749 | 59 | self.reset_files_from_preference_change(key, value) |
750 | 2 | } |
751 | 16.1k | } else if let Some(pref_value) = self.user_prefs.prefs.get(key) { |
752 | 16.1k | if pref_value.as_str().unwrap() != value { |
753 | 8.77k | self.reset_files_from_preference_change(key, value) |
754 | 7.40k | } |
755 | } else { | |
756 | 0 | bail!("{} is an unknown MathCAT preference!", key); |
757 | } | |
758 | ||
759 | // debug!("Setting ({}) {} to '{}'", if is_user_pref {"user"} else {"sys"}, key, value); | |
760 | 16.2k | if is_user_pref { |
761 | // a little messy about the DecimalSeparator due immutable and mutable borrows | |
762 | 16.1k | let current_decimal_separator = self.user_prefs.prefs.get("DecimalSeparator").unwrap().clone(); |
763 | 16.1k | let current_decimal_separator = current_decimal_separator.as_str().unwrap(); |
764 | 16.1k | let is_decimal_separators_changed = key == "DecimalSeparator" && |
765 | 16.1k | let is_language_changed = key == "Language" && |
766 | 16.1k | self.user_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
767 | 16.1k | if is_decimal_separators_changed || (current_decimal_separator == "Auto" && is_language_changed) { |
768 | // a little messy about the language due immutable and mutable borrows) | |
769 | 4.08k | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
770 | 4.08k | let language = language.as_str().unwrap(); |
771 | 4.08k | self.set_separators(language) |
772 | 12.0k | } |
773 | 59 | } else { |
774 | 59 | self.api_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
775 | 59 | } |
776 | 16.2k | return Ok( () ); |
777 | 16.2k | } |
778 | ||
779 | 30.2k | fn reset_files_from_preference_change(&mut self, changed_pref: &str, changed_value: &str) -> Result<()> { |
780 | 30.2k | if changed_pref == "Language" && |
781 | // Language must have had a non-Auto value -- set LanguageAuto to old value so (probable) next change to LanguageAuto works well | |
782 | 0 | self.api_prefs.prefs.insert("LanguageAuto".to_string(), |
783 | 0 | self.api_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone() ); |
784 | 0 | return Ok( () ); |
785 | 30.2k | } |
786 | ||
787 | 30.2k | let changed_pref = if changed_pref == "LanguageAuto" { |
788 | 30.2k | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
789 | 30.2k | match changed_pref { |
790 | 30.2k | "Language" => { |
791 | 4.09k | self.set_speech_files(&language_dir, changed_value, None) |
792 | }, | |
793 | 26.1k | "SpeechStyle" => { |
794 | 1.51k | let language = self.pref_to_string("Language"); |
795 | 1.51k | let language = if language.as_str() == "Auto" { |
796 | 1.51k | self.set_style_file(&language_dir, language, changed_value) |
797 | }, | |
798 | 24.6k | "BrailleCode" => { |
799 | 601 | let braille_dir = self.rules_dir.to_path_buf().join("Braille"); |
800 | 601 | self.set_braille_files(&braille_dir, changed_value) |
801 | }, | |
802 | 24.0k | _ => (), |
803 | } | |
804 | 30.2k | return Ok( () ); |
805 | 30.2k | } |
806 | ||
807 | /// Set the number-valued preference. | |
808 | /// All number-valued preferences are stored with type `f64`. | |
809 | 0 | pub fn set_api_float_pref(&mut self, key: &str, value: f64) { |
810 | 0 | if !self.error.is_empty() { |
811 | 0 | panic!("Internal error: set_api_float_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
812 | 0 | }; |
813 | ||
814 | 0 | self.api_prefs.prefs.insert(key.to_string(), Yaml::Real(value.to_string())); |
815 | 0 | } |
816 | ||
817 | 1.50k | pub fn set_api_boolean_pref(&mut self, key: &str, value: bool) { |
818 | 1.50k | if !self.error.is_empty() { |
819 | 0 | panic!("Internal error: set_api_boolean_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
820 | 1.50k | }; |
821 | ||
822 | 1.50k | self.api_prefs.prefs.insert(key.to_string(), Yaml::Boolean(value)); |
823 | 1.50k | } |
824 | ||
825 | /// Return the current speech rate. | |
826 | 0 | pub fn get_rate(&self) -> f64 { |
827 | 0 | if !self.error.is_empty() { |
828 | 0 | panic!("Internal error: get_rate called on invalid PreferenceManager -- error message\n{}", &self.error); |
829 | 0 | }; |
830 | ||
831 | 0 | return match &self.pref_to_string("Rate").parse::<f64>() { |
832 | 0 | Ok(val) => *val, |
833 | Err(_) => { | |
834 | 0 | warn!("Rate ('{}') can't be converted to a floating point number", &self.pref_to_string("Rate")); |
835 | 0 | DEFAULT_API_PREFERENCES.with(|defaults| defaults.prefs["Rate"].as_f64().unwrap()) |
836 | } | |
837 | }; | |
838 | 0 | } |
839 | ||
840 | 0 | pub fn get_api_prefs(&self) -> &Preferences { |
841 | 0 | return &self.api_prefs; |
842 | 0 | } |
843 | ||
844 | /// returns value associated with 'name' or string NO_PREFERENCE | |
845 | /// | |
846 | /// Note: Option/Result not used because most of the time we know the preference exists, so no unwrapping is needed for 95% of calls | |
847 | 268k | pub fn pref_to_string(&self, name: &str) -> String { |
848 | 268k | let mut value = self.api_prefs.prefs.get(name); |
849 | 268k | if value.is_none() { |
850 | 122k | value = self.user_prefs.prefs.get(name); |
851 | 145k | } |
852 | 268k | return match value { |
853 | 11 | None => NO_PREFERENCE.to_string(), |
854 | 268k | Some(v) => match v { |
855 | 236k | Yaml::String(s) => s.clone(), |
856 | 27.3k | Yaml::Boolean(b) => b.to_string(), |
857 | 4.23k | Yaml::Integer(i) => i.to_string(), |
858 | 0 | Yaml::Real(s) => s.clone(), |
859 | 0 | _ => NO_PREFERENCE.to_string(), // shouldn't happen |
860 | } | |
861 | } | |
862 | 268k | } |
863 | ||
864 | // occasionally useful to check a pref value when debugging | |
865 | // fn get_pref(&self, pref_name: &str) -> String { | |
866 | // return yaml_to_string(self.user_prefs.prefs.get(pref_name).unwrap(), 1); | |
867 | // } | |
868 | ||
869 | /// Warning!!! This is meant for testing only -- it overwrites any values from a user pref file and will be overwritten if the file is reread. | |
870 | /// set_preference() is the function that should be called. | |
871 | /// This differs from set_preference in that the user preferences are changed, not the api ones | |
872 | 21.4k | pub fn set_user_prefs(&mut self, key: &str, value: &str) -> Result<()> { |
873 | 21.4k | if !self.error.is_empty() { |
874 | 0 | panic!("Internal error: set_user_prefs called on invalid PreferenceManager -- error message\n{}", &self.error); |
875 | 21.4k | }; |
876 | ||
877 | 21.4k | self.reset_files_from_preference_change(key, value) |
878 | 21.4k | let is_decimal_separators_changed = key == "DecimalSeparator" && |
879 | 21.4k | let is_language_changed = key == "Language" && |
880 | 21.4k | self.user_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
881 | 21.4k | if is_decimal_separators_changed || |
882 | // set computed values for BLOCK_SEPARATORS and DECIMAL_SEPARATORS (a little messy about the language due immutable and mutable borrows) | |
883 | 17 | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
884 | 17 | let language = language.as_str().unwrap(); |
885 | 17 | self.set_separators(language) |
886 | 21.3k | } |
887 | ||
888 | 21.4k | return Ok(()); |
889 | 21.4k | } |
890 | } | |
891 | ||
892 | ||
893 | #[cfg(test)] | |
894 | mod tests { | |
895 | #[allow(unused_imports)] | |
896 | use crate::init_logger; | |
897 | ||
898 | // For these tests, it is assumed that there are Rules subdirs zz and zz/aa dir; there is no zz/ab | |
899 | // definitions.yaml is in Rules, zz, aa dirs | |
900 | // unicode.yaml is in zz | |
901 | // ClearSpeak_Rules.yaml is in zz | |
902 | // These files are NOT in the zipped up version -- hence the config | |
903 | use super::*; | |
904 | ||
905 | /// Version of abs_rules_dir_path that returns a PathBuf | |
906 | 11 | fn abs_rules_dir_path() -> PathBuf { |
907 | 11 | return PathBuf::from(super::super::abs_rules_dir_path()); |
908 | 11 | } |
909 | /// Return a relative path to Rules dir (ie, .../Rules/zz... returns zz/...) | |
910 | /// strip .../Rules from file path | |
911 | 33 | fn rel_path<'a>(rules_dir: &'a Path, path: &'a Path) -> &'a Path { |
912 | 33 | let stripped_path = path.strip_prefix(rules_dir).unwrap(); |
913 | 33 | return stripped_path |
914 | 33 | } |
915 | ||
916 | #[test] | |
917 | 1 | fn separators() { |
918 | 1 | PREF_MANAGER.with(|pref_manager| { |
919 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
920 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
921 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
922 | 1 | pref_manager.set_user_prefs("DecimalSeparator", "Auto").unwrap(); |
923 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
924 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
925 | ||
926 | 1 | pref_manager.set_user_prefs("Language", "sv").unwrap(); |
927 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
928 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
929 | ||
930 | // test potentially ambiguous language (defaults to comma decimal separator) | |
931 | 1 | pref_manager.set_user_prefs("Language", "es").unwrap(); |
932 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
933 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
934 | ||
935 | // test country override | |
936 | 1 | pref_manager.set_user_prefs("Language", "es-mx").unwrap(); |
937 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
938 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
939 | ||
940 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ",").unwrap(); |
941 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
942 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
943 | ||
944 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ".").unwrap(); |
945 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
946 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
947 | ||
948 | // set to illegal value -- should leave values as before | |
949 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ";").unwrap(); |
950 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
951 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
952 | ||
953 | // manual | |
954 | 1 | pref_manager.set_user_prefs("DecimalSeparators", ",").unwrap(); |
955 | 1 | pref_manager.set_user_prefs("BlockSeparators", " ").unwrap(); |
956 | 1 | pref_manager.set_user_prefs("DecimalSeparator", "None").unwrap(); |
957 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
958 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), " "); |
959 | 1 | }); |
960 | 1 | } |
961 | ||
962 | #[test] | |
963 | 1 | fn find_simple_style() { |
964 | 1 | PREF_MANAGER.with(|pref_manager| { |
965 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
966 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
967 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
968 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
969 | 1 | assert_eq!(&pref_manager.pref_to_string("Language"), "en"); |
970 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
971 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
972 | 1 | }); |
973 | 1 | } |
974 | ||
975 | cfg_if::cfg_if! {if #[cfg(not(feature = "include-zip"))] { | |
976 | #[test] | |
977 | 1 | fn find_style_other_language() { |
978 | // zz dir should have both ClearSpeak and SimpleSpeak styles | |
979 | // zz-aa dir should have only ClearSpeak style and unicode.yaml that includes the zz unicode but overrides "+" | |
980 | 1 | PREF_MANAGER.with(|pref_manager| { |
981 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
982 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
983 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
984 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
985 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/en/SimpleSpeak_Rules.yaml")); |
986 | ||
987 | 1 | pref_manager.set_user_prefs("Language", "zz").unwrap(); |
988 | 1 | assert_eq!(&pref_manager.pref_to_string("Language"), "zz"); |
989 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); |
990 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
991 | ||
992 | // make sure language stays the same | |
993 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
994 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
995 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
996 | ||
997 | // make sure language stays the same | |
998 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
999 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); |
1000 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
1001 | 1 | }); |
1002 | 1 | } |
1003 | ||
1004 | #[test] | |
1005 | 1 | fn find_regional_overrides() { |
1006 | // zz dir should have both ClearSpeak and SimpleSpeak styles | |
1007 | // zz-aa dir should have ClearSpeak style and unicode.yaml that includes the zz unicode but overrides "+" | |
1008 | 1 | PREF_MANAGER.with(|pref_manager| { |
1009 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1010 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1011 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1012 | 1 | pref_manager.set_user_prefs("Language", "zz-aa").unwrap(); |
1013 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/aa/ClearSpeak_Rules.yaml")); |
1014 | ||
1015 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
1016 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
1017 | 1 | }); |
1018 | 1 | } |
1019 | ||
1020 | #[test] | |
1021 | 1 | fn find_style_no_sublanguage() { |
1022 | 1 | PREF_MANAGER.with(|pref_manager| { |
1023 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1024 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1025 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1026 | 1 | pref_manager.set_user_prefs("Language", "zz-ab").unwrap(); |
1027 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1028 | 1 | }); |
1029 | 1 | } |
1030 | ||
1031 | #[test] | |
1032 | 1 | fn found_all_files() { |
1033 | 1 | PREF_MANAGER.with(|pref_manager| { |
1034 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1035 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1036 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1037 | 1 | pref_manager.set_user_prefs("Language", "zz-aa").unwrap(); |
1038 | 1 | pref_manager.set_user_prefs("BrailleCode", "UEB").unwrap(); |
1039 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.intent.as_path()), PathBuf::from("intent.yaml")); |
1040 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.overview.as_path()), PathBuf::from("Languages/zz/overview.yaml")); |
1041 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_defs.as_path()), PathBuf::from("Languages/zz/aa/definitions.yaml")); |
1042 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/aa/ClearSpeak_Rules.yaml")); |
1043 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode.as_path()), PathBuf::from("Languages/zz/aa/unicode.yaml")); |
1044 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode_full.as_path()), PathBuf::from("Languages/zz/unicode-full.yaml")); |
1045 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille.as_path()), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1046 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode.as_path()), PathBuf::from("Braille/UEB/unicode.yaml")); |
1047 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode_full.as_path()), PathBuf::from("Braille/UEB/unicode-full.yaml")); |
1048 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_defs.as_path()), PathBuf::from("Braille/UEB/definitions.yaml")); |
1049 | ||
1050 | 1 | pref_manager.set_user_prefs("Language", "zz-ab").unwrap(); |
1051 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.intent.as_path()), PathBuf::from("intent.yaml")); |
1052 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.overview.as_path()), PathBuf::from("Languages/zz/overview.yaml")); |
1053 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_defs.as_path()), PathBuf::from("Languages/zz/definitions.yaml")); |
1054 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1055 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode.as_path()), PathBuf::from("Languages/zz/unicode.yaml")); |
1056 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode_full.as_path()), PathBuf::from("Languages/zz/unicode-full.yaml")); |
1057 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille.as_path()), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1058 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode.as_path()), PathBuf::from("Braille/UEB/unicode.yaml")); |
1059 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode_full.as_path()), PathBuf::from("Braille/UEB/unicode-full.yaml")); |
1060 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_defs.as_path()), PathBuf::from("Braille/UEB/definitions.yaml")); |
1061 | 1 | }) |
1062 | 1 | } |
1063 | ||
1064 | #[test] | |
1065 | 1 | fn test_prefs() { |
1066 | 1 | PREF_MANAGER.with(|pref_manager| { |
1067 | // first test with internal settings | |
1068 | { | |
1069 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1070 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1071 | ||
1072 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
1073 | 1 | pref_manager.set_user_prefs("ClearSpeak_AbsoluteValue", "Determinant").unwrap(); |
1074 | 1 | pref_manager.set_user_prefs("ResetNavMode", "true").unwrap(); |
1075 | 1 | pref_manager.set_user_prefs("BrailleCode", "Nemeth").unwrap(); |
1076 | 1 | assert_eq!(pref_manager.pref_to_string("Language").as_str(), "en"); |
1077 | 1 | assert_eq!(pref_manager.pref_to_string("SubjectArea").as_str(), "General"); |
1078 | 1 | assert_eq!(pref_manager.pref_to_string("ClearSpeak_AbsoluteValue").as_str(), "Determinant"); |
1079 | 1 | assert_eq!(pref_manager.pref_to_string("ResetNavMode").as_str(), "true"); |
1080 | 1 | assert_eq!(pref_manager.pref_to_string("BrailleCode").as_str(), "Nemeth"); |
1081 | 1 | assert_eq!(pref_manager.pref_to_string("X_Y_Z").as_str(), NO_PREFERENCE); |
1082 | } | |
1083 | ||
1084 | // now test with the interface | |
1085 | { | |
1086 | use crate::interface::{set_preference, get_preference}; | |
1087 | 1 | set_preference("Language", "zz").unwrap(); |
1088 | 1 | set_preference("ClearSpeak_AbsoluteValue", "Cardinality").unwrap(); |
1089 | 1 | set_preference("Overview", "true").unwrap(); |
1090 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
1091 | 1 | assert_eq!(&get_preference("Language").unwrap(), "zz"); |
1092 | 1 | assert_eq!(&get_preference("ClearSpeak_AbsoluteValue").unwrap(), "Cardinality"); |
1093 | 1 | assert_eq!(&get_preference("Overview").unwrap(), "true"); |
1094 | 1 | assert_eq!(&get_preference("BrailleCode").unwrap(), "UEB"); |
1095 | 1 | assert!(&get_preference("X_Y_Z").is_err()); |
1096 | ||
1097 | } | |
1098 | 1 | }); |
1099 | 1 | } |
1100 | ||
1101 | #[test] | |
1102 | 1 | fn test_language_change() { |
1103 | // set_preference borrows the pref manager, so the previous borrow's lifetime needs to be ended before using it | |
1104 | 1 | PREF_MANAGER.with(|pref_manager| { |
1105 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1106 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1107 | 1 | }); |
1108 | 1 | crate::interface::set_preference("Language", "en").unwrap(); |
1109 | 1 | crate::interface::set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
1110 | 1 | PREF_MANAGER.with(|pref_manager| { |
1111 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1112 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
1113 | 1 | }); |
1114 | ||
1115 | 1 | crate::interface::set_preference("Language", "zz").unwrap(); |
1116 | 1 | PREF_MANAGER.with(|pref_manager| { |
1117 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1118 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1119 | 1 | }); |
1120 | 1 | } |
1121 | ||
1122 | #[test] | |
1123 | 1 | fn test_speech_style_change() { |
1124 | 1 | PREF_MANAGER.with(|pref_manager| { |
1125 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1126 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1127 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
1128 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1129 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
1130 | ||
1131 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
1132 | ||
1133 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/SimpleSpeak_Rules.yaml")); |
1134 | 1 | }); |
1135 | 1 | } |
1136 | ||
1137 | #[test] | |
1138 | 1 | fn test_some_changes() { |
1139 | 1 | PREF_MANAGER.with(|pref_manager| { |
1140 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1141 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1142 | 1 | pref_manager.set_user_prefs("Verbosity", "Terse").unwrap(); |
1143 | ||
1144 | 1 | assert_eq!(&pref_manager.pref_to_string("Verbosity"), "Terse"); |
1145 | ||
1146 | 1 | pref_manager.set_user_prefs("BrailleCode", "UEB").unwrap(); |
1147 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Braille)), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1148 | ||
1149 | // make sure they show up when building context for speech generation | |
1150 | 1 | let merged_prefs = pref_manager.merge_prefs(); |
1151 | 1 | assert_eq!(merged_prefs.get("Verbosity").unwrap().as_str().unwrap(), "Terse"); |
1152 | 1 | }); |
1153 | ||
1154 | 1 | crate::interface::set_preference("NavVerbosity", "Terse").unwrap(); |
1155 | 1 | PREF_MANAGER.with(|pref_manager| { |
1156 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1157 | 1 | let merged_prefs = pref_manager.merge_prefs(); |
1158 | 1 | assert_eq!(merged_prefs.get("NavVerbosity").unwrap().as_str().unwrap(), "Terse"); |
1159 | 1 | }); |
1160 | 1 | } |
1161 | ||
1162 | #[test] | |
1163 | 1 | fn test_illegal_pref_values() { |
1164 | 1 | PREF_MANAGER.with(|pref_manager| { |
1165 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1166 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1167 | 1 | assert!(pref_manager.set_string_pref("Language", "../../../my/path").is_err()); |
1168 | 1 | assert!(pref_manager.set_string_pref("BrailleCode", "C:\\my\\path").is_err()); |
1169 | 1 | assert!(pref_manager.set_string_pref("SpeechStyle", "/my/path").is_err()); |
1170 | 1 | }); |
1171 | 1 | } |
1172 | ||
1173 | #[test] | |
1174 | #[ignore] // this is an ugly test for #262 -- it changes the prefs file and so is a bad thing in general | |
1175 | 0 | fn test_up_to_date() { |
1176 | use std::fs; | |
1177 | use std::thread::sleep; | |
1178 | use std::time::Duration; | |
1179 | use crate::interface; | |
1180 | 0 | PREF_MANAGER.with(|pref_manager| { |
1181 | 0 | let mut pref_manager = pref_manager.borrow_mut(); |
1182 | 0 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1183 | 0 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
1184 | 0 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1185 | 0 | }); |
1186 | 0 | interface::set_mathml("<math><mo>+</mo><mn>10</mn></math>").unwrap(); |
1187 | 0 | assert_eq!(interface::get_spoken_text().unwrap(), "ClearSpeak positive from zz 10"); |
1188 | ||
1189 | 0 | let mut file_path = PathBuf::default(); |
1190 | 0 | let mut contents = vec![]; |
1191 | 0 | PREF_MANAGER.with(|pref_manager| { |
1192 | 0 | let pref_manager = pref_manager.borrow(); |
1193 | 0 | if let Some(file_name) = pref_manager.user_prefs_file.as_ref().unwrap().debug_get_file() { |
1194 | 0 | file_path = PathBuf::from(file_name); |
1195 | 0 | contents = fs::read(&file_path).expect(&format!("Failed to write file {} during test", file_name)); |
1196 | 0 | let changed_contents = String::from_utf8(contents.clone()).unwrap() |
1197 | 0 | .replace("SpeechStyle: ClearSpeak", "SpeechStyle: SimpleSpeak"); |
1198 | 0 | fs::write(&file_path, changed_contents).unwrap(); |
1199 | 0 | sleep(Duration::from_millis(5)); // make sure the time changes enough to be recognized |
1200 | 0 | } |
1201 | 0 | }); |
1202 | 0 | assert_eq!(interface::get_spoken_text().unwrap(), "SimpleSpeak positive from zz 10"); |
1203 | 0 | fs::write(&file_path, contents).unwrap(); |
1204 | ||
1205 | // assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); | |
1206 | // assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); | |
1207 | 0 | } |
1208 | ||
1209 | }} | |
1210 | } |
Line | Count | Source |
1 | //! Preferences come from either the user or are programmatically set by the AT. | |
2 | //! Either source can set any preference, but users and AT typically set different preferences. | |
3 | //! | |
4 | //! User prefs are read in from a YAML file (prefs.yaml). They can be written by hand. | |
5 | //! In the future, there will hopefully be a nice UI that writes out the YAML file. | |
6 | //! | |
7 | //! AT prefs are set via the API given in the [crate::interface] module. | |
8 | //! These in turn call [`PreferenceManager::set_string_pref`] and [`PreferenceManager::set_api_float_pref`]. | |
9 | //! Ultimately, user and api prefs are stored in a hashmap. | |
10 | //! | |
11 | //! Preferences can be found in a few places: | |
12 | //! 1. Language-independent prefs found in the Rules dir | |
13 | //! 2. Language-specific prefs | |
14 | //! 3. Language-region-specific prefs | |
15 | //! | |
16 | //! If there are multiple definitions, the later ones overwrite the former ones. | |
17 | //! This means that region-specific variants will overwrite more general variants. | |
18 | //! | |
19 | //! Note: there are a number of public 'get_xxx' functions that really are meant to be public only to the [crate::speech] module as speech needs access | |
20 | //! to the preferences to generate the speech. | |
21 | #![allow(clippy::needless_return)] | |
22 | use yaml_rust::{Yaml, YamlLoader}; | |
23 | use crate::pretty_print::yaml_to_string; | |
24 | use crate::tts::TTS; | |
25 | use std::cell::RefCell; | |
26 | use std::rc::Rc; | |
27 | use log::{debug, error, warn}; | |
28 | use std::path::{Path, PathBuf}; | |
29 | use std::sync::LazyLock; | |
30 | use crate::speech::{as_str_checked, RulesFor, FileAndTime}; | |
31 | use std::collections::{HashMap, HashSet}; | |
32 | use phf::phf_set; | |
33 | use crate::shim_filesystem::*; | |
34 | use crate::errors::*; | |
35 | ||
36 | /// Use to indicate preference not found with Preference::to_string() | |
37 | pub static NO_PREFERENCE: &str = "\u{FFFF}"; | |
38 | ||
39 | 3 | static DEFAULT_LANG: LazyLock<Yaml> = LazyLock::new(|| Yaml::String("en".to_string())); |
40 | ||
41 | ||
42 | // Preferences are recorded here | |
43 | /// Preferences are stored in a HashMap. It maps the name of the pref (a String) to its value (stored as YAML string/float) | |
44 | pub type PreferenceHashMap = HashMap<String, Yaml>; | |
45 | #[derive(Debug, Clone, Default)] | |
46 | pub struct Preferences { | |
47 | prefs: PreferenceHashMap // FIX: pub so can get at iterator, should add iterator to Preferences instead | |
48 | } | |
49 | ||
50 | use std::fmt; | |
51 | impl fmt::Display for Preferences { | |
52 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
53 | 0 | let mut pref_vec: Vec<(&String, &Yaml)> = self.prefs.iter().collect(); |
54 | 0 | pref_vec.sort(); |
55 | 0 | for (name, value) in pref_vec { |
56 | 0 | writeln!(f, " {}: {}", name, yaml_to_string(value, 0))?; |
57 | } | |
58 | 0 | return Ok(()); |
59 | 0 | } |
60 | } | |
61 | ||
62 | impl Preferences{ | |
63 | // default values needed in case nothing else gets set | |
64 | 4.14k | fn user_defaults() -> Preferences { |
65 | 4.14k | let mut prefs = PreferenceHashMap::with_capacity(39); |
66 | 4.14k | prefs.insert("Language".to_string(), Yaml::String("en".to_string())); |
67 | 4.14k | prefs.insert("LanguageAuto".to_string(), Yaml::String("".to_string())); // illegal value so change will be recognized |
68 | 4.14k | prefs.insert("SpeechStyle".to_string(), Yaml::String("ClearSpeak".to_string())); |
69 | 4.14k | prefs.insert("Verbosity".to_string(), Yaml::String("Medium".to_string())); |
70 | 4.14k | prefs.insert("SpeechOverrides_CapitalLetters".to_string(), Yaml::String("".to_string())); // important for testing |
71 | 4.14k | prefs.insert("Blind".to_string(), Yaml::Boolean(true)); |
72 | 4.14k | prefs.insert("MathRate".to_string(), Yaml::Real("100.0".to_string())); |
73 | 4.14k | prefs.insert("PauseFactor".to_string(), Yaml::Real("100.0".to_string())); |
74 | 4.14k | prefs.insert("NavMode".to_string(), Yaml::String("Enhanced".to_string())); |
75 | 4.14k | prefs.insert("Overview".to_string(), Yaml::Boolean(false)); |
76 | 4.14k | prefs.insert("ResetOverView".to_string(), Yaml::Boolean(true)); |
77 | 4.14k | prefs.insert("NavVerbosity".to_string(), Yaml::String("Verbose".to_string())); |
78 | 4.14k | prefs.insert("AutoZoomOut".to_string(), Yaml::Boolean(true)); |
79 | 4.14k | prefs.insert("BrailleCode".to_string(), Yaml::String("Nemeth".to_string())); |
80 | 4.14k | prefs.insert("BrailleNavHighlight".to_string(), Yaml::String("EndPoints".to_string())); |
81 | 4.14k | prefs.insert("UEB_START_MODE".to_string(), Yaml::String("Grade2".to_string())); |
82 | 4.14k | prefs.insert("DecimalSeparators".to_string(), Yaml::String(".".to_string())); |
83 | 4.14k | prefs.insert("BlockSeparators".to_string(), Yaml::String(", \u{00A0}\u{202F}".to_string())); |
84 | ||
85 | 4.14k | return Preferences{ prefs }; |
86 | 4.14k | } |
87 | ||
88 | // default values needed in case nothing else gets set | |
89 | 4.14k | fn api_defaults() -> Preferences { |
90 | 4.14k | let mut prefs = PreferenceHashMap::with_capacity(19); |
91 | 4.14k | prefs.insert("TTS".to_string(), Yaml::String("none".to_string())); |
92 | 4.14k | prefs.insert("Pitch".to_string(), Yaml::Real("0.0".to_string())); |
93 | 4.14k | prefs.insert("Rate".to_string(), Yaml::Real("180.0".to_string())); |
94 | 4.14k | prefs.insert("Volume".to_string(), Yaml::Real("100.0".to_string())); |
95 | 4.14k | prefs.insert("Voice".to_string(), Yaml::String("none".to_string())); |
96 | 4.14k | prefs.insert("Gender".to_string(), Yaml::String("none".to_string())); |
97 | 4.14k | prefs.insert("Bookmark".to_string(), Yaml::Boolean(false)); |
98 | 4.14k | prefs.insert("CapitalLetters_UseWord".to_string(), Yaml::Boolean(true)); |
99 | 4.14k | prefs.insert("CapitalLetters_Pitch".to_string(), Yaml::Real("0.0".to_string())); |
100 | 4.14k | prefs.insert("CapitalLetters_Beep".to_string(), Yaml::Boolean(false)); |
101 | 4.14k | prefs.insert("IntentErrorRecovery".to_string(), Yaml::String("IgnoreIntent".to_string())); // also Error |
102 | 4.14k | prefs.insert("CheckRuleFiles".to_string(), Yaml::String( |
103 | 4.14k | (if cfg!(target_family = "wasm") { |
104 | 4.14k | return Preferences{ prefs }; |
105 | 4.14k | } |
106 | ||
107 | 4.14k | fn read_prefs_file(file: &Path, mut base_prefs: Preferences) -> Result<Preferences> { |
108 | 4.14k | let file_name = file.to_str().unwrap(); |
109 | let docs; | |
110 | 4.14k | match read_to_string_shim(file) { |
111 | 0 | Err(e) => { |
112 | 0 | bail!("Couldn't read file {}\n{}", file_name, e); |
113 | }, | |
114 | 4.14k | Ok( file_contents) => { |
115 | 4.14k | match YamlLoader::load_from_str(&file_contents) { |
116 | 0 | Err(e) => { |
117 | 0 | bail!("Yaml parse error ('{}') in preference file {}.", e, file_name); |
118 | }, | |
119 | 4.14k | Ok(d) => docs = d, |
120 | } | |
121 | ||
122 | } | |
123 | } | |
124 | 4.14k | if docs.len() != 1 { |
125 | 0 | bail!("MathCAT: error in prefs file '{}'.\nFound {} 'documents' -- should only be 1.", file_name, docs.len()); |
126 | 4.14k | } |
127 | ||
128 | 4.14k | let doc = &docs[0]; |
129 | 4.14k | if cfg!(debug_assertions) { |
130 | 4.14k | verify_keys(doc, "Speech", file_name) |
131 | 4.14k | verify_keys(doc, "Navigation", file_name) |
132 | 4.14k | verify_keys(doc, "Braille", file_name) |
133 | 4.14k | verify_keys(doc, "Other", file_name) |
134 | 0 | } |
135 | ||
136 | 4.14k | let prefs = &mut base_prefs.prefs; |
137 | 4.14k | add_prefs(prefs, &doc["Speech"], "", file_name); |
138 | 4.14k | add_prefs(prefs, &doc["Navigation"], "", file_name); |
139 | 4.14k | add_prefs(prefs, &doc["Braille"], "", file_name); |
140 | 4.14k | add_prefs(prefs, &doc["Other"], "", file_name); |
141 | 4.14k | return Ok( Preferences{ prefs: prefs.to_owned() } ); |
142 | ||
143 | ||
144 | ||
145 | 16.5k | fn verify_keys(dict: &Yaml, key: &str, file_name: &str) -> Result<()> { |
146 | 16.5k | let prefs = &dict[key]; |
147 | 16.5k | if prefs.is_badvalue() { |
148 | 0 | bail!("Yaml error in file {}.\nDidn't find '{}' key.", file_name, key); |
149 | 16.5k | } |
150 | 16.5k | if prefs.as_hash().is_none() { |
151 | 0 | bail!("Yaml error in file {}.\n'{}' key is not a dictionary. Value found is {}.", |
152 | 0 | file_name, key, yaml_to_string(dict, 1)); |
153 | 16.5k | } |
154 | 16.5k | return Ok(()); |
155 | 16.5k | } |
156 | ||
157 | 41.4k | fn add_prefs(map: &mut PreferenceHashMap, new_prefs: &Yaml, name_prefix: &str, file_name: &str) { |
158 | 41.4k | if new_prefs.is_badvalue() || new_prefs.is_null() || new_prefs.as_hash().is_none() { |
159 | 0 | return; |
160 | 41.4k | } |
161 | 41.4k | let new_prefs = new_prefs.as_hash().unwrap(); |
162 | 302k | for (yaml_name, yaml_value) in |
163 | 302k | let name = as_str_checked(yaml_name); |
164 | 302k | if let Err( |
165 | 0 | error!("{}", e.context( |
166 | 0 | format!("name '{}' is not a string in file {}", yaml_to_string(yaml_name, 0), file_name))); |
167 | } else { | |
168 | 302k | match yaml_value { |
169 | 24.8k | Yaml::Hash(_) => add_prefs(map, yaml_value, &(name.unwrap().to_string() + "_"), file_name), |
170 | 0 | Yaml::Array(_) => error!("name '{}' has illegal array value {} in file '{}'", |
171 | 0 | yaml_to_string(yaml_name, 0), yaml_to_string(yaml_value, 0), file_name), |
172 | Yaml::String(_) | Yaml::Boolean(_) | Yaml::Integer(_) | Yaml::Real(_) => { | |
173 | 277k | let trimmed_name = name_prefix.to_string() + name.unwrap().trim(); |
174 | 277k | let mut yaml_value = yaml_value.to_owned(); |
175 | 277k | if let Some( |
176 | 236k | yaml_value = Yaml::String(value.to_string()); |
177 | 236k |
|
178 | 277k | map.insert(trimmed_name, yaml_value); |
179 | }, | |
180 | 0 | _ => error!("name '{}' has illegal {:#?} value {} in file '{}'", |
181 | 0 | yaml_to_string(yaml_name, 0), yaml_value, yaml_to_string(yaml_value, 0), file_name), |
182 | } | |
183 | } | |
184 | } | |
185 | 41.4k | } |
186 | 4.14k | } |
187 | ||
188 | #[allow(dead_code)] // used in testing | |
189 | 0 | fn set_string_value(&mut self, name: &str, value: &str) { |
190 | 0 | self.prefs.insert(name.to_string(), Yaml::String(value.trim().to_string())); |
191 | 0 | } |
192 | ||
193 | #[allow(dead_code)] // used in testing | |
194 | 0 | fn set_bool_value(&mut self, name: &str, value: bool) { |
195 | 0 | self.prefs.insert(name.to_string(), Yaml::Boolean(value)); |
196 | 0 | } |
197 | } | |
198 | ||
199 | ||
200 | thread_local!{ | |
201 | static DEFAULT_USER_PREFERENCES: Preferences = Preferences::user_defaults(); | |
202 | static DEFAULT_API_PREFERENCES: Preferences = Preferences::api_defaults(); | |
203 | static PREF_MANAGER: Rc<RefCell<PreferenceManager>> = | |
204 | Rc::new( RefCell::new( PreferenceManager::default() ) ); | |
205 | ||
206 | } | |
207 | ||
208 | /// PreferenceManager keeps track of user and api prefs along with current files | |
209 | /// | |
210 | /// If one of the `FileAndTime` files changes while the program is running, the values will auto-update | |
211 | /// Among other things, that means that a UI that changes a user pref will be reflected the next time someone gets speech, braille, etc. | |
212 | // | |
213 | // Note: I experimented with PREF_MANAGER being a Result<PreferenceManager> in the case of no rule files, | |
214 | // but it ended up being a mess (lots of unwrapping). Having a field is much cleaner. | |
215 | // Also note that if 'error' is not an empty string, SpeechRules can't work so using those requires a check. | |
216 | #[derive(Debug, Default)] | |
217 | pub struct PreferenceManager { | |
218 | rules_dir: PathBuf, // full path to rules dir | |
219 | error: String, // empty/default string if fields are set, otherwise error message | |
220 | user_prefs: Preferences, // prefs that come from reading prefs.yaml (system and user locations) | |
221 | api_prefs: Preferences, // prefs set by API calls (along with some defaults not in the user settings such as "pitch") | |
222 | sys_prefs_file: Option<FileAndTime>, // the system prefs.yaml file | |
223 | user_prefs_file: Option<FileAndTime>, // the user prefs.yaml file | |
224 | intent: PathBuf, // the intent rule style file | |
225 | speech: PathBuf, // the speech rule style file | |
226 | overview: PathBuf, // the overview rule file | |
227 | navigation: PathBuf, // the navigation rule file | |
228 | speech_unicode: PathBuf, // short unicode.yaml file | |
229 | speech_unicode_full: PathBuf, // full unicode.yaml file | |
230 | speech_defs: PathBuf, // the definition.yaml file | |
231 | braille: PathBuf, // the braille rule file | |
232 | braille_unicode: PathBuf, // short braille unicode file | |
233 | braille_unicode_full: PathBuf, // full braille unicode file | |
234 | braille_defs: PathBuf, // the definition.yaml file | |
235 | } | |
236 | ||
237 | ||
238 | impl fmt::Display for PreferenceManager { | |
239 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
240 | 0 | writeln!(f, "PreferenceManager:")?; |
241 | 0 | if self.error.is_empty() { |
242 | 0 | writeln!(f, " not initialized!!! Error is {}", &self.error)?; |
243 | } else { | |
244 | 0 | writeln!(f, " user prefs:\n{}", self.user_prefs)?; |
245 | 0 | writeln!(f, " api prefs:\n{}", self.api_prefs)?; |
246 | 0 | writeln!(f, " style files: {:?}", self.speech.as_path())?; |
247 | 0 | writeln!(f, " unicode files: {:?}", self.speech_unicode.as_path())?; |
248 | 0 | writeln!(f, " intent files: {:?}", self.intent.as_path())?; |
249 | 0 | writeln!(f, " speech definition files: {:?}", self.speech_defs)?; |
250 | 0 | writeln!(f, " braille definition files: {:?}", self.braille_defs)?; |
251 | } | |
252 | 0 | return Ok(()); |
253 | 0 | } |
254 | } | |
255 | ||
256 | impl PreferenceManager { | |
257 | /// Initialize (the) PreferenceManager (a global var). | |
258 | /// 'rules_dir' is the path to "Rules" unless the env var MathCATRulesDir is set | |
259 | /// | |
260 | /// If rules_dir is an empty PathBuf, the existing rules_dir is used (an error if it doesn't exist) | |
261 | 5.09k | pub fn initialize(&mut self, rules_dir: PathBuf) -> Result<()> { |
262 | // Resolve the rules directory to an absolute, canonical path. | |
263 | // If canonicalize() fails (e.g., ACCESS_DENIED in containers), fall back to: | |
264 | // - returning the path as-is if it is already absolute, | |
265 | // - prepending the current working directory if it is relative. | |
266 | // Note: if current_dir() also fails, unwrap_or_default yields an empty PathBuf, | |
267 | // and the result may remain relative. | |
268 | #[cfg(not(feature = "include-zip"))] | |
269 | 5.09k | let rules_dir = match rules_dir.canonicalize() { |
270 | 1 | Err(_e) => { |
271 | 1 | if rules_dir.is_absolute() { |
272 | 0 | rules_dir |
273 | } else { | |
274 | 1 | std::env::current_dir() |
275 | 1 | .unwrap_or_default() |
276 | 1 | .join(&rules_dir) |
277 | } | |
278 | }, | |
279 | 5.09k | Ok(rules_dir) => rules_dir, |
280 | }; | |
281 | ||
282 | 5.09k | self.set_rules_dir(&rules_dir) |
283 | 5.09k | self.set_preference_files() |
284 | 5.09k | self.set_all_files(&rules_dir) |
285 | 5.09k | return Ok( () ); |
286 | ||
287 | 5.09k | } |
288 | ||
289 | 60.3k | pub fn get() -> Rc<RefCell<PreferenceManager>> { |
290 | 60.3k | return PREF_MANAGER.with( |pm| pm.clone() ); |
291 | 60.3k | } |
292 | ||
293 | 0 | pub fn get_error(&self) -> &str { |
294 | 0 | return &self.error; |
295 | 0 | } |
296 | ||
297 | /// Return a `PreferenceHashMap` that is the merger of the api prefs onto the user prefs. | |
298 | 22.7k | pub fn merge_prefs(&self) -> PreferenceHashMap { |
299 | 22.7k | let mut merged_prefs = self.user_prefs.prefs.clone(); |
300 | 22.7k | merged_prefs.extend(self.api_prefs.prefs.clone()); |
301 | 22.7k | return merged_prefs; |
302 | 22.7k | } |
303 | ||
304 | /// Set the rules dir and return failure if it is a bad directory (non-existent, can't find all files, ...) | |
305 | 5.09k | fn set_rules_dir(&mut self, rules_dir: &Path) -> Result<()> { |
306 | // Fix: should make sure all files exists -- fail if not true | |
307 | 5.09k | if !is_dir_shim(rules_dir) { |
308 | 1 | bail!("Unable to find MathCAT Rules directory '{}'", rules_dir.to_string_lossy()) |
309 | 5.09k | } |
310 | 5.09k | self.rules_dir = rules_dir.to_path_buf(); |
311 | 5.09k | return Ok( () ); |
312 | 5.09k | } |
313 | ||
314 | /// Set the rules dir and return failure if it is a bad directory (non-existent, can't find all files, ...) | |
315 | 1 | pub fn get_rules_dir(&self) -> PathBuf { |
316 | // Fix: should make sure rules_dir is set -- fail if not true | |
317 | 1 | return self.rules_dir.clone(); |
318 | 1 | } |
319 | ||
320 | /// Read the preferences from the files (if not up to date) and set the preferences and preference files | |
321 | /// Returns failure if the files don't exist or have errors | |
322 | 20.3k | pub fn set_preference_files(&mut self) -> Result<()> { |
323 | // first, read in the preferences -- need to determine which files to read next | |
324 | // the prefs files are in the rules dir and the user dir; differs from other files | |
325 | 20.3k | if self.api_prefs.prefs.is_empty() { |
326 | 4.14k | self.api_prefs = Preferences{ prefs: DEFAULT_API_PREFERENCES.with(|defaults| defaults.prefs.clone()) }; |
327 | 16.2k | } |
328 | ||
329 | 20.3k | let should_update_system_prefs = self.sys_prefs_file.is_none() || |
330 | 20.3k | let should_update_user_prefs = self.user_prefs_file.is_none() || |
331 | 20.3k | if !(should_update_system_prefs || |
332 | 16.2k | return Ok( () ); // no need to do anything else |
333 | 4.14k | } |
334 | ||
335 | 4.14k | let mut prefs = Preferences::default(); |
336 | ||
337 | 4.14k | let mut system_prefs_file = self.rules_dir.to_path_buf(); |
338 | 4.14k | system_prefs_file.push("prefs.yaml"); |
339 | 4.14k | if is_file_shim(&system_prefs_file) { |
340 | 4.14k | let defaults = DEFAULT_USER_PREFERENCES.with(|defaults| defaults.clone()); |
341 | 4.14k | prefs = Preferences::read_prefs_file(&system_prefs_file, defaults) |
342 | 4.14k | self.sys_prefs_file = Some( FileAndTime::new_with_time(system_prefs_file.clone()) ); |
343 | } else { | |
344 | 0 | error!("MathCAT couldn't open file system preference file '{}'.\nUsing fallback defaults which may be inappropriate.", |
345 | 0 | system_prefs_file.to_str().unwrap()); |
346 | }; | |
347 | ||
348 | 4.14k | let mut user_prefs_file = dirs::config_dir(); |
349 | 4.14k | if let Some(mut user_prefs_file_path_buf) = user_prefs_file { |
350 | 4.14k | user_prefs_file_path_buf.push("MathCAT/prefs.yaml"); |
351 | 4.14k | if is_file_shim(&user_prefs_file_path_buf) { |
352 | 0 | prefs = Preferences::read_prefs_file(&user_prefs_file_path_buf, prefs)?; |
353 | 4.14k | } |
354 | // set the time otherwise keeps needing to do updates | |
355 | 4.14k | self.user_prefs_file = Some( FileAndTime::new_with_time(user_prefs_file_path_buf.clone()) ); |
356 | 4.14k | user_prefs_file = Some(user_prefs_file_path_buf); |
357 | 0 | } |
358 | ||
359 | 4.14k | if prefs.prefs.is_empty() { |
360 | 0 | let user_prefs_file_name = match user_prefs_file { |
361 | 0 | None => "No user config directory".to_string(), |
362 | 0 | Some(file) => file.to_string_lossy().to_string(), |
363 | }; | |
364 | 0 | bail!("Didn't find preferences in rule directory ('{}') or user directory ('{}')", &system_prefs_file.to_string_lossy(), user_prefs_file_name); |
365 | 4.14k | } |
366 | 4.14k | self.set_files_based_on_changes(&prefs) |
367 | 4.14k | self.user_prefs = prefs; |
368 | ||
369 | // set computed values for BLOCK_SEPARATORS and DECIMAL_SEPARATORS (a little messy about the language due immutable and mutable borrows) | |
370 | 4.14k | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
371 | 4.14k | let language = language.as_str().unwrap(); |
372 | 4.14k | self.set_separators(language) |
373 | ||
374 | 4.14k | return Ok( () ); |
375 | 20.3k | } |
376 | ||
377 | 5.09k | fn set_all_files(&mut self, rules_dir: &Path) -> Result<()> { |
378 | // try to find ./Rules/lang/style.yaml and ./Rules/lang/style.yaml | |
379 | // we go through a series of fallbacks -- we try to maintain the language if possible | |
380 | ||
381 | 5.09k | let language = self.pref_to_string("Language"); |
382 | 5.09k | let language = if language.as_str() == "Auto" { |
383 | 5.09k | let language_dir = rules_dir.to_path_buf().join("Languages"); |
384 | 5.09k | self.set_speech_files(&language_dir, language, None) |
385 | ||
386 | 5.09k | let braille_code = self.pref_to_string("BrailleCode"); |
387 | 5.09k | let braille_dir = rules_dir.to_path_buf().join("Braille"); |
388 | 5.09k | self.set_braille_files(&braille_dir, &braille_code) |
389 | 5.09k | return Ok(()); |
390 | 5.09k | } |
391 | ||
392 | 9.19k | fn set_speech_files(&mut self, language_dir: &Path, language: &str, new_speech_style: Option<&str>) -> Result<()> { |
393 | 9.19k | PreferenceManager::unzip_files(language_dir, language, Some("en")) |
394 | 9.19k | self.intent = PreferenceManager::find_file(language_dir, language, Some("en"), "intent.yaml") |
395 | 9.19k | self.overview = PreferenceManager::find_file(language_dir, language, Some("en"), "overview.yaml") |
396 | 9.19k | self.navigation = PreferenceManager::find_file(language_dir, language, Some("en"), "navigate.yaml") |
397 | ||
398 | 9.19k | self.speech_unicode = PreferenceManager::find_file(language_dir, language, Some("en"), "unicode.yaml") |
399 | 9.19k | self.speech_unicode_full = PreferenceManager::find_file(language_dir, language, Some("en"), "unicode-full.yaml") |
400 | ||
401 | 9.19k | self.speech_defs = PreferenceManager::find_file(language_dir, language, Some("en"), "definitions.yaml") |
402 | ||
403 | 9.19k | match new_speech_style { |
404 | 0 | Some(style_name) => self.set_style_file(language_dir, language, style_name)?, |
405 | // use the old style name if one isn't given | |
406 | 9.19k | None => self.set_style_file(language_dir, language, &self.pref_to_string("SpeechStyle")) |
407 | } | |
408 | 9.19k | return Ok( () ); |
409 | 9.19k | } |
410 | ||
411 | 10.7k | fn set_style_file(&mut self, language_dir: &Path, language: &str, style_file_name: &str) -> Result<()> { |
412 | 10.7k | let style_file_name = style_file_name.to_string() + "_Rules.yaml"; |
413 | 10.7k | self.speech = PreferenceManager::find_file(language_dir, language, Some("en"), &style_file_name) |
414 | // debug!("set_style_file: language_dir: {}, language: {}, style_file_name: {}, self.speech: {}", | |
415 | // language_dir.display(), language, style_file_name, self.speech.display()); | |
416 | 10.7k | return Ok( () ); |
417 | 10.7k | } |
418 | ||
419 | 5.69k | fn set_braille_files(&mut self, braille_rules_dir: &Path, braille_code_name: &str) -> Result<()> { |
420 | // Fix: Currently the braille code and the directory it lives in have to have the same name | |
421 | 5.69k | PreferenceManager::unzip_files(braille_rules_dir, braille_code_name, Some("UEB")) |
422 | ||
423 | 5.69k | let braille_file = braille_code_name.to_string() + "_Rules.yaml"; |
424 | ||
425 | 5.69k | self.braille = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), &(braille_file)) |
426 | ||
427 | 5.69k | self.braille_unicode = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "unicode.yaml") |
428 | 5.69k | self.braille_unicode_full = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "unicode-full.yaml") |
429 | ||
430 | 5.69k | self.braille_defs = PreferenceManager::find_file(braille_rules_dir, braille_code_name, Some("UEB"), "definitions.yaml") |
431 | 5.69k | return Ok( () ); |
432 | 5.69k | } |
433 | ||
434 | /// If some preferences have changed, we may need to recompute other ones | |
435 | /// The key prefs are Language, SpeechStyle, and BrailleCode, along with DecimalSeparator | |
436 | 4.14k | fn set_files_based_on_changes(&mut self, new_prefs: &Preferences) -> Result<()> { |
437 | 4.14k | let old_language = self.user_prefs.prefs.get("Language"); // not set if first time |
438 | 4.14k | if old_language.is_none() { |
439 | 4.14k | return Ok( () ); // if "Language" isn't set yet, nothing else is either -- first time through, so no updating needed. |
440 | 0 | } |
441 | ||
442 | 0 | let old_language = old_language.unwrap(); |
443 | 0 | let new_language = new_prefs.prefs.get("Language").unwrap(); |
444 | 0 | debug!("set_files_based_on_changes: old_language={old_language:?}, new_language={new_language:?}"); |
445 | 0 | if old_language != new_language { |
446 | 0 | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
447 | 0 | self.set_speech_files(&language_dir, new_language.as_str().unwrap(), None)?; // also sets style file |
448 | } else { | |
449 | 0 | let old_speech_style = self.user_prefs.prefs.get("SpeechStyle").unwrap(); |
450 | 0 | let new_speech_style = new_prefs.prefs.get("SpeechStyle").unwrap(); |
451 | 0 | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
452 | 0 | if old_speech_style != new_speech_style { |
453 | 0 | self.set_speech_files(&language_dir, new_language.as_str().unwrap(), new_speech_style.as_str())?; |
454 | 0 | } |
455 | } | |
456 | ||
457 | 0 | let old_braille_code = self.user_prefs.prefs.get("BrailleCode").unwrap(); |
458 | 0 | let new_braille_code = new_prefs.prefs.get("BrailleCode").unwrap(); |
459 | 0 | if old_braille_code != new_braille_code { |
460 | 0 | let braille_code_dir = self.rules_dir.to_path_buf().join("Braille"); |
461 | 0 | self.set_braille_files(&braille_code_dir, new_braille_code.as_str().unwrap())?; // also sets style file |
462 | 0 | } |
463 | ||
464 | 0 | return Ok( () ); |
465 | 4.14k | } |
466 | ||
467 | /// Unzip the files if needed | |
468 | /// Returns true if it unzipped them | |
469 | 41.3k | pub fn unzip_files(path: &Path, lang: &str, default_lang: Option<&str>) -> Result<bool> { |
470 | thread_local!{ | |
471 | /// when a language/braille code dir is unzipped, it is recorded here | |
472 | static UNZIPPED_FILES: RefCell<HashSet<String>> = RefCell::new( HashSet::with_capacity(31)); | |
473 | } | |
474 | // ignore regional subdirs | |
475 | 41.3k | let dir = PreferenceManager::get_language_dir(path, lang, default_lang) |
476 | 41.3k | let language = if dir.ends_with(lang) { |
477 | 41.3k | let zip_file_name = language.to_string() + ".zip"; |
478 | 41.3k | let zip_file_path = dir.join(&zip_file_name); |
479 | 41.3k | let zip_file_string = zip_file_path.to_string_lossy().to_string(); |
480 | // debug!("unzip_files: dir: {}, zip_file_name: {}, zip_file_path: {}", dir.display(), zip_file_name, zip_file_string); | |
481 | 41.3k | if UNZIPPED_FILES.with( |unzipped_files| unzipped_files.borrow().contains(&zip_file_string)) { |
482 | 28.6k | return Ok(false); |
483 | 12.6k | } |
484 | ||
485 | 12.6k | let |
486 | 1.44k | Err(e) => { |
487 | 1.44k | if lang.contains('-') { |
488 | // try again in parent dir of regional language | |
489 | 0 | let language = lang.split_once('-').unwrap_or((lang, "")).0; // get the parent language |
490 | // debug!("unzip_files: trying again in parent language: {}", language); | |
491 | 0 | PreferenceManager::unzip_files(path, language, default_lang) |
492 | 0 | .with_context(|| format!("Couldn't open zip file {zip_file_string} in parent {language}: {e}."))? |
493 | } else { | |
494 | // maybe just regional dialects | |
495 | 1.44k | let mut regional_dirs = Vec::new(); |
496 | 1.44k | find_all_dirs_shim(&dir, &mut regional_dirs); |
497 | 1.44k | for dir in regional_dirs { |
498 | // debug!("unzip_files: trying again in subdir: {}", dir.display()); | |
499 | 1.44k | let language = format!("{}-{}", lang, dir.file_name().unwrap().to_str().unwrap()); |
500 | 1.44k | if let Ok(result) =PreferenceManager::unzip_files(path, &language, default_lang) { |
501 | 1.44k | return Ok(result); |
502 | 0 | } |
503 | } | |
504 | 0 | bail!("Couldn't open zip file {}: {}.", zip_file_string, e) |
505 | } | |
506 | }, | |
507 | 11.2k | Ok(result) => { |
508 | 11.2k | result |
509 | }, | |
510 | }; | |
511 | ||
512 | 11.2k | UNZIPPED_FILES.with( |unzipped_files| unzipped_files.borrow_mut().insert(zip_file_string.clone()) ); |
513 | // debug!(" unzip_files: unzipped {} files from {}", result, &zip_file_string); | |
514 | // UNZIPPED_FILES.with( |unzipped_files| { | |
515 | // debug!("unzip_files: unzipped_files: {:?}", unzipped_files.borrow()); | |
516 | // }); | |
517 | ||
518 | 11.2k | return Ok(result); |
519 | 41.3k | } |
520 | ||
521 | /// Set BlockSeparators and DecimalSeparators | |
522 | /// FIX: changing these values could change the parse, so we really should reparse the original expr, but that doesn't exist anymore (store the original string???) | |
523 | /// | |
524 | /// Note: DecimalSeparator is user-facing (can be Auto), DecimalSeparators is code-facing (always a char) | |
525 | 8.24k | fn set_separators(&mut self, language_country: &str) -> Result<()> { |
526 | // This list was generated from https://en.wikipedia.org/wiki/Decimal_separator#Countries_using_decimal_point | |
527 | // The countries were then mapped to language(s) using https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory | |
528 | // When a language was used in other countries that used a "," separator, the language+country is listed | |
529 | // Sometimes there are multiple languages used in a country -- they are all listed, sometimes with a country code | |
530 | // The country code isn't used when the language is used in smaller countries (i.e, when "." is more likely correct) | |
531 | // This decision is sometimes a bit arbitrary | |
532 | // For example, Swahili (sw) is used in: Democratic Republic of the Congo, Kenya, Rwanda, Tanzania, and Uganda. | |
533 | // Of these, Kenya, Tanzania, and Uganda are listed as using "." and I include Swahili in the list below. | |
534 | static USE_DECIMAL_SEPARATOR: phf::Set<&str> = phf_set! { | |
535 | "en", "bn", "km", "el-cy", "tr-cy", "zh", "es-do", "ar", "es-sv", "es-gt", "es-hn", "hi", "as", "gu", "kn", "ks", | |
536 | "ml", "mr", "ne", "or", "pa", "sa", "sd", "ta", "te", "ur", "he", "ja", "sw", "ko", "de-li", "ms", "dv", "mt", "es-mx", "my", | |
537 | "af-na", "es-ni", "es-pa", "fil", "ms-sg", "si", "th", | |
538 | "es-419", // latin america | |
539 | }; | |
540 | ||
541 | 8.24k | let decimal_separator = self.pref_to_string("DecimalSeparator"); |
542 | 8.24k | if !["Auto", ",", "."].contains(&decimal_separator.as_str()) { |
543 | 2 | return Ok( () ); |
544 | 8.24k | } |
545 | ||
546 | 8.24k | if language_country == "Auto" && |
547 | 4.14k | return Ok( () ); // "Auto" doesn't tell us anything -- we will get called again when Language is set |
548 | 4.10k | } |
549 | ||
550 | 4.10k | let language_country = language_country.to_ascii_lowercase(); |
551 | 4.10k | let language_country = &language_country; |
552 | 4.10k | let mut lang_country_split = language_country.split('-'); |
553 | 4.10k | let language = lang_country_split.next().unwrap_or(""); |
554 | 4.10k | let country = lang_country_split.next().unwrap_or(""); |
555 | 4.10k | let mut use_period = decimal_separator == "."; |
556 | 4.10k | if decimal_separator == "Auto" { |
557 | // if we don't have a match for the lang-country, then just try lang | |
558 | 4.09k | use_period = USE_DECIMAL_SEPARATOR.contains(language_country) || |
559 | 2 | } |
560 | // debug!("set_separators: use_period: {}", use_period); | |
561 | 4.10k | self.user_prefs.prefs.insert("DecimalSeparators".to_string(), Yaml::String((if use_period { |
562 | 4.10k | let mut block_separators = (if use_period { |
563 | 4.10k | if country == "ch" || country == "li" { // Switzerland and Liechtenstein also use ` as a block separator, at least in some cases |
564 | 0 | block_separators.push('\''); |
565 | 4.10k | } |
566 | 4.10k | self.user_prefs.prefs.insert("BlockSeparators".to_string(), Yaml::String(block_separators)); |
567 | 4.10k | return Ok( () ); |
568 | 8.24k | } |
569 | ||
570 | ||
571 | /// Find a file matching `file_name` by starting in the regional directory and looking to the language. | |
572 | /// If that fails, fall back to looking for the default repeating the same process -- something needs to be found or MathCAT crashes | |
573 | 88.6k | fn find_file(rules_dir: &Path, lang: &str, default_lang: Option<&str>, file_name: &str) -> Result<PathBuf> { |
574 | // rules_dir: is the root of the search | |
575 | // to that we add the language dir(s) | |
576 | // if file_name doesn't exist in the language dir(s), we try to find it in the default dir | |
577 | // the exception to this is if it ends with _Rules.yaml, we look for other _Rules.yaml files | |
578 | // returns the location of the file_name found | |
579 | ||
580 | // start by trying to find a dir that exists | |
581 | 88.6k | let lang_dir = PreferenceManager::get_language_dir(rules_dir, lang, default_lang) |
582 | // now find the file name in the dirs | |
583 | // we start with the deepest dir and walk back to towards Rules | |
584 | 88.6k | let mut alternative_style_file = None; // back up in case we don't find the target style in lang_dir |
585 | 88.6k | let looking_for_style_file = file_name.ends_with("_Rules.yaml"); |
586 | 108k | for os_path in |
587 | 108k | let path = PathBuf::from(os_path).join(file_name); |
588 | // debug!("find_file: checking file: {}", path.to_string_lossy()); | |
589 | 108k | if is_file_shim(&path) { |
590 | // we make an exception for definitions.yaml -- there a language specific checks for Hundreds, etc | |
591 | 88.4k | if !(file_name == "definitions.yaml" && |
592 | // debug!("find_file -- found={}", path.to_string_lossy()); | |
593 | 88.4k | return Ok(path); |
594 | 2 | } |
595 | 19.7k | }; |
596 | 19.7k | if looking_for_style_file && |
597 | 257 | let Ok( |
598 | 249 | // debug!("find_file: found alternative style file '{}'", alt_file_path.display()); |
599 | 249 | alternative_style_file = Some(alt_file_path); |
600 | 19.4k | } |
601 | 19.7k | if os_path.ends_with("Rules") { |
602 | // at root of Rules directory | |
603 | 256 | break; |
604 | 19.4k | } |
605 | } | |
606 | ||
607 | ||
608 | 256 | if let Some( |
609 | // debug!("find_file: found alternative_style_file '{}'", result.to_string_lossy()); | |
610 | 248 | return Ok(result); // found an alternative style file in the same lang dir |
611 | 8 | } |
612 | ||
613 | // try a subdir (regional dialect) of the language dir | |
614 | 8 | let mut regional_dirs = Vec::new(); |
615 | 8 | find_all_dirs_shim(&lang_dir, &mut regional_dirs); |
616 | 8 | for dir in regional_dirs { |
617 | // debug!("find_file: trying again in subdir: {}", dir.display()); | |
618 | // debug!(" ... files found = {:?}", find_files_in_dir_that_ends_with_shim(&dir, file_name)); | |
619 | 8 | if find_files_in_dir_that_ends_with_shim(&dir, ".yaml").contains(&file_name.to_string()) { |
620 | 0 | let path = dir.join(file_name); |
621 | 0 | if is_file_shim(&path) { |
622 | 0 | return Ok(path); |
623 | 0 | } |
624 | 8 | } |
625 | } | |
626 | ||
627 | 8 | if let Some(default_lang) = default_lang { |
628 | // try again with the default language (we're likely in trouble) | |
629 | 8 | return PreferenceManager::find_file(rules_dir, default_lang, None, file_name); |
630 | 0 | } |
631 | ||
632 | // We are done for -- MathCAT can't do anything without the required files! | |
633 | 0 | bail!("Wasn't able to find/read MathCAT required file in directory: {}\n\ |
634 | Initially looked in there for language specific directory: {}\n\ | |
635 | Looking for file: {}", | |
636 | 0 | rules_dir.to_str().unwrap(), lang, file_name); |
637 | ||
638 | ||
639 | /// try to find a xxx_Rules.yaml file -- returns an error if none is found () | |
640 | 257 | fn find_any_style_file(path: &Path) -> Result<PathBuf> { |
641 | // try to find a xxx_Rules.yaml file | |
642 | // we find the first file because this is the deepest (most language specific) speech rule file | |
643 | 257 | let rule_files = find_files_in_dir_that_ends_with_shim(path, "_Rules.yaml"); |
644 | 257 | if rule_files.is_empty() { |
645 | 8 | bail!{"didn't find file"}; |
646 | } else { | |
647 | 249 | return Ok( path.join(rule_files[0].clone()) ); |
648 | } | |
649 | 257 | } |
650 | 88.6k | } |
651 | ||
652 | 129k | fn get_language_dir(rules_dir: &Path, lang: &str, default_lang: Option<&str>) -> Result<PathBuf> { |
653 | // return 'Rules/Language/fr', 'Rules/Language/en/gb', etc, if they exist. | |
654 | // fall back to main language, and then to default_dir if language dir doesn't exist | |
655 | 129k | let mut full_path = rules_dir.to_path_buf(); |
656 | 129k | full_path.push(lang.replace('-', std::path::MAIN_SEPARATOR_STR)); |
657 | 130k | for parent in |
658 | 130k | if parent == rules_dir { |
659 | 0 | break; |
660 | 130k | } else if is_dir_shim(parent) { |
661 | 129k | return Ok(parent.to_path_buf()); |
662 | 24 | } |
663 | } | |
664 | ||
665 | // didn't find the language -- try again with the default language | |
666 | 0 | match default_lang { |
667 | 0 | Some(default_lang) => { |
668 | 0 | warn!("Couldn't find rules for language {lang}, "); |
669 | 0 | return PreferenceManager::get_language_dir(rules_dir, default_lang, None); |
670 | }, | |
671 | None => { | |
672 | // We are done for -- MathCAT can't do anything without the required files! | |
673 | 0 | bail!("Wasn't able to find/read directory for language {}\n |
674 | Wasn't able to find/read MathCAT default language directory: {}", | |
675 | 0 | lang, rules_dir.join(default_lang.unwrap_or("")).as_os_str().to_str().unwrap()); |
676 | } | |
677 | } | |
678 | 129k | } |
679 | ||
680 | ||
681 | /// Return the speech rule style file locations. | |
682 | 15.3k | pub fn get_rule_file(&self, name: &RulesFor) -> &Path { |
683 | 15.3k | if !self.error.is_empty() { |
684 | 0 | panic!("Internal error: get_rule_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
685 | 15.3k | }; |
686 | ||
687 | 15.3k | let files = match name { |
688 | 3.88k | RulesFor::Intent => &self.intent, |
689 | 9.03k | RulesFor::Speech => &self.speech, |
690 | 14 | RulesFor::OverView => &self.overview, |
691 | 549 | RulesFor::Navigation => &self.navigation, |
692 | 1.83k | RulesFor::Braille => &self.braille, |
693 | }; | |
694 | 15.3k | return files.as_path(); |
695 | 15.3k | } |
696 | ||
697 | /// Return the unicode.yaml file locations. | |
698 | 18.9k | pub fn get_speech_unicode_file(&self) ->(&Path, &Path) { |
699 | 18.9k | if !self.error.is_empty() { |
700 | 0 | panic!("Internal error: get_speech_unicode_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
701 | 18.9k | }; |
702 | 18.9k | return (self.speech_unicode.as_path(), self.speech_unicode_full.as_path()); |
703 | 18.9k | } |
704 | ||
705 | /// Return the unicode.yaml file locations. | |
706 | 3.92k | pub fn get_braille_unicode_file(&self) -> (&Path, &Path) { |
707 | 3.92k | if !self.error.is_empty() { |
708 | 0 | panic!("Internal error: get_braille_unicode_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
709 | 3.92k | }; |
710 | ||
711 | 3.92k | return (self.braille_unicode.as_path(), self.braille_unicode_full.as_path()); |
712 | 3.92k | } |
713 | ||
714 | /// Return the definitions.yaml file locations. | |
715 | 15.3k | pub fn get_definitions_file(&self, use_speech_defs: bool) -> &Path { |
716 | 15.3k | if !self.error.is_empty() { |
717 | 0 | panic!("Internal error: get_definitions_file called on invalid PreferenceManager -- error message\n{}", &self.error); |
718 | 15.3k | }; |
719 | ||
720 | 15.3k | let defs_file = if use_speech_defs { |
721 | 15.3k | return defs_file; |
722 | 15.3k | } |
723 | ||
724 | /// Return the TTS engine currently in use. | |
725 | 85.4k | pub fn get_tts(&self) -> TTS { |
726 | 85.4k | if !self.error.is_empty() { |
727 | 0 | panic!("Internal error: get_tts called on invalid PreferenceManager -- error message\n{}", &self.error); |
728 | 85.4k | }; |
729 | ||
730 | 85.4k | return match self.pref_to_string("TTS").as_str().to_ascii_lowercase().as_str() { |
731 | 85.4k | "none" => TTS::None, |
732 | 0 | "ssml" => TTS::SSML, |
733 | 0 | "sapi5" => TTS::SAPI5, |
734 | _ => { | |
735 | 0 | warn!("found unknown value for TTS: '{}'", self.pref_to_string("TTS").as_str()); |
736 | 0 | TTS::None |
737 | } | |
738 | } | |
739 | 85.4k | } |
740 | ||
741 | /// Set the string-valued preference. | |
742 | /// | |
743 | /// Note: changing the language, speech style, or braille code might fail if the files don't exist. | |
744 | /// If this happens, the preference is not set and an error is returned. | |
745 | /// If "LanguageAuto" is set, we assume "Language" has already be checked to be "Auto" | |
746 | 16.2k | pub fn set_string_pref(&mut self, key: &str, value: &str) -> Result<()> { |
747 | 16.2k | if !self.error.is_empty() { |
748 | 0 | panic!("Internal error: set_string_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
749 | 16.2k | }; |
750 | ||
751 | // verify language, braille, and SpeechStyle because these are used as access into the file system | |
752 | // should be an ascii string with only letters, dashes, and underscores | |
753 | 16.2k | if |
754 | 55.9k | ! |
755 | 3 | bail!("{} is an invalid value! Must contains only ascii letters, '_', or'-'", key); |
756 | 16.2k | } |
757 | ||
758 | // don't do an update if the value hasn't changed | |
759 | 16.2k | let mut is_user_pref = true; |
760 | 16.2k | if let Some( |
761 | 61 | if pref_value.as_str().unwrap() != value { |
762 | 59 | is_user_pref = false; |
763 | 59 | self.reset_files_from_preference_change(key, value) |
764 | 2 | } |
765 | 16.1k | } else if let Some(pref_value) = self.user_prefs.prefs.get(key) { |
766 | 16.1k | if pref_value.as_str().unwrap() != value { |
767 | 8.77k | self.reset_files_from_preference_change(key, value) |
768 | 7.40k | } |
769 | } else { | |
770 | 0 | bail!("{} is an unknown MathCAT preference!", key); |
771 | } | |
772 | ||
773 | // debug!("Setting ({}) {} to '{}'", if is_user_pref {"user"} else {"sys"}, key, value); | |
774 | 16.2k | if is_user_pref { |
775 | // a little messy about the DecimalSeparator due immutable and mutable borrows | |
776 | 16.1k | let current_decimal_separator = self.user_prefs.prefs.get("DecimalSeparator").unwrap().clone(); |
777 | 16.1k | let current_decimal_separator = current_decimal_separator.as_str().unwrap(); |
778 | 16.1k | let is_decimal_separators_changed = key == "DecimalSeparator" && |
779 | 16.1k | let is_language_changed = key == "Language" && |
780 | 16.1k | self.user_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
781 | 16.1k | if is_decimal_separators_changed || (current_decimal_separator == "Auto" && is_language_changed) { |
782 | // a little messy about the language due immutable and mutable borrows) | |
783 | 4.08k | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
784 | 4.08k | let language = language.as_str().unwrap(); |
785 | 4.08k | self.set_separators(language) |
786 | 12.0k | } |
787 | 59 | } else { |
788 | 59 | self.api_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
789 | 59 | } |
790 | 16.2k | return Ok( () ); |
791 | 16.2k | } |
792 | ||
793 | 30.2k | fn reset_files_from_preference_change(&mut self, changed_pref: &str, changed_value: &str) -> Result<()> { |
794 | 30.2k | if changed_pref == "Language" && |
795 | // Language must have had a non-Auto value -- set LanguageAuto to old value so (probable) next change to LanguageAuto works well | |
796 | 0 | self.api_prefs.prefs.insert("LanguageAuto".to_string(), |
797 | 0 | self.api_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone() ); |
798 | 0 | return Ok( () ); |
799 | 30.2k | } |
800 | ||
801 | 30.2k | let changed_pref = if changed_pref == "LanguageAuto" { |
802 | 30.2k | let language_dir = self.rules_dir.to_path_buf().join("Languages"); |
803 | 30.2k | match changed_pref { |
804 | 30.2k | "Language" => { |
805 | 4.09k | self.set_speech_files(&language_dir, changed_value, None) |
806 | }, | |
807 | 26.1k | "SpeechStyle" => { |
808 | 1.51k | let language = self.pref_to_string("Language"); |
809 | 1.51k | let language = if language.as_str() == "Auto" { |
810 | 1.51k | self.set_style_file(&language_dir, language, changed_value) |
811 | }, | |
812 | 24.6k | "BrailleCode" => { |
813 | 601 | let braille_dir = self.rules_dir.to_path_buf().join("Braille"); |
814 | 601 | self.set_braille_files(&braille_dir, changed_value) |
815 | }, | |
816 | 24.0k | _ => (), |
817 | } | |
818 | 30.2k | return Ok( () ); |
819 | 30.2k | } |
820 | ||
821 | /// Set the number-valued preference. | |
822 | /// All number-valued preferences are stored with type `f64`. | |
823 | 0 | pub fn set_api_float_pref(&mut self, key: &str, value: f64) { |
824 | 0 | if !self.error.is_empty() { |
825 | 0 | panic!("Internal error: set_api_float_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
826 | 0 | }; |
827 | ||
828 | 0 | self.api_prefs.prefs.insert(key.to_string(), Yaml::Real(value.to_string())); |
829 | 0 | } |
830 | ||
831 | 1.50k | pub fn set_api_boolean_pref(&mut self, key: &str, value: bool) { |
832 | 1.50k | if !self.error.is_empty() { |
833 | 0 | panic!("Internal error: set_api_boolean_pref called on invalid PreferenceManager -- error message\n{}", &self.error); |
834 | 1.50k | }; |
835 | ||
836 | 1.50k | self.api_prefs.prefs.insert(key.to_string(), Yaml::Boolean(value)); |
837 | 1.50k | } |
838 | ||
839 | /// Return the current speech rate. | |
840 | 0 | pub fn get_rate(&self) -> f64 { |
841 | 0 | if !self.error.is_empty() { |
842 | 0 | panic!("Internal error: get_rate called on invalid PreferenceManager -- error message\n{}", &self.error); |
843 | 0 | }; |
844 | ||
845 | 0 | return match &self.pref_to_string("Rate").parse::<f64>() { |
846 | 0 | Ok(val) => *val, |
847 | Err(_) => { | |
848 | 0 | warn!("Rate ('{}') can't be converted to a floating point number", &self.pref_to_string("Rate")); |
849 | 0 | DEFAULT_API_PREFERENCES.with(|defaults| defaults.prefs["Rate"].as_f64().unwrap()) |
850 | } | |
851 | }; | |
852 | 0 | } |
853 | ||
854 | 0 | pub fn get_api_prefs(&self) -> &Preferences { |
855 | 0 | return &self.api_prefs; |
856 | 0 | } |
857 | ||
858 | /// returns value associated with 'name' or string NO_PREFERENCE | |
859 | /// | |
860 | /// Note: Option/Result not used because most of the time we know the preference exists, so no unwrapping is needed for 95% of calls | |
861 | 268k | pub fn pref_to_string(&self, name: &str) -> String { |
862 | 268k | let mut value = self.api_prefs.prefs.get(name); |
863 | 268k | if value.is_none() { |
864 | 122k | value = self.user_prefs.prefs.get(name); |
865 | 145k | } |
866 | 268k | return match value { |
867 | 11 | None => NO_PREFERENCE.to_string(), |
868 | 268k | Some(v) => match v { |
869 | 236k | Yaml::String(s) => s.clone(), |
870 | 27.3k | Yaml::Boolean(b) => b.to_string(), |
871 | 4.23k | Yaml::Integer(i) => i.to_string(), |
872 | 0 | Yaml::Real(s) => s.clone(), |
873 | 0 | _ => NO_PREFERENCE.to_string(), // shouldn't happen |
874 | } | |
875 | } | |
876 | 268k | } |
877 | ||
878 | // occasionally useful to check a pref value when debugging | |
879 | // fn get_pref(&self, pref_name: &str) -> String { | |
880 | // return yaml_to_string(self.user_prefs.prefs.get(pref_name).unwrap(), 1); | |
881 | // } | |
882 | ||
883 | /// Warning!!! This is meant for testing only -- it overwrites any values from a user pref file and will be overwritten if the file is reread. | |
884 | /// set_preference() is the function that should be called. | |
885 | /// This differs from set_preference in that the user preferences are changed, not the api ones | |
886 | 21.4k | pub fn set_user_prefs(&mut self, key: &str, value: &str) -> Result<()> { |
887 | 21.4k | if !self.error.is_empty() { |
888 | 0 | panic!("Internal error: set_user_prefs called on invalid PreferenceManager -- error message\n{}", &self.error); |
889 | 21.4k | }; |
890 | ||
891 | 21.4k | self.reset_files_from_preference_change(key, value) |
892 | 21.4k | let is_decimal_separators_changed = key == "DecimalSeparator" && |
893 | 21.4k | let is_language_changed = key == "Language" && |
894 | 21.4k | self.user_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); |
895 | 21.4k | if is_decimal_separators_changed || |
896 | // set computed values for BLOCK_SEPARATORS and DECIMAL_SEPARATORS (a little messy about the language due immutable and mutable borrows) | |
897 | 17 | let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); |
898 | 17 | let language = language.as_str().unwrap(); |
899 | 17 | self.set_separators(language) |
900 | 21.3k | } |
901 | ||
902 | 21.4k | return Ok(()); |
903 | 21.4k | } |
904 | } | |
905 | ||
906 | ||
907 | #[cfg(test)] | |
908 | mod tests { | |
909 | #[allow(unused_imports)] | |
910 | use crate::init_logger; | |
911 | ||
912 | // For these tests, it is assumed that there are Rules subdirs zz and zz/aa dir; there is no zz/ab | |
913 | // definitions.yaml is in Rules, zz, aa dirs | |
914 | // unicode.yaml is in zz | |
915 | // ClearSpeak_Rules.yaml is in zz | |
916 | // These files are NOT in the zipped up version -- hence the config | |
917 | use super::*; | |
918 | ||
919 | /// Version of abs_rules_dir_path that returns a PathBuf | |
920 | 11 | fn abs_rules_dir_path() -> PathBuf { |
921 | 11 | return PathBuf::from(super::super::abs_rules_dir_path()); |
922 | 11 | } |
923 | /// Return a relative path to Rules dir (ie, .../Rules/zz... returns zz/...) | |
924 | /// strip .../Rules from file path | |
925 | 33 | fn rel_path<'a>(rules_dir: &'a Path, path: &'a Path) -> &'a Path { |
926 | 33 | let stripped_path = path.strip_prefix(rules_dir).unwrap(); |
927 | 33 | return stripped_path |
928 | 33 | } |
929 | ||
930 | #[test] | |
931 | 1 | fn separators() { |
932 | 1 | PREF_MANAGER.with(|pref_manager| { |
933 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
934 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
935 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
936 | 1 | pref_manager.set_user_prefs("DecimalSeparator", "Auto").unwrap(); |
937 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
938 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
939 | ||
940 | 1 | pref_manager.set_user_prefs("Language", "sv").unwrap(); |
941 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
942 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
943 | ||
944 | // test potentially ambiguous language (defaults to comma decimal separator) | |
945 | 1 | pref_manager.set_user_prefs("Language", "es").unwrap(); |
946 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
947 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
948 | ||
949 | // test country override | |
950 | 1 | pref_manager.set_user_prefs("Language", "es-mx").unwrap(); |
951 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
952 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
953 | ||
954 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ",").unwrap(); |
955 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
956 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ". \u{00A0}\u{202F}"); |
957 | ||
958 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ".").unwrap(); |
959 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
960 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
961 | ||
962 | // set to illegal value -- should leave values as before | |
963 | 1 | pref_manager.set_user_prefs("DecimalSeparator", ";").unwrap(); |
964 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), "."); |
965 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), ", \u{00A0}\u{202F}"); |
966 | ||
967 | // manual | |
968 | 1 | pref_manager.set_user_prefs("DecimalSeparators", ",").unwrap(); |
969 | 1 | pref_manager.set_user_prefs("BlockSeparators", " ").unwrap(); |
970 | 1 | pref_manager.set_user_prefs("DecimalSeparator", "None").unwrap(); |
971 | 1 | assert_eq!(&pref_manager.pref_to_string("DecimalSeparators"), ","); |
972 | 1 | assert_eq!(&pref_manager.pref_to_string("BlockSeparators"), " "); |
973 | 1 | }); |
974 | 1 | } |
975 | ||
976 | #[test] | |
977 | 1 | fn find_simple_style() { |
978 | 1 | PREF_MANAGER.with(|pref_manager| { |
979 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
980 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
981 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
982 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
983 | 1 | assert_eq!(&pref_manager.pref_to_string("Language"), "en"); |
984 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
985 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
986 | 1 | }); |
987 | 1 | } |
988 | ||
989 | cfg_if::cfg_if! {if #[cfg(not(feature = "include-zip"))] { | |
990 | #[test] | |
991 | 1 | fn find_style_other_language() { |
992 | // zz dir should have both ClearSpeak and SimpleSpeak styles | |
993 | // zz-aa dir should have only ClearSpeak style and unicode.yaml that includes the zz unicode but overrides "+" | |
994 | 1 | PREF_MANAGER.with(|pref_manager| { |
995 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
996 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
997 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
998 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
999 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/en/SimpleSpeak_Rules.yaml")); |
1000 | ||
1001 | 1 | pref_manager.set_user_prefs("Language", "zz").unwrap(); |
1002 | 1 | assert_eq!(&pref_manager.pref_to_string("Language"), "zz"); |
1003 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); |
1004 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
1005 | ||
1006 | // make sure language stays the same | |
1007 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1008 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
1009 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1010 | ||
1011 | // make sure language stays the same | |
1012 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
1013 | 1 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); |
1014 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
1015 | 1 | }); |
1016 | 1 | } |
1017 | ||
1018 | #[test] | |
1019 | 1 | fn find_regional_overrides() { |
1020 | // zz dir should have both ClearSpeak and SimpleSpeak styles | |
1021 | // zz-aa dir should have ClearSpeak style and unicode.yaml that includes the zz unicode but overrides "+" | |
1022 | 1 | PREF_MANAGER.with(|pref_manager| { |
1023 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1024 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1025 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1026 | 1 | pref_manager.set_user_prefs("Language", "zz-aa").unwrap(); |
1027 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/aa/ClearSpeak_Rules.yaml")); |
1028 | ||
1029 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
1030 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); |
1031 | 1 | }); |
1032 | 1 | } |
1033 | ||
1034 | #[test] | |
1035 | 1 | fn find_style_no_sublanguage() { |
1036 | 1 | PREF_MANAGER.with(|pref_manager| { |
1037 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1038 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1039 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1040 | 1 | pref_manager.set_user_prefs("Language", "zz-ab").unwrap(); |
1041 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1042 | 1 | }); |
1043 | 1 | } |
1044 | ||
1045 | #[test] | |
1046 | 1 | fn found_all_files() { |
1047 | 1 | PREF_MANAGER.with(|pref_manager| { |
1048 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1049 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1050 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1051 | 1 | pref_manager.set_user_prefs("Language", "zz-aa").unwrap(); |
1052 | 1 | pref_manager.set_user_prefs("BrailleCode", "UEB").unwrap(); |
1053 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.intent.as_path()), PathBuf::from("intent.yaml")); |
1054 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.overview.as_path()), PathBuf::from("Languages/zz/overview.yaml")); |
1055 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_defs.as_path()), PathBuf::from("Languages/zz/aa/definitions.yaml")); |
1056 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/aa/ClearSpeak_Rules.yaml")); |
1057 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode.as_path()), PathBuf::from("Languages/zz/aa/unicode.yaml")); |
1058 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode_full.as_path()), PathBuf::from("Languages/zz/unicode-full.yaml")); |
1059 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille.as_path()), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1060 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode.as_path()), PathBuf::from("Braille/UEB/unicode.yaml")); |
1061 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode_full.as_path()), PathBuf::from("Braille/UEB/unicode-full.yaml")); |
1062 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_defs.as_path()), PathBuf::from("Braille/UEB/definitions.yaml")); |
1063 | ||
1064 | 1 | pref_manager.set_user_prefs("Language", "zz-ab").unwrap(); |
1065 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.intent.as_path()), PathBuf::from("intent.yaml")); |
1066 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.overview.as_path()), PathBuf::from("Languages/zz/overview.yaml")); |
1067 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_defs.as_path()), PathBuf::from("Languages/zz/definitions.yaml")); |
1068 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1069 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode.as_path()), PathBuf::from("Languages/zz/unicode.yaml")); |
1070 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech_unicode_full.as_path()), PathBuf::from("Languages/zz/unicode-full.yaml")); |
1071 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille.as_path()), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1072 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode.as_path()), PathBuf::from("Braille/UEB/unicode.yaml")); |
1073 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_unicode_full.as_path()), PathBuf::from("Braille/UEB/unicode-full.yaml")); |
1074 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.braille_defs.as_path()), PathBuf::from("Braille/UEB/definitions.yaml")); |
1075 | 1 | }) |
1076 | 1 | } |
1077 | ||
1078 | #[test] | |
1079 | 1 | fn test_prefs() { |
1080 | 1 | PREF_MANAGER.with(|pref_manager| { |
1081 | // first test with internal settings | |
1082 | { | |
1083 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1084 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1085 | ||
1086 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
1087 | 1 | pref_manager.set_user_prefs("ClearSpeak_AbsoluteValue", "Determinant").unwrap(); |
1088 | 1 | pref_manager.set_user_prefs("ResetNavMode", "true").unwrap(); |
1089 | 1 | pref_manager.set_user_prefs("BrailleCode", "Nemeth").unwrap(); |
1090 | 1 | assert_eq!(pref_manager.pref_to_string("Language").as_str(), "en"); |
1091 | 1 | assert_eq!(pref_manager.pref_to_string("SubjectArea").as_str(), "General"); |
1092 | 1 | assert_eq!(pref_manager.pref_to_string("ClearSpeak_AbsoluteValue").as_str(), "Determinant"); |
1093 | 1 | assert_eq!(pref_manager.pref_to_string("ResetNavMode").as_str(), "true"); |
1094 | 1 | assert_eq!(pref_manager.pref_to_string("BrailleCode").as_str(), "Nemeth"); |
1095 | 1 | assert_eq!(pref_manager.pref_to_string("X_Y_Z").as_str(), NO_PREFERENCE); |
1096 | } | |
1097 | ||
1098 | // now test with the interface | |
1099 | { | |
1100 | use crate::interface::{set_preference, get_preference}; | |
1101 | 1 | set_preference("Language", "zz").unwrap(); |
1102 | 1 | set_preference("ClearSpeak_AbsoluteValue", "Cardinality").unwrap(); |
1103 | 1 | set_preference("Overview", "true").unwrap(); |
1104 | 1 | set_preference("BrailleCode", "UEB").unwrap(); |
1105 | 1 | assert_eq!(&get_preference("Language").unwrap(), "zz"); |
1106 | 1 | assert_eq!(&get_preference("ClearSpeak_AbsoluteValue").unwrap(), "Cardinality"); |
1107 | 1 | assert_eq!(&get_preference("Overview").unwrap(), "true"); |
1108 | 1 | assert_eq!(&get_preference("BrailleCode").unwrap(), "UEB"); |
1109 | 1 | assert!(&get_preference("X_Y_Z").is_err()); |
1110 | ||
1111 | } | |
1112 | 1 | }); |
1113 | 1 | } |
1114 | ||
1115 | #[test] | |
1116 | 1 | fn test_language_change() { |
1117 | // set_preference borrows the pref manager, so the previous borrow's lifetime needs to be ended before using it | |
1118 | 1 | PREF_MANAGER.with(|pref_manager| { |
1119 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1120 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1121 | 1 | }); |
1122 | 1 | crate::interface::set_preference("Language", "en").unwrap(); |
1123 | 1 | crate::interface::set_preference("SpeechStyle", "ClearSpeak").unwrap(); |
1124 | 1 | PREF_MANAGER.with(|pref_manager| { |
1125 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1126 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
1127 | 1 | }); |
1128 | ||
1129 | 1 | crate::interface::set_preference("Language", "zz").unwrap(); |
1130 | 1 | PREF_MANAGER.with(|pref_manager| { |
1131 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1132 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1133 | 1 | }); |
1134 | 1 | } |
1135 | ||
1136 | #[test] | |
1137 | 1 | fn test_speech_style_change() { |
1138 | 1 | PREF_MANAGER.with(|pref_manager| { |
1139 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1140 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1141 | 1 | pref_manager.set_user_prefs("Language", "en").unwrap(); |
1142 | 1 | pref_manager.set_user_prefs("SpeechStyle", "ClearSpeak").unwrap(); |
1143 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/ClearSpeak_Rules.yaml")); |
1144 | ||
1145 | 1 | pref_manager.set_user_prefs("SpeechStyle", "SimpleSpeak").unwrap(); |
1146 | ||
1147 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Speech)), PathBuf::from("Languages/en/SimpleSpeak_Rules.yaml")); |
1148 | 1 | }); |
1149 | 1 | } |
1150 | ||
1151 | #[test] | |
1152 | 1 | fn test_some_changes() { |
1153 | 1 | PREF_MANAGER.with(|pref_manager| { |
1154 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1155 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1156 | 1 | pref_manager.set_user_prefs("Verbosity", "Terse").unwrap(); |
1157 | ||
1158 | 1 | assert_eq!(&pref_manager.pref_to_string("Verbosity"), "Terse"); |
1159 | ||
1160 | 1 | pref_manager.set_user_prefs("BrailleCode", "UEB").unwrap(); |
1161 | 1 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.get_rule_file(&RulesFor::Braille)), PathBuf::from("Braille/UEB/UEB_Rules.yaml")); |
1162 | ||
1163 | // make sure they show up when building context for speech generation | |
1164 | 1 | let merged_prefs = pref_manager.merge_prefs(); |
1165 | 1 | assert_eq!(merged_prefs.get("Verbosity").unwrap().as_str().unwrap(), "Terse"); |
1166 | 1 | }); |
1167 | ||
1168 | 1 | crate::interface::set_preference("NavVerbosity", "Terse").unwrap(); |
1169 | 1 | PREF_MANAGER.with(|pref_manager| { |
1170 | 1 | let pref_manager = pref_manager.borrow_mut(); |
1171 | 1 | let merged_prefs = pref_manager.merge_prefs(); |
1172 | 1 | assert_eq!(merged_prefs.get("NavVerbosity").unwrap().as_str().unwrap(), "Terse"); |
1173 | 1 | }); |
1174 | 1 | } |
1175 | ||
1176 | #[test] | |
1177 | 1 | fn test_illegal_pref_values() { |
1178 | 1 | PREF_MANAGER.with(|pref_manager| { |
1179 | 1 | let mut pref_manager = pref_manager.borrow_mut(); |
1180 | 1 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1181 | 1 | assert!(pref_manager.set_string_pref("Language", "../../../my/path").is_err()); |
1182 | 1 | assert!(pref_manager.set_string_pref("BrailleCode", "C:\\my\\path").is_err()); |
1183 | 1 | assert!(pref_manager.set_string_pref("SpeechStyle", "/my/path").is_err()); |
1184 | 1 | }); |
1185 | 1 | } |
1186 | ||
1187 | #[test] | |
1188 | #[ignore] // this is an ugly test for #262 -- it changes the prefs file and so is a bad thing in general | |
1189 | 0 | fn test_up_to_date() { |
1190 | use std::fs; | |
1191 | use std::thread::sleep; | |
1192 | use std::time::Duration; | |
1193 | use crate::interface; | |
1194 | 0 | PREF_MANAGER.with(|pref_manager| { |
1195 | 0 | let mut pref_manager = pref_manager.borrow_mut(); |
1196 | 0 | pref_manager.initialize(abs_rules_dir_path()).unwrap(); |
1197 | 0 | assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "ClearSpeak"); |
1198 | 0 | assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/ClearSpeak_Rules.yaml")); |
1199 | 0 | }); |
1200 | 0 | interface::set_mathml("<math><mo>+</mo><mn>10</mn></math>").unwrap(); |
1201 | 0 | assert_eq!(interface::get_spoken_text().unwrap(), "ClearSpeak positive from zz 10"); |
1202 | ||
1203 | 0 | let mut file_path = PathBuf::default(); |
1204 | 0 | let mut contents = vec![]; |
1205 | 0 | PREF_MANAGER.with(|pref_manager| { |
1206 | 0 | let pref_manager = pref_manager.borrow(); |
1207 | 0 | if let Some(file_name) = pref_manager.user_prefs_file.as_ref().unwrap().debug_get_file() { |
1208 | 0 | file_path = PathBuf::from(file_name); |
1209 | 0 | contents = fs::read(&file_path).expect(&format!("Failed to write file {} during test", file_name)); |
1210 | 0 | let changed_contents = String::from_utf8(contents.clone()).unwrap() |
1211 | 0 | .replace("SpeechStyle: ClearSpeak", "SpeechStyle: SimpleSpeak"); |
1212 | 0 | fs::write(&file_path, changed_contents).unwrap(); |
1213 | 0 | sleep(Duration::from_millis(5)); // make sure the time changes enough to be recognized |
1214 | 0 | } |
1215 | 0 | }); |
1216 | 0 | assert_eq!(interface::get_spoken_text().unwrap(), "SimpleSpeak positive from zz 10"); |
1217 | 0 | fs::write(&file_path, contents).unwrap(); |
1218 | ||
1219 | // assert_eq!(&pref_manager.pref_to_string("SpeechStyle"), "SimpleSpeak"); | |
1220 | // assert_eq!(rel_path(&pref_manager.rules_dir, pref_manager.speech.as_path()), PathBuf::from("Languages/zz/SimpleSpeak_Rules.yaml")); | |
1221 | 0 | } |
1222 | ||
1223 | }} | |
1224 | } |
Line | Count | Source |
1 | //! Useful functions for debugging and error messages. | |
2 | #![allow(clippy::needless_return)] | |
3 | ||
4 | use sxd_document::dom::{Element, ChildOfElement, Attribute}; | |
5 | ||
6 | // #[allow(dead_code)] | |
7 | // pub fn pp_doc(doc: &Document) { | |
8 | // for root_child in doc.root().children() { | |
9 | // if let ChildOfRoot::Element(e) = root_child { | |
10 | // format_element(&e, 0); | |
11 | // break; | |
12 | // } | |
13 | // }; | |
14 | // } | |
15 | ||
16 | /// Pretty-print the MathML represented by `element`. | |
17 | 4.90k | pub fn mml_to_string(e: Element) -> String { |
18 | 4.90k | return format_element(e, 0); |
19 | 4.90k | } |
20 | ||
21 | /// Pretty-print the MathML represented by `element`. | |
22 | /// * `indent` -- the amount of indentation to start with | |
23 | 57.6k | pub fn format_element(e: Element, indent: usize) -> String { |
24 | // let namespace = match e.name().namespace_uri() { | |
25 | // None => "".to_string(), | |
26 | // Some(prefix) => prefix.to_string() + ":", | |
27 | // }; | |
28 | // let namespace = namespace.as_str(); | |
29 | 57.6k | let namespace = ""; |
30 | 57.6k | let mut answer = format!("{:in$}<{ns}{name}{attrs}>", " ", in=2*indent, ns=namespace, name=e.name().local_part(), attrs=format_attrs(&e.attributes())); |
31 | 57.6k | let children = e.children(); |
32 | 57.6k | let has_element = children.iter().find(|&&c| matches!( |
33 | 57.6k | if has_element.is_none() { |
34 | // print text content | |
35 | 35.9k | let content = children.iter() |
36 | 35.9k | .map(|c| if let ChildOfElement::Text( |
37 | 35.9k | .collect::<Vec<&str>>() |
38 | 35.9k | .join(""); |
39 | 35.9k | return format!("{}{}</{}{}>\n", answer, &handle_special_chars(&content), namespace, e.name().local_part()); |
40 | // for child in children { | |
41 | // if let ChildOfElement::Text(t) = child { | |
42 | // return format!("{}{}</{}{}>\n", answer, &make_invisible_chars_visible(t.text()), namespace, e.name().local_part()); | |
43 | // } | |
44 | // }; | |
45 | } else { | |
46 | 21.6k | answer += "\n"; // tag with children should start on new line |
47 | // recurse on each Element child | |
48 | 52.7k | for c in |
49 | 52.7k | if let ChildOfElement::Element(e) = c { |
50 | 52.7k | answer += &format_element(e, indent+1); |
51 | 52.7k |
|
52 | } | |
53 | } | |
54 | 21.6k | return answer + &format!("{:in$}</{ns}{name}>\n", " ", in=2*indent, ns=namespace, name=e.name().local_part()); |
55 | ||
56 | // Use the &#x....; representation for invisible chars when printing | |
57 | 57.6k | } |
58 | ||
59 | /// Format a vector of attributes as a string with a leading space | |
60 | 57.6k | pub fn format_attrs(attrs: &[Attribute]) -> String { |
61 | 57.6k | let mut result = String::new(); |
62 | 134k | for attr in |
63 | 134k | result += format!(" {}='{}'", attr.name().local_part(), &handle_special_chars(attr.value())).as_str(); |
64 | 134k | } |
65 | 57.6k | result |
66 | 57.6k | } |
67 | ||
68 | 170k | fn handle_special_chars(text: &str) -> String { |
69 | // Pre-allocate a buffer. We guess the size is roughly the same as input, maybe slightly larger. | |
70 | 170k | let mut s = String::with_capacity(text.len()); |
71 | 963k | for ch in |
72 | 963k | match ch { |
73 | 32 | '"' => s.push_str("""), |
74 | 5 | '&' => s.push_str("&"), |
75 | 277 | '\'' => s.push_str("'"), |
76 | 367 | '<' => s.push_str("<"), |
77 | 410 | '>' => s.push_str(">"), |
78 | 724 | '\u{2061}' => s.push_str("⁡"), |
79 | 3.46k | '\u{2062}' => s.push_str("⁢"), |
80 | 571 | '\u{2063}' => s.push_str("⁣"), |
81 | 76 | '\u{2064}' => s.push_str("⁤"), |
82 | 957k | _ => s.push(ch), |
83 | } | |
84 | } | |
85 | 170k | s |
86 | 170k | } |
87 | ||
88 | ||
89 | // /// Pretty print an xpath value. | |
90 | // /// If the value is a `NodeSet`, the MathML for the node/element is returned. | |
91 | // pub fn pp_xpath_value(value: Value) { | |
92 | // use sxd_xpath::Value; | |
93 | // use sxd_xpath::nodeset::Node; | |
94 | // debug!("XPath value:"); | |
95 | // if let Value::Nodeset(nodeset) = &value { | |
96 | // for node in nodeset.document_order() { | |
97 | // match node { | |
98 | // Node::Element(el) => {debug!("{}", crate::pretty_print::format_element(&el, 1))}, | |
99 | // Node::Text(t) => {debug!("found Text value: {}", t.text())}, | |
100 | // _ => {debug!("found unexpected node type")} | |
101 | // } | |
102 | // } | |
103 | // } | |
104 | // } | |
105 | ||
106 | /// Convert YAML to a string using with `indent` amount of space. | |
107 | 2.41M | pub fn yaml_to_string(yaml: &Yaml, indent: usize) -> String { |
108 | 2.41M | let mut result = String::new(); |
109 | 2.41M | { |
110 | 2.41M | let mut emitter = YamlEmitter::new(&mut result); |
111 | 2.41M | emitter.compact(true); |
112 | 2.41M | emitter.emit_node(yaml).unwrap(); // dump the YAML object to a String |
113 | 2.41M | } |
114 | 2.41M | if indent == 0 { |
115 | 2.41M | return result; |
116 | 0 | } |
117 | 0 | let indent_str = format!("{:in$}", " ", in=2*indent); |
118 | 0 | result = result.replace('\n',&("\n".to_string() + &indent_str)); // add indentation to all but first line |
119 | 0 | return indent_str + result.trim_end(); // add indent to first line and remove an extra indent at end |
120 | 2.41M | } |
121 | ||
122 | /* --------------------- Tweaked pretty printer for YAML (from YAML code) --------------------- */ | |
123 | ||
124 | // Changed: new function to determine if more compact notation can be used (when child is a one entry simple array/hash). Writes | |
125 | // -foo [bar: bletch] | |
126 | // -foo {bar: bletch} | |
127 | 20.0k | fn is_scalar(v: &Yaml) -> bool { |
128 | 20.0k | return !matches!(v, Yaml::Hash(_) | Yaml::Array(_)); |
129 | 20.0k | } |
130 | ||
131 | 20.0k | fn is_complex(v: &Yaml) -> bool { |
132 | 20.0k | return match v { |
133 | 1 | Yaml::Hash(h) => { |
134 | 1 | return match h.len() { |
135 | 0 | 0 => false, |
136 | 1 => { | |
137 | 1 | let (key,val) = h.iter().next().unwrap(); |
138 | 1 | return !(is_scalar(key) && is_scalar(val)) |
139 | }, | |
140 | 0 | _ => true, |
141 | } | |
142 | }, | |
143 | 0 | Yaml::Array(v) => { |
144 | 0 | return match v.len() { |
145 | 0 | 0 => false, |
146 | 1 => { | |
147 | 0 | let hash = v[0].as_hash(); |
148 | 0 | if let Some(hash) = hash { |
149 | 0 | return match hash.len() { |
150 | 0 | 0 => false, |
151 | 1 => { | |
152 | 0 | let (key, val) = hash.iter().next().unwrap(); |
153 | 0 | return !(is_scalar(key) && is_scalar(val)); |
154 | }, | |
155 | 0 | _ => true, |
156 | } | |
157 | } else { | |
158 | 0 | return !is_scalar(&v[0]); |
159 | } | |
160 | }, | |
161 | 0 | _ => true, |
162 | } | |
163 | }, | |
164 | 20.0k | _ => false, |
165 | } | |
166 | 20.0k | } |
167 | ||
168 | use std::error::Error; | |
169 | use std::fmt::{self, Display}; | |
170 | use yaml_rust::{Yaml, yaml::Hash}; | |
171 | ||
172 | //use crate::yaml::{Hash, Yaml}; | |
173 | ||
174 | #[derive(Copy, Clone, Debug)] | |
175 | #[allow(dead_code)] // from original YAML code (isn't used here) | |
176 | enum EmitError { | |
177 | FmtError(fmt::Error), | |
178 | BadHashmapKey, | |
179 | } | |
180 | ||
181 | impl Error for EmitError { | |
182 | 0 | fn cause(&self) -> Option<&dyn Error> { |
183 | 0 | None |
184 | 0 | } |
185 | } | |
186 | ||
187 | impl Display for EmitError { | |
188 | 0 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
189 | 0 | match *self { |
190 | 0 | EmitError::FmtError(ref err) => Display::fmt(err, formatter), |
191 | 0 | EmitError::BadHashmapKey => formatter.write_str("bad hashmap key"), |
192 | } | |
193 | 0 | } |
194 | } | |
195 | ||
196 | impl From<fmt::Error> for EmitError { | |
197 | 0 | fn from(f: fmt::Error) -> Self { |
198 | 0 | EmitError::FmtError(f) |
199 | 0 | } |
200 | } | |
201 | ||
202 | struct YamlEmitter<'a> { | |
203 | writer: &'a mut dyn fmt::Write, | |
204 | best_indent: usize, | |
205 | compact: bool, | |
206 | ||
207 | level: isize, | |
208 | } | |
209 | ||
210 | type EmitResult = Result<(), EmitError>; | |
211 | ||
212 | // from serialize::json | |
213 | 1.12M | fn escape_str(wr: &mut dyn fmt::Write, v: &str) -> Result<(), fmt::Error> { |
214 | 1.12M | wr.write_str("\"") |
215 | ||
216 | 1.12M | let mut start = 0; |
217 | ||
218 | 101M | for (i, byte) in |
219 | 101M | let |
220 | 0 | b'"' => "\\\"", |
221 | 0 | b'\\' => "\\\\", |
222 | 0 | b'\x00' => "\\u0000", |
223 | 0 | b'\x01' => "\\u0001", |
224 | 0 | b'\x02' => "\\u0002", |
225 | 0 | b'\x03' => "\\u0003", |
226 | 0 | b'\x04' => "\\u0004", |
227 | 0 | b'\x05' => "\\u0005", |
228 | 0 | b'\x06' => "\\u0006", |
229 | 0 | b'\x07' => "\\u0007", |
230 | 0 | b'\x08' => "\\b", |
231 | 0 | b'\t' => "\\t", |
232 | 0 | b'\n' => "\\n", |
233 | 0 | b'\x0b' => "\\u000b", |
234 | 0 | b'\x0c' => "\\f", |
235 | 0 | b'\r' => "\\r", |
236 | 0 | b'\x0e' => "\\u000e", |
237 | 0 | b'\x0f' => "\\u000f", |
238 | 0 | b'\x10' => "\\u0010", |
239 | 0 | b'\x11' => "\\u0011", |
240 | 0 | b'\x12' => "\\u0012", |
241 | 0 | b'\x13' => "\\u0013", |
242 | 0 | b'\x14' => "\\u0014", |
243 | 0 | b'\x15' => "\\u0015", |
244 | 0 | b'\x16' => "\\u0016", |
245 | 0 | b'\x17' => "\\u0017", |
246 | 0 | b'\x18' => "\\u0018", |
247 | 0 | b'\x19' => "\\u0019", |
248 | 0 | b'\x1a' => "\\u001a", |
249 | 0 | b'\x1b' => "\\u001b", |
250 | 0 | b'\x1c' => "\\u001c", |
251 | 0 | b'\x1d' => "\\u001d", |
252 | 0 | b'\x1e' => "\\u001e", |
253 | 0 | b'\x1f' => "\\u001f", |
254 | 0 | b'\x7f' => "\\u007f", |
255 | 101M | _ => continue, |
256 | }; | |
257 | ||
258 | 0 | if start < i { |
259 | 0 | wr.write_str(&v[start..i])?; |
260 | 0 | } |
261 | ||
262 | 0 | wr.write_str(escaped)?; |
263 | ||
264 | 0 | start = i + 1; |
265 | } | |
266 | ||
267 | 1.12M | if start != v.len() { |
268 | 1.12M | wr.write_str(&v[start..]) |
269 | 0 | } |
270 | ||
271 | 1.12M | wr.write_str("\"") |
272 | 1.12M | Ok(()) |
273 | 1.12M | } |
274 | ||
275 | impl<'a> YamlEmitter<'a> { | |
276 | 2.41M | pub fn new(writer: &'a mut dyn fmt::Write) -> YamlEmitter<'a> { |
277 | 2.41M | YamlEmitter { |
278 | 2.41M | writer, |
279 | 2.41M | best_indent: 2, |
280 | 2.41M | compact: true, |
281 | 2.41M | level: -1, |
282 | 2.41M | } |
283 | 2.41M | } |
284 | ||
285 | /// Set 'compact inline notation' on or off, as described for block | |
286 | /// [sequences](http://www.yaml.org/spec/1.2/spec.html#id2797382) | |
287 | /// and | |
288 | /// [mappings](http://www.yaml.org/spec/1.2/spec.html#id2798057). | |
289 | /// | |
290 | /// In this form, blocks cannot have any properties (such as anchors | |
291 | /// or tags), which should be OK, because this emitter doesn't | |
292 | /// (currently) emit those anyways. | |
293 | 2.41M | pub fn compact(&mut self, compact: bool) { |
294 | 2.41M | self.compact = compact; |
295 | 2.41M | } |
296 | ||
297 | /// Determine if this emitter is using 'compact inline notation'. | |
298 | #[allow(dead_code)] // not all fields are used in this program | |
299 | 0 | pub fn is_compact(&self) -> bool { |
300 | 0 | self.compact |
301 | 0 | } |
302 | ||
303 | // fn dump(&mut self, doc: &Yaml) -> EmitResult { | |
304 | // // write DocumentStart | |
305 | // writeln!(self.writer, "---")?; | |
306 | // self.level = -1; | |
307 | // self.emit_node(doc) | |
308 | // } | |
309 | ||
310 | 15.0k | fn write_indent(&mut self) -> EmitResult { |
311 | 15.0k | if self.level <= 0 { |
312 | 15.0k | return Ok(()); |
313 | 0 | } |
314 | 0 | for _ in 0..self.level { |
315 | 0 | for _ in 0..self.best_indent { |
316 | 0 | write!(self.writer, " ")?; |
317 | } | |
318 | } | |
319 | 0 | Ok(()) |
320 | 15.0k | } |
321 | ||
322 | 2.45M | fn emit_node(&mut self, node: &Yaml) -> EmitResult { |
323 | 2.45M | match *node { |
324 | 5.02k | Yaml::Array(ref v) => self.emit_array(v), |
325 | 0 | Yaml::Hash(ref h) => self.emit_hash(h), |
326 | 2.45M | Yaml::String(ref v) => { |
327 | 2.45M | if need_quotes(v) { |
328 | 1.12M | escape_str(self.writer, v) |
329 | } else { | |
330 | 1.32M | write!(self.writer, "{v}") |
331 | } | |
332 | 2.45M | Ok(()) |
333 | } | |
334 | 0 | Yaml::Boolean(v) => { |
335 | 0 | if v { |
336 | 0 | self.writer.write_str("true")?; |
337 | } else { | |
338 | 0 | self.writer.write_str("false")?; |
339 | } | |
340 | 0 | Ok(()) |
341 | } | |
342 | 0 | Yaml::Integer(v) => { |
343 | 0 | write!(self.writer, "{v}")?; |
344 | 0 | Ok(()) |
345 | } | |
346 | 0 | Yaml::Real(ref v) => { |
347 | 0 | write!(self.writer, "{v}")?; |
348 | 0 | Ok(()) |
349 | } | |
350 | Yaml::Null | Yaml::BadValue => { | |
351 | 0 | write!(self.writer, "~")?; |
352 | 0 | Ok(()) |
353 | } | |
354 | // XXX(chenyh) Alias | |
355 | 0 | _ => Ok(()), |
356 | } | |
357 | 2.45M | } |
358 | ||
359 | 5.02k | fn emit_array(&mut self, v: &[Yaml]) -> EmitResult { |
360 | 5.02k | if v.is_empty() { |
361 | 0 | write!(self.writer, "[]")?; |
362 | 5.02k | } else if v.len() == 1 && |
363 | // changed -- for arrays that have only one simple element, make them more compact by using [...] notation | |
364 | 1 | write!(self.writer, "[") |
365 | 1 | self.emit_val(true, &v[0]) |
366 | 1 | write!(self.writer, "]") |
367 | } else { | |
368 | 5.02k | self.level += 1; |
369 | ||
370 | 20.0k | for (cnt, x) in |
371 | 20.0k | if cnt > 0 { |
372 | 15.0k | writeln!(self.writer) |
373 | 15.0k | self.write_indent() |
374 | 5.02k | } |
375 | 20.0k | write!(self.writer, "- ") |
376 | 20.0k | self.emit_val(true, x) |
377 | } | |
378 | 5.02k | self.level -= 1; |
379 | } | |
380 | 5.02k | return Ok(()); |
381 | 5.02k | } |
382 | ||
383 | 20.0k | fn emit_hash(&mut self, h: &Hash) -> EmitResult { |
384 | 20.0k | if h.is_empty() { |
385 | 0 | self.writer.write_str("{}")?; |
386 | } else { | |
387 | // changed -- for hashmaps that have only one simple element, make them more compact by using {...}} notation | |
388 | 20.0k | self.level += 1; |
389 | 20.0k | for (cnt, (k, v)) in h.iter().enumerate() { |
390 | // changed: use new function is_scalar() | |
391 | // let complex_key = match *k { | |
392 | // Yaml::Hash(_) | Yaml::Array(_) => true, | |
393 | // _ => false, | |
394 | // }; | |
395 | 20.0k | if cnt > 0 { |
396 | 0 | writeln!(self.writer)?; |
397 | 0 | self.write_indent()?; |
398 | 20.0k | } |
399 | 20.0k | if !is_scalar(k) { |
400 | 0 | write!(self.writer, "? ")?; |
401 | 0 | self.emit_val(true, k)?; |
402 | 0 | writeln!(self.writer)?; |
403 | 0 | self.write_indent()?; |
404 | 0 | write!(self.writer, ": ")?; |
405 | 0 | self.emit_val(true, v)?; |
406 | } else { | |
407 | 20.0k | self.emit_node(k) |
408 | 20.0k | write!(self.writer, ": ") |
409 | ||
410 | // changed to use braces in some cases | |
411 | 20.0k | let complex_value = is_complex(v); |
412 | 20.0k | if !complex_value && v.as_hash().is_some() { |
413 | 0 | write!(self.writer, "{{")?; |
414 | 20.0k | } |
415 | // changed to use complex_value from 'false' | |
416 | 20.0k | self.emit_val(!complex_value, v) |
417 | 20.0k | if !complex_value && v.as_hash().is_some() { |
418 | 0 | write!(self.writer, "}}")?; |
419 | 20.0k | } |
420 | } | |
421 | } | |
422 | 20.0k | self.level -= 1; |
423 | } | |
424 | 20.0k | Ok(()) |
425 | 20.0k | } |
426 | ||
427 | /// Emit a yaml as a hash or array value: i.e., which should appear | |
428 | /// following a ":" or "-", either after a space, or on a new line. | |
429 | /// If `inline` is true, then the preceding characters are distinct | |
430 | /// and short enough to respect the compact flag. | |
431 | // changed: use to always emit ' ' for inline -- that is now handled elsewhere | |
432 | 40.1k | fn emit_val(&mut self, inline: bool, val: &Yaml) -> EmitResult { |
433 | 40.1k | match *val { |
434 | 0 | Yaml::Array(ref v) => { |
435 | 0 | if !((inline && self.compact) || v.is_empty()) { |
436 | 0 | writeln!(self.writer)?; |
437 | 0 | self.level += 1; |
438 | 0 | self.write_indent()?; |
439 | 0 | self.level -= 1; |
440 | 0 | } |
441 | 0 | self.emit_array(v) |
442 | } | |
443 | 20.0k | Yaml::Hash(ref h) => { |
444 | 20.0k | if !((inline && self.compact) || |
445 | 0 | writeln!(self.writer)?; |
446 | 0 | self.level += 1; |
447 | 0 | self.write_indent()?; |
448 | 0 | self.level -= 1; |
449 | 20.0k | } |
450 | 20.0k | self.emit_hash(h) |
451 | } | |
452 | _ => { | |
453 | // write!(self.writer, " ")?; | |
454 | 20.0k | self.emit_node(val) |
455 | } | |
456 | } | |
457 | 40.1k | } |
458 | } | |
459 | ||
460 | /// Check if the string requires quoting. | |
461 | /// Strings starting with any of the following characters must be quoted. | |
462 | /// :, &, *, ?, |, -, <, >, =, !, %, @ | |
463 | /// Strings containing any of the following characters must be quoted. | |
464 | /// {, }, [, ], ,, #, ` | |
465 | /// | |
466 | /// If the string contains any of the following control characters, it must be escaped with double quotes: | |
467 | /// \0, \x01, \x02, \x03, \x04, \x05, \x06, \a, \b, \t, \n, \v, \f, \r, \x0e, \x0f, \x10, \x11, \x12, \x13, \x14, \x15, \x16, \x17, \x18, \x19, \x1a, \e, \x1c, \x1d, \x1e, \x1f, \N, \_, \L, \P | |
468 | /// | |
469 | /// Finally, there are other cases when the strings must be quoted, no matter if you're using single or double quotes: | |
470 | /// * When the string is true or false (otherwise, it would be treated as a boolean value); | |
471 | /// * When the string is null or ~ (otherwise, it would be considered as a null value); | |
472 | /// * When the string looks like a number, such as integers (e.g. 2, 14, etc.), floats (e.g. 2.6, 14.9) and exponential numbers (e.g. 12e7, etc.) (otherwise, it would be treated as a numeric value); | |
473 | /// * When the string looks like a date (e.g. 2014-12-31) (otherwise it would be automatically converted into a Unix timestamp). | |
474 | 2.45M | fn need_quotes(string: &str) -> bool { |
475 | 2.45M | fn need_quotes_spaces(string: &str) -> bool { |
476 | 2.45M | string.starts_with(' ') || |
477 | 2.45M | } |
478 | ||
479 | 2.45M | string.is_empty() |
480 | 2.45M | || need_quotes_spaces(string) |
481 | 2.45M | || string.starts_with(['&', '*', '?', '|', '-', '<', '>', '=', '!', '%', '@']) |
482 | 19.8M | || |
483 | ':' | |
484 | | '{' | |
485 | | '}' | |
486 | | '[' | |
487 | | ']' | |
488 | | ',' | |
489 | | '#' | |
490 | | '`' | |
491 | | '\"' | |
492 | | '\'' | |
493 | | '\\' | |
494 | 18.9M | | '\0'..='\x06' |
495 | | '\t' | |
496 | | '\n' | |
497 | | '\r' | |
498 | 18.9M | | '\x0e'..='\x1a' |
499 | 18.9M | | '\x1c'..='\x1f') ) |
500 | 1.32M | || [ |
501 | 1.32M | // http://yaml.org/type/bool.html |
502 | 1.32M | // Note: 'y', 'Y', 'n', 'N', is not quoted deliberately, as in libyaml. PyYAML also parse |
503 | 1.32M | // them as string, not booleans, although it is violating the YAML 1.1 specification. |
504 | 1.32M | // See https://github.com/dtolnay/serde-yaml/pull/83#discussion_r152628088. |
505 | 1.32M | "yes", "Yes", "YES", "no", "No", "NO", "True", "TRUE", "true", "False", "FALSE", |
506 | 1.32M | "false", "on", "On", "ON", "off", "Off", "OFF", |
507 | 1.32M | // http://yaml.org/type/null.html |
508 | 1.32M | "null", "Null", "NULL", "~", |
509 | 1.32M | ] |
510 | 1.32M | .contains(&string) |
511 | 1.32M | || string.starts_with('.') |
512 | 1.32M | || string.starts_with("0x") |
513 | 1.32M | || string.parse::<i64>().is_ok() |
514 | 1.32M | || string.parse::<f64>().is_ok() |
515 | 2.45M | } |
516 | ||
517 | #[cfg(test)] | |
518 | mod tests { | |
519 | use super::*; | |
520 | use sxd_document::dom::{ChildOfElement, ChildOfRoot}; | |
521 | use sxd_document::parser; | |
522 | ||
523 | /// helper function | |
524 | 10 | fn first_element(package: &sxd_document::Package) -> Element<'_> { |
525 | 10 | let doc = package.as_document(); |
526 | 10 | for child in doc.root().children() { |
527 | 10 | if let ChildOfRoot::Element(e) = child { |
528 | 10 | return e; |
529 | 0 | } |
530 | } | |
531 | 0 | panic!("No root element found"); |
532 | 10 | } |
533 | ||
534 | #[test] | |
535 | /// Escapes XML entities and invisible characters for safe display. | |
536 | /// Tests the method on a few hardcoded characters. | |
537 | 1 | fn handle_special_chars_escapes() { |
538 | 1 | let input = "& < > \" ' \u{2061} \u{2062} \u{2063} \u{2064} x"; |
539 | 1 | let expected = "& < > " ' ⁡ ⁢ ⁣ ⁤ x"; |
540 | 1 | assert_eq!(handle_special_chars(input), expected); |
541 | 1 | } |
542 | ||
543 | #[test] | |
544 | /// Formats a leaf element as a single line with escaped text. | |
545 | 1 | fn format_element_leaf_text() { |
546 | 1 | let package = parser::parse("<math><mi>&</mi></math>").unwrap(); |
547 | 1 | let math = first_element(&package); |
548 | 1 | let mi = math |
549 | 1 | .children() |
550 | 1 | .iter() |
551 | 1 | .find_map(|c| match c { |
552 | 1 | ChildOfElement::Element(e) => Some(*e), |
553 | 0 | _ => None, |
554 | 1 | }) |
555 | 1 | .unwrap(); |
556 | 1 | assert_eq!(format_element(mi, 0), " <mi>&</mi>\n"); |
557 | 1 | } |
558 | ||
559 | #[test] | |
560 | /// Formats a nested element with indentation and newlines. | |
561 | 1 | fn format_element_nested() { |
562 | 1 | let package = parser::parse("<math><mi>x</mi><mo>+</mo></math>").unwrap(); |
563 | 1 | let math = first_element(&package); |
564 | 1 | let rendered = format_element(math, 0); |
565 | 1 | assert!(rendered.starts_with(" <math>\n")); |
566 | 1 | assert!(rendered.contains("\n <mi>x</mi>\n")); |
567 | 1 | assert!(rendered.contains("\n <mo>+</mo>\n")); |
568 | 1 | assert!(rendered.ends_with("</math>\n")); |
569 | 1 | } |
570 | ||
571 | #[test] | |
572 | /// Escapes special characters in attribute values. | |
573 | 1 | fn format_attrs_escapes() { |
574 | 1 | let package = parser::parse("<math a=\"&\" b=\"<\"></math>").unwrap(); |
575 | 1 | let math = first_element(&package); |
576 | 1 | let rendered = format_attrs(&math.attributes()); |
577 | 1 | assert!(rendered.contains(" a='&'")); |
578 | 1 | assert!(rendered.contains(" b='<'")); |
579 | 1 | } |
580 | ||
581 | #[test] | |
582 | /// Preserves non-BMP characters from a literal XML form. | |
583 | 1 | fn format_element_non_bmp_character_literal() { |
584 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
585 | 1 | let math = first_element(&package); |
586 | 1 | let mi = math |
587 | 1 | .children() |
588 | 1 | .iter() |
589 | 1 | .find_map(|c| match c { |
590 | 1 | ChildOfElement::Element(e) => Some(*e), |
591 | 0 | _ => None, |
592 | 1 | }) |
593 | 1 | .unwrap(); |
594 | 1 | let rendered = format_element(mi, 0); |
595 | 1 | assert!(rendered.contains("𝞪")); |
596 | 1 | } |
597 | ||
598 | #[test] | |
599 | /// Preserves non-BMP characters from a numeric XML form. | |
600 | 1 | fn format_element_non_bmp_character_numeric() { |
601 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
602 | 1 | let math = first_element(&package); |
603 | 1 | let mi = math |
604 | 1 | .children() |
605 | 1 | .iter() |
606 | 1 | .find_map(|c| match c { |
607 | 1 | ChildOfElement::Element(e) => Some(*e), |
608 | 0 | _ => None, |
609 | 1 | }) |
610 | 1 | .unwrap(); |
611 | 1 | let rendered = format_element(mi, 0); |
612 | 1 | assert!(rendered.contains("𝞪")); |
613 | 1 | } |
614 | ||
615 | #[test] | |
616 | /// Evaluates non-BMP literal text through sxd_xpath. | |
617 | 1 | fn xpath_non_bmp_literal() { |
618 | use sxd_xpath::{Factory, Value}; | |
619 | ||
620 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
621 | 1 | let xpath = Factory::new().build("string(/math/mi)").unwrap().unwrap(); |
622 | 1 | let context = sxd_xpath::Context::new(); |
623 | ||
624 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
625 | 1 | match value { |
626 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
627 | 0 | _ => panic!("Expected string value from xpath"), |
628 | } | |
629 | 1 | } |
630 | ||
631 | #[test] | |
632 | /// Evaluates non-BMP numeric text through sxd_xpath. | |
633 | 1 | fn xpath_non_bmp_numeric() { |
634 | use sxd_xpath::{Factory, Value}; | |
635 | ||
636 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
637 | 1 | let xpath = Factory::new().build("string(/math/mi)").unwrap().unwrap(); |
638 | 1 | let context = sxd_xpath::Context::new(); |
639 | ||
640 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
641 | 1 | match value { |
642 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
643 | 0 | _ => panic!("Expected string value from xpath"), |
644 | } | |
645 | 1 | } |
646 | ||
647 | #[test] | |
648 | /// Evaluates non-BMP literal text with a MathML namespace-qualified XPath. | |
649 | 1 | fn xpath_non_bmp_namespace_literal() { |
650 | use sxd_xpath::{Factory, Value}; | |
651 | ||
652 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
653 | 1 | let package = parser::parse(xml).unwrap(); |
654 | 1 | let xpath = Factory::new() |
655 | 1 | .build("string(/m:math/m:mi)") |
656 | 1 | .unwrap() |
657 | 1 | .unwrap(); |
658 | 1 | let mut context = sxd_xpath::Context::new(); |
659 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
660 | ||
661 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
662 | 1 | match value { |
663 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
664 | 0 | _ => panic!("Expected string value from xpath"), |
665 | } | |
666 | 1 | } |
667 | ||
668 | #[test] | |
669 | /// Evaluates non-BMP numeric text with a MathML namespace-qualified XPath. | |
670 | 1 | fn xpath_non_bmp_namespace_numeric() { |
671 | use sxd_xpath::{Factory, Value}; | |
672 | ||
673 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
674 | 1 | let package = parser::parse(xml).unwrap(); |
675 | 1 | let xpath = Factory::new() |
676 | 1 | .build("string(/m:math/m:mi)") |
677 | 1 | .unwrap() |
678 | 1 | .unwrap(); |
679 | 1 | let mut context = sxd_xpath::Context::new(); |
680 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
681 | ||
682 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
683 | 1 | match value { |
684 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
685 | 0 | _ => panic!("Expected string value from xpath"), |
686 | } | |
687 | 1 | } |
688 | ||
689 | #[test] | |
690 | /// Extracts a text node via XPath (nodeset result) and verifies the non-BMP character survives. | |
691 | 1 | fn xpath_non_bmp_text_nodeset() { |
692 | use sxd_xpath::{Factory, Value}; | |
693 | ||
694 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
695 | 1 | let package = parser::parse(xml).unwrap(); |
696 | 1 | let xpath = Factory::new().build("/m:math/m:mi/text()").unwrap().unwrap(); |
697 | 1 | let mut context = sxd_xpath::Context::new(); |
698 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
699 | ||
700 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
701 | 1 | match value { |
702 | 1 | Value::Nodeset(nodes) => { |
703 | 1 | let ordered = nodes.document_order(); |
704 | 1 | let node = ordered.first().expect("Expected one text node"); |
705 | 1 | let text = node.text().expect("Expected text node"); |
706 | 1 | assert_eq!(text.text(), "𝞪"); |
707 | 1 | assert_eq!(ordered.len(), 1); |
708 | } | |
709 | 0 | _ => panic!("Expected nodeset value from xpath"), |
710 | } | |
711 | 1 | } |
712 | } |
Line | Count | Source |
1 | //! Useful functions for debugging and error messages. | |
2 | #![allow(clippy::needless_return)] | |
3 | ||
4 | use sxd_document::dom::{Element, ChildOfElement, Attribute}; | |
5 | ||
6 | // #[allow(dead_code)] | |
7 | // pub fn pp_doc(doc: &Document) { | |
8 | // for root_child in doc.root().children() { | |
9 | // if let ChildOfRoot::Element(e) = root_child { | |
10 | // format_element(&e, 0); | |
11 | // break; | |
12 | // } | |
13 | // }; | |
14 | // } | |
15 | ||
16 | /// Pretty-print the MathML represented by `element`. | |
17 | 4.90k | pub fn mml_to_string(e: Element) -> String { |
18 | 4.90k | return format_element(e, 0); |
19 | 4.90k | } |
20 | ||
21 | /// Pretty-print the MathML represented by `element`. | |
22 | /// * `indent` -- the amount of indentation to start with | |
23 | 57.6k | pub fn format_element(e: Element, indent: usize) -> String { |
24 | // let namespace = match e.name().namespace_uri() { | |
25 | // None => "".to_string(), | |
26 | // Some(prefix) => prefix.to_string() + ":", | |
27 | // }; | |
28 | // let namespace = namespace.as_str(); | |
29 | 57.6k | let namespace = ""; |
30 | 57.6k | let mut answer = format!("{:in$}<{ns}{name}{attrs}>", " ", in=2*indent, ns=namespace, name=e.name().local_part(), attrs=format_attrs(&e.attributes())); |
31 | 57.6k | let children = e.children(); |
32 | 57.6k | let has_element = children.iter().find(|&&c| matches!( |
33 | 57.6k | if has_element.is_none() { |
34 | // print text content | |
35 | 35.9k | let content = children.iter() |
36 | 35.9k | .map(|c| if let ChildOfElement::Text( |
37 | 35.9k | .collect::<Vec<&str>>() |
38 | 35.9k | .join(""); |
39 | 35.9k | return format!("{}{}</{}{}>\n", answer, &handle_special_chars(&content), namespace, e.name().local_part()); |
40 | // for child in children { | |
41 | // if let ChildOfElement::Text(t) = child { | |
42 | // return format!("{}{}</{}{}>\n", answer, &make_invisible_chars_visible(t.text()), namespace, e.name().local_part()); | |
43 | // } | |
44 | // }; | |
45 | } else { | |
46 | 21.6k | answer += "\n"; // tag with children should start on new line |
47 | // recurse on each Element child | |
48 | 52.7k | for c in |
49 | 52.7k | if let ChildOfElement::Element(e) = c { |
50 | 52.7k | answer += &format_element(e, indent+1); |
51 | 52.7k |
|
52 | } | |
53 | } | |
54 | 21.6k | return answer + &format!("{:in$}</{ns}{name}>\n", " ", in=2*indent, ns=namespace, name=e.name().local_part()); |
55 | ||
56 | // Use the &#x....; representation for invisible chars when printing | |
57 | 57.6k | } |
58 | ||
59 | /// Format a vector of attributes as a string with a leading space | |
60 | 57.6k | pub fn format_attrs(attrs: &[Attribute]) -> String { |
61 | 57.6k | let mut result = String::new(); |
62 | 134k | for attr in |
63 | 134k | result += format!(" {}='{}'", attr.name().local_part(), &handle_special_chars(attr.value())).as_str(); |
64 | 134k | } |
65 | 57.6k | result |
66 | 57.6k | } |
67 | ||
68 | 170k | fn handle_special_chars(text: &str) -> String { |
69 | // Pre-allocate a buffer. We guess the size is roughly the same as input, maybe slightly larger. | |
70 | 170k | let mut s = String::with_capacity(text.len()); |
71 | 963k | for ch in |
72 | 963k | match ch { |
73 | 32 | '"' => s.push_str("""), |
74 | 5 | '&' => s.push_str("&"), |
75 | 277 | '\'' => s.push_str("'"), |
76 | 367 | '<' => s.push_str("<"), |
77 | 410 | '>' => s.push_str(">"), |
78 | 724 | '\u{2061}' => s.push_str("⁡"), |
79 | 3.46k | '\u{2062}' => s.push_str("⁢"), |
80 | 571 | '\u{2063}' => s.push_str("⁣"), |
81 | 76 | '\u{2064}' => s.push_str("⁤"), |
82 | 957k | _ => s.push(ch), |
83 | } | |
84 | } | |
85 | 170k | s |
86 | 170k | } |
87 | ||
88 | ||
89 | // /// Pretty print an xpath value. | |
90 | // /// If the value is a `NodeSet`, the MathML for the node/element is returned. | |
91 | // pub fn pp_xpath_value(value: Value) { | |
92 | // use sxd_xpath::Value; | |
93 | // use sxd_xpath::nodeset::Node; | |
94 | // debug!("XPath value:"); | |
95 | // if let Value::Nodeset(nodeset) = &value { | |
96 | // for node in nodeset.document_order() { | |
97 | // match node { | |
98 | // Node::Element(el) => {debug!("{}", crate::pretty_print::format_element(&el, 1))}, | |
99 | // Node::Text(t) => {debug!("found Text value: {}", t.text())}, | |
100 | // _ => {debug!("found unexpected node type")} | |
101 | // } | |
102 | // } | |
103 | // } | |
104 | // } | |
105 | ||
106 | /// Convert YAML to a string using with `indent` amount of space. | |
107 | 2.41M | pub fn yaml_to_string(yaml: &Yaml, indent: usize) -> String { |
108 | 2.41M | let mut result = String::new(); |
109 | 2.41M | { |
110 | 2.41M | let mut emitter = YamlEmitter::new(&mut result); |
111 | 2.41M | emitter.compact(true); |
112 | 2.41M | emitter.emit_node(yaml).unwrap(); // dump the YAML object to a String |
113 | 2.41M | } |
114 | 2.41M | if indent == 0 { |
115 | 2.41M | return result; |
116 | 0 | } |
117 | 0 | let indent_str = format!("{:in$}", " ", in=2*indent); |
118 | 0 | result = result.replace('\n',&("\n".to_string() + &indent_str)); // add indentation to all but first line |
119 | 0 | return indent_str + result.trim_end(); // add indent to first line and remove an extra indent at end |
120 | 2.41M | } |
121 | ||
122 | /* --------------------- Tweaked pretty printer for YAML (from YAML code) --------------------- */ | |
123 | ||
124 | // Changed: new function to determine if more compact notation can be used (when child is a one entry simple array/hash). Writes | |
125 | // -foo [bar: bletch] | |
126 | // -foo {bar: bletch} | |
127 | 20.0k | fn is_scalar(v: &Yaml) -> bool { |
128 | 20.0k | return !matches!(v, Yaml::Hash(_) | Yaml::Array(_)); |
129 | 20.0k | } |
130 | ||
131 | 20.0k | fn is_complex(v: &Yaml) -> bool { |
132 | 20.0k | return match v { |
133 | 1 | Yaml::Hash(h) => { |
134 | 1 | return match h.len() { |
135 | 0 | 0 => false, |
136 | 1 => { | |
137 | 1 | let (key,val) = h.iter().next().unwrap(); |
138 | 1 | return !(is_scalar(key) && is_scalar(val)) |
139 | }, | |
140 | 0 | _ => true, |
141 | } | |
142 | }, | |
143 | 0 | Yaml::Array(v) => { |
144 | 0 | return match v.len() { |
145 | 0 | 0 => false, |
146 | 1 => { | |
147 | 0 | let hash = v[0].as_hash(); |
148 | 0 | if let Some(hash) = hash { |
149 | 0 | return match hash.len() { |
150 | 0 | 0 => false, |
151 | 1 => { | |
152 | 0 | let (key, val) = hash.iter().next().unwrap(); |
153 | 0 | return !(is_scalar(key) && is_scalar(val)); |
154 | }, | |
155 | 0 | _ => true, |
156 | } | |
157 | } else { | |
158 | 0 | return !is_scalar(&v[0]); |
159 | } | |
160 | }, | |
161 | 0 | _ => true, |
162 | } | |
163 | }, | |
164 | 20.0k | _ => false, |
165 | } | |
166 | 20.0k | } |
167 | ||
168 | use std::error::Error; | |
169 | use std::fmt::{self, Display}; | |
170 | use yaml_rust::{Yaml, yaml::Hash}; | |
171 | ||
172 | //use crate::yaml::{Hash, Yaml}; | |
173 | ||
174 | #[derive(Copy, Clone, Debug)] | |
175 | #[allow(dead_code)] // from original YAML code (isn't used here) | |
176 | enum EmitError { | |
177 | FmtError(fmt::Error), | |
178 | BadHashmapKey, | |
179 | } | |
180 | ||
181 | impl Error for EmitError { | |
182 | 0 | fn cause(&self) -> Option<&dyn Error> { |
183 | 0 | None |
184 | 0 | } |
185 | } | |
186 | ||
187 | impl Display for EmitError { | |
188 | 0 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
189 | 0 | match *self { |
190 | 0 | EmitError::FmtError(ref err) => Display::fmt(err, formatter), |
191 | 0 | EmitError::BadHashmapKey => formatter.write_str("bad hashmap key"), |
192 | } | |
193 | 0 | } |
194 | } | |
195 | ||
196 | impl From<fmt::Error> for EmitError { | |
197 | 0 | fn from(f: fmt::Error) -> Self { |
198 | 0 | EmitError::FmtError(f) |
199 | 0 | } |
200 | } | |
201 | ||
202 | struct YamlEmitter<'a> { | |
203 | writer: &'a mut dyn fmt::Write, | |
204 | best_indent: usize, | |
205 | compact: bool, | |
206 | ||
207 | level: isize, | |
208 | } | |
209 | ||
210 | type EmitResult = Result<(), EmitError>; | |
211 | ||
212 | // from serialize::json | |
213 | 1.12M | fn escape_str(wr: &mut dyn fmt::Write, v: &str) -> Result<(), fmt::Error> { |
214 | 1.12M | wr.write_str("\"") |
215 | ||
216 | 1.12M | let mut start = 0; |
217 | ||
218 | 101M | for (i, byte) in |
219 | 101M | let |
220 | 0 | b'"' => "\\\"", |
221 | 0 | b'\\' => "\\\\", |
222 | 0 | b'\x00' => "\\u0000", |
223 | 0 | b'\x01' => "\\u0001", |
224 | 0 | b'\x02' => "\\u0002", |
225 | 0 | b'\x03' => "\\u0003", |
226 | 0 | b'\x04' => "\\u0004", |
227 | 0 | b'\x05' => "\\u0005", |
228 | 0 | b'\x06' => "\\u0006", |
229 | 0 | b'\x07' => "\\u0007", |
230 | 0 | b'\x08' => "\\b", |
231 | 0 | b'\t' => "\\t", |
232 | 0 | b'\n' => "\\n", |
233 | 0 | b'\x0b' => "\\u000b", |
234 | 0 | b'\x0c' => "\\f", |
235 | 0 | b'\r' => "\\r", |
236 | 0 | b'\x0e' => "\\u000e", |
237 | 0 | b'\x0f' => "\\u000f", |
238 | 0 | b'\x10' => "\\u0010", |
239 | 0 | b'\x11' => "\\u0011", |
240 | 0 | b'\x12' => "\\u0012", |
241 | 0 | b'\x13' => "\\u0013", |
242 | 0 | b'\x14' => "\\u0014", |
243 | 0 | b'\x15' => "\\u0015", |
244 | 0 | b'\x16' => "\\u0016", |
245 | 0 | b'\x17' => "\\u0017", |
246 | 0 | b'\x18' => "\\u0018", |
247 | 0 | b'\x19' => "\\u0019", |
248 | 0 | b'\x1a' => "\\u001a", |
249 | 0 | b'\x1b' => "\\u001b", |
250 | 0 | b'\x1c' => "\\u001c", |
251 | 0 | b'\x1d' => "\\u001d", |
252 | 0 | b'\x1e' => "\\u001e", |
253 | 0 | b'\x1f' => "\\u001f", |
254 | 0 | b'\x7f' => "\\u007f", |
255 | 101M | _ => continue, |
256 | }; | |
257 | ||
258 | 0 | if start < i { |
259 | 0 | wr.write_str(&v[start..i])?; |
260 | 0 | } |
261 | ||
262 | 0 | wr.write_str(escaped)?; |
263 | ||
264 | 0 | start = i + 1; |
265 | } | |
266 | ||
267 | 1.12M | if start != v.len() { |
268 | 1.12M | wr.write_str(&v[start..]) |
269 | 0 | } |
270 | ||
271 | 1.12M | wr.write_str("\"") |
272 | 1.12M | Ok(()) |
273 | 1.12M | } |
274 | ||
275 | impl<'a> YamlEmitter<'a> { | |
276 | 2.41M | pub fn new(writer: &'a mut dyn fmt::Write) -> YamlEmitter<'a> { |
277 | 2.41M | YamlEmitter { |
278 | 2.41M | writer, |
279 | 2.41M | best_indent: 2, |
280 | 2.41M | compact: true, |
281 | 2.41M | level: -1, |
282 | 2.41M | } |
283 | 2.41M | } |
284 | ||
285 | /// Set 'compact inline notation' on or off, as described for block | |
286 | /// [sequences](http://www.yaml.org/spec/1.2/spec.html#id2797382) | |
287 | /// and | |
288 | /// [mappings](http://www.yaml.org/spec/1.2/spec.html#id2798057). | |
289 | /// | |
290 | /// In this form, blocks cannot have any properties (such as anchors | |
291 | /// or tags), which should be OK, because this emitter doesn't | |
292 | /// (currently) emit those anyways. | |
293 | 2.41M | pub fn compact(&mut self, compact: bool) { |
294 | 2.41M | self.compact = compact; |
295 | 2.41M | } |
296 | ||
297 | /// Determine if this emitter is using 'compact inline notation'. | |
298 | #[allow(dead_code)] // not all fields are used in this program | |
299 | 0 | pub fn is_compact(&self) -> bool { |
300 | 0 | self.compact |
301 | 0 | } |
302 | ||
303 | // fn dump(&mut self, doc: &Yaml) -> EmitResult { | |
304 | // // write DocumentStart | |
305 | // writeln!(self.writer, "---")?; | |
306 | // self.level = -1; | |
307 | // self.emit_node(doc) | |
308 | // } | |
309 | ||
310 | 15.0k | fn write_indent(&mut self) -> EmitResult { |
311 | 15.0k | if self.level <= 0 { |
312 | 15.0k | return Ok(()); |
313 | 0 | } |
314 | 0 | for _ in 0..self.level { |
315 | 0 | for _ in 0..self.best_indent { |
316 | 0 | write!(self.writer, " ")?; |
317 | } | |
318 | } | |
319 | 0 | Ok(()) |
320 | 15.0k | } |
321 | ||
322 | 2.45M | fn emit_node(&mut self, node: &Yaml) -> EmitResult { |
323 | 2.45M | match *node { |
324 | 5.02k | Yaml::Array(ref v) => self.emit_array(v), |
325 | 0 | Yaml::Hash(ref h) => self.emit_hash(h), |
326 | 2.45M | Yaml::String(ref v) => { |
327 | 2.45M | if need_quotes(v) { |
328 | 1.12M | escape_str(self.writer, v) |
329 | } else { | |
330 | 1.32M | write!(self.writer, "{v}") |
331 | } | |
332 | 2.45M | Ok(()) |
333 | } | |
334 | 0 | Yaml::Boolean(v) => { |
335 | 0 | if v { |
336 | 0 | self.writer.write_str("true")?; |
337 | } else { | |
338 | 0 | self.writer.write_str("false")?; |
339 | } | |
340 | 0 | Ok(()) |
341 | } | |
342 | 0 | Yaml::Integer(v) => { |
343 | 0 | write!(self.writer, "{v}")?; |
344 | 0 | Ok(()) |
345 | } | |
346 | 0 | Yaml::Real(ref v) => { |
347 | 0 | write!(self.writer, "{v}")?; |
348 | 0 | Ok(()) |
349 | } | |
350 | Yaml::Null | Yaml::BadValue => { | |
351 | 0 | write!(self.writer, "~")?; |
352 | 0 | Ok(()) |
353 | } | |
354 | // XXX(chenyh) Alias | |
355 | 0 | _ => Ok(()), |
356 | } | |
357 | 2.45M | } |
358 | ||
359 | 5.02k | fn emit_array(&mut self, v: &[Yaml]) -> EmitResult { |
360 | 5.02k | if v.is_empty() { |
361 | 0 | write!(self.writer, "[]")?; |
362 | 5.02k | } else if v.len() == 1 && |
363 | // changed -- for arrays that have only one simple element, make them more compact by using [...] notation | |
364 | 1 | write!(self.writer, "[") |
365 | 1 | self.emit_val(true, &v[0]) |
366 | 1 | write!(self.writer, "]") |
367 | } else { | |
368 | 5.02k | self.level += 1; |
369 | ||
370 | 20.0k | for (cnt, x) in |
371 | 20.0k | if cnt > 0 { |
372 | 15.0k | writeln!(self.writer) |
373 | 15.0k | self.write_indent() |
374 | 5.02k | } |
375 | 20.0k | write!(self.writer, "- ") |
376 | 20.0k | self.emit_val(true, x) |
377 | } | |
378 | 5.02k | self.level -= 1; |
379 | } | |
380 | 5.02k | return Ok(()); |
381 | 5.02k | } |
382 | ||
383 | 20.0k | fn emit_hash(&mut self, h: &Hash) -> EmitResult { |
384 | 20.0k | if h.is_empty() { |
385 | 0 | self.writer.write_str("{}")?; |
386 | } else { | |
387 | // changed -- for hashmaps that have only one simple element, make them more compact by using {...}} notation | |
388 | 20.0k | self.level += 1; |
389 | 20.0k | for (cnt, (k, v)) in h.iter().enumerate() { |
390 | // changed: use new function is_scalar() | |
391 | // let complex_key = match *k { | |
392 | // Yaml::Hash(_) | Yaml::Array(_) => true, | |
393 | // _ => false, | |
394 | // }; | |
395 | 20.0k | if cnt > 0 { |
396 | 0 | writeln!(self.writer)?; |
397 | 0 | self.write_indent()?; |
398 | 20.0k | } |
399 | 20.0k | if !is_scalar(k) { |
400 | 0 | write!(self.writer, "? ")?; |
401 | 0 | self.emit_val(true, k)?; |
402 | 0 | writeln!(self.writer)?; |
403 | 0 | self.write_indent()?; |
404 | 0 | write!(self.writer, ": ")?; |
405 | 0 | self.emit_val(true, v)?; |
406 | } else { | |
407 | 20.0k | self.emit_node(k) |
408 | 20.0k | write!(self.writer, ": ") |
409 | ||
410 | // changed to use braces in some cases | |
411 | 20.0k | let complex_value = is_complex(v); |
412 | 20.0k | if !complex_value && v.as_hash().is_some() { |
413 | 0 | write!(self.writer, "{{")?; |
414 | 20.0k | } |
415 | // changed to use complex_value from 'false' | |
416 | 20.0k | self.emit_val(!complex_value, v) |
417 | 20.0k | if !complex_value && v.as_hash().is_some() { |
418 | 0 | write!(self.writer, "}}")?; |
419 | 20.0k | } |
420 | } | |
421 | } | |
422 | 20.0k | self.level -= 1; |
423 | } | |
424 | 20.0k | Ok(()) |
425 | 20.0k | } |
426 | ||
427 | /// Emit a yaml as a hash or array value: i.e., which should appear | |
428 | /// following a ":" or "-", either after a space, or on a new line. | |
429 | /// If `inline` is true, then the preceding characters are distinct | |
430 | /// and short enough to respect the compact flag. | |
431 | // changed: use to always emit ' ' for inline -- that is now handled elsewhere | |
432 | 40.1k | fn emit_val(&mut self, inline: bool, val: &Yaml) -> EmitResult { |
433 | 40.1k | match *val { |
434 | 0 | Yaml::Array(ref v) => { |
435 | 0 | if !((inline && self.compact) || v.is_empty()) { |
436 | 0 | writeln!(self.writer)?; |
437 | 0 | self.level += 1; |
438 | 0 | self.write_indent()?; |
439 | 0 | self.level -= 1; |
440 | 0 | } |
441 | 0 | self.emit_array(v) |
442 | } | |
443 | 20.0k | Yaml::Hash(ref h) => { |
444 | 20.0k | if !((inline && self.compact) || |
445 | 0 | writeln!(self.writer)?; |
446 | 0 | self.level += 1; |
447 | 0 | self.write_indent()?; |
448 | 0 | self.level -= 1; |
449 | 20.0k | } |
450 | 20.0k | self.emit_hash(h) |
451 | } | |
452 | _ => { | |
453 | // write!(self.writer, " ")?; | |
454 | 20.0k | self.emit_node(val) |
455 | } | |
456 | } | |
457 | 40.1k | } |
458 | } | |
459 | ||
460 | /// Check if the string requires quoting. | |
461 | /// Strings starting with any of the following characters must be quoted. | |
462 | /// :, &, *, ?, |, -, <, >, =, !, %, @ | |
463 | /// Strings containing any of the following characters must be quoted. | |
464 | /// {, }, [, ], ,, #, ` | |
465 | /// | |
466 | /// If the string contains any of the following control characters, it must be escaped with double quotes: | |
467 | /// \0, \x01, \x02, \x03, \x04, \x05, \x06, \a, \b, \t, \n, \v, \f, \r, \x0e, \x0f, \x10, \x11, \x12, \x13, \x14, \x15, \x16, \x17, \x18, \x19, \x1a, \e, \x1c, \x1d, \x1e, \x1f, \N, \_, \L, \P | |
468 | /// | |
469 | /// Finally, there are other cases when the strings must be quoted, no matter if you're using single or double quotes: | |
470 | /// * When the string is true or false (otherwise, it would be treated as a boolean value); | |
471 | /// * When the string is null or ~ (otherwise, it would be considered as a null value); | |
472 | /// * When the string looks like a number, such as integers (e.g. 2, 14, etc.), floats (e.g. 2.6, 14.9) and exponential numbers (e.g. 12e7, etc.) (otherwise, it would be treated as a numeric value); | |
473 | /// * When the string looks like a date (e.g. 2014-12-31) (otherwise it would be automatically converted into a Unix timestamp). | |
474 | 2.45M | fn need_quotes(string: &str) -> bool { |
475 | 2.45M | fn need_quotes_spaces(string: &str) -> bool { |
476 | 2.45M | string.starts_with(' ') || |
477 | 2.45M | } |
478 | ||
479 | 2.45M | string.is_empty() |
480 | 2.45M | || need_quotes_spaces(string) |
481 | 2.45M | || string.starts_with(['&', '*', '?', '|', '-', '<', '>', '=', '!', '%', '@']) |
482 | 19.8M | || |
483 | ':' | |
484 | | '{' | |
485 | | '}' | |
486 | | '[' | |
487 | | ']' | |
488 | | ',' | |
489 | | '#' | |
490 | | '`' | |
491 | | '\"' | |
492 | | '\'' | |
493 | | '\\' | |
494 | 18.9M | | '\0'..='\x06' |
495 | | '\t' | |
496 | | '\n' | |
497 | | '\r' | |
498 | 18.9M | | '\x0e'..='\x1a' |
499 | 18.9M | | '\x1c'..='\x1f') ) |
500 | 1.32M | || [ |
501 | 1.32M | // http://yaml.org/type/bool.html |
502 | 1.32M | // Note: 'y', 'Y', 'n', 'N', is not quoted deliberately, as in libyaml. PyYAML also parse |
503 | 1.32M | // them as string, not booleans, although it is violating the YAML 1.1 specification. |
504 | 1.32M | // See https://github.com/dtolnay/serde-yaml/pull/83#discussion_r152628088. |
505 | 1.32M | "yes", "Yes", "YES", "no", "No", "NO", "True", "TRUE", "true", "False", "FALSE", |
506 | 1.32M | "false", "on", "On", "ON", "off", "Off", "OFF", |
507 | 1.32M | // http://yaml.org/type/null.html |
508 | 1.32M | "null", "Null", "NULL", "~", |
509 | 1.32M | ] |
510 | 1.32M | .contains(&string) |
511 | 1.32M | || string.starts_with('.') |
512 | 1.32M | || string.starts_with("0x") |
513 | 1.32M | || string.parse::<i64>().is_ok() |
514 | 1.32M | || string.parse::<f64>().is_ok() |
515 | 2.45M | } |
516 | ||
517 | #[cfg(test)] | |
518 | mod tests { | |
519 | use super::*; | |
520 | use sxd_document::dom::{ChildOfElement, ChildOfRoot}; | |
521 | use sxd_document::parser; | |
522 | ||
523 | /// helper function | |
524 | 10 | fn first_element(package: &sxd_document::Package) -> Element<'_> { |
525 | 10 | let doc = package.as_document(); |
526 | 10 | for child in doc.root().children() { |
527 | 10 | if let ChildOfRoot::Element(e) = child { |
528 | 10 | return e; |
529 | 0 | } |
530 | } | |
531 | 0 | panic!("No root element found"); |
532 | 10 | } |
533 | ||
534 | #[test] | |
535 | /// Escapes XML entities and invisible characters for safe display. | |
536 | /// Tests the method on a few hardcoded characters. | |
537 | 1 | fn handle_special_chars_escapes() { |
538 | 1 | let input = "& < > \" ' \u{2061} \u{2062} \u{2063} \u{2064} x"; |
539 | 1 | let expected = "& < > " ' ⁡ ⁢ ⁣ ⁤ x"; |
540 | 1 | assert_eq!(handle_special_chars(input), expected); |
541 | 1 | } |
542 | ||
543 | #[test] | |
544 | /// Formats a leaf element as a single line with escaped text. | |
545 | 1 | fn format_element_leaf_text() { |
546 | 1 | let package = parser::parse("<math><mi>&</mi></math>").unwrap(); |
547 | 1 | let math = first_element(&package); |
548 | 1 | let mi = math |
549 | 1 | .children() |
550 | 1 | .iter() |
551 | 1 | .find_map(|c| match c { |
552 | 1 | ChildOfElement::Element(e) => Some(*e), |
553 | 0 | _ => None, |
554 | 1 | }) |
555 | 1 | .unwrap(); |
556 | 1 | assert_eq!(format_element(mi, 0), " <mi>&</mi>\n"); |
557 | 1 | } |
558 | ||
559 | #[test] | |
560 | /// Formats a nested element with indentation and newlines. | |
561 | 1 | fn format_element_nested() { |
562 | 1 | let package = parser::parse("<math><mi>x</mi><mo>+</mo></math>").unwrap(); |
563 | 1 | let math = first_element(&package); |
564 | 1 | let rendered = format_element(math, 0); |
565 | 1 | assert!(rendered.starts_with(" <math>\n")); |
566 | 1 | assert!(rendered.contains("\n <mi>x</mi>\n")); |
567 | 1 | assert!(rendered.contains("\n <mo>+</mo>\n")); |
568 | 1 | assert!(rendered.ends_with("</math>\n")); |
569 | 1 | } |
570 | ||
571 | #[test] | |
572 | /// Escapes special characters in attribute values. | |
573 | 1 | fn format_attrs_escapes() { |
574 | 1 | let package = parser::parse("<math a=\"&\" b=\"<\"></math>").unwrap(); |
575 | 1 | let math = first_element(&package); |
576 | 1 | let rendered = format_attrs(&math.attributes()); |
577 | 1 | assert!(rendered.contains(" a='&'")); |
578 | 1 | assert!(rendered.contains(" b='<'")); |
579 | 1 | } |
580 | ||
581 | #[test] | |
582 | /// Preserves non-BMP characters from a literal XML form. | |
583 | 1 | fn format_element_non_bmp_character_literal() { |
584 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
585 | 1 | let math = first_element(&package); |
586 | 1 | let mi = math |
587 | 1 | .children() |
588 | 1 | .iter() |
589 | 1 | .find_map(|c| match c { |
590 | 1 | ChildOfElement::Element(e) => Some(*e), |
591 | 0 | _ => None, |
592 | 1 | }) |
593 | 1 | .unwrap(); |
594 | 1 | let rendered = format_element(mi, 0); |
595 | 1 | assert!(rendered.contains("𝞪")); |
596 | 1 | } |
597 | ||
598 | #[test] | |
599 | /// Preserves non-BMP characters from a numeric XML form. | |
600 | 1 | fn format_element_non_bmp_character_numeric() { |
601 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
602 | 1 | let math = first_element(&package); |
603 | 1 | let mi = math |
604 | 1 | .children() |
605 | 1 | .iter() |
606 | 1 | .find_map(|c| match c { |
607 | 1 | ChildOfElement::Element(e) => Some(*e), |
608 | 0 | _ => None, |
609 | 1 | }) |
610 | 1 | .unwrap(); |
611 | 1 | let rendered = format_element(mi, 0); |
612 | 1 | assert!(rendered.contains("𝞪")); |
613 | 1 | } |
614 | ||
615 | #[test] | |
616 | /// Evaluates non-BMP literal text through sxd_xpath. | |
617 | 1 | fn xpath_non_bmp_literal() { |
618 | use sxd_xpath::{Factory, Value}; | |
619 | ||
620 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
621 | 1 | let xpath = Factory::new().build("string(/math/mi)").unwrap().unwrap(); |
622 | 1 | let context = sxd_xpath::Context::new(); |
623 | ||
624 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
625 | 1 | match value { |
626 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
627 | 0 | _ => panic!("Expected string value from xpath"), |
628 | } | |
629 | 1 | } |
630 | ||
631 | #[test] | |
632 | /// Evaluates non-BMP numeric text through sxd_xpath. | |
633 | 1 | fn xpath_non_bmp_numeric() { |
634 | use sxd_xpath::{Factory, Value}; | |
635 | ||
636 | 1 | let package = parser::parse("<math><mi>𝞪</mi></math>").unwrap(); |
637 | 1 | let xpath = Factory::new().build("string(/math/mi)").unwrap().unwrap(); |
638 | 1 | let context = sxd_xpath::Context::new(); |
639 | ||
640 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
641 | 1 | match value { |
642 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
643 | 0 | _ => panic!("Expected string value from xpath"), |
644 | } | |
645 | 1 | } |
646 | ||
647 | #[test] | |
648 | /// Evaluates non-BMP literal text with a MathML namespace-qualified XPath. | |
649 | 1 | fn xpath_non_bmp_namespace_literal() { |
650 | use sxd_xpath::{Factory, Value}; | |
651 | ||
652 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
653 | 1 | let package = parser::parse(xml).unwrap(); |
654 | 1 | let xpath = Factory::new() |
655 | 1 | .build("string(/m:math/m:mi)") |
656 | 1 | .unwrap() |
657 | 1 | .unwrap(); |
658 | 1 | let mut context = sxd_xpath::Context::new(); |
659 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
660 | ||
661 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
662 | 1 | match value { |
663 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
664 | 0 | _ => panic!("Expected string value from xpath"), |
665 | } | |
666 | 1 | } |
667 | ||
668 | #[test] | |
669 | /// Evaluates non-BMP numeric text with a MathML namespace-qualified XPath. | |
670 | 1 | fn xpath_non_bmp_namespace_numeric() { |
671 | use sxd_xpath::{Factory, Value}; | |
672 | ||
673 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
674 | 1 | let package = parser::parse(xml).unwrap(); |
675 | 1 | let xpath = Factory::new() |
676 | 1 | .build("string(/m:math/m:mi)") |
677 | 1 | .unwrap() |
678 | 1 | .unwrap(); |
679 | 1 | let mut context = sxd_xpath::Context::new(); |
680 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
681 | ||
682 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
683 | 1 | match value { |
684 | 1 | Value::String(s) => assert_eq!(s, "𝞪"), |
685 | 0 | _ => panic!("Expected string value from xpath"), |
686 | } | |
687 | 1 | } |
688 | ||
689 | #[test] | |
690 | /// Extracts a text node via XPath (nodeset result) and verifies the non-BMP character survives. | |
691 | 1 | fn xpath_non_bmp_text_nodeset() { |
692 | use sxd_xpath::{Factory, Value}; | |
693 | ||
694 | 1 | let xml = "<math xmlns=\"http://www.w3.org/1998/Math/MathML\"><mi>𝞪</mi></math>"; |
695 | 1 | let package = parser::parse(xml).unwrap(); |
696 | 1 | let xpath = Factory::new().build("/m:math/m:mi/text()").unwrap().unwrap(); |
697 | 1 | let mut context = sxd_xpath::Context::new(); |
698 | 1 | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
699 | ||
700 | 1 | let value = xpath.evaluate(&context, first_element(&package)).unwrap(); |
701 | 1 | match value { |
702 | 1 | Value::Nodeset(nodes) => { |
703 | 1 | let ordered = nodes.document_order(); |
704 | 1 | let node = ordered.first().expect("Expected one text node"); |
705 | 1 | let text = node.text().expect("Expected text node"); |
706 | 1 | assert_eq!(text.text(), "𝞪"); |
707 | 1 | assert_eq!(ordered.len(), 1); |
708 | } | |
709 | 0 | _ => panic!("Expected nodeset value from xpath"), |
710 | } | |
711 | 1 | } |
712 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | //! This is used to paste over normal reading of the Rules files and building them into the code for web assembly (WASM) which | |
3 | //! can't do file system access. For the latter, the Rules directory is zipped up. | |
4 | ||
5 | use std::path::{Path, PathBuf}; | |
6 | use crate::errors::*; | |
7 | use cfg_if::cfg_if; | |
8 | ||
9 | #[allow(unused_imports)] | |
10 | use log::{debug}; | |
11 | ||
12 | ||
13 | // The zipped files are needed by WASM builds. | |
14 | // However, they are also useful for other builds because there really isn't another good way to get at the rules. | |
15 | // Other build scripts can extract these files and unzip to their needed locations. | |
16 | // I'm not thrilled with this solution as it seems hacky, but I don't know another way for crates to allow for each access to data. | |
17 | cfg_if! { | |
18 | if #[cfg(any(target_family = "wasm", feature = "include-zip"))] { | |
19 | // For the include-zip builds, we build a fake file system based on ZIPPED_RULE_FILES. | |
20 | // That stream encodes other zip files that must be unzipped. | |
21 | // Only one level of embedded zip files is supported. | |
22 | use zip::ZipArchive; | |
23 | pub static ZIPPED_RULE_FILES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"),"/rules.zip")); | |
24 | ||
25 | /// Struct to indicate where a file is located in the zip archive(s) | |
26 | #[derive(Debug, Copy, Clone)] | |
27 | struct ArchivePath { | |
28 | main: usize, // index into ZIPPED_RULE_FILES | |
29 | zipped: Option<usize>, // if Some, index into embedded zip file, None if top-level zip file | |
30 | } | |
31 | ||
32 | use std::cell::RefCell; | |
33 | use std::io::Cursor; | |
34 | use std::io::Read; | |
35 | use std::collections::{HashMap, HashSet}; | |
36 | thread_local! { | |
37 | // mapping the file names to whether they are a directory or a file | |
38 | // Note: these are always stored with "/" as the path separator | |
39 | static DIRECTORIES: RefCell<HashSet<String>> = RefCell::new(HashSet::with_capacity(127)); | |
40 | // if a file, we note whether it is in ZIPPED_RULE_FILES or the index of a zipped file within ZIPPED_RULE_FILES | |
41 | static FILES: RefCell<HashMap<String, ArchivePath>> = RefCell::new(HashMap::with_capacity(1023)); | |
42 | } | |
43 | ||
44 | /// Canonicalize path separators to "/" | |
45 | fn canonicalize_path_separators(path: &Path) -> String { | |
46 | return path.to_str().unwrap_or_default().replace("\\", "/"); | |
47 | } | |
48 | ||
49 | /// Return a zip archive given the zip bytes | |
50 | fn get_zip_archive(zip_bytes: &[u8]) -> Result<ZipArchive<Cursor<&[u8]>>> { | |
51 | let buf_reader = Cursor::new(zip_bytes); | |
52 | let archive = match zip::ZipArchive::new(buf_reader) { | |
53 | Err(e) => bail!("get_zip_archive: failed to create ZipArchive: {}", e), | |
54 | Ok(archive) => archive, | |
55 | }; | |
56 | return Ok(archive); | |
57 | } | |
58 | ||
59 | /// Read ZIPPED_RULE_FILES and build up the FILES and DIRECTORIES static variables. | |
60 | /// This is called lazily when the first file or directory check is done. | |
61 | fn initialize_static_vars() -> Result<()> { | |
62 | let mut archive = get_zip_archive(ZIPPED_RULE_FILES)?; | |
63 | read_zip_file("", &mut archive, None)?; | |
64 | ||
65 | // Because of Rust's borrow checker, we can't recursively unzip contained zip files (FILES, etc., are borrowed mut) | |
66 | // Here we gather up the zip files that were found and iterate over them non-recursively. | |
67 | // Note: there shouldn't be embedded zip files in these files (if there are, they won't be unzipped) | |
68 | let zip_files = FILES.with(|files| files.borrow().iter() | |
69 | .filter_map(|(name, archive_path)| if name.ends_with(".zip") { Some((name.clone(), *archive_path)) } else { None } ) | |
70 | .collect::<Vec<_>>() | |
71 | ); | |
72 | // debug!("Found {:?} embedded zip files", zip_files); | |
73 | for (zip_file_name, archive_path) in zip_files.iter() { | |
74 | let bytes = get_bytes_from_index(&mut archive, archive_path.main)?; | |
75 | let mut inner_archive = get_zip_archive(bytes.as_slice())?; | |
76 | // debug!(" internal zip file {} has {} files", zip_file_name, inner_archive.len()); | |
77 | let new_containing_dir = zip_file_name.rsplit_once("/").map(|(before, _)| before).unwrap_or(""); | |
78 | read_zip_file(new_containing_dir, &mut inner_archive, Some(archive_path.main))?; | |
79 | } | |
80 | // FILES.with(|files| { | |
81 | // let files = files.borrow(); | |
82 | // debug!("{} files={:?}", files.len(), files); | |
83 | // }); | |
84 | return Ok(()); | |
85 | } | |
86 | ||
87 | /// Get the bytes for a file in the zip archive (intended for embedded zip files) | |
88 | fn get_bytes_from_index(archive: &mut ZipArchive<Cursor<&[u8]>>, index: usize) -> Result<Vec<u8>> { | |
89 | let mut file = archive.by_index(index) | |
90 | .map_err(|e| anyhow!(format!("Error getting index={} from zip archive: {}", index, e)) )?; | |
91 | let mut contents = Vec::new(); | |
92 | file.read_to_end(&mut contents) | |
93 | .map_err(|e| anyhow!(format!("Error reading index={} from zip archive: {}", index, e)) )?; | |
94 | return Ok(contents); | |
95 | } | |
96 | /// Unzip the zip file (given by zip_archive) and record the file and dir names | |
97 | /// 'containing_dir' is the rule dir (RulesDir or a subdir) and establishes a full path for unzipped file(s) | |
98 | /// embedded_zip_file is index into ZIPPED_RULE_FILES if this is an embedded zip file, None if it is the top-level zip file | |
99 | fn read_zip_file(containing_dir: &str, zip_archive: &mut ZipArchive<Cursor<&[u8]>>, embedded_zip_file: Option<usize>) -> Result<()> { | |
100 | // debug!("read_zip_file: containing_dir='{}', zip_archive.len()={}", containing_dir, zip_archive.len()); | |
101 | return FILES.with(|files| { | |
102 | let mut files = files.borrow_mut(); | |
103 | return DIRECTORIES.with(|dirs| { | |
104 | let mut dirs = dirs.borrow_mut(); | |
105 | for i in 0..zip_archive.len() { | |
106 | let file = zip_archive.by_index(i).unwrap(); | |
107 | // A little bit of safety/sanity checking | |
108 | let path = match file.enclosed_name() { | |
109 | Some(path) => PathBuf::from(containing_dir).join(path), | |
110 | None => { | |
111 | bail!("Entry {} has a suspicious path (outside of archive)", file.name()); | |
112 | } | |
113 | }; | |
114 | // debug!("read_zip_file: file path='{}'", path.display()); | |
115 | // add all the dirs up to the containing dir -- skip the first one as that is a file | |
116 | // for files like unicode.yaml, this loop is a no-op, but for files in the Shared folder, it will go one time. | |
117 | for parent in path.ancestors().skip(1) { | |
118 | if parent.to_str().unwrap_or_default() == containing_dir { | |
119 | break; | |
120 | } | |
121 | dirs.insert(canonicalize_path_separators(parent)); | |
122 | } | |
123 | let file_name = canonicalize_path_separators(&path); | |
124 | if file.is_file() { | |
125 | let archive_path = match embedded_zip_file { | |
126 | None => ArchivePath{ main: i, zipped: None }, | |
127 | Some(main) => ArchivePath{ main, zipped: Some(i) }, | |
128 | }; | |
129 | files.insert(file_name, archive_path); | |
130 | } else if file.is_dir() { | |
131 | dirs.insert(file_name); | |
132 | } else { | |
133 | bail!("read_zip_file: {} is neither a file nor a directory", path.display()); | |
134 | } | |
135 | }; | |
136 | // debug!("{} files={:?}", files.len(), files); | |
137 | // debug!("{} dirs={:?}", dirs.len(), dirs); | |
138 | return Ok::<(), Error>( () ); | |
139 | }); | |
140 | }); | |
141 | } | |
142 | ||
143 | pub fn is_file_shim(path: &Path) -> bool { | |
144 | if FILES.with(|files| files.borrow().is_empty()) { | |
145 | let _ignore_result = initialize_static_vars(); | |
146 | } | |
147 | return FILES.with(|files| files.borrow().contains_key(&canonicalize_path_separators(path)) ); | |
148 | } | |
149 | ||
150 | pub fn is_dir_shim(path: &Path) -> bool { | |
151 | if FILES.with(|files| files.borrow().is_empty()) { | |
152 | let _ignore_result = initialize_static_vars(); | |
153 | } | |
154 | return DIRECTORIES.with(|dirs| dirs.borrow().contains(&canonicalize_path_separators(path)) ); | |
155 | } | |
156 | ||
157 | /// Find files in 'dir' that end with 'ending' (e.g., "_Rules.yaml") | |
158 | pub fn find_files_in_dir_that_ends_with_shim(dir: &Path, ending: &str) -> Vec<String> { | |
159 | // FIX: this is very inefficient because it looks through all the files -- maybe dirs should list the files in them? | |
160 | // look for files that have 'path' as a prefix | |
161 | return FILES.with(|files| { | |
162 | let files = files.borrow(); | |
163 | let mut answer = Vec::new(); | |
164 | ||
165 | let dir_name = canonicalize_path_separators(dir); | |
166 | for file_name in files.keys() { | |
167 | if let Some(dir_relative_name) = file_name.strip_prefix(&dir_name) && | |
168 | file_name.ends_with(ending) | |
169 | { | |
170 | // this could be (e.g.) xxx_Rules.yaml or it could be subdir/xxx_Rules.yaml | |
171 | let file_name = dir_relative_name.split_once("/").map(|(_, after)| after).unwrap_or(dir_relative_name); | |
172 | answer.push(file_name.to_string()); | |
173 | } | |
174 | } | |
175 | // debug!("find_files_in_dir_that_ends_with_shim: in dir '{}' found {:?}", dir.display(), answer); | |
176 | return answer; | |
177 | }); | |
178 | } | |
179 | ||
180 | ||
181 | pub fn find_all_dirs_shim(dir: &Path, found_dirs: &mut Vec<PathBuf> ) { | |
182 | return DIRECTORIES.with(|dirs| { | |
183 | let dirs = dirs.borrow(); | |
184 | ||
185 | let common_dir_name = canonicalize_path_separators(dir); | |
186 | for dir_name in dirs.iter() { | |
187 | if dir_name.starts_with(&common_dir_name) && !dir_name.contains("SharedRules") { | |
188 | found_dirs.push(PathBuf::from(&dir_name)); | |
189 | }; | |
190 | } | |
191 | }); | |
192 | } | |
193 | ||
194 | ||
195 | pub fn canonicalize_shim(path: &Path) -> std::io::Result<PathBuf> { | |
196 | use std::ffi::OsStr; | |
197 | let dot_dot = OsStr::new(".."); | |
198 | let mut result = PathBuf::new(); | |
199 | for part in path.iter() { | |
200 | if dot_dot == part { | |
201 | result.pop(); | |
202 | } else { | |
203 | result.push(part); | |
204 | } | |
205 | } | |
206 | return Ok(result); | |
207 | } | |
208 | ||
209 | /// Read the file at 'path' and return its contents as a String | |
210 | pub fn read_to_string_shim(path: &Path) -> Result<String> { | |
211 | let path = canonicalize_shim(path).unwrap(); // can't fail | |
212 | let file_name = canonicalize_path_separators(&path); | |
213 | // Is this the debugging override? | |
214 | if let Some(contents) = OVERRIDE_FILE_NAME.with(|override_name| { | |
215 | if file_name == override_name.borrow().as_str() { | |
216 | // debug!("override read_to_string_shim: {}",file_name); | |
217 | return OVERRIDE_FILE_CONTENTS.with(|contents| return Some(contents.borrow().clone())); | |
218 | } else { | |
219 | return None; | |
220 | } | |
221 | }) { | |
222 | return Ok(contents); | |
223 | }; | |
224 | ||
225 | let file_name = file_name.replace('\\', "/"); // zip files always use forward slash | |
226 | // top-level zip file or embedded zip file | |
227 | return FILES.with(|files| { | |
228 | let files = files.borrow(); | |
229 | let inner_bytes; | |
230 | let (bytes, index) = match files.get(&file_name) { | |
231 | Some(archive_path) => { | |
232 | match &archive_path.zipped { | |
233 | None => (ZIPPED_RULE_FILES, archive_path.main), | |
234 | Some(i) => { | |
235 | // debug!("read_to_string_shim: reading embedded zip file {} at index {}", file_name, *i); | |
236 | let mut archive = get_zip_archive(ZIPPED_RULE_FILES)?; | |
237 | inner_bytes = get_bytes_from_index(&mut archive, archive_path.main)?; // need to hold temp value | |
238 | (inner_bytes.as_slice(), *i) | |
239 | } | |
240 | } | |
241 | }, | |
242 | None => bail!("read_to_string_shim: didn't find {} in zip archive", file_name), | |
243 | }; | |
244 | let mut archive = get_zip_archive(bytes)?; | |
245 | let mut file = match archive.by_index(index) { | |
246 | Ok(file) => { | |
247 | // debug!("read_to_string_shim: want {}; name of zipped file={:?}", file_name, file.enclosed_name().unwrap()); | |
248 | file | |
249 | }, | |
250 | Err(..) => { | |
251 | bail!("Didn't find {} in zip archive", file_name); | |
252 | } | |
253 | }; | |
254 | ||
255 | let mut contents = String::new(); | |
256 | if let Err(e) = file.read_to_string(&mut contents) { | |
257 | bail!("read_to_string: {}", e); | |
258 | } | |
259 | return Ok(contents); | |
260 | }); | |
261 | } | |
262 | ||
263 | pub fn zip_extract_shim(dir: &Path, zip_file_name: &str) -> Result<bool> { | |
264 | let zip_file_path = dir.join(zip_file_name); | |
265 | let full_zip_file_name = canonicalize_path_separators(&zip_file_path); | |
266 | match FILES.with(|files| files.borrow().contains_key(full_zip_file_name.as_str()) ) { | |
267 | true => Ok(true), | |
268 | false => bail!("zip_extract_shim: didn't find {} in zip archive", full_zip_file_name), | |
269 | } | |
270 | } | |
271 | ||
272 | thread_local! { | |
273 | // For debugging rules files (mainly nav file) via MathCATDemo | |
274 | static OVERRIDE_FILE_NAME: RefCell<String> = RefCell::new("".to_string()); | |
275 | static OVERRIDE_FILE_CONTENTS: RefCell<String> = RefCell::new("".to_string()); | |
276 | } | |
277 | pub fn override_file_for_debugging_rules(file_name: &str, file_contents: &str) { | |
278 | // file_name should be path name starting at Rules dir: e.g, "Rules/en/navigate.yaml" | |
279 | OVERRIDE_FILE_NAME.with(|name| *name.borrow_mut() = file_name.to_string().replace("/", "\\")); | |
280 | OVERRIDE_FILE_CONTENTS.with(|contents| *contents.borrow_mut() = file_contents.to_string()); | |
281 | crate::interface::set_rules_dir("Rules").unwrap(); // force reinitialization after the change | |
282 | } | |
283 | } else { | |
284 | 116k | pub fn is_file_shim(path: &Path) -> bool { |
285 | 116k | return path.is_file(); |
286 | 116k | } |
287 | ||
288 | 135k | pub fn is_dir_shim(path: &Path) -> bool { |
289 | 135k | return path.is_dir(); |
290 | 135k | } |
291 | ||
292 | 12.9k | pub fn find_files_in_dir_that_ends_with_shim(dir: &Path, ending: &str) -> Vec<String> { |
293 | 12.9k | match dir.read_dir() { |
294 | 0 | Err(_) => return vec![], // empty |
295 | 12.9k | Ok(read_dir) => { |
296 | 12.9k | let mut answer = Vec::new(); |
297 | 78.4k | for dir_entry in |
298 | 78.4k | let file_name = dir_entry.file_name(); |
299 | 78.4k | let file_name = file_name.to_string_lossy().to_string(); |
300 | 78.4k | if file_name.ends_with(ending) { |
301 | // this could be (e.g.) xxx_Rules.yaml or it could be subdir/xxx_Rules.yaml | |
302 | 64.4k | let file_name = file_name.split_once(std::path::MAIN_SEPARATOR).map(|(_, after)| after).unwrap_or(&file_name); |
303 | 64.4k | answer.push( file_name.to_string() ); |
304 | 14.0k | } |
305 | } | |
306 | 12.9k | return answer; |
307 | } | |
308 | } | |
309 | 12.9k | } |
310 | ||
311 | 2.90k | pub fn find_all_dirs_shim(dir: &Path, found_dirs: &mut Vec<PathBuf> ) { |
312 | // FIX: this doesn't work for subdirectories that haven't been unzipped yet | |
313 | 2.90k | assert!(dir.is_dir(), "find_all_dirs_shim called with non-directory path: {}", |
314 | 2.90k | let mut found_rules_file = false; |
315 | 2.90k | if let Ok(entries) = std::fs::read_dir(dir) { |
316 | 13.0k | for entry in |
317 | 13.0k | let path = entry.path(); |
318 | 13.0k | if path.is_dir() { |
319 | // skip "SharedRules" directory | |
320 | 2.90k | if let Some(dir_name) = path.file_name() && |
321 | 2.90k | dir_name.to_str().unwrap_or_default() != "SharedRules" { |
322 | 1.45k | find_all_dirs_shim(&path, found_dirs); |
323 | 1.45k |
|
324 | } else { | |
325 | 10.1k | let file_name = path.file_name().unwrap_or_default().to_str().unwrap_or_default(); |
326 | 10.1k | if !found_rules_file && |
327 | 2.91k | (file_name.starts_with("unicode") || |
328 | 1.46k | found_dirs.push(path.parent().unwrap().to_path_buf()); |
329 | // FIX: hack to get around not unzipping files and having zh/tw not found | |
330 | 1.46k | if file_name == "zh.zip" { |
331 | 0 | let tw_dir = path.parent().unwrap().join("tw"); |
332 | 0 | if !found_dirs.contains(&tw_dir) { |
333 | 0 | found_dirs.push(tw_dir.to_path_buf()); |
334 | 0 | } |
335 | 1.46k | } |
336 | 1.46k | found_rules_file = true; |
337 | 8.71k | } |
338 | } | |
339 | } | |
340 | 0 | } |
341 | 2.90k | } |
342 | ||
343 | 73.1k | pub fn canonicalize_shim(path: &Path) -> std::io::Result<PathBuf> { |
344 | 73.1k | return path.canonicalize(); |
345 | 73.1k | } |
346 | ||
347 | 60.5k | pub fn read_to_string_shim(path: &Path) -> Result<String> { |
348 | 60.5k | let path = match path.canonicalize() { |
349 | 60.5k | Ok(path) => path, |
350 | 0 | Err(e) => bail!("Read error while trying to canonicalize in read_to_string_shim {}: {}", path.display(), e), |
351 | }; | |
352 | 60.5k | debug!("Reading file '{}'", |
353 | 60.5k | match std::fs::read_to_string(&path) { |
354 | 60.5k | Ok(str) => return Ok(str), |
355 | 0 | Err(e) => bail!("Read error while trying to read {}: {}", &path.display(), e), |
356 | } | |
357 | 60.5k | } |
358 | ||
359 | 12.6k | pub fn zip_extract_shim(dir: &Path, zip_file_name: &str) -> Result<bool> { |
360 | 12.6k | let zip_file = dir.join(zip_file_name); |
361 | 12.6k | return match std::fs::read(zip_file) { |
362 | 12.6k | Err(e) => { |
363 | // no zip file? -- maybe started out with all the files unzipped? See if there is a .yaml file | |
364 | 12.6k | let yaml_files = find_files_in_dir_that_ends_with_shim(dir, ".yaml"); |
365 | 12.6k | if yaml_files.is_empty() { |
366 | 1.44k | bail!("{}", e) |
367 | } else { | |
368 | 11.2k | Ok(false) |
369 | } | |
370 | }, | |
371 | 0 | Ok(contents) => { |
372 | 0 | let archive = std::io::Cursor::new(contents); |
373 | 0 | let mut zip_archive = zip::ZipArchive::new(archive).unwrap(); |
374 | 0 | zip_archive.extract(dir).expect("Zip extraction failed"); |
375 | 0 | Ok(true) |
376 | }, | |
377 | }; | |
378 | 12.6k | } |
379 | } | |
380 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | //! This is used to paste over normal reading of the Rules files and building them into the code for web assembly (WASM) which | |
3 | //! can't do file system access. For the latter, the Rules directory is zipped up. | |
4 | ||
5 | use std::path::{Path, PathBuf}; | |
6 | use crate::errors::*; | |
7 | use cfg_if::cfg_if; | |
8 | ||
9 | #[allow(unused_imports)] | |
10 | use log::{debug}; | |
11 | ||
12 | ||
13 | // The zipped files are needed by WASM builds. | |
14 | // However, they are also useful for other builds because there really isn't another good way to get at the rules. | |
15 | // Other build scripts can extract these files and unzip to their needed locations. | |
16 | // I'm not thrilled with this solution as it seems hacky, but I don't know another way for crates to allow for each access to data. | |
17 | cfg_if! { | |
18 | if #[cfg(any(target_family = "wasm", feature = "include-zip"))] { | |
19 | // For the include-zip builds, we build a fake file system based on ZIPPED_RULE_FILES. | |
20 | // That stream encodes other zip files that must be unzipped. | |
21 | // Only one level of embedded zip files is supported. | |
22 | use zip::ZipArchive; | |
23 | pub static ZIPPED_RULE_FILES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"),"/rules.zip")); | |
24 | ||
25 | /// Struct to indicate where a file is located in the zip archive(s) | |
26 | #[derive(Debug, Copy, Clone)] | |
27 | struct ArchivePath { | |
28 | main: usize, // index into ZIPPED_RULE_FILES | |
29 | zipped: Option<usize>, // if Some, index into embedded zip file, None if top-level zip file | |
30 | } | |
31 | ||
32 | use std::cell::RefCell; | |
33 | use std::io::Cursor; | |
34 | use std::io::Read; | |
35 | use std::collections::{HashMap, HashSet}; | |
36 | thread_local! { | |
37 | // mapping the file names to whether they are a directory or a file | |
38 | // Note: these are always stored with "/" as the path separator | |
39 | static DIRECTORIES: RefCell<HashSet<String>> = RefCell::new(HashSet::with_capacity(127)); | |
40 | // if a file, we note whether it is in ZIPPED_RULE_FILES or the index of a zipped file within ZIPPED_RULE_FILES | |
41 | static FILES: RefCell<HashMap<String, ArchivePath>> = RefCell::new(HashMap::with_capacity(1023)); | |
42 | } | |
43 | ||
44 | /// Canonicalize path separators to "/" | |
45 | fn canonicalize_path_separators(path: &Path) -> String { | |
46 | return path.to_str().unwrap_or_default().replace("\\", "/"); | |
47 | } | |
48 | ||
49 | /// Return a zip archive given the zip bytes | |
50 | fn get_zip_archive(zip_bytes: &[u8]) -> Result<ZipArchive<Cursor<&[u8]>>> { | |
51 | let buf_reader = Cursor::new(zip_bytes); | |
52 | let archive = match zip::ZipArchive::new(buf_reader) { | |
53 | Err(e) => bail!("get_zip_archive: failed to create ZipArchive: {}", e), | |
54 | Ok(archive) => archive, | |
55 | }; | |
56 | return Ok(archive); | |
57 | } | |
58 | ||
59 | /// Read ZIPPED_RULE_FILES and build up the FILES and DIRECTORIES static variables. | |
60 | /// This is called lazily when the first file or directory check is done. | |
61 | fn initialize_static_vars() -> Result<()> { | |
62 | let mut archive = get_zip_archive(ZIPPED_RULE_FILES)?; | |
63 | read_zip_file("", &mut archive, None)?; | |
64 | ||
65 | // Because of Rust's borrow checker, we can't recursively unzip contained zip files (FILES, etc., are borrowed mut) | |
66 | // Here we gather up the zip files that were found and iterate over them non-recursively. | |
67 | // Note: there shouldn't be embedded zip files in these files (if there are, they won't be unzipped) | |
68 | let zip_files = FILES.with(|files| files.borrow().iter() | |
69 | .filter_map(|(name, archive_path)| if name.ends_with(".zip") { Some((name.clone(), *archive_path)) } else { None } ) | |
70 | .collect::<Vec<_>>() | |
71 | ); | |
72 | // debug!("Found {:?} embedded zip files", zip_files); | |
73 | for (zip_file_name, archive_path) in zip_files.iter() { | |
74 | let bytes = get_bytes_from_index(&mut archive, archive_path.main)?; | |
75 | let mut inner_archive = get_zip_archive(bytes.as_slice())?; | |
76 | // debug!(" internal zip file {} has {} files", zip_file_name, inner_archive.len()); | |
77 | let new_containing_dir = zip_file_name.rsplit_once("/").map(|(before, _)| before).unwrap_or(""); | |
78 | read_zip_file(new_containing_dir, &mut inner_archive, Some(archive_path.main))?; | |
79 | } | |
80 | // FILES.with(|files| { | |
81 | // let files = files.borrow(); | |
82 | // debug!("{} files={:?}", files.len(), files); | |
83 | // }); | |
84 | return Ok(()); | |
85 | } | |
86 | ||
87 | /// Get the bytes for a file in the zip archive (intended for embedded zip files) | |
88 | fn get_bytes_from_index(archive: &mut ZipArchive<Cursor<&[u8]>>, index: usize) -> Result<Vec<u8>> { | |
89 | let mut file = archive.by_index(index) | |
90 | .map_err(|e| anyhow!(format!("Error getting index={} from zip archive: {}", index, e)) )?; | |
91 | let mut contents = Vec::new(); | |
92 | file.read_to_end(&mut contents) | |
93 | .map_err(|e| anyhow!(format!("Error reading index={} from zip archive: {}", index, e)) )?; | |
94 | return Ok(contents); | |
95 | } | |
96 | /// Unzip the zip file (given by zip_archive) and record the file and dir names | |
97 | /// 'containing_dir' is the rule dir (RulesDir or a subdir) and establishes a full path for unzipped file(s) | |
98 | /// embedded_zip_file is index into ZIPPED_RULE_FILES if this is an embedded zip file, None if it is the top-level zip file | |
99 | fn read_zip_file(containing_dir: &str, zip_archive: &mut ZipArchive<Cursor<&[u8]>>, embedded_zip_file: Option<usize>) -> Result<()> { | |
100 | // debug!("read_zip_file: containing_dir='{}', zip_archive.len()={}", containing_dir, zip_archive.len()); | |
101 | return FILES.with(|files| { | |
102 | let mut files = files.borrow_mut(); | |
103 | return DIRECTORIES.with(|dirs| { | |
104 | let mut dirs = dirs.borrow_mut(); | |
105 | for i in 0..zip_archive.len() { | |
106 | let file = zip_archive.by_index(i).unwrap(); | |
107 | // A little bit of safety/sanity checking | |
108 | let path = match file.enclosed_name() { | |
109 | Some(path) => PathBuf::from(containing_dir).join(path), | |
110 | None => { | |
111 | bail!("Entry {} has a suspicious path (outside of archive)", file.name()); | |
112 | } | |
113 | }; | |
114 | // debug!("read_zip_file: file path='{}'", path.display()); | |
115 | // add all the dirs up to the containing dir -- skip the first one as that is a file | |
116 | // for files like unicode.yaml, this loop is a no-op, but for files in the Shared folder, it will go one time. | |
117 | for parent in path.ancestors().skip(1) { | |
118 | if parent.to_str().unwrap_or_default() == containing_dir { | |
119 | break; | |
120 | } | |
121 | dirs.insert(canonicalize_path_separators(parent)); | |
122 | } | |
123 | let file_name = canonicalize_path_separators(&path); | |
124 | if file.is_file() { | |
125 | let archive_path = match embedded_zip_file { | |
126 | None => ArchivePath{ main: i, zipped: None }, | |
127 | Some(main) => ArchivePath{ main, zipped: Some(i) }, | |
128 | }; | |
129 | files.insert(file_name, archive_path); | |
130 | } else if file.is_dir() { | |
131 | dirs.insert(file_name); | |
132 | } else { | |
133 | bail!("read_zip_file: {} is neither a file nor a directory", path.display()); | |
134 | } | |
135 | }; | |
136 | // debug!("{} files={:?}", files.len(), files); | |
137 | // debug!("{} dirs={:?}", dirs.len(), dirs); | |
138 | return Ok::<(), Error>( () ); | |
139 | }); | |
140 | }); | |
141 | } | |
142 | ||
143 | pub fn is_file_shim(path: &Path) -> bool { | |
144 | if FILES.with(|files| files.borrow().is_empty()) { | |
145 | let _ignore_result = initialize_static_vars(); | |
146 | } | |
147 | return FILES.with(|files| files.borrow().contains_key(&canonicalize_path_separators(path)) ); | |
148 | } | |
149 | ||
150 | pub fn is_dir_shim(path: &Path) -> bool { | |
151 | if FILES.with(|files| files.borrow().is_empty()) { | |
152 | let _ignore_result = initialize_static_vars(); | |
153 | } | |
154 | return DIRECTORIES.with(|dirs| dirs.borrow().contains(&canonicalize_path_separators(path)) ); | |
155 | } | |
156 | ||
157 | /// Find files in 'dir' that end with 'ending' (e.g., "_Rules.yaml") | |
158 | pub fn find_files_in_dir_that_ends_with_shim(dir: &Path, ending: &str) -> Vec<String> { | |
159 | // FIX: this is very inefficient because it looks through all the files -- maybe dirs should list the files in them? | |
160 | // look for files that have 'path' as a prefix | |
161 | return FILES.with(|files| { | |
162 | let files = files.borrow(); | |
163 | let mut answer = Vec::new(); | |
164 | ||
165 | let dir_name = canonicalize_path_separators(dir); | |
166 | for file_name in files.keys() { | |
167 | if let Some(dir_relative_name) = file_name.strip_prefix(&dir_name) && | |
168 | file_name.ends_with(ending) | |
169 | { | |
170 | // this could be (e.g.) xxx_Rules.yaml or it could be subdir/xxx_Rules.yaml | |
171 | let file_name = dir_relative_name.split_once("/").map(|(_, after)| after).unwrap_or(dir_relative_name); | |
172 | answer.push(file_name.to_string()); | |
173 | } | |
174 | } | |
175 | // debug!("find_files_in_dir_that_ends_with_shim: in dir '{}' found {:?}", dir.display(), answer); | |
176 | return answer; | |
177 | }); | |
178 | } | |
179 | ||
180 | ||
181 | pub fn find_all_dirs_shim(dir: &Path, found_dirs: &mut Vec<PathBuf> ) { | |
182 | return DIRECTORIES.with(|dirs| { | |
183 | let dirs = dirs.borrow(); | |
184 | ||
185 | let common_dir_name = canonicalize_path_separators(dir); | |
186 | for dir_name in dirs.iter() { | |
187 | if dir_name.starts_with(&common_dir_name) && !dir_name.contains("SharedRules") { | |
188 | found_dirs.push(PathBuf::from(&dir_name)); | |
189 | }; | |
190 | } | |
191 | }); | |
192 | } | |
193 | ||
194 | ||
195 | pub fn canonicalize_shim(path: &Path) -> std::io::Result<PathBuf> { | |
196 | use std::ffi::OsStr; | |
197 | let dot_dot = OsStr::new(".."); | |
198 | let mut result = PathBuf::new(); | |
199 | for part in path.iter() { | |
200 | if dot_dot == part { | |
201 | result.pop(); | |
202 | } else { | |
203 | result.push(part); | |
204 | } | |
205 | } | |
206 | return Ok(result); | |
207 | } | |
208 | ||
209 | /// Read the file at 'path' and return its contents as a String | |
210 | pub fn read_to_string_shim(path: &Path) -> Result<String> { | |
211 | let path = canonicalize_shim(path).unwrap(); // can't fail | |
212 | let file_name = canonicalize_path_separators(&path); | |
213 | // Is this the debugging override? | |
214 | if let Some(contents) = OVERRIDE_FILE_NAME.with(|override_name| { | |
215 | if file_name == override_name.borrow().as_str() { | |
216 | // debug!("override read_to_string_shim: {}",file_name); | |
217 | return OVERRIDE_FILE_CONTENTS.with(|contents| return Some(contents.borrow().clone())); | |
218 | } else { | |
219 | return None; | |
220 | } | |
221 | }) { | |
222 | return Ok(contents); | |
223 | }; | |
224 | ||
225 | let file_name = file_name.replace('\\', "/"); // zip files always use forward slash | |
226 | // top-level zip file or embedded zip file | |
227 | return FILES.with(|files| { | |
228 | let files = files.borrow(); | |
229 | let inner_bytes; | |
230 | let (bytes, index) = match files.get(&file_name) { | |
231 | Some(archive_path) => { | |
232 | match &archive_path.zipped { | |
233 | None => (ZIPPED_RULE_FILES, archive_path.main), | |
234 | Some(i) => { | |
235 | // debug!("read_to_string_shim: reading embedded zip file {} at index {}", file_name, *i); | |
236 | let mut archive = get_zip_archive(ZIPPED_RULE_FILES)?; | |
237 | inner_bytes = get_bytes_from_index(&mut archive, archive_path.main)?; // need to hold temp value | |
238 | (inner_bytes.as_slice(), *i) | |
239 | } | |
240 | } | |
241 | }, | |
242 | None => bail!("read_to_string_shim: didn't find {} in zip archive", file_name), | |
243 | }; | |
244 | let mut archive = get_zip_archive(bytes)?; | |
245 | let mut file = match archive.by_index(index) { | |
246 | Ok(file) => { | |
247 | // debug!("read_to_string_shim: want {}; name of zipped file={:?}", file_name, file.enclosed_name().unwrap()); | |
248 | file | |
249 | }, | |
250 | Err(..) => { | |
251 | bail!("Didn't find {} in zip archive", file_name); | |
252 | } | |
253 | }; | |
254 | ||
255 | let mut contents = String::new(); | |
256 | if let Err(e) = file.read_to_string(&mut contents) { | |
257 | bail!("read_to_string: {}", e); | |
258 | } | |
259 | return Ok(contents); | |
260 | }); | |
261 | } | |
262 | ||
263 | pub fn zip_extract_shim(dir: &Path, zip_file_name: &str) -> Result<bool> { | |
264 | let zip_file_path = dir.join(zip_file_name); | |
265 | let full_zip_file_name = canonicalize_path_separators(&zip_file_path); | |
266 | match FILES.with(|files| files.borrow().contains_key(full_zip_file_name.as_str()) ) { | |
267 | true => Ok(true), | |
268 | false => bail!("zip_extract_shim: didn't find {} in zip archive", full_zip_file_name), | |
269 | } | |
270 | } | |
271 | ||
272 | thread_local! { | |
273 | // For debugging rules files (mainly nav file) via MathCATDemo | |
274 | static OVERRIDE_FILE_NAME: RefCell<String> = RefCell::new("".to_string()); | |
275 | static OVERRIDE_FILE_CONTENTS: RefCell<String> = RefCell::new("".to_string()); | |
276 | } | |
277 | pub fn override_file_for_debugging_rules(file_name: &str, file_contents: &str) { | |
278 | // file_name should be path name starting at Rules dir: e.g, "Rules/en/navigate.yaml" | |
279 | OVERRIDE_FILE_NAME.with(|name| *name.borrow_mut() = file_name.to_string().replace("/", "\\")); | |
280 | OVERRIDE_FILE_CONTENTS.with(|contents| *contents.borrow_mut() = file_contents.to_string()); | |
281 | crate::interface::set_rules_dir("Rules").unwrap(); // force reinitialization after the change | |
282 | } | |
283 | } else { | |
284 | 116k | pub fn is_file_shim(path: &Path) -> bool { |
285 | 116k | return path.is_file(); |
286 | 116k | } |
287 | ||
288 | 135k | pub fn is_dir_shim(path: &Path) -> bool { |
289 | 135k | return path.is_dir(); |
290 | 135k | } |
291 | ||
292 | 12.9k | pub fn find_files_in_dir_that_ends_with_shim(dir: &Path, ending: &str) -> Vec<String> { |
293 | 12.9k | match dir.read_dir() { |
294 | 0 | Err(_) => return vec![], // empty |
295 | 12.9k | Ok(read_dir) => { |
296 | 12.9k | let mut answer = Vec::new(); |
297 | 78.4k | for dir_entry in |
298 | 78.4k | let file_name = dir_entry.file_name(); |
299 | 78.4k | let file_name = file_name.to_string_lossy().to_string(); |
300 | 78.4k | if file_name.ends_with(ending) { |
301 | // this could be (e.g.) xxx_Rules.yaml or it could be subdir/xxx_Rules.yaml | |
302 | 64.4k | let file_name = file_name.split_once(std::path::MAIN_SEPARATOR).map(|(_, after)| after).unwrap_or(&file_name); |
303 | 64.4k | answer.push( file_name.to_string() ); |
304 | 14.0k | } |
305 | } | |
306 | 12.9k | return answer; |
307 | } | |
308 | } | |
309 | 12.9k | } |
310 | ||
311 | 2.90k | pub fn find_all_dirs_shim(dir: &Path, found_dirs: &mut Vec<PathBuf> ) { |
312 | // FIX: this doesn't work for subdirectories that haven't been unzipped yet | |
313 | 2.90k | assert!(dir.is_dir(), "find_all_dirs_shim called with non-directory path: {}", |
314 | 2.90k | let mut found_rules_file = false; |
315 | 2.90k | if let Ok(entries) = std::fs::read_dir(dir) { |
316 | 13.0k | for entry in |
317 | 13.0k | let path = entry.path(); |
318 | 13.0k | if path.is_dir() { |
319 | // skip "SharedRules" directory | |
320 | 2.90k | if let Some(dir_name) = path.file_name() && |
321 | 2.90k | dir_name.to_str().unwrap_or_default() != "SharedRules" { |
322 | 1.45k | find_all_dirs_shim(&path, found_dirs); |
323 | 1.45k |
|
324 | } else { | |
325 | 10.1k | let file_name = path.file_name().unwrap_or_default().to_str().unwrap_or_default(); |
326 | 10.1k | if !found_rules_file && |
327 | 2.91k | (file_name.starts_with("unicode") || |
328 | 1.46k | found_dirs.push(path.parent().unwrap().to_path_buf()); |
329 | // FIX: hack to get around not unzipping files and having zh/tw not found | |
330 | 1.46k | if file_name == "zh.zip" { |
331 | 0 | let tw_dir = path.parent().unwrap().join("tw"); |
332 | 0 | if !found_dirs.contains(&tw_dir) { |
333 | 0 | found_dirs.push(tw_dir.to_path_buf()); |
334 | 0 | } |
335 | 1.46k | } |
336 | 1.46k | found_rules_file = true; |
337 | 8.71k | } |
338 | } | |
339 | } | |
340 | 0 | } |
341 | 2.90k | } |
342 | ||
343 | /// Resolves the path to an absolute, canonical form using the OS. | |
344 | /// If `canonicalize()` fails (e.g., ACCESS_DENIED in containers), falls back to: | |
345 | /// - returning the path as-is if it is already absolute, | |
346 | /// - prepending the current working directory if it is relative. | |
347 | /// Note: the fallback does not resolve symlinks or normalize `..`/`.` segments. | |
348 | 73.1k | pub fn canonicalize_shim(path: &Path) -> std::io::Result<PathBuf> { |
349 | 73.1k | match path.canonicalize() { |
350 | 73.1k | Ok(p) => Ok(p), |
351 | Err(_) => { | |
352 | 0 | if path.is_absolute() { |
353 | 0 | Ok(path.to_path_buf()) |
354 | } else { | |
355 | // Prepend cwd to make the relative path absolute. | |
356 | // unwrap_or_default yields an empty PathBuf if cwd is unavailable, | |
357 | // in which case the returned path will still be relative. | |
358 | 0 | Ok(std::env::current_dir().unwrap_or_default().join(path)) |
359 | } | |
360 | } | |
361 | } | |
362 | 73.1k | } |
363 | ||
364 | 60.5k | pub fn read_to_string_shim(path: &Path) -> Result<String> { |
365 | 60.5k | let path = match path.canonicalize() { |
366 | 60.5k | Ok(path) => path, |
367 | 0 | Err(_) => path.to_path_buf(), |
368 | }; | |
369 | 60.5k | debug!("Reading file '{}'", |
370 | 60.5k | match std::fs::read_to_string(&path) { |
371 | 60.5k | Ok(str) => return Ok(str), |
372 | 0 | Err(e) => bail!("Read error while trying to read {}: {}", &path.display(), e), |
373 | } | |
374 | 60.5k | } |
375 | ||
376 | 12.6k | pub fn zip_extract_shim(dir: &Path, zip_file_name: &str) -> Result<bool> { |
377 | 12.6k | let zip_file = dir.join(zip_file_name); |
378 | 12.6k | return match std::fs::read(zip_file) { |
379 | 12.6k | Err(e) => { |
380 | // no zip file? -- maybe started out with all the files unzipped? See if there is a .yaml file | |
381 | 12.6k | let yaml_files = find_files_in_dir_that_ends_with_shim(dir, ".yaml"); |
382 | 12.6k | if yaml_files.is_empty() { |
383 | 1.44k | bail!("{}", e) |
384 | } else { | |
385 | 11.2k | Ok(false) |
386 | } | |
387 | }, | |
388 | 0 | Ok(contents) => { |
389 | 0 | let archive = std::io::Cursor::new(contents); |
390 | 0 | let mut zip_archive = zip::ZipArchive::new(archive).unwrap(); |
391 | 0 | zip_archive.extract(dir).expect("Zip extraction failed"); |
392 | 0 | Ok(true) |
393 | }, | |
394 | }; | |
395 | 12.6k | } |
396 | } | |
397 | } |
Line | Count | Source |
1 | //! The speech module is where the speech rules are read in and speech generated. | |
2 | //! | |
3 | //! The speech rules call out to the preferences and tts modules and the dividing line is not always clean. | |
4 | //! A number of useful utility functions used by other modules are defined here. | |
5 | #![allow(clippy::needless_return)] | |
6 | use std::path::PathBuf; | |
7 | use std::collections::HashMap; | |
8 | use std::cell::{RefCell, RefMut}; | |
9 | use std::sync::LazyLock; | |
10 | use sxd_document::dom::{ChildOfElement, Document, Element}; | |
11 | use sxd_document::{Package, QName}; | |
12 | use sxd_xpath::context::Evaluation; | |
13 | use sxd_xpath::{Factory, Value, XPath}; | |
14 | use sxd_xpath::nodeset::Node; | |
15 | use std::fmt; | |
16 | use std::time::SystemTime; | |
17 | use crate::definitions::read_definitions_file; | |
18 | use crate::errors::*; | |
19 | use crate::prefs::*; | |
20 | use crate::xpath_functions::is_leaf; | |
21 | use yaml_rust::{YamlLoader, Yaml, yaml::Hash}; | |
22 | use crate::tts::*; | |
23 | use crate::infer_intent::*; | |
24 | use crate::pretty_print::{mml_to_string, yaml_to_string}; | |
25 | use std::path::Path; | |
26 | use std::rc::Rc; | |
27 | use crate::shim_filesystem::{read_to_string_shim, canonicalize_shim}; | |
28 | use crate::canonicalize::{as_element, create_mathml_element, set_mathml_name, name, MATHML_FROM_NAME_ATTR}; | |
29 | use regex::Regex; | |
30 | use log::{debug, error, info}; | |
31 | ||
32 | ||
33 | pub const NAV_NODE_SPEECH_NOT_FOUND: &str = "NAV_NODE_NOT_FOUND"; | |
34 | ||
35 | /// Like lisp's ' (quote foo), this is used to block "replace_chars" being called. | |
36 | /// Unlike lisp, this appended to the end of a string (more efficient) | |
37 | /// At the moment, the only use is BrailleChars(...) -- internally, it calls replace_chars and we don't want it called again. | |
38 | /// Note: an alternative to this hack is to add "xq" (execute but don't eval the result), but that's heavy-handed for the current need | |
39 | const NO_EVAL_QUOTE_CHAR: char = '\u{efff}'; // a private space char | |
40 | const NO_EVAL_QUOTE_CHAR_AS_BYTES: [u8;3] = [0xee,0xbf,0xbf]; | |
41 | const N_BYTES_NO_EVAL_QUOTE_CHAR: usize = NO_EVAL_QUOTE_CHAR.len_utf8(); | |
42 | ||
43 | /// Converts 'string' into a "quoted" string -- use is_quoted_string and unquote_string | |
44 | 12.5k | pub fn make_quoted_string(mut string: String) -> String { |
45 | 12.5k | string.push(NO_EVAL_QUOTE_CHAR); |
46 | 12.5k | return string; |
47 | 12.5k | } |
48 | ||
49 | /// Checks the string to see if it is "quoted" | |
50 | 58.0k | pub fn is_quoted_string(str: &str) -> bool { |
51 | 58.0k | if str.len() < N_BYTES_NO_EVAL_QUOTE_CHAR { |
52 | 34.1k | return false; |
53 | 23.9k | } |
54 | 23.9k | let bytes = str.as_bytes(); |
55 | 23.9k | return bytes[bytes.len()-N_BYTES_NO_EVAL_QUOTE_CHAR..] == NO_EVAL_QUOTE_CHAR_AS_BYTES; |
56 | 58.0k | } |
57 | ||
58 | /// Converts 'string' into a "quoted" string -- use is_quoted_string and unquote_string | |
59 | /// IMPORTANT: this assumes the string is quoted -- no check is made | |
60 | 12.5k | pub fn unquote_string(str: &str) -> &str { |
61 | 12.5k | return &str[..str.len()-N_BYTES_NO_EVAL_QUOTE_CHAR]; |
62 | 12.5k | } |
63 | ||
64 | ||
65 | /// The main external call, `intent_from_mathml` returns a string for the speech associated with the `mathml`. | |
66 | /// It matches against the rules that are computed by user prefs such as "Language" and "SpeechStyle". | |
67 | /// | |
68 | /// The speech rules assume `mathml` has been "cleaned" via the canonicalization step. | |
69 | /// | |
70 | /// If the preferences change (and hence the speech rules to use change), or if the rule file changes, | |
71 | /// `intent_from_mathml` will detect that and (re)load the proper rules. | |
72 | /// | |
73 | /// A string is returned in call cases. | |
74 | /// If there is an error, the speech string will indicate an error. | |
75 | 3.88k | pub fn intent_from_mathml<'m>(mathml: Element, doc: Document<'m>) -> Result<Element<'m>> { |
76 | 3.88k | let |
77 | 3.87k | doc.root().append_child(intent_tree); |
78 | 3.87k | return Ok(intent_tree); |
79 | 3.88k | } |
80 | ||
81 | 3.96k | pub fn speak_mathml(mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
82 | 3.96k | return speak_rules(&SPEECH_RULES, mathml, nav_node_id, nav_node_offset); |
83 | 3.96k | } |
84 | ||
85 | 14 | pub fn overview_mathml(mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
86 | 14 | return speak_rules(&OVERVIEW_RULES, mathml, nav_node_id, nav_node_offset); |
87 | 14 | } |
88 | ||
89 | ||
90 | 3.88k | fn intent_rules<'m>(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, doc: Document<'m>, mathml: Element, nav_node_id: &'m str) -> Result<Element<'m>> { |
91 | 3.88k | rules.with(|rules| { |
92 | 3.88k | rules.borrow_mut().read_files() |
93 | 3.88k | let rules = rules.borrow(); |
94 | // debug!("intent_rules:\n{}", mml_to_string(mathml)); | |
95 | 3.88k | let should_set_literal_intent = rules.pref_manager.borrow().pref_to_string("SpeechStyle").as_str() == "LiteralSpeak"; |
96 | 3.88k | let original_intent = mathml.attribute_value("intent"); |
97 | 3.88k | if should_set_literal_intent { |
98 | 10 | if let Some( |
99 | 4 | let intent = if intent.contains('(') { |
100 | 4 | mathml.set_attribute_value("intent", &intent); |
101 | 6 | } else { |
102 | 6 | mathml.set_attribute_value("intent", ":literal"); |
103 | 6 | }; |
104 | 3.87k | } |
105 | 3.88k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, doc, nav_node_id, 0); |
106 | 3.88k | let |
107 | 3.88k | .context("Pattern match/replacement failure!") |
108 | 3.87k | let answer = if name(intent) == "TEMP_NAME" { // unneeded extra layer |
109 | 0 | assert_eq!(intent.children().len(), 1); |
110 | 0 | as_element(intent.children()[0]) |
111 | } else { | |
112 | 3.87k | intent |
113 | }; | |
114 | 3.87k | if should_set_literal_intent { |
115 | 10 | if let Some( |
116 | 4 | mathml.set_attribute_value("intent", original_intent); |
117 | 6 | } else { |
118 | 6 | mathml.remove_attribute("intent"); |
119 | 6 | } |
120 | 3.86k | } |
121 | 3.87k | return Ok(answer); |
122 | 3.88k | }) |
123 | 3.88k | } |
124 | ||
125 | /// Speak the MathML | |
126 | /// If 'nav_node_id' is not an empty string, then the element with that id will have [[...]] around it | |
127 | 3.98k | fn speak_rules(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
128 | 3.98k | return rules.with(|rules| { |
129 | 3.98k | rules.borrow_mut().read_files() |
130 | 3.98k | let rules = rules.borrow(); |
131 | // debug!("speak_rules:\n{}", mml_to_string(mathml)); | |
132 | 3.98k | let new_package = Package::new(); |
133 | 3.98k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id, nav_node_offset); |
134 | 3.98k | let |
135 | 3.98k | return Ok( rules.pref_manager.borrow().get_tts() |
136 | 3.98k | .merge_pauses(remove_optional_indicators( |
137 | 3.98k | &speech_string.replace(CONCAT_STRING, "") |
138 | 3.98k | .replace(CONCAT_INDICATOR, "") |
139 | 3.98k | ) |
140 | 3.98k | .trim_start().trim_end_matches([' ', ',', ';'])) ); |
141 | 3.98k | }); |
142 | ||
143 | 3.99k | fn nestable_speak_rules<'c, 's:'c, 'm:'c>(rules_with_context: &mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
144 | 3.99k | let mut speech_string = rules_with_context.match_pattern::<String>(mathml) |
145 | 3.99k | .context("Pattern match/replacement failure!") |
146 | // Note: [[...]] is added around a matching child, but if the "id" is on 'mathml', the whole string is used | |
147 | 3.99k | if !rules_with_context.nav_node_id.is_empty() { |
148 | // See https://github.com/NSoiffer/MathCAT/issues/174 for why we can just start the speech at the nav node | |
149 | 536 | let intent_attr = mathml.attribute_value("data-intent-property").unwrap_or_default(); |
150 | 536 | if let Some( |
151 | 521 | match speech_string[start+2..].find("]]") { |
152 | 0 | None => bail!("Internal error: looking for '[[...]]' during navigation -- only found '[[' in '{}'", speech_string), |
153 | 521 | Some(end) => speech_string = speech_string[start+2..start+2+end].to_string(), |
154 | } | |
155 | 15 | } else if !intent_attr.contains(":literal:") { |
156 | // try again with LiteralSpeak -- some parts might have been elided in other SpeechStyles | |
157 | 14 | mathml.set_attribute_value("data-intent-property", (":literal:".to_string() + intent_attr).as_str()); |
158 | 14 | let speech = nestable_speak_rules(rules_with_context, mathml); |
159 | 14 | mathml.set_attribute_value("data-intent-property", intent_attr); |
160 | 14 | return speech; |
161 | } else { | |
162 | 1 | bail!(NAV_NODE_SPEECH_NOT_FOUND); // NAV_NODE_SPEECH_NOT_FOUND is tested for later |
163 | } | |
164 | 3.46k | } |
165 | 3.98k | return Ok(speech_string); |
166 | 3.99k | } |
167 | 3.98k | } |
168 | ||
169 | /// Converts its argument to a string that can be used in a debugging message. | |
170 | 0 | pub fn yaml_to_type(yaml: &Yaml) -> String { |
171 | 0 | return match yaml { |
172 | 0 | Yaml::Real(v)=> format!("real='{v:#}'"), |
173 | 0 | Yaml::Integer(v)=> format!("integer='{v:#}'"), |
174 | 0 | Yaml::String(v)=> format!("string='{v:#}'"), |
175 | 0 | Yaml::Boolean(v)=> format!("boolean='{v:#}'"), |
176 | 0 | Yaml::Array(v)=> match v.len() { |
177 | 0 | 0 => "array with no entries".to_string(), |
178 | 0 | 1 => format!("array with the entry: {}", yaml_to_type(&v[0])), |
179 | 0 | _ => format!("array with {} entries. First entry: {}", v.len(), yaml_to_type(&v[0])), |
180 | } | |
181 | 0 | Yaml::Hash(h)=> { |
182 | 0 | let first_pair = |
183 | 0 | if h.is_empty() { |
184 | 0 | "no pairs".to_string() |
185 | } else { | |
186 | 0 | let (key, val) = h.iter().next().unwrap(); |
187 | 0 | format!("({}, {})", yaml_to_type(key), yaml_to_type(val)) |
188 | }; | |
189 | 0 | format!("dictionary with {} pair{}. A pair: {}", h.len(), if h.len()==1 {""} else {"s"}, first_pair) |
190 | } | |
191 | 0 | Yaml::Alias(_)=> "Alias".to_string(), |
192 | 0 | Yaml::Null=> "Null".to_string(), |
193 | 0 | Yaml::BadValue=> "BadValue".to_string(), |
194 | } | |
195 | 0 | } |
196 | ||
197 | 0 | fn yaml_type_err(yaml: &Yaml, str: &str) -> Error { |
198 | 0 | anyhow!("Expected {}, found {}", str, yaml_to_type(yaml)) |
199 | 0 | } |
200 | ||
201 | // fn yaml_key_err(dict: &Yaml, key: &str, yaml_type: &str) -> String { | |
202 | // if dict.as_hash().is_none() { | |
203 | // return format!("Expected dictionary with key '{}', found\n{}", key, yaml_to_string(dict, 1)); | |
204 | // } | |
205 | // let str = &dict[key]; | |
206 | // if str.is_badvalue() { | |
207 | // return format!("Did not find '{}' in\n{}", key, yaml_to_string(dict, 1)); | |
208 | // } | |
209 | // return format!("Type of '{}' is not a {}.\nIt is a {}. YAML value is\n{}", | |
210 | // key, yaml_type, yaml_to_type(str), yaml_to_string(dict, 0)); | |
211 | // } | |
212 | ||
213 | 4.86M | fn find_str<'a>(dict: &'a Yaml, key: &'a str) -> Option<&'a str> { |
214 | 4.86M | return dict[key].as_str(); |
215 | 4.86M | } |
216 | ||
217 | /// Returns the Yaml as a `Hash` or an error if it isn't. | |
218 | 175k | pub fn as_hash_checked(value: &Yaml) -> Result<&Hash> { |
219 | 175k | let result = value.as_hash(); |
220 | 175k | let result = result.ok_or_else(|| |
221 | 175k | return Ok( result ); |
222 | 175k | } |
223 | ||
224 | /// Returns the Yaml as a `Vec` or an error if it isn't. | |
225 | 11.7k | pub fn as_vec_checked(value: &Yaml) -> Result<&Vec<Yaml>> { |
226 | 11.7k | let result = value.as_vec(); |
227 | 11.7k | let result = result.ok_or_else(|| |
228 | 11.7k | return Ok( result ); |
229 | 11.7k | } |
230 | ||
231 | /// Returns the Yaml as a `&str` or an error if it isn't. | |
232 | 8.08M | pub fn as_str_checked(yaml: &Yaml) -> Result<&str> { |
233 | 8.08M | return yaml.as_str().ok_or_else(|| |
234 | 8.08M | } |
235 | ||
236 | ||
237 | /// A bit of a hack to concatenate replacements (without a ' '). | |
238 | /// The CONCAT_INDICATOR is added by a "ct:" (instead of 't:') in the speech rules | |
239 | /// and checked for by the tts code. | |
240 | pub const CONCAT_INDICATOR: &str = "\u{F8FE}"; | |
241 | ||
242 | // This is the pattern that needs to be matched (and deleted) | |
243 | pub const CONCAT_STRING: &str = " \u{F8FE}"; | |
244 | ||
245 | // a similar hack to potentially delete (repetitive) optional replacements | |
246 | // the OPTIONAL_INDICATOR is added by "ot:" before and after the optional string | |
247 | const OPTIONAL_INDICATOR: &str = "\u{F8FD}"; | |
248 | const OPTIONAL_INDICATOR_LEN: usize = OPTIONAL_INDICATOR.len(); | |
249 | ||
250 | 5.10k | pub fn remove_optional_indicators(str: &str) -> String { |
251 | 5.10k | return str.replace(OPTIONAL_INDICATOR, ""); |
252 | 5.10k | } |
253 | ||
254 | /// Given a string that should be Yaml, it calls `build_fn` with that string. | |
255 | /// The build function/closure should process the Yaml as appropriate and capture any errors and write them to `std_err`. | |
256 | /// The returned value should be a Vector containing the paths of all the files that were included. | |
257 | 56.3k | pub fn compile_rule<F>(str: &str, mut build_fn: F) -> Result<Vec<PathBuf>> where |
258 | 56.3k | F: FnMut(&Yaml) -> Result<Vec<PathBuf>> { |
259 | 56.3k | let docs = YamlLoader::load_from_str(str); |
260 | 56.3k | match docs { |
261 | 0 | Err(e) => { |
262 | 0 | bail!("Parse error!!: {}", e); |
263 | }, | |
264 | 56.3k | Ok(docs) => { |
265 | 56.3k | if docs.len() != 1 { |
266 | 0 | bail!("Didn't find rules!"); |
267 | 56.3k | } |
268 | 56.3k | return build_fn(&docs[0]); |
269 | } | |
270 | } | |
271 | 56.3k | } |
272 | ||
273 | 36.5k | pub fn process_include<F>(current_file: &Path, new_file_name: &str, mut read_new_file: F) -> Result<Vec<PathBuf>> |
274 | 36.5k | where F: FnMut(&Path) -> Result<Vec<PathBuf>> { |
275 | 36.5k | let parent_path = current_file.parent(); |
276 | 36.5k | if parent_path.is_none() { |
277 | 0 | bail!("Internal error: {:?} is not a valid file name", current_file); |
278 | 36.5k | } |
279 | 36.5k | let mut new_file = match canonicalize_shim(parent_path.unwrap()) { |
280 | 36.5k | Ok(path) => path, |
281 | 0 | Err(e) => bail!("process_include: canonicalize failed for {} with message {}", parent_path.unwrap().display(), e), |
282 | }; | |
283 | ||
284 | // the referenced file might be in a directory that hasn't been zipped up -- find the dir and call the unzip function | |
285 | 89.0k | for unzip_dir in |
286 | 89.0k | if unzip_dir.ends_with("Rules") { |
287 | 36.5k | break; // nothing to unzip |
288 | 52.4k | } |
289 | 52.4k | if unzip_dir.ends_with("Languages") || |
290 | // get the subdir ...Rules/Braille/en/... | |
291 | // could have ...Rules/Braille/definitions.yaml, so 'next()' doesn't exist in this case, but the file wasn't zipped up | |
292 | 26.0k | if let Some( |
293 | 24.9k | let default_lang = if unzip_dir.ends_with("Languages") { |
294 | 24.9k | PreferenceManager::unzip_files(unzip_dir, subdir.to_str().unwrap(), Some(default_lang)).unwrap_or_default(); |
295 | 1.06k | } |
296 | 26.4k | } |
297 | } | |
298 | 36.5k | new_file.push(new_file_name); |
299 | 36.5k | info!("...processing include: {new_file_name}..."); |
300 | 36.5k | let new_file = match crate::shim_filesystem::canonicalize_shim(new_file.as_path()) { |
301 | 36.5k | Ok(buf) => buf, |
302 | 0 | Err(msg) => bail!("-include: constructed file name '{}' causes error '{}'", |
303 | 0 | new_file.to_str().unwrap(), msg), |
304 | }; | |
305 | ||
306 | 36.5k | let mut included_files = read_new_file(new_file.as_path()) |
307 | 36.5k | let mut files_read = vec![new_file]; |
308 | 36.5k | files_read.append(&mut included_files); |
309 | 36.5k | return Ok(files_read); |
310 | 36.5k | } |
311 | ||
312 | /// As the name says, TreeOrString is either a Tree (Element) or a String | |
313 | /// It is used to share code during pattern matching | |
314 | pub trait TreeOrString<'c, 'm:'c, T> { | |
315 | fn from_element(e: Element<'m>) -> Result<T>; | |
316 | fn from_string(s: String, doc: Document<'m>) -> Result<T>; | |
317 | fn replace_tts<'s:'c, 'r>(tts: &TTS, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T>; | |
318 | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T>; | |
319 | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<T>; | |
320 | fn highlight_braille(braille: T, highlight_style: String) -> T; | |
321 | fn mark_nav_speech(speech: T) -> T; | |
322 | } | |
323 | ||
324 | impl<'c, 'm:'c> TreeOrString<'c, 'm, String> for String { | |
325 | 0 | fn from_element(_e: Element<'m>) -> Result<String> { |
326 | 0 | bail!("from_element not allowed for strings"); |
327 | 0 | } |
328 | ||
329 | 180k | fn from_string(s: String, _doc: Document<'m>) -> Result<String> { |
330 | 180k | return Ok(s); |
331 | 180k | } |
332 | ||
333 | 60.7k | fn replace_tts<'s:'c, 'r>(tts: &TTS, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
334 | 60.7k | return tts.replace_string(command, prefs, rules_with_context, mathml); |
335 | 60.7k | } |
336 | ||
337 | 142k | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
338 | 142k | return ra.replace_array_string(rules_with_context, mathml); |
339 | 142k | } |
340 | ||
341 | 72.9k | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<String> { |
342 | 72.9k | return rules.replace_nodes_string(nodes, mathml); |
343 | 72.9k | } |
344 | ||
345 | 469 | fn highlight_braille(braille: String, highlight_style: String) -> String { |
346 | 469 | return SpeechRulesWithContext::highlight_braille_string(braille, highlight_style); |
347 | 469 | } |
348 | ||
349 | 521 | fn mark_nav_speech(speech: String) -> String { |
350 | 521 | return SpeechRulesWithContext::mark_nav_speech(speech); |
351 | 521 | } |
352 | } | |
353 | ||
354 | impl<'c, 'm:'c> TreeOrString<'c, 'm, Element<'m>> for Element<'m> { | |
355 | 48.0k | fn from_element(e: Element<'m>) -> Result<Element<'m>> { |
356 | 48.0k | return Ok(e); |
357 | 48.0k | } |
358 | ||
359 | 213 | fn from_string(s: String, doc: Document<'m>) -> Result<Element<'m>> { |
360 | // FIX: is 'mi' really ok? Don't want to use TEMP_NAME because this name needs to move to the outside world | |
361 | 213 | let leaf = create_mathml_element(&doc, "mi"); |
362 | 213 | leaf.set_text(&s); |
363 | 213 | return Ok(leaf); |
364 | 213 | } |
365 | ||
366 | 0 | fn replace_tts<'s:'c, 'r>(_tts: &TTS, _command: &TTSCommandRule, _prefs: &PreferenceManager, _rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, _mathml: Element<'c>) -> Result<Element<'m>> { |
367 | 0 | bail!("Internal error: applying a TTS rule to a tree"); |
368 | 0 | } |
369 | ||
370 | 132k | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
371 | 132k | return ra.replace_array_tree(rules_with_context, mathml); |
372 | 132k | } |
373 | ||
374 | 48.6k | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<Element<'m>> { |
375 | 48.6k | return rules.replace_nodes_tree(nodes, mathml); |
376 | 48.6k | } |
377 | ||
378 | 0 | fn highlight_braille(_braille: Element<'c>, _highlight_style: String) -> Element<'m> { |
379 | 0 | panic!("Internal error: highlight_braille called on a tree"); |
380 | } | |
381 | ||
382 | 0 | fn mark_nav_speech(_speech: Element<'c>) -> Element<'m> { |
383 | 0 | panic!("Internal error: mark_nav_speech called on a tree"); |
384 | } | |
385 | } | |
386 | ||
387 | /// 'Replacement' is an enum that contains all the potential replacement types/structs | |
388 | /// Hence there are fields 'Test' ("test:"), 'Text" ("t:"), "XPath", etc | |
389 | #[derive(Debug, Clone)] | |
390 | #[allow(clippy::upper_case_acronyms)] | |
391 | enum Replacement { | |
392 | // Note: all of these are pointer types | |
393 | Text(String), | |
394 | XPath(MyXPath), | |
395 | Intent(Box<Intent>), | |
396 | Test(Box<TestArray>), | |
397 | TTS(Box<TTSCommandRule>), | |
398 | With(Box<With>), | |
399 | SetVariables(Box<SetVariables>), | |
400 | Insert(Box<InsertChildren>), | |
401 | Translate(TranslateExpression), | |
402 | } | |
403 | ||
404 | impl fmt::Display for Replacement { | |
405 | 10 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
406 | 10 | return write!(f, "{}", |
407 | 10 | match self { |
408 | 0 | Replacement::Test(c) => c.to_string(), |
409 | 0 | Replacement::Text(t) => format!("t: \"{t}\""), |
410 | 10 | Replacement::XPath(x) => x.to_string(), |
411 | 0 | Replacement::Intent(i) => i.to_string(), |
412 | 0 | Replacement::TTS(t) => t.to_string(), |
413 | 0 | Replacement::With(w) => w.to_string(), |
414 | 0 | Replacement::SetVariables(v) => v.to_string(), |
415 | 0 | Replacement::Insert(ic) => ic.to_string(), |
416 | 0 | Replacement::Translate(x) => x.to_string(), |
417 | } | |
418 | ); | |
419 | 10 | } |
420 | } | |
421 | ||
422 | impl Replacement { | |
423 | 13.5M | fn build(replacement: &Yaml) -> Result<Replacement> { |
424 | // Replacement -- single key/value (see below for allowed values) | |
425 | 13.5M | let dictionary = replacement.as_hash(); |
426 | 13.5M | if dictionary.is_none() { |
427 | 0 | bail!(" expected a key/value pair. Found {}.", yaml_to_string(replacement, 0)); |
428 | 13.5M | }; |
429 | 13.5M | let dictionary = dictionary.unwrap(); |
430 | 13.5M | if dictionary.is_empty() { |
431 | 0 | bail!("No key/value pairs found for key 'replace'.\n\ |
432 | Suggestion: are the following lines indented properly?"); | |
433 | 13.5M | } |
434 | 13.5M | if dictionary.len() > 1 { |
435 | 0 | bail!("Should only be one key/value pair for the replacement.\n \ |
436 | Suggestion: are the following lines indented properly?\n \ | |
437 | 0 | The key/value pairs found are\n{}", yaml_to_string(replacement, 2)); |
438 | 13.5M | } |
439 | ||
440 | // get the single value | |
441 | 13.5M | let (key, value) = dictionary.iter().next().unwrap(); |
442 | 13.5M | let key = key.as_str().ok_or_else(|| |
443 | 13.5M | match key { |
444 | 13.5M | "t" | |
445 | 5.76M | return Ok( Replacement::Text( as_str_checked(value) |
446 | }, | |
447 | 7.75M | "ct" | |
448 | 24.7k | return Ok( Replacement::Text( CONCAT_INDICATOR.to_string() + as_str_checked(value) |
449 | }, | |
450 | 7.72M | "ot" | |
451 | 36.2k | return Ok( Replacement::Text( OPTIONAL_INDICATOR.to_string() + as_str_checked(value) |
452 | }, | |
453 | 7.69M | "x" => { |
454 | 2.27M | return Ok( Replacement::XPath( MyXPath::build(value) |
455 | 2.27M | .context("while trying to evaluate value of 'x:'") |
456 | }, | |
457 | 5.41M | "pause" | |
458 | 2.41M | return Ok( Replacement::TTS( TTS::build(&key.to_ascii_lowercase(), value) |
459 | }, | |
460 | 2.99M | "intent" => { |
461 | 283k | return Ok( Replacement::Intent( Intent::build(value) |
462 | }, | |
463 | 2.71M | "test" => { |
464 | 2.58M | return Ok( Replacement::Test( Box::new( TestArray::build(value) |
465 | }, | |
466 | 128k | "with" => { |
467 | 77.5k | return Ok( Replacement::With( With::build(value) |
468 | }, | |
469 | 51.3k | "set_variables" => { |
470 | 30.3k | return Ok( Replacement::SetVariables( SetVariables::build(value) |
471 | }, | |
472 | 21.0k | "insert" => { |
473 | 20.9k | return Ok( Replacement::Insert( InsertChildren::build(value) |
474 | }, | |
475 | 102 | "translate" => { |
476 | 102 | return Ok( Replacement::Translate( TranslateExpression::build(value) |
477 | 102 | .context("while trying to evaluate value of 'speak:'") |
478 | }, | |
479 | _ => { | |
480 | 0 | bail!("Unknown 'replace' command ({}) with value: {}", key, yaml_to_string(value, 0)); |
481 | } | |
482 | } | |
483 | 13.5M | } |
484 | } | |
485 | ||
486 | // structure used when "insert:" is encountered in a rule | |
487 | // the 'replacements' are inserted between each node in the 'xpath' | |
488 | #[derive(Debug, Clone)] | |
489 | struct InsertChildren { | |
490 | xpath: MyXPath, // the replacement nodes | |
491 | replacements: ReplacementArray, // what is inserted between each node | |
492 | } | |
493 | ||
494 | impl fmt::Display for InsertChildren { | |
495 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
496 | 0 | return write!(f, "InsertChildren:\n nodes {}\n replacements {}", self.xpath, &self.replacements); |
497 | 0 | } |
498 | } | |
499 | ||
500 | impl InsertChildren { | |
501 | 20.9k | fn build(insert: &Yaml) -> Result<Box<InsertChildren>> { |
502 | // 'insert:' -- 'nodes': xxx 'replace': xxx | |
503 | 20.9k | if insert.as_hash().is_none() { |
504 | 0 | bail!("") |
505 | 20.9k | } |
506 | 20.9k | let nodes = &insert["nodes"]; |
507 | 20.9k | if nodes.is_badvalue() { |
508 | 0 | bail!("Missing 'nodes' as part of 'insert'.\n \ |
509 | Suggestion: add 'nodes:' or if present, indent so it is contained in 'insert'"); | |
510 | 20.9k | } |
511 | 20.9k | let nodes = as_str_checked(nodes) |
512 | 20.9k | let replace = &insert["replace"]; |
513 | 20.9k | if replace.is_badvalue() { |
514 | 0 | bail!("Missing 'replace' as part of 'insert'.\n \ |
515 | Suggestion: add 'replace:' or if present, indent so it is contained in 'insert'"); | |
516 | 20.9k | } |
517 | 20.9k | return Ok( Box::new( InsertChildren { |
518 | 20.9k | xpath: MyXPath::new(nodes.to_string()) |
519 | 20.9k | replacements: ReplacementArray::build(replace).context("'replace:'") |
520 | } ) ); | |
521 | 20.9k | } |
522 | ||
523 | // It would be most efficient to do an xpath eval, get the nodes (type: NodeSet) and then intersperse the node_replace() | |
524 | // calls with replacements for the ReplacementArray parts. But that causes problems with the "pause: auto" calculation because | |
525 | // the replacements are segmented (can't look to neighbors for the calculation there) | |
526 | // An alternative is to introduce another Replacement enum value, but that's a lot of complication for not that much | |
527 | // gain (and Node's have contagious lifetimes) | |
528 | // The solution adopted is to find out the number of nodes and build up MyXPaths with each node selected (e.g, "*" => "*[3]") | |
529 | // and put those nodes into a flat ReplacementArray and then do a standard replace on that. | |
530 | // This is slower than the alternatives, but reuses a bunch of code and hence is less complicated. | |
531 | 7.45k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
532 | 7.45k | let result = self.xpath.evaluate(&rules_with_context.context_stack.base, mathml) |
533 | 7.45k | .with_context(|| |
534 | 7.45k | match result { |
535 | 7.45k | Value::Nodeset(nodes) => { |
536 | 7.45k | if nodes.size() == 0 { |
537 | 0 | bail!("During replacement, no matching element found"); |
538 | 7.45k | }; |
539 | 7.45k | let nodes = nodes.document_order(); |
540 | 7.45k | let n_nodes = nodes.len(); |
541 | 7.45k | let mut expanded_result = Vec::with_capacity(n_nodes + (n_nodes+1)*self.replacements.replacements.len()); |
542 | 7.45k | expanded_result.push( |
543 | Replacement::XPath( | |
544 | 7.45k | MyXPath::new(format!("{}[{}]", self.xpath.rc.string , 1)) |
545 | ) | |
546 | ); | |
547 | 19.3k | for i in |
548 | 19.3k | expanded_result.extend_from_slice(&self.replacements.replacements); |
549 | 19.3k | expanded_result.push( |
550 | Replacement::XPath( | |
551 | 19.3k | MyXPath::new(format!("{}[{}]", self.xpath.rc.string , i)) |
552 | ) | |
553 | ); | |
554 | } | |
555 | 7.45k | let replacements = ReplacementArray{ replacements: expanded_result }; |
556 | 7.45k | return replacements.replace(rules_with_context, mathml); |
557 | }, | |
558 | ||
559 | // FIX: should the options be errors??? | |
560 | 0 | Value::String(t) => { return T::from_string(rules_with_context.replace_chars(&t, mathml)?, rules_with_context.doc); }, |
561 | 0 | Value::Number(num) => { return T::from_string( num.to_string(), rules_with_context.doc ); }, |
562 | 0 | Value::Boolean(b) => { return T::from_string( b.to_string(), rules_with_context.doc ); }, // FIX: is this right??? |
563 | } | |
564 | ||
565 | 7.45k | } |
566 | } | |
567 | ||
568 | ||
569 | 2 | static ATTR_NAME_VALUE: LazyLock<Regex> = LazyLock::new(|| { |
570 | 2 | Regex::new( |
571 | // match name='value', where name is sort of an NCNAME (see CONCEPT_OR_LITERAL in infer_intent.rs) | |
572 | // The quotes can be either single or double quotes | |
573 | 2 | r#"(?P<name>[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*)\s*=\s*('(?P<value>[^']+)'|"(?P<dqvalue>[^"]+)")"# |
574 | 2 | ).unwrap() |
575 | 2 | }); |
576 | ||
577 | // structure used when "intent:" is encountered in a rule | |
578 | // the name is either a string or an xpath that needs evaluation. 99% of the time it is a string | |
579 | #[derive(Debug, Clone)] | |
580 | struct Intent { | |
581 | name: Option<String>, // name of node | |
582 | xpath: Option<MyXPath>, // alternative to directly using the string | |
583 | attrs: String, // optional attrs -- format "attr1='val1' [attr2='val2'...]" | |
584 | children: ReplacementArray, // children of node | |
585 | } | |
586 | ||
587 | impl fmt::Display for Intent { | |
588 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
589 | 0 | let name = if let Some(name) = &self.name { |
590 | 0 | name.to_string() |
591 | } else { | |
592 | 0 | self.xpath.as_ref().unwrap().to_string() |
593 | }; | |
594 | 0 | return write!(f, "intent: {}: {}, attrs='{}'>\n children: {}", |
595 | 0 | if self.name.is_some() {"name"} else {"xpath-name"}, name, |
596 | self.attrs, | |
597 | 0 | &self.children); |
598 | 0 | } |
599 | } | |
600 | ||
601 | impl Intent { | |
602 | 283k | fn build(yaml_dict: &Yaml) -> Result<Box<Intent>> { |
603 | // 'intent:' -- 'name': xxx 'children': xxx | |
604 | 283k | if yaml_dict.as_hash().is_none() { |
605 | 0 | bail!("Array found for contents of 'intent' -- should be dictionary with keys 'name' and 'children'") |
606 | 283k | } |
607 | 283k | let name = &yaml_dict["name"]; |
608 | 283k | let xpath_name = &yaml_dict["xpath-name"]; |
609 | 283k | if name.is_badvalue() && |
610 | 0 | bail!("Missing 'name' or 'xpath-name' as part of 'intent'.\n \ |
611 | Suggestion: add 'name:' or if present, indent so it is contained in 'intent'"); | |
612 | 283k | } |
613 | 283k | let attrs = &yaml_dict["attrs"]; |
614 | 283k | let replace = &yaml_dict["children"]; |
615 | 283k | if replace.is_badvalue() { |
616 | 0 | bail!("Missing 'children' as part of 'intent'.\n \ |
617 | Suggestion: add 'children:' or if present, indent so it is contained in 'intent'"); | |
618 | 283k | } |
619 | 283k | return Ok( Box::new( Intent { |
620 | 283k | name: if name.is_badvalue() { |
621 | 283k | xpath: if xpath_name.is_badvalue() { |
622 | 283k | attrs: if attrs.is_badvalue() { |
623 | 283k | children: ReplacementArray::build(replace).context("'children:'") |
624 | } ) ); | |
625 | 283k | } |
626 | ||
627 | 45.5k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
628 | 45.5k | let result = self.children.replace::<Element<'m>>(rules_with_context, mathml) |
629 | 45.5k | .context("replacing inside 'intent'") |
630 | 45.5k | let mut result = lift_children(result); |
631 | 45.5k | if name(result) != "TEMP_NAME" && |
632 | 235 | // this case happens when you have an 'intent' replacement as a direct child of an 'intent' replacement |
633 | 235 | let temp = create_mathml_element(&result.document(), "TEMP_NAME"); |
634 | 235 | temp.append_child(result); |
635 | 235 | result = temp; |
636 | 45.3k | } |
637 | 45.5k | if let Some( |
638 | 11.2k | result.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
639 | 11.2k | set_mathml_name(result, intent_name.as_str()); |
640 | 34.2k | } |
641 | 45.5k | if let Some( |
642 | 34.2k | let xpath_value = my_xpath.evaluate(rules_with_context.get_context(), mathml) |
643 | 34.2k | match xpath_value { |
644 | 34.2k | Value::String(intent_name) => { |
645 | 34.2k | result.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
646 | 34.2k | set_mathml_name(result, intent_name.as_str()) |
647 | }, | |
648 | 0 | _ => bail!("'xpath-name' value '{}' was not a string", &my_xpath), |
649 | } | |
650 | 11.2k | } |
651 | 45.5k | if self.name.is_none() && |
652 | 0 | bail!("Intent::replace: internal error -- neither 'name' nor 'xpath' is set"); |
653 | 45.5k | }; |
654 | ||
655 | 100k | for attr in |
656 | 100k | result.set_attribute_value(attr.name(), attr.value()); |
657 | 100k | } |
658 | ||
659 | // can't test against name == "math" because intent might a new element | |
660 | 45.5k | if mathml.parent().is_some() && mathml.parent().unwrap().element().is_some() && |
661 | 41.7k | result.attribute_value("id") == crate::canonicalize::get_parent(mathml).attribute_value("id") { |
662 | 32 | // avoid duplicate ids -- it's a bug if it does, but this helps in that case |
663 | 32 | result.remove_attribute("id"); |
664 | 45.5k | } |
665 | ||
666 | 45.5k | if !self.attrs.is_empty() { |
667 | // debug!("MathML after children, before attr processing:\n{}", mml_to_string(mathml)); | |
668 | // debug!("Result after children, before attr processing:\n{}", mml_to_string(result)); | |
669 | // debug!("Intent::replace attrs = \"{}\"", &self.attrs); | |
670 | 5.63k | for cap in |
671 | 5.63k | let matched_value = if cap["value"].is_empty() { |
672 | 5.63k | let value_as_xpath = MyXPath::new(matched_value.to_string()).context("attr value inside 'intent'") |
673 | 5.63k | let value = value_as_xpath.evaluate(rules_with_context.get_context(), result) |
674 | 5.63k | .context("attr xpath evaluation value inside 'intent'") |
675 | 5.63k | let mut value = value.into_string(); |
676 | 5.63k | if &cap["name"] == INTENT_PROPERTY { |
677 | 5.23k | value = simplify_fixity_properties(&value); |
678 | 5.23k |
|
679 | // debug!("Intent::replace match\n name={}\n value={}\n xpath value={}", &cap["name"], &cap["value"], &value); | |
680 | 5.63k | if &cap["name"] == INTENT_PROPERTY && |
681 | 1.81k | // should have been an empty string, so remove the attribute |
682 | 1.81k | result.remove_attribute(INTENT_PROPERTY); |
683 | 3.82k | } else { |
684 | 3.82k | result.set_attribute_value(&cap["name"], &value); |
685 | 3.82k | } |
686 | }; | |
687 | 39.9k | } |
688 | ||
689 | // debug!("Result from 'intent:'\n{}", mml_to_string(result)); | |
690 | 45.5k | return T::from_element(result); |
691 | ||
692 | ||
693 | /// "lift" up the children any "TEMP_NAME" child -- could short circuit when only one child | |
694 | 45.5k | fn lift_children(result: Element) -> Element { |
695 | // debug!("lift_children:\n{}", mml_to_string(result)); | |
696 | // most likely there will be the same number of new children as result has, but there could be more | |
697 | 45.5k | let mut new_children = Vec::with_capacity(2*result.children().len()); |
698 | 69.6k | for child_of_element in |
699 | 69.6k | match child_of_element { |
700 | 69.6k | ChildOfElement::Element(child) => { |
701 | 69.6k | if name(child) == "TEMP_NAME" { |
702 | 34.1k | new_children.append(&mut child.children()); // almost always just one |
703 | 35.5k | } else { |
704 | 35.5k | new_children.push(child_of_element); |
705 | 35.5k | } |
706 | }, | |
707 | 7 | _ => new_children.push(child_of_element), // text() |
708 | } | |
709 | } | |
710 | 45.5k | result.replace_children(new_children); |
711 | 45.5k | return result; |
712 | 45.5k | } |
713 | 45.5k | } |
714 | } | |
715 | ||
716 | // structure used when "with:" is encountered in a rule | |
717 | // the variables are placed on (and later) popped of a variable stack before/after the replacement | |
718 | #[derive(Debug, Clone)] | |
719 | struct With { | |
720 | variables: VariableDefinitions, // variables and values | |
721 | replacements: ReplacementArray, // what to do with these vars | |
722 | } | |
723 | ||
724 | impl fmt::Display for With { | |
725 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
726 | 0 | return write!(f, "with:\n variables: {}\n replace: {}", &self.variables, &self.replacements); |
727 | 0 | } |
728 | } | |
729 | ||
730 | impl With { | |
731 | 77.5k | fn build(vars_replacements: &Yaml) -> Result<Box<With>> { |
732 | // 'with:' -- 'variables': xxx 'replace': xxx | |
733 | 77.5k | if vars_replacements.as_hash().is_none() { |
734 | 0 | bail!("Array found for contents of 'with' -- should be dictionary with keys 'variables' and 'replace'") |
735 | 77.5k | } |
736 | 77.5k | let var_defs = &vars_replacements["variables"]; |
737 | 77.5k | if var_defs.is_badvalue() { |
738 | 0 | bail!("Missing 'variables' as part of 'with'.\n \ |
739 | Suggestion: add 'variables:' or if present, indent so it is contained in 'with'"); | |
740 | 77.5k | } |
741 | 77.5k | let replace = &vars_replacements["replace"]; |
742 | 77.5k | if replace.is_badvalue() { |
743 | 0 | bail!("Missing 'replace' as part of 'with'.\n \ |
744 | Suggestion: add 'replace:' or if present, indent so it is contained in 'with'"); | |
745 | 77.5k | } |
746 | 77.5k | return Ok( Box::new( With { |
747 | 77.5k | variables: VariableDefinitions::build(var_defs).context("'variables'") |
748 | 77.5k | replacements: ReplacementArray::build(replace).context("'replace:'") |
749 | } ) ); | |
750 | 77.5k | } |
751 | ||
752 | 7.28k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
753 | 7.28k | rules_with_context.context_stack.push(self.variables.clone(), mathml) |
754 | 7.28k | let result = self.replacements.replace(rules_with_context, mathml) |
755 | 7.28k | .context("replacing inside 'with'") |
756 | 7.28k | rules_with_context.context_stack.pop(); |
757 | 7.28k | return Ok( result ); |
758 | 7.28k | } |
759 | } | |
760 | ||
761 | // structure used when "set_variables:" is encountered in a rule | |
762 | // the variables are global and are placed in the base context and never popped off | |
763 | #[derive(Debug, Clone)] | |
764 | struct SetVariables { | |
765 | variables: VariableDefinitions, // variables and values | |
766 | } | |
767 | ||
768 | impl fmt::Display for SetVariables { | |
769 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
770 | 0 | return write!(f, "SetVariables: variables {}", &self.variables); |
771 | 0 | } |
772 | } | |
773 | ||
774 | impl SetVariables { | |
775 | 30.3k | fn build(vars: &Yaml) -> Result<Box<SetVariables>> { |
776 | // 'set_variables:' -- 'variables': xxx (array) | |
777 | 30.3k | if vars.as_vec().is_none() { |
778 | 0 | bail!("'set_variables' -- should be an array of variable name, xpath value"); |
779 | 30.3k | } |
780 | 30.3k | return Ok( Box::new( SetVariables { |
781 | 30.3k | variables: VariableDefinitions::build(vars).context("'set_variables'") |
782 | } ) ); | |
783 | 30.3k | } |
784 | ||
785 | 3.78k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
786 | 3.78k | rules_with_context.context_stack.set_globals(self.variables.clone(), mathml) |
787 | 3.78k | return T::from_string( "".to_string(), rules_with_context.doc ); |
788 | 3.78k | } |
789 | } | |
790 | ||
791 | ||
792 | /// Allow speech of an expression in the middle of a rule (used by "WhereAmI" for navigation) | |
793 | #[derive(Debug, Clone)] | |
794 | struct TranslateExpression { | |
795 | xpath: MyXPath, // variables and values | |
796 | } | |
797 | ||
798 | impl fmt::Display for TranslateExpression { | |
799 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
800 | 0 | return write!(f, "speak: {}", &self.xpath); |
801 | 0 | } |
802 | } | |
803 | impl TranslateExpression { | |
804 | 102 | fn build(vars: &Yaml) -> Result<TranslateExpression> { |
805 | // 'translate:' -- xpath (should evaluate to an id) | |
806 | 102 | return Ok( TranslateExpression { xpath: MyXPath::build(vars).context("'translate'") |
807 | 102 | } |
808 | ||
809 | 2 | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
810 | 2 | if self.xpath.rc.string.starts_with('@') { |
811 | 2 | let xpath_value = self.xpath.evaluate(rules_with_context.get_context(), mathml) |
812 | 2 | let id = match xpath_value { |
813 | 0 | Value::String(s) => Some(s), |
814 | 2 | Value::Nodeset(nodes) => { |
815 | 2 | if nodes.size() == 1 { |
816 | 2 | nodes.document_order_first().unwrap().attribute().map(|attr| attr.value().to_string()) |
817 | } else { | |
818 | 0 | None |
819 | } | |
820 | }, | |
821 | 0 | _ => None, |
822 | }; | |
823 | 2 | match id { |
824 | 0 | None => bail!("'translate' value '{}' is not a string or an attribute value (correct by using '@id'??):\n", self.xpath), |
825 | 2 | Some(id) => { |
826 | 2 | let speech = speak_mathml(mathml, &id, 0) |
827 | 2 | return T::from_string(speech, rules_with_context.doc); |
828 | } | |
829 | } | |
830 | } else { | |
831 | 0 | return T::from_string( |
832 | 0 | self.xpath.replace(rules_with_context, mathml).context("'translate'")?, |
833 | 0 | rules_with_context.doc |
834 | ); | |
835 | } | |
836 | 2 | } |
837 | } | |
838 | ||
839 | ||
840 | /// An array of rule `Replacement`s (text, xpath, tts commands, etc) | |
841 | #[derive(Debug, Clone)] | |
842 | pub struct ReplacementArray { | |
843 | replacements: Vec<Replacement> | |
844 | } | |
845 | ||
846 | impl fmt::Display for ReplacementArray { | |
847 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
848 | 1 | return write!(f, "{}", self.pretty_print_replacements()); |
849 | 1 | } |
850 | } | |
851 | ||
852 | impl ReplacementArray { | |
853 | /// Return an empty `ReplacementArray` | |
854 | 1.98M | pub fn build_empty() -> ReplacementArray { |
855 | 1.98M | return ReplacementArray { |
856 | 1.98M | replacements: vec![] |
857 | 1.98M | } |
858 | 1.98M | } |
859 | ||
860 | /// Convert a Yaml input into a [`ReplacementArray`]. | |
861 | /// Any errors are passed back out. | |
862 | 9.24M | pub fn build(replacements: &Yaml) -> Result<ReplacementArray> { |
863 | // replacements is either a single replacement or an array of replacements | |
864 | 9.24M | let result= if replacements.is_array() { |
865 | 9.22M | let replacements = replacements.as_vec().unwrap(); |
866 | 9.22M | replacements |
867 | 9.22M | .iter() |
868 | 9.22M | .enumerate() // useful for errors |
869 | 13.4M | . |
870 | 13.4M | .with_context(|| |
871 | 9.22M | .collect::<Result<Vec<Replacement>>>() |
872 | } else { | |
873 | 21.2k | vec![ Replacement::build(replacements) |
874 | }; | |
875 | ||
876 | 9.24M | return Ok( ReplacementArray{ replacements: result } ); |
877 | 9.24M | } |
878 | ||
879 | /// Do all the replacements in `mathml` using `rules`. | |
880 | 275k | pub fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
881 | 275k | return T::replace(self, rules_with_context, mathml); |
882 | 275k | } |
883 | ||
884 | 142k | pub fn replace_array_string<'c, 's:'c, 'm:'c>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
885 | // loop over the replacements and build up a vector of strings, excluding empty ones. | |
886 | // * eliminate any redundance | |
887 | // * add/replace auto-pauses | |
888 | // * join the remaining vector together | |
889 | 142k | let mut replacement_strings = Vec::with_capacity(self.replacements.len()); // probably conservative guess |
890 | 271k | for replacement in |
891 | 271k | let string: String = rules_with_context.replace(replacement, mathml) |
892 | 271k | if !string.is_empty() { |
893 | 200k | replacement_strings.push(string); |
894 | 200k |
|
895 | } | |
896 | ||
897 | 142k | if replacement_strings.is_empty() { |
898 | 12.9k | return Ok( "".to_string() ); |
899 | 129k | } |
900 | // delete an optional text that is repetitive | |
901 | // we do this by looking for the optional text marker, and if present, check for repetition at end of previous string | |
902 | // if repetitive, we delete the optional string | |
903 | // if not, we leave the markers because the repetition might happen several "levels" up | |
904 | // this could also be done in a final cleanup of the entire string (where we remove any markers), | |
905 | // but the match is harder (rust regex lacks look behind pattern match) and it is less efficient | |
906 | // Note: we skip the first string since it can't be repetitive of something at this level | |
907 | 129k | for |
908 | 45.4k | if let Some( |
909 | 13 | replacement_strings[i] = bytes.to_string(); |
910 | 45.4k | } |
911 | } | |
912 | ||
913 | 200k | for i in |
914 | 200k | if replacement_strings[i].contains(PAUSE_AUTO_STR) { |
915 | 19.5k | let before = if i == 0 { |
916 | 19.5k | let after = if i+1 == replacement_strings.len() { |
917 | 19.5k | replacement_strings[i] = replacement_strings[i].replace( |
918 | 19.5k | PAUSE_AUTO_STR, |
919 | 19.5k | &rules_with_context.speech_rules.pref_manager.borrow().get_tts().compute_auto_pause(&rules_with_context.speech_rules.pref_manager.borrow(), before, after)); |
920 | 181k | } |
921 | } | |
922 | ||
923 | // join the strings together with spaces in between | |
924 | // concatenation (removal of spaces) is saved for the top level because they otherwise are stripped at the wrong sometimes | |
925 | 129k | return Ok( replacement_strings.join(" ") ); |
926 | ||
927 | /// delete an optional text (in 'next') that is repetitive at the end of 'prev' | |
928 | /// we do this by looking for the optional text marker, and if present, check for repetition at end of previous string | |
929 | /// if repetitive, we delete the optional string | |
930 | 45.4k | fn is_repetitive<'a>(prev: &str, next: &'a str) -> Option<&'a str> { |
931 | // OPTIONAL_INDICATOR optionally surrounds the end of 'prev'(ignoring trailing whitespace) | |
932 | // OPTIONAL_INDICATOR surrounds the start of 'next' | |
933 | // minor optimization -- lots of short strings and the OPTIONAL_INDICATOR takes a few bytes, so skip the check for those strings | |
934 | 45.4k | if next.len() <= 2 * OPTIONAL_INDICATOR_LEN { |
935 | 14.2k | return None; |
936 | 31.2k | } |
937 | ||
938 | // should be exactly one match -- ignore more than one for now | |
939 | 31.2k | let |
940 | 36 | let start_repeat_word_in_next = &next[i_start + OPTIONAL_INDICATOR_LEN..]; |
941 | 36 | let i_end = start_repeat_word_in_next.find(OPTIONAL_INDICATOR) |
942 | 36 | .unwrap_or_else(|| |
943 | 36 | let repeat_word = &start_repeat_word_in_next[..i_end]; |
944 | // debug!("check if '{}' is repetitive, end_index={}", repeat_word, i_end); | |
945 | // debug!(" prev: '{}', next '{}'", prev, next); | |
946 | ||
947 | 36 | let prev_trimmed = prev.trim_end(); |
948 | 36 | let ends_with_word = prev_trimmed.len() > repeat_word.len() && |
949 | 36 | let ends_with_wrapped_word = |
950 | 36 | prev_trimmed |
951 | 36 | .strip_suffix(OPTIONAL_INDICATOR) |
952 | 36 | .and_then(|s| |
953 | 36 | .and_then(|s| |
954 | 36 | .is_some(); |
955 | 36 | if ends_with_word || |
956 | // debug!(" is repetitive"); | |
957 | 13 | Some(start_repeat_word_in_next[i_end + OPTIONAL_INDICATOR_LEN..].trim_start()) // remove repeat word and OPTIONAL_INDICATOR |
958 | } else { | |
959 | 23 | None |
960 | } | |
961 | 45.4k | } |
962 | 142k | } |
963 | ||
964 | 132k | pub fn replace_array_tree<'c, 's:'c, 'm:'c>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
965 | // shortcut for common case (don't build a new tree node) | |
966 | 132k | if self.replacements.len() == 1 { |
967 | 129k | return rules_with_context.replace::<Element<'m>>(&self.replacements[0], mathml); |
968 | 3.20k | } |
969 | ||
970 | 3.20k | let new_element = create_mathml_element(&rules_with_context.doc, "Unknown"); // Hopefully set later (in Intent::Replace()) |
971 | 3.20k | let mut new_children = Vec::with_capacity(self.replacements.len()); |
972 | 6.12k | for child in |
973 | 6.12k | let child = rules_with_context.replace::<Element<'m>>(child, mathml) |
974 | 6.12k | new_children.push(ChildOfElement::Element(child)); |
975 | }; | |
976 | 3.20k | new_element.append_children(new_children); |
977 | 3.20k | return Ok(new_element); |
978 | 132k | } |
979 | ||
980 | ||
981 | /// Return true if there are no replacements. | |
982 | 29.8k | pub fn is_empty(&self) -> bool { |
983 | 29.8k | return self.replacements.is_empty(); |
984 | 29.8k | } |
985 | ||
986 | 10 | fn pretty_print_replacements(&self) -> String { |
987 | 10 | let mut group_string = String::with_capacity(128); |
988 | 10 | if self.replacements.len() == 1 { |
989 | 9 | group_string += &format!("[{}]", self.replacements[0]); |
990 | 9 | } else { |
991 | 1 | group_string += &self.replacements.iter() |
992 | 1 | .map(|replacement| |
993 | 1 | .collect::<Vec<String>>() |
994 | 1 | .join(""); |
995 | 1 | group_string += "\n"; |
996 | } | |
997 | 10 | return group_string; |
998 | 10 | } |
999 | } | |
1000 | ||
1001 | ||
1002 | ||
1003 | // MyXPath is a wrapper around an 'XPath' that keeps around the original xpath expr (as a string) so it can be used in error reporting. | |
1004 | // Because we want to be able to clone them and XPath doesn't support clone(), this is a wrapper around an internal MyXPath. | |
1005 | // It supports the standard SpeechRule functionality of building and replacing. | |
1006 | #[derive(Debug)] | |
1007 | struct RCMyXPath { | |
1008 | xpath: XPath, | |
1009 | string: String, // store for error reporting | |
1010 | } | |
1011 | ||
1012 | #[derive(Debug, Clone)] | |
1013 | pub struct MyXPath { | |
1014 | rc: Rc<RCMyXPath> // rather than putting Rc around both 'xpath' and 'string', just use one and indirect to internal RCMyXPath | |
1015 | } | |
1016 | ||
1017 | ||
1018 | impl fmt::Display for MyXPath { | |
1019 | 2.79k | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1020 | 2.79k | return write!(f, "\"{}\"", self.rc.string); |
1021 | 2.79k | } |
1022 | } | |
1023 | ||
1024 | // pub fn xpath_count() -> (usize, usize) { | |
1025 | // return (XPATH_CACHE.with( |cache| cache.borrow().len()), unsafe{XPATH_CACHE_HITS} ); | |
1026 | // } | |
1027 | thread_local!{ | |
1028 | static XPATH_CACHE: RefCell<HashMap<String, MyXPath>> = RefCell::new( HashMap::with_capacity(2047) ); | |
1029 | } | |
1030 | // static mut XPATH_CACHE_HITS: usize = 0; | |
1031 | ||
1032 | impl MyXPath { | |
1033 | 8.88M | fn new(xpath: String) -> Result<MyXPath> { |
1034 | 8.88M | return XPATH_CACHE.with( |cache| { |
1035 | 8.88M | let mut cache = cache.borrow_mut(); |
1036 | return Ok( | |
1037 | 8.88M | match cache.get(&xpath) { |
1038 | 5.82M | Some(compiled_xpath) => { |
1039 | // unsafe{ XPATH_CACHE_HITS += 1;}; | |
1040 | 5.82M | compiled_xpath.clone() |
1041 | }, | |
1042 | None => { | |
1043 | 3.06M | let new_xpath = MyXPath { |
1044 | 3.06M | rc: Rc::new( RCMyXPath { |
1045 | 3.06M | xpath: MyXPath::compile_xpath(&xpath) |
1046 | 3.06M | string: xpath.clone() |
1047 | })}; | |
1048 | 3.06M | cache.insert(xpath.clone(), new_xpath.clone()); |
1049 | 3.06M | new_xpath |
1050 | }, | |
1051 | } | |
1052 | ) | |
1053 | 8.88M | }); |
1054 | 8.88M | } |
1055 | ||
1056 | 8.83M | pub fn build(xpath: &Yaml) -> Result<MyXPath> { |
1057 | 8.83M | let xpath = match xpath { |
1058 | 8.64M | Yaml::String(s) => s.to_string(), |
1059 | 0 | Yaml::Integer(i) => i.to_string(), |
1060 | 0 | Yaml::Real(s) => s.to_string(), |
1061 | 0 | Yaml::Boolean(s) => s.to_string(), |
1062 | 193k | Yaml::Array(v) => |
1063 | // array of strings -- concatenate them together | |
1064 | 193k | v.iter() |
1065 | 193k | .map(as_str_checked) |
1066 | 193k | .collect::<Result<Vec<&str>>>() |
1067 | 193k | .join(" "), |
1068 | 0 | _ => bail!("Bad value when trying to create an xpath: {}", yaml_to_string(xpath, 1)), |
1069 | }; | |
1070 | 8.83M | return MyXPath::new(xpath); |
1071 | 8.83M | } |
1072 | ||
1073 | 3.06M | fn compile_xpath(xpath: &str) -> Result<XPath> { |
1074 | 3.06M | let factory = Factory::new(); |
1075 | 3.06M | let xpath_with_debug_info = MyXPath::add_debug_string_arg(xpath) |
1076 | 3.06M | let compiled_xpath = factory.build(&xpath_with_debug_info) |
1077 | 3.06M | .with_context(|| |
1078 | "Could not compile XPath for pattern:\n{}{}", | |
1079 | 0 | &xpath, more_details(xpath)))?; |
1080 | 3.06M | return match compiled_xpath { |
1081 | 3.06M | Some(xpath) => Ok(xpath), |
1082 | 0 | None => bail!("Problem compiling Xpath for pattern:\n{}{}", |
1083 | 0 | &xpath, more_details(xpath)), |
1084 | }; | |
1085 | ||
1086 | ||
1087 | 0 | fn more_details(xpath: &str) -> String { |
1088 | // try to give a better error message by counting [], (), 's, and "s | |
1089 | 0 | if xpath.is_empty() { |
1090 | 0 | return "xpath is empty string".to_string(); |
1091 | 0 | } |
1092 | 0 | let as_bytes = xpath.trim().as_bytes(); |
1093 | 0 | if as_bytes[0] == b'\'' && as_bytes[as_bytes.len()-1] != b'\'' { |
1094 | 0 | return "\nmissing \"'\"".to_string(); |
1095 | 0 | } |
1096 | 0 | if (as_bytes[0] == b'"' && as_bytes[as_bytes.len()-1] != b'"') || |
1097 | 0 | (as_bytes[0] != b'"' && as_bytes[as_bytes.len()-1] == b'"'){ |
1098 | 0 | return "\nmissing '\"'".to_string(); |
1099 | 0 | } |
1100 | ||
1101 | 0 | let mut i_bytes = 0; // keep track of # of bytes into string for error reporting |
1102 | 0 | let mut paren_count = 0; // counter to make sure they are balanced |
1103 | 0 | let mut i_paren = 0; // position of the outermost open paren |
1104 | 0 | let mut bracket_count = 0; |
1105 | 0 | let mut i_bracket = 0; |
1106 | 0 | for ch in xpath.chars() { |
1107 | 0 | if ch == '(' { |
1108 | 0 | if paren_count == 0 { |
1109 | 0 | i_paren = i_bytes; |
1110 | 0 | } |
1111 | 0 | paren_count += 1; |
1112 | 0 | } else if ch == '[' { |
1113 | 0 | if bracket_count == 0 { |
1114 | 0 | i_bracket = i_bytes; |
1115 | 0 | } |
1116 | 0 | bracket_count += 1; |
1117 | 0 | } else if ch == ')' { |
1118 | 0 | if paren_count == 0 { |
1119 | 0 | return format!("\nExtra ')' found after '{}'", &xpath[i_paren..i_bytes]); |
1120 | 0 | } |
1121 | 0 | paren_count -= 1; |
1122 | 0 | if paren_count == 0 && bracket_count > 0 && i_bracket > i_paren { |
1123 | 0 | return format!("\nUnclosed brackets found at '{}'", &xpath[i_paren..i_bytes]); |
1124 | 0 | } |
1125 | 0 | } else if ch == ']' { |
1126 | 0 | if bracket_count == 0 { |
1127 | 0 | return format!("\nExtra ']' found after '{}'", &xpath[i_bracket..i_bytes]); |
1128 | 0 | } |
1129 | 0 | bracket_count -= 1; |
1130 | 0 | if bracket_count == 0 && paren_count > 0 && i_paren > i_bracket { |
1131 | 0 | return format!("\nUnclosed parens found at '{}'", &xpath[i_bracket..i_bytes]); |
1132 | 0 | } |
1133 | 0 | } |
1134 | 0 | i_bytes += ch.len_utf8(); |
1135 | } | |
1136 | 0 | return "".to_string(); |
1137 | 0 | } |
1138 | 3.06M | } |
1139 | ||
1140 | /// Convert DEBUG(...) input to the internal function which is DEBUG(arg, arg_as_string) | |
1141 | 3.06M | fn add_debug_string_arg(xpath: &str) -> Result<String> { |
1142 | // do a quick check to see if "DEBUG" is in the string -- this is the common case | |
1143 | 3.06M | let debug_start = xpath.find("DEBUG("); |
1144 | 3.06M | if debug_start.is_none() { |
1145 | 3.06M | return Ok( xpath.to_string() ); |
1146 | 1.56k | } |
1147 | ||
1148 | 1.56k | let debug_start = debug_start.unwrap(); |
1149 | 1.56k | let mut before_paren = xpath[..debug_start+5].to_string(); // includes "DEBUG" |
1150 | 1.56k | let chars = xpath[debug_start+5..].chars().collect::<Vec<char>>(); // begins at '(' |
1151 | 1.56k | before_paren.push_str(&chars_add_debug_string_arg(&chars).with_context(|| |
1152 | // debug!("add_debug_string_arg: {}", before_paren); | |
1153 | 1.56k | return Ok(before_paren); |
1154 | ||
1155 | 1.56k | fn chars_add_debug_string_arg(chars: &[char]) -> Result<String> { |
1156 | // Find all the DEBUG(...) commands in 'xpath' and adds a string argument. | |
1157 | // The DEBUG function that is used internally takes two arguments, the second one being a string version of the DEBUG arg. | |
1158 | // Being a string, any quotes need to be escaped, and DEBUGs inside of DEBUGs need more escaping. | |
1159 | // This is done via recursive calls to this function. | |
1160 | 1.56k | assert_eq!(chars[0], '(', "{} does not start with ')'", |
1161 | 1.56k | let mut count = 1; // open/close count |
1162 | 1.56k | let mut i = 1; |
1163 | 1.56k | let mut inside_quote = false; |
1164 | 50.8k | while i < chars.len() { |
1165 | 50.8k | let ch = chars[i]; |
1166 | 805 | match ch { |
1167 | '\\' => { | |
1168 | 0 | if i+1 == chars.len() { |
1169 | 0 | bail!("Syntax error in DEBUG: last char is escape char\nDebug string: '{}'", chars.iter().collect::<String>()); |
1170 | 0 | } |
1171 | 0 | i += 1; |
1172 | }, | |
1173 | 2.21k | '\'' => inside_quote = !inside_quote, |
1174 | 804 | '(' if !inside_quote => { |
1175 | 804 | count += 1; |
1176 | 804 | // FIX: it would be more efficient to spot "DEBUG" preceding this and recurse rather than matching the whole string and recursing |
1177 | 804 | }, |
1178 | 1 | '(' => (), |
1179 | 2.36k | ')' if !inside_quote => { |
1180 | 2.36k | count -= 1; |
1181 | 2.36k | if count == 0 { |
1182 | 1.56k | let arg = &chars[1..i].iter().collect::<String>(); |
1183 | 1.56k | let escaped_arg = arg.replace('"', "\\\""); |
1184 | // DEBUG(...) may be inside 'arg' -- recurse | |
1185 | 1.56k | let processed_arg = MyXPath::add_debug_string_arg(arg) |
1186 | ||
1187 | // DEBUG(...) may be in the remainder of the string -- recurse | |
1188 | 1.56k | let processed_rest = MyXPath::add_debug_string_arg(&chars[i+1..].iter().collect::<String>()) |
1189 | 1.56k | return Ok( format!("({processed_arg}, \"{escaped_arg}\"){processed_rest}") ); |
1190 | 804 | } |
1191 | }, | |
1192 | 0 | ')' => (), |
1193 | 45.4k | _ => (), |
1194 | } | |
1195 | 49.2k | i += 1; |
1196 | } | |
1197 | 0 | bail!("Syntax error in DEBUG: didn't find matching closing paren\nDEBUG{}", chars.iter().collect::<String>()); |
1198 | 1.56k | } |
1199 | 3.06M | } |
1200 | ||
1201 | 156k | fn is_true(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1202 | // return true if there is no condition or if the condition evaluates to true | |
1203 | return Ok( | |
1204 | 156k | match self.evaluate(context, mathml) |
1205 | 115k | Value::Boolean(b) => b, |
1206 | 40.6k | Value::Nodeset(nodes) => nodes.size() > 0, |
1207 | 0 | _ => false, |
1208 | } | |
1209 | ) | |
1210 | 156k | } |
1211 | ||
1212 | 153k | pub fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1213 | 153k | if self.rc.string == "process-intent(.)" { |
1214 | 2.47k | return |
1215 | 150k | } |
1216 | ||
1217 | 150k | let result = self.evaluate(&rules_with_context.context_stack.base, mathml) |
1218 | 150k | .with_context(|| |
1219 | 150k | let |
1220 | 121k | Value::Nodeset(nodes) => { |
1221 | 121k | if nodes.size() == 0 { |
1222 | 0 | bail!("During replacement, no matching element found"); |
1223 | 121k | } |
1224 | 121k | return rules_with_context.replace_nodes(nodes.document_order(), mathml); |
1225 | }, | |
1226 | 25.1k | Value::String(s) => s, |
1227 | 3.80k | Value::Number(num) => num.to_string(), |
1228 | 0 | Value::Boolean(b) => b.to_string(), // FIX: is this right??? |
1229 | }; | |
1230 | // Hack!: this test for input that starts with a '$' (defined variable), avoids a double evaluate; | |
1231 | // We don't need NO_EVAL_QUOTE_CHAR here, but the more general solution of a quoted execute (- xq:) would avoid this hack | |
1232 | 28.9k | let result = if self.rc.string.starts_with('$') { |
1233 | 28.9k | return T::from_string(result, rules_with_context.doc ); |
1234 | 153k | } |
1235 | ||
1236 | 1.29M | pub fn evaluate<'c>(&self, context: &sxd_xpath::Context<'c>, mathml: Element<'c>) -> Result<Value<'c>> { |
1237 | // debug!("evaluate: {}", self); | |
1238 | 1.29M | let result = self.rc.xpath.evaluate(context, mathml); |
1239 | 1.29M | return match result { |
1240 | 1.29M | Ok(val) => Ok( val ), |
1241 | 0 | Err(e) => { |
1242 | // debug!("MyXPath::trying to evaluate:\n '{}'\n caused the error\n'{}'", self, e.to_string().replace("OwnedPrefixedName { prefix: None, local_part:", "").replace(" }", "")); | |
1243 | 0 | bail!( "{}\n\n", |
1244 | // remove confusing parts of error message from xpath | |
1245 | 0 | e.to_string().replace("OwnedPrefixedName { prefix: None, local_part:", "").replace(" }", "") ); |
1246 | } | |
1247 | }; | |
1248 | 1.29M | } |
1249 | ||
1250 | 0 | pub fn test_input<F>(self, f: F) -> bool where F: Fn(&str) -> bool { |
1251 | 0 | return f(self.rc.string.as_ref()); |
1252 | 0 | } |
1253 | } | |
1254 | ||
1255 | // 'SpeechPattern' holds a single pattern. | |
1256 | // Some info is not needed beyond converting the Yaml to the SpeechPattern, but is useful for error reporting. | |
1257 | // The two main parts are the pattern to be matched and the replacements to do if there is a match. | |
1258 | // Any variables/prefs that are defined/set are also stored. | |
1259 | #[derive(Debug)] | |
1260 | struct SpeechPattern { | |
1261 | pattern_name: String, | |
1262 | tag_name: String, | |
1263 | file_name: String, | |
1264 | pattern: MyXPath, // the xpath expr to attempt to match | |
1265 | match_uses_var_defs: bool, // include var_defs in context for matching | |
1266 | var_defs: VariableDefinitions, // any variable definitions [can be and probably is an empty vector most of the time] | |
1267 | replacements: ReplacementArray, // the replacements in case there is a match | |
1268 | } | |
1269 | ||
1270 | impl fmt::Display for SpeechPattern { | |
1271 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1272 | 0 | return write!(f, "[name: {}, tag: {},\n variables: {:?}, pattern: {},\n replacement: {}]", |
1273 | self.pattern_name, self.tag_name, self.var_defs, self.pattern, | |
1274 | 0 | self.replacements.pretty_print_replacements()); |
1275 | 0 | } |
1276 | } | |
1277 | ||
1278 | impl SpeechPattern { | |
1279 | 894k | fn build(dict: &Yaml, file: &Path, rules: &mut SpeechRules) -> Result<Option<Vec<PathBuf>>> { |
1280 | // Rule::SpeechPattern | |
1281 | // build { "pattern_name", "tag_name", "pattern", "replacement" } | |
1282 | // or recurse via include: file_name | |
1283 | ||
1284 | // debug!("\nbuild_speech_pattern: dict:\n{}", yaml_to_string(dict, 0)); | |
1285 | 894k | if let Some( |
1286 | 30.2k | let do_include_fn = |new_file: &Path| { |
1287 | 30.2k | rules.read_patterns(new_file) |
1288 | 30.2k | }; |
1289 | ||
1290 | 30.2k | return Ok( Some(process_include(file, include_file_name, do_include_fn) |
1291 | 864k | } |
1292 | ||
1293 | 864k | let pattern_name = find_str(dict, "name"); |
1294 | ||
1295 | // tag_named can be either a string (most common) or an array of strings | |
1296 | 864k | let mut tag_names: Vec<&str> = Vec::new(); |
1297 | 864k | match find_str(dict, "tag") { |
1298 | 740k | Some(str) => tag_names.push(str), |
1299 | None => { | |
1300 | // check for array | |
1301 | 124k | let tag_array = &dict["tag"]; |
1302 | 124k | tag_names = vec![]; |
1303 | 124k | if tag_array.is_array() { |
1304 | 263k | for (i, name) in |
1305 | 263k | match as_str_checked(name) { |
1306 | 0 | Err(e) => return Err( |
1307 | 0 | e.context( |
1308 | 0 | format!("tag name '{}' is not a string in:\n{}", |
1309 | 0 | &yaml_to_string(&tag_array.as_vec().unwrap()[i], 0), |
1310 | 0 | &yaml_to_string(dict, 1))) |
1311 | 0 | ), |
1312 | 263k | Ok(str) => tag_names.push(str), |
1313 | }; | |
1314 | } | |
1315 | } else { | |
1316 | 0 | bail!("Errors trying to find 'tag' in:\n{}", &yaml_to_string(dict, 1)); |
1317 | } | |
1318 | } | |
1319 | } | |
1320 | ||
1321 | 864k | if pattern_name.is_none() { |
1322 | 0 | if dict.is_null() { |
1323 | 0 | bail!("Error trying to find 'name': empty value (two consecutive '-'s?"); |
1324 | } else { | |
1325 | 0 | bail!("Errors trying to find 'name' in:\n{}", &yaml_to_string(dict, 1)); |
1326 | }; | |
1327 | 864k | }; |
1328 | 864k | let pattern_name = pattern_name.unwrap().to_string(); |
1329 | ||
1330 | // FIX: add check to make sure tag_name is a valid MathML tag name | |
1331 | 864k | if dict["match"].is_badvalue() { |
1332 | 0 | bail!("Did not find 'match' in\n{}", yaml_to_string(dict, 1)); |
1333 | 864k | } |
1334 | 864k | if dict["replace"].is_badvalue() { |
1335 | 0 | bail!("Did not find 'replace' in\n{}", yaml_to_string(dict, 1)); |
1336 | 864k | } |
1337 | ||
1338 | // xpath's can't be cloned, so we need to do a 'build_xxx' for each tag name | |
1339 | 1.00M | for tag_name in |
1340 | 1.00M | let tag_name = tag_name.to_string(); |
1341 | 1.00M | let pattern_xpath = MyXPath::build(&dict["match"]) |
1342 | 1.00M | .with_context(|| |
1343 | 0 | format!("value for 'match' in rule ({}: {}):\n{}", |
1344 | 0 | tag_name, pattern_name, yaml_to_string(dict, 1)) |
1345 | 0 | })?; |
1346 | 1.00M | let speech_pattern = |
1347 | 1.00M | Box::new( SpeechPattern{ |
1348 | 1.00M | pattern_name: pattern_name.clone(), |
1349 | 1.00M | tag_name: tag_name.clone(), |
1350 | 1.00M | file_name: file.to_str().unwrap().to_string(), |
1351 | 1.00M | match_uses_var_defs: dict["variables"].is_array() && |
1352 | 1.00M | pattern: pattern_xpath, |
1353 | 1.00M | var_defs: VariableDefinitions::build(&dict["variables"]) |
1354 | 1.00M | .with_context(|| |
1355 | 0 | format!("value for 'variables' in rule ({}: {}):\n{}", |
1356 | 0 | tag_name, pattern_name, yaml_to_string(dict, 1)) |
1357 | 0 | })?, |
1358 | 1.00M | replacements: ReplacementArray::build(&dict["replace"]) |
1359 | 1.00M | .with_context(|| |
1360 | 0 | format!("value for 'replace' in rule ({}: {}). Replacements:\n{}", |
1361 | 0 | tag_name, pattern_name, yaml_to_string(&dict["replace"], 1)) |
1362 | 0 | })? |
1363 | } ); | |
1364 | // get the array of rules for the tag name | |
1365 | 1.00M | let rule_value = rules.rules.entry(tag_name).or_default(); |
1366 | ||
1367 | // if the name exists, replace it. Otherwise add the new rule | |
1368 | 2.67M | match |
1369 | 1.00M | None => rule_value.push(speech_pattern), |
1370 | 9 | Some((i, _old_pattern)) => { |
1371 | 9 | let old_rule = &rule_value[i]; |
1372 | 9 | info!("\n\n***WARNING***: replacing {}/'{}' in {} with rule from {}\n", |
1373 | old_rule.tag_name, old_rule.pattern_name, old_rule.file_name, speech_pattern.file_name); | |
1374 | 9 | rule_value[i] = speech_pattern; |
1375 | }, | |
1376 | } | |
1377 | } | |
1378 | ||
1379 | 864k | return Ok(None); |
1380 | 894k | } |
1381 | ||
1382 | 870k | fn is_match(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1383 | 870k | if self.tag_name != mathml.name().local_part() && |
1384 | 0 | return Ok( false ); |
1385 | 870k | } |
1386 | ||
1387 | // debug!("\nis_match: pattern='{}'", self.pattern_name); | |
1388 | // debug!(" pattern_expr {:?}", self.pattern); | |
1389 | // debug!("is_match: mathml is\n{}", mml_to_string(mathml)); | |
1390 | return Ok( | |
1391 | 870k | match self.pattern.evaluate(context, mathml) |
1392 | 652k | Value::Boolean(b) => b, |
1393 | 217k | Value::Nodeset(nodes) => nodes.size() > 0, |
1394 | 0 | _ => false, |
1395 | } | |
1396 | ); | |
1397 | 870k | } |
1398 | } | |
1399 | ||
1400 | ||
1401 | // 'Test' holds information used if the replacement is a "test:" clause. | |
1402 | // The condition is an xpath expr and the "else:" part is optional. | |
1403 | ||
1404 | #[derive(Debug, Clone)] | |
1405 | struct TestArray { | |
1406 | tests: Vec<Test> | |
1407 | } | |
1408 | ||
1409 | impl fmt::Display for TestArray { | |
1410 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1411 | 0 | for test in &self.tests { |
1412 | 0 | writeln!(f, "{test}")?; |
1413 | } | |
1414 | 0 | return Ok( () ); |
1415 | 0 | } |
1416 | } | |
1417 | ||
1418 | impl TestArray { | |
1419 | 3.29M | fn build(test: &Yaml) -> Result<TestArray> { |
1420 | // 'test:' for convenience takes either a dictionary with keys if/else_if/then/then_test/else/else_test or | |
1421 | // or an array of those values (there should be at most one else/else_test) | |
1422 | ||
1423 | // if 'test' is a dictionary ('Hash'), we convert it to an array with one entry and proceed | |
1424 | 3.29M | let tests = if test.as_hash().is_some() { |
1425 | 3.01M | vec![test] |
1426 | 287k | } else if let Some(vec) = test.as_vec() { |
1427 | 287k | vec.iter().collect() |
1428 | } else { | |
1429 | 0 | bail!("Value for 'test:' is neither a dictionary or an array.") |
1430 | }; | |
1431 | ||
1432 | // each entry in 'tests' should be a dictionary with keys if/then/then_test/else/else_test | |
1433 | // a valid entry is one of: | |
1434 | // if:/else_if:, then:/then_test: and optional else:/else_test: | |
1435 | // else:/else_test: -- if this case, it should be the last entry in 'tests' | |
1436 | // 'if:' should only be the first entry in the array; 'else_if' should never be the first entry. Otherwise, they are the same | |
1437 | 3.29M | let mut test_array = vec![]; |
1438 | 3.73M | for test in |
1439 | 3.73M | if test.as_hash().is_none() { |
1440 | 0 | bail!("Value for array entry in 'test:' must be a dictionary/contain keys"); |
1441 | 3.73M | } |
1442 | 3.73M | let if_part = &test[if test_array.is_empty() { |
1443 | 3.73M | if !if_part.is_badvalue() { |
1444 | // first case: if:, then:, optional else: | |
1445 | 3.69M | let condition = Some( MyXPath::build(if_part) |
1446 | 3.69M | let then_part = TestOrReplacements::build(test, "then", "then_test", true) |
1447 | 3.69M | let else_part = TestOrReplacements::build(test, "else", "else_test", false) |
1448 | 3.69M | let n_keys = if else_part.is_none() { |
1449 | 3.69M | if test.as_hash().unwrap().len() > n_keys { |
1450 | 0 | bail!("A key other than 'if', 'else_if', 'then', 'then_test', 'else', or 'else_test' was found in the 'then' clause of 'test'"); |
1451 | 3.69M | }; |
1452 | 3.69M | test_array.push( |
1453 | 3.69M | Test { condition, then_part, else_part } |
1454 | ); | |
1455 | } else { | |
1456 | // second case: should be else/else_test | |
1457 | 42.3k | let else_part = TestOrReplacements::build(test, "else", "else_test", true) |
1458 | 42.3k | if test.as_hash().unwrap().len() > 1 { |
1459 | 0 | bail!("A key other than 'if', 'else_if', 'then', 'then_test', 'else', or 'else_test' was found the 'else' clause of 'test'"); |
1460 | 42.3k | }; |
1461 | 42.3k | test_array.push( |
1462 | 42.3k | Test { condition: None, then_part: None, else_part } |
1463 | ); | |
1464 | ||
1465 | // there shouldn't be any trailing tests | |
1466 | 42.3k | if test_array.len() < test.as_hash().unwrap().len() { |
1467 | 0 | bail!("'else'/'else_test' key is not last key in 'test:'"); |
1468 | 42.3k | } |
1469 | } | |
1470 | }; | |
1471 | ||
1472 | 3.29M | if test_array.is_empty() { |
1473 | 0 | bail!("No entries for 'test:'"); |
1474 | 3.29M | } |
1475 | ||
1476 | 3.29M | return Ok( TestArray { tests: test_array } ); |
1477 | 3.29M | } |
1478 | ||
1479 | 121k | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1480 | 156k | for test in |
1481 | 156k | if test.is_true(&rules_with_context.context_stack.base, mathml) |
1482 | 85.2k | assert!(test.then_part.is_some()); |
1483 | 85.2k | return test.then_part.as_ref().unwrap().replace(rules_with_context, mathml); |
1484 | 71.1k | } else if let Some( |
1485 | 12.9k | return else_part.replace(rules_with_context, mathml); |
1486 | 58.1k | } |
1487 | } | |
1488 | 23.4k | return T::from_string("".to_string(), rules_with_context.doc); |
1489 | 121k | } |
1490 | } | |
1491 | ||
1492 | #[derive(Debug, Clone)] | |
1493 | // Used to hold then/then_test and also else/else_test -- only one of these can be present at a time | |
1494 | enum TestOrReplacements { | |
1495 | Replacements(ReplacementArray), // replacements to use when a test is true | |
1496 | Test(TestArray), // the array of if/then/else tests | |
1497 | } | |
1498 | ||
1499 | impl fmt::Display for TestOrReplacements { | |
1500 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1501 | 0 | if let TestOrReplacements::Test(_) = self { |
1502 | 0 | write!(f, " _test")?; |
1503 | 0 | } |
1504 | 0 | write!(f, ":")?; |
1505 | 0 | return match self { |
1506 | 0 | TestOrReplacements::Test(t) => write!(f, "{t}"), |
1507 | 0 | TestOrReplacements::Replacements(r) => write!(f, "{r}"), |
1508 | }; | |
1509 | 0 | } |
1510 | } | |
1511 | ||
1512 | impl TestOrReplacements { | |
1513 | 7.43M | fn build(test: &Yaml, replace_key: &str, test_key: &str, key_required: bool) -> Result<Option<TestOrReplacements>> { |
1514 | 7.43M | let part = &test[replace_key]; |
1515 | 7.43M | let test_part = &test[test_key]; |
1516 | 7.43M | if !part.is_badvalue() && |
1517 | 0 | bail!(format!("Only one of '{}' or '{}' is allowed as part of 'test'.\n{}\n \ |
1518 | Suggestion: delete one or adjust indentation", | |
1519 | 0 | replace_key, test_key, yaml_to_string(test, 2))); |
1520 | 7.43M | } |
1521 | 7.43M | if part.is_badvalue() && |
1522 | 2.45M | if key_required { |
1523 | 0 | bail!(format!("Missing one of '{}'/'{}:' as part of 'test:'\n{}\n \ |
1524 | Suggestion: add the missing key or indent so it is contained in 'test'", | |
1525 | 0 | replace_key, test_key, yaml_to_string(test, 2))) |
1526 | } else { | |
1527 | 2.45M | return Ok( None ); |
1528 | } | |
1529 | 4.97M | } |
1530 | // at this point, we have only one of the two options | |
1531 | 4.97M | if test_part.is_badvalue() { |
1532 | 4.26M | return Ok( Some( TestOrReplacements::Replacements( ReplacementArray::build(part) |
1533 | } else { | |
1534 | 712k | return Ok( Some( TestOrReplacements::Test( TestArray::build(test_part) |
1535 | } | |
1536 | 7.43M | } |
1537 | ||
1538 | 98.2k | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1539 | 98.2k | return match self { |
1540 | 92.4k | TestOrReplacements::Replacements(r) => r.replace(rules_with_context, mathml), |
1541 | 5.74k | TestOrReplacements::Test(t) => t.replace(rules_with_context, mathml), |
1542 | } | |
1543 | 98.2k | } |
1544 | } | |
1545 | ||
1546 | #[derive(Debug, Clone)] | |
1547 | struct Test { | |
1548 | condition: Option<MyXPath>, | |
1549 | then_part: Option<TestOrReplacements>, | |
1550 | else_part: Option<TestOrReplacements>, | |
1551 | } | |
1552 | impl fmt::Display for Test { | |
1553 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1554 | 0 | write!(f, "test: [ ")?; |
1555 | 0 | if let Some(if_part) = &self.condition { |
1556 | 0 | write!(f, " if: '{if_part}'")?; |
1557 | 0 | } |
1558 | 0 | if let Some(then_part) = &self.then_part { |
1559 | 0 | write!(f, " then{then_part}")?; |
1560 | 0 | } |
1561 | 0 | if let Some(else_part) = &self.else_part { |
1562 | 0 | write!(f, " else{else_part}")?; |
1563 | 0 | } |
1564 | 0 | return write!(f, "]"); |
1565 | 0 | } |
1566 | } | |
1567 | ||
1568 | impl Test { | |
1569 | 156k | fn is_true(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1570 | 156k | return match self.condition.as_ref() { |
1571 | 136 | None => Ok( false ), // trivially false -- want to do else part |
1572 | 156k | Some(condition) => condition.is_true(context, mathml) |
1573 | 156k | .context("Failure in conditional test"), |
1574 | } | |
1575 | 156k | } |
1576 | } | |
1577 | ||
1578 | // Used for speech rules with "variables: ..." | |
1579 | #[derive(Debug, Clone)] | |
1580 | struct VariableDefinition { | |
1581 | name: String, // name of variable | |
1582 | value: MyXPath, // xpath value, typically a constant like "true" or "0", but could be "*/*[1]" to store some nodes | |
1583 | } | |
1584 | ||
1585 | impl fmt::Display for VariableDefinition { | |
1586 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1587 | 0 | return write!(f, "[name: {}={}]", self.name, self.value); |
1588 | 0 | } |
1589 | } | |
1590 | ||
1591 | // Used for speech rules with "variables: ..." | |
1592 | #[derive(Debug)] | |
1593 | struct VariableValue<'v> { | |
1594 | name: String, // name of variable | |
1595 | value: Option<Value<'v>>, // xpath value, typically a constant like "true" or "0", but could be "*/*[1]" to store some nodes | |
1596 | } | |
1597 | ||
1598 | impl fmt::Display for VariableValue<'_> { | |
1599 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1600 | 0 | let value = match &self.value { |
1601 | 0 | None => "unset".to_string(), |
1602 | 0 | Some(val) => format!("{val:?}") |
1603 | }; | |
1604 | 0 | return write!(f, "[name: {}, value: {}]", self.name, value); |
1605 | 0 | } |
1606 | } | |
1607 | ||
1608 | impl VariableDefinition { | |
1609 | 472k | fn build(name_value_def: &Yaml) -> Result<VariableDefinition> { |
1610 | 472k | match name_value_def.as_hash() { |
1611 | 472k | Some(map) => { |
1612 | 472k | if map.len() != 1 { |
1613 | 0 | bail!("definition is not a key/value pair. Found {}", |
1614 | 0 | yaml_to_string(name_value_def, 1) ); |
1615 | 472k | } |
1616 | 472k | let (name, value) = map.iter().next().unwrap(); |
1617 | 472k | let name = as_str_checked( name) |
1618 | 472k | .with_context(|| |
1619 | 472k |
|
1620 | 472k | match value { |
1621 | 472k | Yaml::Boolean(_) | Yaml::String(_) | Yaml::Integer(_) | Yaml::Real(_) => (), |
1622 | 0 | _ => bail!("definition value is not a string, boolean, or number. Found {}", |
1623 | 0 | yaml_to_string(value, 1) ) |
1624 | }; | |
1625 | return Ok( | |
1626 | VariableDefinition{ | |
1627 | 472k | name, |
1628 | 472k | value: MyXPath::build(value) |
1629 | } | |
1630 | ); | |
1631 | }, | |
1632 | 0 | None => bail!("definition is not a key/value pair. Found {}", |
1633 | 0 | yaml_to_string(name_value_def, 1) ) |
1634 | } | |
1635 | 472k | } |
1636 | } | |
1637 | ||
1638 | ||
1639 | #[derive(Debug, Clone)] | |
1640 | struct VariableDefinitions { | |
1641 | defs: Vec<VariableDefinition> | |
1642 | } | |
1643 | ||
1644 | impl fmt::Display for VariableDefinitions { | |
1645 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1646 | 0 | for def in &self.defs { |
1647 | 0 | write!(f, "{def},")?; |
1648 | } | |
1649 | 0 | return Ok( () ); |
1650 | 0 | } |
1651 | } | |
1652 | ||
1653 | struct VariableValues<'v> { | |
1654 | defs: Vec<VariableValue<'v>> | |
1655 | } | |
1656 | ||
1657 | impl fmt::Display for VariableValues<'_> { | |
1658 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1659 | 0 | for value in &self.defs { |
1660 | 0 | write!(f, "{value}")?; |
1661 | } | |
1662 | 0 | return writeln!(f); |
1663 | 0 | } |
1664 | } | |
1665 | ||
1666 | impl VariableDefinitions { | |
1667 | 1.11M | fn new(len: usize) -> VariableDefinitions { |
1668 | 1.11M | return VariableDefinitions{ defs: Vec::with_capacity(len) }; |
1669 | 1.11M | } |
1670 | ||
1671 | 1.11M | fn build(defs: &Yaml) -> Result<VariableDefinitions> { |
1672 | 1.11M | if defs.is_badvalue() { |
1673 | 834k | return Ok( VariableDefinitions::new(0) ); |
1674 | 277k | }; |
1675 | 277k | if defs.is_array() { |
1676 | 277k | let defs = defs.as_vec().unwrap(); |
1677 | 277k | let mut definitions = VariableDefinitions::new(defs.len()); |
1678 | 472k | for def in |
1679 | 472k | let variable_def = VariableDefinition::build(def) |
1680 | 472k | .context("definition of 'variables'") |
1681 | 472k | definitions.push( variable_def); |
1682 | }; | |
1683 | 277k | return Ok (definitions ); |
1684 | 0 | } |
1685 | 0 | bail!( "'variables' is not an array of {{name: xpath-value}} definitions. Found {}'", |
1686 | 0 | yaml_to_string(defs, 1) ); |
1687 | 1.11M | } |
1688 | ||
1689 | 472k | fn push(&mut self, var_def: VariableDefinition) { |
1690 | 472k | self.defs.push(var_def); |
1691 | 472k | } |
1692 | ||
1693 | 241k | fn len(&self) -> usize { |
1694 | 241k | return self.defs.len(); |
1695 | 241k | } |
1696 | } | |
1697 | ||
1698 | struct ContextStack<'c> { | |
1699 | // Note: values are generated by calling value_of on an Evaluation -- that makes the two lifetimes the same | |
1700 | old_values: Vec<VariableValues<'c>>, // store old values so they can be set on pop | |
1701 | base: sxd_xpath::Context<'c> // initial context -- contains all the function defs and pref variables | |
1702 | } | |
1703 | ||
1704 | impl fmt::Display for ContextStack<'_> { | |
1705 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1706 | 0 | writeln!(f, " {} old_values", self.old_values.len())?; |
1707 | 0 | for values in &self.old_values { |
1708 | 0 | writeln!(f, " {values}")?; |
1709 | } | |
1710 | 0 | return writeln!(f); |
1711 | 0 | } |
1712 | } | |
1713 | ||
1714 | impl<'c, 'r> ContextStack<'c> { | |
1715 | 22.7k | fn new<'a,>(pref_manager: &'a PreferenceManager) -> ContextStack<'c> { |
1716 | 22.7k | let prefs = pref_manager.merge_prefs(); |
1717 | 22.7k | let mut context_stack = ContextStack { |
1718 | 22.7k | base: ContextStack::base_context(prefs), |
1719 | 22.7k | old_values: Vec::with_capacity(31) // should avoid allocations |
1720 | 22.7k | }; |
1721 | // FIX: the list of variables to set should come from definitions.yaml | |
1722 | // These can't be set on the <math> tag because of the "translate" command which starts speech at an 'id' | |
1723 | 22.7k | context_stack.base.set_variable("MatchingPause", Value::Boolean(false)); |
1724 | 22.7k | context_stack.base.set_variable("IsColumnSilent", Value::Boolean(false)); |
1725 | ||
1726 | ||
1727 | 22.7k | return context_stack; |
1728 | 22.7k | } |
1729 | ||
1730 | 22.7k | fn base_context(var_defs: PreferenceHashMap) -> sxd_xpath::Context<'c> { |
1731 | 22.7k | let mut context = sxd_xpath::Context::new(); |
1732 | 22.7k | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
1733 | 22.7k | crate::xpath_functions::add_builtin_functions(&mut context); |
1734 | 1.88M | for (key, value) in |
1735 | 1.88M | context.set_variable(key.as_str(), yaml_to_value(&value)); |
1736 | 1.88M | // if let Some(str_value) = value.as_str() { |
1737 | 1.88M | // if str_value != "Auto" { |
1738 | 1.88M | // debug!("Set {}='{}'", key.as_str(), str_value); |
1739 | 1.88M | // } |
1740 | 1.88M | // } |
1741 | 1.88M | }; |
1742 | 22.7k | return context; |
1743 | 22.7k | } |
1744 | ||
1745 | 3.78k | fn set_globals(&'r mut self, new_vars: VariableDefinitions, mathml: Element<'c>) -> Result<()> { |
1746 | // for each var/value pair, evaluate the value and add the var/value to the base context | |
1747 | 4.84k | for def in |
1748 | // set the new value | |
1749 | 4.84k | let new_value = match def.value.evaluate(&self.base, mathml) { |
1750 | 4.84k | Ok(val) => val, |
1751 | 0 | Err(_) => bail!(format!("Can't evaluate variable def for {}", def)), |
1752 | }; | |
1753 | 4.84k | let qname = QName::new(def.name.as_str()); |
1754 | 4.84k | self.base.set_variable(qname, new_value); |
1755 | } | |
1756 | 3.78k | return Ok( () ); |
1757 | 3.78k | } |
1758 | ||
1759 | 27.3k | fn push(&'r mut self, new_vars: VariableDefinitions, mathml: Element<'c>) -> Result<()> { |
1760 | // store the old value and set the new one | |
1761 | 27.3k | let mut old_values = VariableValues {defs: Vec::with_capacity(new_vars.defs.len()) }; |
1762 | 27.3k | let evaluation = Evaluation::new(&self.base, Node::Element(mathml)); |
1763 | 66.9k | for def in |
1764 | 66.9k | // get the old value (might not be defined) |
1765 | 66.9k | let qname = QName::new(def.name.as_str()); |
1766 | 66.9k | let old_value = evaluation.value_of(qname).cloned(); |
1767 | 66.9k | old_values.defs.push( VariableValue{ name: def.name.clone(), value: old_value} ); |
1768 | 66.9k | } |
1769 | ||
1770 | // use a second loop because of borrow problem with self.base and 'evaluation' | |
1771 | 66.9k | for def in |
1772 | // set the new value | |
1773 | 66.9k | let new_value = match def.value.evaluate(&self.base, mathml) { |
1774 | 66.9k | Ok(val) => val, |
1775 | 0 | Err(_) => Value::Nodeset(sxd_xpath::nodeset::Nodeset::new()), |
1776 | }; | |
1777 | 66.9k | let qname = QName::new(def.name.as_str()); |
1778 | 66.9k | self.base.set_variable(qname, new_value); |
1779 | } | |
1780 | 27.3k | self.old_values.push(old_values); |
1781 | 27.3k | return Ok( () ); |
1782 | 27.3k | } |
1783 | ||
1784 | 27.3k | fn pop(&mut self) { |
1785 | const MISSING_VALUE: &str = "-- unset value --"; // can't remove a variable from context, so use this value | |
1786 | 27.3k | let old_values = self.old_values.pop().unwrap(); |
1787 | 66.9k | for variable in |
1788 | 66.9k | let qname = QName::new(&variable.name); |
1789 | 66.9k | let old_value = match variable.value { |
1790 | 22.8k | None => Value::String(MISSING_VALUE.to_string()), |
1791 | 44.1k | Some(val) => val, |
1792 | }; | |
1793 | 66.9k | self.base.set_variable(qname, old_value); |
1794 | } | |
1795 | 27.3k | } |
1796 | } | |
1797 | ||
1798 | ||
1799 | 1.88M | fn yaml_to_value<'b>(yaml: &Yaml) -> Value<'b> { |
1800 | 1.88M | return match yaml { |
1801 | 1.47M | Yaml::String(s) => Value::String(s.clone()), |
1802 | 295k | Yaml::Boolean(b) => Value::Boolean(*b), |
1803 | 31.7k | Yaml::Integer(i) => Value::Number(*i as f64), |
1804 | 91.0k | Yaml::Real(s) => Value::Number(s.parse::<f64>().unwrap()), |
1805 | _ => { | |
1806 | 0 | error!("yaml_to_value: illegal type found in Yaml value: {}", yaml_to_string(yaml, 1)); |
1807 | 0 | Value::String("".to_string()) |
1808 | }, | |
1809 | } | |
1810 | 1.88M | } |
1811 | ||
1812 | ||
1813 | // Information for matching a Unicode char (defined in unicode.yaml) and building its replacement | |
1814 | struct UnicodeDef { | |
1815 | ch: u32, | |
1816 | speech: ReplacementArray | |
1817 | } | |
1818 | ||
1819 | impl fmt::Display for UnicodeDef { | |
1820 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1821 | 0 | return write!(f, "UnicodeDef{{ch: {}, speech: {:?}}}", self.ch, self.speech); |
1822 | 0 | } |
1823 | } | |
1824 | ||
1825 | impl UnicodeDef { | |
1826 | 2.24M | fn build(unicode_def: &Yaml, file_name: &Path, speech_rules: &SpeechRules, use_short: bool) -> Result<Option<Vec<PathBuf>>> { |
1827 | 2.24M | if let Some( |
1828 | 3 | let do_include_fn = |new_file: &Path| { |
1829 | 3 | speech_rules.read_unicode(Some(new_file.to_path_buf()), use_short) |
1830 | 3 | }; |
1831 | 3 | return Ok( Some(process_include(file_name, include_file_name, do_include_fn) |
1832 | 2.24M | } |
1833 | // key: char, value is replacement or array of replacements | |
1834 | 2.24M | let dictionary = unicode_def.as_hash(); |
1835 | 2.24M | if dictionary.is_none() { |
1836 | 0 | bail!("Expected a unicode definition (e.g, '+':[t: \"plus\"]'), found {}", yaml_to_string(unicode_def, 0)); |
1837 | 2.24M | } |
1838 | ||
1839 | 2.24M | let dictionary = dictionary.unwrap(); |
1840 | 2.24M | if dictionary.len() != 1 { |
1841 | 0 | bail!("Expected a unicode definition (e.g, '+':[t: \"plus\"]'), found {}", yaml_to_string(unicode_def, 0)); |
1842 | 2.24M | } |
1843 | ||
1844 | 2.24M | let (ch, replacements) = dictionary.iter().next().ok_or_else(|| |
1845 | 2.24M | let mut unicode_table = if use_short { |
1846 | 1.06M | speech_rules.unicode_short.borrow_mut() |
1847 | } else { | |
1848 | 1.17M | speech_rules.unicode_full.borrow_mut() |
1849 | }; | |
1850 | 2.24M | if let Some(str) = ch.as_str() { |
1851 | 2.24M | if str.is_empty() { |
1852 | 0 | bail!("Empty character definition. Replacement is {}", replacements.as_str().unwrap()); |
1853 | 2.24M | } |
1854 | 2.24M | let mut chars = str.chars(); |
1855 | 2.24M | let first_ch = chars.next().unwrap(); // non-empty string, so a char exists |
1856 | 2.24M | if chars.next().is_some() { // more than one char |
1857 | 54.7k | if str.contains('-') { |
1858 | 38.4k | return process_range(str, replacements, unicode_table); |
1859 | 16.2k | } else if first_ch != '0' { // exclude 0xDDDD |
1860 | 74.5k | for ch in |
1861 | 74.5k | let ch_as_str = ch.to_string(); |
1862 | 74.5k | if unicode_table.insert(ch as u32, ReplacementArray::build(&substitute_ch(replacements, &ch_as_str)) |
1863 | 74.5k | .with_context(|| |
1864 | 0 | error!("*** Character '{}' (0x{:X}) is repeated", ch, ch as u32); |
1865 | 74.5k | } |
1866 | } | |
1867 | 16.2k | return Ok(None); |
1868 | 0 | } |
1869 | 2.18M | } |
1870 | 0 | } |
1871 | ||
1872 | 2.18M | let ch = UnicodeDef::get_unicode_char(ch) |
1873 | 2.18M | if unicode_table.insert(ch, ReplacementArray::build(replacements) |
1874 | 2.18M | .with_context(|| |
1875 | 2.18M |
|
1876 | 147 | error!("*** Character '{}' (0x{:X}) is repeated", |
1877 | 2.18M | } |
1878 | 2.18M | return Ok(None); |
1879 | ||
1880 | 38.4k | fn process_range(def_range: &str, replacements: &Yaml, mut unicode_table: RefMut<HashMap<u32,Vec<Replacement>>>) -> Result<Option<Vec<PathBuf>>> { |
1881 | // should be a character range (e.g., "A-Z") | |
1882 | // iterate over that range and also substitute the char for '.' in the | |
1883 | 38.4k | let mut range = def_range.split('-'); |
1884 | 38.4k | let first = range.next().unwrap().chars().next().unwrap() as u32; |
1885 | 38.4k | let last = range.next().unwrap().chars().next().unwrap() as u32; |
1886 | 38.4k | if range.next().is_some() { |
1887 | 0 | bail!("Character range definition has more than one '-': '{}'", def_range); |
1888 | 38.4k | } |
1889 | ||
1890 | 889k | for ch in |
1891 | 889k | let ch_as_str = char::from_u32(ch).unwrap().to_string(); |
1892 | 889k | unicode_table.insert(ch, ReplacementArray::build(&substitute_ch(replacements, &ch_as_str)) |
1893 | 889k | .with_context(|| |
1894 | }; | |
1895 | ||
1896 | 38.4k | return Ok(None) |
1897 | 38.4k | } |
1898 | ||
1899 | 10.3M | fn substitute_ch(yaml: &Yaml, ch: &str) -> Yaml { |
1900 | 10.3M | return match yaml { |
1901 | 2.34M | Yaml::Array(v) => { |
1902 | Yaml::Array( | |
1903 | 2.34M | v.iter() |
1904 | 3.03M | . |
1905 | 2.34M | .collect::<Vec<Yaml>>() |
1906 | ) | |
1907 | }, | |
1908 | 4.52M | Yaml::Hash(h) => { |
1909 | Yaml::Hash( | |
1910 | 4.52M | h.iter() |
1911 | 6.34M | . |
1912 | 4.52M | .collect::<Hash>() |
1913 | ) | |
1914 | }, | |
1915 | 3.47M | Yaml::String(s) => Yaml::String( s.replace('.', ch) ), |
1916 | 0 | _ => yaml.clone(), |
1917 | } | |
1918 | 10.3M | } |
1919 | 2.24M | } |
1920 | ||
1921 | 2.18M | fn get_unicode_char(ch: &Yaml) -> Result<u32> { |
1922 | // either "a" or 0x1234 (number) | |
1923 | 2.18M | if let Some(ch) = ch.as_str() { |
1924 | 2.18M | let mut ch_iter = ch.chars(); |
1925 | 2.18M | let unicode_ch = ch_iter.next(); |
1926 | 2.18M | if unicode_ch.is_none() || ch_iter.next().is_some() { |
1927 | 0 | bail!("Wanted unicode char, found string '{}')", ch); |
1928 | 2.18M | }; |
1929 | 2.18M | return Ok( unicode_ch.unwrap() as u32 ); |
1930 | 0 | } |
1931 | ||
1932 | 0 | if let Some(num) = ch.as_i64() { |
1933 | 0 | return Ok( num as u32 ); |
1934 | 0 | } |
1935 | 0 | bail!("Unicode character '{}' can't be converted to an code point", yaml_to_string(ch, 0)); |
1936 | 2.18M | } |
1937 | } | |
1938 | ||
1939 | // Fix: there should be a cache so subsequent library calls don't have to read in the same speech rules | |
1940 | // likely a cache of size 1 is fine | |
1941 | // Fix: all statics should be gathered together into one structure that is a Mutex | |
1942 | // for each library call, we should grab a lock on the Mutex in case others try to call | |
1943 | // at the same time. | |
1944 | // If this turns out to be something that others actually do, then a cache > 1 would be good | |
1945 | ||
1946 | type RuleTable = HashMap<String, Vec<Box<SpeechPattern>>>; | |
1947 | type UnicodeTable = Rc<RefCell<HashMap<u32,Vec<Replacement>>>>; | |
1948 | type FilesAndTimesShared = Rc<RefCell<FilesAndTimes>>; | |
1949 | ||
1950 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] | |
1951 | pub enum RulesFor { | |
1952 | Intent, | |
1953 | Speech, | |
1954 | OverView, | |
1955 | Navigation, | |
1956 | Braille, | |
1957 | } | |
1958 | ||
1959 | impl fmt::Display for RulesFor { | |
1960 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1961 | 0 | let name = match self { |
1962 | 0 | RulesFor::Intent => "Intent", |
1963 | 0 | RulesFor::Speech => "Speech", |
1964 | 0 | RulesFor::OverView => "OverView", |
1965 | 0 | RulesFor::Navigation => "Navigation", |
1966 | 0 | RulesFor::Braille => "Braille", |
1967 | }; | |
1968 | 0 | return write!(f, "{name}"); |
1969 | 0 | } |
1970 | } | |
1971 | ||
1972 | ||
1973 | #[derive(Debug, Clone)] | |
1974 | pub struct FileAndTime { | |
1975 | file: PathBuf, | |
1976 | time: SystemTime, | |
1977 | } | |
1978 | ||
1979 | impl FileAndTime { | |
1980 | 0 | fn new(file: PathBuf) -> FileAndTime { |
1981 | 0 | return FileAndTime { |
1982 | 0 | file, |
1983 | 0 | time: SystemTime::UNIX_EPOCH, |
1984 | 0 | } |
1985 | 0 | } |
1986 | ||
1987 | // used for debugging preference settings | |
1988 | 0 | pub fn debug_get_file(&self) -> Option<&str> { |
1989 | 0 | return self.file.to_str(); |
1990 | 0 | } |
1991 | ||
1992 | 8.29k | pub fn new_with_time(file: PathBuf) -> FileAndTime { |
1993 | 8.29k | return FileAndTime { |
1994 | 8.29k | time: FileAndTime::get_metadata(&file), |
1995 | 8.29k | file, |
1996 | 8.29k | } |
1997 | 8.29k | } |
1998 | ||
1999 | 33.7k | pub fn is_up_to_date(&self) -> bool { |
2000 | 33.7k | let file_mod_time = FileAndTime::get_metadata(&self.file); |
2001 | 33.7k | return self.time >= file_mod_time; |
2002 | 33.7k | } |
2003 | ||
2004 | 140k | fn get_metadata(path: &Path) -> SystemTime { |
2005 | use std::fs; | |
2006 | 140k | if !cfg!(target_family = "wasm") { |
2007 | 140k | let metadata = fs::metadata(path); |
2008 | 140k | if let Ok( |
2009 | 120k | let Ok(mod_time) = metadata.modified() { |
2010 | 120k | return mod_time; |
2011 | 20.3k | } |
2012 | 0 | } |
2013 | 20.3k | return SystemTime::UNIX_EPOCH |
2014 | 140k | } |
2015 | ||
2016 | } | |
2017 | #[derive(Debug, Default)] | |
2018 | pub struct FilesAndTimes { | |
2019 | // ft[0] is the main file -- other files are included by it (or recursively) | |
2020 | // We could be a little smarter about invalidation by tracking what file is the parent (including file), | |
2021 | // but it seems more complicated than it is worth | |
2022 | ft: Vec<FileAndTime> | |
2023 | } | |
2024 | ||
2025 | impl FilesAndTimes { | |
2026 | 0 | pub fn new(start_path: PathBuf) -> FilesAndTimes { |
2027 | 0 | let mut ft = Vec::with_capacity(8); |
2028 | 0 | ft.push( FileAndTime::new(start_path) ); |
2029 | 0 | return FilesAndTimes{ ft }; |
2030 | 0 | } |
2031 | ||
2032 | /// Returns true if the main file matches the corresponding preference location and files' times are all current | |
2033 | 33.4k | pub fn is_file_up_to_date(&self, pref_path: &Path, should_ignore_file_time: bool) -> bool { |
2034 | ||
2035 | // if the time isn't set or the path is different from the preference (which might have changed), return false | |
2036 | 33.4k | if self.ft.is_empty() || |
2037 | 5.74k | return false; |
2038 | 27.7k | } |
2039 | 27.7k | if should_ignore_file_time || |
2040 | 26.5k | return true; |
2041 | 1.18k | } |
2042 | 1.18k | if self.ft[0].time == SystemTime::UNIX_EPOCH { |
2043 | 0 | return false; |
2044 | 1.18k | } |
2045 | ||
2046 | ||
2047 | // check the time stamp on the included files -- if the head file hasn't changed, the paths for the included files will be the same | |
2048 | 1.19k | for file in |
2049 | 1.19k | if !file.is_up_to_date() { |
2050 | 1 | return false; |
2051 | 1.19k | } |
2052 | } | |
2053 | 1.18k | return true; |
2054 | 33.4k | } |
2055 | ||
2056 | 19.8k | fn set_files_and_times(&mut self, new_files: Vec<PathBuf>) { |
2057 | 19.8k | self.ft.clear(); |
2058 | 98.4k | for path in |
2059 | 98.4k | let time = FileAndTime::get_metadata(&path); // do before move below |
2060 | 98.4k | self.ft.push( FileAndTime{ file: path, time }) |
2061 | } | |
2062 | 19.8k | } |
2063 | ||
2064 | 28.0k | pub fn as_path(&self) -> &Path { |
2065 | 28.0k | assert!(!self.ft.is_empty()); |
2066 | 28.0k | return &self.ft[0].file; |
2067 | 28.0k | } |
2068 | ||
2069 | 0 | pub fn paths(&self) -> Vec<PathBuf> { |
2070 | 0 | return self.ft.iter().map(|ft| ft.file.clone()).collect::<Vec<PathBuf>>(); |
2071 | 0 | } |
2072 | ||
2073 | } | |
2074 | ||
2075 | ||
2076 | /// `SpeechRulesWithContext` encapsulates a named group of speech rules (e.g, "ClearSpeak") | |
2077 | /// along with the preferences to be used for speech. | |
2078 | // Note: if we can't read the files, an error message is stored in the structure and needs to be checked. | |
2079 | // I tried using Result<SpeechRules>, but it was a mess with all the unwrapping. | |
2080 | // Important: the code needs to be careful to check this at the top level calls | |
2081 | pub struct SpeechRules { | |
2082 | error: String, | |
2083 | name: RulesFor, | |
2084 | pub pref_manager: Rc<RefCell<PreferenceManager>>, | |
2085 | rules: RuleTable, // the speech rules used (partitioned into MathML tags in hashmap, then linearly searched) | |
2086 | rule_files: FilesAndTimes, // files that were read | |
2087 | translate_single_chars_only: bool, // strings like "half" don't want 'a's translated, but braille does | |
2088 | unicode_short: UnicodeTable, // the short list of rules used for Unicode characters | |
2089 | unicode_short_files: FilesAndTimesShared, // files that were read | |
2090 | unicode_full: UnicodeTable, // the long remaining rules used for Unicode characters | |
2091 | unicode_full_files: FilesAndTimesShared, // files that were read | |
2092 | definitions_files: FilesAndTimesShared, // files that were read | |
2093 | } | |
2094 | ||
2095 | impl fmt::Display for SpeechRules { | |
2096 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
2097 | 0 | writeln!(f, "SpeechRules '{}'\n{})", self.name, self.pref_manager.borrow())?; |
2098 | 0 | let mut rules_vec: Vec<(&String, &Vec<Box<SpeechPattern>>)> = self.rules.iter().collect(); |
2099 | 0 | rules_vec.sort_by_key(|(tag_name, _)| tag_name.as_str()); |
2100 | 0 | for (tag_name, rules) in rules_vec { |
2101 | 0 | writeln!(f, " {}: #patterns {}", tag_name, rules.len())?; |
2102 | }; | |
2103 | 0 | return writeln!(f, " {}+{} unicode entries", &self.unicode_short.borrow().len(), &self.unicode_full.borrow().len()); |
2104 | 0 | } |
2105 | } | |
2106 | ||
2107 | ||
2108 | /// `SpeechRulesWithContext` encapsulates a named group of speech rules (e.g, "ClearSpeak") | |
2109 | /// along with the preferences to be used for speech. | |
2110 | /// Because speech rules can define variables, there is also a context that is carried with them | |
2111 | pub struct SpeechRulesWithContext<'c, 's:'c, 'm:'c> { | |
2112 | speech_rules: &'s SpeechRules, | |
2113 | context_stack: ContextStack<'c>, // current value of (context) variables | |
2114 | doc: Document<'m>, | |
2115 | nav_node_id: &'m str, | |
2116 | nav_node_offset: usize, | |
2117 | pub inside_spell: bool, // hack to allow 'spell' to avoid infinite loop (see 'spell' implementation in tts.rs) | |
2118 | pub translate_count: usize, // hack to avoid 'translate' infinite loop (see 'spell' implementation in tts.rs) | |
2119 | } | |
2120 | ||
2121 | impl<'c, 's:'c, 'm:'c> fmt::Display for SpeechRulesWithContext<'c, 's,'m> { | |
2122 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
2123 | 0 | writeln!(f, "SpeechRulesWithContext \n{})", self.speech_rules)?; |
2124 | 0 | return writeln!(f, " {} context entries, nav node id '({}, {})'", &self.context_stack, self.nav_node_id, self.nav_node_offset); |
2125 | 0 | } |
2126 | } | |
2127 | ||
2128 | thread_local!{ | |
2129 | /// SPEECH_UNICODE_SHORT is shared among several rules, so "RC" is used | |
2130 | static SPEECH_UNICODE_SHORT: UnicodeTable = | |
2131 | Rc::new( RefCell::new( HashMap::with_capacity(500) ) ); | |
2132 | ||
2133 | /// SPEECH_UNICODE_FULL is shared among several rules, so "RC" is used | |
2134 | static SPEECH_UNICODE_FULL: UnicodeTable = | |
2135 | Rc::new( RefCell::new( HashMap::with_capacity(6500) ) ); | |
2136 | ||
2137 | /// BRAILLE_UNICODE_SHORT is shared among several rules, so "RC" is used | |
2138 | static BRAILLE_UNICODE_SHORT: UnicodeTable = | |
2139 | Rc::new( RefCell::new( HashMap::with_capacity(500) ) ); | |
2140 | ||
2141 | /// BRAILLE_UNICODE_FULL is shared among several rules, so "RC" is used | |
2142 | static BRAILLE_UNICODE_FULL: UnicodeTable = | |
2143 | Rc::new( RefCell::new( HashMap::with_capacity(5000) ) ); | |
2144 | ||
2145 | /// SPEECH_DEFINITION_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2146 | static SPEECH_DEFINITION_FILES_AND_TIMES: FilesAndTimesShared = | |
2147 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2148 | ||
2149 | /// BRAILLE_DEFINITION_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2150 | static BRAILLE_DEFINITION_FILES_AND_TIMES: FilesAndTimesShared = | |
2151 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2152 | ||
2153 | /// SPEECH_UNICODE_SHORT_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2154 | static SPEECH_UNICODE_SHORT_FILES_AND_TIMES: FilesAndTimesShared = | |
2155 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2156 | ||
2157 | /// SPEECH_UNICODE_FULL_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2158 | static SPEECH_UNICODE_FULL_FILES_AND_TIMES: FilesAndTimesShared = | |
2159 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2160 | ||
2161 | /// BRAILLE_UNICODE_SHORT_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2162 | static BRAILLE_UNICODE_SHORT_FILES_AND_TIMES: FilesAndTimesShared = | |
2163 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2164 | ||
2165 | /// BRAILLE_UNICODE_FULL_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2166 | static BRAILLE_UNICODE_FULL_FILES_AND_TIMES: FilesAndTimesShared = | |
2167 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2168 | ||
2169 | /// The current set of speech rules | |
2170 | // maybe this should be a small cache of rules in case people switch rules/prefs? | |
2171 | pub static INTENT_RULES: RefCell<SpeechRules> = | |
2172 | RefCell::new( SpeechRules::new(RulesFor::Intent, true) ); | |
2173 | ||
2174 | pub static SPEECH_RULES: RefCell<SpeechRules> = | |
2175 | RefCell::new( SpeechRules::new(RulesFor::Speech, true) ); | |
2176 | ||
2177 | pub static OVERVIEW_RULES: RefCell<SpeechRules> = | |
2178 | RefCell::new( SpeechRules::new(RulesFor::OverView, true) ); | |
2179 | ||
2180 | pub static NAVIGATION_RULES: RefCell<SpeechRules> = | |
2181 | RefCell::new( SpeechRules::new(RulesFor::Navigation, true) ); | |
2182 | ||
2183 | pub static BRAILLE_RULES: RefCell<SpeechRules> = | |
2184 | RefCell::new( SpeechRules::new(RulesFor::Braille, false) ); | |
2185 | } | |
2186 | ||
2187 | impl SpeechRules { | |
2188 | 8.16k | pub fn new(name: RulesFor, translate_single_chars_only: bool) -> SpeechRules { |
2189 | 8.16k | let globals = if name == RulesFor::Braille { |
2190 | 1.35k | ( |
2191 | 1.35k | (BRAILLE_UNICODE_SHORT.with(Rc::clone), BRAILLE_UNICODE_SHORT_FILES_AND_TIMES.with(Rc::clone)), |
2192 | 1.35k | (BRAILLE_UNICODE_FULL. with(Rc::clone), BRAILLE_UNICODE_FULL_FILES_AND_TIMES.with(Rc::clone)), |
2193 | 1.35k | BRAILLE_DEFINITION_FILES_AND_TIMES.with(Rc::clone), |
2194 | 1.35k | ) |
2195 | } else { | |
2196 | 6.80k | ( |
2197 | 6.80k | (SPEECH_UNICODE_SHORT.with(Rc::clone), SPEECH_UNICODE_SHORT_FILES_AND_TIMES.with(Rc::clone)), |
2198 | 6.80k | (SPEECH_UNICODE_FULL. with(Rc::clone), SPEECH_UNICODE_FULL_FILES_AND_TIMES.with(Rc::clone)), |
2199 | 6.80k | SPEECH_DEFINITION_FILES_AND_TIMES.with(Rc::clone), |
2200 | 6.80k | ) |
2201 | }; | |
2202 | ||
2203 | return SpeechRules { | |
2204 | 8.16k | error: Default::default(), |
2205 | 8.16k | name, |
2206 | 8.16k | rules: HashMap::with_capacity(if name == RulesFor::Intent || |
2207 | 8.16k | rule_files: FilesAndTimes::default(), |
2208 | 8.16k | unicode_short: globals.0.0, // lazy load them |
2209 | 8.16k | unicode_short_files: globals.0.1, |
2210 | 8.16k | unicode_full: globals.1.0, // lazy load them |
2211 | 8.16k | unicode_full_files: globals.1.1, |
2212 | 8.16k | definitions_files: globals.2, |
2213 | 8.16k | translate_single_chars_only, |
2214 | 8.16k | pref_manager: PreferenceManager::get(), |
2215 | }; | |
2216 | 8.16k | } |
2217 | ||
2218 | 17.7k | pub fn get_error(&self) -> Option<&str> { |
2219 | 17.7k | return if self.error.is_empty() { |
2220 | 17.7k | None |
2221 | } else { | |
2222 | 0 | Some(&self.error) |
2223 | } | |
2224 | 17.7k | } |
2225 | ||
2226 | 15.3k | pub fn read_files(&mut self) -> Result<()> { |
2227 | 15.3k | let check_rule_files = self.pref_manager.borrow().pref_to_string("CheckRuleFiles"); |
2228 | 15.3k | if check_rule_files != "None" { // "Prefs" or "All" are other values |
2229 | 15.3k | self.pref_manager.borrow_mut().set_preference_files() |
2230 | 2 | } |
2231 | 15.3k | let should_ignore_file_time = self.pref_manager.borrow().pref_to_string("CheckRuleFiles") != "All"; // ignore for "None", "Prefs" |
2232 | 15.3k | let rule_file = self.pref_manager.borrow().get_rule_file(&self.name).to_path_buf(); // need to create PathBuf to avoid a move/use problem |
2233 | 15.3k | if self.rules.is_empty() || |
2234 | 8.35k | self.rules.clear(); |
2235 | 8.35k | let files_read = self.read_patterns(&rule_file) |
2236 | 8.35k | self.rule_files.set_files_and_times(files_read); |
2237 | 6.94k | } |
2238 | ||
2239 | 15.3k | let pref_manager = self.pref_manager.borrow(); |
2240 | 15.3k | let unicode_pref_files = if self.name == RulesFor::Braille { |
2241 | ||
2242 | 15.3k | if !self.unicode_short_files.borrow().is_file_up_to_date(unicode_pref_files.0, should_ignore_file_time) { |
2243 | 5.50k | self.unicode_short.borrow_mut().clear(); |
2244 | 5.50k | self.unicode_short_files.borrow_mut().set_files_and_times(self.read_unicode(None, true) |
2245 | 9.80k | } |
2246 | ||
2247 | 15.3k | if self.definitions_files.borrow().ft.is_empty() || |
2248 | 9.82k | pref_manager.get_definitions_file(self.name != RulesFor::Braille), |
2249 | 9.82k | should_ignore_file_time |
2250 | 9.82k | ) { |
2251 | 5.49k | self.definitions_files.borrow_mut().set_files_and_times(read_definitions_file(self.name != RulesFor::Braille) |
2252 | 9.80k | } |
2253 | 15.3k | return Ok( () ); |
2254 | 15.3k | } |
2255 | ||
2256 | 38.6k | fn read_patterns(&mut self, path: &Path) -> Result<Vec<PathBuf>> { |
2257 | // info!("Reading rule file: {}", p.to_str().unwrap()); | |
2258 | 38.6k | let rule_file_contents = read_to_string_shim(path).with_context(|| |
2259 | 38.6k | let rules_build_fn = |pattern: &Yaml| { |
2260 | 38.6k | self.build_speech_patterns(pattern, path) |
2261 | 38.6k | .with_context(|| |
2262 | 38.6k | }; |
2263 | 38.6k | return compile_rule(&rule_file_contents, rules_build_fn) |
2264 | 38.6k | .with_context(|| |
2265 | 38.6k | } |
2266 | ||
2267 | 38.6k | fn build_speech_patterns(&mut self, patterns: &Yaml, file_name: &Path) -> Result<Vec<PathBuf>> { |
2268 | // Rule::SpeechPatternList | |
2269 | 38.6k | let patterns_vec = patterns.as_vec(); |
2270 | 38.6k | if patterns_vec.is_none() { |
2271 | 0 | bail!(yaml_type_err(patterns, "array")); |
2272 | 38.6k | } |
2273 | 38.6k | let patterns_vec = patterns.as_vec().unwrap(); |
2274 | 38.6k | let mut files_read = vec![file_name.to_path_buf()]; |
2275 | 894k | for entry in |
2276 | 894k | if let Some( |
2277 | 30.2k | files_read.append(&mut added_files); |
2278 | 864k | } |
2279 | } | |
2280 | 38.6k | return Ok(files_read) |
2281 | 38.6k | } |
2282 | ||
2283 | 5.97k | fn read_unicode(&self, path: Option<PathBuf>, use_short: bool) -> Result<Vec<PathBuf>> { |
2284 | 5.97k | let path = match path { |
2285 | 3 | Some(p) => p, |
2286 | None => { | |
2287 | // get the path to either the short or long unicode file | |
2288 | 5.97k | let pref_manager = self.pref_manager.borrow(); |
2289 | 5.97k | let unicode_files = if self.name == RulesFor::Braille { |
2290 | 1.57k | pref_manager.get_braille_unicode_file() |
2291 | } else { | |
2292 | 4.40k | pref_manager.get_speech_unicode_file() |
2293 | }; | |
2294 | 5.97k | let unicode_files = if use_short { |
2295 | 5.97k | unicode_files.to_path_buf() |
2296 | } | |
2297 | }; | |
2298 | ||
2299 | // FIX: should read first (lang), then supplement with second (region) | |
2300 | // info!("Reading unicode file {}", path.to_str().unwrap()); | |
2301 | 5.97k | let unicode_file_contents = read_to_string_shim(&path) |
2302 | 5.97k | let unicode_build_fn = |unicode_def_list: &Yaml| { |
2303 | 5.97k | let unicode_defs = unicode_def_list.as_vec(); |
2304 | 5.97k | if unicode_defs.is_none() { |
2305 | 0 | bail!("File '{}' does not begin with an array", yaml_to_type(unicode_def_list)); |
2306 | 5.97k | }; |
2307 | 5.97k | let mut files_read = vec![path.to_path_buf()]; |
2308 | 2.24M | for unicode_def in |
2309 | 2.24M | if let Some( |
2310 | 2.24M | .with_context(|| |
2311 | 3 | files_read.append(&mut added_files); |
2312 | 2.24M | } |
2313 | }; | |
2314 | 5.97k | return Ok(files_read) |
2315 | 5.97k | }; |
2316 | ||
2317 | 5.97k | return compile_rule(&unicode_file_contents, unicode_build_fn) |
2318 | 5.97k | .with_context(|| |
2319 | 5.97k | } |
2320 | ||
2321 | 0 | pub fn print_sizes() -> String { |
2322 | // let _ = &SPEECH_RULES.with_borrow(|rules| { | |
2323 | // debug!("SPEECH RULES entries\n"); | |
2324 | // let rules = &rules.rules; | |
2325 | // for (key, _) in rules.iter() { | |
2326 | // debug!("key: {}", key); | |
2327 | // } | |
2328 | // }); | |
2329 | 0 | let mut answer = rule_size(&SPEECH_RULES, "SPEECH_RULES"); |
2330 | 0 | answer += &rule_size(&INTENT_RULES, "INTENT_RULES"); |
2331 | 0 | answer += &rule_size(&BRAILLE_RULES, "BRAILLE_RULES"); |
2332 | 0 | answer += &rule_size(&NAVIGATION_RULES, "NAVIGATION_RULES"); |
2333 | 0 | answer += &rule_size(&OVERVIEW_RULES, "OVERVIEW_RULES"); |
2334 | 0 | SPEECH_RULES.with_borrow(|rule| { |
2335 | 0 | answer += &format!("Speech Unicode tables: short={}/{}, long={}/{}\n", |
2336 | 0 | rule.unicode_short.borrow().len(), rule.unicode_short.borrow().capacity(), |
2337 | 0 | rule.unicode_full.borrow().len(), rule.unicode_full.borrow().capacity()); |
2338 | 0 | }); |
2339 | 0 | BRAILLE_RULES.with_borrow(|rule| { |
2340 | 0 | answer += &format!("Braille Unicode tables: short={}/{}, long={}/{}\n", |
2341 | 0 | rule.unicode_short.borrow().len(), rule.unicode_short.borrow().capacity(), |
2342 | 0 | rule.unicode_full.borrow().len(), rule.unicode_full.borrow().capacity()); |
2343 | 0 | }); |
2344 | 0 | return answer; |
2345 | ||
2346 | 0 | fn rule_size(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, name: &str) -> String { |
2347 | 0 | rules.with_borrow(|rule| { |
2348 | 0 | let hash_map = &rule.rules; |
2349 | 0 | return format!("{}: {}/{}\n", name, hash_map.len(), hash_map.capacity()); |
2350 | 0 | }) |
2351 | 0 | } |
2352 | 0 | } |
2353 | } | |
2354 | ||
2355 | ||
2356 | /// We track three different lifetimes: | |
2357 | /// 'c -- the lifetime of the context and mathml | |
2358 | /// 's -- the lifetime of the speech rules (which is static) | |
2359 | /// 'r -- the lifetime of the reference (this seems to be key to keep the rust memory checker happy) | |
2360 | impl<'c, 's:'c, 'r, 'm:'c> SpeechRulesWithContext<'c, 's,'m> { | |
2361 | 22.7k | pub fn new(speech_rules: &'s SpeechRules, doc: Document<'m>, nav_node_id: &'m str, nav_node_offset: usize) -> SpeechRulesWithContext<'c, 's, 'm> { |
2362 | 22.7k | return SpeechRulesWithContext { |
2363 | 22.7k | speech_rules, |
2364 | 22.7k | context_stack: ContextStack::new(&speech_rules.pref_manager.borrow()), |
2365 | 22.7k | doc, |
2366 | 22.7k | nav_node_id, |
2367 | 22.7k | nav_node_offset, |
2368 | 22.7k | inside_spell: false, |
2369 | 22.7k | translate_count: 0, |
2370 | 22.7k | } |
2371 | 22.7k | } |
2372 | ||
2373 | 1.84k | pub fn get_rules(&mut self) -> &SpeechRules { |
2374 | 1.84k | return self.speech_rules; |
2375 | 1.84k | } |
2376 | ||
2377 | 45.5k | pub fn get_context(&mut self) -> &mut sxd_xpath::Context<'c> { |
2378 | 45.5k | return &mut self.context_stack.base; |
2379 | 45.5k | } |
2380 | ||
2381 | 3.23k | pub fn get_document(&mut self) -> Document<'m> { |
2382 | 3.23k | return self.doc; |
2383 | 3.23k | } |
2384 | ||
2385 | 1.13k | pub fn set_nav_node_offset(&mut self, offset: usize) { |
2386 | // debug!("Setting nav node offset to {}", offset); | |
2387 | 1.13k | self.nav_node_offset = offset; |
2388 | 1.13k | } |
2389 | ||
2390 | 121k | pub fn match_pattern<T:TreeOrString<'c, 'm, T>>(&'r mut self, mathml: Element<'c>) -> Result<T> { |
2391 | // debug!("Looking for a match for: \n{}", mml_to_string(mathml)); | |
2392 | 121k | let tag_name = mathml.name().local_part(); |
2393 | 121k | let rules = &self.speech_rules.rules; |
2394 | ||
2395 | // start with priority rules that apply to any node (should be a very small number) | |
2396 | 121k | if let Some( |
2397 | 95.8k | let Some( |
2398 | 3.18k | return Ok(result); // found a match |
2399 | 118k | } |
2400 | ||
2401 | 118k | if let Some( |
2402 | 116k | let Some( |
2403 | 82.1k | return Ok(result); // found a match |
2404 | 35.9k | } |
2405 | ||
2406 | // no rules for specific element, fall back to rules for "*" which *should* be present in all rule files as fallback | |
2407 | 35.9k | if let Some(rule_vector) = rules.get("*") && |
2408 | 35.9k | let Some(result) = self.find_match(rule_vector, mathml) |
2409 | 35.9k | return Ok(result); // found a match |
2410 | 0 | } |
2411 | ||
2412 | // no rules matched -- poorly written rule file -- let flow through to default error | |
2413 | // report error message with file name | |
2414 | 0 | let speech_manager = self.speech_rules.pref_manager.borrow(); |
2415 | 0 | let file_name = speech_manager.get_rule_file(&self.speech_rules.name); |
2416 | // FIX: handle error appropriately | |
2417 | 0 | bail!("\nNo match found!\nMissing patterns in {} for MathML.\n{}", file_name.to_string_lossy(), mml_to_string(mathml)); |
2418 | 121k | } |
2419 | ||
2420 | 248k | fn find_match<T:TreeOrString<'c, 'm, T>>(&'r mut self, rule_vector: &[Box<SpeechPattern>], mathml: Element<'c>) -> Result<Option<T>> { |
2421 | 870k | for pattern in |
2422 | // debug!("Pattern name: {}", pattern.pattern_name); | |
2423 | // always pushing and popping around the is_match would be a little cleaner, but push/pop is relatively expensive, | |
2424 | // so we optimize and only push first if the variables are needed to do the match | |
2425 | 870k | if pattern.match_uses_var_defs { |
2426 | 7.05k | self.context_stack.push(pattern.var_defs.clone(), mathml) |
2427 | 863k | } |
2428 | 870k | if pattern.is_match(&self.context_stack.base, mathml) |
2429 | 870k | .with_context(|| |
2430 | // debug!(" find_match: FOUND!!!"); | |
2431 | 121k | if !pattern.match_uses_var_defs && |
2432 | 13.0k | self.context_stack.push(pattern.var_defs.clone(), mathml) |
2433 | 108k | } |
2434 | 121k | let result = if self.nav_node_offset > 0 && |
2435 | 47 | self.nav_node_id == mathml.attribute_value("id").unwrap_or_default() && |
2436 | 7 | let ch = crate::canonicalize::as_text(mathml).chars().nth(self.nav_node_offset-1).unwrap_or_default(); |
2437 | 7 | let ch = self.replace_single_char(ch, mathml) |
2438 | // debug!("find_match: ch={} from '{}'; matched pattern name/tag: {}/{} with nav_node_offset={}", | |
2439 | // ch, crate::canonicalize::as_text(mathml), | |
2440 | // pattern.pattern_name, pattern.tag_name, self.nav_node_offset); | |
2441 | 7 | T::from_string(ch.to_string(), self.doc) |
2442 | } else { | |
2443 | 121k | pattern.replacements.replace(self, mathml) |
2444 | }; | |
2445 | 121k | if pattern.var_defs.len() > 0 { |
2446 | 14.5k | self.context_stack.pop(); |
2447 | 106k | } |
2448 | 121k | return match result { |
2449 | 121k | Ok(s) => { |
2450 | // for all except braille and navigation, nav_node_id will be an empty string and will not match | |
2451 | 121k | if self.nav_node_id.is_empty() { |
2452 | 102k | Ok( Some(s) ) |
2453 | } else { | |
2454 | 18.5k | if self.nav_node_id == mathml.attribute_value("id").unwrap_or_default() { |
2455 | 18.5k | Ok ( Some(self.nav_node_adjust(s, mathml)) ) |
2456 | } | |
2457 | }, | |
2458 | 9 | Err(e) => Err( e.context( |
2459 | 9 | format!( |
2460 | 9 | "attempting replacement pattern: \"{}\" for \"{}\".\n\ |
2461 | 9 | Replacement\n{}\n...due to matching the MathML\n{} with the pattern\n\ |
2462 | 9 | {}\n\ |
2463 | 9 | The patterns are in {}.\n", |
2464 | 9 | pattern.pattern_name, pattern.tag_name, |
2465 | 9 | pattern.replacements.pretty_print_replacements(), |
2466 | 9 | mml_to_string(mathml), pattern.pattern, |
2467 | 9 | pattern.file_name |
2468 | 9 | ) |
2469 | 9 | )) |
2470 | } | |
2471 | 749k | } else if pattern.match_uses_var_defs { |
2472 | 5.60k | self.context_stack.pop(); |
2473 | 743k | } |
2474 | }; | |
2475 | 127k | return Ok(None); // no matches |
2476 | ||
2477 | 0 | fn error_string(pattern: &SpeechPattern, mathml: Element) -> String { |
2478 | 0 | return format!( |
2479 | "error during pattern match using: \"{}\" for \"{}\".\n\ | |
2480 | Pattern is \n{}\nMathML for the match:\n\ | |
2481 | {}\ | |
2482 | The patterns are in {}.\n", | |
2483 | pattern.pattern_name, pattern.tag_name, | |
2484 | pattern.pattern, | |
2485 | 0 | mml_to_string(mathml), |
2486 | pattern.file_name | |
2487 | ); | |
2488 | 0 | } |
2489 | ||
2490 | 248k | } |
2491 | ||
2492 | 18.5k | fn nav_node_adjust<T:TreeOrString<'c, 'm, T>>(&self, speech: T, mathml: Element<'c>) -> T { |
2493 | 18.5k | if let Some(id) = mathml.attribute_value("id") && |
2494 | 18.5k | self.nav_node_id == id { |
2495 | 990 | let offset = mathml.attribute_value(crate::navigate::ID_OFFSET).unwrap_or("0"); |
2496 | 990 | debug!("nav_node_adjust: id/name='{}/{}' offset?='{}'", id, |
2497 | 0 | self.nav_node_offset.to_string().as_str() == offset |
2498 | ); | |
2499 | 990 | if is_leaf(mathml) || |
2500 | 990 | if self.speech_rules.name == RulesFor::Braille { |
2501 | 469 | let highlight_style = self.speech_rules.pref_manager.borrow().pref_to_string("BrailleNavHighlight"); |
2502 | 469 | return T::highlight_braille(speech, highlight_style); |
2503 | } else { | |
2504 | 521 | debug!("nav_node_adjust: id='{}' offset='{}/{}'", id, self.nav_node_offset, offset); |
2505 | 521 | return T::mark_nav_speech(speech) |
2506 | } | |
2507 | 0 | } |
2508 | 17.5k | } |
2509 | 17.5k | return speech; |
2510 | 18.5k | } |
2511 | ||
2512 | 469 | fn highlight_braille_string(braille: String, highlight_style: String) -> String { |
2513 | // add dots 7 & 8 to the Unicode braille (28xx) | |
2514 | 469 | if &highlight_style == "Off" || braille.is_empty() { |
2515 | 6 | return braille; |
2516 | 463 | } |
2517 | ||
2518 | // FIX: this seems needlessly complex. It is much simpler if the char can be changed in place... | |
2519 | // find first char that can get the dots and add them | |
2520 | 463 | let mut chars = braille.chars().collect::<Vec<char>>(); |
2521 | ||
2522 | // the 'b' for baseline indicator is really part of the previous token, so it needs to be highlighted but isn't because it is not Unicode braille | |
2523 | 463 | let baseline_indicator_hack = PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth"; |
2524 | // debug!("highlight_braille_string: highlight_style={}\n braille={}", highlight_style, braille); | |
2525 | 463 | let mut i_first_modified = 0; |
2526 | 760 | for (i, ch) in |
2527 | 760 | let modified_ch = add_dots_to_braille_char(*ch, baseline_indicator_hack); |
2528 | 760 | if *ch != modified_ch { |
2529 | 463 | *ch = modified_ch; |
2530 | 463 | i_first_modified = i; |
2531 | 463 | break; |
2532 | 297 | }; |
2533 | }; | |
2534 | ||
2535 | 463 | let mut i_last_modified = i_first_modified; |
2536 | 463 | if &highlight_style != "FirstChar" { |
2537 | // find last char so that we know when to modify the char | |
2538 | 491 | for i in ( |
2539 | 491 | let ch = chars[i]; |
2540 | 491 | let modified_ch = add_dots_to_braille_char(ch, baseline_indicator_hack); |
2541 | 491 | chars[i] = modified_ch; |
2542 | 491 | if ch != modified_ch { |
2543 | 390 | i_last_modified = i; |
2544 | 390 | break; |
2545 | 101 | } |
2546 | } | |
2547 | 0 | } |
2548 | ||
2549 | 463 | if &highlight_style == "All" { |
2550 | // finish going through the string | |
2551 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
2552 | 4 | for |
2553 | 0 | chars[i] = add_dots_to_braille_char(chars[i], baseline_indicator_hack); |
2554 | 0 | }; |
2555 | 459 | } |
2556 | ||
2557 | 463 | let result = chars.into_iter().collect::<String>(); |
2558 | // debug!(" result={}", result); | |
2559 | 463 | return result; |
2560 | ||
2561 | 1.25k | fn add_dots_to_braille_char(ch: char, baseline_indicator_hack: bool) -> char { |
2562 | 1.25k | let as_u32 = ch as u32; |
2563 | 1.25k | if (0x2800..0x28FF).contains(&as_u32) { |
2564 | 919 | return unsafe {char::from_u32_unchecked(as_u32 | 0xC0)}; // safe because we have checked the range |
2565 | 332 | } else if baseline_indicator_hack && |
2566 | 7 | return '𝑏' |
2567 | } else { | |
2568 | 325 | return ch; |
2569 | } | |
2570 | 1.25k | } |
2571 | 469 | } |
2572 | ||
2573 | 521 | fn mark_nav_speech(speech: String) -> String { |
2574 | // add unique markers (since speech is mostly ascii letters and digits, most any symbol will do) | |
2575 | // it's a bug (but happened during intent generation), we might have identical id's, choose innermost one | |
2576 | 521 | debug!("mark_nav_speech: adding [[ {} ]] ", |
2577 | 521 | if !speech.contains("[[") { |
2578 | 521 | return "[[".to_string() + &speech + "]]"; |
2579 | } else { | |
2580 | 0 | return speech |
2581 | } | |
2582 | 521 | } |
2583 | ||
2584 | 456k | fn replace<T:TreeOrString<'c, 'm, T>>(&'r mut self, replacement: &Replacement, mathml: Element<'c>) -> Result<T> { |
2585 | return Ok( | |
2586 | 456k | match replacement { |
2587 | 63.6k | Replacement::Text(t) => T::from_string(t.clone(), self.doc) |
2588 | 151k | Replacement::XPath(xpath) => xpath.replace(self, mathml) |
2589 | 60.7k | Replacement::TTS(tts) => { |
2590 | 60.7k | T::from_string( |
2591 | 60.7k | self.speech_rules.pref_manager.borrow().get_tts().replace(tts, &self.speech_rules.pref_manager.borrow(), self, mathml) |
2592 | 60.7k | self.doc |
2593 | 0 | )? |
2594 | }, | |
2595 | 45.5k | Replacement::Intent(intent) => { |
2596 | 45.5k | intent.replace(self, mathml) |
2597 | }, | |
2598 | 115k | Replacement::Test(test) => { |
2599 | 115k | test.replace(self, mathml) |
2600 | }, | |
2601 | 7.28k | Replacement::With(with) => { |
2602 | 7.28k | with.replace(self, mathml) |
2603 | }, | |
2604 | 3.78k | Replacement::SetVariables(vars) => { |
2605 | 3.78k | vars.replace(self, mathml) |
2606 | }, | |
2607 | 7.45k | Replacement::Insert(ic) => { |
2608 | 7.45k | ic.replace(self, mathml) |
2609 | }, | |
2610 | 2 | Replacement::Translate(id) => { |
2611 | 2 | id.replace(self, mathml) |
2612 | }, | |
2613 | } | |
2614 | ) | |
2615 | 456k | } |
2616 | ||
2617 | /// Iterate over all the nodes, concatenating the result strings together with a ' ' between them | |
2618 | /// If the node is an element, pattern match it | |
2619 | /// For 'Text' and 'Attribute' nodes, convert them to strings | |
2620 | 121k | fn replace_nodes<T:TreeOrString<'c, 'm, T>>(&'r mut self, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<T> { |
2621 | 121k | return T::replace_nodes(self, nodes, mathml); |
2622 | 121k | } |
2623 | ||
2624 | /// Iterate over all the nodes finding matches for the elements | |
2625 | /// For this case of returning MathML, everything else is an error | |
2626 | 48.6k | fn replace_nodes_tree(&'r mut self, nodes: Vec<Node<'c>>, _mathml: Element<'c>) -> Result<Element<'m>> { |
2627 | 48.6k | let mut children = Vec::with_capacity(3*nodes.len()); // guess (2 chars/node + space) |
2628 | 69.6k | for node in |
2629 | 69.6k | let matched = match node { |
2630 | 41.9k | Node::Element(n) => self.match_pattern::<Element<'m>>(n) |
2631 | 27.5k | Node::Text(t) => { |
2632 | 27.5k | let leaf = create_mathml_element(&self.doc, "TEMP_NAME"); |
2633 | 27.5k | leaf.set_text(t.text()); |
2634 | 27.5k | leaf |
2635 | }, | |
2636 | 32 | Node::Attribute(attr) => { |
2637 | // debug!(" from attr with text '{}'", attr.value()); | |
2638 | 32 | let leaf = create_mathml_element(&self.doc, "TEMP_NAME"); |
2639 | 32 | leaf.set_text(attr.value()); |
2640 | 32 | leaf |
2641 | }, | |
2642 | _ => { | |
2643 | 0 | bail!("replace_nodes: found unexpected node type!!!"); |
2644 | }, | |
2645 | }; | |
2646 | 69.6k | children.push(matched); |
2647 | } | |
2648 | ||
2649 | 48.6k | let result = create_mathml_element(&self.doc, "TEMP_NAME"); // FIX: what name should be used? |
2650 | 48.6k | result.append_children(children); |
2651 | // debug!("replace_nodes_tree\n{}\n====>>>>>\n", mml_to_string(result)); | |
2652 | 48.6k | return Ok( result ); |
2653 | 48.6k | } |
2654 | ||
2655 | 72.9k | fn replace_nodes_string(&'r mut self, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<String> { |
2656 | // debug!("replace_nodes: working on {} nodes", nodes.len()); | |
2657 | 72.9k | let mut result = String::with_capacity(3*nodes.len()); // guess (2 chars/node + space) |
2658 | 72.9k | let mut first_time = true; |
2659 | 85.7k | for node in |
2660 | 85.7k | if first_time { |
2661 | 72.9k | first_time = false; |
2662 | 72.9k | } else { |
2663 | 12.8k | result.push(' '); |
2664 | 12.8k | }; |
2665 | 85.7k | let matched = match node { |
2666 | 66.5k | Node::Element(n) => self.match_pattern::<String>(n) |
2667 | 19.2k | Node::Text(t) => self.replace_chars(t.text(), mathml) |
2668 | 14 | Node::Attribute(attr) => self.replace_chars(attr.value(), mathml) |
2669 | 0 | _ => bail!("replace_nodes: found unexpected node type!!!"), |
2670 | }; | |
2671 | 85.7k | result += &matched; |
2672 | } | |
2673 | 72.9k | return Ok( result ); |
2674 | 72.9k | } |
2675 | ||
2676 | /// Lookup unicode "pronunciation" of char. | |
2677 | /// Note: TTS is not supported here (not needed and a little less efficient) | |
2678 | 58.0k | pub fn replace_chars(&'r mut self, str: &str, mathml: Element<'c>) -> Result<String> { |
2679 | 58.0k | let chars = str.chars().collect::<Vec<char>>(); |
2680 | 58.0k | let rules = self.speech_rules; |
2681 | // handled in match_pattern -- temporarily leaving as comments in case something is missed and needed here | |
2682 | // if self.nav_node_offset > 0 && chars.len() > 1 { | |
2683 | // if self.nav_node_offset > chars.len() { | |
2684 | // debug!("replace_chars: nav_node_offset {} is larger than string length {}", self.nav_node_offset, chars.len()); | |
2685 | // self.nav_node_offset = chars.len(); | |
2686 | // } | |
2687 | // let ch = chars[self.nav_node_offset-1]; | |
2688 | // debug!("replace_chars: adjusted string to '{}' based on nav_node_offset {}", ch, self.nav_node_offset); | |
2689 | // if rules.translate_single_chars_only { | |
2690 | // return self.replace_single_char(ch, mathml); | |
2691 | // } else { | |
2692 | // return Ok( ch.to_string() ); | |
2693 | // } | |
2694 | // } | |
2695 | 58.0k | if is_quoted_string(str) { // quoted string -- already translated (set in get_braille_chars) |
2696 | 12.5k | return Ok(unquote_string(str).to_string()); |
2697 | 45.5k | } |
2698 | // in a string, avoid "a" -> "eigh", "." -> "point", etc | |
2699 | 45.5k | if rules.translate_single_chars_only { |
2700 | 30.0k | if chars.len() == 1 { |
2701 | 27.3k | return self.replace_single_char(chars[0], mathml) |
2702 | } else { | |
2703 | // more than one char -- fix up non-breaking space | |
2704 | 2.69k | return Ok(str.replace('\u{00A0}', " ").replace(['\u{2061}', '\u{2062}', '\u{2063}', '\u{2064}'], "")) |
2705 | } | |
2706 | 15.5k | }; |
2707 | ||
2708 | 15.5k | let result = chars.iter() |
2709 | 18.2k | . |
2710 | 15.5k | .collect::<Result<Vec<String>>>() |
2711 | 15.5k | .join(""); |
2712 | 15.5k | return Ok( result ); |
2713 | 58.0k | } |
2714 | ||
2715 | 45.6k | fn replace_single_char(&'r mut self, ch: char, mathml: Element<'c>) -> Result<String> { |
2716 | 45.6k | let ch_as_u32 = ch as u32; |
2717 | 45.6k | let rules = self.speech_rules; |
2718 | 45.6k | let mut unicode = rules.unicode_short.borrow(); |
2719 | 45.6k | let mut replacements = unicode.get( &ch_as_u32 ); |
2720 | // debug!("replace_single_char: looking for unicode {} for char '{}'/{:#06x}, found: {:?}", rules.name, ch, ch_as_u32, replacements); | |
2721 | 45.6k | if replacements.is_none() { |
2722 | // see if it in the full unicode table (if it isn't loaded already) | |
2723 | 1.64k | let pref_manager = rules.pref_manager.borrow(); |
2724 | 1.64k | let unicode_pref_files = if rules.name == RulesFor::Braille { |
2725 | 1.64k | let should_ignore_file_time = pref_manager.pref_to_string("CheckRuleFiles") == "All"; |
2726 | 1.64k | if rules.unicode_full.borrow().is_empty() || |
2727 | 468 | info!("*** Loading full unicode {} for char '{}'/{:#06x}", rules.name, ch, ch_as_u32); |
2728 | 468 | rules.unicode_full.borrow_mut().clear(); |
2729 | 468 | rules.unicode_full_files.borrow_mut().set_files_and_times(rules.read_unicode(None, false) |
2730 | 468 | info!("# Unicode defs = {}/{}", |
2731 | 1.18k | } |
2732 | 1.64k | unicode = rules.unicode_full.borrow(); |
2733 | 1.64k | replacements = unicode.get( &ch_as_u32 ); |
2734 | 1.64k | if replacements.is_none() { |
2735 | 269 | self.translate_count = 0; // not in loop |
2736 | // debug!("*** Did not find unicode {} for char '{}'/{:#06x}", rules.name, ch, ch_as_u32); | |
2737 | 269 | if rules.translate_single_chars_only || |
2738 | 269 | return Ok(String::from(ch)); // no replacement, so just return the char and hope for the best |
2739 | } else { // braille -- must turn into braille dots | |
2740 | // Emulate what NVDA does: generate (including single quotes) '\xhhhh' or '\yhhhhhh' | |
2741 | 0 | let ch_as_int = ch as u32; |
2742 | 0 | let prefix_indicator = if ch_as_int < 1<<16 {'x'} else {'y'}; |
2743 | 0 | return self.replace_chars( &format!("'\\{prefix_indicator}{:06x}'", ch_as_int), mathml); |
2744 | } | |
2745 | 1.37k | } |
2746 | 43.9k | }; |
2747 | ||
2748 | // map across all the parts of the replacement, collect them up into a Vec, and then concat them together | |
2749 | 45.3k | let result = replacements.unwrap() |
2750 | 45.3k | .iter() |
2751 | 45.3k | .map(|replacement| |
2752 | 49.1k | self.replace(replacement, mathml) |
2753 | 49.1k | .with_context(|| |
2754 | 45.3k | .collect::<Result<Vec<String>>>() |
2755 | 45.3k | .join(" "); |
2756 | 45.3k | self.translate_count = 0; // found a replacement, so not in a loop |
2757 | 45.3k | return Ok(result); |
2758 | 45.6k | } |
2759 | } | |
2760 | ||
2761 | /// Hack to allow replacement of `str` with braille chars. | |
2762 | 12.5k | pub fn braille_replace_chars(str: &str, mathml: Element) -> Result<String> { |
2763 | 12.5k | return BRAILLE_RULES.with(|rules| { |
2764 | 12.5k | let rules = rules.borrow(); |
2765 | 12.5k | let new_package = Package::new(); |
2766 | 12.5k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), "", 0); |
2767 | 12.5k | return rules_with_context.replace_chars(str, mathml); |
2768 | 12.5k | }) |
2769 | 12.5k | } |
2770 | ||
2771 | ||
2772 | ||
2773 | #[cfg(test)] | |
2774 | mod tests { | |
2775 | #[allow(unused_imports)] | |
2776 | use crate::init_logger; | |
2777 | ||
2778 | use super::*; | |
2779 | ||
2780 | #[test] | |
2781 | 1 | fn test_read_statement() { |
2782 | 1 | let str = r#"--- |
2783 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2784 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2785 | 1 | assert_eq!(doc.len(), 1); |
2786 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2787 | ||
2788 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2789 | 1 | assert_eq!(rules.rules["math"].len(), 1, "\nshould only be one rule"); |
2790 | ||
2791 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2792 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2793 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2794 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2795 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 1, "\nreplacement failure"); |
2796 | 1 | assert_eq!(speech_pattern.replacements.replacements[0].to_string(), r#""./*""#, "\nreplacement failure"); |
2797 | 1 | } |
2798 | ||
2799 | #[test] | |
2800 | 1 | fn test_read_statements_with_replace() { |
2801 | 1 | let str = r#"--- |
2802 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2803 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2804 | 1 | assert_eq!(doc.len(), 1); |
2805 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2806 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2807 | ||
2808 | 1 | let str = r#"--- |
2809 | 1 | {name: default, tag: math, match: ".", replace: [t: "test", x: "./*"] }"#; |
2810 | 1 | let doc2 = YamlLoader::load_from_str(str).unwrap(); |
2811 | 1 | assert_eq!(doc2.len(), 1); |
2812 | 1 | SpeechPattern::build(&doc2[0], Path::new("testing"), &mut rules).unwrap(); |
2813 | 1 | assert_eq!(rules.rules["math"].len(), 1, "\nfirst rule not replaced"); |
2814 | ||
2815 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2816 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2817 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2818 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2819 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 2, "\nreplacement failure"); |
2820 | 1 | } |
2821 | ||
2822 | #[test] | |
2823 | 1 | fn test_read_statements_with_add() { |
2824 | 1 | let str = r#"--- |
2825 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2826 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2827 | 1 | assert_eq!(doc.len(), 1); |
2828 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2829 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2830 | ||
2831 | 1 | let str = r#"--- |
2832 | 1 | {name: another-rule, tag: math, match: ".", replace: [t: "test", x: "./*"] }"#; |
2833 | 1 | let doc2 = YamlLoader::load_from_str(str).unwrap(); |
2834 | 1 | assert_eq!(doc2.len(), 1); |
2835 | 1 | SpeechPattern::build(&doc2[0], Path::new("testing"), &mut rules).unwrap(); |
2836 | 1 | assert_eq!(rules.rules["math"].len(), 2, "\nsecond rule not added"); |
2837 | ||
2838 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2839 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2840 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2841 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2842 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 1, "\nreplacement failure"); |
2843 | 1 | } |
2844 | ||
2845 | #[test] | |
2846 | 1 | fn test_debug_no_debug() { |
2847 | 1 | let str = r#"*[2]/*[3][text()='3']"#; |
2848 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2849 | 1 | assert!(result.is_ok()); |
2850 | 1 | assert_eq!(result.unwrap(), str); |
2851 | 1 | } |
2852 | ||
2853 | #[test] | |
2854 | 1 | fn test_debug_no_debug_with_quote() { |
2855 | 1 | let str = r#"*[2]/*[3][text()='(']"#; |
2856 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2857 | 1 | assert!(result.is_ok()); |
2858 | 1 | assert_eq!(result.unwrap(), str); |
2859 | 1 | } |
2860 | ||
2861 | #[test] | |
2862 | 1 | fn test_debug_no_quoted_paren() { |
2863 | 1 | let str = r#"DEBUG(*[2]/*[3][text()='3'])"#; |
2864 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2865 | 1 | assert!(result.is_ok()); |
2866 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][text()='3'], "*[2]/*[3][text()='3']")"#); |
2867 | 1 | } |
2868 | ||
2869 | #[test] | |
2870 | 1 | fn test_debug_quoted_paren() { |
2871 | 1 | let str = r#"DEBUG(*[2]/*[3][text()='('])"#; |
2872 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2873 | 1 | assert!(result.is_ok()); |
2874 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][text()='('], "*[2]/*[3][text()='(']")"#); |
2875 | 1 | } |
2876 | ||
2877 | #[test] | |
2878 | 1 | fn test_debug_quoted_paren_before_paren() { |
2879 | 1 | let str = r#"DEBUG(ClearSpeak_Matrix = 'Combinatorics') and IsBracketed(., '(', ')')"#; |
2880 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2881 | 1 | assert!(result.is_ok()); |
2882 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(ClearSpeak_Matrix = 'Combinatorics', "ClearSpeak_Matrix = 'Combinatorics'") and IsBracketed(., '(', ')')"#); |
2883 | 1 | } |
2884 | ||
2885 | ||
2886 | // zipped files do NOT include "zz", hence we need to exclude this test | |
2887 | cfg_if::cfg_if! {if #[cfg(not(feature = "include-zip"))] { | |
2888 | #[test] | |
2889 | 1 | fn test_up_to_date() { |
2890 | use crate::interface::*; | |
2891 | // initialize and move to a directory where making a time change doesn't really matter | |
2892 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
2893 | 1 | set_preference("Language", "zz-aa").unwrap(); |
2894 | // not much is support in zz | |
2895 | 1 | if let Err( |
2896 | 0 | error!("{}", crate::errors_to_string(&e)); |
2897 | 0 | panic!("Should not be an error in setting MathML") |
2898 | 1 | } |
2899 | ||
2900 | 1 | set_preference("CheckRuleFiles", "All").unwrap(); |
2901 | 1 | assert!(!is_file_time_same(), "file's time did not get updated"); |
2902 | 1 | set_preference("CheckRuleFiles", "None").unwrap(); |
2903 | 1 | assert!(is_file_time_same(), "file's time was wrongly updated (preference 'CheckRuleFiles' should have prevented updating)"); |
2904 | ||
2905 | // change a file, cause read_files to be called, and return if MathCAT noticed the change and updated its time | |
2906 | 2 | fn is_file_time_same() -> bool { |
2907 | // read and write a unicode file in a test dir | |
2908 | // files are read in due to setting the MathML | |
2909 | ||
2910 | use std::time::Duration; | |
2911 | 2 | return SPEECH_RULES.with(|rules| { |
2912 | 2 | let start_main_file = rules.borrow().unicode_short_files.borrow().ft[0].clone(); |
2913 | ||
2914 | // open the file, read all the contents, then write them back so the time changes | |
2915 | 2 | let contents = std::fs::read(&start_main_file.file).expect(&format!("Failed to read file {} during test", &start_main_file.file.to_string_lossy())); |
2916 | 2 | std::fs::write(start_main_file.file, contents).unwrap(); |
2917 | 2 | std::thread::sleep(Duration::from_millis(5)); // pause a little to make sure the time changes |
2918 | ||
2919 | // speak should cause the file stored to have a new time | |
2920 | 2 | if let Err( |
2921 | 0 | error!("{}", crate::errors_to_string(&e)); |
2922 | 0 | panic!("Should not be an error in speech") |
2923 | 2 | } |
2924 | 2 | return rules.borrow().unicode_short_files.borrow().ft[0].time == start_main_file.time; |
2925 | 2 | }); |
2926 | 2 | } |
2927 | 1 | } |
2928 | }} | |
2929 | ||
2930 | // #[test] | |
2931 | // fn test_nested_debug_quoted_paren() { | |
2932 | // let str = r#"DEBUG(*[2]/*[3][DEBUG(text()='(')])"#; | |
2933 | // let result = MyXPath::add_debug_string_arg(str); | |
2934 | // assert!(result.is_ok()); | |
2935 | // assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][DEBUG(text()='(')], "DEBUG(*[2]/*[3][DEBUG(text()='(')], \"text()='(')]\")"#); | |
2936 | // } | |
2937 | ||
2938 | } |
Line | Count | Source |
1 | //! The speech module is where the speech rules are read in and speech generated. | |
2 | //! | |
3 | //! The speech rules call out to the preferences and tts modules and the dividing line is not always clean. | |
4 | //! A number of useful utility functions used by other modules are defined here. | |
5 | #![allow(clippy::needless_return)] | |
6 | use std::path::PathBuf; | |
7 | use std::collections::HashMap; | |
8 | use std::cell::{RefCell, RefMut}; | |
9 | use std::sync::LazyLock; | |
10 | use sxd_document::dom::{ChildOfElement, Document, Element}; | |
11 | use sxd_document::{Package, QName}; | |
12 | use sxd_xpath::context::Evaluation; | |
13 | use sxd_xpath::{Factory, Value, XPath}; | |
14 | use sxd_xpath::nodeset::Node; | |
15 | use std::fmt; | |
16 | use std::time::SystemTime; | |
17 | use crate::definitions::read_definitions_file; | |
18 | use crate::errors::*; | |
19 | use crate::prefs::*; | |
20 | use crate::xpath_functions::is_leaf; | |
21 | use yaml_rust::{YamlLoader, Yaml, yaml::Hash}; | |
22 | use crate::tts::*; | |
23 | use crate::infer_intent::*; | |
24 | use crate::pretty_print::{mml_to_string, yaml_to_string}; | |
25 | use std::path::Path; | |
26 | use std::rc::Rc; | |
27 | use crate::shim_filesystem::{read_to_string_shim, canonicalize_shim}; | |
28 | use crate::canonicalize::{as_element, create_mathml_element, set_mathml_name, name, MATHML_FROM_NAME_ATTR}; | |
29 | use regex::Regex; | |
30 | use log::{debug, error, info}; | |
31 | ||
32 | ||
33 | pub const NAV_NODE_SPEECH_NOT_FOUND: &str = "NAV_NODE_NOT_FOUND"; | |
34 | ||
35 | /// Like lisp's ' (quote foo), this is used to block "replace_chars" being called. | |
36 | /// Unlike lisp, this appended to the end of a string (more efficient) | |
37 | /// At the moment, the only use is BrailleChars(...) -- internally, it calls replace_chars and we don't want it called again. | |
38 | /// Note: an alternative to this hack is to add "xq" (execute but don't eval the result), but that's heavy-handed for the current need | |
39 | const NO_EVAL_QUOTE_CHAR: char = '\u{efff}'; // a private space char | |
40 | const NO_EVAL_QUOTE_CHAR_AS_BYTES: [u8;3] = [0xee,0xbf,0xbf]; | |
41 | const N_BYTES_NO_EVAL_QUOTE_CHAR: usize = NO_EVAL_QUOTE_CHAR.len_utf8(); | |
42 | ||
43 | /// Converts 'string' into a "quoted" string -- use is_quoted_string and unquote_string | |
44 | 12.5k | pub fn make_quoted_string(mut string: String) -> String { |
45 | 12.5k | string.push(NO_EVAL_QUOTE_CHAR); |
46 | 12.5k | return string; |
47 | 12.5k | } |
48 | ||
49 | /// Checks the string to see if it is "quoted" | |
50 | 58.0k | pub fn is_quoted_string(str: &str) -> bool { |
51 | 58.0k | if str.len() < N_BYTES_NO_EVAL_QUOTE_CHAR { |
52 | 34.1k | return false; |
53 | 23.9k | } |
54 | 23.9k | let bytes = str.as_bytes(); |
55 | 23.9k | return bytes[bytes.len()-N_BYTES_NO_EVAL_QUOTE_CHAR..] == NO_EVAL_QUOTE_CHAR_AS_BYTES; |
56 | 58.0k | } |
57 | ||
58 | /// Converts 'string' into a "quoted" string -- use is_quoted_string and unquote_string | |
59 | /// IMPORTANT: this assumes the string is quoted -- no check is made | |
60 | 12.5k | pub fn unquote_string(str: &str) -> &str { |
61 | 12.5k | return &str[..str.len()-N_BYTES_NO_EVAL_QUOTE_CHAR]; |
62 | 12.5k | } |
63 | ||
64 | ||
65 | /// The main external call, `intent_from_mathml` returns a string for the speech associated with the `mathml`. | |
66 | /// It matches against the rules that are computed by user prefs such as "Language" and "SpeechStyle". | |
67 | /// | |
68 | /// The speech rules assume `mathml` has been "cleaned" via the canonicalization step. | |
69 | /// | |
70 | /// If the preferences change (and hence the speech rules to use change), or if the rule file changes, | |
71 | /// `intent_from_mathml` will detect that and (re)load the proper rules. | |
72 | /// | |
73 | /// A string is returned in call cases. | |
74 | /// If there is an error, the speech string will indicate an error. | |
75 | 3.88k | pub fn intent_from_mathml<'m>(mathml: Element, doc: Document<'m>) -> Result<Element<'m>> { |
76 | 3.88k | let |
77 | 3.87k | doc.root().append_child(intent_tree); |
78 | 3.87k | return Ok(intent_tree); |
79 | 3.88k | } |
80 | ||
81 | 3.96k | pub fn speak_mathml(mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
82 | 3.96k | return speak_rules(&SPEECH_RULES, mathml, nav_node_id, nav_node_offset); |
83 | 3.96k | } |
84 | ||
85 | 14 | pub fn overview_mathml(mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
86 | 14 | return speak_rules(&OVERVIEW_RULES, mathml, nav_node_id, nav_node_offset); |
87 | 14 | } |
88 | ||
89 | ||
90 | 3.88k | fn intent_rules<'m>(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, doc: Document<'m>, mathml: Element, nav_node_id: &'m str) -> Result<Element<'m>> { |
91 | 3.88k | rules.with(|rules| { |
92 | 3.88k | rules.borrow_mut().read_files() |
93 | 3.88k | let rules = rules.borrow(); |
94 | // debug!("intent_rules:\n{}", mml_to_string(mathml)); | |
95 | 3.88k | let should_set_literal_intent = rules.pref_manager.borrow().pref_to_string("SpeechStyle").as_str() == "LiteralSpeak"; |
96 | 3.88k | let original_intent = mathml.attribute_value("intent"); |
97 | 3.88k | if should_set_literal_intent { |
98 | 10 | if let Some( |
99 | 4 | let intent = if intent.contains('(') { |
100 | 4 | mathml.set_attribute_value("intent", &intent); |
101 | 6 | } else { |
102 | 6 | mathml.set_attribute_value("intent", ":literal"); |
103 | 6 | }; |
104 | 3.87k | } |
105 | 3.88k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, doc, nav_node_id, 0); |
106 | 3.88k | let |
107 | 3.88k | .context("Pattern match/replacement failure!") |
108 | 3.87k | let answer = if name(intent) == "TEMP_NAME" { // unneeded extra layer |
109 | 0 | assert_eq!(intent.children().len(), 1); |
110 | 0 | as_element(intent.children()[0]) |
111 | } else { | |
112 | 3.87k | intent |
113 | }; | |
114 | 3.87k | if should_set_literal_intent { |
115 | 10 | if let Some( |
116 | 4 | mathml.set_attribute_value("intent", original_intent); |
117 | 6 | } else { |
118 | 6 | mathml.remove_attribute("intent"); |
119 | 6 | } |
120 | 3.86k | } |
121 | 3.87k | return Ok(answer); |
122 | 3.88k | }) |
123 | 3.88k | } |
124 | ||
125 | /// Speak the MathML | |
126 | /// If 'nav_node_id' is not an empty string, then the element with that id will have [[...]] around it | |
127 | 3.98k | fn speak_rules(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, mathml: Element, nav_node_id: &str, nav_node_offset: usize) -> Result<String> { |
128 | 3.98k | return rules.with(|rules| { |
129 | 3.98k | rules.borrow_mut().read_files() |
130 | 3.98k | let rules = rules.borrow(); |
131 | // debug!("speak_rules:\n{}", mml_to_string(mathml)); | |
132 | 3.98k | let new_package = Package::new(); |
133 | 3.98k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id, nav_node_offset); |
134 | 3.98k | let |
135 | 3.98k | return Ok( rules.pref_manager.borrow().get_tts() |
136 | 3.98k | .merge_pauses(remove_optional_indicators( |
137 | 3.98k | &speech_string.replace(CONCAT_STRING, "") |
138 | 3.98k | .replace(CONCAT_INDICATOR, "") |
139 | 3.98k | ) |
140 | 3.98k | .trim_start().trim_end_matches([' ', ',', ';'])) ); |
141 | 3.98k | }); |
142 | ||
143 | 3.99k | fn nestable_speak_rules<'c, 's:'c, 'm:'c>(rules_with_context: &mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
144 | 3.99k | let mut speech_string = rules_with_context.match_pattern::<String>(mathml) |
145 | 3.99k | .context("Pattern match/replacement failure!") |
146 | // Note: [[...]] is added around a matching child, but if the "id" is on 'mathml', the whole string is used | |
147 | 3.99k | if !rules_with_context.nav_node_id.is_empty() { |
148 | // See https://github.com/NSoiffer/MathCAT/issues/174 for why we can just start the speech at the nav node | |
149 | 536 | let intent_attr = mathml.attribute_value("data-intent-property").unwrap_or_default(); |
150 | 536 | if let Some( |
151 | 521 | match speech_string[start+2..].find("]]") { |
152 | 0 | None => bail!("Internal error: looking for '[[...]]' during navigation -- only found '[[' in '{}'", speech_string), |
153 | 521 | Some(end) => speech_string = speech_string[start+2..start+2+end].to_string(), |
154 | } | |
155 | 15 | } else if !intent_attr.contains(":literal:") { |
156 | // try again with LiteralSpeak -- some parts might have been elided in other SpeechStyles | |
157 | 14 | mathml.set_attribute_value("data-intent-property", (":literal:".to_string() + intent_attr).as_str()); |
158 | 14 | let speech = nestable_speak_rules(rules_with_context, mathml); |
159 | 14 | mathml.set_attribute_value("data-intent-property", intent_attr); |
160 | 14 | return speech; |
161 | } else { | |
162 | 1 | bail!(NAV_NODE_SPEECH_NOT_FOUND); // NAV_NODE_SPEECH_NOT_FOUND is tested for later |
163 | } | |
164 | 3.46k | } |
165 | 3.98k | return Ok(speech_string); |
166 | 3.99k | } |
167 | 3.98k | } |
168 | ||
169 | /// Converts its argument to a string that can be used in a debugging message. | |
170 | 0 | pub fn yaml_to_type(yaml: &Yaml) -> String { |
171 | 0 | return match yaml { |
172 | 0 | Yaml::Real(v)=> format!("real='{v:#}'"), |
173 | 0 | Yaml::Integer(v)=> format!("integer='{v:#}'"), |
174 | 0 | Yaml::String(v)=> format!("string='{v:#}'"), |
175 | 0 | Yaml::Boolean(v)=> format!("boolean='{v:#}'"), |
176 | 0 | Yaml::Array(v)=> match v.len() { |
177 | 0 | 0 => "array with no entries".to_string(), |
178 | 0 | 1 => format!("array with the entry: {}", yaml_to_type(&v[0])), |
179 | 0 | _ => format!("array with {} entries. First entry: {}", v.len(), yaml_to_type(&v[0])), |
180 | } | |
181 | 0 | Yaml::Hash(h)=> { |
182 | 0 | let first_pair = |
183 | 0 | if h.is_empty() { |
184 | 0 | "no pairs".to_string() |
185 | } else { | |
186 | 0 | let (key, val) = h.iter().next().unwrap(); |
187 | 0 | format!("({}, {})", yaml_to_type(key), yaml_to_type(val)) |
188 | }; | |
189 | 0 | format!("dictionary with {} pair{}. A pair: {}", h.len(), if h.len()==1 {""} else {"s"}, first_pair) |
190 | } | |
191 | 0 | Yaml::Alias(_)=> "Alias".to_string(), |
192 | 0 | Yaml::Null=> "Null".to_string(), |
193 | 0 | Yaml::BadValue=> "BadValue".to_string(), |
194 | } | |
195 | 0 | } |
196 | ||
197 | 0 | fn yaml_type_err(yaml: &Yaml, str: &str) -> Error { |
198 | 0 | anyhow!("Expected {}, found {}", str, yaml_to_type(yaml)) |
199 | 0 | } |
200 | ||
201 | // fn yaml_key_err(dict: &Yaml, key: &str, yaml_type: &str) -> String { | |
202 | // if dict.as_hash().is_none() { | |
203 | // return format!("Expected dictionary with key '{}', found\n{}", key, yaml_to_string(dict, 1)); | |
204 | // } | |
205 | // let str = &dict[key]; | |
206 | // if str.is_badvalue() { | |
207 | // return format!("Did not find '{}' in\n{}", key, yaml_to_string(dict, 1)); | |
208 | // } | |
209 | // return format!("Type of '{}' is not a {}.\nIt is a {}. YAML value is\n{}", | |
210 | // key, yaml_type, yaml_to_type(str), yaml_to_string(dict, 0)); | |
211 | // } | |
212 | ||
213 | 4.86M | fn find_str<'a>(dict: &'a Yaml, key: &'a str) -> Option<&'a str> { |
214 | 4.86M | return dict[key].as_str(); |
215 | 4.86M | } |
216 | ||
217 | /// Returns the Yaml as a `Hash` or an error if it isn't. | |
218 | 175k | pub fn as_hash_checked(value: &Yaml) -> Result<&Hash> { |
219 | 175k | let result = value.as_hash(); |
220 | 175k | let result = result.ok_or_else(|| |
221 | 175k | return Ok( result ); |
222 | 175k | } |
223 | ||
224 | /// Returns the Yaml as a `Vec` or an error if it isn't. | |
225 | 11.7k | pub fn as_vec_checked(value: &Yaml) -> Result<&Vec<Yaml>> { |
226 | 11.7k | let result = value.as_vec(); |
227 | 11.7k | let result = result.ok_or_else(|| |
228 | 11.7k | return Ok( result ); |
229 | 11.7k | } |
230 | ||
231 | /// Returns the Yaml as a `&str` or an error if it isn't. | |
232 | 8.08M | pub fn as_str_checked(yaml: &Yaml) -> Result<&str> { |
233 | 8.08M | return yaml.as_str().ok_or_else(|| |
234 | 8.08M | } |
235 | ||
236 | ||
237 | /// A bit of a hack to concatenate replacements (without a ' '). | |
238 | /// The CONCAT_INDICATOR is added by a "ct:" (instead of 't:') in the speech rules | |
239 | /// and checked for by the tts code. | |
240 | pub const CONCAT_INDICATOR: &str = "\u{F8FE}"; | |
241 | ||
242 | // This is the pattern that needs to be matched (and deleted) | |
243 | pub const CONCAT_STRING: &str = " \u{F8FE}"; | |
244 | ||
245 | // a similar hack to potentially delete (repetitive) optional replacements | |
246 | // the OPTIONAL_INDICATOR is added by "ot:" before and after the optional string | |
247 | const OPTIONAL_INDICATOR: &str = "\u{F8FD}"; | |
248 | const OPTIONAL_INDICATOR_LEN: usize = OPTIONAL_INDICATOR.len(); | |
249 | ||
250 | 5.10k | pub fn remove_optional_indicators(str: &str) -> String { |
251 | 5.10k | return str.replace(OPTIONAL_INDICATOR, ""); |
252 | 5.10k | } |
253 | ||
254 | /// Given a string that should be Yaml, it calls `build_fn` with that string. | |
255 | /// The build function/closure should process the Yaml as appropriate and capture any errors and write them to `std_err`. | |
256 | /// The returned value should be a Vector containing the paths of all the files that were included. | |
257 | 56.3k | pub fn compile_rule<F>(str: &str, mut build_fn: F) -> Result<Vec<PathBuf>> where |
258 | 56.3k | F: FnMut(&Yaml) -> Result<Vec<PathBuf>> { |
259 | 56.3k | let docs = YamlLoader::load_from_str(str); |
260 | 56.3k | match docs { |
261 | 0 | Err(e) => { |
262 | 0 | bail!("Parse error!!: {}", e); |
263 | }, | |
264 | 56.3k | Ok(docs) => { |
265 | 56.3k | if docs.len() != 1 { |
266 | 0 | bail!("Didn't find rules!"); |
267 | 56.3k | } |
268 | 56.3k | return build_fn(&docs[0]); |
269 | } | |
270 | } | |
271 | 56.3k | } |
272 | ||
273 | 36.5k | pub fn process_include<F>(current_file: &Path, new_file_name: &str, mut read_new_file: F) -> Result<Vec<PathBuf>> |
274 | 36.5k | where F: FnMut(&Path) -> Result<Vec<PathBuf>> { |
275 | 36.5k | let parent_path = current_file.parent(); |
276 | 36.5k | if parent_path.is_none() { |
277 | 0 | bail!("Internal error: {:?} is not a valid file name", current_file); |
278 | 36.5k | } |
279 | 36.5k | let mut new_file = match canonicalize_shim(parent_path.unwrap()) { |
280 | 36.5k | Ok(path) => path, |
281 | 0 | Err(e) => bail!("process_include: canonicalize failed for {} with message {}", parent_path.unwrap().display(), e), |
282 | }; | |
283 | ||
284 | // the referenced file might be in a directory that hasn't been zipped up -- find the dir and call the unzip function | |
285 | 89.0k | for unzip_dir in |
286 | 89.0k | if unzip_dir.ends_with("Rules") { |
287 | 36.5k | break; // nothing to unzip |
288 | 52.4k | } |
289 | 52.4k | if unzip_dir.ends_with("Languages") || |
290 | // get the subdir ...Rules/Braille/en/... | |
291 | // could have ...Rules/Braille/definitions.yaml, so 'next()' doesn't exist in this case, but the file wasn't zipped up | |
292 | 26.0k | if let Some( |
293 | 24.9k | let default_lang = if unzip_dir.ends_with("Languages") { |
294 | 24.9k | PreferenceManager::unzip_files(unzip_dir, subdir.to_str().unwrap(), Some(default_lang)).unwrap_or_default(); |
295 | 1.06k | } |
296 | 26.4k | } |
297 | } | |
298 | 36.5k | new_file.push(new_file_name); |
299 | 36.5k | info!("...processing include: {new_file_name}..."); |
300 | 36.5k | let new_file = match crate::shim_filesystem::canonicalize_shim(new_file.as_path()) { |
301 | 36.5k | Ok(buf) => buf, |
302 | 0 | Err(msg) => bail!("-include: constructed file name '{}' causes error '{}'", |
303 | 0 | new_file.to_str().unwrap(), msg), |
304 | }; | |
305 | ||
306 | 36.5k | let mut included_files = read_new_file(new_file.as_path()) |
307 | 36.5k | let mut files_read = vec![new_file]; |
308 | 36.5k | files_read.append(&mut included_files); |
309 | 36.5k | return Ok(files_read); |
310 | 36.5k | } |
311 | ||
312 | /// As the name says, TreeOrString is either a Tree (Element) or a String | |
313 | /// It is used to share code during pattern matching | |
314 | pub trait TreeOrString<'c, 'm:'c, T> { | |
315 | fn from_element(e: Element<'m>) -> Result<T>; | |
316 | fn from_string(s: String, doc: Document<'m>) -> Result<T>; | |
317 | fn replace_tts<'s:'c, 'r>(tts: &TTS, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T>; | |
318 | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T>; | |
319 | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<T>; | |
320 | fn highlight_braille(braille: T, highlight_style: String) -> T; | |
321 | fn mark_nav_speech(speech: T) -> T; | |
322 | } | |
323 | ||
324 | impl<'c, 'm:'c> TreeOrString<'c, 'm, String> for String { | |
325 | 0 | fn from_element(_e: Element<'m>) -> Result<String> { |
326 | 0 | bail!("from_element not allowed for strings"); |
327 | 0 | } |
328 | ||
329 | 180k | fn from_string(s: String, _doc: Document<'m>) -> Result<String> { |
330 | 180k | return Ok(s); |
331 | 180k | } |
332 | ||
333 | 60.7k | fn replace_tts<'s:'c, 'r>(tts: &TTS, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
334 | 60.7k | return tts.replace_string(command, prefs, rules_with_context, mathml); |
335 | 60.7k | } |
336 | ||
337 | 142k | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
338 | 142k | return ra.replace_array_string(rules_with_context, mathml); |
339 | 142k | } |
340 | ||
341 | 72.9k | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<String> { |
342 | 72.9k | return rules.replace_nodes_string(nodes, mathml); |
343 | 72.9k | } |
344 | ||
345 | 469 | fn highlight_braille(braille: String, highlight_style: String) -> String { |
346 | 469 | return SpeechRulesWithContext::highlight_braille_string(braille, highlight_style); |
347 | 469 | } |
348 | ||
349 | 521 | fn mark_nav_speech(speech: String) -> String { |
350 | 521 | return SpeechRulesWithContext::mark_nav_speech(speech); |
351 | 521 | } |
352 | } | |
353 | ||
354 | impl<'c, 'm:'c> TreeOrString<'c, 'm, Element<'m>> for Element<'m> { | |
355 | 48.0k | fn from_element(e: Element<'m>) -> Result<Element<'m>> { |
356 | 48.0k | return Ok(e); |
357 | 48.0k | } |
358 | ||
359 | 213 | fn from_string(s: String, doc: Document<'m>) -> Result<Element<'m>> { |
360 | // FIX: is 'mi' really ok? Don't want to use TEMP_NAME because this name needs to move to the outside world | |
361 | 213 | let leaf = create_mathml_element(&doc, "mi"); |
362 | 213 | leaf.set_text(&s); |
363 | 213 | return Ok(leaf); |
364 | 213 | } |
365 | ||
366 | 0 | fn replace_tts<'s:'c, 'r>(_tts: &TTS, _command: &TTSCommandRule, _prefs: &PreferenceManager, _rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, _mathml: Element<'c>) -> Result<Element<'m>> { |
367 | 0 | bail!("Internal error: applying a TTS rule to a tree"); |
368 | 0 | } |
369 | ||
370 | 132k | fn replace<'s:'c, 'r>(ra: &ReplacementArray, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
371 | 132k | return ra.replace_array_tree(rules_with_context, mathml); |
372 | 132k | } |
373 | ||
374 | 48.6k | fn replace_nodes<'s:'c, 'r>(rules: &'r mut SpeechRulesWithContext<'c, 's,'m>, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<Element<'m>> { |
375 | 48.6k | return rules.replace_nodes_tree(nodes, mathml); |
376 | 48.6k | } |
377 | ||
378 | 0 | fn highlight_braille(_braille: Element<'c>, _highlight_style: String) -> Element<'m> { |
379 | 0 | panic!("Internal error: highlight_braille called on a tree"); |
380 | } | |
381 | ||
382 | 0 | fn mark_nav_speech(_speech: Element<'c>) -> Element<'m> { |
383 | 0 | panic!("Internal error: mark_nav_speech called on a tree"); |
384 | } | |
385 | } | |
386 | ||
387 | /// 'Replacement' is an enum that contains all the potential replacement types/structs | |
388 | /// Hence there are fields 'Test' ("test:"), 'Text" ("t:"), "XPath", etc | |
389 | #[derive(Debug, Clone)] | |
390 | #[allow(clippy::upper_case_acronyms)] | |
391 | enum Replacement { | |
392 | // Note: all of these are pointer types | |
393 | Text(String), | |
394 | XPath(MyXPath), | |
395 | Intent(Box<Intent>), | |
396 | Test(Box<TestArray>), | |
397 | TTS(Box<TTSCommandRule>), | |
398 | With(Box<With>), | |
399 | SetVariables(Box<SetVariables>), | |
400 | Insert(Box<InsertChildren>), | |
401 | Translate(TranslateExpression), | |
402 | } | |
403 | ||
404 | impl fmt::Display for Replacement { | |
405 | 10 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
406 | 10 | return write!(f, "{}", |
407 | 10 | match self { |
408 | 0 | Replacement::Test(c) => c.to_string(), |
409 | 0 | Replacement::Text(t) => format!("t: \"{t}\""), |
410 | 10 | Replacement::XPath(x) => x.to_string(), |
411 | 0 | Replacement::Intent(i) => i.to_string(), |
412 | 0 | Replacement::TTS(t) => t.to_string(), |
413 | 0 | Replacement::With(w) => w.to_string(), |
414 | 0 | Replacement::SetVariables(v) => v.to_string(), |
415 | 0 | Replacement::Insert(ic) => ic.to_string(), |
416 | 0 | Replacement::Translate(x) => x.to_string(), |
417 | } | |
418 | ); | |
419 | 10 | } |
420 | } | |
421 | ||
422 | impl Replacement { | |
423 | 13.5M | fn build(replacement: &Yaml) -> Result<Replacement> { |
424 | // Replacement -- single key/value (see below for allowed values) | |
425 | 13.5M | let dictionary = replacement.as_hash(); |
426 | 13.5M | if dictionary.is_none() { |
427 | 0 | bail!(" expected a key/value pair. Found {}.", yaml_to_string(replacement, 0)); |
428 | 13.5M | }; |
429 | 13.5M | let dictionary = dictionary.unwrap(); |
430 | 13.5M | if dictionary.is_empty() { |
431 | 0 | bail!("No key/value pairs found for key 'replace'.\n\ |
432 | Suggestion: are the following lines indented properly?"); | |
433 | 13.5M | } |
434 | 13.5M | if dictionary.len() > 1 { |
435 | 0 | bail!("Should only be one key/value pair for the replacement.\n \ |
436 | Suggestion: are the following lines indented properly?\n \ | |
437 | 0 | The key/value pairs found are\n{}", yaml_to_string(replacement, 2)); |
438 | 13.5M | } |
439 | ||
440 | // get the single value | |
441 | 13.5M | let (key, value) = dictionary.iter().next().unwrap(); |
442 | 13.5M | let key = key.as_str().ok_or_else(|| |
443 | 13.5M | match key { |
444 | 13.5M | "t" | |
445 | 5.76M | return Ok( Replacement::Text( as_str_checked(value) |
446 | }, | |
447 | 7.75M | "ct" | |
448 | 24.7k | return Ok( Replacement::Text( CONCAT_INDICATOR.to_string() + as_str_checked(value) |
449 | }, | |
450 | 7.72M | "ot" | |
451 | 36.2k | return Ok( Replacement::Text( OPTIONAL_INDICATOR.to_string() + as_str_checked(value) |
452 | }, | |
453 | 7.69M | "x" => { |
454 | 2.27M | return Ok( Replacement::XPath( MyXPath::build(value) |
455 | 2.27M | .context("while trying to evaluate value of 'x:'") |
456 | }, | |
457 | 5.41M | "pause" | |
458 | 2.41M | return Ok( Replacement::TTS( TTS::build(&key.to_ascii_lowercase(), value) |
459 | }, | |
460 | 2.99M | "intent" => { |
461 | 283k | return Ok( Replacement::Intent( Intent::build(value) |
462 | }, | |
463 | 2.71M | "test" => { |
464 | 2.58M | return Ok( Replacement::Test( Box::new( TestArray::build(value) |
465 | }, | |
466 | 128k | "with" => { |
467 | 77.5k | return Ok( Replacement::With( With::build(value) |
468 | }, | |
469 | 51.3k | "set_variables" => { |
470 | 30.3k | return Ok( Replacement::SetVariables( SetVariables::build(value) |
471 | }, | |
472 | 21.0k | "insert" => { |
473 | 20.9k | return Ok( Replacement::Insert( InsertChildren::build(value) |
474 | }, | |
475 | 102 | "translate" => { |
476 | 102 | return Ok( Replacement::Translate( TranslateExpression::build(value) |
477 | 102 | .context("while trying to evaluate value of 'speak:'") |
478 | }, | |
479 | _ => { | |
480 | 0 | bail!("Unknown 'replace' command ({}) with value: {}", key, yaml_to_string(value, 0)); |
481 | } | |
482 | } | |
483 | 13.5M | } |
484 | } | |
485 | ||
486 | // structure used when "insert:" is encountered in a rule | |
487 | // the 'replacements' are inserted between each node in the 'xpath' | |
488 | #[derive(Debug, Clone)] | |
489 | struct InsertChildren { | |
490 | xpath: MyXPath, // the replacement nodes | |
491 | replacements: ReplacementArray, // what is inserted between each node | |
492 | } | |
493 | ||
494 | impl fmt::Display for InsertChildren { | |
495 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
496 | 0 | return write!(f, "InsertChildren:\n nodes {}\n replacements {}", self.xpath, &self.replacements); |
497 | 0 | } |
498 | } | |
499 | ||
500 | impl InsertChildren { | |
501 | 20.9k | fn build(insert: &Yaml) -> Result<Box<InsertChildren>> { |
502 | // 'insert:' -- 'nodes': xxx 'replace': xxx | |
503 | 20.9k | if insert.as_hash().is_none() { |
504 | 0 | bail!("") |
505 | 20.9k | } |
506 | 20.9k | let nodes = &insert["nodes"]; |
507 | 20.9k | if nodes.is_badvalue() { |
508 | 0 | bail!("Missing 'nodes' as part of 'insert'.\n \ |
509 | Suggestion: add 'nodes:' or if present, indent so it is contained in 'insert'"); | |
510 | 20.9k | } |
511 | 20.9k | let nodes = as_str_checked(nodes) |
512 | 20.9k | let replace = &insert["replace"]; |
513 | 20.9k | if replace.is_badvalue() { |
514 | 0 | bail!("Missing 'replace' as part of 'insert'.\n \ |
515 | Suggestion: add 'replace:' or if present, indent so it is contained in 'insert'"); | |
516 | 20.9k | } |
517 | 20.9k | return Ok( Box::new( InsertChildren { |
518 | 20.9k | xpath: MyXPath::new(nodes.to_string()) |
519 | 20.9k | replacements: ReplacementArray::build(replace).context("'replace:'") |
520 | } ) ); | |
521 | 20.9k | } |
522 | ||
523 | // It would be most efficient to do an xpath eval, get the nodes (type: NodeSet) and then intersperse the node_replace() | |
524 | // calls with replacements for the ReplacementArray parts. But that causes problems with the "pause: auto" calculation because | |
525 | // the replacements are segmented (can't look to neighbors for the calculation there) | |
526 | // An alternative is to introduce another Replacement enum value, but that's a lot of complication for not that much | |
527 | // gain (and Node's have contagious lifetimes) | |
528 | // The solution adopted is to find out the number of nodes and build up MyXPaths with each node selected (e.g, "*" => "*[3]") | |
529 | // and put those nodes into a flat ReplacementArray and then do a standard replace on that. | |
530 | // This is slower than the alternatives, but reuses a bunch of code and hence is less complicated. | |
531 | 7.45k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
532 | 7.45k | let result = self.xpath.evaluate(&rules_with_context.context_stack.base, mathml) |
533 | 7.45k | .with_context(|| |
534 | 7.45k | match result { |
535 | 7.45k | Value::Nodeset(nodes) => { |
536 | 7.45k | if nodes.size() == 0 { |
537 | 0 | bail!("During replacement, no matching element found"); |
538 | 7.45k | }; |
539 | 7.45k | let nodes = nodes.document_order(); |
540 | 7.45k | let n_nodes = nodes.len(); |
541 | 7.45k | let mut expanded_result = Vec::with_capacity(n_nodes + (n_nodes+1)*self.replacements.replacements.len()); |
542 | 7.45k | expanded_result.push( |
543 | Replacement::XPath( | |
544 | 7.45k | MyXPath::new(format!("{}[{}]", self.xpath.rc.string , 1)) |
545 | ) | |
546 | ); | |
547 | 19.3k | for i in |
548 | 19.3k | expanded_result.extend_from_slice(&self.replacements.replacements); |
549 | 19.3k | expanded_result.push( |
550 | Replacement::XPath( | |
551 | 19.3k | MyXPath::new(format!("{}[{}]", self.xpath.rc.string , i)) |
552 | ) | |
553 | ); | |
554 | } | |
555 | 7.45k | let replacements = ReplacementArray{ replacements: expanded_result }; |
556 | 7.45k | return replacements.replace(rules_with_context, mathml); |
557 | }, | |
558 | ||
559 | // FIX: should the options be errors??? | |
560 | 0 | Value::String(t) => { return T::from_string(rules_with_context.replace_chars(&t, mathml)?, rules_with_context.doc); }, |
561 | 0 | Value::Number(num) => { return T::from_string( num.to_string(), rules_with_context.doc ); }, |
562 | 0 | Value::Boolean(b) => { return T::from_string( b.to_string(), rules_with_context.doc ); }, // FIX: is this right??? |
563 | } | |
564 | ||
565 | 7.45k | } |
566 | } | |
567 | ||
568 | ||
569 | 2 | static ATTR_NAME_VALUE: LazyLock<Regex> = LazyLock::new(|| { |
570 | 2 | Regex::new( |
571 | // match name='value', where name is sort of an NCNAME (see CONCEPT_OR_LITERAL in infer_intent.rs) | |
572 | // The quotes can be either single or double quotes | |
573 | 2 | r#"(?P<name>[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*)\s*=\s*('(?P<value>[^']+)'|"(?P<dqvalue>[^"]+)")"# |
574 | 2 | ).unwrap() |
575 | 2 | }); |
576 | ||
577 | // structure used when "intent:" is encountered in a rule | |
578 | // the name is either a string or an xpath that needs evaluation. 99% of the time it is a string | |
579 | #[derive(Debug, Clone)] | |
580 | struct Intent { | |
581 | name: Option<String>, // name of node | |
582 | xpath: Option<MyXPath>, // alternative to directly using the string | |
583 | attrs: String, // optional attrs -- format "attr1='val1' [attr2='val2'...]" | |
584 | children: ReplacementArray, // children of node | |
585 | } | |
586 | ||
587 | impl fmt::Display for Intent { | |
588 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
589 | 0 | let name = if let Some(name) = &self.name { |
590 | 0 | name.to_string() |
591 | } else { | |
592 | 0 | self.xpath.as_ref().unwrap().to_string() |
593 | }; | |
594 | 0 | return write!(f, "intent: {}: {}, attrs='{}'>\n children: {}", |
595 | 0 | if self.name.is_some() {"name"} else {"xpath-name"}, name, |
596 | self.attrs, | |
597 | 0 | &self.children); |
598 | 0 | } |
599 | } | |
600 | ||
601 | impl Intent { | |
602 | 283k | fn build(yaml_dict: &Yaml) -> Result<Box<Intent>> { |
603 | // 'intent:' -- 'name': xxx 'children': xxx | |
604 | 283k | if yaml_dict.as_hash().is_none() { |
605 | 0 | bail!("Array found for contents of 'intent' -- should be dictionary with keys 'name' and 'children'") |
606 | 283k | } |
607 | 283k | let name = &yaml_dict["name"]; |
608 | 283k | let xpath_name = &yaml_dict["xpath-name"]; |
609 | 283k | if name.is_badvalue() && |
610 | 0 | bail!("Missing 'name' or 'xpath-name' as part of 'intent'.\n \ |
611 | Suggestion: add 'name:' or if present, indent so it is contained in 'intent'"); | |
612 | 283k | } |
613 | 283k | let attrs = &yaml_dict["attrs"]; |
614 | 283k | let replace = &yaml_dict["children"]; |
615 | 283k | if replace.is_badvalue() { |
616 | 0 | bail!("Missing 'children' as part of 'intent'.\n \ |
617 | Suggestion: add 'children:' or if present, indent so it is contained in 'intent'"); | |
618 | 283k | } |
619 | 283k | return Ok( Box::new( Intent { |
620 | 283k | name: if name.is_badvalue() { |
621 | 283k | xpath: if xpath_name.is_badvalue() { |
622 | 283k | attrs: if attrs.is_badvalue() { |
623 | 283k | children: ReplacementArray::build(replace).context("'children:'") |
624 | } ) ); | |
625 | 283k | } |
626 | ||
627 | 45.5k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
628 | 45.5k | let result = self.children.replace::<Element<'m>>(rules_with_context, mathml) |
629 | 45.5k | .context("replacing inside 'intent'") |
630 | 45.5k | let mut result = lift_children(result); |
631 | 45.5k | if name(result) != "TEMP_NAME" && |
632 | 235 | // this case happens when you have an 'intent' replacement as a direct child of an 'intent' replacement |
633 | 235 | let temp = create_mathml_element(&result.document(), "TEMP_NAME"); |
634 | 235 | temp.append_child(result); |
635 | 235 | result = temp; |
636 | 45.3k | } |
637 | 45.5k | if let Some( |
638 | 11.2k | result.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
639 | 11.2k | set_mathml_name(result, intent_name.as_str()); |
640 | 34.2k | } |
641 | 45.5k | if let Some( |
642 | 34.2k | let xpath_value = my_xpath.evaluate(rules_with_context.get_context(), mathml) |
643 | 34.2k | match xpath_value { |
644 | 34.2k | Value::String(intent_name) => { |
645 | 34.2k | result.set_attribute_value(MATHML_FROM_NAME_ATTR, name(mathml)); |
646 | 34.2k | set_mathml_name(result, intent_name.as_str()) |
647 | }, | |
648 | 0 | _ => bail!("'xpath-name' value '{}' was not a string", &my_xpath), |
649 | } | |
650 | 11.2k | } |
651 | 45.5k | if self.name.is_none() && |
652 | 0 | bail!("Intent::replace: internal error -- neither 'name' nor 'xpath' is set"); |
653 | 45.5k | }; |
654 | ||
655 | 100k | for attr in |
656 | 100k | result.set_attribute_value(attr.name(), attr.value()); |
657 | 100k | } |
658 | ||
659 | // can't test against name == "math" because intent might a new element | |
660 | 45.5k | if mathml.parent().is_some() && mathml.parent().unwrap().element().is_some() && |
661 | 41.7k | result.attribute_value("id") == crate::canonicalize::get_parent(mathml).attribute_value("id") { |
662 | 32 | // avoid duplicate ids -- it's a bug if it does, but this helps in that case |
663 | 32 | result.remove_attribute("id"); |
664 | 45.5k | } |
665 | ||
666 | 45.5k | if !self.attrs.is_empty() { |
667 | // debug!("MathML after children, before attr processing:\n{}", mml_to_string(mathml)); | |
668 | // debug!("Result after children, before attr processing:\n{}", mml_to_string(result)); | |
669 | // debug!("Intent::replace attrs = \"{}\"", &self.attrs); | |
670 | 5.63k | for cap in |
671 | 5.63k | let matched_value = if cap["value"].is_empty() { |
672 | 5.63k | let value_as_xpath = MyXPath::new(matched_value.to_string()).context("attr value inside 'intent'") |
673 | 5.63k | let value = value_as_xpath.evaluate(rules_with_context.get_context(), result) |
674 | 5.63k | .context("attr xpath evaluation value inside 'intent'") |
675 | 5.63k | let mut value = value.into_string(); |
676 | 5.63k | if &cap["name"] == INTENT_PROPERTY { |
677 | 5.23k | value = simplify_fixity_properties(&value); |
678 | 5.23k |
|
679 | // debug!("Intent::replace match\n name={}\n value={}\n xpath value={}", &cap["name"], &cap["value"], &value); | |
680 | 5.63k | if &cap["name"] == INTENT_PROPERTY && |
681 | 1.81k | // should have been an empty string, so remove the attribute |
682 | 1.81k | result.remove_attribute(INTENT_PROPERTY); |
683 | 3.82k | } else { |
684 | 3.82k | result.set_attribute_value(&cap["name"], &value); |
685 | 3.82k | } |
686 | }; | |
687 | 39.9k | } |
688 | ||
689 | // debug!("Result from 'intent:'\n{}", mml_to_string(result)); | |
690 | 45.5k | return T::from_element(result); |
691 | ||
692 | ||
693 | /// "lift" up the children any "TEMP_NAME" child -- could short circuit when only one child | |
694 | 45.5k | fn lift_children(result: Element) -> Element { |
695 | // debug!("lift_children:\n{}", mml_to_string(result)); | |
696 | // most likely there will be the same number of new children as result has, but there could be more | |
697 | 45.5k | let mut new_children = Vec::with_capacity(2*result.children().len()); |
698 | 69.6k | for child_of_element in |
699 | 69.6k | match child_of_element { |
700 | 69.6k | ChildOfElement::Element(child) => { |
701 | 69.6k | if name(child) == "TEMP_NAME" { |
702 | 34.1k | new_children.append(&mut child.children()); // almost always just one |
703 | 35.5k | } else { |
704 | 35.5k | new_children.push(child_of_element); |
705 | 35.5k | } |
706 | }, | |
707 | 7 | _ => new_children.push(child_of_element), // text() |
708 | } | |
709 | } | |
710 | 45.5k | result.replace_children(new_children); |
711 | 45.5k | return result; |
712 | 45.5k | } |
713 | 45.5k | } |
714 | } | |
715 | ||
716 | // structure used when "with:" is encountered in a rule | |
717 | // the variables are placed on (and later) popped of a variable stack before/after the replacement | |
718 | #[derive(Debug, Clone)] | |
719 | struct With { | |
720 | variables: VariableDefinitions, // variables and values | |
721 | replacements: ReplacementArray, // what to do with these vars | |
722 | } | |
723 | ||
724 | impl fmt::Display for With { | |
725 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
726 | 0 | return write!(f, "with:\n variables: {}\n replace: {}", &self.variables, &self.replacements); |
727 | 0 | } |
728 | } | |
729 | ||
730 | impl With { | |
731 | 77.5k | fn build(vars_replacements: &Yaml) -> Result<Box<With>> { |
732 | // 'with:' -- 'variables': xxx 'replace': xxx | |
733 | 77.5k | if vars_replacements.as_hash().is_none() { |
734 | 0 | bail!("Array found for contents of 'with' -- should be dictionary with keys 'variables' and 'replace'") |
735 | 77.5k | } |
736 | 77.5k | let var_defs = &vars_replacements["variables"]; |
737 | 77.5k | if var_defs.is_badvalue() { |
738 | 0 | bail!("Missing 'variables' as part of 'with'.\n \ |
739 | Suggestion: add 'variables:' or if present, indent so it is contained in 'with'"); | |
740 | 77.5k | } |
741 | 77.5k | let replace = &vars_replacements["replace"]; |
742 | 77.5k | if replace.is_badvalue() { |
743 | 0 | bail!("Missing 'replace' as part of 'with'.\n \ |
744 | Suggestion: add 'replace:' or if present, indent so it is contained in 'with'"); | |
745 | 77.5k | } |
746 | 77.5k | return Ok( Box::new( With { |
747 | 77.5k | variables: VariableDefinitions::build(var_defs).context("'variables'") |
748 | 77.5k | replacements: ReplacementArray::build(replace).context("'replace:'") |
749 | } ) ); | |
750 | 77.5k | } |
751 | ||
752 | 7.28k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
753 | 7.28k | rules_with_context.context_stack.push(self.variables.clone(), mathml) |
754 | 7.28k | let result = self.replacements.replace(rules_with_context, mathml) |
755 | 7.28k | .context("replacing inside 'with'") |
756 | 7.28k | rules_with_context.context_stack.pop(); |
757 | 7.28k | return Ok( result ); |
758 | 7.28k | } |
759 | } | |
760 | ||
761 | // structure used when "set_variables:" is encountered in a rule | |
762 | // the variables are global and are placed in the base context and never popped off | |
763 | #[derive(Debug, Clone)] | |
764 | struct SetVariables { | |
765 | variables: VariableDefinitions, // variables and values | |
766 | } | |
767 | ||
768 | impl fmt::Display for SetVariables { | |
769 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
770 | 0 | return write!(f, "SetVariables: variables {}", &self.variables); |
771 | 0 | } |
772 | } | |
773 | ||
774 | impl SetVariables { | |
775 | 30.3k | fn build(vars: &Yaml) -> Result<Box<SetVariables>> { |
776 | // 'set_variables:' -- 'variables': xxx (array) | |
777 | 30.3k | if vars.as_vec().is_none() { |
778 | 0 | bail!("'set_variables' -- should be an array of variable name, xpath value"); |
779 | 30.3k | } |
780 | 30.3k | return Ok( Box::new( SetVariables { |
781 | 30.3k | variables: VariableDefinitions::build(vars).context("'set_variables'") |
782 | } ) ); | |
783 | 30.3k | } |
784 | ||
785 | 3.78k | fn replace<'c, 's:'c, 'm: 'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
786 | 3.78k | rules_with_context.context_stack.set_globals(self.variables.clone(), mathml) |
787 | 3.78k | return T::from_string( "".to_string(), rules_with_context.doc ); |
788 | 3.78k | } |
789 | } | |
790 | ||
791 | ||
792 | /// Allow speech of an expression in the middle of a rule (used by "WhereAmI" for navigation) | |
793 | #[derive(Debug, Clone)] | |
794 | struct TranslateExpression { | |
795 | xpath: MyXPath, // variables and values | |
796 | } | |
797 | ||
798 | impl fmt::Display for TranslateExpression { | |
799 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
800 | 0 | return write!(f, "speak: {}", &self.xpath); |
801 | 0 | } |
802 | } | |
803 | impl TranslateExpression { | |
804 | 102 | fn build(vars: &Yaml) -> Result<TranslateExpression> { |
805 | // 'translate:' -- xpath (should evaluate to an id) | |
806 | 102 | return Ok( TranslateExpression { xpath: MyXPath::build(vars).context("'translate'") |
807 | 102 | } |
808 | ||
809 | 2 | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
810 | 2 | if self.xpath.rc.string.starts_with('@') { |
811 | 2 | let xpath_value = self.xpath.evaluate(rules_with_context.get_context(), mathml) |
812 | 2 | let id = match xpath_value { |
813 | 0 | Value::String(s) => Some(s), |
814 | 2 | Value::Nodeset(nodes) => { |
815 | 2 | if nodes.size() == 1 { |
816 | 2 | nodes.document_order_first().unwrap().attribute().map(|attr| attr.value().to_string()) |
817 | } else { | |
818 | 0 | None |
819 | } | |
820 | }, | |
821 | 0 | _ => None, |
822 | }; | |
823 | 2 | match id { |
824 | 0 | None => bail!("'translate' value '{}' is not a string or an attribute value (correct by using '@id'??):\n", self.xpath), |
825 | 2 | Some(id) => { |
826 | 2 | let speech = speak_mathml(mathml, &id, 0) |
827 | 2 | return T::from_string(speech, rules_with_context.doc); |
828 | } | |
829 | } | |
830 | } else { | |
831 | 0 | return T::from_string( |
832 | 0 | self.xpath.replace(rules_with_context, mathml).context("'translate'")?, |
833 | 0 | rules_with_context.doc |
834 | ); | |
835 | } | |
836 | 2 | } |
837 | } | |
838 | ||
839 | ||
840 | /// An array of rule `Replacement`s (text, xpath, tts commands, etc) | |
841 | #[derive(Debug, Clone)] | |
842 | pub struct ReplacementArray { | |
843 | replacements: Vec<Replacement> | |
844 | } | |
845 | ||
846 | impl fmt::Display for ReplacementArray { | |
847 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
848 | 1 | return write!(f, "{}", self.pretty_print_replacements()); |
849 | 1 | } |
850 | } | |
851 | ||
852 | impl ReplacementArray { | |
853 | /// Return an empty `ReplacementArray` | |
854 | 1.98M | pub fn build_empty() -> ReplacementArray { |
855 | 1.98M | return ReplacementArray { |
856 | 1.98M | replacements: vec![] |
857 | 1.98M | } |
858 | 1.98M | } |
859 | ||
860 | /// Convert a Yaml input into a [`ReplacementArray`]. | |
861 | /// Any errors are passed back out. | |
862 | 9.24M | pub fn build(replacements: &Yaml) -> Result<ReplacementArray> { |
863 | // replacements is either a single replacement or an array of replacements | |
864 | 9.24M | let result= if replacements.is_array() { |
865 | 9.22M | let replacements = replacements.as_vec().unwrap(); |
866 | 9.22M | replacements |
867 | 9.22M | .iter() |
868 | 9.22M | .enumerate() // useful for errors |
869 | 13.4M | . |
870 | 13.4M | .with_context(|| |
871 | 9.22M | .collect::<Result<Vec<Replacement>>>() |
872 | } else { | |
873 | 21.2k | vec![ Replacement::build(replacements) |
874 | }; | |
875 | ||
876 | 9.24M | return Ok( ReplacementArray{ replacements: result } ); |
877 | 9.24M | } |
878 | ||
879 | /// Do all the replacements in `mathml` using `rules`. | |
880 | 275k | pub fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
881 | 275k | return T::replace(self, rules_with_context, mathml); |
882 | 275k | } |
883 | ||
884 | 142k | pub fn replace_array_string<'c, 's:'c, 'm:'c>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<String> { |
885 | // loop over the replacements and build up a vector of strings, excluding empty ones. | |
886 | // * eliminate any redundance | |
887 | // * add/replace auto-pauses | |
888 | // * join the remaining vector together | |
889 | 142k | let mut replacement_strings = Vec::with_capacity(self.replacements.len()); // probably conservative guess |
890 | 271k | for replacement in |
891 | 271k | let string: String = rules_with_context.replace(replacement, mathml) |
892 | 271k | if !string.is_empty() { |
893 | 200k | replacement_strings.push(string); |
894 | 200k |
|
895 | } | |
896 | ||
897 | 142k | if replacement_strings.is_empty() { |
898 | 12.9k | return Ok( "".to_string() ); |
899 | 129k | } |
900 | // delete an optional text that is repetitive | |
901 | // we do this by looking for the optional text marker, and if present, check for repetition at end of previous string | |
902 | // if repetitive, we delete the optional string | |
903 | // if not, we leave the markers because the repetition might happen several "levels" up | |
904 | // this could also be done in a final cleanup of the entire string (where we remove any markers), | |
905 | // but the match is harder (rust regex lacks look behind pattern match) and it is less efficient | |
906 | // Note: we skip the first string since it can't be repetitive of something at this level | |
907 | 129k | for |
908 | 45.4k | if let Some( |
909 | 13 | replacement_strings[i] = bytes.to_string(); |
910 | 45.4k | } |
911 | } | |
912 | ||
913 | 200k | for i in |
914 | 200k | if replacement_strings[i].contains(PAUSE_AUTO_STR) { |
915 | 19.5k | let before = if i == 0 { |
916 | 19.5k | let after = if i+1 == replacement_strings.len() { |
917 | 19.5k | replacement_strings[i] = replacement_strings[i].replace( |
918 | 19.5k | PAUSE_AUTO_STR, |
919 | 19.5k | &rules_with_context.speech_rules.pref_manager.borrow().get_tts().compute_auto_pause(&rules_with_context.speech_rules.pref_manager.borrow(), before, after)); |
920 | 181k | } |
921 | } | |
922 | ||
923 | // join the strings together with spaces in between | |
924 | // concatenation (removal of spaces) is saved for the top level because they otherwise are stripped at the wrong sometimes | |
925 | 129k | return Ok( replacement_strings.join(" ") ); |
926 | ||
927 | /// delete an optional text (in 'next') that is repetitive at the end of 'prev' | |
928 | /// we do this by looking for the optional text marker, and if present, check for repetition at end of previous string | |
929 | /// if repetitive, we delete the optional string | |
930 | 45.4k | fn is_repetitive<'a>(prev: &str, next: &'a str) -> Option<&'a str> { |
931 | // OPTIONAL_INDICATOR optionally surrounds the end of 'prev'(ignoring trailing whitespace) | |
932 | // OPTIONAL_INDICATOR surrounds the start of 'next' | |
933 | // minor optimization -- lots of short strings and the OPTIONAL_INDICATOR takes a few bytes, so skip the check for those strings | |
934 | 45.4k | if next.len() <= 2 * OPTIONAL_INDICATOR_LEN { |
935 | 14.2k | return None; |
936 | 31.2k | } |
937 | ||
938 | // should be exactly one match -- ignore more than one for now | |
939 | 31.2k | let |
940 | 36 | let start_repeat_word_in_next = &next[i_start + OPTIONAL_INDICATOR_LEN..]; |
941 | 36 | let i_end = start_repeat_word_in_next.find(OPTIONAL_INDICATOR) |
942 | 36 | .unwrap_or_else(|| |
943 | 36 | let repeat_word = &start_repeat_word_in_next[..i_end]; |
944 | // debug!("check if '{}' is repetitive, end_index={}", repeat_word, i_end); | |
945 | // debug!(" prev: '{}', next '{}'", prev, next); | |
946 | ||
947 | 36 | let prev_trimmed = prev.trim_end(); |
948 | 36 | let ends_with_word = prev_trimmed.len() > repeat_word.len() && |
949 | 36 | let ends_with_wrapped_word = |
950 | 36 | prev_trimmed |
951 | 36 | .strip_suffix(OPTIONAL_INDICATOR) |
952 | 36 | .and_then(|s| |
953 | 36 | .and_then(|s| |
954 | 36 | .is_some(); |
955 | 36 | if ends_with_word || |
956 | // debug!(" is repetitive"); | |
957 | 13 | Some(start_repeat_word_in_next[i_end + OPTIONAL_INDICATOR_LEN..].trim_start()) // remove repeat word and OPTIONAL_INDICATOR |
958 | } else { | |
959 | 23 | None |
960 | } | |
961 | 45.4k | } |
962 | 142k | } |
963 | ||
964 | 132k | pub fn replace_array_tree<'c, 's:'c, 'm:'c>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<Element<'m>> { |
965 | // shortcut for common case (don't build a new tree node) | |
966 | 132k | if self.replacements.len() == 1 { |
967 | 129k | return rules_with_context.replace::<Element<'m>>(&self.replacements[0], mathml); |
968 | 3.20k | } |
969 | ||
970 | 3.20k | let new_element = create_mathml_element(&rules_with_context.doc, "Unknown"); // Hopefully set later (in Intent::Replace()) |
971 | 3.20k | let mut new_children = Vec::with_capacity(self.replacements.len()); |
972 | 6.12k | for child in |
973 | 6.12k | let child = rules_with_context.replace::<Element<'m>>(child, mathml) |
974 | 6.12k | new_children.push(ChildOfElement::Element(child)); |
975 | }; | |
976 | 3.20k | new_element.append_children(new_children); |
977 | 3.20k | return Ok(new_element); |
978 | 132k | } |
979 | ||
980 | ||
981 | /// Return true if there are no replacements. | |
982 | 29.8k | pub fn is_empty(&self) -> bool { |
983 | 29.8k | return self.replacements.is_empty(); |
984 | 29.8k | } |
985 | ||
986 | 10 | fn pretty_print_replacements(&self) -> String { |
987 | 10 | let mut group_string = String::with_capacity(128); |
988 | 10 | if self.replacements.len() == 1 { |
989 | 9 | group_string += &format!("[{}]", self.replacements[0]); |
990 | 9 | } else { |
991 | 1 | group_string += &self.replacements.iter() |
992 | 1 | .map(|replacement| |
993 | 1 | .collect::<Vec<String>>() |
994 | 1 | .join(""); |
995 | 1 | group_string += "\n"; |
996 | } | |
997 | 10 | return group_string; |
998 | 10 | } |
999 | } | |
1000 | ||
1001 | ||
1002 | ||
1003 | // MyXPath is a wrapper around an 'XPath' that keeps around the original xpath expr (as a string) so it can be used in error reporting. | |
1004 | // Because we want to be able to clone them and XPath doesn't support clone(), this is a wrapper around an internal MyXPath. | |
1005 | // It supports the standard SpeechRule functionality of building and replacing. | |
1006 | #[derive(Debug)] | |
1007 | struct RCMyXPath { | |
1008 | xpath: XPath, | |
1009 | string: String, // store for error reporting | |
1010 | } | |
1011 | ||
1012 | #[derive(Debug, Clone)] | |
1013 | pub struct MyXPath { | |
1014 | rc: Rc<RCMyXPath> // rather than putting Rc around both 'xpath' and 'string', just use one and indirect to internal RCMyXPath | |
1015 | } | |
1016 | ||
1017 | ||
1018 | impl fmt::Display for MyXPath { | |
1019 | 2.79k | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1020 | 2.79k | return write!(f, "\"{}\"", self.rc.string); |
1021 | 2.79k | } |
1022 | } | |
1023 | ||
1024 | // pub fn xpath_count() -> (usize, usize) { | |
1025 | // return (XPATH_CACHE.with( |cache| cache.borrow().len()), unsafe{XPATH_CACHE_HITS} ); | |
1026 | // } | |
1027 | thread_local!{ | |
1028 | static XPATH_CACHE: RefCell<HashMap<String, MyXPath>> = RefCell::new( HashMap::with_capacity(2047) ); | |
1029 | } | |
1030 | // static mut XPATH_CACHE_HITS: usize = 0; | |
1031 | ||
1032 | impl MyXPath { | |
1033 | 8.88M | fn new(xpath: String) -> Result<MyXPath> { |
1034 | 8.88M | return XPATH_CACHE.with( |cache| { |
1035 | 8.88M | let mut cache = cache.borrow_mut(); |
1036 | return Ok( | |
1037 | 8.88M | match cache.get(&xpath) { |
1038 | 5.82M | Some(compiled_xpath) => { |
1039 | // unsafe{ XPATH_CACHE_HITS += 1;}; | |
1040 | 5.82M | compiled_xpath.clone() |
1041 | }, | |
1042 | None => { | |
1043 | 3.06M | let new_xpath = MyXPath { |
1044 | 3.06M | rc: Rc::new( RCMyXPath { |
1045 | 3.06M | xpath: MyXPath::compile_xpath(&xpath) |
1046 | 3.06M | string: xpath.clone() |
1047 | })}; | |
1048 | 3.06M | cache.insert(xpath.clone(), new_xpath.clone()); |
1049 | 3.06M | new_xpath |
1050 | }, | |
1051 | } | |
1052 | ) | |
1053 | 8.88M | }); |
1054 | 8.88M | } |
1055 | ||
1056 | 8.83M | pub fn build(xpath: &Yaml) -> Result<MyXPath> { |
1057 | 8.83M | let xpath = match xpath { |
1058 | 8.64M | Yaml::String(s) => s.to_string(), |
1059 | 0 | Yaml::Integer(i) => i.to_string(), |
1060 | 0 | Yaml::Real(s) => s.to_string(), |
1061 | 0 | Yaml::Boolean(s) => s.to_string(), |
1062 | 193k | Yaml::Array(v) => |
1063 | // array of strings -- concatenate them together | |
1064 | 193k | v.iter() |
1065 | 193k | .map(as_str_checked) |
1066 | 193k | .collect::<Result<Vec<&str>>>() |
1067 | 193k | .join(" "), |
1068 | 0 | _ => bail!("Bad value when trying to create an xpath: {}", yaml_to_string(xpath, 1)), |
1069 | }; | |
1070 | 8.83M | return MyXPath::new(xpath); |
1071 | 8.83M | } |
1072 | ||
1073 | 3.06M | fn compile_xpath(xpath: &str) -> Result<XPath> { |
1074 | 3.06M | let factory = Factory::new(); |
1075 | 3.06M | let xpath_with_debug_info = MyXPath::add_debug_string_arg(xpath) |
1076 | 3.06M | let compiled_xpath = factory.build(&xpath_with_debug_info) |
1077 | 3.06M | .with_context(|| |
1078 | "Could not compile XPath for pattern:\n{}{}", | |
1079 | 0 | &xpath, more_details(xpath)))?; |
1080 | 3.06M | return match compiled_xpath { |
1081 | 3.06M | Some(xpath) => Ok(xpath), |
1082 | 0 | None => bail!("Problem compiling Xpath for pattern:\n{}{}", |
1083 | 0 | &xpath, more_details(xpath)), |
1084 | }; | |
1085 | ||
1086 | ||
1087 | 0 | fn more_details(xpath: &str) -> String { |
1088 | // try to give a better error message by counting [], (), 's, and "s | |
1089 | 0 | if xpath.is_empty() { |
1090 | 0 | return "xpath is empty string".to_string(); |
1091 | 0 | } |
1092 | 0 | let as_bytes = xpath.trim().as_bytes(); |
1093 | 0 | if as_bytes[0] == b'\'' && as_bytes[as_bytes.len()-1] != b'\'' { |
1094 | 0 | return "\nmissing \"'\"".to_string(); |
1095 | 0 | } |
1096 | 0 | if (as_bytes[0] == b'"' && as_bytes[as_bytes.len()-1] != b'"') || |
1097 | 0 | (as_bytes[0] != b'"' && as_bytes[as_bytes.len()-1] == b'"'){ |
1098 | 0 | return "\nmissing '\"'".to_string(); |
1099 | 0 | } |
1100 | ||
1101 | 0 | let mut i_bytes = 0; // keep track of # of bytes into string for error reporting |
1102 | 0 | let mut paren_count = 0; // counter to make sure they are balanced |
1103 | 0 | let mut i_paren = 0; // position of the outermost open paren |
1104 | 0 | let mut bracket_count = 0; |
1105 | 0 | let mut i_bracket = 0; |
1106 | 0 | for ch in xpath.chars() { |
1107 | 0 | if ch == '(' { |
1108 | 0 | if paren_count == 0 { |
1109 | 0 | i_paren = i_bytes; |
1110 | 0 | } |
1111 | 0 | paren_count += 1; |
1112 | 0 | } else if ch == '[' { |
1113 | 0 | if bracket_count == 0 { |
1114 | 0 | i_bracket = i_bytes; |
1115 | 0 | } |
1116 | 0 | bracket_count += 1; |
1117 | 0 | } else if ch == ')' { |
1118 | 0 | if paren_count == 0 { |
1119 | 0 | return format!("\nExtra ')' found after '{}'", &xpath[i_paren..i_bytes]); |
1120 | 0 | } |
1121 | 0 | paren_count -= 1; |
1122 | 0 | if paren_count == 0 && bracket_count > 0 && i_bracket > i_paren { |
1123 | 0 | return format!("\nUnclosed brackets found at '{}'", &xpath[i_paren..i_bytes]); |
1124 | 0 | } |
1125 | 0 | } else if ch == ']' { |
1126 | 0 | if bracket_count == 0 { |
1127 | 0 | return format!("\nExtra ']' found after '{}'", &xpath[i_bracket..i_bytes]); |
1128 | 0 | } |
1129 | 0 | bracket_count -= 1; |
1130 | 0 | if bracket_count == 0 && paren_count > 0 && i_paren > i_bracket { |
1131 | 0 | return format!("\nUnclosed parens found at '{}'", &xpath[i_bracket..i_bytes]); |
1132 | 0 | } |
1133 | 0 | } |
1134 | 0 | i_bytes += ch.len_utf8(); |
1135 | } | |
1136 | 0 | return "".to_string(); |
1137 | 0 | } |
1138 | 3.06M | } |
1139 | ||
1140 | /// Convert DEBUG(...) input to the internal function which is DEBUG(arg, arg_as_string) | |
1141 | 3.06M | fn add_debug_string_arg(xpath: &str) -> Result<String> { |
1142 | // do a quick check to see if "DEBUG" is in the string -- this is the common case | |
1143 | 3.06M | let debug_start = xpath.find("DEBUG("); |
1144 | 3.06M | if debug_start.is_none() { |
1145 | 3.06M | return Ok( xpath.to_string() ); |
1146 | 1.56k | } |
1147 | ||
1148 | 1.56k | let debug_start = debug_start.unwrap(); |
1149 | 1.56k | let mut before_paren = xpath[..debug_start+5].to_string(); // includes "DEBUG" |
1150 | 1.56k | let chars = xpath[debug_start+5..].chars().collect::<Vec<char>>(); // begins at '(' |
1151 | 1.56k | before_paren.push_str(&chars_add_debug_string_arg(&chars).with_context(|| |
1152 | // debug!("add_debug_string_arg: {}", before_paren); | |
1153 | 1.56k | return Ok(before_paren); |
1154 | ||
1155 | 1.56k | fn chars_add_debug_string_arg(chars: &[char]) -> Result<String> { |
1156 | // Find all the DEBUG(...) commands in 'xpath' and adds a string argument. | |
1157 | // The DEBUG function that is used internally takes two arguments, the second one being a string version of the DEBUG arg. | |
1158 | // Being a string, any quotes need to be escaped, and DEBUGs inside of DEBUGs need more escaping. | |
1159 | // This is done via recursive calls to this function. | |
1160 | 1.56k | assert_eq!(chars[0], '(', "{} does not start with ')'", |
1161 | 1.56k | let mut count = 1; // open/close count |
1162 | 1.56k | let mut i = 1; |
1163 | 1.56k | let mut inside_quote = false; |
1164 | 50.8k | while i < chars.len() { |
1165 | 50.8k | let ch = chars[i]; |
1166 | 805 | match ch { |
1167 | '\\' => { | |
1168 | 0 | if i+1 == chars.len() { |
1169 | 0 | bail!("Syntax error in DEBUG: last char is escape char\nDebug string: '{}'", chars.iter().collect::<String>()); |
1170 | 0 | } |
1171 | 0 | i += 1; |
1172 | }, | |
1173 | 2.21k | '\'' => inside_quote = !inside_quote, |
1174 | 804 | '(' if !inside_quote => { |
1175 | 804 | count += 1; |
1176 | 804 | // FIX: it would be more efficient to spot "DEBUG" preceding this and recurse rather than matching the whole string and recursing |
1177 | 804 | }, |
1178 | 1 | '(' => (), |
1179 | 2.36k | ')' if !inside_quote => { |
1180 | 2.36k | count -= 1; |
1181 | 2.36k | if count == 0 { |
1182 | 1.56k | let arg = &chars[1..i].iter().collect::<String>(); |
1183 | 1.56k | let escaped_arg = arg.replace('"', "\\\""); |
1184 | // DEBUG(...) may be inside 'arg' -- recurse | |
1185 | 1.56k | let processed_arg = MyXPath::add_debug_string_arg(arg) |
1186 | ||
1187 | // DEBUG(...) may be in the remainder of the string -- recurse | |
1188 | 1.56k | let processed_rest = MyXPath::add_debug_string_arg(&chars[i+1..].iter().collect::<String>()) |
1189 | 1.56k | return Ok( format!("({processed_arg}, \"{escaped_arg}\"){processed_rest}") ); |
1190 | 804 | } |
1191 | }, | |
1192 | 0 | ')' => (), |
1193 | 45.4k | _ => (), |
1194 | } | |
1195 | 49.2k | i += 1; |
1196 | } | |
1197 | 0 | bail!("Syntax error in DEBUG: didn't find matching closing paren\nDEBUG{}", chars.iter().collect::<String>()); |
1198 | 1.56k | } |
1199 | 3.06M | } |
1200 | ||
1201 | 156k | fn is_true(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1202 | // return true if there is no condition or if the condition evaluates to true | |
1203 | return Ok( | |
1204 | 156k | match self.evaluate(context, mathml) |
1205 | 115k | Value::Boolean(b) => b, |
1206 | 40.6k | Value::Nodeset(nodes) => nodes.size() > 0, |
1207 | 0 | _ => false, |
1208 | } | |
1209 | ) | |
1210 | 156k | } |
1211 | ||
1212 | 153k | pub fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1213 | 153k | if self.rc.string == "process-intent(.)" { |
1214 | 2.47k | return |
1215 | 150k | } |
1216 | ||
1217 | 150k | let result = self.evaluate(&rules_with_context.context_stack.base, mathml) |
1218 | 150k | .with_context(|| |
1219 | 150k | let |
1220 | 121k | Value::Nodeset(nodes) => { |
1221 | 121k | if nodes.size() == 0 { |
1222 | 0 | bail!("During replacement, no matching element found"); |
1223 | 121k | } |
1224 | 121k | return rules_with_context.replace_nodes(nodes.document_order(), mathml); |
1225 | }, | |
1226 | 25.1k | Value::String(s) => s, |
1227 | 3.80k | Value::Number(num) => num.to_string(), |
1228 | 0 | Value::Boolean(b) => b.to_string(), // FIX: is this right??? |
1229 | }; | |
1230 | // Hack!: this test for input that starts with a '$' (defined variable), avoids a double evaluate; | |
1231 | // We don't need NO_EVAL_QUOTE_CHAR here, but the more general solution of a quoted execute (- xq:) would avoid this hack | |
1232 | 28.9k | let result = if self.rc.string.starts_with('$') { |
1233 | 28.9k | return T::from_string(result, rules_with_context.doc ); |
1234 | 153k | } |
1235 | ||
1236 | 1.29M | pub fn evaluate<'c>(&self, context: &sxd_xpath::Context<'c>, mathml: Element<'c>) -> Result<Value<'c>> { |
1237 | // debug!("evaluate: {}", self); | |
1238 | 1.29M | let result = self.rc.xpath.evaluate(context, mathml); |
1239 | 1.29M | return match result { |
1240 | 1.29M | Ok(val) => Ok( val ), |
1241 | 0 | Err(e) => { |
1242 | // debug!("MyXPath::trying to evaluate:\n '{}'\n caused the error\n'{}'", self, e.to_string().replace("OwnedPrefixedName { prefix: None, local_part:", "").replace(" }", "")); | |
1243 | 0 | bail!( "{}\n\n", |
1244 | // remove confusing parts of error message from xpath | |
1245 | 0 | e.to_string().replace("OwnedPrefixedName { prefix: None, local_part:", "").replace(" }", "") ); |
1246 | } | |
1247 | }; | |
1248 | 1.29M | } |
1249 | ||
1250 | 0 | pub fn test_input<F>(self, f: F) -> bool where F: Fn(&str) -> bool { |
1251 | 0 | return f(self.rc.string.as_ref()); |
1252 | 0 | } |
1253 | } | |
1254 | ||
1255 | // 'SpeechPattern' holds a single pattern. | |
1256 | // Some info is not needed beyond converting the Yaml to the SpeechPattern, but is useful for error reporting. | |
1257 | // The two main parts are the pattern to be matched and the replacements to do if there is a match. | |
1258 | // Any variables/prefs that are defined/set are also stored. | |
1259 | #[derive(Debug)] | |
1260 | struct SpeechPattern { | |
1261 | pattern_name: String, | |
1262 | tag_name: String, | |
1263 | file_name: String, | |
1264 | pattern: MyXPath, // the xpath expr to attempt to match | |
1265 | match_uses_var_defs: bool, // include var_defs in context for matching | |
1266 | var_defs: VariableDefinitions, // any variable definitions [can be and probably is an empty vector most of the time] | |
1267 | replacements: ReplacementArray, // the replacements in case there is a match | |
1268 | } | |
1269 | ||
1270 | impl fmt::Display for SpeechPattern { | |
1271 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1272 | 0 | return write!(f, "[name: {}, tag: {},\n variables: {:?}, pattern: {},\n replacement: {}]", |
1273 | self.pattern_name, self.tag_name, self.var_defs, self.pattern, | |
1274 | 0 | self.replacements.pretty_print_replacements()); |
1275 | 0 | } |
1276 | } | |
1277 | ||
1278 | impl SpeechPattern { | |
1279 | 894k | fn build(dict: &Yaml, file: &Path, rules: &mut SpeechRules) -> Result<Option<Vec<PathBuf>>> { |
1280 | // Rule::SpeechPattern | |
1281 | // build { "pattern_name", "tag_name", "pattern", "replacement" } | |
1282 | // or recurse via include: file_name | |
1283 | ||
1284 | // debug!("\nbuild_speech_pattern: dict:\n{}", yaml_to_string(dict, 0)); | |
1285 | 894k | if let Some( |
1286 | 30.2k | let do_include_fn = |new_file: &Path| { |
1287 | 30.2k | rules.read_patterns(new_file) |
1288 | 30.2k | }; |
1289 | ||
1290 | 30.2k | return Ok( Some(process_include(file, include_file_name, do_include_fn) |
1291 | 864k | } |
1292 | ||
1293 | 864k | let pattern_name = find_str(dict, "name"); |
1294 | ||
1295 | // tag_named can be either a string (most common) or an array of strings | |
1296 | 864k | let mut tag_names: Vec<&str> = Vec::new(); |
1297 | 864k | match find_str(dict, "tag") { |
1298 | 740k | Some(str) => tag_names.push(str), |
1299 | None => { | |
1300 | // check for array | |
1301 | 124k | let tag_array = &dict["tag"]; |
1302 | 124k | tag_names = vec![]; |
1303 | 124k | if tag_array.is_array() { |
1304 | 263k | for (i, name) in |
1305 | 263k | match as_str_checked(name) { |
1306 | 0 | Err(e) => return Err( |
1307 | 0 | e.context( |
1308 | 0 | format!("tag name '{}' is not a string in:\n{}", |
1309 | 0 | &yaml_to_string(&tag_array.as_vec().unwrap()[i], 0), |
1310 | 0 | &yaml_to_string(dict, 1))) |
1311 | 0 | ), |
1312 | 263k | Ok(str) => tag_names.push(str), |
1313 | }; | |
1314 | } | |
1315 | } else { | |
1316 | 0 | bail!("Errors trying to find 'tag' in:\n{}", &yaml_to_string(dict, 1)); |
1317 | } | |
1318 | } | |
1319 | } | |
1320 | ||
1321 | 864k | if pattern_name.is_none() { |
1322 | 0 | if dict.is_null() { |
1323 | 0 | bail!("Error trying to find 'name': empty value (two consecutive '-'s?"); |
1324 | } else { | |
1325 | 0 | bail!("Errors trying to find 'name' in:\n{}", &yaml_to_string(dict, 1)); |
1326 | }; | |
1327 | 864k | }; |
1328 | 864k | let pattern_name = pattern_name.unwrap().to_string(); |
1329 | ||
1330 | // FIX: add check to make sure tag_name is a valid MathML tag name | |
1331 | 864k | if dict["match"].is_badvalue() { |
1332 | 0 | bail!("Did not find 'match' in\n{}", yaml_to_string(dict, 1)); |
1333 | 864k | } |
1334 | 864k | if dict["replace"].is_badvalue() { |
1335 | 0 | bail!("Did not find 'replace' in\n{}", yaml_to_string(dict, 1)); |
1336 | 864k | } |
1337 | ||
1338 | // xpath's can't be cloned, so we need to do a 'build_xxx' for each tag name | |
1339 | 1.00M | for tag_name in |
1340 | 1.00M | let tag_name = tag_name.to_string(); |
1341 | 1.00M | let pattern_xpath = MyXPath::build(&dict["match"]) |
1342 | 1.00M | .with_context(|| |
1343 | 0 | format!("value for 'match' in rule ({}: {}):\n{}", |
1344 | 0 | tag_name, pattern_name, yaml_to_string(dict, 1)) |
1345 | 0 | })?; |
1346 | 1.00M | let speech_pattern = |
1347 | 1.00M | Box::new( SpeechPattern{ |
1348 | 1.00M | pattern_name: pattern_name.clone(), |
1349 | 1.00M | tag_name: tag_name.clone(), |
1350 | 1.00M | file_name: file.to_str().unwrap().to_string(), |
1351 | 1.00M | match_uses_var_defs: dict["variables"].is_array() && |
1352 | 1.00M | pattern: pattern_xpath, |
1353 | 1.00M | var_defs: VariableDefinitions::build(&dict["variables"]) |
1354 | 1.00M | .with_context(|| |
1355 | 0 | format!("value for 'variables' in rule ({}: {}):\n{}", |
1356 | 0 | tag_name, pattern_name, yaml_to_string(dict, 1)) |
1357 | 0 | })?, |
1358 | 1.00M | replacements: ReplacementArray::build(&dict["replace"]) |
1359 | 1.00M | .with_context(|| |
1360 | 0 | format!("value for 'replace' in rule ({}: {}). Replacements:\n{}", |
1361 | 0 | tag_name, pattern_name, yaml_to_string(&dict["replace"], 1)) |
1362 | 0 | })? |
1363 | } ); | |
1364 | // get the array of rules for the tag name | |
1365 | 1.00M | let rule_value = rules.rules.entry(tag_name).or_default(); |
1366 | ||
1367 | // if the name exists, replace it. Otherwise add the new rule | |
1368 | 2.67M | match |
1369 | 1.00M | None => rule_value.push(speech_pattern), |
1370 | 9 | Some((i, _old_pattern)) => { |
1371 | 9 | let old_rule = &rule_value[i]; |
1372 | 9 | info!("\n\n***WARNING***: replacing {}/'{}' in {} with rule from {}\n", |
1373 | old_rule.tag_name, old_rule.pattern_name, old_rule.file_name, speech_pattern.file_name); | |
1374 | 9 | rule_value[i] = speech_pattern; |
1375 | }, | |
1376 | } | |
1377 | } | |
1378 | ||
1379 | 864k | return Ok(None); |
1380 | 894k | } |
1381 | ||
1382 | 870k | fn is_match(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1383 | 870k | if self.tag_name != mathml.name().local_part() && |
1384 | 0 | return Ok( false ); |
1385 | 870k | } |
1386 | ||
1387 | // debug!("\nis_match: pattern='{}'", self.pattern_name); | |
1388 | // debug!(" pattern_expr {:?}", self.pattern); | |
1389 | // debug!("is_match: mathml is\n{}", mml_to_string(mathml)); | |
1390 | return Ok( | |
1391 | 870k | match self.pattern.evaluate(context, mathml) |
1392 | 652k | Value::Boolean(b) => b, |
1393 | 217k | Value::Nodeset(nodes) => nodes.size() > 0, |
1394 | 0 | _ => false, |
1395 | } | |
1396 | ); | |
1397 | 870k | } |
1398 | } | |
1399 | ||
1400 | ||
1401 | // 'Test' holds information used if the replacement is a "test:" clause. | |
1402 | // The condition is an xpath expr and the "else:" part is optional. | |
1403 | ||
1404 | #[derive(Debug, Clone)] | |
1405 | struct TestArray { | |
1406 | tests: Vec<Test> | |
1407 | } | |
1408 | ||
1409 | impl fmt::Display for TestArray { | |
1410 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1411 | 0 | for test in &self.tests { |
1412 | 0 | writeln!(f, "{test}")?; |
1413 | } | |
1414 | 0 | return Ok( () ); |
1415 | 0 | } |
1416 | } | |
1417 | ||
1418 | impl TestArray { | |
1419 | 3.29M | fn build(test: &Yaml) -> Result<TestArray> { |
1420 | // 'test:' for convenience takes either a dictionary with keys if/else_if/then/then_test/else/else_test or | |
1421 | // or an array of those values (there should be at most one else/else_test) | |
1422 | ||
1423 | // if 'test' is a dictionary ('Hash'), we convert it to an array with one entry and proceed | |
1424 | 3.29M | let tests = if test.as_hash().is_some() { |
1425 | 3.01M | vec![test] |
1426 | 287k | } else if let Some(vec) = test.as_vec() { |
1427 | 287k | vec.iter().collect() |
1428 | } else { | |
1429 | 0 | bail!("Value for 'test:' is neither a dictionary or an array.") |
1430 | }; | |
1431 | ||
1432 | // each entry in 'tests' should be a dictionary with keys if/then/then_test/else/else_test | |
1433 | // a valid entry is one of: | |
1434 | // if:/else_if:, then:/then_test: and optional else:/else_test: | |
1435 | // else:/else_test: -- if this case, it should be the last entry in 'tests' | |
1436 | // 'if:' should only be the first entry in the array; 'else_if' should never be the first entry. Otherwise, they are the same | |
1437 | 3.29M | let mut test_array = vec![]; |
1438 | 3.73M | for test in |
1439 | 3.73M | if test.as_hash().is_none() { |
1440 | 0 | bail!("Value for array entry in 'test:' must be a dictionary/contain keys"); |
1441 | 3.73M | } |
1442 | 3.73M | let if_part = &test[if test_array.is_empty() { |
1443 | 3.73M | if !if_part.is_badvalue() { |
1444 | // first case: if:, then:, optional else: | |
1445 | 3.69M | let condition = Some( MyXPath::build(if_part) |
1446 | 3.69M | let then_part = TestOrReplacements::build(test, "then", "then_test", true) |
1447 | 3.69M | let else_part = TestOrReplacements::build(test, "else", "else_test", false) |
1448 | 3.69M | let n_keys = if else_part.is_none() { |
1449 | 3.69M | if test.as_hash().unwrap().len() > n_keys { |
1450 | 0 | bail!("A key other than 'if', 'else_if', 'then', 'then_test', 'else', or 'else_test' was found in the 'then' clause of 'test'"); |
1451 | 3.69M | }; |
1452 | 3.69M | test_array.push( |
1453 | 3.69M | Test { condition, then_part, else_part } |
1454 | ); | |
1455 | } else { | |
1456 | // second case: should be else/else_test | |
1457 | 42.3k | let else_part = TestOrReplacements::build(test, "else", "else_test", true) |
1458 | 42.3k | if test.as_hash().unwrap().len() > 1 { |
1459 | 0 | bail!("A key other than 'if', 'else_if', 'then', 'then_test', 'else', or 'else_test' was found the 'else' clause of 'test'"); |
1460 | 42.3k | }; |
1461 | 42.3k | test_array.push( |
1462 | 42.3k | Test { condition: None, then_part: None, else_part } |
1463 | ); | |
1464 | ||
1465 | // there shouldn't be any trailing tests | |
1466 | 42.3k | if test_array.len() < test.as_hash().unwrap().len() { |
1467 | 0 | bail!("'else'/'else_test' key is not last key in 'test:'"); |
1468 | 42.3k | } |
1469 | } | |
1470 | }; | |
1471 | ||
1472 | 3.29M | if test_array.is_empty() { |
1473 | 0 | bail!("No entries for 'test:'"); |
1474 | 3.29M | } |
1475 | ||
1476 | 3.29M | return Ok( TestArray { tests: test_array } ); |
1477 | 3.29M | } |
1478 | ||
1479 | 121k | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1480 | 156k | for test in |
1481 | 156k | if test.is_true(&rules_with_context.context_stack.base, mathml) |
1482 | 85.2k | assert!(test.then_part.is_some()); |
1483 | 85.2k | return test.then_part.as_ref().unwrap().replace(rules_with_context, mathml); |
1484 | 71.1k | } else if let Some( |
1485 | 12.9k | return else_part.replace(rules_with_context, mathml); |
1486 | 58.1k | } |
1487 | } | |
1488 | 23.4k | return T::from_string("".to_string(), rules_with_context.doc); |
1489 | 121k | } |
1490 | } | |
1491 | ||
1492 | #[derive(Debug, Clone)] | |
1493 | // Used to hold then/then_test and also else/else_test -- only one of these can be present at a time | |
1494 | enum TestOrReplacements { | |
1495 | Replacements(ReplacementArray), // replacements to use when a test is true | |
1496 | Test(TestArray), // the array of if/then/else tests | |
1497 | } | |
1498 | ||
1499 | impl fmt::Display for TestOrReplacements { | |
1500 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1501 | 0 | if let TestOrReplacements::Test(_) = self { |
1502 | 0 | write!(f, " _test")?; |
1503 | 0 | } |
1504 | 0 | write!(f, ":")?; |
1505 | 0 | return match self { |
1506 | 0 | TestOrReplacements::Test(t) => write!(f, "{t}"), |
1507 | 0 | TestOrReplacements::Replacements(r) => write!(f, "{r}"), |
1508 | }; | |
1509 | 0 | } |
1510 | } | |
1511 | ||
1512 | impl TestOrReplacements { | |
1513 | 7.43M | fn build(test: &Yaml, replace_key: &str, test_key: &str, key_required: bool) -> Result<Option<TestOrReplacements>> { |
1514 | 7.43M | let part = &test[replace_key]; |
1515 | 7.43M | let test_part = &test[test_key]; |
1516 | 7.43M | if !part.is_badvalue() && |
1517 | 0 | bail!(format!("Only one of '{}' or '{}' is allowed as part of 'test'.\n{}\n \ |
1518 | Suggestion: delete one or adjust indentation", | |
1519 | 0 | replace_key, test_key, yaml_to_string(test, 2))); |
1520 | 7.43M | } |
1521 | 7.43M | if part.is_badvalue() && |
1522 | 2.45M | if key_required { |
1523 | 0 | bail!(format!("Missing one of '{}'/'{}:' as part of 'test:'\n{}\n \ |
1524 | Suggestion: add the missing key or indent so it is contained in 'test'", | |
1525 | 0 | replace_key, test_key, yaml_to_string(test, 2))) |
1526 | } else { | |
1527 | 2.45M | return Ok( None ); |
1528 | } | |
1529 | 4.97M | } |
1530 | // at this point, we have only one of the two options | |
1531 | 4.97M | if test_part.is_badvalue() { |
1532 | 4.26M | return Ok( Some( TestOrReplacements::Replacements( ReplacementArray::build(part) |
1533 | } else { | |
1534 | 712k | return Ok( Some( TestOrReplacements::Test( TestArray::build(test_part) |
1535 | } | |
1536 | 7.43M | } |
1537 | ||
1538 | 98.2k | fn replace<'c, 's:'c, 'm:'c, T:TreeOrString<'c, 'm, T>>(&self, rules_with_context: &mut SpeechRulesWithContext<'c, 's,'m>, mathml: Element<'c>) -> Result<T> { |
1539 | 98.2k | return match self { |
1540 | 92.4k | TestOrReplacements::Replacements(r) => r.replace(rules_with_context, mathml), |
1541 | 5.74k | TestOrReplacements::Test(t) => t.replace(rules_with_context, mathml), |
1542 | } | |
1543 | 98.2k | } |
1544 | } | |
1545 | ||
1546 | #[derive(Debug, Clone)] | |
1547 | struct Test { | |
1548 | condition: Option<MyXPath>, | |
1549 | then_part: Option<TestOrReplacements>, | |
1550 | else_part: Option<TestOrReplacements>, | |
1551 | } | |
1552 | impl fmt::Display for Test { | |
1553 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1554 | 0 | write!(f, "test: [ ")?; |
1555 | 0 | if let Some(if_part) = &self.condition { |
1556 | 0 | write!(f, " if: '{if_part}'")?; |
1557 | 0 | } |
1558 | 0 | if let Some(then_part) = &self.then_part { |
1559 | 0 | write!(f, " then{then_part}")?; |
1560 | 0 | } |
1561 | 0 | if let Some(else_part) = &self.else_part { |
1562 | 0 | write!(f, " else{else_part}")?; |
1563 | 0 | } |
1564 | 0 | return write!(f, "]"); |
1565 | 0 | } |
1566 | } | |
1567 | ||
1568 | impl Test { | |
1569 | 156k | fn is_true(&self, context: &sxd_xpath::Context, mathml: Element) -> Result<bool> { |
1570 | 156k | return match self.condition.as_ref() { |
1571 | 136 | None => Ok( false ), // trivially false -- want to do else part |
1572 | 156k | Some(condition) => condition.is_true(context, mathml) |
1573 | 156k | .context("Failure in conditional test"), |
1574 | } | |
1575 | 156k | } |
1576 | } | |
1577 | ||
1578 | // Used for speech rules with "variables: ..." | |
1579 | #[derive(Debug, Clone)] | |
1580 | struct VariableDefinition { | |
1581 | name: String, // name of variable | |
1582 | value: MyXPath, // xpath value, typically a constant like "true" or "0", but could be "*/*[1]" to store some nodes | |
1583 | } | |
1584 | ||
1585 | impl fmt::Display for VariableDefinition { | |
1586 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1587 | 0 | return write!(f, "[name: {}={}]", self.name, self.value); |
1588 | 0 | } |
1589 | } | |
1590 | ||
1591 | // Used for speech rules with "variables: ..." | |
1592 | #[derive(Debug)] | |
1593 | struct VariableValue<'v> { | |
1594 | name: String, // name of variable | |
1595 | value: Option<Value<'v>>, // xpath value, typically a constant like "true" or "0", but could be "*/*[1]" to store some nodes | |
1596 | } | |
1597 | ||
1598 | impl fmt::Display for VariableValue<'_> { | |
1599 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1600 | 0 | let value = match &self.value { |
1601 | 0 | None => "unset".to_string(), |
1602 | 0 | Some(val) => format!("{val:?}") |
1603 | }; | |
1604 | 0 | return write!(f, "[name: {}, value: {}]", self.name, value); |
1605 | 0 | } |
1606 | } | |
1607 | ||
1608 | impl VariableDefinition { | |
1609 | 472k | fn build(name_value_def: &Yaml) -> Result<VariableDefinition> { |
1610 | 472k | match name_value_def.as_hash() { |
1611 | 472k | Some(map) => { |
1612 | 472k | if map.len() != 1 { |
1613 | 0 | bail!("definition is not a key/value pair. Found {}", |
1614 | 0 | yaml_to_string(name_value_def, 1) ); |
1615 | 472k | } |
1616 | 472k | let (name, value) = map.iter().next().unwrap(); |
1617 | 472k | let name = as_str_checked( name) |
1618 | 472k | .with_context(|| |
1619 | 472k |
|
1620 | 472k | match value { |
1621 | 472k | Yaml::Boolean(_) | Yaml::String(_) | Yaml::Integer(_) | Yaml::Real(_) => (), |
1622 | 0 | _ => bail!("definition value is not a string, boolean, or number. Found {}", |
1623 | 0 | yaml_to_string(value, 1) ) |
1624 | }; | |
1625 | return Ok( | |
1626 | VariableDefinition{ | |
1627 | 472k | name, |
1628 | 472k | value: MyXPath::build(value) |
1629 | } | |
1630 | ); | |
1631 | }, | |
1632 | 0 | None => bail!("definition is not a key/value pair. Found {}", |
1633 | 0 | yaml_to_string(name_value_def, 1) ) |
1634 | } | |
1635 | 472k | } |
1636 | } | |
1637 | ||
1638 | ||
1639 | #[derive(Debug, Clone)] | |
1640 | struct VariableDefinitions { | |
1641 | defs: Vec<VariableDefinition> | |
1642 | } | |
1643 | ||
1644 | impl fmt::Display for VariableDefinitions { | |
1645 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1646 | 0 | for def in &self.defs { |
1647 | 0 | write!(f, "{def},")?; |
1648 | } | |
1649 | 0 | return Ok( () ); |
1650 | 0 | } |
1651 | } | |
1652 | ||
1653 | struct VariableValues<'v> { | |
1654 | defs: Vec<VariableValue<'v>> | |
1655 | } | |
1656 | ||
1657 | impl fmt::Display for VariableValues<'_> { | |
1658 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1659 | 0 | for value in &self.defs { |
1660 | 0 | write!(f, "{value}")?; |
1661 | } | |
1662 | 0 | return writeln!(f); |
1663 | 0 | } |
1664 | } | |
1665 | ||
1666 | impl VariableDefinitions { | |
1667 | 1.11M | fn new(len: usize) -> VariableDefinitions { |
1668 | 1.11M | return VariableDefinitions{ defs: Vec::with_capacity(len) }; |
1669 | 1.11M | } |
1670 | ||
1671 | 1.11M | fn build(defs: &Yaml) -> Result<VariableDefinitions> { |
1672 | 1.11M | if defs.is_badvalue() { |
1673 | 834k | return Ok( VariableDefinitions::new(0) ); |
1674 | 277k | }; |
1675 | 277k | if defs.is_array() { |
1676 | 277k | let defs = defs.as_vec().unwrap(); |
1677 | 277k | let mut definitions = VariableDefinitions::new(defs.len()); |
1678 | 472k | for def in |
1679 | 472k | let variable_def = VariableDefinition::build(def) |
1680 | 472k | .context("definition of 'variables'") |
1681 | 472k | definitions.push( variable_def); |
1682 | }; | |
1683 | 277k | return Ok (definitions ); |
1684 | 0 | } |
1685 | 0 | bail!( "'variables' is not an array of {{name: xpath-value}} definitions. Found {}'", |
1686 | 0 | yaml_to_string(defs, 1) ); |
1687 | 1.11M | } |
1688 | ||
1689 | 472k | fn push(&mut self, var_def: VariableDefinition) { |
1690 | 472k | self.defs.push(var_def); |
1691 | 472k | } |
1692 | ||
1693 | 241k | fn len(&self) -> usize { |
1694 | 241k | return self.defs.len(); |
1695 | 241k | } |
1696 | } | |
1697 | ||
1698 | struct ContextStack<'c> { | |
1699 | // Note: values are generated by calling value_of on an Evaluation -- that makes the two lifetimes the same | |
1700 | old_values: Vec<VariableValues<'c>>, // store old values so they can be set on pop | |
1701 | base: sxd_xpath::Context<'c> // initial context -- contains all the function defs and pref variables | |
1702 | } | |
1703 | ||
1704 | impl fmt::Display for ContextStack<'_> { | |
1705 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1706 | 0 | writeln!(f, " {} old_values", self.old_values.len())?; |
1707 | 0 | for values in &self.old_values { |
1708 | 0 | writeln!(f, " {values}")?; |
1709 | } | |
1710 | 0 | return writeln!(f); |
1711 | 0 | } |
1712 | } | |
1713 | ||
1714 | impl<'c, 'r> ContextStack<'c> { | |
1715 | 22.7k | fn new<'a,>(pref_manager: &'a PreferenceManager) -> ContextStack<'c> { |
1716 | 22.7k | let prefs = pref_manager.merge_prefs(); |
1717 | 22.7k | let mut context_stack = ContextStack { |
1718 | 22.7k | base: ContextStack::base_context(prefs), |
1719 | 22.7k | old_values: Vec::with_capacity(31) // should avoid allocations |
1720 | 22.7k | }; |
1721 | // FIX: the list of variables to set should come from definitions.yaml | |
1722 | // These can't be set on the <math> tag because of the "translate" command which starts speech at an 'id' | |
1723 | 22.7k | context_stack.base.set_variable("MatchingPause", Value::Boolean(false)); |
1724 | 22.7k | context_stack.base.set_variable("IsColumnSilent", Value::Boolean(false)); |
1725 | ||
1726 | ||
1727 | 22.7k | return context_stack; |
1728 | 22.7k | } |
1729 | ||
1730 | 22.7k | fn base_context(var_defs: PreferenceHashMap) -> sxd_xpath::Context<'c> { |
1731 | 22.7k | let mut context = sxd_xpath::Context::new(); |
1732 | 22.7k | context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); |
1733 | 22.7k | crate::xpath_functions::add_builtin_functions(&mut context); |
1734 | 1.88M | for (key, value) in |
1735 | 1.88M | context.set_variable(key.as_str(), yaml_to_value(&value)); |
1736 | 1.88M | // if let Some(str_value) = value.as_str() { |
1737 | 1.88M | // if str_value != "Auto" { |
1738 | 1.88M | // debug!("Set {}='{}'", key.as_str(), str_value); |
1739 | 1.88M | // } |
1740 | 1.88M | // } |
1741 | 1.88M | }; |
1742 | 22.7k | return context; |
1743 | 22.7k | } |
1744 | ||
1745 | 3.78k | fn set_globals(&'r mut self, new_vars: VariableDefinitions, mathml: Element<'c>) -> Result<()> { |
1746 | // for each var/value pair, evaluate the value and add the var/value to the base context | |
1747 | 4.84k | for def in |
1748 | // set the new value | |
1749 | 4.84k | let new_value = match def.value.evaluate(&self.base, mathml) { |
1750 | 4.84k | Ok(val) => val, |
1751 | 0 | Err(_) => bail!(format!("Can't evaluate variable def for {}", def)), |
1752 | }; | |
1753 | 4.84k | let qname = QName::new(def.name.as_str()); |
1754 | 4.84k | self.base.set_variable(qname, new_value); |
1755 | } | |
1756 | 3.78k | return Ok( () ); |
1757 | 3.78k | } |
1758 | ||
1759 | 27.3k | fn push(&'r mut self, new_vars: VariableDefinitions, mathml: Element<'c>) -> Result<()> { |
1760 | // store the old value and set the new one | |
1761 | 27.3k | let mut old_values = VariableValues {defs: Vec::with_capacity(new_vars.defs.len()) }; |
1762 | 27.3k | let evaluation = Evaluation::new(&self.base, Node::Element(mathml)); |
1763 | 66.9k | for def in |
1764 | 66.9k | // get the old value (might not be defined) |
1765 | 66.9k | let qname = QName::new(def.name.as_str()); |
1766 | 66.9k | let old_value = evaluation.value_of(qname).cloned(); |
1767 | 66.9k | old_values.defs.push( VariableValue{ name: def.name.clone(), value: old_value} ); |
1768 | 66.9k | } |
1769 | ||
1770 | // use a second loop because of borrow problem with self.base and 'evaluation' | |
1771 | 66.9k | for def in |
1772 | // set the new value | |
1773 | 66.9k | let new_value = match def.value.evaluate(&self.base, mathml) { |
1774 | 66.9k | Ok(val) => val, |
1775 | 0 | Err(_) => Value::Nodeset(sxd_xpath::nodeset::Nodeset::new()), |
1776 | }; | |
1777 | 66.9k | let qname = QName::new(def.name.as_str()); |
1778 | 66.9k | self.base.set_variable(qname, new_value); |
1779 | } | |
1780 | 27.3k | self.old_values.push(old_values); |
1781 | 27.3k | return Ok( () ); |
1782 | 27.3k | } |
1783 | ||
1784 | 27.3k | fn pop(&mut self) { |
1785 | const MISSING_VALUE: &str = "-- unset value --"; // can't remove a variable from context, so use this value | |
1786 | 27.3k | let old_values = self.old_values.pop().unwrap(); |
1787 | 66.9k | for variable in |
1788 | 66.9k | let qname = QName::new(&variable.name); |
1789 | 66.9k | let old_value = match variable.value { |
1790 | 22.8k | None => Value::String(MISSING_VALUE.to_string()), |
1791 | 44.1k | Some(val) => val, |
1792 | }; | |
1793 | 66.9k | self.base.set_variable(qname, old_value); |
1794 | } | |
1795 | 27.3k | } |
1796 | } | |
1797 | ||
1798 | ||
1799 | 1.88M | fn yaml_to_value<'b>(yaml: &Yaml) -> Value<'b> { |
1800 | 1.88M | return match yaml { |
1801 | 1.47M | Yaml::String(s) => Value::String(s.clone()), |
1802 | 295k | Yaml::Boolean(b) => Value::Boolean(*b), |
1803 | 31.7k | Yaml::Integer(i) => Value::Number(*i as f64), |
1804 | 91.0k | Yaml::Real(s) => Value::Number(s.parse::<f64>().unwrap()), |
1805 | _ => { | |
1806 | 0 | error!("yaml_to_value: illegal type found in Yaml value: {}", yaml_to_string(yaml, 1)); |
1807 | 0 | Value::String("".to_string()) |
1808 | }, | |
1809 | } | |
1810 | 1.88M | } |
1811 | ||
1812 | ||
1813 | // Information for matching a Unicode char (defined in unicode.yaml) and building its replacement | |
1814 | struct UnicodeDef { | |
1815 | ch: u32, | |
1816 | speech: ReplacementArray | |
1817 | } | |
1818 | ||
1819 | impl fmt::Display for UnicodeDef { | |
1820 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1821 | 0 | return write!(f, "UnicodeDef{{ch: {}, speech: {:?}}}", self.ch, self.speech); |
1822 | 0 | } |
1823 | } | |
1824 | ||
1825 | impl UnicodeDef { | |
1826 | 2.24M | fn build(unicode_def: &Yaml, file_name: &Path, speech_rules: &SpeechRules, use_short: bool) -> Result<Option<Vec<PathBuf>>> { |
1827 | 2.24M | if let Some( |
1828 | 3 | let do_include_fn = |new_file: &Path| { |
1829 | 3 | speech_rules.read_unicode(Some(new_file.to_path_buf()), use_short) |
1830 | 3 | }; |
1831 | 3 | return Ok( Some(process_include(file_name, include_file_name, do_include_fn) |
1832 | 2.24M | } |
1833 | // key: char, value is replacement or array of replacements | |
1834 | 2.24M | let dictionary = unicode_def.as_hash(); |
1835 | 2.24M | if dictionary.is_none() { |
1836 | 0 | bail!("Expected a unicode definition (e.g, '+':[t: \"plus\"]'), found {}", yaml_to_string(unicode_def, 0)); |
1837 | 2.24M | } |
1838 | ||
1839 | 2.24M | let dictionary = dictionary.unwrap(); |
1840 | 2.24M | if dictionary.len() != 1 { |
1841 | 0 | bail!("Expected a unicode definition (e.g, '+':[t: \"plus\"]'), found {}", yaml_to_string(unicode_def, 0)); |
1842 | 2.24M | } |
1843 | ||
1844 | 2.24M | let (ch, replacements) = dictionary.iter().next().ok_or_else(|| |
1845 | 2.24M | let mut unicode_table = if use_short { |
1846 | 1.06M | speech_rules.unicode_short.borrow_mut() |
1847 | } else { | |
1848 | 1.17M | speech_rules.unicode_full.borrow_mut() |
1849 | }; | |
1850 | 2.24M | if let Some(str) = ch.as_str() { |
1851 | 2.24M | if str.is_empty() { |
1852 | 0 | bail!("Empty character definition. Replacement is {}", replacements.as_str().unwrap()); |
1853 | 2.24M | } |
1854 | 2.24M | let mut chars = str.chars(); |
1855 | 2.24M | let first_ch = chars.next().unwrap(); // non-empty string, so a char exists |
1856 | 2.24M | if chars.next().is_some() { // more than one char |
1857 | 54.7k | if str.contains('-') { |
1858 | 38.4k | return process_range(str, replacements, unicode_table); |
1859 | 16.2k | } else if first_ch != '0' { // exclude 0xDDDD |
1860 | 74.5k | for ch in |
1861 | 74.5k | let ch_as_str = ch.to_string(); |
1862 | 74.5k | if unicode_table.insert(ch as u32, ReplacementArray::build(&substitute_ch(replacements, &ch_as_str)) |
1863 | 74.5k | .with_context(|| |
1864 | 0 | error!("*** Character '{}' (0x{:X}) is repeated", ch, ch as u32); |
1865 | 74.5k | } |
1866 | } | |
1867 | 16.2k | return Ok(None); |
1868 | 0 | } |
1869 | 2.18M | } |
1870 | 0 | } |
1871 | ||
1872 | 2.18M | let ch = UnicodeDef::get_unicode_char(ch) |
1873 | 2.18M | if unicode_table.insert(ch, ReplacementArray::build(replacements) |
1874 | 2.18M | .with_context(|| |
1875 | 2.18M |
|
1876 | 147 | error!("*** Character '{}' (0x{:X}) is repeated", |
1877 | 2.18M | } |
1878 | 2.18M | return Ok(None); |
1879 | ||
1880 | 38.4k | fn process_range(def_range: &str, replacements: &Yaml, mut unicode_table: RefMut<HashMap<u32,Vec<Replacement>>>) -> Result<Option<Vec<PathBuf>>> { |
1881 | // should be a character range (e.g., "A-Z") | |
1882 | // iterate over that range and also substitute the char for '.' in the | |
1883 | 38.4k | let mut range = def_range.split('-'); |
1884 | 38.4k | let first = range.next().unwrap().chars().next().unwrap() as u32; |
1885 | 38.4k | let last = range.next().unwrap().chars().next().unwrap() as u32; |
1886 | 38.4k | if range.next().is_some() { |
1887 | 0 | bail!("Character range definition has more than one '-': '{}'", def_range); |
1888 | 38.4k | } |
1889 | ||
1890 | 889k | for ch in |
1891 | 889k | let ch_as_str = char::from_u32(ch).unwrap().to_string(); |
1892 | 889k | unicode_table.insert(ch, ReplacementArray::build(&substitute_ch(replacements, &ch_as_str)) |
1893 | 889k | .with_context(|| |
1894 | }; | |
1895 | ||
1896 | 38.4k | return Ok(None) |
1897 | 38.4k | } |
1898 | ||
1899 | 10.3M | fn substitute_ch(yaml: &Yaml, ch: &str) -> Yaml { |
1900 | 10.3M | return match yaml { |
1901 | 2.34M | Yaml::Array(v) => { |
1902 | Yaml::Array( | |
1903 | 2.34M | v.iter() |
1904 | 3.03M | . |
1905 | 2.34M | .collect::<Vec<Yaml>>() |
1906 | ) | |
1907 | }, | |
1908 | 4.52M | Yaml::Hash(h) => { |
1909 | Yaml::Hash( | |
1910 | 4.52M | h.iter() |
1911 | 6.34M | . |
1912 | 4.52M | .collect::<Hash>() |
1913 | ) | |
1914 | }, | |
1915 | 3.47M | Yaml::String(s) => Yaml::String( s.replace('.', ch) ), |
1916 | 0 | _ => yaml.clone(), |
1917 | } | |
1918 | 10.3M | } |
1919 | 2.24M | } |
1920 | ||
1921 | 2.18M | fn get_unicode_char(ch: &Yaml) -> Result<u32> { |
1922 | // either "a" or 0x1234 (number) | |
1923 | 2.18M | if let Some(ch) = ch.as_str() { |
1924 | 2.18M | let mut ch_iter = ch.chars(); |
1925 | 2.18M | let unicode_ch = ch_iter.next(); |
1926 | 2.18M | if unicode_ch.is_none() || ch_iter.next().is_some() { |
1927 | 0 | bail!("Wanted unicode char, found string '{}')", ch); |
1928 | 2.18M | }; |
1929 | 2.18M | return Ok( unicode_ch.unwrap() as u32 ); |
1930 | 0 | } |
1931 | ||
1932 | 0 | if let Some(num) = ch.as_i64() { |
1933 | 0 | return Ok( num as u32 ); |
1934 | 0 | } |
1935 | 0 | bail!("Unicode character '{}' can't be converted to an code point", yaml_to_string(ch, 0)); |
1936 | 2.18M | } |
1937 | } | |
1938 | ||
1939 | // Fix: there should be a cache so subsequent library calls don't have to read in the same speech rules | |
1940 | // likely a cache of size 1 is fine | |
1941 | // Fix: all statics should be gathered together into one structure that is a Mutex | |
1942 | // for each library call, we should grab a lock on the Mutex in case others try to call | |
1943 | // at the same time. | |
1944 | // If this turns out to be something that others actually do, then a cache > 1 would be good | |
1945 | ||
1946 | type RuleTable = HashMap<String, Vec<Box<SpeechPattern>>>; | |
1947 | type UnicodeTable = Rc<RefCell<HashMap<u32,Vec<Replacement>>>>; | |
1948 | type FilesAndTimesShared = Rc<RefCell<FilesAndTimes>>; | |
1949 | ||
1950 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] | |
1951 | pub enum RulesFor { | |
1952 | Intent, | |
1953 | Speech, | |
1954 | OverView, | |
1955 | Navigation, | |
1956 | Braille, | |
1957 | } | |
1958 | ||
1959 | impl fmt::Display for RulesFor { | |
1960 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1961 | 0 | let name = match self { |
1962 | 0 | RulesFor::Intent => "Intent", |
1963 | 0 | RulesFor::Speech => "Speech", |
1964 | 0 | RulesFor::OverView => "OverView", |
1965 | 0 | RulesFor::Navigation => "Navigation", |
1966 | 0 | RulesFor::Braille => "Braille", |
1967 | }; | |
1968 | 0 | return write!(f, "{name}"); |
1969 | 0 | } |
1970 | } | |
1971 | ||
1972 | ||
1973 | #[derive(Debug, Clone)] | |
1974 | pub struct FileAndTime { | |
1975 | file: PathBuf, | |
1976 | time: SystemTime, | |
1977 | } | |
1978 | ||
1979 | impl FileAndTime { | |
1980 | 0 | fn new(file: PathBuf) -> FileAndTime { |
1981 | 0 | return FileAndTime { |
1982 | 0 | file, |
1983 | 0 | time: SystemTime::UNIX_EPOCH, |
1984 | 0 | } |
1985 | 0 | } |
1986 | ||
1987 | // used for debugging preference settings | |
1988 | 0 | pub fn debug_get_file(&self) -> Option<&str> { |
1989 | 0 | return self.file.to_str(); |
1990 | 0 | } |
1991 | ||
1992 | 8.29k | pub fn new_with_time(file: PathBuf) -> FileAndTime { |
1993 | 8.29k | return FileAndTime { |
1994 | 8.29k | time: FileAndTime::get_metadata(&file), |
1995 | 8.29k | file, |
1996 | 8.29k | } |
1997 | 8.29k | } |
1998 | ||
1999 | 33.7k | pub fn is_up_to_date(&self) -> bool { |
2000 | 33.7k | let file_mod_time = FileAndTime::get_metadata(&self.file); |
2001 | 33.7k | return self.time >= file_mod_time; |
2002 | 33.7k | } |
2003 | ||
2004 | 140k | fn get_metadata(path: &Path) -> SystemTime { |
2005 | use std::fs; | |
2006 | 140k | if !cfg!(target_family = "wasm") { |
2007 | 140k | let metadata = fs::metadata(path); |
2008 | 140k | if let Ok( |
2009 | 120k | let Ok(mod_time) = metadata.modified() { |
2010 | 120k | return mod_time; |
2011 | 20.3k | } |
2012 | 0 | } |
2013 | 20.3k | return SystemTime::UNIX_EPOCH |
2014 | 140k | } |
2015 | ||
2016 | } | |
2017 | #[derive(Debug, Default)] | |
2018 | pub struct FilesAndTimes { | |
2019 | // ft[0] is the main file -- other files are included by it (or recursively) | |
2020 | // We could be a little smarter about invalidation by tracking what file is the parent (including file), | |
2021 | // but it seems more complicated than it is worth | |
2022 | ft: Vec<FileAndTime> | |
2023 | } | |
2024 | ||
2025 | impl FilesAndTimes { | |
2026 | 0 | pub fn new(start_path: PathBuf) -> FilesAndTimes { |
2027 | 0 | let mut ft = Vec::with_capacity(8); |
2028 | 0 | ft.push( FileAndTime::new(start_path) ); |
2029 | 0 | return FilesAndTimes{ ft }; |
2030 | 0 | } |
2031 | ||
2032 | /// Returns true if the main file matches the corresponding preference location and files' times are all current | |
2033 | 33.4k | pub fn is_file_up_to_date(&self, pref_path: &Path, should_ignore_file_time: bool) -> bool { |
2034 | ||
2035 | // if the time isn't set or the path is different from the preference (which might have changed), return false | |
2036 | 33.4k | if self.ft.is_empty() || |
2037 | 5.74k | return false; |
2038 | 27.7k | } |
2039 | 27.7k | if should_ignore_file_time || |
2040 | 26.5k | return true; |
2041 | 1.18k | } |
2042 | 1.18k | if self.ft[0].time == SystemTime::UNIX_EPOCH { |
2043 | 0 | return false; |
2044 | 1.18k | } |
2045 | ||
2046 | ||
2047 | // check the time stamp on the included files -- if the head file hasn't changed, the paths for the included files will be the same | |
2048 | 1.19k | for file in |
2049 | 1.19k | if !file.is_up_to_date() { |
2050 | 1 | return false; |
2051 | 1.19k | } |
2052 | } | |
2053 | 1.18k | return true; |
2054 | 33.4k | } |
2055 | ||
2056 | 19.8k | fn set_files_and_times(&mut self, new_files: Vec<PathBuf>) { |
2057 | 19.8k | self.ft.clear(); |
2058 | 98.4k | for path in |
2059 | 98.4k | let time = FileAndTime::get_metadata(&path); // do before move below |
2060 | 98.4k | self.ft.push( FileAndTime{ file: path, time }) |
2061 | } | |
2062 | 19.8k | } |
2063 | ||
2064 | 28.0k | pub fn as_path(&self) -> &Path { |
2065 | 28.0k | assert!(!self.ft.is_empty()); |
2066 | 28.0k | return &self.ft[0].file; |
2067 | 28.0k | } |
2068 | ||
2069 | 0 | pub fn paths(&self) -> Vec<PathBuf> { |
2070 | 0 | return self.ft.iter().map(|ft| ft.file.clone()).collect::<Vec<PathBuf>>(); |
2071 | 0 | } |
2072 | ||
2073 | } | |
2074 | ||
2075 | ||
2076 | /// `SpeechRulesWithContext` encapsulates a named group of speech rules (e.g, "ClearSpeak") | |
2077 | /// along with the preferences to be used for speech. | |
2078 | // Note: if we can't read the files, an error message is stored in the structure and needs to be checked. | |
2079 | // I tried using Result<SpeechRules>, but it was a mess with all the unwrapping. | |
2080 | // Important: the code needs to be careful to check this at the top level calls | |
2081 | pub struct SpeechRules { | |
2082 | error: String, | |
2083 | name: RulesFor, | |
2084 | pub pref_manager: Rc<RefCell<PreferenceManager>>, | |
2085 | rules: RuleTable, // the speech rules used (partitioned into MathML tags in hashmap, then linearly searched) | |
2086 | rule_files: FilesAndTimes, // files that were read | |
2087 | translate_single_chars_only: bool, // strings like "half" don't want 'a's translated, but braille does | |
2088 | unicode_short: UnicodeTable, // the short list of rules used for Unicode characters | |
2089 | unicode_short_files: FilesAndTimesShared, // files that were read | |
2090 | unicode_full: UnicodeTable, // the long remaining rules used for Unicode characters | |
2091 | unicode_full_files: FilesAndTimesShared, // files that were read | |
2092 | definitions_files: FilesAndTimesShared, // files that were read | |
2093 | } | |
2094 | ||
2095 | impl fmt::Display for SpeechRules { | |
2096 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
2097 | 0 | writeln!(f, "SpeechRules '{}'\n{})", self.name, self.pref_manager.borrow())?; |
2098 | 0 | let mut rules_vec: Vec<(&String, &Vec<Box<SpeechPattern>>)> = self.rules.iter().collect(); |
2099 | 0 | rules_vec.sort_by_key(|(tag_name, _)| tag_name.as_str()); |
2100 | 0 | for (tag_name, rules) in rules_vec { |
2101 | 0 | writeln!(f, " {}: #patterns {}", tag_name, rules.len())?; |
2102 | }; | |
2103 | 0 | return writeln!(f, " {}+{} unicode entries", &self.unicode_short.borrow().len(), &self.unicode_full.borrow().len()); |
2104 | 0 | } |
2105 | } | |
2106 | ||
2107 | ||
2108 | /// `SpeechRulesWithContext` encapsulates a named group of speech rules (e.g, "ClearSpeak") | |
2109 | /// along with the preferences to be used for speech. | |
2110 | /// Because speech rules can define variables, there is also a context that is carried with them | |
2111 | pub struct SpeechRulesWithContext<'c, 's:'c, 'm:'c> { | |
2112 | speech_rules: &'s SpeechRules, | |
2113 | context_stack: ContextStack<'c>, // current value of (context) variables | |
2114 | doc: Document<'m>, | |
2115 | nav_node_id: &'m str, | |
2116 | nav_node_offset: usize, | |
2117 | pub inside_spell: bool, // hack to allow 'spell' to avoid infinite loop (see 'spell' implementation in tts.rs) | |
2118 | pub translate_count: usize, // hack to avoid 'translate' infinite loop (see 'spell' implementation in tts.rs) | |
2119 | } | |
2120 | ||
2121 | impl<'c, 's:'c, 'm:'c> fmt::Display for SpeechRulesWithContext<'c, 's,'m> { | |
2122 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
2123 | 0 | writeln!(f, "SpeechRulesWithContext \n{})", self.speech_rules)?; |
2124 | 0 | return writeln!(f, " {} context entries, nav node id '({}, {})'", &self.context_stack, self.nav_node_id, self.nav_node_offset); |
2125 | 0 | } |
2126 | } | |
2127 | ||
2128 | thread_local!{ | |
2129 | /// SPEECH_UNICODE_SHORT is shared among several rules, so "RC" is used | |
2130 | static SPEECH_UNICODE_SHORT: UnicodeTable = | |
2131 | Rc::new( RefCell::new( HashMap::with_capacity(500) ) ); | |
2132 | ||
2133 | /// SPEECH_UNICODE_FULL is shared among several rules, so "RC" is used | |
2134 | static SPEECH_UNICODE_FULL: UnicodeTable = | |
2135 | Rc::new( RefCell::new( HashMap::with_capacity(6500) ) ); | |
2136 | ||
2137 | /// BRAILLE_UNICODE_SHORT is shared among several rules, so "RC" is used | |
2138 | static BRAILLE_UNICODE_SHORT: UnicodeTable = | |
2139 | Rc::new( RefCell::new( HashMap::with_capacity(500) ) ); | |
2140 | ||
2141 | /// BRAILLE_UNICODE_FULL is shared among several rules, so "RC" is used | |
2142 | static BRAILLE_UNICODE_FULL: UnicodeTable = | |
2143 | Rc::new( RefCell::new( HashMap::with_capacity(5000) ) ); | |
2144 | ||
2145 | /// SPEECH_DEFINITION_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2146 | static SPEECH_DEFINITION_FILES_AND_TIMES: FilesAndTimesShared = | |
2147 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2148 | ||
2149 | /// BRAILLE_DEFINITION_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2150 | static BRAILLE_DEFINITION_FILES_AND_TIMES: FilesAndTimesShared = | |
2151 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2152 | ||
2153 | /// SPEECH_UNICODE_SHORT_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2154 | static SPEECH_UNICODE_SHORT_FILES_AND_TIMES: FilesAndTimesShared = | |
2155 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2156 | ||
2157 | /// SPEECH_UNICODE_FULL_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2158 | static SPEECH_UNICODE_FULL_FILES_AND_TIMES: FilesAndTimesShared = | |
2159 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2160 | ||
2161 | /// BRAILLE_UNICODE_SHORT_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2162 | static BRAILLE_UNICODE_SHORT_FILES_AND_TIMES: FilesAndTimesShared = | |
2163 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2164 | ||
2165 | /// BRAILLE_UNICODE_FULL_FILES_AND_TIMES is shared among several rules, so "RC" is used | |
2166 | static BRAILLE_UNICODE_FULL_FILES_AND_TIMES: FilesAndTimesShared = | |
2167 | Rc::new( RefCell::new(FilesAndTimes::default()) ); | |
2168 | ||
2169 | /// The current set of speech rules | |
2170 | // maybe this should be a small cache of rules in case people switch rules/prefs? | |
2171 | pub static INTENT_RULES: RefCell<SpeechRules> = | |
2172 | RefCell::new( SpeechRules::new(RulesFor::Intent, true) ); | |
2173 | ||
2174 | pub static SPEECH_RULES: RefCell<SpeechRules> = | |
2175 | RefCell::new( SpeechRules::new(RulesFor::Speech, true) ); | |
2176 | ||
2177 | pub static OVERVIEW_RULES: RefCell<SpeechRules> = | |
2178 | RefCell::new( SpeechRules::new(RulesFor::OverView, true) ); | |
2179 | ||
2180 | pub static NAVIGATION_RULES: RefCell<SpeechRules> = | |
2181 | RefCell::new( SpeechRules::new(RulesFor::Navigation, true) ); | |
2182 | ||
2183 | pub static BRAILLE_RULES: RefCell<SpeechRules> = | |
2184 | RefCell::new( SpeechRules::new(RulesFor::Braille, false) ); | |
2185 | } | |
2186 | ||
2187 | impl SpeechRules { | |
2188 | 8.16k | pub fn new(name: RulesFor, translate_single_chars_only: bool) -> SpeechRules { |
2189 | 8.16k | let globals = if name == RulesFor::Braille { |
2190 | 1.35k | ( |
2191 | 1.35k | (BRAILLE_UNICODE_SHORT.with(Rc::clone), BRAILLE_UNICODE_SHORT_FILES_AND_TIMES.with(Rc::clone)), |
2192 | 1.35k | (BRAILLE_UNICODE_FULL. with(Rc::clone), BRAILLE_UNICODE_FULL_FILES_AND_TIMES.with(Rc::clone)), |
2193 | 1.35k | BRAILLE_DEFINITION_FILES_AND_TIMES.with(Rc::clone), |
2194 | 1.35k | ) |
2195 | } else { | |
2196 | 6.80k | ( |
2197 | 6.80k | (SPEECH_UNICODE_SHORT.with(Rc::clone), SPEECH_UNICODE_SHORT_FILES_AND_TIMES.with(Rc::clone)), |
2198 | 6.80k | (SPEECH_UNICODE_FULL. with(Rc::clone), SPEECH_UNICODE_FULL_FILES_AND_TIMES.with(Rc::clone)), |
2199 | 6.80k | SPEECH_DEFINITION_FILES_AND_TIMES.with(Rc::clone), |
2200 | 6.80k | ) |
2201 | }; | |
2202 | ||
2203 | return SpeechRules { | |
2204 | 8.16k | error: Default::default(), |
2205 | 8.16k | name, |
2206 | 8.16k | rules: HashMap::with_capacity(if name == RulesFor::Intent || |
2207 | 8.16k | rule_files: FilesAndTimes::default(), |
2208 | 8.16k | unicode_short: globals.0.0, // lazy load them |
2209 | 8.16k | unicode_short_files: globals.0.1, |
2210 | 8.16k | unicode_full: globals.1.0, // lazy load them |
2211 | 8.16k | unicode_full_files: globals.1.1, |
2212 | 8.16k | definitions_files: globals.2, |
2213 | 8.16k | translate_single_chars_only, |
2214 | 8.16k | pref_manager: PreferenceManager::get(), |
2215 | }; | |
2216 | 8.16k | } |
2217 | ||
2218 | 17.7k | pub fn get_error(&self) -> Option<&str> { |
2219 | 17.7k | return if self.error.is_empty() { |
2220 | 17.7k | None |
2221 | } else { | |
2222 | 0 | Some(&self.error) |
2223 | } | |
2224 | 17.7k | } |
2225 | ||
2226 | 15.3k | pub fn read_files(&mut self) -> Result<()> { |
2227 | 15.3k | let check_rule_files = self.pref_manager.borrow().pref_to_string("CheckRuleFiles"); |
2228 | 15.3k | if check_rule_files != "None" { // "Prefs" or "All" are other values |
2229 | 15.3k | self.pref_manager.borrow_mut().set_preference_files() |
2230 | 2 | } |
2231 | 15.3k | let should_ignore_file_time = self.pref_manager.borrow().pref_to_string("CheckRuleFiles") != "All"; // ignore for "None", "Prefs" |
2232 | 15.3k | let rule_file = self.pref_manager.borrow().get_rule_file(&self.name).to_path_buf(); // need to create PathBuf to avoid a move/use problem |
2233 | 15.3k | if self.rules.is_empty() || |
2234 | 8.35k | self.rules.clear(); |
2235 | 8.35k | let files_read = self.read_patterns(&rule_file) |
2236 | 8.35k | self.rule_files.set_files_and_times(files_read); |
2237 | 6.94k | } |
2238 | ||
2239 | 15.3k | let pref_manager = self.pref_manager.borrow(); |
2240 | 15.3k | let unicode_pref_files = if self.name == RulesFor::Braille { |
2241 | ||
2242 | 15.3k | if !self.unicode_short_files.borrow().is_file_up_to_date(unicode_pref_files.0, should_ignore_file_time) { |
2243 | 5.50k | self.unicode_short.borrow_mut().clear(); |
2244 | 5.50k | self.unicode_short_files.borrow_mut().set_files_and_times(self.read_unicode(None, true) |
2245 | 9.80k | } |
2246 | ||
2247 | 15.3k | if self.definitions_files.borrow().ft.is_empty() || |
2248 | 9.82k | pref_manager.get_definitions_file(self.name != RulesFor::Braille), |
2249 | 9.82k | should_ignore_file_time |
2250 | 9.82k | ) { |
2251 | 5.49k | self.definitions_files.borrow_mut().set_files_and_times(read_definitions_file(self.name != RulesFor::Braille) |
2252 | 9.80k | } |
2253 | 15.3k | return Ok( () ); |
2254 | 15.3k | } |
2255 | ||
2256 | 38.6k | fn read_patterns(&mut self, path: &Path) -> Result<Vec<PathBuf>> { |
2257 | // info!("Reading rule file: {}", p.to_str().unwrap()); | |
2258 | 38.6k | let rule_file_contents = read_to_string_shim(path).with_context(|| |
2259 | 38.6k | let rules_build_fn = |pattern: &Yaml| { |
2260 | 38.6k | self.build_speech_patterns(pattern, path) |
2261 | 38.6k | .with_context(|| |
2262 | 38.6k | }; |
2263 | 38.6k | return compile_rule(&rule_file_contents, rules_build_fn) |
2264 | 38.6k | .with_context(|| |
2265 | 38.6k | } |
2266 | ||
2267 | 38.6k | fn build_speech_patterns(&mut self, patterns: &Yaml, file_name: &Path) -> Result<Vec<PathBuf>> { |
2268 | // Rule::SpeechPatternList | |
2269 | 38.6k | let patterns_vec = patterns.as_vec(); |
2270 | 38.6k | if patterns_vec.is_none() { |
2271 | 0 | bail!(yaml_type_err(patterns, "array")); |
2272 | 38.6k | } |
2273 | 38.6k | let patterns_vec = patterns.as_vec().unwrap(); |
2274 | 38.6k | let mut files_read = vec![file_name.to_path_buf()]; |
2275 | 894k | for entry in |
2276 | 894k | if let Some( |
2277 | 30.2k | files_read.append(&mut added_files); |
2278 | 864k | } |
2279 | } | |
2280 | 38.6k | return Ok(files_read) |
2281 | 38.6k | } |
2282 | ||
2283 | 5.97k | fn read_unicode(&self, path: Option<PathBuf>, use_short: bool) -> Result<Vec<PathBuf>> { |
2284 | 5.97k | let path = match path { |
2285 | 3 | Some(p) => p, |
2286 | None => { | |
2287 | // get the path to either the short or long unicode file | |
2288 | 5.97k | let pref_manager = self.pref_manager.borrow(); |
2289 | 5.97k | let unicode_files = if self.name == RulesFor::Braille { |
2290 | 1.57k | pref_manager.get_braille_unicode_file() |
2291 | } else { | |
2292 | 4.40k | pref_manager.get_speech_unicode_file() |
2293 | }; | |
2294 | 5.97k | let unicode_files = if use_short { |
2295 | 5.97k | unicode_files.to_path_buf() |
2296 | } | |
2297 | }; | |
2298 | ||
2299 | // FIX: should read first (lang), then supplement with second (region) | |
2300 | // info!("Reading unicode file {}", path.to_str().unwrap()); | |
2301 | 5.97k | let unicode_file_contents = read_to_string_shim(&path) |
2302 | 5.97k | let unicode_build_fn = |unicode_def_list: &Yaml| { |
2303 | 5.97k | let unicode_defs = unicode_def_list.as_vec(); |
2304 | 5.97k | if unicode_defs.is_none() { |
2305 | 0 | bail!("File '{}' does not begin with an array", yaml_to_type(unicode_def_list)); |
2306 | 5.97k | }; |
2307 | 5.97k | let mut files_read = vec![path.to_path_buf()]; |
2308 | 2.24M | for unicode_def in |
2309 | 2.24M | if let Some( |
2310 | 2.24M | .with_context(|| |
2311 | 3 | files_read.append(&mut added_files); |
2312 | 2.24M | } |
2313 | }; | |
2314 | 5.97k | return Ok(files_read) |
2315 | 5.97k | }; |
2316 | ||
2317 | 5.97k | return compile_rule(&unicode_file_contents, unicode_build_fn) |
2318 | 5.97k | .with_context(|| |
2319 | 5.97k | } |
2320 | ||
2321 | 0 | pub fn print_sizes() -> String { |
2322 | // let _ = &SPEECH_RULES.with_borrow(|rules| { | |
2323 | // debug!("SPEECH RULES entries\n"); | |
2324 | // let rules = &rules.rules; | |
2325 | // for (key, _) in rules.iter() { | |
2326 | // debug!("key: {}", key); | |
2327 | // } | |
2328 | // }); | |
2329 | 0 | let mut answer = rule_size(&SPEECH_RULES, "SPEECH_RULES"); |
2330 | 0 | answer += &rule_size(&INTENT_RULES, "INTENT_RULES"); |
2331 | 0 | answer += &rule_size(&BRAILLE_RULES, "BRAILLE_RULES"); |
2332 | 0 | answer += &rule_size(&NAVIGATION_RULES, "NAVIGATION_RULES"); |
2333 | 0 | answer += &rule_size(&OVERVIEW_RULES, "OVERVIEW_RULES"); |
2334 | 0 | SPEECH_RULES.with_borrow(|rule| { |
2335 | 0 | answer += &format!("Speech Unicode tables: short={}/{}, long={}/{}\n", |
2336 | 0 | rule.unicode_short.borrow().len(), rule.unicode_short.borrow().capacity(), |
2337 | 0 | rule.unicode_full.borrow().len(), rule.unicode_full.borrow().capacity()); |
2338 | 0 | }); |
2339 | 0 | BRAILLE_RULES.with_borrow(|rule| { |
2340 | 0 | answer += &format!("Braille Unicode tables: short={}/{}, long={}/{}\n", |
2341 | 0 | rule.unicode_short.borrow().len(), rule.unicode_short.borrow().capacity(), |
2342 | 0 | rule.unicode_full.borrow().len(), rule.unicode_full.borrow().capacity()); |
2343 | 0 | }); |
2344 | 0 | return answer; |
2345 | ||
2346 | 0 | fn rule_size(rules: &'static std::thread::LocalKey<RefCell<SpeechRules>>, name: &str) -> String { |
2347 | 0 | rules.with_borrow(|rule| { |
2348 | 0 | let hash_map = &rule.rules; |
2349 | 0 | return format!("{}: {}/{}\n", name, hash_map.len(), hash_map.capacity()); |
2350 | 0 | }) |
2351 | 0 | } |
2352 | 0 | } |
2353 | } | |
2354 | ||
2355 | ||
2356 | /// We track three different lifetimes: | |
2357 | /// 'c -- the lifetime of the context and mathml | |
2358 | /// 's -- the lifetime of the speech rules (which is static) | |
2359 | /// 'r -- the lifetime of the reference (this seems to be key to keep the rust memory checker happy) | |
2360 | impl<'c, 's:'c, 'r, 'm:'c> SpeechRulesWithContext<'c, 's,'m> { | |
2361 | 22.7k | pub fn new(speech_rules: &'s SpeechRules, doc: Document<'m>, nav_node_id: &'m str, nav_node_offset: usize) -> SpeechRulesWithContext<'c, 's, 'm> { |
2362 | 22.7k | return SpeechRulesWithContext { |
2363 | 22.7k | speech_rules, |
2364 | 22.7k | context_stack: ContextStack::new(&speech_rules.pref_manager.borrow()), |
2365 | 22.7k | doc, |
2366 | 22.7k | nav_node_id, |
2367 | 22.7k | nav_node_offset, |
2368 | 22.7k | inside_spell: false, |
2369 | 22.7k | translate_count: 0, |
2370 | 22.7k | } |
2371 | 22.7k | } |
2372 | ||
2373 | 1.84k | pub fn get_rules(&mut self) -> &SpeechRules { |
2374 | 1.84k | return self.speech_rules; |
2375 | 1.84k | } |
2376 | ||
2377 | 45.5k | pub fn get_context(&mut self) -> &mut sxd_xpath::Context<'c> { |
2378 | 45.5k | return &mut self.context_stack.base; |
2379 | 45.5k | } |
2380 | ||
2381 | 3.23k | pub fn get_document(&mut self) -> Document<'m> { |
2382 | 3.23k | return self.doc; |
2383 | 3.23k | } |
2384 | ||
2385 | 1.13k | pub fn set_nav_node_offset(&mut self, offset: usize) { |
2386 | // debug!("Setting nav node offset to {}", offset); | |
2387 | 1.13k | self.nav_node_offset = offset; |
2388 | 1.13k | } |
2389 | ||
2390 | 121k | pub fn match_pattern<T:TreeOrString<'c, 'm, T>>(&'r mut self, mathml: Element<'c>) -> Result<T> { |
2391 | // debug!("Looking for a match for: \n{}", mml_to_string(mathml)); | |
2392 | 121k | let tag_name = mathml.name().local_part(); |
2393 | 121k | let rules = &self.speech_rules.rules; |
2394 | ||
2395 | // start with priority rules that apply to any node (should be a very small number) | |
2396 | 121k | if let Some( |
2397 | 95.8k | let Some( |
2398 | 3.18k | return Ok(result); // found a match |
2399 | 118k | } |
2400 | ||
2401 | 118k | if let Some( |
2402 | 116k | let Some( |
2403 | 82.1k | return Ok(result); // found a match |
2404 | 35.9k | } |
2405 | ||
2406 | // no rules for specific element, fall back to rules for "*" which *should* be present in all rule files as fallback | |
2407 | 35.9k | if let Some(rule_vector) = rules.get("*") && |
2408 | 35.9k | let Some(result) = self.find_match(rule_vector, mathml) |
2409 | 35.9k | return Ok(result); // found a match |
2410 | 0 | } |
2411 | ||
2412 | // no rules matched -- poorly written rule file -- let flow through to default error | |
2413 | // report error message with file name | |
2414 | 0 | let speech_manager = self.speech_rules.pref_manager.borrow(); |
2415 | 0 | let file_name = speech_manager.get_rule_file(&self.speech_rules.name); |
2416 | // FIX: handle error appropriately | |
2417 | 0 | bail!("\nNo match found!\nMissing patterns in {} for MathML.\n{}", file_name.to_string_lossy(), mml_to_string(mathml)); |
2418 | 121k | } |
2419 | ||
2420 | 248k | fn find_match<T:TreeOrString<'c, 'm, T>>(&'r mut self, rule_vector: &[Box<SpeechPattern>], mathml: Element<'c>) -> Result<Option<T>> { |
2421 | 870k | for pattern in |
2422 | // debug!("Pattern name: {}", pattern.pattern_name); | |
2423 | // always pushing and popping around the is_match would be a little cleaner, but push/pop is relatively expensive, | |
2424 | // so we optimize and only push first if the variables are needed to do the match | |
2425 | 870k | if pattern.match_uses_var_defs { |
2426 | 7.05k | self.context_stack.push(pattern.var_defs.clone(), mathml) |
2427 | 863k | } |
2428 | 870k | if pattern.is_match(&self.context_stack.base, mathml) |
2429 | 870k | .with_context(|| |
2430 | // debug!(" find_match: FOUND!!!"); | |
2431 | 121k | if !pattern.match_uses_var_defs && |
2432 | 13.0k | self.context_stack.push(pattern.var_defs.clone(), mathml) |
2433 | 108k | } |
2434 | 121k | let result = if self.nav_node_offset > 0 && |
2435 | 47 | self.nav_node_id == mathml.attribute_value("id").unwrap_or_default() && |
2436 | 7 | let ch = crate::canonicalize::as_text(mathml).chars().nth(self.nav_node_offset-1).unwrap_or_default(); |
2437 | 7 | let ch = self.replace_single_char(ch, mathml) |
2438 | // debug!("find_match: ch={} from '{}'; matched pattern name/tag: {}/{} with nav_node_offset={}", | |
2439 | // ch, crate::canonicalize::as_text(mathml), | |
2440 | // pattern.pattern_name, pattern.tag_name, self.nav_node_offset); | |
2441 | 7 | T::from_string(ch.to_string(), self.doc) |
2442 | } else { | |
2443 | 121k | pattern.replacements.replace(self, mathml) |
2444 | }; | |
2445 | 121k | if pattern.var_defs.len() > 0 { |
2446 | 14.5k | self.context_stack.pop(); |
2447 | 106k | } |
2448 | 121k | return match result { |
2449 | 121k | Ok(s) => { |
2450 | // for all except braille and navigation, nav_node_id will be an empty string and will not match | |
2451 | 121k | if self.nav_node_id.is_empty() { |
2452 | 102k | Ok( Some(s) ) |
2453 | } else { | |
2454 | 18.5k | if self.nav_node_id == mathml.attribute_value("id").unwrap_or_default() { |
2455 | 18.5k | Ok ( Some(self.nav_node_adjust(s, mathml)) ) |
2456 | } | |
2457 | }, | |
2458 | 9 | Err(e) => Err( e.context( |
2459 | 9 | format!( |
2460 | 9 | "attempting replacement pattern: \"{}\" for \"{}\".\n\ |
2461 | 9 | Replacement\n{}\n...due to matching the MathML\n{} with the pattern\n\ |
2462 | 9 | {}\n\ |
2463 | 9 | The patterns are in {}.\n", |
2464 | 9 | pattern.pattern_name, pattern.tag_name, |
2465 | 9 | pattern.replacements.pretty_print_replacements(), |
2466 | 9 | mml_to_string(mathml), pattern.pattern, |
2467 | 9 | pattern.file_name |
2468 | 9 | ) |
2469 | 9 | )) |
2470 | } | |
2471 | 749k | } else if pattern.match_uses_var_defs { |
2472 | 5.60k | self.context_stack.pop(); |
2473 | 743k | } |
2474 | }; | |
2475 | 127k | return Ok(None); // no matches |
2476 | ||
2477 | 0 | fn error_string(pattern: &SpeechPattern, mathml: Element) -> String { |
2478 | 0 | return format!( |
2479 | "error during pattern match using: \"{}\" for \"{}\".\n\ | |
2480 | Pattern is \n{}\nMathML for the match:\n\ | |
2481 | {}\ | |
2482 | The patterns are in {}.\n", | |
2483 | pattern.pattern_name, pattern.tag_name, | |
2484 | pattern.pattern, | |
2485 | 0 | mml_to_string(mathml), |
2486 | pattern.file_name | |
2487 | ); | |
2488 | 0 | } |
2489 | ||
2490 | 248k | } |
2491 | ||
2492 | 18.5k | fn nav_node_adjust<T:TreeOrString<'c, 'm, T>>(&self, speech: T, mathml: Element<'c>) -> T { |
2493 | 18.5k | if let Some(id) = mathml.attribute_value("id") && |
2494 | 18.5k | self.nav_node_id == id { |
2495 | 990 | let offset = mathml.attribute_value(crate::navigate::ID_OFFSET).unwrap_or("0"); |
2496 | 990 | debug!("nav_node_adjust: id/name='{}/{}' offset?='{}'", id, |
2497 | 0 | self.nav_node_offset.to_string().as_str() == offset |
2498 | ); | |
2499 | 990 | if is_leaf(mathml) || |
2500 | 990 | if self.speech_rules.name == RulesFor::Braille { |
2501 | 469 | let highlight_style = self.speech_rules.pref_manager.borrow().pref_to_string("BrailleNavHighlight"); |
2502 | 469 | return T::highlight_braille(speech, highlight_style); |
2503 | } else { | |
2504 | 521 | debug!("nav_node_adjust: id='{}' offset='{}/{}'", id, self.nav_node_offset, offset); |
2505 | 521 | return T::mark_nav_speech(speech) |
2506 | } | |
2507 | 0 | } |
2508 | 17.5k | } |
2509 | 17.5k | return speech; |
2510 | 18.5k | } |
2511 | ||
2512 | 469 | fn highlight_braille_string(braille: String, highlight_style: String) -> String { |
2513 | // add dots 7 & 8 to the Unicode braille (28xx) | |
2514 | 469 | if &highlight_style == "Off" || braille.is_empty() { |
2515 | 6 | return braille; |
2516 | 463 | } |
2517 | ||
2518 | // FIX: this seems needlessly complex. It is much simpler if the char can be changed in place... | |
2519 | // find first char that can get the dots and add them | |
2520 | 463 | let mut chars = braille.chars().collect::<Vec<char>>(); |
2521 | ||
2522 | // the 'b' for baseline indicator is really part of the previous token, so it needs to be highlighted but isn't because it is not Unicode braille | |
2523 | 463 | let baseline_indicator_hack = PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth"; |
2524 | // debug!("highlight_braille_string: highlight_style={}\n braille={}", highlight_style, braille); | |
2525 | 463 | let mut i_first_modified = 0; |
2526 | 760 | for (i, ch) in |
2527 | 760 | let modified_ch = add_dots_to_braille_char(*ch, baseline_indicator_hack); |
2528 | 760 | if *ch != modified_ch { |
2529 | 463 | *ch = modified_ch; |
2530 | 463 | i_first_modified = i; |
2531 | 463 | break; |
2532 | 297 | }; |
2533 | }; | |
2534 | ||
2535 | 463 | let mut i_last_modified = i_first_modified; |
2536 | 463 | if &highlight_style != "FirstChar" { |
2537 | // find last char so that we know when to modify the char | |
2538 | 491 | for i in ( |
2539 | 491 | let ch = chars[i]; |
2540 | 491 | let modified_ch = add_dots_to_braille_char(ch, baseline_indicator_hack); |
2541 | 491 | chars[i] = modified_ch; |
2542 | 491 | if ch != modified_ch { |
2543 | 390 | i_last_modified = i; |
2544 | 390 | break; |
2545 | 101 | } |
2546 | } | |
2547 | 0 | } |
2548 | ||
2549 | 463 | if &highlight_style == "All" { |
2550 | // finish going through the string | |
2551 | #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here | |
2552 | 4 | for |
2553 | 0 | chars[i] = add_dots_to_braille_char(chars[i], baseline_indicator_hack); |
2554 | 0 | }; |
2555 | 459 | } |
2556 | ||
2557 | 463 | let result = chars.into_iter().collect::<String>(); |
2558 | // debug!(" result={}", result); | |
2559 | 463 | return result; |
2560 | ||
2561 | 1.25k | fn add_dots_to_braille_char(ch: char, baseline_indicator_hack: bool) -> char { |
2562 | 1.25k | let as_u32 = ch as u32; |
2563 | 1.25k | if (0x2800..0x28FF).contains(&as_u32) { |
2564 | 919 | return unsafe {char::from_u32_unchecked(as_u32 | 0xC0)}; // safe because we have checked the range |
2565 | 332 | } else if baseline_indicator_hack && |
2566 | 7 | return '𝑏' |
2567 | } else { | |
2568 | 325 | return ch; |
2569 | } | |
2570 | 1.25k | } |
2571 | 469 | } |
2572 | ||
2573 | 521 | fn mark_nav_speech(speech: String) -> String { |
2574 | // add unique markers (since speech is mostly ascii letters and digits, most any symbol will do) | |
2575 | // it's a bug (but happened during intent generation), we might have identical id's, choose innermost one | |
2576 | 521 | debug!("mark_nav_speech: adding [[ {} ]] ", |
2577 | 521 | if !speech.contains("[[") { |
2578 | 521 | return "[[".to_string() + &speech + "]]"; |
2579 | } else { | |
2580 | 0 | return speech |
2581 | } | |
2582 | 521 | } |
2583 | ||
2584 | 456k | fn replace<T:TreeOrString<'c, 'm, T>>(&'r mut self, replacement: &Replacement, mathml: Element<'c>) -> Result<T> { |
2585 | return Ok( | |
2586 | 456k | match replacement { |
2587 | 63.6k | Replacement::Text(t) => T::from_string(t.clone(), self.doc) |
2588 | 151k | Replacement::XPath(xpath) => xpath.replace(self, mathml) |
2589 | 60.7k | Replacement::TTS(tts) => { |
2590 | 60.7k | T::from_string( |
2591 | 60.7k | self.speech_rules.pref_manager.borrow().get_tts().replace(tts, &self.speech_rules.pref_manager.borrow(), self, mathml) |
2592 | 60.7k | self.doc |
2593 | 0 | )? |
2594 | }, | |
2595 | 45.5k | Replacement::Intent(intent) => { |
2596 | 45.5k | intent.replace(self, mathml) |
2597 | }, | |
2598 | 115k | Replacement::Test(test) => { |
2599 | 115k | test.replace(self, mathml) |
2600 | }, | |
2601 | 7.28k | Replacement::With(with) => { |
2602 | 7.28k | with.replace(self, mathml) |
2603 | }, | |
2604 | 3.78k | Replacement::SetVariables(vars) => { |
2605 | 3.78k | vars.replace(self, mathml) |
2606 | }, | |
2607 | 7.45k | Replacement::Insert(ic) => { |
2608 | 7.45k | ic.replace(self, mathml) |
2609 | }, | |
2610 | 2 | Replacement::Translate(id) => { |
2611 | 2 | id.replace(self, mathml) |
2612 | }, | |
2613 | } | |
2614 | ) | |
2615 | 456k | } |
2616 | ||
2617 | /// Iterate over all the nodes, concatenating the result strings together with a ' ' between them | |
2618 | /// If the node is an element, pattern match it | |
2619 | /// For 'Text' and 'Attribute' nodes, convert them to strings | |
2620 | 121k | fn replace_nodes<T:TreeOrString<'c, 'm, T>>(&'r mut self, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<T> { |
2621 | 121k | return T::replace_nodes(self, nodes, mathml); |
2622 | 121k | } |
2623 | ||
2624 | /// Iterate over all the nodes finding matches for the elements | |
2625 | /// For this case of returning MathML, everything else is an error | |
2626 | 48.6k | fn replace_nodes_tree(&'r mut self, nodes: Vec<Node<'c>>, _mathml: Element<'c>) -> Result<Element<'m>> { |
2627 | 48.6k | let mut children = Vec::with_capacity(3*nodes.len()); // guess (2 chars/node + space) |
2628 | 69.6k | for node in |
2629 | 69.6k | let matched = match node { |
2630 | 41.9k | Node::Element(n) => self.match_pattern::<Element<'m>>(n) |
2631 | 27.5k | Node::Text(t) => { |
2632 | 27.5k | let leaf = create_mathml_element(&self.doc, "TEMP_NAME"); |
2633 | 27.5k | leaf.set_text(t.text()); |
2634 | 27.5k | leaf |
2635 | }, | |
2636 | 32 | Node::Attribute(attr) => { |
2637 | // debug!(" from attr with text '{}'", attr.value()); | |
2638 | 32 | let leaf = create_mathml_element(&self.doc, "TEMP_NAME"); |
2639 | 32 | leaf.set_text(attr.value()); |
2640 | 32 | leaf |
2641 | }, | |
2642 | _ => { | |
2643 | 0 | bail!("replace_nodes: found unexpected node type!!!"); |
2644 | }, | |
2645 | }; | |
2646 | 69.6k | children.push(matched); |
2647 | } | |
2648 | ||
2649 | 48.6k | let result = create_mathml_element(&self.doc, "TEMP_NAME"); // FIX: what name should be used? |
2650 | 48.6k | result.append_children(children); |
2651 | // debug!("replace_nodes_tree\n{}\n====>>>>>\n", mml_to_string(result)); | |
2652 | 48.6k | return Ok( result ); |
2653 | 48.6k | } |
2654 | ||
2655 | 72.9k | fn replace_nodes_string(&'r mut self, nodes: Vec<Node<'c>>, mathml: Element<'c>) -> Result<String> { |
2656 | // debug!("replace_nodes: working on {} nodes", nodes.len()); | |
2657 | 72.9k | let mut result = String::with_capacity(3*nodes.len()); // guess (2 chars/node + space) |
2658 | 72.9k | let mut first_time = true; |
2659 | 85.7k | for node in |
2660 | 85.7k | if first_time { |
2661 | 72.9k | first_time = false; |
2662 | 72.9k | } else { |
2663 | 12.8k | result.push(' '); |
2664 | 12.8k | }; |
2665 | 85.7k | let matched = match node { |
2666 | 66.5k | Node::Element(n) => self.match_pattern::<String>(n) |
2667 | 19.2k | Node::Text(t) => self.replace_chars(t.text(), mathml) |
2668 | 14 | Node::Attribute(attr) => self.replace_chars(attr.value(), mathml) |
2669 | 0 | _ => bail!("replace_nodes: found unexpected node type!!!"), |
2670 | }; | |
2671 | 85.7k | result += &matched; |
2672 | } | |
2673 | 72.9k | return Ok( result ); |
2674 | 72.9k | } |
2675 | ||
2676 | /// Lookup unicode "pronunciation" of char. | |
2677 | /// Note: TTS is not supported here (not needed and a little less efficient) | |
2678 | 58.0k | pub fn replace_chars(&'r mut self, str: &str, mathml: Element<'c>) -> Result<String> { |
2679 | 58.0k | let chars = str.chars().collect::<Vec<char>>(); |
2680 | 58.0k | let rules = self.speech_rules; |
2681 | // handled in match_pattern -- temporarily leaving as comments in case something is missed and needed here | |
2682 | // if self.nav_node_offset > 0 && chars.len() > 1 { | |
2683 | // if self.nav_node_offset > chars.len() { | |
2684 | // debug!("replace_chars: nav_node_offset {} is larger than string length {}", self.nav_node_offset, chars.len()); | |
2685 | // self.nav_node_offset = chars.len(); | |
2686 | // } | |
2687 | // let ch = chars[self.nav_node_offset-1]; | |
2688 | // debug!("replace_chars: adjusted string to '{}' based on nav_node_offset {}", ch, self.nav_node_offset); | |
2689 | // if rules.translate_single_chars_only { | |
2690 | // return self.replace_single_char(ch, mathml); | |
2691 | // } else { | |
2692 | // return Ok( ch.to_string() ); | |
2693 | // } | |
2694 | // } | |
2695 | 58.0k | if is_quoted_string(str) { // quoted string -- already translated (set in get_braille_chars) |
2696 | 12.5k | return Ok(unquote_string(str).to_string()); |
2697 | 45.5k | } |
2698 | // in a string, avoid "a" -> "eigh", "." -> "point", etc | |
2699 | 45.5k | if rules.translate_single_chars_only { |
2700 | 30.0k | if chars.len() == 1 { |
2701 | 27.3k | return self.replace_single_char(chars[0], mathml) |
2702 | } else { | |
2703 | // more than one char -- fix up non-breaking space | |
2704 | 2.69k | return Ok(str.replace('\u{00A0}', " ").replace(['\u{2061}', '\u{2062}', '\u{2063}', '\u{2064}'], "")) |
2705 | } | |
2706 | 15.5k | }; |
2707 | ||
2708 | 15.5k | let result = chars.iter() |
2709 | 18.2k | . |
2710 | 15.5k | .collect::<Result<Vec<String>>>() |
2711 | 15.5k | .join(""); |
2712 | 15.5k | return Ok( result ); |
2713 | 58.0k | } |
2714 | ||
2715 | 45.6k | fn replace_single_char(&'r mut self, ch: char, mathml: Element<'c>) -> Result<String> { |
2716 | 45.6k | let ch_as_u32 = ch as u32; |
2717 | 45.6k | let rules = self.speech_rules; |
2718 | 45.6k | let mut unicode = rules.unicode_short.borrow(); |
2719 | 45.6k | let mut replacements = unicode.get( &ch_as_u32 ); |
2720 | // debug!("replace_single_char: looking for unicode {} for char '{}'/{:#06x}, found: {:?}", rules.name, ch, ch_as_u32, replacements); | |
2721 | 45.6k | if replacements.is_none() { |
2722 | // see if it in the full unicode table (if it isn't loaded already) | |
2723 | 1.64k | let pref_manager = rules.pref_manager.borrow(); |
2724 | 1.64k | let unicode_pref_files = if rules.name == RulesFor::Braille { |
2725 | 1.64k | let should_ignore_file_time = pref_manager.pref_to_string("CheckRuleFiles") == "All"; |
2726 | 1.64k | if rules.unicode_full.borrow().is_empty() || |
2727 | 468 | info!("*** Loading full unicode {} for char '{}'/{:#06x}", rules.name, ch, ch_as_u32); |
2728 | 468 | rules.unicode_full.borrow_mut().clear(); |
2729 | 468 | rules.unicode_full_files.borrow_mut().set_files_and_times(rules.read_unicode(None, false) |
2730 | 468 | info!("# Unicode defs = {}/{}", |
2731 | 1.18k | } |
2732 | 1.64k | unicode = rules.unicode_full.borrow(); |
2733 | 1.64k | replacements = unicode.get( &ch_as_u32 ); |
2734 | 1.64k | if replacements.is_none() { |
2735 | 269 | self.translate_count = 0; // not in loop |
2736 | // debug!("*** Did not find unicode {} for char '{}'/{:#06x}", rules.name, ch, ch_as_u32); | |
2737 | 269 | if rules.translate_single_chars_only || |
2738 | 269 | return Ok(String::from(ch)); // no replacement, so just return the char and hope for the best |
2739 | } else { // braille -- must turn into braille dots | |
2740 | // Emulate what NVDA does: generate (including single quotes) '\xhhhh' or '\yhhhhhh' | |
2741 | 0 | let ch_as_int = ch as u32; |
2742 | 0 | let prefix_indicator = if ch_as_int < 1<<16 {'x'} else {'y'}; |
2743 | 0 | return self.replace_chars( &format!("'\\{prefix_indicator}{:06x}'", ch_as_int), mathml); |
2744 | } | |
2745 | 1.37k | } |
2746 | 43.9k | }; |
2747 | ||
2748 | // map across all the parts of the replacement, collect them up into a Vec, and then concat them together | |
2749 | 45.3k | let result = replacements.unwrap() |
2750 | 45.3k | .iter() |
2751 | 45.3k | .map(|replacement| |
2752 | 49.1k | self.replace(replacement, mathml) |
2753 | 49.1k | .with_context(|| |
2754 | 45.3k | .collect::<Result<Vec<String>>>() |
2755 | 45.3k | .join(" "); |
2756 | 45.3k | self.translate_count = 0; // found a replacement, so not in a loop |
2757 | 45.3k | return Ok(result); |
2758 | 45.6k | } |
2759 | } | |
2760 | ||
2761 | /// Hack to allow replacement of `str` with braille chars. | |
2762 | 12.5k | pub fn braille_replace_chars(str: &str, mathml: Element) -> Result<String> { |
2763 | 12.5k | return BRAILLE_RULES.with(|rules| { |
2764 | 12.5k | let rules = rules.borrow(); |
2765 | 12.5k | let new_package = Package::new(); |
2766 | 12.5k | let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), "", 0); |
2767 | 12.5k | return rules_with_context.replace_chars(str, mathml); |
2768 | 12.5k | }) |
2769 | 12.5k | } |
2770 | ||
2771 | ||
2772 | ||
2773 | #[cfg(test)] | |
2774 | mod tests { | |
2775 | #[allow(unused_imports)] | |
2776 | use crate::init_logger; | |
2777 | ||
2778 | use super::*; | |
2779 | ||
2780 | #[test] | |
2781 | 1 | fn test_read_statement() { |
2782 | 1 | let str = r#"--- |
2783 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2784 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2785 | 1 | assert_eq!(doc.len(), 1); |
2786 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2787 | ||
2788 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2789 | 1 | assert_eq!(rules.rules["math"].len(), 1, "\nshould only be one rule"); |
2790 | ||
2791 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2792 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2793 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2794 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2795 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 1, "\nreplacement failure"); |
2796 | 1 | assert_eq!(speech_pattern.replacements.replacements[0].to_string(), r#""./*""#, "\nreplacement failure"); |
2797 | 1 | } |
2798 | ||
2799 | #[test] | |
2800 | 1 | fn test_read_statements_with_replace() { |
2801 | 1 | let str = r#"--- |
2802 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2803 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2804 | 1 | assert_eq!(doc.len(), 1); |
2805 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2806 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2807 | ||
2808 | 1 | let str = r#"--- |
2809 | 1 | {name: default, tag: math, match: ".", replace: [t: "test", x: "./*"] }"#; |
2810 | 1 | let doc2 = YamlLoader::load_from_str(str).unwrap(); |
2811 | 1 | assert_eq!(doc2.len(), 1); |
2812 | 1 | SpeechPattern::build(&doc2[0], Path::new("testing"), &mut rules).unwrap(); |
2813 | 1 | assert_eq!(rules.rules["math"].len(), 1, "\nfirst rule not replaced"); |
2814 | ||
2815 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2816 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2817 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2818 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2819 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 2, "\nreplacement failure"); |
2820 | 1 | } |
2821 | ||
2822 | #[test] | |
2823 | 1 | fn test_read_statements_with_add() { |
2824 | 1 | let str = r#"--- |
2825 | 1 | {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; |
2826 | 1 | let doc = YamlLoader::load_from_str(str).unwrap(); |
2827 | 1 | assert_eq!(doc.len(), 1); |
2828 | 1 | let mut rules = SpeechRules::new(RulesFor::Speech, true); |
2829 | 1 | SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); |
2830 | ||
2831 | 1 | let str = r#"--- |
2832 | 1 | {name: another-rule, tag: math, match: ".", replace: [t: "test", x: "./*"] }"#; |
2833 | 1 | let doc2 = YamlLoader::load_from_str(str).unwrap(); |
2834 | 1 | assert_eq!(doc2.len(), 1); |
2835 | 1 | SpeechPattern::build(&doc2[0], Path::new("testing"), &mut rules).unwrap(); |
2836 | 1 | assert_eq!(rules.rules["math"].len(), 2, "\nsecond rule not added"); |
2837 | ||
2838 | 1 | let speech_pattern = &rules.rules["math"][0]; |
2839 | 1 | assert_eq!(speech_pattern.pattern_name, "default", "\npattern name failure"); |
2840 | 1 | assert_eq!(speech_pattern.tag_name, "math", "\ntag name failure"); |
2841 | 1 | assert_eq!(speech_pattern.pattern.rc.string, ".", "\npattern failure"); |
2842 | 1 | assert_eq!(speech_pattern.replacements.replacements.len(), 1, "\nreplacement failure"); |
2843 | 1 | } |
2844 | ||
2845 | #[test] | |
2846 | 1 | fn test_debug_no_debug() { |
2847 | 1 | let str = r#"*[2]/*[3][text()='3']"#; |
2848 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2849 | 1 | assert!(result.is_ok()); |
2850 | 1 | assert_eq!(result.unwrap(), str); |
2851 | 1 | } |
2852 | ||
2853 | #[test] | |
2854 | 1 | fn test_debug_no_debug_with_quote() { |
2855 | 1 | let str = r#"*[2]/*[3][text()='(']"#; |
2856 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2857 | 1 | assert!(result.is_ok()); |
2858 | 1 | assert_eq!(result.unwrap(), str); |
2859 | 1 | } |
2860 | ||
2861 | #[test] | |
2862 | 1 | fn test_debug_no_quoted_paren() { |
2863 | 1 | let str = r#"DEBUG(*[2]/*[3][text()='3'])"#; |
2864 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2865 | 1 | assert!(result.is_ok()); |
2866 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][text()='3'], "*[2]/*[3][text()='3']")"#); |
2867 | 1 | } |
2868 | ||
2869 | #[test] | |
2870 | 1 | fn test_debug_quoted_paren() { |
2871 | 1 | let str = r#"DEBUG(*[2]/*[3][text()='('])"#; |
2872 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2873 | 1 | assert!(result.is_ok()); |
2874 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][text()='('], "*[2]/*[3][text()='(']")"#); |
2875 | 1 | } |
2876 | ||
2877 | #[test] | |
2878 | 1 | fn test_debug_quoted_paren_before_paren() { |
2879 | 1 | let str = r#"DEBUG(ClearSpeak_Matrix = 'Combinatorics') and IsBracketed(., '(', ')')"#; |
2880 | 1 | let result = MyXPath::add_debug_string_arg(str); |
2881 | 1 | assert!(result.is_ok()); |
2882 | 1 | assert_eq!(result.unwrap(), r#"DEBUG(ClearSpeak_Matrix = 'Combinatorics', "ClearSpeak_Matrix = 'Combinatorics'") and IsBracketed(., '(', ')')"#); |
2883 | 1 | } |
2884 | ||
2885 | ||
2886 | // zipped files do NOT include "zz", hence we need to exclude this test | |
2887 | cfg_if::cfg_if! {if #[cfg(not(feature = "include-zip"))] { | |
2888 | #[test] | |
2889 | 1 | fn test_up_to_date() { |
2890 | use crate::interface::*; | |
2891 | // initialize and move to a directory where making a time change doesn't really matter | |
2892 | 1 | set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
2893 | 1 | set_preference("Language", "zz-aa").unwrap(); |
2894 | // not much is support in zz | |
2895 | 1 | if let Err( |
2896 | 0 | error!("{}", crate::errors_to_string(&e)); |
2897 | 0 | panic!("Should not be an error in setting MathML") |
2898 | 1 | } |
2899 | ||
2900 | 1 | set_preference("CheckRuleFiles", "All").unwrap(); |
2901 | 1 | assert!(!is_file_time_same(), "file's time did not get updated"); |
2902 | 1 | set_preference("CheckRuleFiles", "None").unwrap(); |
2903 | 1 | assert!(is_file_time_same(), "file's time was wrongly updated (preference 'CheckRuleFiles' should have prevented updating)"); |
2904 | ||
2905 | // change a file, cause read_files to be called, and return if MathCAT noticed the change and updated its time | |
2906 | 2 | fn is_file_time_same() -> bool { |
2907 | // read and write a unicode file in a test dir | |
2908 | // files are read in due to setting the MathML | |
2909 | ||
2910 | use std::time::Duration; | |
2911 | 2 | return SPEECH_RULES.with(|rules| { |
2912 | 2 | let start_main_file = rules.borrow().unicode_short_files.borrow().ft[0].clone(); |
2913 | ||
2914 | // open the file, read all the contents, then write them back so the time changes | |
2915 | 2 | let contents = std::fs::read(&start_main_file.file).expect(&format!("Failed to read file {} during test", &start_main_file.file.to_string_lossy())); |
2916 | 2 | std::fs::write(start_main_file.file, contents).unwrap(); |
2917 | 2 | std::thread::sleep(Duration::from_millis(5)); // pause a little to make sure the time changes |
2918 | ||
2919 | // speak should cause the file stored to have a new time | |
2920 | 2 | if let Err( |
2921 | 0 | error!("{}", crate::errors_to_string(&e)); |
2922 | 0 | panic!("Should not be an error in speech") |
2923 | 2 | } |
2924 | 2 | return rules.borrow().unicode_short_files.borrow().ft[0].time == start_main_file.time; |
2925 | 2 | }); |
2926 | 2 | } |
2927 | 1 | } |
2928 | }} | |
2929 | ||
2930 | // #[test] | |
2931 | // fn test_nested_debug_quoted_paren() { | |
2932 | // let str = r#"DEBUG(*[2]/*[3][DEBUG(text()='(')])"#; | |
2933 | // let result = MyXPath::add_debug_string_arg(str); | |
2934 | // assert!(result.is_ok()); | |
2935 | // assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][DEBUG(text()='(')], "DEBUG(*[2]/*[3][DEBUG(text()='(')], \"text()='(')]\")"#); | |
2936 | // } | |
2937 | ||
2938 | } |
Line | Count | Source |
1 | //! #Speech Engine Information | |
2 | //! | |
3 | //! ## Pitch (default 140hz) | |
4 | //! ### SAPI4: Relative pitch | |
5 | //! * Number is relative to the default/current pitch. | |
6 | //! * 50 is 1/2 of the default/current pitch, 200 is 2 times the default/current pitch. | |
7 | //! | |
8 | //! Note: no range is specified by the spec | |
9 | //! ### SAPI5: Relative pitch | |
10 | //! From https://documentation.help/SAPI-5/sapi.xsd | |
11 | //! * A value of +10 sets a voice to speak at four-thirds (or 4/3) of its default pitch. | |
12 | //! * Each increment between –10 and +10 is logarithmically distributed such that | |
13 | //! incrementing/decrementing by 1 is multiplying/dividing the pitch by the 24th root of 2 (about 1.03). | |
14 | //! * Values more extreme than –10 and 10 will be passed to an engine but SAPI 5compliant engines may not support | |
15 | //! such extremes and instead may clip the pitch to the maximum or minimum pitch it supports. | |
16 | //! * Values of –24 and +24 must lower and raise pitch by 1 octave respectively. | |
17 | //! All incrementing/decrementing by 1 must multiply/divide the pitch by the 24th root of 2. | |
18 | //! | |
19 | //! Note: an octave is a doubling of frequency, so pitch change of 100% should turn into +/- 24 | |
20 | //! ### SSML: Relative pitch | |
21 | //! * pitch in hertz (default/current man's voice is about 100hz, woman's 180hz) | |
22 | //! | |
23 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
24 | //! ### Eloquence: Absolute pitch (relative pitch not supported by Eloquence) | |
25 | //! * Range is 0 - 100. Guess is that 0 ~= 42hz, 100 ~= 422hz based on supported \"sapi\" values | |
26 | //! ## Rate (default 180 words/min) | |
27 | //! ### SAPI4: Absolute rate | |
28 | //! * Number is relative to the default/current rate | |
29 | //! * 50 is 1/2 of the default/current rate, 200 is 2 times the default/current rate | |
30 | //! | |
31 | //! Note: no range is specified by the spec | |
32 | //! ### SAPI5: Relative rate | |
33 | //! * Number is in range -10 to 10 | |
34 | //! * -10 is 1/3 of the default/current speed; 10 3 times the default/current speech | |
35 | //! * changes are logarithmic -- a change of +/-1 corresponds to multiplying/dividing by 10th root of 3 (10*log_3(change)) | |
36 | //! ### SSML: Relative rate % | |
37 | //! * 100% is no change, 50% is half the current rate, 200% is doubling the rate | |
38 | //! | |
39 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
40 | //! ### Eloquence: Absolute rate (relative rate not supported by Eloquence) | |
41 | //! * Range is 0 - 250, which manual seems to indicate corresponds to 70 - 1297 words/min. | |
42 | //! * * Window-Eyes only seems to give values in range 1 - 150. | |
43 | //! * On the low end, 1 ~= 72words/min | |
44 | //! * On the high end, I can't tell, but 80 seems to be a bit over twice normal (~400 words/min?) | |
45 | //! 250 ~= 1297 words/min based on supported "sapi" values | |
46 | //! | |
47 | //! Note: this means words/min = 4.18 * Eloquence rate + 66 | |
48 | //! So the relative pause rate is 180/computed value | |
49 | //! | |
50 | //! | |
51 | //! ## Volume (default 100 \[full]) | |
52 | //! ### SAPI4: Relative volume | |
53 | //! * Number is relative to the default/current rate | |
54 | //! * Range is 0 - 065535 | |
55 | //! ### SAPI5: Relative volume | |
56 | //! * Number is in range 0 to 100 | |
57 | //! ### SSML: Relative volume | |
58 | //! * Number is in range 0 to 100 | |
59 | //! | |
60 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
61 | //! ### Eloquence: Absolute volume (relative volume not supported by Eloquence) | |
62 | //! * Range is 0 - 100 | |
63 | //! | |
64 | //! ## Pause | |
65 | //! * All systems -- pauses are given in milliseconds | |
66 | //! | |
67 | //! Note: Pauses on output are scaled based on the ratio of the current rate to the default rate (180 wpm) | |
68 | #![allow(clippy::needless_return)] | |
69 | ||
70 | use crate::{errors::*, prefs::PreferenceManager, speech::ReplacementArray}; | |
71 | use sxd_document::dom::Element; | |
72 | use yaml_rust::Yaml; | |
73 | ||
74 | use std::fmt; | |
75 | use crate::speech::{SpeechRulesWithContext, MyXPath, TreeOrString}; | |
76 | use std::string::ToString; | |
77 | use std::str::FromStr; | |
78 | use strum_macros::{Display, EnumString}; | |
79 | use regex::Regex; | |
80 | use std::sync::LazyLock; | |
81 | use sxd_xpath::Value; | |
82 | use html_escape::encode_safe; | |
83 | ||
84 | const MIN_PAUSE:f64 = 50.0; // ms -- avoids clutter of putting out pauses that probably can't be heard | |
85 | const PAUSE_SHORT:f64 = 200.0; // ms | |
86 | const PAUSE_MEDIUM:f64 = 400.0; // ms | |
87 | const PAUSE_LONG:f64 = 800.0; // ms | |
88 | const PAUSE_XLONG:f64 = 1600.0; // ms | |
89 | const PAUSE_AUTO:f64 = 987654321.5; // ms -- hopefully unique | |
90 | pub const PAUSE_AUTO_STR: &str = "\u{F8FA}\u{F8FA}"; | |
91 | const RATE_FROM_CONTEXT:f64 = 987654321.5; // hopefully unique | |
92 | ||
93 | const MAX_TRANSLATE_RECURSION: usize = 5; // probably never more than three -- prevents infinite loop/stack overflows bugs | |
94 | ||
95 | /// TTSCommand are the supported TTS commands | |
96 | /// When parsing the YAML rule files, they are converted to these enums | |
97 | #[derive(Debug, Clone, PartialEq, Eq, Display, EnumString)] | |
98 | #[strum(serialize_all = "snake_case")] // allows lower case | |
99 | pub enum TTSCommand { | |
100 | Pause, | |
101 | Rate, | |
102 | Volume, | |
103 | Pitch, | |
104 | Audio, | |
105 | Gender, | |
106 | Voice, | |
107 | Spell, | |
108 | Bookmark, | |
109 | Pronounce, | |
110 | } | |
111 | ||
112 | #[derive(Debug, Clone)] | |
113 | pub struct Pronounce { | |
114 | text: String, // plain text | |
115 | ipa: String, // ipa | |
116 | sapi5: String, | |
117 | eloquence: String, | |
118 | } | |
119 | ||
120 | ||
121 | impl fmt::Display for Pronounce { | |
122 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
123 | 1 | let mut comma = ""; // comma separator so it looks right |
124 | 1 | write!(f, "pronounce: [") |
125 | 1 | if !self.text.is_empty() { |
126 | 1 | write!(f, "text: '{}'", self.text) |
127 | 1 | comma = ","; |
128 | 0 | } |
129 | 1 | write!(f, "pronounce: [") |
130 | 1 | if !self.ipa.is_empty() { |
131 | 1 | write!(f, "{}ipa: '{}'", comma, self.ipa) |
132 | 1 | comma = ","; |
133 | 0 | } |
134 | 1 | write!(f, "pronounce: [") |
135 | 1 | if !self.sapi5.is_empty() { |
136 | 1 | write!(f, "{}sapi5: '{}'", comma, self.sapi5) |
137 | 1 | comma = ","; |
138 | 0 | } |
139 | 1 | write!(f, "pronounce: [") |
140 | 1 | if !self.eloquence.is_empty() { |
141 | 1 | write!(f, "{}eloquence: '{}'", comma, self.eloquence) |
142 | 0 | } |
143 | 1 | return writeln!(f, "]"); |
144 | 1 | } |
145 | } | |
146 | ||
147 | impl Pronounce { | |
148 | 5.02k | fn build(values: &Yaml) -> Result<Pronounce> { |
149 | use crate::speech::{as_str_checked, yaml_to_type}; | |
150 | use crate::pretty_print::yaml_to_string; | |
151 | ||
152 | 5.02k | let mut text = ""; |
153 | 5.02k | let mut ipa = ""; |
154 | 5.02k | let mut sapi5 = ""; |
155 | 5.02k | let mut eloquence = ""; |
156 | // values should be an array with potential values for Pronounce | |
157 | 5.02k | let values = values.as_vec().ok_or_else(|| |
158 | 0 | anyhow!("'pronounce' value '{}' is not an array", yaml_to_type(values)))?; |
159 | 20.0k | for key_value in |
160 | 20.0k | let key_value_hash = key_value.as_hash().ok_or_else(|| |
161 | 0 | anyhow!("pronounce value '{}' is not key/value pair", yaml_to_string(key_value, 0)))?; |
162 | 20.0k | if key_value_hash.len() != 1 { |
163 | 0 | bail!("pronounce value {:?} is not a single key/value pair", key_value_hash); |
164 | 20.0k | } |
165 | ||
166 | 20.0k | for (key, value) in key_value_hash { |
167 | 20.0k | match as_str_checked(key) |
168 | 20.0k | "text" => text = |
169 | 15.0k | "ipa" => ipa = |
170 | 10.0k | "sapi5" => sapi5 = |
171 | 5.02k | "eloquence" => eloquence = as_str_checked(value) |
172 | 0 | _ => bail!("unknown pronounce type: {} with value {}", yaml_to_string(key, 0), yaml_to_string(value, 0)), |
173 | } | |
174 | } | |
175 | } | |
176 | 5.02k | if text.is_empty() { |
177 | 1 | bail!("'text' key/value is required for 'pronounce' -- it is used is the speech engine is unknown.") |
178 | 5.02k | } |
179 | 5.02k | return Ok( Pronounce{ |
180 | 5.02k | text: text.to_string(), |
181 | 5.02k | ipa: ipa.to_string(), |
182 | 5.02k | sapi5: sapi5.to_string(), |
183 | 5.02k | eloquence: eloquence.to_string() |
184 | 5.02k | } ); |
185 | ||
186 | ||
187 | 5.02k | } |
188 | } | |
189 | /// TTSCommands are either numbers (f64 because of YAML) or strings | |
190 | #[derive(Debug, Clone)] | |
191 | pub enum TTSCommandValue { | |
192 | Number(f64), | |
193 | String(String), | |
194 | XPath(MyXPath), | |
195 | Pronounce(Box<Pronounce>), | |
196 | } | |
197 | ||
198 | impl TTSCommandValue { | |
199 | 77.2k | fn get_num(&self) -> f64 { |
200 | 77.2k | match self { |
201 | 77.2k | TTSCommandValue::Number(n) => return *n, |
202 | 0 | _ => panic!("Internal error: TTSCommandValue is not a number"), |
203 | } | |
204 | 77.2k | } |
205 | ||
206 | 0 | fn get_string(&self) -> &String { |
207 | 0 | match self { |
208 | 0 | TTSCommandValue::String(s) => return s, |
209 | 0 | _ => panic!("Internal error: TTSCommandValue is not a string"), |
210 | } | |
211 | 0 | } |
212 | ||
213 | 0 | fn get_pronounce(&self) -> &Pronounce { |
214 | 0 | match self { |
215 | 0 | TTSCommandValue::Pronounce(p) => return p, |
216 | 0 | _ => panic!("Internal error: TTSCommandValue is not a 'pronounce' command'"), |
217 | } | |
218 | ||
219 | 0 | } |
220 | } | |
221 | ||
222 | /// A TTS rule consists of the command, the value, and its replacement | |
223 | #[derive(Debug, Clone)] | |
224 | pub struct TTSCommandRule { | |
225 | command: TTSCommand, | |
226 | value: TTSCommandValue, | |
227 | replacements: ReplacementArray | |
228 | } | |
229 | ||
230 | impl fmt::Display for TTSCommandRule { | |
231 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
232 | 1 | let value = match &self.value { |
233 | 0 | TTSCommandValue::String(s) => s.to_string(), |
234 | 0 | TTSCommandValue::Number(f) => f.to_string(), |
235 | 0 | TTSCommandValue::XPath(p) => p.to_string(), |
236 | 1 | TTSCommandValue::Pronounce(p) => p.to_string(), |
237 | }; | |
238 | 1 | if self.command == TTSCommand::Pause { |
239 | 0 | return write!(f, "pause: {value}"); |
240 | } else { | |
241 | 1 | return write!(f, "{}: {}{}", self.command, value, self.replacements); |
242 | }; | |
243 | 1 | } |
244 | } | |
245 | ||
246 | ||
247 | impl TTSCommandRule { | |
248 | 2.43M | pub fn new(command: TTSCommand, value: TTSCommandValue, replacements: ReplacementArray) -> TTSCommandRule { |
249 | 2.43M | return TTSCommandRule{ |
250 | 2.43M | command, |
251 | 2.43M | value, |
252 | 2.43M | replacements |
253 | 2.43M | } |
254 | 2.43M | } |
255 | } | |
256 | ||
257 | /// Supported TTS engines | |
258 | /// These types should do something for all the TTSCommands | |
259 | #[allow(clippy::upper_case_acronyms)] | |
260 | #[allow(dead_code)] | |
261 | #[derive(Debug, Clone, PartialEq, Eq)] | |
262 | pub enum TTS { | |
263 | None, | |
264 | SSML, | |
265 | SAPI5, | |
266 | // Eloquence, | |
267 | // Mac, | |
268 | } | |
269 | ||
270 | impl TTS { | |
271 | /// Given the tts command ("pause", "rate", etc) and its value, build the TTS data structure for it. | |
272 | /// | |
273 | /// `tts_command`: one of "pause", "rate", etc | |
274 | /// | |
275 | /// `value`: keyword 'value' or dict with 'value' and 'replace' (optional) keys | |
276 | 2.41M | pub fn build(tts_command: &str, values: &Yaml) -> Result<Box<TTSCommandRule>> { |
277 | use crate::pretty_print::yaml_to_string; | |
278 | 2.41M | let hashmap = values.as_hash(); |
279 | let tts_value; | |
280 | let replacements; | |
281 | 2.41M | if hashmap.is_some() { |
282 | 446k | tts_value = &values["value"]; |
283 | 446k | if tts_value.is_badvalue() { |
284 | 0 | bail!("{} TTS command is missing a 'value' sub-key. Found\n{}", tts_command, yaml_to_string(values, 1)); |
285 | 446k | }; |
286 | 446k | replacements = ReplacementArray::build(&values["replace"]) |
287 | 1.96M | } else { |
288 | 1.96M | tts_value = values; |
289 | 1.96M | replacements = ReplacementArray::build_empty(); |
290 | 1.96M | } |
291 | 2.41M | let tts_str_value = yaml_to_string(tts_value, 0); |
292 | 2.41M | let tts_str_value = tts_str_value.trim(); |
293 | 2.41M | let tts_enum = match TTSCommand::from_str(tts_command) { |
294 | 2.41M | Ok(t) => t, |
295 | 0 | Err(_) => bail!("Internal error in build_tts: unexpected rule ({:?}) encountered", tts_command), |
296 | }; | |
297 | ||
298 | 2.41M | let |
299 | TTSCommand::Pause | TTSCommand::Rate | TTSCommand::Volume | TTSCommand::Pitch => { | |
300 | // these strings are almost always what the value will be, so we try them first | |
301 | 1.05M | let val = match tts_str_value { |
302 | 1.05M | "auto" => |
303 | 982k | "short" => |
304 | 426k | "medium" => |
305 | 314k | "long" => |
306 | 229k | "xlong" => |
307 | 225k | "$MathRate" => |
308 | 221k | _ => tts_str_value.parse::<f64>() |
309 | }; | |
310 | ||
311 | 1.05M | match val { |
312 | 832k | Ok(num) => TTSCommandValue::Number(num), |
313 | Err(_) => { | |
314 | // let's try as an xpath (e.g., could be '$CapitalLetters_Pitch') | |
315 | TTSCommandValue::XPath( | |
316 | 221k | MyXPath::build(tts_value).with_context(|| |
317 | ) | |
318 | } | |
319 | } | |
320 | }, | |
321 | TTSCommand::Bookmark | TTSCommand::Spell => { | |
322 | TTSCommandValue::XPath( | |
323 | 1.13M | MyXPath::build(values).with_context(|| |
324 | ) | |
325 | }, | |
326 | TTSCommand::Pronounce => { | |
327 | 5.02k | TTSCommandValue::Pronounce( |
328 | }, | |
329 | _ => { | |
330 | 220k | TTSCommandValue::String(tts_str_value.to_string()) |
331 | }, | |
332 | }; | |
333 | 2.41M | return Ok( Box::new( TTSCommandRule::new(tts_enum, tts_command_value, replacements) ) ); |
334 | 2.41M | } |
335 | ||
336 | /// The rule called to execute the TTSCommand `command` | |
337 | /// `prefs` are used for scaling the speech rate | |
338 | /// some rules have MathML nested inside, so we need to do replacements on them (hence `rules` and `mathml` are needed) | |
339 | /// | |
340 | /// A string is returned for the speech engine. | |
341 | /// | |
342 | /// `auto` pausing is handled at a later phase and a special char is used for it | |
343 | 60.7k | pub fn replace<'c, 's:'c, 'm:'c, 'r, T:TreeOrString<'c, 'm, T>>(&self, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<T> { |
344 | 60.7k | return T::replace_tts(self, command, prefs, rules_with_context, mathml); |
345 | 60.7k | } |
346 | ||
347 | 60.7k | pub fn replace_string<'c, 's:'c, 'm, 'r>(&self, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
348 | // The general idea is we handle the begin tag, the contents, and then the end tag | |
349 | // For the begin/end tag, we dispatch off to specialized code for each TTS engine | |
350 | ||
351 | // 'bookmark' is special in that we need to eval the xpath | |
352 | // rather than pass a bunch of extra info into the generic handling routines, we just deal with them here | |
353 | 60.7k | if command.command == TTSCommand::Bookmark { |
354 | // if we aren't suppose to generate bookmarks, short circuit and just return | |
355 | 26.7k | if prefs.pref_to_string("Bookmark") != "true"{ |
356 | 26.7k | return Ok("".to_string()); |
357 | 0 | } |
358 | 0 | return Ok( match self { |
359 | 0 | TTS::None => "".to_string(), |
360 | 0 | TTS::SSML => compute_bookmark_element(&command.value, "mark name", rules_with_context, mathml)?, |
361 | 0 | TTS::SAPI5 => compute_bookmark_element(&command.value, "bookmark mark", rules_with_context, mathml)?, |
362 | } ); | |
363 | 33.9k | } |
364 | ||
365 | 33.9k | let mut command = command.clone(); |
366 | 33.9k | if command.command == TTSCommand::Spell { |
367 | // spell is also special because we need to eval the xpath to get the string to spell (typically the text content of an mi) | |
368 | 2.77k | match command.value { |
369 | 2.77k | TTSCommandValue::XPath(xpath) => { |
370 | 2.77k | let value = xpath.evaluate(rules_with_context.get_context(), mathml) |
371 | 2.77k | .with_context(|| |
372 | 2.77k | let value_string = match |
373 | 2.24k | Value::String(s) => s, |
374 | 527 | Value::Nodeset(nodes) if nodes.size() == 1 => { |
375 | 527 | let node = nodes.iter().next().unwrap(); |
376 | 527 | if let Some(text) = node.text() { |
377 | 527 | text.text().to_string() |
378 | 0 | } else if let Some(el) = node.element() { |
379 | 0 | if crate::xpath_functions::is_leaf(el) { |
380 | 0 | crate::canonicalize::as_text(el).to_string() |
381 | } else { | |
382 | 0 | bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string", &xpath.to_string()); |
383 | } | |
384 | } else { | |
385 | 0 | bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string, it is {} nodes", |
386 | 0 | &xpath.to_string(), nodes.size()); |
387 | } | |
388 | }, | |
389 | 0 | _ => bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string", &xpath.to_string()), |
390 | }; | |
391 | // Chemistry wants to spell elements like "Na". But we also have the issue of capitalization (SpeechOverrides_CapitalLetters) | |
392 | // so the "N" need to use that. The logic for that is already in unicode.yaml. We could replicate that here. | |
393 | // Rather than duplicate the logic (we would need to handle 'a', and who knows what in other languages), | |
394 | // we split the token into each letter and call the replacement on each letter. | |
395 | // That in turns calls spell again. We end up in an infinite loop. To prevent this we set a flag that says don't recurse. | |
396 | // The only structure to put that in is SpeechRulesWithContext. A bit of a hack to put it there, but better than a static var. | |
397 | // Also, to avoid repeating the code for "cap" over and over, "spell" with "translate" is used. So keep going until no "translate" | |
398 | 2.77k | let xpath_str = xpath.to_string(); |
399 | 2.77k | if rules_with_context.inside_spell && |
400 | 0 | command.value = TTSCommandValue::String(value_string); |
401 | 0 | rules_with_context.translate_count = 0; |
402 | 2.77k | } else if rules_with_context.translate_count > MAX_TRANSLATE_RECURSION { |
403 | 0 | bail!("Rule error: potential infinite recursion found in translate: {}", xpath_str); |
404 | } else { | |
405 | // let the call to replace call spell on the individual chars -- that lets an "cap" be outside "spell" | |
406 | 2.77k | rules_with_context.translate_count += 1; |
407 | 2.77k | let str_with_spaces = value_string.chars() |
408 | 2.94k | . |
409 | 2.94k | rules_with_context.inside_spell = true; |
410 | 2.94k | let spelled_char = rules_with_context.replace_chars(ch.to_string().as_str(), mathml); |
411 | 2.94k | rules_with_context.inside_spell = false; |
412 | 2.94k | spelled_char |
413 | 2.94k | }) |
414 | 2.77k | .collect::<Result<Vec<String>>>() |
415 | 2.77k | .join(" "); |
416 | 2.77k | return Ok(str_with_spaces); |
417 | } | |
418 | }, | |
419 | 0 | _ => bail!("Implementation error: found non-xpath value for spell"), |
420 | } | |
421 | 31.1k | } else if command.command == TTSCommand::Rate && |
422 | 0 | let TTSCommandValue::Number(number_value) = command.value && |
423 | 0 | number_value == RATE_FROM_CONTEXT { |
424 | // handle hack for $Rate -- need to look up in context | |
425 | 0 | let rate_from_context = crate::navigate::context_get_variable(rules_with_context.get_context(), "MathRate", mathml)?.parse::<usize>().unwrap_or(100); |
426 | 0 | command.value = TTSCommandValue::Number(rate_from_context as f64); |
427 | 31.1k | } |
428 | ||
429 | // evaluate any xpath value now to simplify later code | |
430 | 31.1k | if let TTSCommandValue::XPath( |
431 | 1.31k | let eval_str = xpath.replace::<String>(rules_with_context, mathml) |
432 | // can it be a number? | |
433 | 1.31k | command.value = match eval_str.parse::<f64>() { |
434 | 1.31k | Ok(num) => TTSCommandValue::Number(num), |
435 | 0 | Err(_) => TTSCommandValue::String(eval_str), |
436 | } | |
437 | 29.8k | }; |
438 | ||
439 | ||
440 | // small optimization to avoid generating tags that do nothing | |
441 | 31.1k | if ((command.command == TTSCommand::Pitch || |
442 | 29.8k | (command.command == TTSCommand::Rate && |
443 | 1.31k | return command.replacements.replace::<String>(rules_with_context, mathml); |
444 | 29.8k | } |
445 | ||
446 | 29.8k | let mut result = String::with_capacity(255); |
447 | 29.8k | result += &match self { |
448 | 29.8k | TTS::None => self.get_string_none(&command, prefs, true), |
449 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, true), |
450 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, true), |
451 | }; | |
452 | ||
453 | ||
454 | 29.8k | if !command.replacements.is_empty() { |
455 | 0 | if result.is_empty() { |
456 | 0 | result += " "; |
457 | 0 | } |
458 | // need to sanitize string so that SSML is not injected into it via mtext, etc. | |
459 | 0 | let speech = command.replacements.replace::<String>(rules_with_context, mathml)?; |
460 | 0 | result += &encode_safe(&speech); |
461 | 29.8k | } |
462 | ||
463 | 29.8k | let end_tag = match self { |
464 | 29.8k | TTS::None => self.get_string_none(&command, prefs, false), |
465 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, false), |
466 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, false), |
467 | }; | |
468 | ||
469 | 29.8k | if end_tag.is_empty() { |
470 | 29.8k | return Ok( result ); // avoids adding in " " |
471 | } else { | |
472 | 0 | return Ok( result + &end_tag ); |
473 | } | |
474 | ||
475 | ||
476 | 0 | fn compute_bookmark_element<'c, 's:'c, 'm, 'r>(value: &TTSCommandValue, tag_and_attr: &str, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
477 | 0 | match value { |
478 | 0 | TTSCommandValue::XPath(xpath) => { |
479 | 0 | let id = xpath.replace::<String>(rules_with_context, mathml)?; |
480 | 0 | return Ok( format!("<{tag_and_attr}='{id}'/>") ); |
481 | }, | |
482 | 0 | _ => bail!("Implementation error: found bookmark value that did not evaluate to a string"), |
483 | } | |
484 | 0 | } |
485 | ||
486 | 60.7k | } |
487 | ||
488 | // auto pausing can't be known until neighboring strings are computed | |
489 | // we create a unique string in this case and compute the real value later | |
490 | 75.9k | fn get_string_none(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
491 | // they only thing to do is handle "pause" with some punctuation hacks along with 'spell' | |
492 | 75.9k | if is_start_tag { |
493 | 46.1k | if command.command == TTSCommand::Pause { |
494 | 46.1k | let amount = command.value.get_num(); |
495 | // only ',' and ';' are used as '.' didn't seem to reliably generate pauses in tests | |
496 | 46.1k | return crate::speech::CONCAT_INDICATOR.to_string() + ( |
497 | 46.1k | if amount == PAUSE_AUTO { |
498 | 19.5k | PAUSE_AUTO_STR |
499 | } else { | |
500 | 26.5k | let amount = amount * TTS::get_pause_multiplier(prefs); |
501 | 26.5k | if amount <= MIN_PAUSE { |
502 | 11.3k | "" |
503 | 15.1k | } else if amount <= 250.0 { |
504 | 9.93k | "," |
505 | } else { | |
506 | 5.21k | ";" |
507 | } | |
508 | } | |
509 | ); | |
510 | 32 | } else if command.command == TTSCommand::Spell { |
511 | // debug!("spell rule: {}", command.value.get_string()); | |
512 | 0 | return command.value.get_string().to_string(); |
513 | 32 | } else if let TTSCommandValue::Pronounce(p) = &command.value { |
514 | 32 | return crate::speech::CONCAT_INDICATOR.to_string() + &p.text; |
515 | 0 | } |
516 | 29.8k | }; |
517 | 29.8k | return "".to_string(); |
518 | 75.9k | } |
519 | ||
520 | 0 | fn get_string_sapi5(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
521 | 0 | return match &command.command { |
522 | 0 | TTSCommand::Pause => if is_start_tag { |
523 | 0 | let amount = command.value.get_num(); |
524 | 0 | if amount == PAUSE_AUTO { |
525 | 0 | PAUSE_AUTO_STR.to_string() |
526 | } else { | |
527 | 0 | let amount = amount * TTS::get_pause_multiplier(prefs); |
528 | 0 | if amount > MIN_PAUSE { |
529 | 0 | format!("<silence msec=='{}ms'/>", (amount * 180.0/prefs.get_rate()).round()) |
530 | } else { | |
531 | 0 | "".to_string() |
532 | } | |
533 | } | |
534 | } else { | |
535 | 0 | "".to_string() |
536 | }, | |
537 | // pitch must be in [-10, 10], logarithmic based on octaves | |
538 | // note MathPlayer uses 'absmiddle' (requires keeping a stack) -- could be 'middle' is not well supported | |
539 | 0 | TTSCommand::Pitch => if is_start_tag {format!("<pitch middle=\"{}\">", (24.0*(1.0+command.value.get_num()/100.0).log2()).round())} else {String::from("</prosody>")}, |
540 | // rate must be in [-10, 10], but we get relative %s. 300% => 10 (see comments at top of file) | |
541 | 0 | TTSCommand::Rate => if is_start_tag {format!("<rate speed='{:.1}'>", 10.0*(0.01*command.value.get_num()).log(3.0))} else {String::from("</rate>")}, |
542 | 0 | TTSCommand::Volume =>if is_start_tag {format!("<volume level='{}'>", command.value.get_num())} else {String::from("</volume>")}, |
543 | 0 | TTSCommand::Audio => "".to_string(), // SAPI5 doesn't support audio |
544 | 0 | TTSCommand::Gender =>if is_start_tag {format!("<voice required=\"Gender={}\">", command.value.get_string())} else {String::from("</prosody>")}, |
545 | 0 | TTSCommand::Voice =>if is_start_tag {format!("<voice required=\"Name={}\">", command.value.get_string())} else {String::from("</prosody>")}, |
546 | 0 | TTSCommand::Spell =>if is_start_tag {format!("<spell>{}", command.value.get_string())} else {String::from("</spell>")}, |
547 | 0 | TTSCommand::Pronounce =>if is_start_tag { |
548 | 0 | format!("<pron sym='{}'>{}", &command.value.get_pronounce().sapi5, &command.value.get_pronounce().text) |
549 | } else { | |
550 | 0 | String::from("</pron>") |
551 | }, | |
552 | 0 | TTSCommand::Bookmark => panic!("Internal error: bookmarks should have been handled earlier"), |
553 | }; | |
554 | 0 | } |
555 | ||
556 | 0 | fn get_string_ssml(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
557 | 0 | return match &command.command { |
558 | TTSCommand::Pause => { | |
559 | 0 | if is_start_tag { |
560 | 0 | let amount = command.value.get_num(); |
561 | 0 | if amount == PAUSE_AUTO { |
562 | 0 | PAUSE_AUTO_STR.to_string() |
563 | } else { | |
564 | 0 | let amount = amount * TTS::get_pause_multiplier(prefs); |
565 | 0 | if amount > MIN_PAUSE { |
566 | 0 | format!("<break time='{}ms'/>", (amount * 180.0/prefs.get_rate()).round()) |
567 | } else { | |
568 | 0 | "".to_string() |
569 | } | |
570 | } | |
571 | } else { | |
572 | 0 | "".to_string() |
573 | } | |
574 | }, | |
575 | 0 | TTSCommand::Pitch => if is_start_tag {format!("<prosody pitch='{}%'>", command.value.get_num())} else {String::from("</prosody>")}, |
576 | 0 | TTSCommand::Rate => if is_start_tag {format!("<prosody rate='{}%'>", command.value.get_num())} else {String::from("</prosody>")}, |
577 | 0 | TTSCommand::Volume =>if is_start_tag {format!("<prosody volume='{}db'>", command.value.get_num())} else {String::from("</prosody>")}, |
578 | 0 | TTSCommand::Audio =>if is_start_tag {format!("<audio src='{}'>", command.value.get_string())} else {String::from("</audio>")}, // only 'beep' is supported for now |
579 | 0 | TTSCommand::Gender =>if is_start_tag {format!("<voice required='gender=\"{}\"'>", command.value.get_string())} else {String::from("</voice>")}, |
580 | 0 | TTSCommand::Voice =>if is_start_tag {format!("<voice required='{}'>", command.value.get_string())} else {String::from("</voice>")}, |
581 | 0 | TTSCommand::Spell =>if is_start_tag {format!("<say-as interpret-as='characters'>{}", command.value.get_string())} else {String::from("</say-as>")}, |
582 | 0 | TTSCommand::Pronounce =>if is_start_tag { |
583 | 0 | format!("<phoneme alphabet='ipa' ph='{}'>{}", &command.value.get_pronounce().ipa, &command.value.get_pronounce().text) |
584 | } else { | |
585 | 0 | String::from("</phoneme>") |
586 | }, | |
587 | 0 | TTSCommand::Bookmark => panic!("Internal error: bookmarks should have been handled earlier"), |
588 | } | |
589 | 0 | } |
590 | ||
591 | 26.5k | fn get_pause_multiplier(prefs: &PreferenceManager) -> f64 { |
592 | 26.5k | return prefs.pref_to_string("PauseFactor").parse::<f64>().unwrap_or(100.)/100.0; |
593 | 26.5k | } |
594 | ||
595 | /// Compute the length of the pause to use. | |
596 | /// | |
597 | /// The computation is based on the length of the speech strings (after removing tagging). | |
598 | /// There is a bias towards pausing more _after_ longer strings. | |
599 | 19.5k | pub fn compute_auto_pause(&self, prefs: &PreferenceManager, before: &str, after: &str) -> String { |
600 | 0 | static REMOVE_XML: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<.+?>").unwrap()); // punctuation ending with a '.' |
601 | let before_len; | |
602 | let after_len; | |
603 | 19.5k | match self { |
604 | 0 | TTS::SSML | TTS::SAPI5 => { |
605 | 0 | before_len = REMOVE_XML.replace_all(before, "").len(); |
606 | 0 | after_len = REMOVE_XML.replace_all(after, "").len(); |
607 | 0 | }, |
608 | 19.5k | _ => { |
609 | 19.5k | before_len = before.len(); |
610 | 19.5k | after_len = after.len(); |
611 | 19.5k | }, |
612 | } | |
613 | ||
614 | // pause values are not cut in stone | |
615 | // the calculation bias to 'previous' is based on MathPlayer which used '30 * #-of-descendants-on-left | |
616 | // I think I did this as a sort of "take a breath" after saying something long although one might want to do that | |
617 | // before speaking something long. | |
618 | 19.5k | if after_len < 3 { |
619 | // hack to prevent pausing before "of" in exprs like "the fourth power of secant, of x" | |
620 | // if it should pause anywhere, it should be after the "of" | |
621 | 3.31k | return "".to_string(); |
622 | 16.2k | } |
623 | 16.2k | let pause = std::cmp::min(3000, ((2 * before_len + after_len)/48) * 128); |
624 | // create a TTSCommandRule so we reuse code | |
625 | 16.2k | let command = TTSCommandRule::new( |
626 | 16.2k | TTSCommand::Pause, |
627 | 16.2k | TTSCommandValue::Number(pause as f64), |
628 | 16.2k | ReplacementArray::build_empty(), |
629 | ); | |
630 | 16.2k | return match self { |
631 | 16.2k | TTS::None => self.get_string_none(&command, prefs, true), |
632 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, true), |
633 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, true), |
634 | }; | |
635 | ||
636 | 19.5k | } |
637 | ||
638 | /// Take the longest of the pauses | |
639 | /// | |
640 | /// Two other options are: | |
641 | /// 1. average the pauses | |
642 | /// 2. add the pauses together. | |
643 | /// | |
644 | /// Until evidence points otherwise, use 'longest'. | |
645 | 5.10k | pub fn merge_pauses(&self, str: &str) -> String { |
646 | // we need specialized merges for each TTS engine because we need to know the format of the commands | |
647 | 5.10k | return match self { |
648 | 5.10k | TTS::None => self.merge_pauses_none(str), |
649 | 1 | TTS::SSML => self.merge_pauses_ssml(str), |
650 | 1 | TTS::SAPI5 => self.merge_pauses_sapi5(str), |
651 | }; | |
652 | 5.10k | } |
653 | ||
654 | 5.10k | fn merge_pauses_none(&self, str: &str) -> String { |
655 | // punctuation used for pauses is ",", ";" | |
656 | 2 | static SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+([;,])").unwrap()); // two or more pauses |
657 | 2 | static MULTIPLE_PAUSES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([,;][,;]+)").unwrap()); // two or more pauses |
658 | // we reduce all sequences of two or more pauses to a single medium pause | |
659 | 5.10k | let merges_string = SPACES.replace_all(str, "$1").to_string(); |
660 | 5.10k | let merges_string = MULTIPLE_PAUSES.replace_all(&merges_string, ";").to_string(); |
661 | 5.10k | return merges_string; |
662 | 5.10k | } |
663 | ||
664 | 2 | fn merge_pauses_xml<F>(str: &str, full_attr_re: &Regex, sub_attr_re: &Regex, replace_with: F) -> String |
665 | 2 | where F: Fn(usize) -> String { |
666 | // we reduce all sequences of two or more pauses to the max pause amount | |
667 | // other options would be the sum or an average | |
668 | // maybe some amount a little longer than the max would be best??? | |
669 | 2 | let mut merges_string = str.to_string(); |
670 | 2 | for cap in full_attr_re.captures_iter(str) { |
671 | 2 | let mut amount = 0; |
672 | 4 | for c in |
673 | 4 | amount = std::cmp::max(amount, c[1].parse::<usize>().unwrap()); |
674 | 4 | }; |
675 | 2 | merges_string = merges_string.replace(&cap[0], &replace_with(amount)); |
676 | } | |
677 | 2 | return merges_string; |
678 | 2 | } |
679 | ||
680 | 1 | fn merge_pauses_sapi5(&self, str: &str) -> String { |
681 | 1 | static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<silence msec[^>]+?> *){2,}").unwrap()); // two or more pauses |
682 | 1 | static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"msec=.*?(\d+)").unwrap()); // amount after 'time' |
683 | 1 | let replacement = |amount: usize| format!("<silence msec=='{amount}ms'/>"); |
684 | 1 | return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); |
685 | 1 | } |
686 | ||
687 | 1 | fn merge_pauses_ssml(&self, str: &str) -> String { |
688 | 1 | static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<break time=[^>]+?> *){2,}").unwrap()); // two or more pauses |
689 | 1 | static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"time=.*?(\d+)").unwrap()); // amount after 'time' |
690 | 1 | let replacement = |amount: usize| format!("<break time='{amount}ms'/>"); |
691 | 1 | return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); |
692 | 1 | } |
693 | } | |
694 | ||
695 | #[cfg(test)] | |
696 | mod tests { | |
697 | use super::*; | |
698 | use yaml_rust::YamlLoader; | |
699 | ||
700 | #[test] | |
701 | /// Verifies pronounce YAML builds and renders all supported fields. | |
702 | 1 | fn pronounce_build_and_display() { |
703 | 1 | let yaml = YamlLoader::load_from_str( |
704 | 1 | r#" |
705 | 1 | - text: "alpha" |
706 | 1 | - ipa: "a" |
707 | 1 | - sapi5: "b" |
708 | 1 | - eloquence: "c" |
709 | 1 | "#, |
710 | ) | |
711 | 1 | .unwrap(); |
712 | 1 | let values = &yaml[0]; |
713 | 1 | let rule = TTS::build("pronounce", values).unwrap(); |
714 | 1 | let rendered = format!("{rule}"); |
715 | ||
716 | 1 | assert!(rendered.contains("text: 'alpha'")); |
717 | 1 | assert!(rendered.contains("ipa: 'a'")); |
718 | 1 | assert!(rendered.contains("sapi5: 'b'")); |
719 | 1 | assert!(rendered.contains("eloquence: 'c'")); |
720 | 1 | } |
721 | ||
722 | #[test] | |
723 | /// Ensures pronounce requires a text entry and rejects missing text. | |
724 | 1 | fn pronounce_requires_text() { |
725 | 1 | let yaml = YamlLoader::load_from_str( |
726 | 1 | r#" |
727 | 1 | - ipa: "a" |
728 | 1 | "#, |
729 | ) | |
730 | 1 | .unwrap(); |
731 | 1 | let values = &yaml[0]; |
732 | 1 | let err = TTS::build("pronounce", values).unwrap_err(); |
733 | 1 | assert!(err.to_string().contains("'text' key/value is required")); |
734 | 1 | } |
735 | ||
736 | #[test] | |
737 | /// Coalesces adjacent punctuation pauses for the None engine. | |
738 | 1 | fn merge_pauses_none_coalesces() { |
739 | 1 | let input = "a,,;b"; |
740 | 1 | let output = TTS::None.merge_pauses(input); |
741 | 1 | assert!(!output.contains(",,")); |
742 | 1 | assert!(output.contains(";")); |
743 | 1 | } |
744 | ||
745 | #[test] | |
746 | /// Uses the maximum pause when merging consecutive SSML breaks. | |
747 | 1 | fn merge_pauses_ssml_keeps_max() { |
748 | 1 | let input = "<break time='100ms'/><break time='300ms'/>"; |
749 | 1 | let output = TTS::SSML.merge_pauses(input); |
750 | 1 | assert!(!output.contains("100ms")); |
751 | 1 | assert!(output.contains("300ms")); |
752 | 1 | } |
753 | ||
754 | #[test] | |
755 | /// Uses the maximum pause when merging consecutive SAPI5 breaks. | |
756 | 1 | fn merge_pauses_sapi5_keeps_max() { |
757 | 1 | let input = "<silence msec=='100ms'/><silence msec=='300ms'/>"; |
758 | 1 | let output = TTS::SAPI5.merge_pauses(input); |
759 | 1 | assert!(!output.contains("100ms")); |
760 | 1 | assert!(output.contains("300ms")); |
761 | 1 | } |
762 | } |
Line | Count | Source |
1 | //! #Speech Engine Information | |
2 | //! | |
3 | //! ## Pitch (default 140hz) | |
4 | //! ### SAPI4: Relative pitch | |
5 | //! * Number is relative to the default/current pitch. | |
6 | //! * 50 is 1/2 of the default/current pitch, 200 is 2 times the default/current pitch. | |
7 | //! | |
8 | //! Note: no range is specified by the spec | |
9 | //! ### SAPI5: Relative pitch | |
10 | //! From https://documentation.help/SAPI-5/sapi.xsd | |
11 | //! * A value of +10 sets a voice to speak at four-thirds (or 4/3) of its default pitch. | |
12 | //! * Each increment between –10 and +10 is logarithmically distributed such that | |
13 | //! incrementing/decrementing by 1 is multiplying/dividing the pitch by the 24th root of 2 (about 1.03). | |
14 | //! * Values more extreme than –10 and 10 will be passed to an engine but SAPI 5compliant engines may not support | |
15 | //! such extremes and instead may clip the pitch to the maximum or minimum pitch it supports. | |
16 | //! * Values of –24 and +24 must lower and raise pitch by 1 octave respectively. | |
17 | //! All incrementing/decrementing by 1 must multiply/divide the pitch by the 24th root of 2. | |
18 | //! | |
19 | //! Note: an octave is a doubling of frequency, so pitch change of 100% should turn into +/- 24 | |
20 | //! ### SSML: Relative pitch | |
21 | //! * pitch in hertz (default/current man's voice is about 100hz, woman's 180hz) | |
22 | //! | |
23 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
24 | //! ### Eloquence: Absolute pitch (relative pitch not supported by Eloquence) | |
25 | //! * Range is 0 - 100. Guess is that 0 ~= 42hz, 100 ~= 422hz based on supported \"sapi\" values | |
26 | //! ## Rate (default 180 words/min) | |
27 | //! ### SAPI4: Absolute rate | |
28 | //! * Number is relative to the default/current rate | |
29 | //! * 50 is 1/2 of the default/current rate, 200 is 2 times the default/current rate | |
30 | //! | |
31 | //! Note: no range is specified by the spec | |
32 | //! ### SAPI5: Relative rate | |
33 | //! * Number is in range -10 to 10 | |
34 | //! * -10 is 1/3 of the default/current speed; 10 3 times the default/current speech | |
35 | //! * changes are logarithmic -- a change of +/-1 corresponds to multiplying/dividing by 10th root of 3 (10*log_3(change)) | |
36 | //! ### SSML: Relative rate % | |
37 | //! * 100% is no change, 50% is half the current rate, 200% is doubling the rate | |
38 | //! | |
39 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
40 | //! ### Eloquence: Absolute rate (relative rate not supported by Eloquence) | |
41 | //! * Range is 0 - 250, which manual seems to indicate corresponds to 70 - 1297 words/min. | |
42 | //! * * Window-Eyes only seems to give values in range 1 - 150. | |
43 | //! * On the low end, 1 ~= 72words/min | |
44 | //! * On the high end, I can't tell, but 80 seems to be a bit over twice normal (~400 words/min?) | |
45 | //! 250 ~= 1297 words/min based on supported "sapi" values | |
46 | //! | |
47 | //! Note: this means words/min = 4.18 * Eloquence rate + 66 | |
48 | //! So the relative pause rate is 180/computed value | |
49 | //! | |
50 | //! | |
51 | //! ## Volume (default 100 \[full]) | |
52 | //! ### SAPI4: Relative volume | |
53 | //! * Number is relative to the default/current rate | |
54 | //! * Range is 0 - 065535 | |
55 | //! ### SAPI5: Relative volume | |
56 | //! * Number is in range 0 to 100 | |
57 | //! ### SSML: Relative volume | |
58 | //! * Number is in range 0 to 100 | |
59 | //! | |
60 | //! Note: other legal values for SSML are not supported, and all numbers are interpreted as relative changes | |
61 | //! ### Eloquence: Absolute volume (relative volume not supported by Eloquence) | |
62 | //! * Range is 0 - 100 | |
63 | //! | |
64 | //! ## Pause | |
65 | //! * All systems -- pauses are given in milliseconds | |
66 | //! | |
67 | //! Note: Pauses on output are scaled based on the ratio of the current rate to the default rate (180 wpm) | |
68 | #![allow(clippy::needless_return)] | |
69 | ||
70 | use crate::{errors::*, prefs::PreferenceManager, speech::ReplacementArray}; | |
71 | use sxd_document::dom::Element; | |
72 | use yaml_rust::Yaml; | |
73 | ||
74 | use std::fmt; | |
75 | use crate::speech::{SpeechRulesWithContext, MyXPath, TreeOrString}; | |
76 | use std::string::ToString; | |
77 | use std::str::FromStr; | |
78 | use strum_macros::{Display, EnumString}; | |
79 | use regex::Regex; | |
80 | use std::sync::LazyLock; | |
81 | use sxd_xpath::Value; | |
82 | use html_escape::encode_safe; | |
83 | ||
84 | const MIN_PAUSE:f64 = 50.0; // ms -- avoids clutter of putting out pauses that probably can't be heard | |
85 | const PAUSE_SHORT:f64 = 200.0; // ms | |
86 | const PAUSE_MEDIUM:f64 = 400.0; // ms | |
87 | const PAUSE_LONG:f64 = 800.0; // ms | |
88 | const PAUSE_XLONG:f64 = 1600.0; // ms | |
89 | const PAUSE_AUTO:f64 = 987654321.5; // ms -- hopefully unique | |
90 | pub const PAUSE_AUTO_STR: &str = "\u{F8FA}\u{F8FA}"; | |
91 | const RATE_FROM_CONTEXT:f64 = 987654321.5; // hopefully unique | |
92 | ||
93 | const MAX_TRANSLATE_RECURSION: usize = 5; // probably never more than three -- prevents infinite loop/stack overflows bugs | |
94 | ||
95 | /// TTSCommand are the supported TTS commands | |
96 | /// When parsing the YAML rule files, they are converted to these enums | |
97 | #[derive(Debug, Clone, PartialEq, Eq, Display, EnumString)] | |
98 | #[strum(serialize_all = "snake_case")] // allows lower case | |
99 | pub enum TTSCommand { | |
100 | Pause, | |
101 | Rate, | |
102 | Volume, | |
103 | Pitch, | |
104 | Audio, | |
105 | Gender, | |
106 | Voice, | |
107 | Spell, | |
108 | Bookmark, | |
109 | Pronounce, | |
110 | } | |
111 | ||
112 | #[derive(Debug, Clone)] | |
113 | pub struct Pronounce { | |
114 | text: String, // plain text | |
115 | ipa: String, // ipa | |
116 | sapi5: String, | |
117 | eloquence: String, | |
118 | } | |
119 | ||
120 | ||
121 | impl fmt::Display for Pronounce { | |
122 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
123 | 1 | let mut comma = ""; // comma separator so it looks right |
124 | 1 | write!(f, "pronounce: [") |
125 | 1 | if !self.text.is_empty() { |
126 | 1 | write!(f, "text: '{}'", self.text) |
127 | 1 | comma = ","; |
128 | 0 | } |
129 | 1 | write!(f, "pronounce: [") |
130 | 1 | if !self.ipa.is_empty() { |
131 | 1 | write!(f, "{}ipa: '{}'", comma, self.ipa) |
132 | 1 | comma = ","; |
133 | 0 | } |
134 | 1 | write!(f, "pronounce: [") |
135 | 1 | if !self.sapi5.is_empty() { |
136 | 1 | write!(f, "{}sapi5: '{}'", comma, self.sapi5) |
137 | 1 | comma = ","; |
138 | 0 | } |
139 | 1 | write!(f, "pronounce: [") |
140 | 1 | if !self.eloquence.is_empty() { |
141 | 1 | write!(f, "{}eloquence: '{}'", comma, self.eloquence) |
142 | 0 | } |
143 | 1 | return writeln!(f, "]"); |
144 | 1 | } |
145 | } | |
146 | ||
147 | impl Pronounce { | |
148 | 5.02k | fn build(values: &Yaml) -> Result<Pronounce> { |
149 | use crate::speech::{as_str_checked, yaml_to_type}; | |
150 | use crate::pretty_print::yaml_to_string; | |
151 | ||
152 | 5.02k | let mut text = ""; |
153 | 5.02k | let mut ipa = ""; |
154 | 5.02k | let mut sapi5 = ""; |
155 | 5.02k | let mut eloquence = ""; |
156 | // values should be an array with potential values for Pronounce | |
157 | 5.02k | let values = values.as_vec().ok_or_else(|| |
158 | 0 | anyhow!("'pronounce' value '{}' is not an array", yaml_to_type(values)))?; |
159 | 20.0k | for key_value in |
160 | 20.0k | let key_value_hash = key_value.as_hash().ok_or_else(|| |
161 | 0 | anyhow!("pronounce value '{}' is not key/value pair", yaml_to_string(key_value, 0)))?; |
162 | 20.0k | if key_value_hash.len() != 1 { |
163 | 0 | bail!("pronounce value {:?} is not a single key/value pair", key_value_hash); |
164 | 20.0k | } |
165 | ||
166 | 20.0k | for (key, value) in key_value_hash { |
167 | 20.0k | match as_str_checked(key) |
168 | 20.0k | "text" => text = |
169 | 15.0k | "ipa" => ipa = |
170 | 10.0k | "sapi5" => sapi5 = |
171 | 5.02k | "eloquence" => eloquence = as_str_checked(value) |
172 | 0 | _ => bail!("unknown pronounce type: {} with value {}", yaml_to_string(key, 0), yaml_to_string(value, 0)), |
173 | } | |
174 | } | |
175 | } | |
176 | 5.02k | if text.is_empty() { |
177 | 1 | bail!("'text' key/value is required for 'pronounce' -- it is used is the speech engine is unknown.") |
178 | 5.02k | } |
179 | 5.02k | return Ok( Pronounce{ |
180 | 5.02k | text: text.to_string(), |
181 | 5.02k | ipa: ipa.to_string(), |
182 | 5.02k | sapi5: sapi5.to_string(), |
183 | 5.02k | eloquence: eloquence.to_string() |
184 | 5.02k | } ); |
185 | ||
186 | ||
187 | 5.02k | } |
188 | } | |
189 | /// TTSCommands are either numbers (f64 because of YAML) or strings | |
190 | #[derive(Debug, Clone)] | |
191 | pub enum TTSCommandValue { | |
192 | Number(f64), | |
193 | String(String), | |
194 | XPath(MyXPath), | |
195 | Pronounce(Box<Pronounce>), | |
196 | } | |
197 | ||
198 | impl TTSCommandValue { | |
199 | 77.2k | fn get_num(&self) -> f64 { |
200 | 77.2k | match self { |
201 | 77.2k | TTSCommandValue::Number(n) => return *n, |
202 | 0 | _ => panic!("Internal error: TTSCommandValue is not a number"), |
203 | } | |
204 | 77.2k | } |
205 | ||
206 | 0 | fn get_string(&self) -> &String { |
207 | 0 | match self { |
208 | 0 | TTSCommandValue::String(s) => return s, |
209 | 0 | _ => panic!("Internal error: TTSCommandValue is not a string"), |
210 | } | |
211 | 0 | } |
212 | ||
213 | 0 | fn get_pronounce(&self) -> &Pronounce { |
214 | 0 | match self { |
215 | 0 | TTSCommandValue::Pronounce(p) => return p, |
216 | 0 | _ => panic!("Internal error: TTSCommandValue is not a 'pronounce' command'"), |
217 | } | |
218 | ||
219 | 0 | } |
220 | } | |
221 | ||
222 | /// A TTS rule consists of the command, the value, and its replacement | |
223 | #[derive(Debug, Clone)] | |
224 | pub struct TTSCommandRule { | |
225 | command: TTSCommand, | |
226 | value: TTSCommandValue, | |
227 | replacements: ReplacementArray | |
228 | } | |
229 | ||
230 | impl fmt::Display for TTSCommandRule { | |
231 | 1 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
232 | 1 | let value = match &self.value { |
233 | 0 | TTSCommandValue::String(s) => s.to_string(), |
234 | 0 | TTSCommandValue::Number(f) => f.to_string(), |
235 | 0 | TTSCommandValue::XPath(p) => p.to_string(), |
236 | 1 | TTSCommandValue::Pronounce(p) => p.to_string(), |
237 | }; | |
238 | 1 | if self.command == TTSCommand::Pause { |
239 | 0 | return write!(f, "pause: {value}"); |
240 | } else { | |
241 | 1 | return write!(f, "{}: {}{}", self.command, value, self.replacements); |
242 | }; | |
243 | 1 | } |
244 | } | |
245 | ||
246 | ||
247 | impl TTSCommandRule { | |
248 | 2.43M | pub fn new(command: TTSCommand, value: TTSCommandValue, replacements: ReplacementArray) -> TTSCommandRule { |
249 | 2.43M | return TTSCommandRule{ |
250 | 2.43M | command, |
251 | 2.43M | value, |
252 | 2.43M | replacements |
253 | 2.43M | } |
254 | 2.43M | } |
255 | } | |
256 | ||
257 | /// Supported TTS engines | |
258 | /// These types should do something for all the TTSCommands | |
259 | #[allow(clippy::upper_case_acronyms)] | |
260 | #[allow(dead_code)] | |
261 | #[derive(Debug, Clone, PartialEq, Eq)] | |
262 | pub enum TTS { | |
263 | None, | |
264 | SSML, | |
265 | SAPI5, | |
266 | // Eloquence, | |
267 | // Mac, | |
268 | } | |
269 | ||
270 | impl TTS { | |
271 | /// Given the tts command ("pause", "rate", etc) and its value, build the TTS data structure for it. | |
272 | /// | |
273 | /// `tts_command`: one of "pause", "rate", etc | |
274 | /// | |
275 | /// `value`: keyword 'value' or dict with 'value' and 'replace' (optional) keys | |
276 | 2.41M | pub fn build(tts_command: &str, values: &Yaml) -> Result<Box<TTSCommandRule>> { |
277 | use crate::pretty_print::yaml_to_string; | |
278 | 2.41M | let hashmap = values.as_hash(); |
279 | let tts_value; | |
280 | let replacements; | |
281 | 2.41M | if hashmap.is_some() { |
282 | 446k | tts_value = &values["value"]; |
283 | 446k | if tts_value.is_badvalue() { |
284 | 0 | bail!("{} TTS command is missing a 'value' sub-key. Found\n{}", tts_command, yaml_to_string(values, 1)); |
285 | 446k | }; |
286 | 446k | replacements = ReplacementArray::build(&values["replace"]) |
287 | 1.96M | } else { |
288 | 1.96M | tts_value = values; |
289 | 1.96M | replacements = ReplacementArray::build_empty(); |
290 | 1.96M | } |
291 | 2.41M | let tts_str_value = yaml_to_string(tts_value, 0); |
292 | 2.41M | let tts_str_value = tts_str_value.trim(); |
293 | 2.41M | let tts_enum = match TTSCommand::from_str(tts_command) { |
294 | 2.41M | Ok(t) => t, |
295 | 0 | Err(_) => bail!("Internal error in build_tts: unexpected rule ({:?}) encountered", tts_command), |
296 | }; | |
297 | ||
298 | 2.41M | let |
299 | TTSCommand::Pause | TTSCommand::Rate | TTSCommand::Volume | TTSCommand::Pitch => { | |
300 | // these strings are almost always what the value will be, so we try them first | |
301 | 1.05M | let val = match tts_str_value { |
302 | 1.05M | "auto" => |
303 | 982k | "short" => |
304 | 426k | "medium" => |
305 | 314k | "long" => |
306 | 229k | "xlong" => |
307 | 225k | "$MathRate" => |
308 | 221k | _ => tts_str_value.parse::<f64>() |
309 | }; | |
310 | ||
311 | 1.05M | match val { |
312 | 832k | Ok(num) => TTSCommandValue::Number(num), |
313 | Err(_) => { | |
314 | // let's try as an xpath (e.g., could be '$CapitalLetters_Pitch') | |
315 | TTSCommandValue::XPath( | |
316 | 221k | MyXPath::build(tts_value).with_context(|| |
317 | ) | |
318 | } | |
319 | } | |
320 | }, | |
321 | TTSCommand::Bookmark | TTSCommand::Spell => { | |
322 | TTSCommandValue::XPath( | |
323 | 1.13M | MyXPath::build(values).with_context(|| |
324 | ) | |
325 | }, | |
326 | TTSCommand::Pronounce => { | |
327 | 5.02k | TTSCommandValue::Pronounce( |
328 | }, | |
329 | _ => { | |
330 | 220k | TTSCommandValue::String(tts_str_value.to_string()) |
331 | }, | |
332 | }; | |
333 | 2.41M | return Ok( Box::new( TTSCommandRule::new(tts_enum, tts_command_value, replacements) ) ); |
334 | 2.41M | } |
335 | ||
336 | /// The rule called to execute the TTSCommand `command` | |
337 | /// `prefs` are used for scaling the speech rate | |
338 | /// some rules have MathML nested inside, so we need to do replacements on them (hence `rules` and `mathml` are needed) | |
339 | /// | |
340 | /// A string is returned for the speech engine. | |
341 | /// | |
342 | /// `auto` pausing is handled at a later phase and a special char is used for it | |
343 | 60.7k | pub fn replace<'c, 's:'c, 'm:'c, 'r, T:TreeOrString<'c, 'm, T>>(&self, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<T> { |
344 | 60.7k | return T::replace_tts(self, command, prefs, rules_with_context, mathml); |
345 | 60.7k | } |
346 | ||
347 | 60.7k | pub fn replace_string<'c, 's:'c, 'm, 'r>(&self, command: &TTSCommandRule, prefs: &PreferenceManager, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
348 | // The general idea is we handle the begin tag, the contents, and then the end tag | |
349 | // For the begin/end tag, we dispatch off to specialized code for each TTS engine | |
350 | ||
351 | // 'bookmark' is special in that we need to eval the xpath | |
352 | // rather than pass a bunch of extra info into the generic handling routines, we just deal with them here | |
353 | 60.7k | if command.command == TTSCommand::Bookmark { |
354 | // if we aren't suppose to generate bookmarks, short circuit and just return | |
355 | 26.7k | if prefs.pref_to_string("Bookmark") != "true"{ |
356 | 26.7k | return Ok("".to_string()); |
357 | 0 | } |
358 | 0 | return Ok( match self { |
359 | 0 | TTS::None => "".to_string(), |
360 | 0 | TTS::SSML => compute_bookmark_element(&command.value, "mark name", rules_with_context, mathml)?, |
361 | 0 | TTS::SAPI5 => compute_bookmark_element(&command.value, "bookmark mark", rules_with_context, mathml)?, |
362 | } ); | |
363 | 33.9k | } |
364 | ||
365 | 33.9k | let mut command = command.clone(); |
366 | 33.9k | if command.command == TTSCommand::Spell { |
367 | // spell is also special because we need to eval the xpath to get the string to spell (typically the text content of an mi) | |
368 | 2.77k | match command.value { |
369 | 2.77k | TTSCommandValue::XPath(xpath) => { |
370 | 2.77k | let value = xpath.evaluate(rules_with_context.get_context(), mathml) |
371 | 2.77k | .with_context(|| |
372 | 2.77k | let value_string = match |
373 | 2.24k | Value::String(s) => s, |
374 | 527 | Value::Nodeset(nodes) if nodes.size() == 1 => { |
375 | 527 | let node = nodes.iter().next().unwrap(); |
376 | 527 | if let Some(text) = node.text() { |
377 | 527 | text.text().to_string() |
378 | 0 | } else if let Some(el) = node.element() { |
379 | 0 | if crate::xpath_functions::is_leaf(el) { |
380 | 0 | crate::canonicalize::as_text(el).to_string() |
381 | } else { | |
382 | 0 | bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string", &xpath.to_string()); |
383 | } | |
384 | } else { | |
385 | 0 | bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string, it is {} nodes", |
386 | 0 | &xpath.to_string(), nodes.size()); |
387 | } | |
388 | }, | |
389 | 0 | _ => bail!("in 'spell': value returned from xpath '{}' does not evaluate to a string", &xpath.to_string()), |
390 | }; | |
391 | // Chemistry wants to spell elements like "Na". But we also have the issue of capitalization (SpeechOverrides_CapitalLetters) | |
392 | // so the "N" need to use that. The logic for that is already in unicode.yaml. We could replicate that here. | |
393 | // Rather than duplicate the logic (we would need to handle 'a', and who knows what in other languages), | |
394 | // we split the token into each letter and call the replacement on each letter. | |
395 | // That in turns calls spell again. We end up in an infinite loop. To prevent this we set a flag that says don't recurse. | |
396 | // The only structure to put that in is SpeechRulesWithContext. A bit of a hack to put it there, but better than a static var. | |
397 | // Also, to avoid repeating the code for "cap" over and over, "spell" with "translate" is used. So keep going until no "translate" | |
398 | 2.77k | let xpath_str = xpath.to_string(); |
399 | 2.77k | if rules_with_context.inside_spell && |
400 | 0 | command.value = TTSCommandValue::String(value_string); |
401 | 0 | rules_with_context.translate_count = 0; |
402 | 2.77k | } else if rules_with_context.translate_count > MAX_TRANSLATE_RECURSION { |
403 | 0 | bail!("Rule error: potential infinite recursion found in translate: {}", xpath_str); |
404 | } else { | |
405 | // let the call to replace call spell on the individual chars -- that lets an "cap" be outside "spell" | |
406 | 2.77k | rules_with_context.translate_count += 1; |
407 | 2.77k | let str_with_spaces = value_string.chars() |
408 | 2.94k | . |
409 | 2.94k | rules_with_context.inside_spell = true; |
410 | 2.94k | let spelled_char = rules_with_context.replace_chars(ch.to_string().as_str(), mathml); |
411 | 2.94k | rules_with_context.inside_spell = false; |
412 | 2.94k | spelled_char |
413 | 2.94k | }) |
414 | 2.77k | .collect::<Result<Vec<String>>>() |
415 | 2.77k | .join(" "); |
416 | 2.77k | return Ok(str_with_spaces); |
417 | } | |
418 | }, | |
419 | 0 | _ => bail!("Implementation error: found non-xpath value for spell"), |
420 | } | |
421 | 31.1k | } else if command.command == TTSCommand::Rate && |
422 | 0 | let TTSCommandValue::Number(number_value) = command.value && |
423 | 0 | number_value == RATE_FROM_CONTEXT { |
424 | // handle hack for $Rate -- need to look up in context | |
425 | 0 | let rate_from_context = crate::navigate::context_get_variable(rules_with_context.get_context(), "MathRate", mathml)?.parse::<usize>().unwrap_or(100); |
426 | 0 | command.value = TTSCommandValue::Number(rate_from_context as f64); |
427 | 31.1k | } |
428 | ||
429 | // evaluate any xpath value now to simplify later code | |
430 | 31.1k | if let TTSCommandValue::XPath( |
431 | 1.31k | let eval_str = xpath.replace::<String>(rules_with_context, mathml) |
432 | // can it be a number? | |
433 | 1.31k | command.value = match eval_str.parse::<f64>() { |
434 | 1.31k | Ok(num) => TTSCommandValue::Number(num), |
435 | 0 | Err(_) => TTSCommandValue::String(eval_str), |
436 | } | |
437 | 29.8k | }; |
438 | ||
439 | ||
440 | // small optimization to avoid generating tags that do nothing | |
441 | 31.1k | if ((command.command == TTSCommand::Pitch || |
442 | 29.8k | (command.command == TTSCommand::Rate && |
443 | 1.31k | return command.replacements.replace::<String>(rules_with_context, mathml); |
444 | 29.8k | } |
445 | ||
446 | 29.8k | let mut result = String::with_capacity(255); |
447 | 29.8k | result += &match self { |
448 | 29.8k | TTS::None => self.get_string_none(&command, prefs, true), |
449 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, true), |
450 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, true), |
451 | }; | |
452 | ||
453 | ||
454 | 29.8k | if !command.replacements.is_empty() { |
455 | 0 | if result.is_empty() { |
456 | 0 | result += " "; |
457 | 0 | } |
458 | // need to sanitize string so that SSML is not injected into it via mtext, etc. | |
459 | 0 | let speech = command.replacements.replace::<String>(rules_with_context, mathml)?; |
460 | 0 | result += &encode_safe(&speech); |
461 | 29.8k | } |
462 | ||
463 | 29.8k | let end_tag = match self { |
464 | 29.8k | TTS::None => self.get_string_none(&command, prefs, false), |
465 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, false), |
466 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, false), |
467 | }; | |
468 | ||
469 | 29.8k | if end_tag.is_empty() { |
470 | 29.8k | return Ok( result ); // avoids adding in " " |
471 | } else { | |
472 | 0 | return Ok( result + &end_tag ); |
473 | } | |
474 | ||
475 | ||
476 | 0 | fn compute_bookmark_element<'c, 's:'c, 'm, 'r>(value: &TTSCommandValue, tag_and_attr: &str, rules_with_context: &'r mut SpeechRulesWithContext<'c, 's, 'm>, mathml: Element<'c>) -> Result<String> { |
477 | 0 | match value { |
478 | 0 | TTSCommandValue::XPath(xpath) => { |
479 | 0 | let id = xpath.replace::<String>(rules_with_context, mathml)?; |
480 | 0 | return Ok( format!("<{tag_and_attr}='{id}'/>") ); |
481 | }, | |
482 | 0 | _ => bail!("Implementation error: found bookmark value that did not evaluate to a string"), |
483 | } | |
484 | 0 | } |
485 | ||
486 | 60.7k | } |
487 | ||
488 | // auto pausing can't be known until neighboring strings are computed | |
489 | // we create a unique string in this case and compute the real value later | |
490 | 75.9k | fn get_string_none(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
491 | // they only thing to do is handle "pause" with some punctuation hacks along with 'spell' | |
492 | 75.9k | if is_start_tag { |
493 | 46.1k | if command.command == TTSCommand::Pause { |
494 | 46.1k | let amount = command.value.get_num(); |
495 | // only ',' and ';' are used as '.' didn't seem to reliably generate pauses in tests | |
496 | 46.1k | return crate::speech::CONCAT_INDICATOR.to_string() + ( |
497 | 46.1k | if amount == PAUSE_AUTO { |
498 | 19.5k | PAUSE_AUTO_STR |
499 | } else { | |
500 | 26.5k | let amount = amount * TTS::get_pause_multiplier(prefs); |
501 | 26.5k | if amount <= MIN_PAUSE { |
502 | 11.3k | "" |
503 | 15.1k | } else if amount <= 250.0 { |
504 | 9.93k | "," |
505 | } else { | |
506 | 5.21k | ";" |
507 | } | |
508 | } | |
509 | ); | |
510 | 32 | } else if command.command == TTSCommand::Spell { |
511 | // debug!("spell rule: {}", command.value.get_string()); | |
512 | 0 | return command.value.get_string().to_string(); |
513 | 32 | } else if let TTSCommandValue::Pronounce(p) = &command.value { |
514 | 32 | return crate::speech::CONCAT_INDICATOR.to_string() + &p.text; |
515 | 0 | } |
516 | 29.8k | }; |
517 | 29.8k | return "".to_string(); |
518 | 75.9k | } |
519 | ||
520 | 0 | fn get_string_sapi5(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
521 | 0 | return match &command.command { |
522 | 0 | TTSCommand::Pause => if is_start_tag { |
523 | 0 | let amount = command.value.get_num(); |
524 | 0 | if amount == PAUSE_AUTO { |
525 | 0 | PAUSE_AUTO_STR.to_string() |
526 | } else { | |
527 | 0 | let amount = amount * TTS::get_pause_multiplier(prefs); |
528 | 0 | if amount > MIN_PAUSE { |
529 | 0 | format!("<silence msec=='{}ms'/>", (amount * 180.0/prefs.get_rate()).round()) |
530 | } else { | |
531 | 0 | "".to_string() |
532 | } | |
533 | } | |
534 | } else { | |
535 | 0 | "".to_string() |
536 | }, | |
537 | // pitch must be in [-10, 10], logarithmic based on octaves | |
538 | // note MathPlayer uses 'absmiddle' (requires keeping a stack) -- could be 'middle' is not well supported | |
539 | 0 | TTSCommand::Pitch => if is_start_tag {format!("<pitch middle=\"{}\">", (24.0*(1.0+command.value.get_num()/100.0).log2()).round())} else {String::from("</prosody>")}, |
540 | // rate must be in [-10, 10], but we get relative %s. 300% => 10 (see comments at top of file) | |
541 | 0 | TTSCommand::Rate => if is_start_tag {format!("<rate speed='{:.1}'>", 10.0*(0.01*command.value.get_num()).log(3.0))} else {String::from("</rate>")}, |
542 | 0 | TTSCommand::Volume =>if is_start_tag {format!("<volume level='{}'>", command.value.get_num())} else {String::from("</volume>")}, |
543 | 0 | TTSCommand::Audio => "".to_string(), // SAPI5 doesn't support audio |
544 | 0 | TTSCommand::Gender =>if is_start_tag {format!("<voice required=\"Gender={}\">", command.value.get_string())} else {String::from("</prosody>")}, |
545 | 0 | TTSCommand::Voice =>if is_start_tag {format!("<voice required=\"Name={}\">", command.value.get_string())} else {String::from("</prosody>")}, |
546 | 0 | TTSCommand::Spell =>if is_start_tag {format!("<spell>{}", command.value.get_string())} else {String::from("</spell>")}, |
547 | 0 | TTSCommand::Pronounce =>if is_start_tag { |
548 | 0 | format!("<pron sym='{}'>{}", &command.value.get_pronounce().sapi5, &command.value.get_pronounce().text) |
549 | } else { | |
550 | 0 | String::from("</pron>") |
551 | }, | |
552 | 0 | TTSCommand::Bookmark => panic!("Internal error: bookmarks should have been handled earlier"), |
553 | }; | |
554 | 0 | } |
555 | ||
556 | 0 | fn get_string_ssml(&self, command: &TTSCommandRule, prefs: &PreferenceManager, is_start_tag: bool) -> String { |
557 | 0 | return match &command.command { |
558 | TTSCommand::Pause => { | |
559 | 0 | if is_start_tag { |
560 | 0 | let amount = command.value.get_num(); |
561 | 0 | if amount == PAUSE_AUTO { |
562 | 0 | PAUSE_AUTO_STR.to_string() |
563 | } else { | |
564 | 0 | let amount = amount * TTS::get_pause_multiplier(prefs); |
565 | 0 | if amount > MIN_PAUSE { |
566 | 0 | format!("<break time='{}ms'/>", (amount * 180.0/prefs.get_rate()).round()) |
567 | } else { | |
568 | 0 | "".to_string() |
569 | } | |
570 | } | |
571 | } else { | |
572 | 0 | "".to_string() |
573 | } | |
574 | }, | |
575 | 0 | TTSCommand::Pitch => if is_start_tag {format!("<prosody pitch='{}%'>", command.value.get_num())} else {String::from("</prosody>")}, |
576 | 0 | TTSCommand::Rate => if is_start_tag {format!("<prosody rate='{}%'>", command.value.get_num())} else {String::from("</prosody>")}, |
577 | 0 | TTSCommand::Volume =>if is_start_tag {format!("<prosody volume='{}db'>", command.value.get_num())} else {String::from("</prosody>")}, |
578 | 0 | TTSCommand::Audio =>if is_start_tag {format!("<audio src='{}'>", command.value.get_string())} else {String::from("</audio>")}, // only 'beep' is supported for now |
579 | 0 | TTSCommand::Gender =>if is_start_tag {format!("<voice required='gender=\"{}\"'>", command.value.get_string())} else {String::from("</voice>")}, |
580 | 0 | TTSCommand::Voice =>if is_start_tag {format!("<voice required='{}'>", command.value.get_string())} else {String::from("</voice>")}, |
581 | 0 | TTSCommand::Spell =>if is_start_tag {format!("<say-as interpret-as='characters'>{}", command.value.get_string())} else {String::from("</say-as>")}, |
582 | 0 | TTSCommand::Pronounce =>if is_start_tag { |
583 | 0 | format!("<phoneme alphabet='ipa' ph='{}'>{}", &command.value.get_pronounce().ipa, &command.value.get_pronounce().text) |
584 | } else { | |
585 | 0 | String::from("</phoneme>") |
586 | }, | |
587 | 0 | TTSCommand::Bookmark => panic!("Internal error: bookmarks should have been handled earlier"), |
588 | } | |
589 | 0 | } |
590 | ||
591 | 26.5k | fn get_pause_multiplier(prefs: &PreferenceManager) -> f64 { |
592 | 26.5k | return prefs.pref_to_string("PauseFactor").parse::<f64>().unwrap_or(100.)/100.0; |
593 | 26.5k | } |
594 | ||
595 | /// Compute the length of the pause to use. | |
596 | /// | |
597 | /// The computation is based on the length of the speech strings (after removing tagging). | |
598 | /// There is a bias towards pausing more _after_ longer strings. | |
599 | 19.5k | pub fn compute_auto_pause(&self, prefs: &PreferenceManager, before: &str, after: &str) -> String { |
600 | 0 | static REMOVE_XML: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<.+?>").unwrap()); // punctuation ending with a '.' |
601 | let before_len; | |
602 | let after_len; | |
603 | 19.5k | match self { |
604 | 0 | TTS::SSML | TTS::SAPI5 => { |
605 | 0 | before_len = REMOVE_XML.replace_all(before, "").len(); |
606 | 0 | after_len = REMOVE_XML.replace_all(after, "").len(); |
607 | 0 | }, |
608 | 19.5k | _ => { |
609 | 19.5k | before_len = before.len(); |
610 | 19.5k | after_len = after.len(); |
611 | 19.5k | }, |
612 | } | |
613 | ||
614 | // pause values are not cut in stone | |
615 | // the calculation bias to 'previous' is based on MathPlayer which used '30 * #-of-descendants-on-left | |
616 | // I think I did this as a sort of "take a breath" after saying something long although one might want to do that | |
617 | // before speaking something long. | |
618 | 19.5k | if after_len < 3 { |
619 | // hack to prevent pausing before "of" in exprs like "the fourth power of secant, of x" | |
620 | // if it should pause anywhere, it should be after the "of" | |
621 | 3.31k | return "".to_string(); |
622 | 16.2k | } |
623 | 16.2k | let pause = std::cmp::min(3000, ((2 * before_len + after_len)/48) * 128); |
624 | // create a TTSCommandRule so we reuse code | |
625 | 16.2k | let command = TTSCommandRule::new( |
626 | 16.2k | TTSCommand::Pause, |
627 | 16.2k | TTSCommandValue::Number(pause as f64), |
628 | 16.2k | ReplacementArray::build_empty(), |
629 | ); | |
630 | 16.2k | return match self { |
631 | 16.2k | TTS::None => self.get_string_none(&command, prefs, true), |
632 | 0 | TTS::SSML => self.get_string_ssml(&command, prefs, true), |
633 | 0 | TTS::SAPI5 => self.get_string_sapi5(&command, prefs, true), |
634 | }; | |
635 | ||
636 | 19.5k | } |
637 | ||
638 | /// Take the longest of the pauses | |
639 | /// | |
640 | /// Two other options are: | |
641 | /// 1. average the pauses | |
642 | /// 2. add the pauses together. | |
643 | /// | |
644 | /// Until evidence points otherwise, use 'longest'. | |
645 | 5.10k | pub fn merge_pauses(&self, str: &str) -> String { |
646 | // we need specialized merges for each TTS engine because we need to know the format of the commands | |
647 | 5.10k | return match self { |
648 | 5.10k | TTS::None => self.merge_pauses_none(str), |
649 | 1 | TTS::SSML => self.merge_pauses_ssml(str), |
650 | 1 | TTS::SAPI5 => self.merge_pauses_sapi5(str), |
651 | }; | |
652 | 5.10k | } |
653 | ||
654 | 5.10k | fn merge_pauses_none(&self, str: &str) -> String { |
655 | // punctuation used for pauses is ",", ";" | |
656 | 2 | static SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+([;,])").unwrap()); // two or more pauses |
657 | 2 | static MULTIPLE_PAUSES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([,;][,;]+)").unwrap()); // two or more pauses |
658 | // we reduce all sequences of two or more pauses to a single medium pause | |
659 | 5.10k | let merges_string = SPACES.replace_all(str, "$1").to_string(); |
660 | 5.10k | let merges_string = MULTIPLE_PAUSES.replace_all(&merges_string, ";").to_string(); |
661 | 5.10k | return merges_string; |
662 | 5.10k | } |
663 | ||
664 | 2 | fn merge_pauses_xml<F>(str: &str, full_attr_re: &Regex, sub_attr_re: &Regex, replace_with: F) -> String |
665 | 2 | where F: Fn(usize) -> String { |
666 | // we reduce all sequences of two or more pauses to the max pause amount | |
667 | // other options would be the sum or an average | |
668 | // maybe some amount a little longer than the max would be best??? | |
669 | 2 | let mut merges_string = str.to_string(); |
670 | 2 | for cap in full_attr_re.captures_iter(str) { |
671 | 2 | let mut amount = 0; |
672 | 4 | for c in |
673 | 4 | amount = std::cmp::max(amount, c[1].parse::<usize>().unwrap()); |
674 | 4 | }; |
675 | 2 | merges_string = merges_string.replace(&cap[0], &replace_with(amount)); |
676 | } | |
677 | 2 | return merges_string; |
678 | 2 | } |
679 | ||
680 | 1 | fn merge_pauses_sapi5(&self, str: &str) -> String { |
681 | 1 | static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<silence msec[^>]+?> *){2,}").unwrap()); // two or more pauses |
682 | 1 | static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"msec=.*?(\d+)").unwrap()); // amount after 'time' |
683 | 1 | let replacement = |amount: usize| format!("<silence msec=='{amount}ms'/>"); |
684 | 1 | return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); |
685 | 1 | } |
686 | ||
687 | 1 | fn merge_pauses_ssml(&self, str: &str) -> String { |
688 | 1 | static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<break time=[^>]+?> *){2,}").unwrap()); // two or more pauses |
689 | 1 | static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"time=.*?(\d+)").unwrap()); // amount after 'time' |
690 | 1 | let replacement = |amount: usize| format!("<break time='{amount}ms'/>"); |
691 | 1 | return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); |
692 | 1 | } |
693 | } | |
694 | ||
695 | #[cfg(test)] | |
696 | mod tests { | |
697 | use super::*; | |
698 | use yaml_rust::YamlLoader; | |
699 | ||
700 | #[test] | |
701 | /// Verifies pronounce YAML builds and renders all supported fields. | |
702 | 1 | fn pronounce_build_and_display() { |
703 | 1 | let yaml = YamlLoader::load_from_str( |
704 | 1 | r#" |
705 | 1 | - text: "alpha" |
706 | 1 | - ipa: "a" |
707 | 1 | - sapi5: "b" |
708 | 1 | - eloquence: "c" |
709 | 1 | "#, |
710 | ) | |
711 | 1 | .unwrap(); |
712 | 1 | let values = &yaml[0]; |
713 | 1 | let rule = TTS::build("pronounce", values).unwrap(); |
714 | 1 | let rendered = format!("{rule}"); |
715 | ||
716 | 1 | assert!(rendered.contains("text: 'alpha'")); |
717 | 1 | assert!(rendered.contains("ipa: 'a'")); |
718 | 1 | assert!(rendered.contains("sapi5: 'b'")); |
719 | 1 | assert!(rendered.contains("eloquence: 'c'")); |
720 | 1 | } |
721 | ||
722 | #[test] | |
723 | /// Ensures pronounce requires a text entry and rejects missing text. | |
724 | 1 | fn pronounce_requires_text() { |
725 | 1 | let yaml = YamlLoader::load_from_str( |
726 | 1 | r#" |
727 | 1 | - ipa: "a" |
728 | 1 | "#, |
729 | ) | |
730 | 1 | .unwrap(); |
731 | 1 | let values = &yaml[0]; |
732 | 1 | let err = TTS::build("pronounce", values).unwrap_err(); |
733 | 1 | assert!(err.to_string().contains("'text' key/value is required")); |
734 | 1 | } |
735 | ||
736 | #[test] | |
737 | /// Coalesces adjacent punctuation pauses for the None engine. | |
738 | 1 | fn merge_pauses_none_coalesces() { |
739 | 1 | let input = "a,,;b"; |
740 | 1 | let output = TTS::None.merge_pauses(input); |
741 | 1 | assert!(!output.contains(",,")); |
742 | 1 | assert!(output.contains(";")); |
743 | 1 | } |
744 | ||
745 | #[test] | |
746 | /// Uses the maximum pause when merging consecutive SSML breaks. | |
747 | 1 | fn merge_pauses_ssml_keeps_max() { |
748 | 1 | let input = "<break time='100ms'/><break time='300ms'/>"; |
749 | 1 | let output = TTS::SSML.merge_pauses(input); |
750 | 1 | assert!(!output.contains("100ms")); |
751 | 1 | assert!(output.contains("300ms")); |
752 | 1 | } |
753 | ||
754 | #[test] | |
755 | /// Uses the maximum pause when merging consecutive SAPI5 breaks. | |
756 | 1 | fn merge_pauses_sapi5_keeps_max() { |
757 | 1 | let input = "<silence msec=='100ms'/><silence msec=='300ms'/>"; |
758 | 1 | let output = TTS::SAPI5.merge_pauses(input); |
759 | 1 | assert!(!output.contains("100ms")); |
760 | 1 | assert!(output.contains("300ms")); |
761 | 1 | } |
762 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | //! XPath underlies rule matching and speech generation. The version of xpath used is based on xpath 1.0 | |
3 | //! and includes the ability to define functions and variables. | |
4 | //! The variables defined are all the preferences and also variables set in speech rules via the `variables` keyword. | |
5 | //! The function defined here are: | |
6 | //! * `IsNode(node, kind)`: returns true if the node matches the "kind". | |
7 | //! Valid values are "leaf", "2D", "simple", "common_fraction", "trig_name". | |
8 | //! * `ToOrdinal(number, fractional, plural)`: converts the number to an ordinal (e.g, third) | |
9 | //! * `number` -- the number to translate | |
10 | //! * `fractional` -- true if this is a fractional ordinal (e.g, "half") | |
11 | //! * `plural` -- true if answer should be plural | |
12 | //! * `ToCommonFraction(mfrac)` -- converts the fraction to an ordinal version (e.g, 2 thirds) | |
13 | //! * `IsLargeOp(node)` -- returns true if the node is a large operator (e.g, integral or sum) | |
14 | //! * `IsBracketed(node, left, right, requires_comma)` -- returns true if the first/last element in the mrow match `left`/`right`. | |
15 | //! If the optional `requires_comma` argument is given and is `true`, then there also must be a "," in the mrow (e.g., "f(x,y)") | |
16 | //! * `DEBUG(xpath)` -- _Very_ useful function for debugging speech rules. | |
17 | //! This can be used to surround a whole or part of an xpath expression in a match or output. | |
18 | //! The result will be printed to standard output and the result returned so that `DEBUG` does not affect the computation. | |
19 | ||
20 | use sxd_document::dom::{Element, ChildOfElement}; | |
21 | use sxd_xpath::{Value, Context, context, function::*, nodeset::*}; | |
22 | use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS}; | |
23 | use regex::Regex; | |
24 | use crate::pretty_print::mml_to_string; | |
25 | use std::cell::{Ref, RefCell}; | |
26 | use log::{debug, error, warn}; | |
27 | use std::sync::LazyLock; | |
28 | use std::thread::LocalKey; | |
29 | use phf::phf_set; | |
30 | use sxd_xpath::function::Error as XPathError; | |
31 | use crate::canonicalize::{as_element, name, get_parent, MATHML_FROM_NAME_ATTR}; | |
32 | ||
33 | // useful utility functions | |
34 | // note: child of an element is a ChildOfElement, so sometimes it is useful to have parallel functions, | |
35 | // one for Element and one for ChildOfElement. | |
36 | ||
37 | // @returns {String} -- the text of the (leaf) element otherwise an empty string | |
38 | 126k | fn get_text_from_element(e: Element) -> String { |
39 | 126k | if e.children().len() == 1 && |
40 | 103k | let ChildOfElement::Text( |
41 | 102k | return t.text().to_string(); |
42 | 23.4k | } |
43 | 23.4k | return "".to_string(); |
44 | 126k | } |
45 | ||
46 | #[allow(non_snake_case)] | |
47 | // Same as 'is_tag', but for ChildOfElement | |
48 | 110k | fn get_text_from_COE(coe: &ChildOfElement) -> String { |
49 | 110k | let element = coe.element(); |
50 | 110k | return match element { |
51 | 110k | Some(e) => get_text_from_element(e), |
52 | 0 | None => "".to_string(), |
53 | }; | |
54 | 110k | } |
55 | ||
56 | // make sure that there is only one node in the NodeSet | |
57 | // Returns the node or an Error | |
58 | 147k | pub fn validate_one_node<'n>(nodes: Nodeset<'n>, func_name: &str) -> Result<Node<'n>, Error> { |
59 | 147k | if nodes.size() == 0 { |
60 | 0 | return Err(Error::Other(format!("Missing argument for {func_name}"))); |
61 | 147k | } else if nodes.size() > 1 { |
62 | 0 | return Err( Error::Other(format!("{} arguments for {}; expected 1 argument", nodes.size(), func_name)) ); |
63 | 147k | } |
64 | 147k | return Ok( nodes.iter().next().unwrap() ); |
65 | 147k | } |
66 | ||
67 | // Return true if the element's name is 'name' | |
68 | 157k | fn is_tag(e: Element, name: &str) -> bool { |
69 | // need to check name before the fallback of where the name came from | |
70 | 157k | return e.name().local_part() == name || |
71 | 157k | } |
72 | ||
73 | #[allow(non_snake_case)] | |
74 | // Same as 'is_tag', but for ChildOfElement | |
75 | 1.40k | fn is_COE_tag(coe: ChildOfElement, name: &str) -> bool { |
76 | 1.40k | let element = coe.element(); |
77 | 1.40k | return element.is_some() && is_tag(element.unwrap(), name) |
78 | 1.40k | } |
79 | ||
80 | /// Should be an internal structure for implementation of the IsNode, but it was useful in one place in a separate module. | |
81 | /// This should probably be restructured slightly. | |
82 | pub struct IsNode; | |
83 | ||
84 | impl IsNode { | |
85 | /// implements ClearSpeak's definition of "simple" | |
86 | /// this is fairly detailed, so we define a few local functions (at end) to help out | |
87 | /// Also, it doesn't help that the structure is a bit complicated Elements->ChildOfElement->Element/Text | |
88 | 7.44k | pub fn is_simple(elem: Element) -> bool { |
89 | 7.44k | if is_trivially_simple(elem) { |
90 | 3.63k | return true; |
91 | 3.81k | } |
92 | ||
93 | 3.81k | if is_negative_of_trivially_simple(elem) { |
94 | // -3 or -x | |
95 | 41 | return true; |
96 | 3.76k | } |
97 | ||
98 | 3.76k | if !is_tag(elem, "mrow") || |
99 | 2.90k | return false; |
100 | 867 | } |
101 | ||
102 | // x y or -x or -3 x or -x y or -3 x y or x° or n° or -x° or -n° | |
103 | #[allow(clippy::if_same_then_else)] | |
104 | 867 | if is_times_mi(elem) { |
105 | 42 | return true; // x y |
106 | 825 | } else if is_degrees(elem) { |
107 | 0 | return true; // x° or n° |
108 | 825 | } else if is_function(elem) { |
109 | 44 | return true; |
110 | 781 | } |
111 | ||
112 | 781 | return false; |
113 | ||
114 | ||
115 | // returns the element's text value | |
116 | 5.72k | fn to_str(e: Element<'_>) -> &str { |
117 | // typically usage assumes 'e' is a leaf | |
118 | // bad MathML is the following isn't true | |
119 | 5.72k | if e.children().len() == 1 { |
120 | 5.72k | let text_node = e.children()[0]; |
121 | 5.72k | if let Some(t) = text_node.text() { |
122 | 5.72k | return t.text(); |
123 | 0 | } |
124 | 0 | } |
125 | 0 | return ""; |
126 | 5.72k | } |
127 | ||
128 | // same as 'to_str' but for ChildOfElement | |
129 | 1.01k | fn coe_to_str(coe: ChildOfElement<'_>) -> &str { |
130 | // typically usage assumes 'coe' is a leaf | |
131 | 1.01k | let element_node = coe.element(); |
132 | 1.01k | if let Some(e) = element_node { |
133 | // bad MathML is the following isn't true | |
134 | 1.01k | if e.children().len() == 1 { |
135 | 1.01k | let text_node = e.children()[0]; |
136 | 1.01k | if let Some(t) = text_node.text() { |
137 | 1.01k | return t.text(); |
138 | 0 | } |
139 | 8 | } |
140 | 0 | } |
141 | 8 | return ""; |
142 | 1.01k | } |
143 | ||
144 | // returns true if the string is just a single *char* (which can be multiple bytes) | |
145 | 5.72k | fn is_single_char(str: &str) -> bool { |
146 | 5.72k | let mut chars = str.chars(); |
147 | 5.72k | return chars.next().is_some() && chars.next().is_none(); |
148 | 5.72k | } |
149 | ||
150 | // checks the single element to see if it is simple (mn, mi that is a single char, common fraction) | |
151 | 8.33k | fn is_trivially_simple(elem: Element) -> bool { |
152 | 8.33k | if is_tag(elem, "mn") { |
153 | 914 | return true; |
154 | 7.42k | } |
155 | 7.42k | if is_tag(elem, "mi") && |
156 | // "simple" only if it is a single char (which can be multiple bytes) | |
157 | 3.14k | return true; |
158 | 4.27k | } |
159 | ||
160 | // FIX: need to consult preference Fraction_Ordinal | |
161 | 4.27k | if IsNode::is_common_fraction(elem, 10, 19) { |
162 | 66 | return true; |
163 | 4.21k | } |
164 | 4.21k | return false; |
165 | 8.33k | } |
166 | ||
167 | // true if the negative of a single element that is simple | |
168 | 4.20k | fn is_negative_of_trivially_simple(elem: Element) -> bool { |
169 | 4.20k | if is_tag(elem, "mrow") && |
170 | 38 | let children = elem.children(); |
171 | // better be negative of something at this point... | |
172 | 38 | if is_COE_tag(children[0], "mo") && |
173 | 6 | children[1].element().is_some() && is_trivially_simple(children[1].element().unwrap()) { |
174 | 6 | return true; |
175 | 32 | } |
176 | 4.16k | } |
177 | 4.20k | if is_tag(elem, "minus") && |
178 | 54 | let child = elem.children()[0]; |
179 | 54 | if let Some(e) = child.element() { |
180 | 54 | return is_trivially_simple(e); |
181 | 0 | } |
182 | 4.14k | } |
183 | ||
184 | 4.14k | return false; |
185 | 4.20k | } |
186 | ||
187 | // return true if ChildOfElement has exactly text 'ch' | |
188 | 967 | fn is_equal(coe: ChildOfElement, ch: char) -> bool { |
189 | 967 | return coe_to_str(coe).starts_with(ch); |
190 | 967 | } |
191 | ||
192 | // true if mrow(xxx, ⁢, mi) or mrow(xxx, ⁢ mi, ⁢, mi) where mi's have len==1 | |
193 | 867 | fn is_times_mi(mrow: Element) -> bool { |
194 | 867 | assert!( is_tag(mrow, "mrow") ); |
195 | 867 | let children = mrow.children(); |
196 | 867 | if !(children.len() == 3 || |
197 | 34 | return false; |
198 | 833 | } |
199 | 833 | if children[0].element().is_none() { |
200 | 0 | return false; |
201 | 833 | } |
202 | ||
203 | 833 | let first_child = children[0].element().unwrap(); |
204 | 833 | if !is_trivially_simple(first_child) { |
205 | 396 | if !is_negative_of_trivially_simple(first_child) { |
206 | 382 | return false; |
207 | 14 | } |
208 | 14 | if children.len() == 5 && |
209 | 2 | ( (name(first_child) == "minus" && |
210 | 2 | (name(first_child) == "mrow" && !is_COE_tag(first_child.children()[1], "mn")) ) { |
211 | 1 | return false; // '-x y z' is too complicated () -- -2 x y is ok |
212 | 13 | } |
213 | 437 | } |
214 | ||
215 | 450 | if !(is_COE_tag(children[1], "mo") && |
216 | 450 | is_equal(children[1], '\u{2062}') && |
217 | 63 | is_COE_tag(children[2], "mi") && |
218 | 51 | coe_to_str(children[2]).len()==1 ) { |
219 | 408 | return false; |
220 | 42 | } |
221 | ||
222 | 42 | if children.len() == 3 { |
223 | 41 | return true; |
224 | 1 | } |
225 | ||
226 | // len == 5 | |
227 | 1 | return is_COE_tag(children[3], "mo") && |
228 | 1 | is_equal(children[3], '\u{2062}') && // invisible times |
229 | 1 | is_COE_tag(children[4], "mi") && |
230 | 1 | coe_to_str(children[4]).len()==1 ; |
231 | 867 | } |
232 | ||
233 | // return true if the mrow is var° or num° | |
234 | 825 | fn is_degrees(mrow: Element) -> bool { |
235 | 825 | assert!( is_tag(mrow, "mrow") ); |
236 | 825 | let children = mrow.children(); |
237 | 825 | return children.len() == 2 && |
238 | 32 | is_equal(children[1], '°') && |
239 | 0 | (is_COE_tag(children[0], "mi") || |
240 | 0 | is_COE_tag(children[0], "mn") ); |
241 | 825 | } |
242 | ||
243 | // fn_name ⁡ [simple arg or (simple arg)] | |
244 | 825 | fn is_function(mrow: Element) -> bool { |
245 | 825 | assert!( is_tag(mrow, "mrow") ); |
246 | 825 | let children = mrow.children(); |
247 | 825 | if children.len() != 3 { |
248 | 40 | return false; |
249 | 785 | } |
250 | 785 | if !(is_COE_tag(children[1], "mo") && |
251 | 473 | is_equal(children[1], '\u{2061}') ) { // invisible function application |
252 | 717 | return false; |
253 | 68 | } |
254 | 68 | if !is_COE_tag(children[0], "mi") { |
255 | 0 | return false; |
256 | 68 | } |
257 | 68 | let function_arg = children[2].element().unwrap(); |
258 | 68 | if IsBracketed::is_bracketed(function_arg, "(", ")", false, false) { |
259 | 60 | return IsNode::is_simple(function_arg.children()[1].element().unwrap()); |
260 | } else { | |
261 | 8 | return IsNode::is_simple(function_arg); |
262 | } | |
263 | 825 | } |
264 | 7.44k | } |
265 | ||
266 | // Returns true if 'frac' is a common fraction | |
267 | // In this case, the numerator and denominator can be no larger than 'num_limit' and 'denom_limit' | |
268 | 4.31k | fn is_common_fraction(frac: Element, num_limit: usize, denom_limit: usize) -> bool { |
269 | 2 | static ALL_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap()); // match one or more digits |
270 | ||
271 | 4.31k | if !is_tag(frac, "mfrac") && |
272 | 4.12k | return false; |
273 | 188 | } |
274 | 188 | let children = frac.children(); |
275 | 188 | if children.len() != 2 { |
276 | 0 | return false; |
277 | 188 | } |
278 | ||
279 | 188 | let num = children[0].element(); |
280 | 188 | let denom = children[1].element(); |
281 | 188 | if num.is_none() || denom.is_none() { |
282 | 0 | return false; |
283 | 188 | }; |
284 | ||
285 | 188 | let num = num.unwrap(); |
286 | 188 | let denom = denom.unwrap(); |
287 | 188 | if !is_tag(num, "mn") || |
288 | 87 | return false |
289 | 101 | }; |
290 | ||
291 | 101 | let num = get_text_from_element(num); |
292 | 101 | let denom = get_text_from_element(denom); |
293 | 101 | if num.is_empty() || denom.is_empty() { |
294 | 0 | return false; |
295 | 101 | } |
296 | ||
297 | 101 | return ALL_DIGITS.is_match(&num) && is_small_enough(&num, num_limit) && |
298 | 100 | ALL_DIGITS.is_match(&denom) && is_small_enough(&denom, denom_limit); |
299 | ||
300 | 201 | fn is_small_enough(val: &str, upper_bound: usize) -> bool { |
301 | 201 | return if let Ok(value) = val.parse::<usize>() { value <= upper_bound } else { |
302 | 201 | } |
303 | 4.31k | } |
304 | ||
305 | 14.2k | pub fn is_mathml(elem: Element) -> bool { |
306 | // doesn't check MATHML_FROM_NAME_ATTR because we are interested in if it is an intent. | |
307 | 14.2k | return ALL_MATHML_ELEMENTS.contains(name(elem)); |
308 | 14.2k | } |
309 | ||
310 | #[allow(non_snake_case)] | |
311 | 14.3k | pub fn is_2D(elem: Element) -> bool { |
312 | 14.3k | return MATHML_2D_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
313 | 14.3k | } |
314 | ||
315 | 37.8k | pub fn is_scripted(elem: Element) -> bool { |
316 | 37.8k | return MATHML_SCRIPTED_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
317 | 37.8k | } |
318 | ||
319 | 138k | pub fn is_modified(elem: Element) -> bool { |
320 | 138k | return MATHML_MODIFIED_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
321 | 138k | } |
322 | } | |
323 | ||
324 | /// All MathML elements, including a few that get cleaned away | |
325 | /// "semantics", "annotation-xml", "annotation" and Content MathML are not included | |
326 | static ALL_MATHML_ELEMENTS: phf::Set<&str> = phf_set!{ | |
327 | "mi", "mo", "mn", "mtext", "ms", "mspace", "mglyph", | |
328 | "mfrac", "mroot", "msub", "msup", "msubsup","munder", "mover", "munderover", "mmultiscripts", | |
329 | "mstack", "mlongdiv", "msgroup", "msrow", "mscarries", "mscarry", "msline", | |
330 | "none", "mprescripts", "malignmark", "maligngroup", | |
331 | "math", "msqrt", "merror", "mpadded", "mphantom", "menclose", "mtd", "mstyle", | |
332 | "mrow", "a", "mfenced", "mtable", "mtr", "mlabeledtr", | |
333 | }; | |
334 | ||
335 | static MATHML_LEAF_NODES: phf::Set<&str> = phf_set! { | |
336 | "mi", "mo", "mn", "mtext", "ms", "mspace", "mglyph", | |
337 | "none", "annotation", "ci", "cn", "csymbol", // content could be inside an annotation-xml (faster to allow here than to check lots of places) | |
338 | }; | |
339 | ||
340 | ||
341 | // Should mstack and mlongdiv be included here? | |
342 | static MATHML_2D_NODES: phf::Set<&str> = phf_set! { | |
343 | "mfrac", "msqrt", "mroot", "menclose", | |
344 | "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts", | |
345 | "mtable", "mtr", "mlabeledtr", "mtd", | |
346 | }; | |
347 | ||
348 | // Should mstack and mlongdiv be included here? | |
349 | static MATHML_MODIFIED_NODES: phf::Set<&str> = phf_set! { | |
350 | "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts", | |
351 | }; | |
352 | ||
353 | // Should mstack and mlongdiv be included here? | |
354 | static MATHML_SCRIPTED_NODES: phf::Set<&str> = phf_set! { | |
355 | "msub", "msup", "msubsup", "mmultiscripts", | |
356 | }; | |
357 | ||
358 | 1.07M | pub fn is_leaf(element: Element) -> bool { |
359 | 1.07M | return MATHML_LEAF_NODES.contains(name(element)); |
360 | 1.07M | } |
361 | ||
362 | impl Function for IsNode { | |
363 | // eval function for IsNode | |
364 | // errors happen for wrong number/kind of arg | |
365 | 5.12k | fn evaluate<'d>(&self, |
366 | 5.12k | _context: &context::Evaluation<'_, 'd>, |
367 | 5.12k | args: Vec<Value<'d>>) |
368 | 5.12k | -> Result<Value<'d>, Error> |
369 | { | |
370 | ||
371 | 5.12k | let mut args = Args(args); |
372 | 5.12k | args.exactly(2) |
373 | 5.12k | let kind = args.pop_string() |
374 | // FIX: there is some conflict problem with xpath errors and error-chain | |
375 | // .chain_err(|e| format!("Second arg to is_leaf is not a string: {}", e.to_string()))?; | |
376 | 5.12k | match kind.as_str() { |
377 | 5.12k | "simple" | |
378 | 0 | _ => return Err( Error::Other(format!("Unknown argument value '{}' for IsNode", kind.as_str())) ), |
379 | }; | |
380 | ||
381 | 5.12k | let nodes = args.pop_nodeset() |
382 | 5.12k | if nodes.size() == 0 { |
383 | 0 | return Ok (Value::Boolean(false)); // like xpath, don't make this an error |
384 | 5.12k | }; |
385 | return Ok( | |
386 | Value::Boolean( | |
387 | 5.12k | nodes.iter() |
388 | 5.12k | .all(|node| |
389 | 5.39k | if let Node::Element(e) = node { |
390 | 5.39k | match kind.as_str() { |
391 | 5.39k | "simple" => |
392 | 3.09k | "leaf" => |
393 | 849 | "2D" => |
394 | 162 | "modified" => |
395 | 140 | "scripted" => |
396 | 49 | "mathml" => IsNode::is_mathml(e), |
397 | 0 | "common_fraction" => IsNode::is_common_fraction(e, usize::MAX, usize::MAX), |
398 | 0 | _ => true, // can't happen due to check above |
399 | } | |
400 | } else { | |
401 | // xpath is something besides an element, so no match | |
402 | 0 | false |
403 | 5.39k | } |
404 | ) | |
405 | ) | |
406 | ); | |
407 | ||
408 | 2.25k | fn is_leaf_any_name(e: Element) -> bool { |
409 | 2.25k | let children = e.children(); |
410 | 2.25k | if children.is_empty() { |
411 | 0 | return true; |
412 | 2.25k | } else if children.len() == 1 && |
413 | 1.24k | let ChildOfElement::Text(_) = children[0] { |
414 | 1.17k | return true; |
415 | 1.07k | } |
416 | 1.07k | return false |
417 | 2.25k | } |
418 | 5.12k | } |
419 | } | |
420 | ||
421 | struct ToOrdinal; | |
422 | impl ToOrdinal { | |
423 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
424 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
425 | 353 | fn compute_irregular_fractional_speech(number: &str, plural: bool) -> Option<String> { |
426 | 353 | SPEECH_DEFINITIONS.with(|definitions| { |
427 | 353 | let definitions = definitions.borrow(); |
428 | 353 | let words = if plural { |
429 | 208 | definitions.get_vec("NumbersOrdinalFractionalPluralOnes") |
430 | } else { | |
431 | 145 | definitions.get_vec("NumbersOrdinalFractionalOnes") |
432 | }; | |
433 | 353 | let number_as_int: usize = number.parse().unwrap(); // already verified it is only digits |
434 | 353 | if number_as_int < words.len() { |
435 | // use the words associated with this irregular pattern. | |
436 | 291 | return Some( words[number_as_int].clone() ); |
437 | 62 | }; |
438 | 62 | return None; |
439 | 353 | }) |
440 | 353 | } |
441 | ||
442 | /** | |
443 | * Translates a number of up to twelve digits into a string representation. | |
444 | * number -- the number to translate | |
445 | * fractional -- true if this is a fractional ordinal (e.g, "half") | |
446 | * plural -- true if answer should be plural | |
447 | * Returns the string representation of that number or an error message | |
448 | */ | |
449 | 416 | fn convert(number: &str, fractional: bool, plural: bool) -> Option<String> { |
450 | 2 | static NO_DIGIT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\d]").unwrap()); // match anything except a digit |
451 | 416 | return SPEECH_DEFINITIONS.with(|definitions| { |
452 | 416 | let definitions = definitions.borrow(); |
453 | 416 | let numbers_large = definitions.get_vec("NumbersLarge") |
454 | ||
455 | 416 | let pref_manager = crate::prefs::PreferenceManager::get(); |
456 | 416 | let pref_manager = pref_manager.borrow(); |
457 | 416 | let block_separators = pref_manager.pref_to_string("BlockSeparators"); |
458 | 416 | let decimal_separator = pref_manager.pref_to_string("DecimalSeparators"); |
459 | ||
460 | // check number validity (has digits, not a decimal) | |
461 | 416 | if number.is_empty() || number.contains(&decimal_separator) { |
462 | 0 | return Some(String::from(number)); |
463 | 416 | } |
464 | // remove any block separators | |
465 | 416 | let number = match clean_number(number, &block_separators) { |
466 | 0 | None => return Some(String::from(number)), |
467 | 416 | Some(num) => num, |
468 | }; | |
469 | ||
470 | // check to see if the number is too big or is not an integer or has non-digits | |
471 | 416 | if number.len() > 3*numbers_large.len() { |
472 | 0 | return Some(number); |
473 | 416 | } |
474 | 416 | if NO_DIGIT.is_match(&number) { |
475 | // this shouldn't have been part of an mn, so likely an error. Log a warning | |
476 | // FIX: log a warning that a non-number was passed to convert() | |
477 | 0 | return Some(number); |
478 | 416 | } |
479 | ||
480 | // first deal with the abnormalities of fractional ordinals (one half, etc). That simplifies what remains | |
481 | 416 | if fractional && |
482 | 353 | let Some( |
483 | 291 | return Some(string); |
484 | 125 | } |
485 | ||
486 | // at this point, we only need to worry about singular/plural distinction | |
487 | ||
488 | // break into groups of three digits and add 10^3 word (thousands, millions, ...) after each chunk | |
489 | // FIX: add a pause between groups of three -- need to use TTS-specific pause | |
490 | ||
491 | // handle special case of trailing zeros | |
492 | // num_thousands_at_end represents the amount to shift NumbersLarge... (e.g., millions->thousands) | |
493 | 243 | let |
494 | 122 | Some(n) => (number.len() - 1 - n) / 3 , |
495 | 3 | None => 0 |
496 | }; | |
497 | 125 | let (number,_) = number.split_at(number.len() - 3 * num_thousands_at_end); // drop the 0s |
498 | ||
499 | // everything is simplified if we add zeros at the start so that block size is a factor of 3 | |
500 | 125 | let number = match number.len() % 3 { |
501 | 18 | 0 => "".to_string() + number, |
502 | 69 | 1 => "00".to_string() + number, |
503 | 38 | _ => "0".to_string() + number, // can only be "2" -- compiler doesn't know there aren't other options |
504 | }; | |
505 | ||
506 | // At this point we have at least three "digits", and length is a multiple of 3 | |
507 | // We have already verified that there are only ASCII digits, so we can subtract '0' to get an index | |
508 | const ASCII_0: usize = 48; | |
509 | 125 | let digits = number.as_bytes() |
510 | 125 | .iter() |
511 | 411 | . |
512 | 125 | .collect::<Vec<usize>>(); |
513 | ||
514 | 125 | let mut answer = String::with_capacity(255); // reasonable max most of the time |
515 | 125 | let large_words = numbers_large; |
516 | 125 | if digits.len() > 3 { |
517 | // speak this first groups as cardinal numbers | |
518 | 7 | let words = [ |
519 | 7 | definitions.get_vec("NumbersHundreds") |
520 | 7 | definitions.get_vec("NumbersTens") |
521 | 7 | definitions.get_vec("NumbersOnes") |
522 | ]; | |
523 | 7 | answer = digits[0..digits.len()-3] |
524 | 7 | .chunks(3) |
525 | 7 | .enumerate() |
526 | 12 | . |
527 | 12 | if chunk[0] != 0 || |
528 | 7 | Some(ToOrdinal::hundreds_to_words(chunk, &words) |
529 | 7 | &large_words[num_thousands_at_end + digits.len()/3 - 1 - i] + " ") |
530 | } else { | |
531 | 5 | Some("".to_string()) |
532 | } | |
533 | 12 | }) |
534 | 7 | .collect::<Option<Vec<String>>>() |
535 | 7 | .join(""); // can't use " " because 1000567 would get extra space in the middle |
536 | 7 | if num_thousands_at_end > 0 { |
537 | // add on "billionths", etc and we are done | |
538 | 0 | let large_words = if plural { |
539 | 0 | definitions.get_vec("NumbersOrdinalPluralLarge") |
540 | } else { | |
541 | 0 | definitions.get_vec("NumbersOrdinalLarge") |
542 | }; | |
543 | 0 | return Some(answer + &large_words?[num_thousands_at_end]); |
544 | 7 | } |
545 | 118 | }; |
546 | ||
547 | // all that is left is to speak the hundreds part, possibly followed by "thousands", "billions", etc | |
548 | 125 | let words = match (num_thousands_at_end > 0, plural) { |
549 | (true, _) => [ | |
550 | 10 | definitions.get_vec("NumbersHundreds") |
551 | 10 | definitions.get_vec("NumbersTens") |
552 | 10 | definitions.get_vec("NumbersOnes") |
553 | ], | |
554 | (false, true) => [ | |
555 | 54 | definitions.get_vec("NumbersOrdinalPluralHundreds") |
556 | 54 | definitions.get_vec("NumbersOrdinalPluralTens") |
557 | 54 | definitions.get_vec("NumbersOrdinalPluralOnes") |
558 | ], | |
559 | (false, false) => [ | |
560 | 61 | definitions.get_vec("NumbersOrdinalHundreds") |
561 | 61 | definitions.get_vec("NumbersOrdinalTens") |
562 | 61 | definitions.get_vec("NumbersOrdinalOnes") |
563 | ], | |
564 | }; | |
565 | 125 | answer += &ToOrdinal::hundreds_to_words(&digits[digits.len()-3..], &words) |
566 | 125 | if num_thousands_at_end > 0 { |
567 | 10 | let large_words = if plural { |
568 | 3 | definitions.get_vec("NumbersOrdinalPluralLarge") |
569 | } else { | |
570 | 7 | definitions.get_vec("NumbersOrdinalLarge") |
571 | }; | |
572 | 10 | answer = answer + " " + &large_words[num_thousands_at_end]; |
573 | 115 | } |
574 | 125 | return Some(answer); |
575 | 416 | }); |
576 | ||
577 | /// Remove block separators and convert alphanumeric digits to ascii digits | |
578 | 416 | fn clean_number(number: &str, block_separators: &str) -> Option<String> { |
579 | 416 | let mut answer = String::with_capacity(number.len()); |
580 | 617 | for ch in |
581 | 617 | if block_separators.contains(ch) { |
582 | 0 | continue; |
583 | 617 | } |
584 | 617 | if ch.is_ascii_digit() { |
585 | 615 | answer.push(ch); |
586 | 615 | } else { |
587 | 2 | let shifted_ch = match ch { |
588 | 2 | '𝟎'..='𝟗' => ch as u32 -'𝟎' as u32 + '0' as u32, |
589 | 0 | '𝟘'..='𝟡' => ch as u32 -'𝟘' as u32 + '0' as u32, |
590 | 0 | '𝟢'..='𝟫' => ch as u32 -'𝟢' as u32 + '0' as u32, |
591 | 0 | '𝟬'..='𝟵' => ch as u32 -'𝟬' as u32 + '0' as u32, |
592 | 0 | '𝟶'..='𝟿' => ch as u32 -'𝟶' as u32 + '0' as u32, |
593 | 0 | _ => return None, |
594 | }; | |
595 | 2 | answer.push(char::from_u32(shifted_ch).unwrap()); |
596 | } | |
597 | } | |
598 | 416 | return Some(answer); |
599 | 416 | } |
600 | 416 | } |
601 | ||
602 | ||
603 | 132 | fn hundreds_to_words(number: &[usize], words: &[Ref<Vec<String>>; 3]) -> Option<String> { |
604 | 132 | assert!( number.len() == 3 ); |
605 | 132 | return SPEECH_DEFINITIONS.with(|definitions| { |
606 | 132 | let definitions = definitions.borrow(); |
607 | 132 | if number[0] != 0 && |
608 | 6 | return Some(words[0][number[0]].clone()); |
609 | 126 | } |
610 | ||
611 | 126 | let mut hundreds = definitions.get_vec("NumbersHundreds") |
612 | 126 | if !hundreds.is_empty() { |
613 | 18 | hundreds += " "; |
614 | 108 | } |
615 | ||
616 | 126 | if number[1] != 0 && |
617 | 26 | return Some(hundreds + &words[1][number[1]]); |
618 | 100 | } |
619 | ||
620 | 100 | if 10*number[1] < words[2].len() { |
621 | // usurp regular ordering to handle something like '14' | |
622 | 85 | return Some(hundreds + &words[2][10*number[1] + number[2]]); |
623 | } else { | |
624 | 15 | return Some(hundreds + &definitions.get_vec("NumbersTens") |
625 | } | |
626 | 132 | }); |
627 | 132 | } |
628 | } | |
629 | ||
630 | impl Function for ToOrdinal { | |
631 | // convert a node to an ordinal number | |
632 | 320 | fn evaluate<'d>(&self, |
633 | 320 | _context: &context::Evaluation<'_, 'd>, |
634 | 320 | args: Vec<Value<'d>>) |
635 | 320 | -> Result<Value<'d>, Error> |
636 | { | |
637 | 320 | let mut args = Args(args); |
638 | 320 | if let Err( |
639 | 0 | return Err( XPathError::Other(format!("ToOrdinal requires 1 or 3 args: {e}"))); |
640 | 320 | }; |
641 | 320 | let mut fractional = false; |
642 | 320 | let mut plural = false; |
643 | 320 | if args.len() == 3 { |
644 | 288 | plural = args.pop_boolean() |
645 | 288 | fractional = args.pop_boolean() |
646 | 32 | } |
647 | 320 | let node = validate_one_node(args.pop_nodeset() |
648 | 320 | return match node { |
649 | 0 | Node::Text(t) => Ok( Value::String( |
650 | 0 | match ToOrdinal::convert(t.text(), fractional, plural) { |
651 | 0 | None => t.text().to_string(), |
652 | 0 | Some(ord) => ord, |
653 | } ) ), | |
654 | 320 | Node::Element(e) => Ok( Value::String( |
655 | 320 | match ToOrdinal::convert(&get_text_from_element(e), fractional, plural) { |
656 | 0 | None => get_text_from_element(e).to_string(), |
657 | 320 | Some(ord) => ord, |
658 | } ) ), | |
659 | 0 | _ => Err( Error::ArgumentNotANodeset{actual: ArgumentType::String} ), |
660 | } | |
661 | 320 | } |
662 | } | |
663 | ||
664 | ||
665 | struct ToCommonFraction; | |
666 | ||
667 | impl Function for ToCommonFraction { | |
668 | // convert a node to a common fraction (if the numerator and denominator are within given limits) | |
669 | 34 | fn evaluate<'d>(&self, |
670 | 34 | _context: &context::Evaluation<'_, 'd>, |
671 | 34 | args: Vec<Value<'d>>) |
672 | 34 | -> Result<Value<'d>, Error> |
673 | { | |
674 | 34 | let mut args = Args(args); |
675 | 34 | args.exactly(1) |
676 | ||
677 | // FIX: should probably handle errors by logging them and then trying to evaluate any children | |
678 | 34 | let node = validate_one_node(args.pop_nodeset() |
679 | 34 | if let Node::Element(frac) = node { |
680 | 34 | if !IsNode::is_common_fraction(frac, usize::MAX, usize::MAX) { |
681 | 0 | return Err( Error::Other( format!("ToCommonFraction -- argument is not an 'mfrac': {}': ", mml_to_string(frac))) ); |
682 | 34 | } |
683 | ||
684 | // everything has been verified, so we can just get the pieces and ignore potential error results | |
685 | 34 | let children = frac.children(); |
686 | 34 | let num = children[0].element().unwrap(); |
687 | 34 | let num = get_text_from_element( num ); |
688 | 34 | let denom = children[1].element().unwrap(); |
689 | 34 | let denom = get_text_from_element( denom ); |
690 | 34 | let mut answer = num.clone() + " "; |
691 | 34 | answer += &match ToOrdinal::convert(&denom, true, num!="1") { |
692 | 0 | None => denom, |
693 | 34 | Some(ord) => ord, |
694 | }; | |
695 | ||
696 | 34 | return Ok( Value::String( answer ) ) |
697 | } else { | |
698 | 0 | return Err( Error::Other( "ToCommonFraction -- argument is not an element".to_string()) ); |
699 | } | |
700 | 34 | } |
701 | } | |
702 | ||
703 | struct Min; | |
704 | /** | |
705 | * Returns true the smallest of the two args | |
706 | * @param(num1) | |
707 | * @param(num2) | |
708 | */ | |
709 | impl Function for Min { | |
710 | ||
711 | 0 | fn evaluate<'d>(&self, |
712 | 0 | _context: &context::Evaluation<'_, 'd>, |
713 | 0 | args: Vec<Value<'d>>) |
714 | 0 | -> Result<Value<'d>, Error> |
715 | { | |
716 | 0 | let mut args = Args(args); |
717 | 0 | args.exactly(2)?; |
718 | 0 | let num1 = args.pop_number()?; |
719 | 0 | let num2 = args.pop_number()?; |
720 | 0 | return Ok( Value::Number( num1.min(num2) ) ); |
721 | 0 | } |
722 | } | |
723 | ||
724 | struct Max; | |
725 | ||
726 | impl Function for Max { | |
727 | ||
728 | 0 | fn evaluate<'d>(&self, |
729 | 0 | _context: &context::Evaluation<'_, 'd>, |
730 | 0 | args: Vec<Value<'d>>) |
731 | 0 | -> Result<Value<'d>, Error> |
732 | { | |
733 | 0 | let mut args = Args(args); |
734 | 0 | args.exactly(2)?; |
735 | 0 | let num1 = args.pop_number()?; |
736 | 0 | let num2 = args.pop_number()?; |
737 | 0 | return Ok( Value::Number( num1.max(num2) ) ); |
738 | 0 | } |
739 | } | |
740 | ||
741 | ||
742 | struct BaseNode; | |
743 | /** | |
744 | * Returns true if the node is a large op | |
745 | * @param(node) -- node(s) to test -- should be an <mo> | |
746 | */ | |
747 | impl BaseNode { | |
748 | /// Recursively find the base node | |
749 | /// The base node of a non scripted element is the element itself | |
750 | 1.26k | fn base_node(node: Element) -> Element { |
751 | 1.26k | let name = node.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(node)); |
752 | 1.26k | if ["msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"].contains(&name) { |
753 | 97 | return BaseNode::base_node(as_element(node.children()[0])); |
754 | } else { | |
755 | 1.16k | return node; |
756 | } | |
757 | 1.26k | } |
758 | } | |
759 | impl Function for BaseNode { | |
760 | ||
761 | 1.16k | fn evaluate<'d>(&self, |
762 | 1.16k | _context: &context::Evaluation<'_, 'd>, |
763 | 1.16k | args: Vec<Value<'d>>) |
764 | 1.16k | -> Result<Value<'d>, Error> |
765 | { | |
766 | 1.16k | let mut args = Args(args); |
767 | 1.16k | args.exactly(1) |
768 | 1.16k | let node = validate_one_node(args.pop_nodeset() |
769 | 1.16k | if let Node::Element(e) = node { |
770 | 1.16k | let mut node_set = Nodeset::new(); |
771 | 1.16k | node_set.add(BaseNode::base_node(e)); |
772 | 1.16k | return Ok( Value::Nodeset(node_set) ); |
773 | } else { | |
774 | // xpath is something besides an element, so no match | |
775 | 0 | return Err( Error::Other("Argument other than a node given to BaseNode".to_string()) ); |
776 | } | |
777 | 1.16k | } |
778 | } | |
779 | ||
780 | ||
781 | struct IfThenElse; | |
782 | impl Function for IfThenElse { | |
783 | 36.2k | fn evaluate<'d>(&self, |
784 | 36.2k | _context: &context::Evaluation<'_, 'd>, |
785 | 36.2k | args: Vec<Value<'d>>) |
786 | 36.2k | -> Result<Value<'d>, Error> |
787 | { | |
788 | 36.2k | let args = Args(args); |
789 | 36.2k | args.exactly(3) |
790 | 36.2k | let if_val = &args[0]; |
791 | 36.2k | let then_val = &args[1]; |
792 | 36.2k | let else_val = &args[2]; |
793 | 36.2k | let is_true = match if_val { |
794 | 14.5k | Value::Nodeset(nodes) => nodes.size() > 0, |
795 | 21.7k | Value::Boolean(b) => *b, |
796 | 0 | Value::Number(f) => *f != 0.0, |
797 | 0 | Value::String(s) => !s.is_empty(), |
798 | }; | |
799 | 36.2k | return Ok( if is_true { |
800 | 36.2k | } |
801 | } | |
802 | ||
803 | ||
804 | struct Debug; | |
805 | /** | |
806 | * Prints it's argument along with the string that was evaluated | |
807 | * @param(node) -- node(s) to be evaluated/printed | |
808 | * @param(string) -- string showing what is being evaluated | |
809 | */ | |
810 | impl Function for Debug { | |
811 | ||
812 | 348 | fn evaluate<'d>(&self, |
813 | 348 | _context: &context::Evaluation<'_, 'd>, |
814 | 348 | args: Vec<Value<'d>>) |
815 | 348 | -> Result<Value<'d>, Error> |
816 | { | |
817 | 348 | let mut args = Args(args); |
818 | 348 | args.exactly(2) |
819 | 348 | let xpath_str = args.pop_string() |
820 | 348 | let eval_result = &args[0]; |
821 | 348 | debug!(" -- Debug: value of '{xpath_str}' is "); |
822 | 348 | match eval_result { |
823 | 78 | Value::Nodeset(nodes) => { |
824 | 78 | if nodes.size() == 0 { |
825 | 0 | debug!("0 nodes (false)"); |
826 | } else { | |
827 | 78 | let singular = nodes.size()==1; |
828 | 78 | debug!("{} node{}. {}:", |
829 | 0 | if singular {""} else {"s"}, |
830 | 0 | if singular {"Node is"} else {"Nodes are"}); |
831 | 78 | nodes.document_order() |
832 | 78 | .iter() |
833 | 78 | .enumerate() |
834 | 78 | .for_each(|(i, node)| { |
835 | 78 | match node { |
836 | 78 | Node::Element(mathml) => debug!("#{}:\n{}", |
837 | 0 | i, mml_to_string(*mathml)), |
838 | 0 | _ => debug!("'{node:?}'"), |
839 | } | |
840 | 78 | }) |
841 | } | |
842 | }, | |
843 | 270 | _ => debug!("'{eval_result:?}'"), |
844 | } | |
845 | 348 | return Ok( eval_result.clone() ); |
846 | 348 | } |
847 | } | |
848 | ||
849 | ||
850 | /// Should be an internal structure for implementation of the IsBracketed, but it was useful in one place in a separate module. | |
851 | /// This should probably be restructured slightly. | |
852 | pub struct IsBracketed; | |
853 | impl IsBracketed { | |
854 | 139k | pub fn is_bracketed(element: Element, left: &str, right: &str, requires_comma: bool, requires_mrow: bool) -> bool { |
855 | use crate::canonicalize::is_fence; | |
856 | 139k | if requires_mrow && |
857 | 18.8k | return false; |
858 | 120k | } |
859 | 120k | let children = element.children(); |
860 | 120k | let n_children = children.len(); |
861 | 120k | if (n_children == 0 || |
862 | 120k | !left.is_empty() && |
863 | 116k | requires_comma && |
864 | // not enough argument for there to be a match | |
865 | 4.44k | return false; |
866 | 115k | } |
867 | ||
868 | 115k | let first_child = as_element(children[0]); |
869 | 115k | let last_child = as_element(children[children.len()-1]); |
870 | // debug!("first_child: {}", crate::pretty_print::mml_to_string(first_child)); | |
871 | // debug!("last_child: {}", crate::pretty_print::mml_to_string(last_child)); | |
872 | 115k | if (left.is_empty() && ( |
873 | 106k | (right.is_empty() && ( |
874 | 9.61k | return false; |
875 | 106k | } |
876 | ||
877 | 106k | if !left.is_empty() && |
878 | 6.14k | !right.is_empty() && |
879 | // left or right don't match | |
880 | 101k | return false; |
881 | 5.12k | } |
882 | ||
883 | 5.12k | if requires_comma { |
884 | 445 | if let ChildOfElement::Element(contents) = children[1] { |
885 | 445 | let children = contents.children(); |
886 | 445 | if !is_tag(contents, "mrow") || |
887 | 197 | return false; |
888 | 248 | } |
889 | // finally, we can check for a comma -- we might not have operands, so we to check first and second entry | |
890 | 248 | if get_text_from_COE(&children[0]).as_str() == "," { |
891 | 1 | return true; |
892 | 247 | } |
893 | 247 | if children.len() > 1 && get_text_from_COE(&children[1]).as_str() == "," { |
894 | 133 | return true; |
895 | 114 | } |
896 | 0 | } |
897 | 114 | return false; |
898 | } else { | |
899 | 4.67k | return true; |
900 | } | |
901 | 139k | } |
902 | } | |
903 | ||
904 | /** | |
905 | * Returns true if the node is a bracketed expr with the indicated left/right chars | |
906 | * node -- node(s) to test | |
907 | * left -- string (like "[") or empty | |
908 | * right -- string (like "]") or empty | |
909 | * requires_comma - boolean, optional (check the top level of 'node' for commas) | |
910 | */ | |
911 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
912 | impl Function for IsBracketed { | |
913 | 115k | fn evaluate<'d>(&self, |
914 | 115k | _context: &context::Evaluation<'_, 'd>, |
915 | 115k | args: Vec<Value<'d>>) |
916 | 115k | -> Result<Value<'d>, Error> |
917 | { | |
918 | 115k | let mut args = Args(args); |
919 | 115k | args.at_least(3) |
920 | 115k | args.at_most(5) |
921 | 115k | let mut requires_comma = false; |
922 | 115k | let mut requires_mrow = true; |
923 | 115k | if args.len() == 5 { |
924 | 0 | requires_mrow = args.pop_boolean()?; |
925 | 115k | } |
926 | 115k | if args.len() >= 4 { |
927 | 15 | requires_comma = args.pop_boolean() |
928 | 115k | } |
929 | 115k | let right = args.pop_string() |
930 | 115k | let left = args.pop_string() |
931 | return Ok( Value::Boolean( | |
932 | 115k | match validate_one_node(args.pop_nodeset() |
933 | 0 | Err(_) => false, // be fault tolerant, like xpath, |
934 | 115k | Ok(node) => { |
935 | 115k | if let Node::Element(e) = node { |
936 | 115k | IsBracketed::is_bracketed(e, &left, &right, requires_comma, requires_mrow) |
937 | } else { | |
938 | 0 | false |
939 | } | |
940 | } | |
941 | }) ); | |
942 | 115k | } |
943 | } | |
944 | ||
945 | pub struct IsInDefinition; | |
946 | impl IsInDefinition { | |
947 | /// Returns true if `test_str` is in `set_name` | |
948 | /// Returns an error if `set_name` is not defined | |
949 | 11.0k | pub fn is_defined_in(test_str: &str, defs: &'static LocalKey<RefCell<Definitions>>, set_name: &str) -> Result<bool, Error> { |
950 | 11.0k | return defs.with(|definitions| { |
951 | 11.0k | if let Some( |
952 | 11.0k | return Ok( set.contains(test_str) ); |
953 | 12 | } |
954 | 12 | if let Some(hashmap) = definitions.borrow().get_hashmap(set_name) { |
955 | 12 | return Ok( hashmap.contains_key(test_str) ); |
956 | 0 | } |
957 | 0 | return Err( Error::Other( format!("\n IsInDefinition: '{set_name}' is not defined in definitions.yaml") ) ); |
958 | 11.0k | }); |
959 | 11.0k | } |
960 | } | |
961 | ||
962 | /** | |
963 | * Returns true if the text is contained in the set defined in Speech or Braille. | |
964 | * element/string -- element (converted to string)/string to test | |
965 | * speech or braille | |
966 | * set_name -- the set in which the string is to be searched | |
967 | */ | |
968 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
969 | impl Function for IsInDefinition { | |
970 | 12.0k | fn evaluate<'d>(&self, |
971 | 12.0k | _context: &context::Evaluation<'_, 'd>, |
972 | 12.0k | args: Vec<Value<'d>>) |
973 | 12.0k | -> Result<Value<'d>, Error> |
974 | { | |
975 | 12.0k | let mut args = Args(args); |
976 | // FIX: temporarily accept two args as assume SPEECH_DEFINITIONS until the Rule files are fixed | |
977 | 12.0k | args.at_least(2) |
978 | 12.0k | args.at_most(3) |
979 | 12.0k | let set_name = args.pop_string() |
980 | // FIX: this (len == 1) is temporary until all the usages are switched to the (new) 3-arg form | |
981 | 12.0k | let definitions = if args.len() == 2 { |
982 | 10.4k | match args.pop_string() |
983 | 10.4k | "Speech" => |
984 | 9.09k | "Braille" => &BRAILLE_DEFINITIONS, |
985 | 0 | _ => return Err( Error::Other("IsInDefinition:: second argument must be either 'Speech' or 'Braille'".to_string()) ) |
986 | } | |
987 | } else { | |
988 | 1.61k | &SPEECH_DEFINITIONS |
989 | }; | |
990 | 12.0k | match &args[0] { |
991 | 5.04k | Value::String(str) => return match IsInDefinition::is_defined_in(str, definitions, &set_name) { |
992 | 5.04k | Ok(result) => Ok( Value::Boolean( result ) ), |
993 | 0 | Err(e) => Err(e), |
994 | }, | |
995 | 7.02k | Value::Nodeset(nodes) => { |
996 | 7.02k | return if nodes.size() == 0 { |
997 | 0 | Ok( Value::Boolean(false) ) // trivially not in definition |
998 | } else { | |
999 | 7.02k | let node = validate_one_node(nodes.clone(), "IsInDefinition") |
1000 | 7.02k | if let Node::Element(e) = node { |
1001 | 7.02k | let text = get_text_from_element(e); |
1002 | 7.02k | if text.is_empty() { |
1003 | 979 | Ok( Value::Boolean(false) ) |
1004 | } else { | |
1005 | 6.04k | match IsInDefinition::is_defined_in(&text, definitions, &set_name) { |
1006 | 6.04k | Ok(result) => Ok( Value::Boolean( result ) ), |
1007 | 0 | Err(e) => Err(e), |
1008 | } | |
1009 | } | |
1010 | } else { | |
1011 | 0 | Ok( Value::Boolean(false)) // trivially not in definition } |
1012 | } | |
1013 | } | |
1014 | }, | |
1015 | 0 | _ => Err( Error::Other("IsInDefinition:: neither a node nor a string is passed for first argument".to_string()) ), |
1016 | } | |
1017 | 12.0k | } |
1018 | } | |
1019 | ||
1020 | ||
1021 | pub struct DefinitionValue; | |
1022 | impl DefinitionValue { | |
1023 | /// Returns the value associated with `key` in `set_name`. If `key` is not in `set_name`, an empty string is returned | |
1024 | /// Returns an error if `set_name` is not defined | |
1025 | 12.7k | pub fn definition_value(key: &str, defs: &'static LocalKey<RefCell<Definitions>>, set_name: &str) -> Result<String, Error> { |
1026 | 12.7k | return defs.with(|definitions| { |
1027 | 12.7k | if let Some(map) = definitions.borrow().get_hashmap(set_name) { |
1028 | 12.7k | return Ok( match map.get(key) { |
1029 | 5.64k | None => "".to_string(), |
1030 | 7.09k | Some(str) => str.clone(), |
1031 | }); | |
1032 | 0 | } |
1033 | 0 | return Err( Error::Other( format!("\n DefinitionValue: '{set_name}' is not defined in definitions.yaml") ) ); |
1034 | 12.7k | }); |
1035 | 12.7k | } |
1036 | } | |
1037 | ||
1038 | /** | |
1039 | * Returns true if the node is a bracketed expr with the indicated left/right chars | |
1040 | * element/string -- element (converted to string)/string to test | |
1041 | * left -- string (like "[") or empty | |
1042 | * right -- string (like "]") or empty | |
1043 | * requires_comma - boolean, optional (check the top level of 'node' for commas | |
1044 | */ | |
1045 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
1046 | impl Function for DefinitionValue { | |
1047 | 13.1k | fn evaluate<'d>(&self, |
1048 | 13.1k | _context: &context::Evaluation<'_, 'd>, |
1049 | 13.1k | args: Vec<Value<'d>>) |
1050 | 13.1k | -> Result<Value<'d>, Error> |
1051 | { | |
1052 | 13.1k | let mut args = Args(args); |
1053 | 13.1k | args.exactly(3) |
1054 | 13.1k | let set_name = args.pop_string() |
1055 | 13.1k | let definitions = match args.pop_string() |
1056 | 13.1k | "Speech" => |
1057 | 12 | "Braille" => &BRAILLE_DEFINITIONS, |
1058 | 0 | _ => return Err( Error::Other("IsInDefinition:: second argument must be either 'Speech' or 'Braille'".to_string()) ) |
1059 | }; | |
1060 | 13.1k | match &args[0] { |
1061 | 5.04k | Value::String(str) => return match DefinitionValue::definition_value(str, definitions, &set_name) { |
1062 | 5.04k | Ok(result) => Ok( Value::String( result ) ), |
1063 | 0 | Err(e) => Err(e), |
1064 | }, | |
1065 | 8.10k | Value::Nodeset(nodes) => { |
1066 | 8.10k | return if nodes.size() == 0 { |
1067 | 0 | Ok( Value::String("".to_string()) ) // trivially not in definition |
1068 | } else { | |
1069 | 8.10k | let node = validate_one_node(nodes.clone(), "DefinitionValue") |
1070 | 8.10k | if let Node::Element( |
1071 | 8.10k | let text = get_text_from_element(e); |
1072 | 8.10k | if text.is_empty() { |
1073 | 410 | Ok( Value::String("".to_string()) ) |
1074 | } else { | |
1075 | 7.69k | match DefinitionValue::definition_value(&text, definitions, &set_name) { |
1076 | 7.69k | Ok(result) => Ok( Value::String( result ) ), |
1077 | 0 | Err(e) => Err(e), |
1078 | } | |
1079 | } | |
1080 | } else { | |
1081 | 3 | Ok( Value::String("".to_string()) ) // trivially not in definition } |
1082 | } | |
1083 | } | |
1084 | }, | |
1085 | 0 | _ => Err( Error::Other("DefinitionValue:: neither a node nor a string is passed for first argument".to_string()) ), |
1086 | } | |
1087 | 13.1k | } |
1088 | } | |
1089 | ||
1090 | pub struct DistanceFromLeaf; | |
1091 | impl DistanceFromLeaf { | |
1092 | 240 | fn distance(element: Element, use_left_side: bool, treat_2d_elements_as_tokens: bool) -> usize { |
1093 | // FIX: need to handle char level (i.e., chars in a leaf element) | |
1094 | 240 | let mut element = element; |
1095 | 240 | let mut distance = 1; |
1096 | loop { | |
1097 | // debug!("distance={} -- element: {}", distance, mml_to_string(element)); | |
1098 | 361 | if MATHML_LEAF_NODES.contains(element.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(element))) { |
1099 | 199 | return distance; |
1100 | 162 | } |
1101 | 162 | if treat_2d_elements_as_tokens && ( |
1102 | 41 | return distance; |
1103 | 121 | } |
1104 | 121 | let children = element.children(); |
1105 | 121 | assert!(!children.is_empty()); |
1106 | 121 | element = as_element( if use_left_side { |
1107 | 121 | distance += 1; |
1108 | } | |
1109 | 240 | } |
1110 | } | |
1111 | ||
1112 | /** | |
1113 | * Returns distance from the current node to the leftmost/rightmost leaf (if char, then = 0, if token, then 1). | |
1114 | * If the node is a bracketed expr with the indicated left/right chars | |
1115 | * node -- node(s) to test | |
1116 | * left_side -- (bool) traverse leftmost child to leaf | |
1117 | * treat2D_elements_as_tokens -- (bool) 2D notations such as fractions are treated like leaves | |
1118 | */ | |
1119 | impl Function for DistanceFromLeaf { | |
1120 | 240 | fn evaluate<'d>(&self, |
1121 | 240 | _context: &context::Evaluation<'_, 'd>, |
1122 | 240 | args: Vec<Value<'d>>) |
1123 | 240 | -> Result<Value<'d>, Error> |
1124 | { | |
1125 | 240 | let mut args = Args(args); |
1126 | 240 | args.exactly(3) |
1127 | 240 | let treat_2d_elements_as_tokens = args.pop_boolean() |
1128 | 240 | let use_left_side = args.pop_boolean() |
1129 | 240 | let node = validate_one_node(args.pop_nodeset() |
1130 | 240 | if let Node::Element(e) = node { |
1131 | 240 | return Ok( Value::Number( DistanceFromLeaf::distance(e, use_left_side, treat_2d_elements_as_tokens) as f64) ); |
1132 | 0 | } |
1133 | ||
1134 | // FIX: should having a non-element be an error instead?? | |
1135 | 0 | return Err(Error::Other(format!("DistanceFromLeaf: first arg '{node:?}' is not a node"))); |
1136 | 240 | } |
1137 | } | |
1138 | ||
1139 | ||
1140 | ||
1141 | pub struct EdgeNode; | |
1142 | impl EdgeNode { | |
1143 | // Return the root of the ancestor tree if we are at the left/right side of a path from that to 'element' | |
1144 | 2.09k | fn edge_node<'a>(element: Element<'a>, use_left_side: bool, stop_node_name: &str) -> Option<Element<'a>> { |
1145 | 2.09k | let element_name = element.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(element)); |
1146 | 2.09k | if element_name == "math" { |
1147 | 86 | return Some(element); |
1148 | 2.00k | }; |
1149 | ||
1150 | 2.00k | let parent = get_parent(element); // there is always a "math" node |
1151 | 2.00k | let parent_name = parent.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(parent)); |
1152 | ||
1153 | // first check to see if we have the special case of punctuation as last child of math/mrow element | |
1154 | // it only matters if we are looking at the right edge | |
1155 | ||
1156 | // debug!("EdgeNode: there are {} preceding siblings",element.preceding_siblings().len() ); | |
1157 | 2.00k | if use_left_side && |
1158 | 587 | return None; |
1159 | 1.41k | }; |
1160 | ||
1161 | 1.41k | if !use_left_side && |
1162 | // check for the special case that the parent is an mrow and the grandparent is <math> and we have punctuation | |
1163 | 574 | let grandparent = get_parent(parent); |
1164 | 574 | let grandparent_name = grandparent.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(grandparent)); |
1165 | 574 | if grandparent_name == "math" && |
1166 | 105 | parent_name == "mrow" && |
1167 | 11 | let text = get_text_from_element( as_element(parent.children()[1]) ); |
1168 | 11 | if text == "," || text == "." || |
1169 | 1 | return Some(grandparent); |
1170 | 10 | } |
1171 | 563 | } |
1172 | 573 | return None; |
1173 | 843 | }; |
1174 | ||
1175 | // at an edge -- check to see the parent is desired root | |
1176 | 843 | if parent_name == stop_node_name || |
1177 | 735 | (stop_node_name == "2D" && |
1178 | 176 | return Some(parent); |
1179 | 667 | }; |
1180 | ||
1181 | // debug!("EdgeNode: recurse to {}", parent_name); | |
1182 | 667 | return EdgeNode::edge_node(parent, use_left_side, stop_node_name) |
1183 | 2.09k | } |
1184 | } | |
1185 | ||
1186 | // EdgeNode(node, "left"/"right", stopNodeName) | |
1187 | // -- returns the stopNode if at left/right edge of named ancestor node. "stopNodeName' can also be "2D' | |
1188 | // returns original node match isn't found | |
1189 | // Note: if stopNodeName=="math", then punctuation is taken into account since it isn't really part of the math | |
1190 | impl Function for EdgeNode { | |
1191 | 1.41k | fn evaluate<'d>(&self, |
1192 | 1.41k | _context: &context::Evaluation<'_, 'd>, |
1193 | 1.41k | args: Vec<Value<'d>>) |
1194 | 1.41k | -> Result<Value<'d>, Error> |
1195 | { | |
1196 | 1.41k | let mut args = Args(args); |
1197 | 1.41k | args.exactly(3) |
1198 | 1.41k | let stop_node_name = args.pop_string() |
1199 | 1.41k | let use_left_side = args.pop_string() |
1200 | 1.41k | let node = validate_one_node(args.pop_nodeset() |
1201 | 1.41k | if let Node::Element(e) = node { |
1202 | 1.41k | let result = match EdgeNode::edge_node(e, use_left_side, &stop_node_name) { |
1203 | 260 | Some(found) => found, |
1204 | 1.15k | None => e, |
1205 | }; | |
1206 | 1.41k | let mut node_set = Nodeset::new(); |
1207 | 1.41k | node_set.add(result); |
1208 | 1.41k | return Ok( Value::Nodeset(node_set) ); |
1209 | 0 | } |
1210 | ||
1211 | // FIX: should having a non-element be an error instead?? | |
1212 | 0 | return Err(Error::Other(format!("EdgeNode: first arg '{node:?}' is not a node"))); |
1213 | 1.41k | } |
1214 | } | |
1215 | ||
1216 | pub struct SpeakIntentName; | |
1217 | /// SpeakIntentName(intent, verbosity) | |
1218 | /// Returns a string corresponding to the intent name with the indicated verbosity | |
1219 | impl Function for SpeakIntentName { | |
1220 | 340 | fn evaluate<'d>(&self, |
1221 | 340 | _context: &context::Evaluation<'_, 'd>, |
1222 | 340 | args: Vec<Value<'d>>) |
1223 | 340 | -> Result<Value<'d>, Error> |
1224 | { | |
1225 | 340 | let mut args = Args(args); |
1226 | 340 | args.exactly(3) |
1227 | 340 | let fixity = args.pop_string() |
1228 | 340 | let verbosity = args.pop_string() |
1229 | 340 | let intent_name = args.pop_string() |
1230 | 340 | return Ok( Value::String(crate::infer_intent::intent_speech_for_name(&intent_name, &verbosity, &fixity)) ); |
1231 | 340 | } |
1232 | } | |
1233 | ||
1234 | pub struct GetBracketingIntentName; | |
1235 | /// GetBracketingIntentName(name, verbosity, at_start_or_end) | |
1236 | /// Returns a potentially empty string to use to bracket an intent expression (start foo... end foo) | |
1237 | /// | |
1238 | impl GetBracketingIntentName { | |
1239 | 61 | fn bracketing_words(intent_name: &str, verbosity: &str, fixity: &str, at_start: bool) -> String { |
1240 | 61 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1241 | 61 | let definitions = definitions.borrow(); |
1242 | 61 | if let Some( |
1243 | // Split the pattern is: fixity-def [|| fixity-def]* | |
1244 | // fixity-def := fixity=open; verbosity; close | |
1245 | // verbosity := terse | medium | verbose | |
1246 | 68 | if let Some( |
1247 | 57 | let (_, matched_intent) = matched_intent.split_once("=").unwrap_or_default(); |
1248 | 57 | let parts = matched_intent.trim().split(";").collect::<Vec<&str>>(); |
1249 | 57 | if parts.len() == 1 { |
1250 | 30 | return "".to_string(); |
1251 | 27 | } |
1252 | 27 | if parts.len() != 3 { |
1253 | 0 | error!("Intent '{}' has {} ';' separated parts, should have 3", intent_name, parts.len()); |
1254 | 0 | return "".to_string(); |
1255 | 27 | } |
1256 | 27 | let mut speech = (if at_start { |
1257 | 27 | match speech.len() { |
1258 | 20 | 1 => return speech[0].to_string(), |
1259 | 2 | 3 => { | |
1260 | 7 | if speech.len() == 2 { |
1261 | 0 | warn!("Intent '{intent_name}' has only two ':' separated parts, but should have three"); |
1262 | 0 | speech.push(speech[1]); |
1263 | 7 | } |
1264 | 7 | let bracketing_words = match verbosity { |
1265 | 7 | "Terse" => |
1266 | 7 | "Medium" => speech[1], |
1267 | 0 | _ => speech[2], |
1268 | }; | |
1269 | 7 | return bracketing_words.to_string(); |
1270 | }, | |
1271 | _ => { | |
1272 | 0 | error!("Intent '{}' has too many ({}) operator names, should only have 2", intent_name, speech.len()); |
1273 | }, | |
1274 | } | |
1275 | 0 | } |
1276 | 4 | }; |
1277 | 4 | return "".to_string(); |
1278 | 61 | }) |
1279 | 61 | } |
1280 | } | |
1281 | ||
1282 | impl Function for GetBracketingIntentName { | |
1283 | 61 | fn evaluate<'d>(&self, |
1284 | 61 | _context: &context::Evaluation<'_, 'd>, |
1285 | 61 | args: Vec<Value<'d>>) |
1286 | 61 | -> Result<Value<'d>, Error> |
1287 | { | |
1288 | 61 | let mut args = Args(args); |
1289 | 61 | args.exactly(4) |
1290 | 61 | let start_or_end = args.pop_string() |
1291 | 61 | if start_or_end != "start" && |
1292 | 0 | return Err( Error::Other("GetBracketingIntentName: first argument must be either 'start' or 'end'".to_string()) ); |
1293 | 61 | } |
1294 | 61 | let fixity = args.pop_string() |
1295 | 61 | let verbosity = args.pop_string() |
1296 | 61 | let name = args.pop_string() |
1297 | 61 | return Ok( Value::String(GetBracketingIntentName:: bracketing_words(&name, &verbosity, &fixity, start_or_end == "start")) ); |
1298 | 61 | } |
1299 | } | |
1300 | ||
1301 | pub struct GetNavigationPartName; | |
1302 | /// GetNavigationPartName(name, index) | |
1303 | /// Returns the name to use to speak the part of a navigation expression (e.g., 'numerator', 'denominator', 'base', 'exponent', ...). | |
1304 | /// If there is no match, an empty string is returned. | |
1305 | /// 'index' is 0-based | |
1306 | /// | |
1307 | impl GetNavigationPartName { | |
1308 | 129 | fn navigation_part_name(intent_name: &str, index: usize) -> String { |
1309 | 129 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1310 | 129 | let definitions = definitions.borrow(); |
1311 | 129 | if let Some(navigation_names) = definitions.get_hashmap("NavigationParts") && |
1312 | 129 | let Some( |
1313 | // Split the pattern is: part [; part]* | |
1314 | 105 | if let Some(part_name) = nav_part_names.trim().split(";").nth(index) { |
1315 | 105 | return part_name.trim().to_string(); |
1316 | 0 | } |
1317 | 24 | } |
1318 | 24 | return "".to_string(); |
1319 | 129 | }) |
1320 | 129 | } |
1321 | } | |
1322 | ||
1323 | impl Function for GetNavigationPartName { | |
1324 | 129 | fn evaluate<'d>(&self, |
1325 | 129 | _context: &context::Evaluation<'_, 'd>, |
1326 | 129 | args: Vec<Value<'d>>) |
1327 | 129 | -> Result<Value<'d>, Error> |
1328 | { | |
1329 | 129 | let mut args = Args(args); |
1330 | 129 | args.exactly(2) |
1331 | 129 | let index = args.pop_number() |
1332 | 129 | let name = args.pop_string() |
1333 | 129 | return Ok( Value::String(GetNavigationPartName:: navigation_part_name(&name, index)) ); |
1334 | 129 | } |
1335 | } | |
1336 | ||
1337 | pub struct FontSizeGuess; | |
1338 | /// FontSizeGuess(size_string) | |
1339 | /// returns a guess of the size in "ems" | |
1340 | /// Examples: | |
1341 | /// "0.278em" -> 0.278 | |
1342 | /// "" | |
1343 | // returns original node match isn't found | |
1344 | impl FontSizeGuess { | |
1345 | 224 | pub fn em_from_value(value_with_unit: &str) -> f64 { |
1346 | // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?) | |
1347 | 3 | static FONT_VALUE: LazyLock<Regex> = LazyLock::new(|| { Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap() }); |
1348 | 224 | let cap = FONT_VALUE.captures(value_with_unit); |
1349 | 224 | if let Some( |
1350 | 200 | if cap.len() == 3 { |
1351 | 200 | let multiplier = match &cap[2] { // guess based on 12pt font to convert to ems |
1352 | 200 | "px" => |
1353 | 200 | "cm" => |
1354 | 200 | "mm" => |
1355 | 200 | "Q" => |
1356 | 200 | "in" => |
1357 | 177 | "pc" => |
1358 | 177 | "pt" => |
1359 | 171 | "ex" => |
1360 | 171 | "em" => 1.0, |
1361 | 0 | "rem" => 16.0/12.0, |
1362 | 0 | default => {debug!("unit='{default}'"); 10.0} |
1363 | }; | |
1364 | // debug!("FontSizeGuess: {}->{}, val={}, multiplier={}", value_with_unit, value*multiplier, value, multiplier); | |
1365 | 200 | return cap[1].parse::<f64>().unwrap_or(0.0) * multiplier; |
1366 | } else { | |
1367 | 0 | return 0.0; // something bad happened |
1368 | } | |
1369 | }else { | |
1370 | 24 | let multiplier = match value_with_unit { // guess based on 12pt font to convert to ems |
1371 | 24 | "veryverythinspace" => |
1372 | 24 | "verythinspace" => |
1373 | 24 | "thinspace" => |
1374 | 24 | "mediumspace" => |
1375 | 24 | "thickspace" => |
1376 | 24 | "verythickspace" => |
1377 | 24 | "veryverythickspace" => |
1378 | 24 | _ => 0.0, |
1379 | }; | |
1380 | 24 | return multiplier; |
1381 | } | |
1382 | 224 | } |
1383 | } | |
1384 | impl Function for FontSizeGuess { | |
1385 | 0 | fn evaluate<'d>(&self, |
1386 | 0 | _context: &context::Evaluation<'_, 'd>, |
1387 | 0 | args: Vec<Value<'d>>) |
1388 | 0 | -> Result<Value<'d>, Error> |
1389 | { | |
1390 | 0 | let mut args = Args(args); |
1391 | 0 | args.exactly(1)?; |
1392 | 0 | let value_with_unit = args.pop_string()?; |
1393 | 0 | let em_value = FontSizeGuess::em_from_value(&value_with_unit); |
1394 | 0 | return Ok( Value::Number(em_value) ); |
1395 | 0 | } |
1396 | } | |
1397 | ||
1398 | pub struct ReplaceAll; | |
1399 | /// ReplaceAll(haystack, needle, replacement) | |
1400 | /// Returns a string with all occurrences of 'needle' replaced with 'replacement' | |
1401 | impl Function for ReplaceAll { | |
1402 | 0 | fn evaluate<'d>(&self, |
1403 | 0 | _context: &context::Evaluation<'_, 'd>, |
1404 | 0 | args: Vec<Value<'d>>) |
1405 | 0 | -> Result<Value<'d>, Error> |
1406 | { | |
1407 | 0 | let mut args = Args(args); |
1408 | 0 | args.exactly(3)?; |
1409 | 0 | let replacement = args.pop_string()?; |
1410 | 0 | let needle = args.pop_string()?; |
1411 | 0 | let haystack = args.pop_string()?; |
1412 | 0 | return Ok( Value::String(haystack.replace(&needle, &replacement)) ); |
1413 | 0 | } |
1414 | } | |
1415 | ||
1416 | /// Add all the functions defined in this module to `context`. | |
1417 | 22.7k | pub fn add_builtin_functions(context: &mut Context) { |
1418 | 22.7k | context.set_function("NestingChars", crate::braille::NemethNestingChars); |
1419 | 22.7k | context.set_function("BrailleChars", crate::braille::BrailleChars); |
1420 | 22.7k | context.set_function("NeedsToBeGrouped", crate::braille::NeedsToBeGrouped); |
1421 | 22.7k | context.set_function("IsNode", IsNode); |
1422 | 22.7k | context.set_function("ToOrdinal", ToOrdinal); |
1423 | 22.7k | context.set_function("ToCommonFraction", ToCommonFraction); |
1424 | 22.7k | context.set_function("IsBracketed", IsBracketed); |
1425 | 22.7k | context.set_function("IsInDefinition", IsInDefinition); |
1426 | 22.7k | context.set_function("DefinitionValue", DefinitionValue); |
1427 | 22.7k | context.set_function("BaseNode", BaseNode); |
1428 | 22.7k | context.set_function("IfThenElse", IfThenElse); |
1429 | 22.7k | context.set_function("IFTHENELSE", IfThenElse); |
1430 | 22.7k | context.set_function("DistanceFromLeaf", DistanceFromLeaf); |
1431 | 22.7k | context.set_function("EdgeNode", EdgeNode); |
1432 | 22.7k | context.set_function("SpeakIntentName", SpeakIntentName); |
1433 | 22.7k | context.set_function("GetBracketingIntentName", GetBracketingIntentName); |
1434 | 22.7k | context.set_function("GetNavigationPartName", GetNavigationPartName); |
1435 | 22.7k | context.set_function("DEBUG", Debug); |
1436 | ||
1437 | // Not used: remove?? | |
1438 | 22.7k | context.set_function("min", Min); // missing in xpath 1.0 |
1439 | 22.7k | context.set_function("max", Max); // missing in xpath 1.0 |
1440 | 22.7k | context.set_function("FontSizeGuess", FontSizeGuess); |
1441 | 22.7k | context.set_function("ReplaceAll", ReplaceAll); |
1442 | 22.7k | } |
1443 | ||
1444 | ||
1445 | #[cfg(test)] | |
1446 | mod tests { | |
1447 | use super::*; | |
1448 | use sxd_document::parser; | |
1449 | use crate::interface::{trim_element, get_element}; | |
1450 | ||
1451 | ||
1452 | 4 | fn init_word_list() { |
1453 | 4 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1454 | 4 | let result = crate::definitions::read_definitions_file(true); |
1455 | 4 | if let Err( |
1456 | 0 | panic!("unable to read 'Rules/Languages/en/definitions.yaml\n{e}"); |
1457 | 4 | } |
1458 | 4 | } |
1459 | ||
1460 | #[test] | |
1461 | 1 | fn ordinal_one_digit() { |
1462 | 1 | init_word_list(); |
1463 | 1 | assert_eq!("zeroth", ToOrdinal::convert("0", false, false).unwrap()); |
1464 | 1 | assert_eq!("second", ToOrdinal::convert("2", false, false).unwrap()); |
1465 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", false, false).unwrap()); |
1466 | ||
1467 | 1 | assert_eq!("zeroth", ToOrdinal::convert("0", false, true).unwrap()); |
1468 | 1 | assert_eq!("seconds", ToOrdinal::convert("2", false, true).unwrap()); |
1469 | 1 | assert_eq!("ninths", ToOrdinal::convert("9", false, true).unwrap()); |
1470 | ||
1471 | 1 | assert_eq!("first", ToOrdinal::convert("1", true, false).unwrap()); |
1472 | 1 | assert_eq!("half", ToOrdinal::convert("2", true, false).unwrap()); |
1473 | 1 | assert_eq!("half", ToOrdinal::convert("02", true, false).unwrap()); |
1474 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); |
1475 | ||
1476 | 1 | assert_eq!("halves", ToOrdinal::convert("2", true, true).unwrap()); |
1477 | 1 | assert_eq!("halves", ToOrdinal::convert("002", true, true).unwrap()); |
1478 | 1 | assert_eq!("ninths", ToOrdinal::convert("9", true, true).unwrap()); |
1479 | 1 | } |
1480 | ||
1481 | #[test] | |
1482 | 1 | fn ordinal_two_digit() { |
1483 | 1 | init_word_list(); |
1484 | 1 | assert_eq!("tenth", ToOrdinal::convert("10", false, false).unwrap()); |
1485 | 1 | assert_eq!("seventeenth", ToOrdinal::convert("17", false, false).unwrap()); |
1486 | 1 | assert_eq!("thirty second", ToOrdinal::convert("32", false, false).unwrap()); |
1487 | 1 | assert_eq!("fortieth", ToOrdinal::convert("40", false, false).unwrap()); |
1488 | ||
1489 | 1 | assert_eq!("tenths", ToOrdinal::convert("10", false, true).unwrap()); |
1490 | 1 | assert_eq!("sixteenths", ToOrdinal::convert("16", false, true).unwrap()); |
1491 | 1 | assert_eq!("eighty eighths", ToOrdinal::convert("88", false, true).unwrap()); |
1492 | 1 | assert_eq!("fiftieths", ToOrdinal::convert("50", false, true).unwrap()); |
1493 | ||
1494 | 1 | assert_eq!("eleventh", ToOrdinal::convert("11", true, false).unwrap()); |
1495 | 1 | assert_eq!("forty fourth", ToOrdinal::convert("44", true, false).unwrap()); |
1496 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); |
1497 | 1 | assert_eq!("ninth", ToOrdinal::convert("00000009", true, false).unwrap()); |
1498 | 1 | assert_eq!("sixtieth", ToOrdinal::convert("60", true, false).unwrap()); |
1499 | ||
1500 | 1 | assert_eq!("tenths", ToOrdinal::convert("10", true, true).unwrap()); |
1501 | 1 | assert_eq!("tenths", ToOrdinal::convert("0010", true, true).unwrap()); |
1502 | 1 | assert_eq!("elevenths", ToOrdinal::convert("11", true, true).unwrap()); |
1503 | 1 | assert_eq!("nineteenths", ToOrdinal::convert("19", true, true).unwrap()); |
1504 | 1 | assert_eq!("twentieths", ToOrdinal::convert("20", true, true).unwrap()); |
1505 | 1 | assert_eq!("nineteenths", ToOrdinal::convert("𝟏𝟗", true, true).unwrap()); |
1506 | 1 | } |
1507 | ||
1508 | #[test] | |
1509 | 1 | fn ordinal_three_digit() { |
1510 | 1 | init_word_list(); |
1511 | 1 | assert_eq!("one hundred first", ToOrdinal::convert("101", false, false).unwrap()); |
1512 | 1 | assert_eq!("two hundred tenth", ToOrdinal::convert("210", false, false).unwrap()); |
1513 | 1 | assert_eq!("four hundred thirty second", ToOrdinal::convert("432", false, false).unwrap()); |
1514 | 1 | assert_eq!("four hundred second", ToOrdinal::convert("402", false, false).unwrap()); |
1515 | ||
1516 | 1 | assert_eq!("one hundred first", ToOrdinal::convert("101", true, false).unwrap()); |
1517 | 1 | assert_eq!("two hundred second", ToOrdinal::convert("202", true, false).unwrap()); |
1518 | 1 | assert_eq!("four hundred thirty second", ToOrdinal::convert("432", true, false).unwrap()); |
1519 | 1 | assert_eq!("five hundred third", ToOrdinal::convert("503", true, false).unwrap()); |
1520 | ||
1521 | 1 | assert_eq!("three hundred elevenths", ToOrdinal::convert("311", false, true).unwrap()); |
1522 | 1 | assert_eq!("four hundred ninety ninths", ToOrdinal::convert("499", false, true).unwrap()); |
1523 | 1 | assert_eq!("nine hundred ninetieths", ToOrdinal::convert("990", false, true).unwrap()); |
1524 | 1 | assert_eq!("six hundred seconds", ToOrdinal::convert("602", false, true).unwrap()); |
1525 | ||
1526 | 1 | assert_eq!("seven hundredths", ToOrdinal::convert("700", true, true).unwrap()); |
1527 | 1 | assert_eq!("one hundredths", ToOrdinal::convert("100", true, true).unwrap()); |
1528 | 1 | assert_eq!("eight hundred seventeenths", ToOrdinal::convert("817", true, true).unwrap()); |
1529 | 1 | } |
1530 | #[test] | |
1531 | 1 | fn ordinal_large() { |
1532 | 1 | init_word_list(); |
1533 | 1 | assert_eq!("one thousandth", ToOrdinal::convert("1000", false, false).unwrap()); |
1534 | 1 | assert_eq!("two thousand one hundredth", ToOrdinal::convert("2100", false, false).unwrap()); |
1535 | 1 | assert_eq!("thirty thousandth", ToOrdinal::convert("30000", false, false).unwrap()); |
1536 | 1 | assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", false, false).unwrap()); |
1537 | ||
1538 | 1 | assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", true, false).unwrap()); |
1539 | 1 | assert_eq!("five hundred thousand second", ToOrdinal::convert("500002", true, false).unwrap()); |
1540 | 1 | assert_eq!("six millionth", ToOrdinal::convert("6000000", true, false).unwrap()); |
1541 | 1 | assert_eq!("sixty millionth", ToOrdinal::convert("60000000", true, false).unwrap()); |
1542 | ||
1543 | 1 | assert_eq!("seven billionths", ToOrdinal::convert("7000000000", false, true).unwrap()); |
1544 | 1 | assert_eq!("eight trillionths", ToOrdinal::convert("8000000000000", false, true).unwrap()); |
1545 | 1 | assert_eq!("nine quadrillionths", ToOrdinal::convert("9000000000000000", false, true).unwrap()); |
1546 | 1 | assert_eq!("one quintillionth", ToOrdinal::convert("1000000000000000000", false, false).unwrap()); |
1547 | ||
1548 | 1 | assert_eq!("nine billion eight hundred seventy six million five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9876543210", true, true).unwrap()); |
1549 | 1 | assert_eq!("nine billion five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9000543210", true, true).unwrap()); |
1550 | 1 | assert_eq!("zeroth", ToOrdinal::convert("00000", false, false).unwrap()); |
1551 | 1 | } |
1552 | ||
1553 | ||
1554 | 11 | fn test_is_simple(message: &'static str, mathml_str: &'static str) { |
1555 | // this forces initialization | |
1556 | 11 | crate::speech::SPEECH_RULES.with(|_| true); |
1557 | 11 | let package = parser::parse(mathml_str) |
1558 | 11 | .expect("failed to parse XML"); |
1559 | 11 | let mathml = get_element(&package); |
1560 | 11 | trim_element(mathml, false); |
1561 | 11 | assert!(IsNode::is_simple(mathml), "{}", message); |
1562 | 11 | } |
1563 | ||
1564 | 7 | fn test_is_not_simple(message: &'static str, mathml_str: &'static str) { |
1565 | // this forces initialization | |
1566 | 7 | crate::speech::SPEECH_RULES.with(|_| true); |
1567 | 7 | let package = parser::parse(mathml_str) |
1568 | 7 | .expect("failed to parse XML"); |
1569 | 7 | let mathml = get_element(&package); |
1570 | 7 | trim_element(mathml, false); |
1571 | 7 | assert!(!IsNode::is_simple(mathml), "{}", message); |
1572 | 7 | } |
1573 | #[test] | |
1574 | 1 | fn is_simple() { |
1575 | 1 | test_is_simple("single variable", "<mi>x</mi>"); |
1576 | 1 | test_is_simple("single number", "<mn>1.2</mn>"); |
1577 | 1 | test_is_simple("negative number", "<mrow><mo>-</mo><mn>10</mn></mrow>"); |
1578 | 1 | test_is_simple("negative variable", "<mrow><mo>-</mo><mi>x</mi></mrow>"); |
1579 | 1 | test_is_simple("ordinal fraction", "<mfrac><mn>3</mn><mn>4</mn></mfrac>"); |
1580 | 1 | test_is_simple("x y", "<mrow><mi>x</mi><mo>⁢</mo><mi>y</mi></mrow>"); |
1581 | 1 | test_is_simple("negative two vars", |
1582 | "<mrow><mrow><mo>-</mo><mi>x</mi></mrow><mo>⁢</mo><mi>y</mi></mrow>"); | |
1583 | 1 | test_is_simple("-2 x y", |
1584 | "<mrow><mrow><mo>-</mo><mn>2</mn></mrow> | |
1585 | <mo>⁢</mo><mi>x</mi><mo>⁢</mo><mi>z</mi></mrow>"); | |
1586 | 1 | test_is_simple("sin x", "<mrow><mi>sin</mi><mo>⁡</mo><mi>x</mi></mrow>"); |
1587 | 1 | test_is_simple("f(x)", "<mrow><mi>f</mi><mo>⁡</mo><mrow><mo>(</mo><mi>x</mi><mo>)</mo></mrow></mrow>"); |
1588 | 1 | test_is_simple("f(x+y)", |
1589 | "<mrow><mi>f</mi><mo>⁡</mo>\ | |
1590 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></mrow>"); | |
1591 | ||
1592 | 1 | } |
1593 | ||
1594 | #[test] | |
1595 | 1 | fn is_not_simple() { |
1596 | 1 | test_is_not_simple("multi-char variable", "<mi>rise</mi>"); |
1597 | 1 | test_is_not_simple("large ordinal fraction", "<mfrac><mn>30</mn><mn>4</mn></mfrac>"); |
1598 | 1 | test_is_not_simple("fraction with var in numerator", "<mfrac><mi>x</mi><mn>4</mn></mfrac>"); |
1599 | 1 | test_is_not_simple("square root", "<msqrt><mi>x</mi></msqrt>"); |
1600 | 1 | test_is_not_simple("subscript", "<msub><mi>x</mi><mn>4</mn></msub>"); |
1601 | 1 | test_is_not_simple("-x y z", |
1602 | "<mrow><mrow><mo>-</mo><mi>x</mi></mrow> | |
1603 | <mo>⁢</mo><mi>y</mi><mo>⁢</mo><mi>z</mi></mrow>"); | |
1604 | 1 | test_is_not_simple("C(-2,1,4)", // github.com/NSoiffer/MathCAT/issues/199 |
1605 | "<mrow><mi>C</mi><mrow><mo>(</mo><mo>−</mo><mn>2</mn><mo>,</mo><mn>1</mn><mo>,</mo><mn>4</mn><mo>)</mo></mrow></mrow>"); | |
1606 | ||
1607 | 1 | } |
1608 | ||
1609 | #[test] | |
1610 | 1 | fn at_left_edge() { |
1611 | 1 | let mathml = "<math><mfrac><mrow><mn>30</mn><mi>x</mi></mrow><mn>4</mn></mfrac></math>"; |
1612 | 1 | let package = parser::parse(mathml).expect("failed to parse XML"); |
1613 | 1 | let mathml = get_element(&package); |
1614 | 1 | trim_element(mathml, false); |
1615 | 1 | let fraction = as_element(mathml.children()[0]); |
1616 | 1 | let mn = as_element(as_element(fraction.children()[0]).children()[0]); |
1617 | 1 | assert_eq!(EdgeNode::edge_node(mn, true, "2D"), Some(fraction)); |
1618 | 1 | assert_eq!(EdgeNode::edge_node(mn, false, "2D"), None); |
1619 | ||
1620 | 1 | let mi = as_element(as_element(fraction.children()[0]).children()[1]); |
1621 | 1 | assert_eq!(EdgeNode::edge_node(mi, true, "2D"), None); |
1622 | 1 | } |
1623 | ||
1624 | #[test] | |
1625 | 1 | fn at_right_edge() { |
1626 | 1 | let mathml = "<math><mrow><mfrac><mn>4</mn><mrow><mn>30</mn><mi>x</mi></mrow></mfrac><mo>.</mo></mrow></math>"; |
1627 | 1 | let package = parser::parse(mathml).expect("failed to parse XML"); |
1628 | 1 | let mathml = get_element(&package); |
1629 | 1 | trim_element(mathml, false); |
1630 | 1 | let fraction = as_element(as_element(mathml.children()[0]).children()[0]); |
1631 | 1 | let mi = as_element(as_element(fraction.children()[1]).children()[1]); |
1632 | 1 | assert_eq!(EdgeNode::edge_node(mi, true, "2D"), None); |
1633 | 1 | assert_eq!(EdgeNode::edge_node(mi, false, "2D"), Some(fraction)); |
1634 | 1 | assert_eq!(EdgeNode::edge_node(mi, false, "math"), Some(mathml)); |
1635 | ||
1636 | 1 | let mn = as_element(as_element(fraction.children()[1]).children()[0]); |
1637 | 1 | assert_eq!(EdgeNode::edge_node(mn, true, "2D"), None); |
1638 | 1 | } |
1639 | } |
Line | Count | Source |
1 | #![allow(clippy::needless_return)] | |
2 | //! XPath underlies rule matching and speech generation. The version of xpath used is based on xpath 1.0 | |
3 | //! and includes the ability to define functions and variables. | |
4 | //! The variables defined are all the preferences and also variables set in speech rules via the `variables` keyword. | |
5 | //! The function defined here are: | |
6 | //! * `IsNode(node, kind)`: returns true if the node matches the "kind". | |
7 | //! Valid values are "leaf", "2D", "simple", "common_fraction", "trig_name". | |
8 | //! * `ToOrdinal(number, fractional, plural)`: converts the number to an ordinal (e.g, third) | |
9 | //! * `number` -- the number to translate | |
10 | //! * `fractional` -- true if this is a fractional ordinal (e.g, "half") | |
11 | //! * `plural` -- true if answer should be plural | |
12 | //! * `ToCommonFraction(mfrac)` -- converts the fraction to an ordinal version (e.g, 2 thirds) | |
13 | //! * `IsLargeOp(node)` -- returns true if the node is a large operator (e.g, integral or sum) | |
14 | //! * `IsBracketed(node, left, right, requires_comma)` -- returns true if the first/last element in the mrow match `left`/`right`. | |
15 | //! If the optional `requires_comma` argument is given and is `true`, then there also must be a "," in the mrow (e.g., "f(x,y)") | |
16 | //! * `DEBUG(xpath)` -- _Very_ useful function for debugging speech rules. | |
17 | //! This can be used to surround a whole or part of an xpath expression in a match or output. | |
18 | //! The result will be printed to standard output and the result returned so that `DEBUG` does not affect the computation. | |
19 | ||
20 | use sxd_document::dom::{Element, ChildOfElement}; | |
21 | use sxd_xpath::{Value, Context, context, function::*, nodeset::*}; | |
22 | use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS}; | |
23 | use regex::Regex; | |
24 | use crate::pretty_print::mml_to_string; | |
25 | use std::cell::{Ref, RefCell}; | |
26 | use log::{debug, error, warn}; | |
27 | use std::sync::LazyLock; | |
28 | use std::thread::LocalKey; | |
29 | use phf::phf_set; | |
30 | use sxd_xpath::function::Error as XPathError; | |
31 | use crate::canonicalize::{as_element, name, get_parent, MATHML_FROM_NAME_ATTR}; | |
32 | ||
33 | // useful utility functions | |
34 | // note: child of an element is a ChildOfElement, so sometimes it is useful to have parallel functions, | |
35 | // one for Element and one for ChildOfElement. | |
36 | ||
37 | // @returns {String} -- the text of the (leaf) element otherwise an empty string | |
38 | 126k | fn get_text_from_element(e: Element) -> String { |
39 | 126k | if e.children().len() == 1 && |
40 | 103k | let ChildOfElement::Text( |
41 | 102k | return t.text().to_string(); |
42 | 23.4k | } |
43 | 23.4k | return "".to_string(); |
44 | 126k | } |
45 | ||
46 | #[allow(non_snake_case)] | |
47 | // Same as 'is_tag', but for ChildOfElement | |
48 | 110k | fn get_text_from_COE(coe: &ChildOfElement) -> String { |
49 | 110k | let element = coe.element(); |
50 | 110k | return match element { |
51 | 110k | Some(e) => get_text_from_element(e), |
52 | 0 | None => "".to_string(), |
53 | }; | |
54 | 110k | } |
55 | ||
56 | // make sure that there is only one node in the NodeSet | |
57 | // Returns the node or an Error | |
58 | 147k | pub fn validate_one_node<'n>(nodes: Nodeset<'n>, func_name: &str) -> Result<Node<'n>, Error> { |
59 | 147k | if nodes.size() == 0 { |
60 | 0 | return Err(Error::Other(format!("Missing argument for {func_name}"))); |
61 | 147k | } else if nodes.size() > 1 { |
62 | 0 | return Err( Error::Other(format!("{} arguments for {}; expected 1 argument", nodes.size(), func_name)) ); |
63 | 147k | } |
64 | 147k | return Ok( nodes.iter().next().unwrap() ); |
65 | 147k | } |
66 | ||
67 | // Return true if the element's name is 'name' | |
68 | 157k | fn is_tag(e: Element, name: &str) -> bool { |
69 | // need to check name before the fallback of where the name came from | |
70 | 157k | return e.name().local_part() == name || |
71 | 157k | } |
72 | ||
73 | #[allow(non_snake_case)] | |
74 | // Same as 'is_tag', but for ChildOfElement | |
75 | 1.40k | fn is_COE_tag(coe: ChildOfElement, name: &str) -> bool { |
76 | 1.40k | let element = coe.element(); |
77 | 1.40k | return element.is_some() && is_tag(element.unwrap(), name) |
78 | 1.40k | } |
79 | ||
80 | /// Should be an internal structure for implementation of the IsNode, but it was useful in one place in a separate module. | |
81 | /// This should probably be restructured slightly. | |
82 | pub struct IsNode; | |
83 | ||
84 | impl IsNode { | |
85 | /// implements ClearSpeak's definition of "simple" | |
86 | /// this is fairly detailed, so we define a few local functions (at end) to help out | |
87 | /// Also, it doesn't help that the structure is a bit complicated Elements->ChildOfElement->Element/Text | |
88 | 7.43k | pub fn is_simple(elem: Element) -> bool { |
89 | 7.43k | if is_trivially_simple(elem) { |
90 | 3.62k | return true; |
91 | 3.81k | } |
92 | ||
93 | 3.81k | if is_negative_of_trivially_simple(elem) { |
94 | // -3 or -x | |
95 | 41 | return true; |
96 | 3.76k | } |
97 | ||
98 | 3.76k | if !is_tag(elem, "mrow") || |
99 | 2.90k | return false; |
100 | 867 | } |
101 | ||
102 | // x y or -x or -3 x or -x y or -3 x y or x° or n° or -x° or -n° | |
103 | #[allow(clippy::if_same_then_else)] | |
104 | 867 | if is_times_mi(elem) { |
105 | 42 | return true; // x y |
106 | 825 | } else if is_degrees(elem) { |
107 | 0 | return true; // x° or n° |
108 | 825 | } else if is_function(elem) { |
109 | 44 | return true; |
110 | 781 | } |
111 | ||
112 | 781 | return false; |
113 | ||
114 | ||
115 | // returns the element's text value | |
116 | 5.71k | fn to_str(e: Element<'_>) -> &str { |
117 | // typically usage assumes 'e' is a leaf | |
118 | // bad MathML is the following isn't true | |
119 | 5.71k | if e.children().len() == 1 { |
120 | 5.71k | let text_node = e.children()[0]; |
121 | 5.71k | if let Some(t) = text_node.text() { |
122 | 5.71k | return t.text(); |
123 | 0 | } |
124 | 0 | } |
125 | 0 | return ""; |
126 | 5.71k | } |
127 | ||
128 | // same as 'to_str' but for ChildOfElement | |
129 | 1.01k | fn coe_to_str(coe: ChildOfElement<'_>) -> &str { |
130 | // typically usage assumes 'coe' is a leaf | |
131 | 1.01k | let element_node = coe.element(); |
132 | 1.01k | if let Some(e) = element_node { |
133 | // bad MathML is the following isn't true | |
134 | 1.01k | if e.children().len() == 1 { |
135 | 1.01k | let text_node = e.children()[0]; |
136 | 1.01k | if let Some(t) = text_node.text() { |
137 | 1.01k | return t.text(); |
138 | 0 | } |
139 | 8 | } |
140 | 0 | } |
141 | 8 | return ""; |
142 | 1.01k | } |
143 | ||
144 | // returns true if the string is just a single *char* (which can be multiple bytes) | |
145 | 5.71k | fn is_single_char(str: &str) -> bool { |
146 | 5.71k | let mut chars = str.chars(); |
147 | 5.71k | return chars.next().is_some() && chars.next().is_none(); |
148 | 5.71k | } |
149 | ||
150 | // checks the single element to see if it is simple (mn, mi that is a single char, common fraction) | |
151 | 8.33k | fn is_trivially_simple(elem: Element) -> bool { |
152 | 8.33k | if is_tag(elem, "mn") { |
153 | 914 | return true; |
154 | 7.41k | } |
155 | 7.41k | if is_tag(elem, "mi") && |
156 | // "simple" only if it is a single char (which can be multiple bytes) | |
157 | 3.14k | return true; |
158 | 4.27k | } |
159 | ||
160 | // FIX: need to consult preference Fraction_Ordinal | |
161 | 4.27k | if IsNode::is_common_fraction(elem, 10, 19) { |
162 | 66 | return true; |
163 | 4.21k | } |
164 | 4.21k | return false; |
165 | 8.33k | } |
166 | ||
167 | // true if the negative of a single element that is simple | |
168 | 4.20k | fn is_negative_of_trivially_simple(elem: Element) -> bool { |
169 | 4.20k | if is_tag(elem, "mrow") && |
170 | 38 | let children = elem.children(); |
171 | // better be negative of something at this point... | |
172 | 38 | if is_COE_tag(children[0], "mo") && |
173 | 6 | children[1].element().is_some() && is_trivially_simple(children[1].element().unwrap()) { |
174 | 6 | return true; |
175 | 32 | } |
176 | 4.16k | } |
177 | 4.20k | if is_tag(elem, "minus") && |
178 | 54 | let child = elem.children()[0]; |
179 | 54 | if let Some(e) = child.element() { |
180 | 54 | return is_trivially_simple(e); |
181 | 0 | } |
182 | 4.14k | } |
183 | ||
184 | 4.14k | return false; |
185 | 4.20k | } |
186 | ||
187 | // return true if ChildOfElement has exactly text 'ch' | |
188 | 967 | fn is_equal(coe: ChildOfElement, ch: char) -> bool { |
189 | 967 | return coe_to_str(coe).starts_with(ch); |
190 | 967 | } |
191 | ||
192 | // true if mrow(xxx, ⁢, mi) or mrow(xxx, ⁢ mi, ⁢, mi) where mi's have len==1 | |
193 | 867 | fn is_times_mi(mrow: Element) -> bool { |
194 | 867 | assert!( is_tag(mrow, "mrow") ); |
195 | 867 | let children = mrow.children(); |
196 | 867 | if !(children.len() == 3 || |
197 | 34 | return false; |
198 | 833 | } |
199 | 833 | if children[0].element().is_none() { |
200 | 0 | return false; |
201 | 833 | } |
202 | ||
203 | 833 | let first_child = children[0].element().unwrap(); |
204 | 833 | if !is_trivially_simple(first_child) { |
205 | 396 | if !is_negative_of_trivially_simple(first_child) { |
206 | 382 | return false; |
207 | 14 | } |
208 | 14 | if children.len() == 5 && |
209 | 2 | ( (name(first_child) == "minus" && |
210 | 2 | (name(first_child) == "mrow" && !is_COE_tag(first_child.children()[1], "mn")) ) { |
211 | 1 | return false; // '-x y z' is too complicated () -- -2 x y is ok |
212 | 13 | } |
213 | 437 | } |
214 | ||
215 | 450 | if !(is_COE_tag(children[1], "mo") && |
216 | 450 | is_equal(children[1], '\u{2062}') && |
217 | 63 | is_COE_tag(children[2], "mi") && |
218 | 51 | coe_to_str(children[2]).len()==1 ) { |
219 | 408 | return false; |
220 | 42 | } |
221 | ||
222 | 42 | if children.len() == 3 { |
223 | 41 | return true; |
224 | 1 | } |
225 | ||
226 | // len == 5 | |
227 | 1 | return is_COE_tag(children[3], "mo") && |
228 | 1 | is_equal(children[3], '\u{2062}') && // invisible times |
229 | 1 | is_COE_tag(children[4], "mi") && |
230 | 1 | coe_to_str(children[4]).len()==1 ; |
231 | 867 | } |
232 | ||
233 | // return true if the mrow is var° or num° | |
234 | 825 | fn is_degrees(mrow: Element) -> bool { |
235 | 825 | assert!( is_tag(mrow, "mrow") ); |
236 | 825 | let children = mrow.children(); |
237 | 825 | return children.len() == 2 && |
238 | 32 | is_equal(children[1], '°') && |
239 | 0 | (is_COE_tag(children[0], "mi") || |
240 | 0 | is_COE_tag(children[0], "mn") ); |
241 | 825 | } |
242 | ||
243 | // fn_name ⁡ [simple arg or (simple arg)] | |
244 | 825 | fn is_function(mrow: Element) -> bool { |
245 | 825 | assert!( is_tag(mrow, "mrow") ); |
246 | 825 | let children = mrow.children(); |
247 | 825 | if children.len() != 3 { |
248 | 40 | return false; |
249 | 785 | } |
250 | 785 | if !(is_COE_tag(children[1], "mo") && |
251 | 473 | is_equal(children[1], '\u{2061}') ) { // invisible function application |
252 | 717 | return false; |
253 | 68 | } |
254 | 68 | if !is_COE_tag(children[0], "mi") { |
255 | 0 | return false; |
256 | 68 | } |
257 | 68 | let function_arg = children[2].element().unwrap(); |
258 | 68 | if IsBracketed::is_bracketed(function_arg, "(", ")", false, false) { |
259 | 60 | return IsNode::is_simple(function_arg.children()[1].element().unwrap()); |
260 | } else { | |
261 | 8 | return IsNode::is_simple(function_arg); |
262 | } | |
263 | 825 | } |
264 | 7.43k | } |
265 | ||
266 | // Returns true if 'frac' is a common fraction | |
267 | // In this case, the numerator and denominator can be no larger than 'num_limit' and 'denom_limit' | |
268 | 4.31k | fn is_common_fraction(frac: Element, num_limit: usize, denom_limit: usize) -> bool { |
269 | 2 | static ALL_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap()); // match one or more digits |
270 | ||
271 | 4.31k | if !is_tag(frac, "mfrac") && |
272 | 4.12k | return false; |
273 | 188 | } |
274 | 188 | let children = frac.children(); |
275 | 188 | if children.len() != 2 { |
276 | 0 | return false; |
277 | 188 | } |
278 | ||
279 | 188 | let num = children[0].element(); |
280 | 188 | let denom = children[1].element(); |
281 | 188 | if num.is_none() || denom.is_none() { |
282 | 0 | return false; |
283 | 188 | }; |
284 | ||
285 | 188 | let num = num.unwrap(); |
286 | 188 | let denom = denom.unwrap(); |
287 | 188 | if !is_tag(num, "mn") || |
288 | 87 | return false |
289 | 101 | }; |
290 | ||
291 | 101 | let num = get_text_from_element(num); |
292 | 101 | let denom = get_text_from_element(denom); |
293 | 101 | if num.is_empty() || denom.is_empty() { |
294 | 0 | return false; |
295 | 101 | } |
296 | ||
297 | 101 | return ALL_DIGITS.is_match(&num) && is_small_enough(&num, num_limit) && |
298 | 100 | ALL_DIGITS.is_match(&denom) && is_small_enough(&denom, denom_limit); |
299 | ||
300 | 201 | fn is_small_enough(val: &str, upper_bound: usize) -> bool { |
301 | 201 | return if let Ok(value) = val.parse::<usize>() { value <= upper_bound } else { |
302 | 201 | } |
303 | 4.31k | } |
304 | ||
305 | 14.2k | pub fn is_mathml(elem: Element) -> bool { |
306 | // doesn't check MATHML_FROM_NAME_ATTR because we are interested in if it is an intent. | |
307 | 14.2k | return ALL_MATHML_ELEMENTS.contains(name(elem)); |
308 | 14.2k | } |
309 | ||
310 | #[allow(non_snake_case)] | |
311 | 14.3k | pub fn is_2D(elem: Element) -> bool { |
312 | 14.3k | return MATHML_2D_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
313 | 14.3k | } |
314 | ||
315 | 37.8k | pub fn is_scripted(elem: Element) -> bool { |
316 | 37.8k | return MATHML_SCRIPTED_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
317 | 37.8k | } |
318 | ||
319 | 138k | pub fn is_modified(elem: Element) -> bool { |
320 | 138k | return MATHML_MODIFIED_NODES.contains(elem.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(elem))); |
321 | 138k | } |
322 | } | |
323 | ||
324 | /// All MathML elements, including a few that get cleaned away | |
325 | /// "semantics", "annotation-xml", "annotation" and Content MathML are not included | |
326 | static ALL_MATHML_ELEMENTS: phf::Set<&str> = phf_set!{ | |
327 | "mi", "mo", "mn", "mtext", "ms", "mspace", "mglyph", | |
328 | "mfrac", "mroot", "msub", "msup", "msubsup","munder", "mover", "munderover", "mmultiscripts", | |
329 | "mstack", "mlongdiv", "msgroup", "msrow", "mscarries", "mscarry", "msline", | |
330 | "none", "mprescripts", "malignmark", "maligngroup", | |
331 | "math", "msqrt", "merror", "mpadded", "mphantom", "menclose", "mtd", "mstyle", | |
332 | "mrow", "a", "mfenced", "mtable", "mtr", "mlabeledtr", | |
333 | }; | |
334 | ||
335 | static MATHML_LEAF_NODES: phf::Set<&str> = phf_set! { | |
336 | "mi", "mo", "mn", "mtext", "ms", "mspace", "mglyph", | |
337 | "none", "annotation", "ci", "cn", "csymbol", // content could be inside an annotation-xml (faster to allow here than to check lots of places) | |
338 | }; | |
339 | ||
340 | ||
341 | // Should mstack and mlongdiv be included here? | |
342 | static MATHML_2D_NODES: phf::Set<&str> = phf_set! { | |
343 | "mfrac", "msqrt", "mroot", "menclose", | |
344 | "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts", | |
345 | "mtable", "mtr", "mlabeledtr", "mtd", | |
346 | }; | |
347 | ||
348 | // Should mstack and mlongdiv be included here? | |
349 | static MATHML_MODIFIED_NODES: phf::Set<&str> = phf_set! { | |
350 | "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts", | |
351 | }; | |
352 | ||
353 | // Should mstack and mlongdiv be included here? | |
354 | static MATHML_SCRIPTED_NODES: phf::Set<&str> = phf_set! { | |
355 | "msub", "msup", "msubsup", "mmultiscripts", | |
356 | }; | |
357 | ||
358 | 1.07M | pub fn is_leaf(element: Element) -> bool { |
359 | 1.07M | return MATHML_LEAF_NODES.contains(name(element)); |
360 | 1.07M | } |
361 | ||
362 | impl Function for IsNode { | |
363 | // eval function for IsNode | |
364 | // errors happen for wrong number/kind of arg | |
365 | 5.12k | fn evaluate<'d>(&self, |
366 | 5.12k | _context: &context::Evaluation<'_, 'd>, |
367 | 5.12k | args: Vec<Value<'d>>) |
368 | 5.12k | -> Result<Value<'d>, Error> |
369 | { | |
370 | ||
371 | 5.12k | let mut args = Args(args); |
372 | 5.12k | args.exactly(2) |
373 | 5.12k | let kind = args.pop_string() |
374 | // FIX: there is some conflict problem with xpath errors and error-chain | |
375 | // .chain_err(|e| format!("Second arg to is_leaf is not a string: {}", e.to_string()))?; | |
376 | 5.12k | match kind.as_str() { |
377 | 5.12k | "simple" | |
378 | 0 | _ => return Err( Error::Other(format!("Unknown argument value '{}' for IsNode", kind.as_str())) ), |
379 | }; | |
380 | ||
381 | 5.12k | let nodes = args.pop_nodeset() |
382 | 5.12k | if nodes.size() == 0 { |
383 | 0 | return Ok (Value::Boolean(false)); // like xpath, don't make this an error |
384 | 5.12k | }; |
385 | return Ok( | |
386 | Value::Boolean( | |
387 | 5.12k | nodes.iter() |
388 | 5.12k | .all(|node| |
389 | 5.39k | if let Node::Element(e) = node { |
390 | 5.39k | match kind.as_str() { |
391 | 5.39k | "simple" => |
392 | 3.09k | "leaf" => |
393 | 849 | "2D" => |
394 | 162 | "modified" => |
395 | 140 | "scripted" => |
396 | 49 | "mathml" => IsNode::is_mathml(e), |
397 | 0 | "common_fraction" => IsNode::is_common_fraction(e, usize::MAX, usize::MAX), |
398 | 0 | _ => true, // can't happen due to check above |
399 | } | |
400 | } else { | |
401 | // xpath is something besides an element, so no match | |
402 | 0 | false |
403 | 5.39k | } |
404 | ) | |
405 | ) | |
406 | ); | |
407 | ||
408 | 2.25k | fn is_leaf_any_name(e: Element) -> bool { |
409 | 2.25k | let children = e.children(); |
410 | 2.25k | if children.is_empty() { |
411 | 0 | return true; |
412 | 2.25k | } else if children.len() == 1 && |
413 | 1.24k | let ChildOfElement::Text(_) = children[0] { |
414 | 1.17k | return true; |
415 | 1.07k | } |
416 | 1.07k | return false |
417 | 2.25k | } |
418 | 5.12k | } |
419 | } | |
420 | ||
421 | struct ToOrdinal; | |
422 | impl ToOrdinal { | |
423 | // ordinals often have an irregular start (e.g., "half") before becoming regular. | |
424 | // if the number is irregular, return the ordinal form, otherwise return 'None'. | |
425 | 353 | fn compute_irregular_fractional_speech(number: &str, plural: bool) -> Option<String> { |
426 | 353 | SPEECH_DEFINITIONS.with(|definitions| { |
427 | 353 | let definitions = definitions.borrow(); |
428 | 353 | let words = if plural { |
429 | 208 | definitions.get_vec("NumbersOrdinalFractionalPluralOnes") |
430 | } else { | |
431 | 145 | definitions.get_vec("NumbersOrdinalFractionalOnes") |
432 | }; | |
433 | 353 | let number_as_int: usize = number.parse().unwrap(); // already verified it is only digits |
434 | 353 | if number_as_int < words.len() { |
435 | // use the words associated with this irregular pattern. | |
436 | 291 | return Some( words[number_as_int].clone() ); |
437 | 62 | }; |
438 | 62 | return None; |
439 | 353 | }) |
440 | 353 | } |
441 | ||
442 | /** | |
443 | * Translates a number of up to twelve digits into a string representation. | |
444 | * number -- the number to translate | |
445 | * fractional -- true if this is a fractional ordinal (e.g, "half") | |
446 | * plural -- true if answer should be plural | |
447 | * Returns the string representation of that number or an error message | |
448 | */ | |
449 | 416 | fn convert(number: &str, fractional: bool, plural: bool) -> Option<String> { |
450 | 2 | static NO_DIGIT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\d]").unwrap()); // match anything except a digit |
451 | 416 | return SPEECH_DEFINITIONS.with(|definitions| { |
452 | 416 | let definitions = definitions.borrow(); |
453 | 416 | let numbers_large = definitions.get_vec("NumbersLarge") |
454 | ||
455 | 416 | let pref_manager = crate::prefs::PreferenceManager::get(); |
456 | 416 | let pref_manager = pref_manager.borrow(); |
457 | 416 | let block_separators = pref_manager.pref_to_string("BlockSeparators"); |
458 | 416 | let decimal_separator = pref_manager.pref_to_string("DecimalSeparators"); |
459 | ||
460 | // check number validity (has digits, not a decimal) | |
461 | 416 | if number.is_empty() || number.contains(&decimal_separator) { |
462 | 0 | return Some(String::from(number)); |
463 | 416 | } |
464 | // remove any block separators | |
465 | 416 | let number = match clean_number(number, &block_separators) { |
466 | 0 | None => return Some(String::from(number)), |
467 | 416 | Some(num) => num, |
468 | }; | |
469 | ||
470 | // check to see if the number is too big or is not an integer or has non-digits | |
471 | 416 | if number.len() > 3*numbers_large.len() { |
472 | 0 | return Some(number); |
473 | 416 | } |
474 | 416 | if NO_DIGIT.is_match(&number) { |
475 | // this shouldn't have been part of an mn, so likely an error. Log a warning | |
476 | // FIX: log a warning that a non-number was passed to convert() | |
477 | 0 | return Some(number); |
478 | 416 | } |
479 | ||
480 | // first deal with the abnormalities of fractional ordinals (one half, etc). That simplifies what remains | |
481 | 416 | if fractional && |
482 | 353 | let Some( |
483 | 291 | return Some(string); |
484 | 125 | } |
485 | ||
486 | // at this point, we only need to worry about singular/plural distinction | |
487 | ||
488 | // break into groups of three digits and add 10^3 word (thousands, millions, ...) after each chunk | |
489 | // FIX: add a pause between groups of three -- need to use TTS-specific pause | |
490 | ||
491 | // handle special case of trailing zeros | |
492 | // num_thousands_at_end represents the amount to shift NumbersLarge... (e.g., millions->thousands) | |
493 | 243 | let |
494 | 122 | Some(n) => (number.len() - 1 - n) / 3 , |
495 | 3 | None => 0 |
496 | }; | |
497 | 125 | let (number,_) = number.split_at(number.len() - 3 * num_thousands_at_end); // drop the 0s |
498 | ||
499 | // everything is simplified if we add zeros at the start so that block size is a factor of 3 | |
500 | 125 | let number = match number.len() % 3 { |
501 | 18 | 0 => "".to_string() + number, |
502 | 69 | 1 => "00".to_string() + number, |
503 | 38 | _ => "0".to_string() + number, // can only be "2" -- compiler doesn't know there aren't other options |
504 | }; | |
505 | ||
506 | // At this point we have at least three "digits", and length is a multiple of 3 | |
507 | // We have already verified that there are only ASCII digits, so we can subtract '0' to get an index | |
508 | const ASCII_0: usize = 48; | |
509 | 125 | let digits = number.as_bytes() |
510 | 125 | .iter() |
511 | 411 | . |
512 | 125 | .collect::<Vec<usize>>(); |
513 | ||
514 | 125 | let mut answer = String::with_capacity(255); // reasonable max most of the time |
515 | 125 | let large_words = numbers_large; |
516 | 125 | if digits.len() > 3 { |
517 | // speak this first groups as cardinal numbers | |
518 | 7 | let words = [ |
519 | 7 | definitions.get_vec("NumbersHundreds") |
520 | 7 | definitions.get_vec("NumbersTens") |
521 | 7 | definitions.get_vec("NumbersOnes") |
522 | ]; | |
523 | 7 | answer = digits[0..digits.len()-3] |
524 | 7 | .chunks(3) |
525 | 7 | .enumerate() |
526 | 12 | . |
527 | 12 | if chunk[0] != 0 || |
528 | 7 | Some(ToOrdinal::hundreds_to_words(chunk, &words) |
529 | 7 | &large_words[num_thousands_at_end + digits.len()/3 - 1 - i] + " ") |
530 | } else { | |
531 | 5 | Some("".to_string()) |
532 | } | |
533 | 12 | }) |
534 | 7 | .collect::<Option<Vec<String>>>() |
535 | 7 | .join(""); // can't use " " because 1000567 would get extra space in the middle |
536 | 7 | if num_thousands_at_end > 0 { |
537 | // add on "billionths", etc and we are done | |
538 | 0 | let large_words = if plural { |
539 | 0 | definitions.get_vec("NumbersOrdinalPluralLarge") |
540 | } else { | |
541 | 0 | definitions.get_vec("NumbersOrdinalLarge") |
542 | }; | |
543 | 0 | return Some(answer + &large_words?[num_thousands_at_end]); |
544 | 7 | } |
545 | 118 | }; |
546 | ||
547 | // all that is left is to speak the hundreds part, possibly followed by "thousands", "billions", etc | |
548 | 125 | let words = match (num_thousands_at_end > 0, plural) { |
549 | (true, _) => [ | |
550 | 10 | definitions.get_vec("NumbersHundreds") |
551 | 10 | definitions.get_vec("NumbersTens") |
552 | 10 | definitions.get_vec("NumbersOnes") |
553 | ], | |
554 | (false, true) => [ | |
555 | 54 | definitions.get_vec("NumbersOrdinalPluralHundreds") |
556 | 54 | definitions.get_vec("NumbersOrdinalPluralTens") |
557 | 54 | definitions.get_vec("NumbersOrdinalPluralOnes") |
558 | ], | |
559 | (false, false) => [ | |
560 | 61 | definitions.get_vec("NumbersOrdinalHundreds") |
561 | 61 | definitions.get_vec("NumbersOrdinalTens") |
562 | 61 | definitions.get_vec("NumbersOrdinalOnes") |
563 | ], | |
564 | }; | |
565 | 125 | answer += &ToOrdinal::hundreds_to_words(&digits[digits.len()-3..], &words) |
566 | 125 | if num_thousands_at_end > 0 { |
567 | 10 | let large_words = if plural { |
568 | 3 | definitions.get_vec("NumbersOrdinalPluralLarge") |
569 | } else { | |
570 | 7 | definitions.get_vec("NumbersOrdinalLarge") |
571 | }; | |
572 | 10 | answer = answer + " " + &large_words[num_thousands_at_end]; |
573 | 115 | } |
574 | 125 | return Some(answer); |
575 | 416 | }); |
576 | ||
577 | /// Remove block separators and convert alphanumeric digits to ascii digits | |
578 | 416 | fn clean_number(number: &str, block_separators: &str) -> Option<String> { |
579 | 416 | let mut answer = String::with_capacity(number.len()); |
580 | 617 | for ch in |
581 | 617 | if block_separators.contains(ch) { |
582 | 0 | continue; |
583 | 617 | } |
584 | 617 | if ch.is_ascii_digit() { |
585 | 615 | answer.push(ch); |
586 | 615 | } else { |
587 | 2 | let shifted_ch = match ch { |
588 | 2 | '𝟎'..='𝟗' => ch as u32 -'𝟎' as u32 + '0' as u32, |
589 | 0 | '𝟘'..='𝟡' => ch as u32 -'𝟘' as u32 + '0' as u32, |
590 | 0 | '𝟢'..='𝟫' => ch as u32 -'𝟢' as u32 + '0' as u32, |
591 | 0 | '𝟬'..='𝟵' => ch as u32 -'𝟬' as u32 + '0' as u32, |
592 | 0 | '𝟶'..='𝟿' => ch as u32 -'𝟶' as u32 + '0' as u32, |
593 | 0 | _ => return None, |
594 | }; | |
595 | 2 | answer.push(char::from_u32(shifted_ch).unwrap()); |
596 | } | |
597 | } | |
598 | 416 | return Some(answer); |
599 | 416 | } |
600 | 416 | } |
601 | ||
602 | ||
603 | 132 | fn hundreds_to_words(number: &[usize], words: &[Ref<Vec<String>>; 3]) -> Option<String> { |
604 | 132 | assert!( number.len() == 3 ); |
605 | 132 | return SPEECH_DEFINITIONS.with(|definitions| { |
606 | 132 | let definitions = definitions.borrow(); |
607 | 132 | if number[0] != 0 && |
608 | 6 | return Some(words[0][number[0]].clone()); |
609 | 126 | } |
610 | ||
611 | 126 | let mut hundreds = definitions.get_vec("NumbersHundreds") |
612 | 126 | if !hundreds.is_empty() { |
613 | 18 | hundreds += " "; |
614 | 108 | } |
615 | ||
616 | 126 | if number[1] != 0 && |
617 | 26 | return Some(hundreds + &words[1][number[1]]); |
618 | 100 | } |
619 | ||
620 | 100 | if 10*number[1] < words[2].len() { |
621 | // usurp regular ordering to handle something like '14' | |
622 | 85 | return Some(hundreds + &words[2][10*number[1] + number[2]]); |
623 | } else { | |
624 | 15 | return Some(hundreds + &definitions.get_vec("NumbersTens") |
625 | } | |
626 | 132 | }); |
627 | 132 | } |
628 | } | |
629 | ||
630 | impl Function for ToOrdinal { | |
631 | // convert a node to an ordinal number | |
632 | 320 | fn evaluate<'d>(&self, |
633 | 320 | _context: &context::Evaluation<'_, 'd>, |
634 | 320 | args: Vec<Value<'d>>) |
635 | 320 | -> Result<Value<'d>, Error> |
636 | { | |
637 | 320 | let mut args = Args(args); |
638 | 320 | if let Err( |
639 | 0 | return Err( XPathError::Other(format!("ToOrdinal requires 1 or 3 args: {e}"))); |
640 | 320 | }; |
641 | 320 | let mut fractional = false; |
642 | 320 | let mut plural = false; |
643 | 320 | if args.len() == 3 { |
644 | 288 | plural = args.pop_boolean() |
645 | 288 | fractional = args.pop_boolean() |
646 | 32 | } |
647 | 320 | let node = validate_one_node(args.pop_nodeset() |
648 | 320 | return match node { |
649 | 0 | Node::Text(t) => Ok( Value::String( |
650 | 0 | match ToOrdinal::convert(t.text(), fractional, plural) { |
651 | 0 | None => t.text().to_string(), |
652 | 0 | Some(ord) => ord, |
653 | } ) ), | |
654 | 320 | Node::Element(e) => Ok( Value::String( |
655 | 320 | match ToOrdinal::convert(&get_text_from_element(e), fractional, plural) { |
656 | 0 | None => get_text_from_element(e).to_string(), |
657 | 320 | Some(ord) => ord, |
658 | } ) ), | |
659 | 0 | _ => Err( Error::ArgumentNotANodeset{actual: ArgumentType::String} ), |
660 | } | |
661 | 320 | } |
662 | } | |
663 | ||
664 | ||
665 | struct ToCommonFraction; | |
666 | ||
667 | impl Function for ToCommonFraction { | |
668 | // convert a node to a common fraction (if the numerator and denominator are within given limits) | |
669 | 34 | fn evaluate<'d>(&self, |
670 | 34 | _context: &context::Evaluation<'_, 'd>, |
671 | 34 | args: Vec<Value<'d>>) |
672 | 34 | -> Result<Value<'d>, Error> |
673 | { | |
674 | 34 | let mut args = Args(args); |
675 | 34 | args.exactly(1) |
676 | ||
677 | // FIX: should probably handle errors by logging them and then trying to evaluate any children | |
678 | 34 | let node = validate_one_node(args.pop_nodeset() |
679 | 34 | if let Node::Element(frac) = node { |
680 | 34 | if !IsNode::is_common_fraction(frac, usize::MAX, usize::MAX) { |
681 | 0 | return Err( Error::Other( format!("ToCommonFraction -- argument is not an 'mfrac': {}': ", mml_to_string(frac))) ); |
682 | 34 | } |
683 | ||
684 | // everything has been verified, so we can just get the pieces and ignore potential error results | |
685 | 34 | let children = frac.children(); |
686 | 34 | let num = children[0].element().unwrap(); |
687 | 34 | let num = get_text_from_element( num ); |
688 | 34 | let denom = children[1].element().unwrap(); |
689 | 34 | let denom = get_text_from_element( denom ); |
690 | 34 | let mut answer = num.clone() + " "; |
691 | 34 | answer += &match ToOrdinal::convert(&denom, true, num!="1") { |
692 | 0 | None => denom, |
693 | 34 | Some(ord) => ord, |
694 | }; | |
695 | ||
696 | 34 | return Ok( Value::String( answer ) ) |
697 | } else { | |
698 | 0 | return Err( Error::Other( "ToCommonFraction -- argument is not an element".to_string()) ); |
699 | } | |
700 | 34 | } |
701 | } | |
702 | ||
703 | struct Min; | |
704 | /** | |
705 | * Returns true the smallest of the two args | |
706 | * @param(num1) | |
707 | * @param(num2) | |
708 | */ | |
709 | impl Function for Min { | |
710 | ||
711 | 0 | fn evaluate<'d>(&self, |
712 | 0 | _context: &context::Evaluation<'_, 'd>, |
713 | 0 | args: Vec<Value<'d>>) |
714 | 0 | -> Result<Value<'d>, Error> |
715 | { | |
716 | 0 | let mut args = Args(args); |
717 | 0 | args.exactly(2)?; |
718 | 0 | let num1 = args.pop_number()?; |
719 | 0 | let num2 = args.pop_number()?; |
720 | 0 | return Ok( Value::Number( num1.min(num2) ) ); |
721 | 0 | } |
722 | } | |
723 | ||
724 | struct Max; | |
725 | ||
726 | impl Function for Max { | |
727 | ||
728 | 0 | fn evaluate<'d>(&self, |
729 | 0 | _context: &context::Evaluation<'_, 'd>, |
730 | 0 | args: Vec<Value<'d>>) |
731 | 0 | -> Result<Value<'d>, Error> |
732 | { | |
733 | 0 | let mut args = Args(args); |
734 | 0 | args.exactly(2)?; |
735 | 0 | let num1 = args.pop_number()?; |
736 | 0 | let num2 = args.pop_number()?; |
737 | 0 | return Ok( Value::Number( num1.max(num2) ) ); |
738 | 0 | } |
739 | } | |
740 | ||
741 | ||
742 | struct BaseNode; | |
743 | /** | |
744 | * Returns true if the node is a large op | |
745 | * @param(node) -- node(s) to test -- should be an <mo> | |
746 | */ | |
747 | impl BaseNode { | |
748 | /// Recursively find the base node | |
749 | /// The base node of a non scripted element is the element itself | |
750 | 1.26k | fn base_node(node: Element) -> Element { |
751 | 1.26k | let name = node.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(node)); |
752 | 1.26k | if ["msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"].contains(&name) { |
753 | 97 | return BaseNode::base_node(as_element(node.children()[0])); |
754 | } else { | |
755 | 1.16k | return node; |
756 | } | |
757 | 1.26k | } |
758 | } | |
759 | impl Function for BaseNode { | |
760 | ||
761 | 1.16k | fn evaluate<'d>(&self, |
762 | 1.16k | _context: &context::Evaluation<'_, 'd>, |
763 | 1.16k | args: Vec<Value<'d>>) |
764 | 1.16k | -> Result<Value<'d>, Error> |
765 | { | |
766 | 1.16k | let mut args = Args(args); |
767 | 1.16k | args.exactly(1) |
768 | 1.16k | let node = validate_one_node(args.pop_nodeset() |
769 | 1.16k | if let Node::Element(e) = node { |
770 | 1.16k | let mut node_set = Nodeset::new(); |
771 | 1.16k | node_set.add(BaseNode::base_node(e)); |
772 | 1.16k | return Ok( Value::Nodeset(node_set) ); |
773 | } else { | |
774 | // xpath is something besides an element, so no match | |
775 | 0 | return Err( Error::Other("Argument other than a node given to BaseNode".to_string()) ); |
776 | } | |
777 | 1.16k | } |
778 | } | |
779 | ||
780 | ||
781 | struct IfThenElse; | |
782 | impl Function for IfThenElse { | |
783 | 36.2k | fn evaluate<'d>(&self, |
784 | 36.2k | _context: &context::Evaluation<'_, 'd>, |
785 | 36.2k | args: Vec<Value<'d>>) |
786 | 36.2k | -> Result<Value<'d>, Error> |
787 | { | |
788 | 36.2k | let args = Args(args); |
789 | 36.2k | args.exactly(3) |
790 | 36.2k | let if_val = &args[0]; |
791 | 36.2k | let then_val = &args[1]; |
792 | 36.2k | let else_val = &args[2]; |
793 | 36.2k | let is_true = match if_val { |
794 | 14.5k | Value::Nodeset(nodes) => nodes.size() > 0, |
795 | 21.7k | Value::Boolean(b) => *b, |
796 | 0 | Value::Number(f) => *f != 0.0, |
797 | 0 | Value::String(s) => !s.is_empty(), |
798 | }; | |
799 | 36.2k | return Ok( if is_true { |
800 | 36.2k | } |
801 | } | |
802 | ||
803 | ||
804 | struct Debug; | |
805 | /** | |
806 | * Prints it's argument along with the string that was evaluated | |
807 | * @param(node) -- node(s) to be evaluated/printed | |
808 | * @param(string) -- string showing what is being evaluated | |
809 | */ | |
810 | impl Function for Debug { | |
811 | ||
812 | 348 | fn evaluate<'d>(&self, |
813 | 348 | _context: &context::Evaluation<'_, 'd>, |
814 | 348 | args: Vec<Value<'d>>) |
815 | 348 | -> Result<Value<'d>, Error> |
816 | { | |
817 | 348 | let mut args = Args(args); |
818 | 348 | args.exactly(2) |
819 | 348 | let xpath_str = args.pop_string() |
820 | 348 | let eval_result = &args[0]; |
821 | 348 | debug!(" -- Debug: value of '{xpath_str}' is "); |
822 | 348 | match eval_result { |
823 | 78 | Value::Nodeset(nodes) => { |
824 | 78 | if nodes.size() == 0 { |
825 | 0 | debug!("0 nodes (false)"); |
826 | } else { | |
827 | 78 | let singular = nodes.size()==1; |
828 | 78 | debug!("{} node{}. {}:", |
829 | 0 | if singular {""} else {"s"}, |
830 | 0 | if singular {"Node is"} else {"Nodes are"}); |
831 | 78 | nodes.document_order() |
832 | 78 | .iter() |
833 | 78 | .enumerate() |
834 | 78 | .for_each(|(i, node)| { |
835 | 78 | match node { |
836 | 78 | Node::Element(mathml) => debug!("#{}:\n{}", |
837 | 0 | i, mml_to_string(*mathml)), |
838 | 0 | _ => debug!("'{node:?}'"), |
839 | } | |
840 | 78 | }) |
841 | } | |
842 | }, | |
843 | 270 | _ => debug!("'{eval_result:?}'"), |
844 | } | |
845 | 348 | return Ok( eval_result.clone() ); |
846 | 348 | } |
847 | } | |
848 | ||
849 | ||
850 | /// Should be an internal structure for implementation of the IsBracketed, but it was useful in one place in a separate module. | |
851 | /// This should probably be restructured slightly. | |
852 | pub struct IsBracketed; | |
853 | impl IsBracketed { | |
854 | 139k | pub fn is_bracketed(element: Element, left: &str, right: &str, requires_comma: bool, requires_mrow: bool) -> bool { |
855 | use crate::canonicalize::is_fence; | |
856 | 139k | if requires_mrow && |
857 | 18.8k | return false; |
858 | 120k | } |
859 | 120k | let children = element.children(); |
860 | 120k | let n_children = children.len(); |
861 | 120k | if (n_children == 0 || |
862 | 120k | !left.is_empty() && |
863 | 116k | requires_comma && |
864 | // not enough argument for there to be a match | |
865 | 4.44k | return false; |
866 | 115k | } |
867 | ||
868 | 115k | let first_child = as_element(children[0]); |
869 | 115k | let last_child = as_element(children[children.len()-1]); |
870 | // debug!("first_child: {}", crate::pretty_print::mml_to_string(first_child)); | |
871 | // debug!("last_child: {}", crate::pretty_print::mml_to_string(last_child)); | |
872 | 115k | if (left.is_empty() && ( |
873 | 106k | (right.is_empty() && ( |
874 | 9.61k | return false; |
875 | 106k | } |
876 | ||
877 | 106k | if !left.is_empty() && |
878 | 6.14k | !right.is_empty() && |
879 | // left or right don't match | |
880 | 101k | return false; |
881 | 5.12k | } |
882 | ||
883 | 5.12k | if requires_comma { |
884 | 445 | if let ChildOfElement::Element(contents) = children[1] { |
885 | 445 | let children = contents.children(); |
886 | 445 | if !is_tag(contents, "mrow") || |
887 | 197 | return false; |
888 | 248 | } |
889 | // finally, we can check for a comma -- we might not have operands, so we to check first and second entry | |
890 | 248 | if get_text_from_COE(&children[0]).as_str() == "," { |
891 | 1 | return true; |
892 | 247 | } |
893 | 247 | if children.len() > 1 && get_text_from_COE(&children[1]).as_str() == "," { |
894 | 133 | return true; |
895 | 114 | } |
896 | 0 | } |
897 | 114 | return false; |
898 | } else { | |
899 | 4.67k | return true; |
900 | } | |
901 | 139k | } |
902 | } | |
903 | ||
904 | /** | |
905 | * Returns true if the node is a bracketed expr with the indicated left/right chars | |
906 | * node -- node(s) to test | |
907 | * left -- string (like "[") or empty | |
908 | * right -- string (like "]") or empty | |
909 | * requires_comma - boolean, optional (check the top level of 'node' for commas) | |
910 | */ | |
911 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
912 | impl Function for IsBracketed { | |
913 | 115k | fn evaluate<'d>(&self, |
914 | 115k | _context: &context::Evaluation<'_, 'd>, |
915 | 115k | args: Vec<Value<'d>>) |
916 | 115k | -> Result<Value<'d>, Error> |
917 | { | |
918 | 115k | let mut args = Args(args); |
919 | 115k | args.at_least(3) |
920 | 115k | args.at_most(5) |
921 | 115k | let mut requires_comma = false; |
922 | 115k | let mut requires_mrow = true; |
923 | 115k | if args.len() == 5 { |
924 | 0 | requires_mrow = args.pop_boolean()?; |
925 | 115k | } |
926 | 115k | if args.len() >= 4 { |
927 | 15 | requires_comma = args.pop_boolean() |
928 | 115k | } |
929 | 115k | let right = args.pop_string() |
930 | 115k | let left = args.pop_string() |
931 | return Ok( Value::Boolean( | |
932 | 115k | match validate_one_node(args.pop_nodeset() |
933 | 0 | Err(_) => false, // be fault tolerant, like xpath, |
934 | 115k | Ok(node) => { |
935 | 115k | if let Node::Element(e) = node { |
936 | 115k | IsBracketed::is_bracketed(e, &left, &right, requires_comma, requires_mrow) |
937 | } else { | |
938 | 0 | false |
939 | } | |
940 | } | |
941 | }) ); | |
942 | 115k | } |
943 | } | |
944 | ||
945 | pub struct IsInDefinition; | |
946 | impl IsInDefinition { | |
947 | /// Returns true if `test_str` is in `set_name` | |
948 | /// Returns an error if `set_name` is not defined | |
949 | 11.0k | pub fn is_defined_in(test_str: &str, defs: &'static LocalKey<RefCell<Definitions>>, set_name: &str) -> Result<bool, Error> { |
950 | 11.0k | return defs.with(|definitions| { |
951 | 11.0k | if let Some( |
952 | 11.0k | return Ok( set.contains(test_str) ); |
953 | 12 | } |
954 | 12 | if let Some(hashmap) = definitions.borrow().get_hashmap(set_name) { |
955 | 12 | return Ok( hashmap.contains_key(test_str) ); |
956 | 0 | } |
957 | 0 | return Err( Error::Other( format!("\n IsInDefinition: '{set_name}' is not defined in definitions.yaml") ) ); |
958 | 11.0k | }); |
959 | 11.0k | } |
960 | } | |
961 | ||
962 | /** | |
963 | * Returns true if the text is contained in the set defined in Speech or Braille. | |
964 | * element/string -- element (converted to string)/string to test | |
965 | * speech or braille | |
966 | * set_name -- the set in which the string is to be searched | |
967 | */ | |
968 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
969 | impl Function for IsInDefinition { | |
970 | 12.0k | fn evaluate<'d>(&self, |
971 | 12.0k | _context: &context::Evaluation<'_, 'd>, |
972 | 12.0k | args: Vec<Value<'d>>) |
973 | 12.0k | -> Result<Value<'d>, Error> |
974 | { | |
975 | 12.0k | let mut args = Args(args); |
976 | // FIX: temporarily accept two args as assume SPEECH_DEFINITIONS until the Rule files are fixed | |
977 | 12.0k | args.at_least(2) |
978 | 12.0k | args.at_most(3) |
979 | 12.0k | let set_name = args.pop_string() |
980 | // FIX: this (len == 1) is temporary until all the usages are switched to the (new) 3-arg form | |
981 | 12.0k | let definitions = if args.len() == 2 { |
982 | 10.4k | match args.pop_string() |
983 | 10.4k | "Speech" => |
984 | 9.09k | "Braille" => &BRAILLE_DEFINITIONS, |
985 | 0 | _ => return Err( Error::Other("IsInDefinition:: second argument must be either 'Speech' or 'Braille'".to_string()) ) |
986 | } | |
987 | } else { | |
988 | 1.61k | &SPEECH_DEFINITIONS |
989 | }; | |
990 | 12.0k | match &args[0] { |
991 | 5.04k | Value::String(str) => return match IsInDefinition::is_defined_in(str, definitions, &set_name) { |
992 | 5.04k | Ok(result) => Ok( Value::Boolean( result ) ), |
993 | 0 | Err(e) => Err(e), |
994 | }, | |
995 | 7.02k | Value::Nodeset(nodes) => { |
996 | 7.02k | return if nodes.size() == 0 { |
997 | 0 | Ok( Value::Boolean(false) ) // trivially not in definition |
998 | } else { | |
999 | 7.02k | let node = validate_one_node(nodes.clone(), "IsInDefinition") |
1000 | 7.02k | if let Node::Element(e) = node { |
1001 | 7.02k | let text = get_text_from_element(e); |
1002 | 7.02k | if text.is_empty() { |
1003 | 979 | Ok( Value::Boolean(false) ) |
1004 | } else { | |
1005 | 6.04k | match IsInDefinition::is_defined_in(&text, definitions, &set_name) { |
1006 | 6.04k | Ok(result) => Ok( Value::Boolean( result ) ), |
1007 | 0 | Err(e) => Err(e), |
1008 | } | |
1009 | } | |
1010 | } else { | |
1011 | 0 | Ok( Value::Boolean(false)) // trivially not in definition } |
1012 | } | |
1013 | } | |
1014 | }, | |
1015 | 0 | _ => Err( Error::Other("IsInDefinition:: neither a node nor a string is passed for first argument".to_string()) ), |
1016 | } | |
1017 | 12.0k | } |
1018 | } | |
1019 | ||
1020 | ||
1021 | pub struct DefinitionValue; | |
1022 | impl DefinitionValue { | |
1023 | /// Returns the value associated with `key` in `set_name`. If `key` is not in `set_name`, an empty string is returned | |
1024 | /// Returns an error if `set_name` is not defined | |
1025 | 12.7k | pub fn definition_value(key: &str, defs: &'static LocalKey<RefCell<Definitions>>, set_name: &str) -> Result<String, Error> { |
1026 | 12.7k | return defs.with(|definitions| { |
1027 | 12.7k | if let Some(map) = definitions.borrow().get_hashmap(set_name) { |
1028 | 12.7k | return Ok( match map.get(key) { |
1029 | 5.64k | None => "".to_string(), |
1030 | 7.09k | Some(str) => str.clone(), |
1031 | }); | |
1032 | 0 | } |
1033 | 0 | return Err( Error::Other( format!("\n DefinitionValue: '{set_name}' is not defined in definitions.yaml") ) ); |
1034 | 12.7k | }); |
1035 | 12.7k | } |
1036 | } | |
1037 | ||
1038 | /** | |
1039 | * Returns true if the node is a bracketed expr with the indicated left/right chars | |
1040 | * element/string -- element (converted to string)/string to test | |
1041 | * left -- string (like "[") or empty | |
1042 | * right -- string (like "]") or empty | |
1043 | * requires_comma - boolean, optional (check the top level of 'node' for commas | |
1044 | */ | |
1045 | // 'requiresComma' is useful for checking parenthesized expressions vs function arg lists and other lists | |
1046 | impl Function for DefinitionValue { | |
1047 | 13.1k | fn evaluate<'d>(&self, |
1048 | 13.1k | _context: &context::Evaluation<'_, 'd>, |
1049 | 13.1k | args: Vec<Value<'d>>) |
1050 | 13.1k | -> Result<Value<'d>, Error> |
1051 | { | |
1052 | 13.1k | let mut args = Args(args); |
1053 | 13.1k | args.exactly(3) |
1054 | 13.1k | let set_name = args.pop_string() |
1055 | 13.1k | let definitions = match args.pop_string() |
1056 | 13.1k | "Speech" => |
1057 | 12 | "Braille" => &BRAILLE_DEFINITIONS, |
1058 | 0 | _ => return Err( Error::Other("IsInDefinition:: second argument must be either 'Speech' or 'Braille'".to_string()) ) |
1059 | }; | |
1060 | 13.1k | match &args[0] { |
1061 | 5.04k | Value::String(str) => return match DefinitionValue::definition_value(str, definitions, &set_name) { |
1062 | 5.04k | Ok(result) => Ok( Value::String( result ) ), |
1063 | 0 | Err(e) => Err(e), |
1064 | }, | |
1065 | 8.10k | Value::Nodeset(nodes) => { |
1066 | 8.10k | return if nodes.size() == 0 { |
1067 | 0 | Ok( Value::String("".to_string()) ) // trivially not in definition |
1068 | } else { | |
1069 | 8.10k | let node = validate_one_node(nodes.clone(), "DefinitionValue") |
1070 | 8.10k | if let Node::Element( |
1071 | 8.10k | let text = get_text_from_element(e); |
1072 | 8.10k | if text.is_empty() { |
1073 | 410 | Ok( Value::String("".to_string()) ) |
1074 | } else { | |
1075 | 7.69k | match DefinitionValue::definition_value(&text, definitions, &set_name) { |
1076 | 7.69k | Ok(result) => Ok( Value::String( result ) ), |
1077 | 0 | Err(e) => Err(e), |
1078 | } | |
1079 | } | |
1080 | } else { | |
1081 | 3 | Ok( Value::String("".to_string()) ) // trivially not in definition } |
1082 | } | |
1083 | } | |
1084 | }, | |
1085 | 0 | _ => Err( Error::Other("DefinitionValue:: neither a node nor a string is passed for first argument".to_string()) ), |
1086 | } | |
1087 | 13.1k | } |
1088 | } | |
1089 | ||
1090 | pub struct DistanceFromLeaf; | |
1091 | impl DistanceFromLeaf { | |
1092 | 240 | fn distance(element: Element, use_left_side: bool, treat_2d_elements_as_tokens: bool) -> usize { |
1093 | // FIX: need to handle char level (i.e., chars in a leaf element) | |
1094 | 240 | let mut element = element; |
1095 | 240 | let mut distance = 1; |
1096 | loop { | |
1097 | // debug!("distance={} -- element: {}", distance, mml_to_string(element)); | |
1098 | 361 | if MATHML_LEAF_NODES.contains(element.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(element))) { |
1099 | 199 | return distance; |
1100 | 162 | } |
1101 | 162 | if treat_2d_elements_as_tokens && ( |
1102 | 41 | return distance; |
1103 | 121 | } |
1104 | 121 | let children = element.children(); |
1105 | 121 | assert!(!children.is_empty()); |
1106 | 121 | element = as_element( if use_left_side { |
1107 | 121 | distance += 1; |
1108 | } | |
1109 | 240 | } |
1110 | } | |
1111 | ||
1112 | /** | |
1113 | * Returns distance from the current node to the leftmost/rightmost leaf (if char, then = 0, if token, then 1). | |
1114 | * If the node is a bracketed expr with the indicated left/right chars | |
1115 | * node -- node(s) to test | |
1116 | * left_side -- (bool) traverse leftmost child to leaf | |
1117 | * treat2D_elements_as_tokens -- (bool) 2D notations such as fractions are treated like leaves | |
1118 | */ | |
1119 | impl Function for DistanceFromLeaf { | |
1120 | 240 | fn evaluate<'d>(&self, |
1121 | 240 | _context: &context::Evaluation<'_, 'd>, |
1122 | 240 | args: Vec<Value<'d>>) |
1123 | 240 | -> Result<Value<'d>, Error> |
1124 | { | |
1125 | 240 | let mut args = Args(args); |
1126 | 240 | args.exactly(3) |
1127 | 240 | let treat_2d_elements_as_tokens = args.pop_boolean() |
1128 | 240 | let use_left_side = args.pop_boolean() |
1129 | 240 | let node = validate_one_node(args.pop_nodeset() |
1130 | 240 | if let Node::Element(e) = node { |
1131 | 240 | return Ok( Value::Number( DistanceFromLeaf::distance(e, use_left_side, treat_2d_elements_as_tokens) as f64) ); |
1132 | 0 | } |
1133 | ||
1134 | // FIX: should having a non-element be an error instead?? | |
1135 | 0 | return Err(Error::Other(format!("DistanceFromLeaf: first arg '{node:?}' is not a node"))); |
1136 | 240 | } |
1137 | } | |
1138 | ||
1139 | ||
1140 | ||
1141 | pub struct EdgeNode; | |
1142 | impl EdgeNode { | |
1143 | // Return the root of the ancestor tree if we are at the left/right side of a path from that to 'element' | |
1144 | 2.09k | fn edge_node<'a>(element: Element<'a>, use_left_side: bool, stop_node_name: &str) -> Option<Element<'a>> { |
1145 | 2.09k | let element_name = element.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(element)); |
1146 | 2.09k | if element_name == "math" { |
1147 | 86 | return Some(element); |
1148 | 2.00k | }; |
1149 | ||
1150 | 2.00k | let parent = get_parent(element); // there is always a "math" node |
1151 | 2.00k | let parent_name = parent.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(parent)); |
1152 | ||
1153 | // first check to see if we have the special case of punctuation as last child of math/mrow element | |
1154 | // it only matters if we are looking at the right edge | |
1155 | ||
1156 | // debug!("EdgeNode: there are {} preceding siblings",element.preceding_siblings().len() ); | |
1157 | 2.00k | if use_left_side && |
1158 | 587 | return None; |
1159 | 1.41k | }; |
1160 | ||
1161 | 1.41k | if !use_left_side && |
1162 | // check for the special case that the parent is an mrow and the grandparent is <math> and we have punctuation | |
1163 | 574 | let grandparent = get_parent(parent); |
1164 | 574 | let grandparent_name = grandparent.attribute_value(MATHML_FROM_NAME_ATTR).unwrap_or(name(grandparent)); |
1165 | 574 | if grandparent_name == "math" && |
1166 | 105 | parent_name == "mrow" && |
1167 | 11 | let text = get_text_from_element( as_element(parent.children()[1]) ); |
1168 | 11 | if text == "," || text == "." || |
1169 | 1 | return Some(grandparent); |
1170 | 10 | } |
1171 | 563 | } |
1172 | 573 | return None; |
1173 | 843 | }; |
1174 | ||
1175 | // at an edge -- check to see the parent is desired root | |
1176 | 843 | if parent_name == stop_node_name || |
1177 | 735 | (stop_node_name == "2D" && |
1178 | 176 | return Some(parent); |
1179 | 667 | }; |
1180 | ||
1181 | // debug!("EdgeNode: recurse to {}", parent_name); | |
1182 | 667 | return EdgeNode::edge_node(parent, use_left_side, stop_node_name) |
1183 | 2.09k | } |
1184 | } | |
1185 | ||
1186 | // EdgeNode(node, "left"/"right", stopNodeName) | |
1187 | // -- returns the stopNode if at left/right edge of named ancestor node. "stopNodeName' can also be "2D' | |
1188 | // returns original node match isn't found | |
1189 | // Note: if stopNodeName=="math", then punctuation is taken into account since it isn't really part of the math | |
1190 | impl Function for EdgeNode { | |
1191 | 1.41k | fn evaluate<'d>(&self, |
1192 | 1.41k | _context: &context::Evaluation<'_, 'd>, |
1193 | 1.41k | args: Vec<Value<'d>>) |
1194 | 1.41k | -> Result<Value<'d>, Error> |
1195 | { | |
1196 | 1.41k | let mut args = Args(args); |
1197 | 1.41k | args.exactly(3) |
1198 | 1.41k | let stop_node_name = args.pop_string() |
1199 | 1.41k | let use_left_side = args.pop_string() |
1200 | 1.41k | let node = validate_one_node(args.pop_nodeset() |
1201 | 1.41k | if let Node::Element(e) = node { |
1202 | 1.41k | let result = match EdgeNode::edge_node(e, use_left_side, &stop_node_name) { |
1203 | 260 | Some(found) => found, |
1204 | 1.15k | None => e, |
1205 | }; | |
1206 | 1.41k | let mut node_set = Nodeset::new(); |
1207 | 1.41k | node_set.add(result); |
1208 | 1.41k | return Ok( Value::Nodeset(node_set) ); |
1209 | 0 | } |
1210 | ||
1211 | // FIX: should having a non-element be an error instead?? | |
1212 | 0 | return Err(Error::Other(format!("EdgeNode: first arg '{node:?}' is not a node"))); |
1213 | 1.41k | } |
1214 | } | |
1215 | ||
1216 | pub struct SpeakIntentName; | |
1217 | /// SpeakIntentName(intent, verbosity) | |
1218 | /// Returns a string corresponding to the intent name with the indicated verbosity | |
1219 | impl Function for SpeakIntentName { | |
1220 | 340 | fn evaluate<'d>(&self, |
1221 | 340 | _context: &context::Evaluation<'_, 'd>, |
1222 | 340 | args: Vec<Value<'d>>) |
1223 | 340 | -> Result<Value<'d>, Error> |
1224 | { | |
1225 | 340 | let mut args = Args(args); |
1226 | 340 | args.exactly(3) |
1227 | 340 | let fixity = args.pop_string() |
1228 | 340 | let verbosity = args.pop_string() |
1229 | 340 | let intent_name = args.pop_string() |
1230 | 340 | return Ok( Value::String(crate::infer_intent::intent_speech_for_name(&intent_name, &verbosity, &fixity)) ); |
1231 | 340 | } |
1232 | } | |
1233 | ||
1234 | pub struct GetBracketingIntentName; | |
1235 | /// GetBracketingIntentName(name, verbosity, at_start_or_end) | |
1236 | /// Returns a potentially empty string to use to bracket an intent expression (start foo... end foo) | |
1237 | /// | |
1238 | impl GetBracketingIntentName { | |
1239 | 61 | fn bracketing_words(intent_name: &str, verbosity: &str, fixity: &str, at_start: bool) -> String { |
1240 | 61 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1241 | 61 | let definitions = definitions.borrow(); |
1242 | 61 | if let Some( |
1243 | // Split the pattern is: fixity-def [|| fixity-def]* | |
1244 | // fixity-def := fixity=open; verbosity; close | |
1245 | // verbosity := terse | medium | verbose | |
1246 | 68 | if let Some( |
1247 | 57 | let (_, matched_intent) = matched_intent.split_once("=").unwrap_or_default(); |
1248 | 57 | let parts = matched_intent.trim().split(";").collect::<Vec<&str>>(); |
1249 | 57 | if parts.len() == 1 { |
1250 | 30 | return "".to_string(); |
1251 | 27 | } |
1252 | 27 | if parts.len() != 3 { |
1253 | 0 | error!("Intent '{}' has {} ';' separated parts, should have 3", intent_name, parts.len()); |
1254 | 0 | return "".to_string(); |
1255 | 27 | } |
1256 | 27 | let mut speech = (if at_start { |
1257 | 27 | match speech.len() { |
1258 | 20 | 1 => return speech[0].to_string(), |
1259 | 2 | 3 => { | |
1260 | 7 | if speech.len() == 2 { |
1261 | 0 | warn!("Intent '{intent_name}' has only two ':' separated parts, but should have three"); |
1262 | 0 | speech.push(speech[1]); |
1263 | 7 | } |
1264 | 7 | let bracketing_words = match verbosity { |
1265 | 7 | "Terse" => |
1266 | 7 | "Medium" => speech[1], |
1267 | 0 | _ => speech[2], |
1268 | }; | |
1269 | 7 | return bracketing_words.to_string(); |
1270 | }, | |
1271 | _ => { | |
1272 | 0 | error!("Intent '{}' has too many ({}) operator names, should only have 2", intent_name, speech.len()); |
1273 | }, | |
1274 | } | |
1275 | 0 | } |
1276 | 4 | }; |
1277 | 4 | return "".to_string(); |
1278 | 61 | }) |
1279 | 61 | } |
1280 | } | |
1281 | ||
1282 | impl Function for GetBracketingIntentName { | |
1283 | 61 | fn evaluate<'d>(&self, |
1284 | 61 | _context: &context::Evaluation<'_, 'd>, |
1285 | 61 | args: Vec<Value<'d>>) |
1286 | 61 | -> Result<Value<'d>, Error> |
1287 | { | |
1288 | 61 | let mut args = Args(args); |
1289 | 61 | args.exactly(4) |
1290 | 61 | let start_or_end = args.pop_string() |
1291 | 61 | if start_or_end != "start" && |
1292 | 0 | return Err( Error::Other("GetBracketingIntentName: first argument must be either 'start' or 'end'".to_string()) ); |
1293 | 61 | } |
1294 | 61 | let fixity = args.pop_string() |
1295 | 61 | let verbosity = args.pop_string() |
1296 | 61 | let name = args.pop_string() |
1297 | 61 | return Ok( Value::String(GetBracketingIntentName:: bracketing_words(&name, &verbosity, &fixity, start_or_end == "start")) ); |
1298 | 61 | } |
1299 | } | |
1300 | ||
1301 | pub struct GetNavigationPartName; | |
1302 | /// GetNavigationPartName(name, index) | |
1303 | /// Returns the name to use to speak the part of a navigation expression (e.g., 'numerator', 'denominator', 'base', 'exponent', ...). | |
1304 | /// If there is no match, an empty string is returned. | |
1305 | /// 'index' is 0-based | |
1306 | /// | |
1307 | impl GetNavigationPartName { | |
1308 | 129 | fn navigation_part_name(intent_name: &str, index: usize) -> String { |
1309 | 129 | crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { |
1310 | 129 | let definitions = definitions.borrow(); |
1311 | 129 | if let Some(navigation_names) = definitions.get_hashmap("NavigationParts") && |
1312 | 129 | let Some( |
1313 | // Split the pattern is: part [; part]* | |
1314 | 105 | if let Some(part_name) = nav_part_names.trim().split(";").nth(index) { |
1315 | 105 | return part_name.trim().to_string(); |
1316 | 0 | } |
1317 | 24 | } |
1318 | 24 | return "".to_string(); |
1319 | 129 | }) |
1320 | 129 | } |
1321 | } | |
1322 | ||
1323 | impl Function for GetNavigationPartName { | |
1324 | 129 | fn evaluate<'d>(&self, |
1325 | 129 | _context: &context::Evaluation<'_, 'd>, |
1326 | 129 | args: Vec<Value<'d>>) |
1327 | 129 | -> Result<Value<'d>, Error> |
1328 | { | |
1329 | 129 | let mut args = Args(args); |
1330 | 129 | args.exactly(2) |
1331 | 129 | let index = args.pop_number() |
1332 | 129 | let name = args.pop_string() |
1333 | 129 | return Ok( Value::String(GetNavigationPartName:: navigation_part_name(&name, index)) ); |
1334 | 129 | } |
1335 | } | |
1336 | ||
1337 | pub struct FontSizeGuess; | |
1338 | /// FontSizeGuess(size_string) | |
1339 | /// returns a guess of the size in "ems" | |
1340 | /// Examples: | |
1341 | /// "0.278em" -> 0.278 | |
1342 | /// "" | |
1343 | // returns original node match isn't found | |
1344 | impl FontSizeGuess { | |
1345 | 224 | pub fn em_from_value(value_with_unit: &str) -> f64 { |
1346 | // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?) | |
1347 | 3 | static FONT_VALUE: LazyLock<Regex> = LazyLock::new(|| { Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap() }); |
1348 | 224 | let cap = FONT_VALUE.captures(value_with_unit); |
1349 | 224 | if let Some( |
1350 | 200 | if cap.len() == 3 { |
1351 | 200 | let multiplier = match &cap[2] { // guess based on 12pt font to convert to ems |
1352 | 200 | "px" => |
1353 | 200 | "cm" => |
1354 | 200 | "mm" => |
1355 | 200 | "Q" => |
1356 | 200 | "in" => |
1357 | 177 | "pc" => |
1358 | 177 | "pt" => |
1359 | 171 | "ex" => |
1360 | 171 | "em" => 1.0, |
1361 | 0 | "rem" => 16.0/12.0, |
1362 | 0 | default => {debug!("unit='{default}'"); 10.0} |
1363 | }; | |
1364 | // debug!("FontSizeGuess: {}->{}, val={}, multiplier={}", value_with_unit, value*multiplier, value, multiplier); | |
1365 | 200 | return cap[1].parse::<f64>().unwrap_or(0.0) * multiplier; |
1366 | } else { | |
1367 | 0 | return 0.0; // something bad happened |
1368 | } | |
1369 | }else { | |
1370 | 24 | let multiplier = match value_with_unit { // guess based on 12pt font to convert to ems |
1371 | 24 | "veryverythinspace" => |
1372 | 24 | "verythinspace" => |
1373 | 24 | "thinspace" => |
1374 | 24 | "mediumspace" => |
1375 | 24 | "thickspace" => |
1376 | 24 | "verythickspace" => |
1377 | 24 | "veryverythickspace" => |
1378 | 24 | _ => 0.0, |
1379 | }; | |
1380 | 24 | return multiplier; |
1381 | } | |
1382 | 224 | } |
1383 | } | |
1384 | impl Function for FontSizeGuess { | |
1385 | 0 | fn evaluate<'d>(&self, |
1386 | 0 | _context: &context::Evaluation<'_, 'd>, |
1387 | 0 | args: Vec<Value<'d>>) |
1388 | 0 | -> Result<Value<'d>, Error> |
1389 | { | |
1390 | 0 | let mut args = Args(args); |
1391 | 0 | args.exactly(1)?; |
1392 | 0 | let value_with_unit = args.pop_string()?; |
1393 | 0 | let em_value = FontSizeGuess::em_from_value(&value_with_unit); |
1394 | 0 | return Ok( Value::Number(em_value) ); |
1395 | 0 | } |
1396 | } | |
1397 | ||
1398 | pub struct ReplaceAll; | |
1399 | /// ReplaceAll(haystack, needle, replacement) | |
1400 | /// Returns a string with all occurrences of 'needle' replaced with 'replacement' | |
1401 | impl Function for ReplaceAll { | |
1402 | 0 | fn evaluate<'d>(&self, |
1403 | 0 | _context: &context::Evaluation<'_, 'd>, |
1404 | 0 | args: Vec<Value<'d>>) |
1405 | 0 | -> Result<Value<'d>, Error> |
1406 | { | |
1407 | 0 | let mut args = Args(args); |
1408 | 0 | args.exactly(3)?; |
1409 | 0 | let replacement = args.pop_string()?; |
1410 | 0 | let needle = args.pop_string()?; |
1411 | 0 | let haystack = args.pop_string()?; |
1412 | 0 | return Ok( Value::String(haystack.replace(&needle, &replacement)) ); |
1413 | 0 | } |
1414 | } | |
1415 | ||
1416 | /// Add all the functions defined in this module to `context`. | |
1417 | 22.7k | pub fn add_builtin_functions(context: &mut Context) { |
1418 | 22.7k | context.set_function("NestingChars", crate::braille::NemethNestingChars); |
1419 | 22.7k | context.set_function("BrailleChars", crate::braille::BrailleChars); |
1420 | 22.7k | context.set_function("NeedsToBeGrouped", crate::braille::NeedsToBeGrouped); |
1421 | 22.7k | context.set_function("IsNode", IsNode); |
1422 | 22.7k | context.set_function("ToOrdinal", ToOrdinal); |
1423 | 22.7k | context.set_function("ToCommonFraction", ToCommonFraction); |
1424 | 22.7k | context.set_function("IsBracketed", IsBracketed); |
1425 | 22.7k | context.set_function("IsInDefinition", IsInDefinition); |
1426 | 22.7k | context.set_function("DefinitionValue", DefinitionValue); |
1427 | 22.7k | context.set_function("BaseNode", BaseNode); |
1428 | 22.7k | context.set_function("IfThenElse", IfThenElse); |
1429 | 22.7k | context.set_function("IFTHENELSE", IfThenElse); |
1430 | 22.7k | context.set_function("DistanceFromLeaf", DistanceFromLeaf); |
1431 | 22.7k | context.set_function("EdgeNode", EdgeNode); |
1432 | 22.7k | context.set_function("SpeakIntentName", SpeakIntentName); |
1433 | 22.7k | context.set_function("GetBracketingIntentName", GetBracketingIntentName); |
1434 | 22.7k | context.set_function("GetNavigationPartName", GetNavigationPartName); |
1435 | 22.7k | context.set_function("DEBUG", Debug); |
1436 | ||
1437 | // Not used: remove?? | |
1438 | 22.7k | context.set_function("min", Min); // missing in xpath 1.0 |
1439 | 22.7k | context.set_function("max", Max); // missing in xpath 1.0 |
1440 | 22.7k | context.set_function("FontSizeGuess", FontSizeGuess); |
1441 | 22.7k | context.set_function("ReplaceAll", ReplaceAll); |
1442 | 22.7k | } |
1443 | ||
1444 | ||
1445 | #[cfg(test)] | |
1446 | mod tests { | |
1447 | use super::*; | |
1448 | use sxd_document::parser; | |
1449 | use crate::interface::{trim_element, get_element}; | |
1450 | ||
1451 | ||
1452 | 4 | fn init_word_list() { |
1453 | 4 | crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); |
1454 | 4 | let result = crate::definitions::read_definitions_file(true); |
1455 | 4 | if let Err( |
1456 | 0 | panic!("unable to read 'Rules/Languages/en/definitions.yaml\n{e}"); |
1457 | 4 | } |
1458 | 4 | } |
1459 | ||
1460 | #[test] | |
1461 | 1 | fn ordinal_one_digit() { |
1462 | 1 | init_word_list(); |
1463 | 1 | assert_eq!("zeroth", ToOrdinal::convert("0", false, false).unwrap()); |
1464 | 1 | assert_eq!("second", ToOrdinal::convert("2", false, false).unwrap()); |
1465 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", false, false).unwrap()); |
1466 | ||
1467 | 1 | assert_eq!("zeroth", ToOrdinal::convert("0", false, true).unwrap()); |
1468 | 1 | assert_eq!("seconds", ToOrdinal::convert("2", false, true).unwrap()); |
1469 | 1 | assert_eq!("ninths", ToOrdinal::convert("9", false, true).unwrap()); |
1470 | ||
1471 | 1 | assert_eq!("first", ToOrdinal::convert("1", true, false).unwrap()); |
1472 | 1 | assert_eq!("half", ToOrdinal::convert("2", true, false).unwrap()); |
1473 | 1 | assert_eq!("half", ToOrdinal::convert("02", true, false).unwrap()); |
1474 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); |
1475 | ||
1476 | 1 | assert_eq!("halves", ToOrdinal::convert("2", true, true).unwrap()); |
1477 | 1 | assert_eq!("halves", ToOrdinal::convert("002", true, true).unwrap()); |
1478 | 1 | assert_eq!("ninths", ToOrdinal::convert("9", true, true).unwrap()); |
1479 | 1 | } |
1480 | ||
1481 | #[test] | |
1482 | 1 | fn ordinal_two_digit() { |
1483 | 1 | init_word_list(); |
1484 | 1 | assert_eq!("tenth", ToOrdinal::convert("10", false, false).unwrap()); |
1485 | 1 | assert_eq!("seventeenth", ToOrdinal::convert("17", false, false).unwrap()); |
1486 | 1 | assert_eq!("thirty second", ToOrdinal::convert("32", false, false).unwrap()); |
1487 | 1 | assert_eq!("fortieth", ToOrdinal::convert("40", false, false).unwrap()); |
1488 | ||
1489 | 1 | assert_eq!("tenths", ToOrdinal::convert("10", false, true).unwrap()); |
1490 | 1 | assert_eq!("sixteenths", ToOrdinal::convert("16", false, true).unwrap()); |
1491 | 1 | assert_eq!("eighty eighths", ToOrdinal::convert("88", false, true).unwrap()); |
1492 | 1 | assert_eq!("fiftieths", ToOrdinal::convert("50", false, true).unwrap()); |
1493 | ||
1494 | 1 | assert_eq!("eleventh", ToOrdinal::convert("11", true, false).unwrap()); |
1495 | 1 | assert_eq!("forty fourth", ToOrdinal::convert("44", true, false).unwrap()); |
1496 | 1 | assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); |
1497 | 1 | assert_eq!("ninth", ToOrdinal::convert("00000009", true, false).unwrap()); |
1498 | 1 | assert_eq!("sixtieth", ToOrdinal::convert("60", true, false).unwrap()); |
1499 | ||
1500 | 1 | assert_eq!("tenths", ToOrdinal::convert("10", true, true).unwrap()); |
1501 | 1 | assert_eq!("tenths", ToOrdinal::convert("0010", true, true).unwrap()); |
1502 | 1 | assert_eq!("elevenths", ToOrdinal::convert("11", true, true).unwrap()); |
1503 | 1 | assert_eq!("nineteenths", ToOrdinal::convert("19", true, true).unwrap()); |
1504 | 1 | assert_eq!("twentieths", ToOrdinal::convert("20", true, true).unwrap()); |
1505 | 1 | assert_eq!("nineteenths", ToOrdinal::convert("𝟏𝟗", true, true).unwrap()); |
1506 | 1 | } |
1507 | ||
1508 | #[test] | |
1509 | 1 | fn ordinal_three_digit() { |
1510 | 1 | init_word_list(); |
1511 | 1 | assert_eq!("one hundred first", ToOrdinal::convert("101", false, false).unwrap()); |
1512 | 1 | assert_eq!("two hundred tenth", ToOrdinal::convert("210", false, false).unwrap()); |
1513 | 1 | assert_eq!("four hundred thirty second", ToOrdinal::convert("432", false, false).unwrap()); |
1514 | 1 | assert_eq!("four hundred second", ToOrdinal::convert("402", false, false).unwrap()); |
1515 | ||
1516 | 1 | assert_eq!("one hundred first", ToOrdinal::convert("101", true, false).unwrap()); |
1517 | 1 | assert_eq!("two hundred second", ToOrdinal::convert("202", true, false).unwrap()); |
1518 | 1 | assert_eq!("four hundred thirty second", ToOrdinal::convert("432", true, false).unwrap()); |
1519 | 1 | assert_eq!("five hundred third", ToOrdinal::convert("503", true, false).unwrap()); |
1520 | ||
1521 | 1 | assert_eq!("three hundred elevenths", ToOrdinal::convert("311", false, true).unwrap()); |
1522 | 1 | assert_eq!("four hundred ninety ninths", ToOrdinal::convert("499", false, true).unwrap()); |
1523 | 1 | assert_eq!("nine hundred ninetieths", ToOrdinal::convert("990", false, true).unwrap()); |
1524 | 1 | assert_eq!("six hundred seconds", ToOrdinal::convert("602", false, true).unwrap()); |
1525 | ||
1526 | 1 | assert_eq!("seven hundredths", ToOrdinal::convert("700", true, true).unwrap()); |
1527 | 1 | assert_eq!("one hundredths", ToOrdinal::convert("100", true, true).unwrap()); |
1528 | 1 | assert_eq!("eight hundred seventeenths", ToOrdinal::convert("817", true, true).unwrap()); |
1529 | 1 | } |
1530 | #[test] | |
1531 | 1 | fn ordinal_large() { |
1532 | 1 | init_word_list(); |
1533 | 1 | assert_eq!("one thousandth", ToOrdinal::convert("1000", false, false).unwrap()); |
1534 | 1 | assert_eq!("two thousand one hundredth", ToOrdinal::convert("2100", false, false).unwrap()); |
1535 | 1 | assert_eq!("thirty thousandth", ToOrdinal::convert("30000", false, false).unwrap()); |
1536 | 1 | assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", false, false).unwrap()); |
1537 | ||
1538 | 1 | assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", true, false).unwrap()); |
1539 | 1 | assert_eq!("five hundred thousand second", ToOrdinal::convert("500002", true, false).unwrap()); |
1540 | 1 | assert_eq!("six millionth", ToOrdinal::convert("6000000", true, false).unwrap()); |
1541 | 1 | assert_eq!("sixty millionth", ToOrdinal::convert("60000000", true, false).unwrap()); |
1542 | ||
1543 | 1 | assert_eq!("seven billionths", ToOrdinal::convert("7000000000", false, true).unwrap()); |
1544 | 1 | assert_eq!("eight trillionths", ToOrdinal::convert("8000000000000", false, true).unwrap()); |
1545 | 1 | assert_eq!("nine quadrillionths", ToOrdinal::convert("9000000000000000", false, true).unwrap()); |
1546 | 1 | assert_eq!("one quintillionth", ToOrdinal::convert("1000000000000000000", false, false).unwrap()); |
1547 | ||
1548 | 1 | assert_eq!("nine billion eight hundred seventy six million five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9876543210", true, true).unwrap()); |
1549 | 1 | assert_eq!("nine billion five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9000543210", true, true).unwrap()); |
1550 | 1 | assert_eq!("zeroth", ToOrdinal::convert("00000", false, false).unwrap()); |
1551 | 1 | } |
1552 | ||
1553 | ||
1554 | 11 | fn test_is_simple(message: &'static str, mathml_str: &'static str) { |
1555 | // this forces initialization | |
1556 | 11 | crate::speech::SPEECH_RULES.with(|_| true); |
1557 | 11 | let package = parser::parse(mathml_str) |
1558 | 11 | .expect("failed to parse XML"); |
1559 | 11 | let mathml = get_element(&package); |
1560 | 11 | trim_element(mathml, false); |
1561 | 11 | assert!(IsNode::is_simple(mathml), "{}", message); |
1562 | 11 | } |
1563 | ||
1564 | 7 | fn test_is_not_simple(message: &'static str, mathml_str: &'static str) { |
1565 | // this forces initialization | |
1566 | 7 | crate::speech::SPEECH_RULES.with(|_| true); |
1567 | 7 | let package = parser::parse(mathml_str) |
1568 | 7 | .expect("failed to parse XML"); |
1569 | 7 | let mathml = get_element(&package); |
1570 | 7 | trim_element(mathml, false); |
1571 | 7 | assert!(!IsNode::is_simple(mathml), "{}", message); |
1572 | 7 | } |
1573 | #[test] | |
1574 | 1 | fn is_simple() { |
1575 | 1 | test_is_simple("single variable", "<mi>x</mi>"); |
1576 | 1 | test_is_simple("single number", "<mn>1.2</mn>"); |
1577 | 1 | test_is_simple("negative number", "<mrow><mo>-</mo><mn>10</mn></mrow>"); |
1578 | 1 | test_is_simple("negative variable", "<mrow><mo>-</mo><mi>x</mi></mrow>"); |
1579 | 1 | test_is_simple("ordinal fraction", "<mfrac><mn>3</mn><mn>4</mn></mfrac>"); |
1580 | 1 | test_is_simple("x y", "<mrow><mi>x</mi><mo>⁢</mo><mi>y</mi></mrow>"); |
1581 | 1 | test_is_simple("negative two vars", |
1582 | "<mrow><mrow><mo>-</mo><mi>x</mi></mrow><mo>⁢</mo><mi>y</mi></mrow>"); | |
1583 | 1 | test_is_simple("-2 x y", |
1584 | "<mrow><mrow><mo>-</mo><mn>2</mn></mrow> | |
1585 | <mo>⁢</mo><mi>x</mi><mo>⁢</mo><mi>z</mi></mrow>"); | |
1586 | 1 | test_is_simple("sin x", "<mrow><mi>sin</mi><mo>⁡</mo><mi>x</mi></mrow>"); |
1587 | 1 | test_is_simple("f(x)", "<mrow><mi>f</mi><mo>⁡</mo><mrow><mo>(</mo><mi>x</mi><mo>)</mo></mrow></mrow>"); |
1588 | 1 | test_is_simple("f(x+y)", |
1589 | "<mrow><mi>f</mi><mo>⁡</mo>\ | |
1590 | <mrow><mo>(</mo><mi>x</mi><mo>+</mo><mi>y</mi><mo>)</mo></mrow></mrow>"); | |
1591 | ||
1592 | 1 | } |
1593 | ||
1594 | #[test] | |
1595 | 1 | fn is_not_simple() { |
1596 | 1 | test_is_not_simple("multi-char variable", "<mi>rise</mi>"); |
1597 | 1 | test_is_not_simple("large ordinal fraction", "<mfrac><mn>30</mn><mn>4</mn></mfrac>"); |
1598 | 1 | test_is_not_simple("fraction with var in numerator", "<mfrac><mi>x</mi><mn>4</mn></mfrac>"); |
1599 | 1 | test_is_not_simple("square root", "<msqrt><mi>x</mi></msqrt>"); |
1600 | 1 | test_is_not_simple("subscript", "<msub><mi>x</mi><mn>4</mn></msub>"); |
1601 | 1 | test_is_not_simple("-x y z", |
1602 | "<mrow><mrow><mo>-</mo><mi>x</mi></mrow> | |
1603 | <mo>⁢</mo><mi>y</mi><mo>⁢</mo><mi>z</mi></mrow>"); | |
1604 | 1 | test_is_not_simple("C(-2,1,4)", // github.com/NSoiffer/MathCAT/issues/199 |
1605 | "<mrow><mi>C</mi><mrow><mo>(</mo><mo>−</mo><mn>2</mn><mo>,</mo><mn>1</mn><mo>,</mo><mn>4</mn><mo>)</mo></mrow></mrow>"); | |
1606 | ||
1607 | 1 | } |
1608 | ||
1609 | #[test] | |
1610 | 1 | fn at_left_edge() { |
1611 | 1 | let mathml = "<math><mfrac><mrow><mn>30</mn><mi>x</mi></mrow><mn>4</mn></mfrac></math>"; |
1612 | 1 | let package = parser::parse(mathml).expect("failed to parse XML"); |
1613 | 1 | let mathml = get_element(&package); |
1614 | 1 | trim_element(mathml, false); |
1615 | 1 | let fraction = as_element(mathml.children()[0]); |
1616 | 1 | let mn = as_element(as_element(fraction.children()[0]).children()[0]); |
1617 | 1 | assert_eq!(EdgeNode::edge_node(mn, true, "2D"), Some(fraction)); |
1618 | 1 | assert_eq!(EdgeNode::edge_node(mn, false, "2D"), None); |
1619 | ||
1620 | 1 | let mi = as_element(as_element(fraction.children()[0]).children()[1]); |
1621 | 1 | assert_eq!(EdgeNode::edge_node(mi, true, "2D"), None); |
1622 | 1 | } |
1623 | ||
1624 | #[test] | |
1625 | 1 | fn at_right_edge() { |
1626 | 1 | let mathml = "<math><mrow><mfrac><mn>4</mn><mrow><mn>30</mn><mi>x</mi></mrow></mfrac><mo>.</mo></mrow></math>"; |
1627 | 1 | let package = parser::parse(mathml).expect("failed to parse XML"); |
1628 | 1 | let mathml = get_element(&package); |
1629 | 1 | trim_element(mathml, false); |
1630 | 1 | let fraction = as_element(as_element(mathml.children()[0]).children()[0]); |
1631 | 1 | let mi = as_element(as_element(fraction.children()[1]).children()[1]); |
1632 | 1 | assert_eq!(EdgeNode::edge_node(mi, true, "2D"), None); |
1633 | 1 | assert_eq!(EdgeNode::edge_node(mi, false, "2D"), Some(fraction)); |
1634 | 1 | assert_eq!(EdgeNode::edge_node(mi, false, "math"), Some(mathml)); |
1635 | ||
1636 | 1 | let mn = as_element(as_element(fraction.children()[1]).children()[0]); |
1637 | 1 | assert_eq!(EdgeNode::edge_node(mn, true, "2D"), None); |
1638 | 1 | } |
1639 | } |
Click here for information about interpreting this report.
| Filename | Function Coverage | Line Coverage | Region Coverage | Branch Coverage |
bin/mathml2text.rs | 0.00% (0/3) | 0.00% (0/60) | 0.00% (0/129) | - (0/0) |
braille.rs | 89.63% (147/164) | 84.01% (1608/1914) | 81.11% (2955/3643) | - (0/0) |
canonicalize.rs | 97.40% (300/308) | 94.43% (4747/5027) | 93.14% (6712/7206) | - (0/0) |
chemistry.rs | 98.68% (149/151) | 94.93% (2453/2584) | 90.78% (2933/3231) | - (0/0) |
definitions.rs | 61.90% (26/42) | 86.97% (207/238) | 82.48% (433/525) | - (0/0) |
infer_intent.rs | 91.94% (57/62) | 87.67% (654/746) | 79.38% (916/1154) | - (0/0) |
interface.rs | 76.03% (92/121) | 73.62% (681/925) | 72.60% (1224/1686) | - (0/0) |
lib.rs | 85.71% (6/7) | 78.00% (39/50) | 85.06% (74/87) | - (0/0) |
main.rs | 0.00% (0/3) | 0.00% (0/108) | 0.00% (0/209) | - (0/0) |
navigate.rs | 93.96% (140/149) | 87.98% (1881/2138) | 86.18% (2826/3279) | - (0/0) |
prefs.rs | 83.33% (65/78) | 79.12% (610/771) | 79.49% (1601/2014) | - (0/0) |
pretty_print.rs | 89.74% (35/39) | 70.12% (298/425) | 71.20% (576/809) | - (0/0) |
shim_filesystem.rs | 100.00% (7/7) | 81.69% (58/71) | 75.86% (110/145) | - (0/0) |
speech.rs | 67.33% (136/202) | 76.21% (1371/1799) | 74.80% (2662/3559) | - (0/0) |
tts.rs | 73.17% (30/41) | 68.86% (272/395) | 61.54% (480/780) | - (0/0) |
xpath_functions.rs | 95.18% (79/83) | 87.34% (911/1043) | 85.58% (1845/2156) | - (0/0) |
Totals | 86.92% (1269/1460) | 86.31% (15790/18294) | 82.80% (25347/30612) | - (0/0) |
Click here for information about interpreting this report.
| Filename | Function Coverage | Line Coverage | Region Coverage | Branch Coverage |
bin/mathml2text.rs | 0.00% (0/3) | 0.00% (0/60) | 0.00% (0/129) | - (0/0) |
braille.rs | 89.63% (147/164) | 84.01% (1608/1914) | 81.11% (2955/3643) | - (0/0) |
canonicalize.rs | 97.40% (300/308) | 94.43% (4747/5027) | 93.14% (6712/7206) | - (0/0) |
chemistry.rs | 98.68% (149/151) | 94.93% (2453/2584) | 90.78% (2933/3231) | - (0/0) |
definitions.rs | 61.90% (26/42) | 86.97% (207/238) | 82.48% (433/525) | - (0/0) |
infer_intent.rs | 91.94% (57/62) | 87.67% (654/746) | 79.38% (916/1154) | - (0/0) |
interface.rs | 76.03% (92/121) | 73.62% (681/925) | 72.60% (1224/1686) | - (0/0) |
lib.rs | 85.71% (6/7) | 78.00% (39/50) | 85.06% (74/87) | - (0/0) |
main.rs | 0.00% (0/3) | 0.00% (0/108) | 0.00% (0/209) | - (0/0) |
navigate.rs | 93.96% (140/149) | 87.98% (1881/2138) | 86.18% (2826/3279) | - (0/0) |
prefs.rs | 83.33% (65/78) | 79.25% (615/776) | 79.66% (1606/2016) | - (0/0) |
pretty_print.rs | 89.74% (35/39) | 70.12% (298/425) | 71.20% (576/809) | - (0/0) |
shim_filesystem.rs | 100.00% (7/7) | 78.67% (59/75) | 74.50% (111/149) | - (0/0) |
speech.rs | 67.33% (136/202) | 76.21% (1371/1799) | 74.80% (2662/3559) | - (0/0) |
tts.rs | 73.17% (30/41) | 68.86% (272/395) | 61.54% (480/780) | - (0/0) |
xpath_functions.rs | 95.18% (79/83) | 87.34% (911/1043) | 85.58% (1845/2156) | - (0/0) |
Totals | 86.92% (1269/1460) | 86.30% (15796/18303) | 82.80% (25353/30618) | - (0/0) |