-
Notifications
You must be signed in to change notification settings - Fork 170
ar money bugfix and sh tests #438
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,14 +80,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
| pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"") | ||
| ) | ||
|
|
||
| graph_integer_only = graph_maj_singular + insert_space + graph_integer_one | ||
| graph_integer_only |= graph_maj_plural + insert_space + graph_integer | ||
| currency_first = pynutil.insert(' morphosyntactic_features: "currency_first"') | ||
| # Currency-first tagging for exactly one major unit (e.g. $1 -> دولار واحد). | ||
| graph_integer_one_unit = graph_maj_singular + insert_space + graph_integer_one + currency_first | ||
|
|
||
| # For local currency "9د.ك" | ||
| # For local currency "5د.ك" | ||
| graph_integer_only_ar = graph_integer + insert_space + graph_ar_cur | ||
| # graph_decimal_ar = graph_decimal_final + insert_space + graph_ar_cur | ||
|
|
||
| graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_integer_only_ar | ||
| graph = (graph_integer_one_unit + optional_delete_fractional_zeros) | graph_integer_only_ar | ||
|
|
||
| # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits | ||
| # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10 | ||
|
|
@@ -112,9 +112,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
|
|
||
| preserve_order = pynutil.insert(" preserve_order: true") | ||
| integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural | ||
| integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular | ||
| # non zero integer part | ||
| integer_plus_maj = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj | ||
| integer_plus_maj_with_one = integer_plus_maj | ( | ||
| graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular | ||
| ) | ||
| # Amount == 1 without fractional part uses graph_integer_one_unit / graph_one_prefix. | ||
| integer_plus_maj_no_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj | ||
| integer_plus_maj_with_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj_with_one | ||
|
|
||
| graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "") | ||
| graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"") | ||
|
|
@@ -141,11 +144,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
| graph_fractional_up_to_ten + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural | ||
| ) | ||
|
|
||
| graph_with_no_minor_curr = integer_plus_maj | ||
| graph_with_no_minor_curr |= pynutil.add_weight( | ||
| integer_plus_maj, | ||
| weight=0.0001, | ||
| ) | ||
| graph_with_no_minor_curr = integer_plus_maj_no_minor | ||
|
|
||
| graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about |
||
|
|
||
|
|
@@ -154,9 +153,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
| if graph_with_no_minor is None | ||
| else pynini.union(graph_with_no_minor, graph_with_no_minor_curr) | ||
| ) | ||
| decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min | ||
| decimal_graph_with_minor_curr = integer_plus_maj_with_minor + pynini.cross(".", " ") + fractional_plus_min | ||
| decimal_graph_with_minor_curr |= pynutil.add_weight( | ||
| integer_plus_maj | ||
| integer_plus_maj_with_minor | ||
| + pynini.cross(".", " ") | ||
| + pynutil.insert("fractional_part: \"") | ||
| + two_digits_fractional_part @ cardinal_graph | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ class MoneyFst(GraphFst): | |
| Finite state transducer for verbalizing money, e.g. | ||
| money { integer_part: "تسعة" currency_maj: "يورو" preserve_order: true} -> "تسعة يورو" | ||
| money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} -> "تسعة دولار" | ||
| money { currency_maj: "دولار" integer_part: "واحد" morphosyntactic_features: "currency_first"} -> "دولار واحد" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you modify the unit tests based on the new logic? |
||
| money { integer_part: "خمسة" currency_maj: "دينار كويتي"} -> "خمسة دينار كويتي" | ||
|
|
||
| Args: | ||
|
|
@@ -49,9 +50,10 @@ def __init__(self, deterministic: bool = True): | |
|
|
||
| integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") | ||
| add_and = pynutil.insert(" و") | ||
| morph_currency_first = pynutil.delete(' morphosyntactic_features: "currency_first"') | ||
|
|
||
| # *** currency_maj | ||
| graph_integer = maj + keep_space + integer_part | ||
| # currency_maj before integer_part; disambiguated via morphosyntactic_features for Sparrowhawk. | ||
| graph_currency_first = maj + keep_space + integer_part + delete_space + morph_currency_first | ||
|
|
||
| # *** currency_maj + (***) (و) *** current_min | ||
| graph_integer_with_minor = ( | ||
|
|
@@ -65,12 +67,10 @@ def __init__(self, deterministic: bool = True): | |
| + pynini.closure(keep_space + min, 0, 1) | ||
| + delete_preserve_order | ||
| ) | ||
| # this graph fix word order from dollar three (دولار تسعة)--> three dollar (تسعة دولار) | ||
| graph_integer_no_minor = integer_part + keep_space + maj + delete_space + delete_preserve_order | ||
| # *** current_min | ||
| graph_minor = fractional_part + keep_space + delete_space + min + delete_preserve_order | ||
|
|
||
| graph = graph_integer | graph_integer_with_minor | graph_minor | graph_integer_no_minor | ||
| graph = graph_currency_first | graph_integer_with_minor | graph_minor | graph_integer_no_minor | ||
|
|
||
| delete_tokens = self.delete_tokens(graph) | ||
| self.fst = delete_tokens.optimize() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did ar not have sh tests in the old version? |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| #! /bin/sh | ||
| GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} | ||
| TEST_DIR=${2:-"/workspace/tests/ar"} | ||
|
|
||
| runtest () { | ||
| input=$1 | ||
| echo "INPUT is $input" | ||
| cd ${GRAMMARS_DIR} | ||
|
|
||
| while IFS= read -r testcase; do | ||
| IFS='~' read -r written spoken <<< "$testcase" | ||
|
|
||
| escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') | ||
| denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') | ||
|
|
||
| spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" | ||
| denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" | ||
|
|
||
| assertEquals "$written" "$spoken" "$denorm_pred" | ||
| done < "$input" | ||
| } | ||
|
|
||
| # For test files stored as expected~input (spoken~written). | ||
| runtest_swapped () { | ||
| input=$1 | ||
| echo "INPUT is $input" | ||
| cd ${GRAMMARS_DIR} | ||
|
|
||
| while IFS= read -r testcase; do | ||
| IFS='~' read -r spoken written <<< "$testcase" | ||
|
|
||
| escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') | ||
| denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') | ||
|
|
||
| spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" | ||
| denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" | ||
|
|
||
| assertEquals "$written" "$spoken" "$denorm_pred" | ||
| done < "$input" | ||
| } | ||
|
|
||
| testTNCardinal() { | ||
| input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt | ||
| runtest $input | ||
| } | ||
|
|
||
| testTNDecimal() { | ||
| input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt | ||
| runtest $input | ||
| } | ||
|
|
||
| testTNFraction() { | ||
| input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt | ||
| runtest_swapped $input | ||
| } | ||
|
|
||
| testTNMeasure() { | ||
| input=$TEST_DIR/data_text_normalization/test_cases_measure.txt | ||
| runtest_swapped $input | ||
| } | ||
|
|
||
| testTNMoney() { | ||
| input=$TEST_DIR/data_text_normalization/test_cases_money.txt | ||
| runtest $input | ||
| } | ||
|
|
||
| # Remove all command-line arguments | ||
| shift $# | ||
|
|
||
| # Load shUnit2 | ||
| . /workspace/shunit2/shunit2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any special reason we need to change this comment line? If yes, we should explain it; if not, just revert to the old version.