From 56f129cf2e2300ad5b3d4b073aebfc759711003a Mon Sep 17 00:00:00 2001 From: Mayuri S Date: Fri, 5 Jun 2026 17:11:53 +0530 Subject: [PATCH 1/5] feat(hi): add Hindi ITN electronic class Signed-off-by: Mayuri S --- NEW_LAPTOP_SETUP.md | 134 ++++ .../hi/data/address/__init__.py | 13 + .../hi/data/address/cities.tsv | 36 ++ .../hi/data/address/context_cues.tsv | 84 +++ .../hi/data/address/digit_passthrough.tsv | 10 + .../hi/data/address/ordinals.tsv | 167 +++++ .../hi/data/address/special_characters.tsv | 2 + .../hi/data/address/states.tsv | 36 ++ .../hi/data/date/century.tsv | 2 + .../hi/data/electronic/__init__.py | 0 .../hi/data/electronic/chemical_formulas.tsv | 21 + .../hi/data/electronic/common_words.tsv | 169 +++++ .../hi/data/electronic/digit_glyphs.tsv | 10 + .../hi/data/electronic/digit_words.tsv | 10 + .../hi/data/electronic/domain.tsv | 89 +++ .../hi/data/electronic/letters.tsv | 54 ++ .../hi/data/electronic/server_name.tsv | 153 +++++ .../hi/data/electronic/special_codes.tsv | 24 + .../hi/data/measure/measurements.tsv | 10 +- .../hi/data/money/currency.tsv | 3 +- .../hi/data/numbers/paune.tsv | 231 +++++++ .../hi/data/numbers/teens_and_ties.tsv | 3 +- .../hi/data/percentage/__init__.py | 0 .../hi/data/percentage/percent_symbol.tsv | 4 + .../hi/data/telephone/context_cues.tsv | 15 + .../hi/data/telephone/country_codes.tsv | 1 + .../hi/data/telephone/eng_digit.tsv | 9 + .../hi/data/telephone/eng_zero.tsv | 1 + .../hi/data/time/hour_for_paune.tsv | 15 + .../hi/data/whitelist/whitelist.tsv | 8 +- .../hi/graph_utils.py | 5 + .../hi/taggers/cardinal.py | 93 ++- .../hi/taggers/date.py | 24 +- .../hi/taggers/electronic.py | 581 ++++++++++++++++++ .../hi/taggers/fraction.py | 100 ++- .../hi/taggers/measure.py | 181 +++++- .../hi/taggers/money.py | 190 +++++- .../hi/taggers/ordinal.py | 3 +- .../hi/taggers/percentage.py | 54 ++ .../hi/taggers/telephone.py | 230 +++---- .../hi/taggers/time.py | 90 ++- .../hi/taggers/tokenize_and_classify.py | 8 +- 
.../hi/verbalizers/date.py | 23 +- .../hi/verbalizers/electronic.py | 48 ++ .../hi/verbalizers/fraction.py | 1 + .../hi/verbalizers/measure.py | 29 +- .../hi/verbalizers/ordinal.py | 3 +- .../hi/verbalizers/percentage.py | 53 ++ .../hi/verbalizers/telephone.py | 1 - .../hi/verbalizers/verbalize.py | 8 +- .../hi/verbalizers/verbalize_final.py | 11 +- .../text_normalization/utils_audio_based.py | 5 +- .../test_cases_electronic.txt | 113 ++++ .../test_cases_percentage.txt | 12 + .../hi/test_electronic.py | 43 ++ .../hi/test_percentage.py | 29 + ..._sparrowhawk_inverse_text_normalization.sh | 22 +- tests/nemo_text_processing/utils.py | 2 +- tools/text_processing_deployment/Dockerfile | 17 +- tools/text_processing_deployment/diag_base.sh | 27 + tools/text_processing_deployment/diag_fst.sh | 21 + v2_helper_test1.py | 250 ++++++++ 62 files changed, 3421 insertions(+), 170 deletions(-) create mode 100644 NEW_LAPTOP_SETUP.md create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/cities.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/context_cues.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/digit_passthrough.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/ordinals.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/special_characters.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/address/states.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/chemical_formulas.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/common_words.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_glyphs.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_words.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/domain.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/letters.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/server_name.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/electronic/special_codes.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/electronic.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/electronic.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt create mode 100644 
tests/nemo_text_processing/hi/test_electronic.py create mode 100644 tests/nemo_text_processing/hi/test_percentage.py create mode 100644 tools/text_processing_deployment/diag_base.sh create mode 100644 tools/text_processing_deployment/diag_fst.sh create mode 100644 v2_helper_test1.py diff --git a/NEW_LAPTOP_SETUP.md b/NEW_LAPTOP_SETUP.md new file mode 100644 index 000000000..f66e32a61 --- /dev/null +++ b/NEW_LAPTOP_SETUP.md @@ -0,0 +1,134 @@ +# New Laptop Setup — Hindi ITN Electronic Grammar + Sparrowhawk Tests + +Complete from-scratch guide to get this project running on a fresh Windows laptop, +including Docker (via Docker Desktop) and the Sparrowhawk ITN test pipeline. + +> All the "WSL" commands run **inside the Ubuntu terminal**, not Windows PowerShell. +> Steps marked "(Windows)" run in **Windows PowerShell**. + +--- + +## 1. Install WSL2 + Ubuntu (Windows PowerShell, as Administrator) + +```powershell +wsl --install +wsl --set-default-version 2 +``` + +Reboot when prompted. On first launch, Ubuntu asks you to create a username + password — set those (remember the password; it's your `sudo` password). + +Verify: +```powershell +wsl --status # should show Default Version: 2 +``` + +--- + +## 2. Install Docker Desktop (Windows) + +We use **Docker Desktop** (not the native WSL engine) because it handles WSL +networking/DNS/MTU automatically — avoids the TLS-timeout / DNS issues. + +1. Download from https://www.docker.com/products/docker-desktop/ → **Windows AMD64** +2. Run the installer → keep **"Use WSL 2 instead of Hyper-V"** CHECKED +3. Launch Docker Desktop (whale icon in the system tray) +4. **Settings → General** → confirm **"Use the WSL 2 based engine"** is checked +5. **Settings → Resources → WSL Integration** → toggle **ON** for your **Ubuntu** distro +6. Click **Apply & Restart** + +Verify (in a FRESH Ubuntu terminal): +```bash +docker context ls # "desktop-linux" should be current (*) +docker run hello-world # should pull + print "Hello from Docker!" 
+``` + +> Note: with Docker Desktop you do NOT run `sudo service docker start`. +> The daemon runs on the Windows side — just keep Docker Desktop open. + +--- + +## 3. Get the code (WSL Ubuntu) + +```bash +sudo apt update && sudo apt install -y git +git clone https://github.com/mayuris-00/NeMo-text-processing.git +cd NeMo-text-processing + +# Your latest WIP work is on this branch: +git checkout hi-itn-electronic-backup-2026-06-04 +``` + +Branches: +- `hi-itn-electronic-nvidia-base` — main working branch (NVIDIA-original base) +- `hi-itn-electronic-backup-2026-06-04` — WIP backup (electronic grammar work) + +--- + +## 4. Set up Python + pynini (WSL) + +`pynini` only builds on Linux — that's why grammar export happens in WSL. + +```bash +sudo apt install -y python3 python3-pip +pip3 install pynini==2.1.5 +pip3 install nemo_text_processing +``` + +(If you prefer an isolated env: `python3 -m venv ~/ntp-venv && source ~/ntp-venv/bin/activate` +before the `pip3 install` lines.) + +--- + +## 5. Run the Sparrowhawk ITN test (WSL) + +```bash +cd ~/NeMo-text-processing/tools/text_processing_deployment + +bash export_grammars.sh \ + --GRAMMARS=itn_grammars \ + --LANGUAGE=hi \ + --INPUT_CASE=lower_cased \ + --MODE=test +``` + +What this does (chain): +1. `pynini_export.py` compiles the Hindi ITN grammars → `.far` files +2. `docker/build.sh` builds the `sparrowhawk` image (~30 min the FIRST time; + it compiles protobuf/re2/sparrowhawk from source) +3. `docker/launch.sh` runs the container → executes + `test_sparrowhawk_inverse_text_normalization.sh` → runs all the ITN tests, + including `testITNElectronic` (your electronic test cases) + +### Just the electronic test cases +The electronic cases live in: +`tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt` + +The `testITNElectronic` function in +`tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh` +runs them. 
+ +--- + +## Troubleshooting + +- **`docker run hello-world` TLS timeout** → Docker Desktop not started, or WSL + integration off. Open Docker Desktop, re-check Settings → Resources → WSL Integration. +- **Sparrowhawk build fails on a `git clone` / download** → network blip; just + re-run the `export_grammars.sh` command (Docker caches completed layers). +- **`pynini` install fails** → make sure you're in WSL/Ubuntu, not Windows Python. +- **Rebuild the image from scratch** → add `--FORCE_REBUILD=True` to the export command. +- **Re-use existing `.far` files (skip recompile)** → pass `--OVERWRITE_CACHE=False` so the script picks up the existing files; + the default (`--OVERWRITE_CACHE=True`) recompiles and overwrites them. + +--- + +## Quick reference — daily workflow + +```bash +# 1. Make sure Docker Desktop is running (Windows) +# 2. In WSL: +cd ~/NeMo-text-processing +git checkout hi-itn-electronic-backup-2026-06-04 +cd tools/text_processing_deployment +bash export_grammars.sh --GRAMMARS=itn_grammars --LANGUAGE=hi --INPUT_CASE=lower_cased --MODE=test +``` diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/address/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/cities.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/cities.tsv new file mode 100644 index 000000000..b49330652 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/cities.tsv @@ -0,0 +1,36 @@ +अमरावती +ईटानगर +दिसपुर +पटना +रायपुर +पणजी +गांधीनगर +चंडीगढ़ +शिमला +रांची +बेंगलुरु +तिरुवनंतपुरम +भोपाल +मुंबई +इम्फाल +शिलांग +आइजोल +कोहिमा +भुवनेश्वर +जयपुर +गंगटोक +चेन्नई +हैदराबाद +अगरतला +लखनऊ +देहरादून +कोलकाता +पोर्ट ब्लेयर +दमन +नई दिल्ली +श्रीनगर +जम्मू +लेह +कारगिल +कवरत्ती +पुडुचेरी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/context_cues.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/context_cues.tsv new file mode 100644 index 000000000..bdb9228a4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/context_cues.tsv @@ -0,0 +1,84 @@ +हाउस +प्लॉट +बूथ +अपार्टमेंट +फ्लैट +यूनिट +टावर +कॉम्प्लेक्स +मंजिल +फ्लोर +ब्लॉक +सेक्टर +फेज +रोड +सड़क +मार्ग +स्ट्रीट +गली +राजमार्ग +ड्राइव +डिस्ट्रिक्ट +बाईपास +हाइवे +पार्कवे +कॉलोनी +नगर +पार्क +एस्टेट +बोलवार्ड +मार्केट +सेंटर +पिन +गांव +पास +ब्रिगेड +नियर +स्क्वेर +मॉल +टॉवर +इंस्टीट्यूट +पिलर +मेट्रो +एवेन्यू +वेस्ट +सामने +पीछे +वीया +टाउन +लेन +चौक +चौराहा +बस्ती +मोहल्ला +विहार +एन्क्लेव +पुरम +शहर +जिला +ईस्ट +नॉर्थ +साउथ +प्लाज़ा +ब्रिज +स्टेशन +अन्तर्गत +शॉप +क्रॉस +विलेज +हिल +असम +गोवा +महाराष्ट्र +दिल्ली +बैंगलोर +चेन्नई +मुंबई +कोलकाता +हैदराबाद +अहमदाबाद +पुणे +जयपुर +चंडीगढ़ +लखनऊ +इलाहाबाद diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/digit_passthrough.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/digit_passthrough.tsv new file mode 100644 index 000000000..42633c28b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/digit_passthrough.tsv @@ -0,0 +1,10 @@ +۰ ۰ +۱ ۱ +۲ ۲ +۳ ۳ 
+۴ ۴ +۵ ۵ +۶ ۶ +۷ ۷ +۸ ۸ +۹ ۹ diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/ordinals.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/ordinals.tsv new file mode 100644 index 000000000..1460d2b3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/ordinals.tsv @@ -0,0 +1,167 @@ +फ़र्स्ट १st +सेकंड २nd +थर्ड ३rd +फ़ोर्थ ४th +फ़िफ्थ ५th +सिक्स्थ ६th +सेवंथ ७th +एटथ ८th +नाइंथ ९th +टेंथ १०th +इलेवंथ ११th +ट्वेल्फ्थ १२th +थर्टींथ १३th +फोर्टींथ १४th +फिफ्टींथ १५th +पाँचवां ५वां +पाँचवीं ५वीं +छठवां ६ठवां +छठवीं ६ठवीं +सातवां ७वां +सातवीं ७वीं +आठवां ८वां +आठवीं ८वीं +नौवां ९वां +नौवीं ९वीं +दसवां १०वां +दसवीं १०वीं +ग्यारहवां ११वां +ग्यारहवीं ११वीं +बारहवां १२वां +बारहवीं १२वीं +तेरहवां १३वां +तेरहवीं १३वीं +चौदहवां १४वां +चौदहवीं १४वीं +पंद्रहवां १५वां +पंद्रहवीं १५वीं +सोलहवां १६वां +सोलहवीं १६वीं +सत्रहवां १७वां +सत्रहवीं १७वीं +अठारहवां १८वां +अठारहवीं १८वीं +उन्नीसवां १९वां +उन्नीसवीं १९वीं +बीसवां २०वां +बीसवीं २०वीं +इक्कीसवां २१वां +इक्कीसवीं २१वीं +बाईसवां २२वां +बाईसवीं २२वीं +तेईसवां २३वां +तेईसवीं २३वीं +चौबीसवां २४वां +चौबीसवीं २४वीं +पच्चीसवां २५वां +पच्चीसवीं २५वीं +छब्बीसवां २६वां +छब्बीसवीं २६वीं +सत्ताईसवां २७वां +सत्ताईसवीं २७वीं +अट्ठाईसवां २८वां +अट्ठाईसवीं २८वीं +उनतीसवां २९वां +उनतीसवीं २९वीं +तीसवां ३०वां +तीसवीं ३०वीं +इकतीसवां ३१वां +इकतीसवीं ३१वीं +बत्तीसवां ३२वां +बत्तीसवीं ३२वीं +तैंतीसवां ३३वां +तैंतीसवीं ३३वीं +चौंतीसवां ३४वां +चौंतीसवीं ३४वीं +पैंतीसवां ३५वां +पैंतीसवीं ३५वीं +छत्तीसवां ३६वां +छत्तीसवीं ३६वीं +सैंतीसवां ३७वां +सैंतीसवीं ३७वीं +अड़तीसवां ३८वां +अड़तीसवीं ३८वीं +उनतालीसवां ३९वां +उनतालीसवीं ३९वीं +चालीसवां ४०वां +चालीसवीं ४०वीं +इकतालीसवां ४१वां +इकतालीसवीं ४१वीं +बयालीसवां ४२वां +बयालीसवीं ४२वीं +तैंतालीसवां ४३वां +तैंतालीसवीं ४३वीं +चवालीसवां ४४वां +चवालीसवीं ४४वीं +पैंतालीसवां ४५वां +पैंतालीसवीं ४५वीं +छियालीसवां ४६वां +छियालीसवीं ४६वीं +सैंतालीसवां ४७वां +सैंतालीसवीं ४७वीं +अड़तालीसवां ४८वां +अड़तालीसवीं ४८वीं +उनचासवां ४९वां +उनचासवीं 
४९वीं +पचासवां ५०वां +पचासवीं ५०वीं +इक्यावनवां ५१वां +इक्यावनवीं ५१वीं +बावनवां ५२वां +बावनवीं ५२वीं +तिरपनवां ५३वां +तिरपनवीं ५३वीं +चौवनवां ५४वां +चौवनवीं ५४वीं +पचपनवां ५५वां +पचपनवीं ५५वीं +छप्पनवां ५६वां +छप्पनवीं ५६वीं +सत्तावनवां ५७वां +सत्तावनवीं ५७वीं +अट्ठावनवां ५८वां +अट्ठावनवीं ५८वीं +उनसठवां ५९वां +उनसठवीं ५९वीं +साठवां ६०वां +साठवीं ६०वीं +इकसठवां ६१वां +इकसठवीं ६१वीं +बासठवां ६२वां +बासठवीं ६२वीं +तिरसठवां ६३वां +तिरसठवीं ६३वीं +चौंसठवां ६४वां +चौंसठवीं ६४वीं +पैंसठवां ६५वां +पैंसठवीं ६५वीं +छियासठवां ६६वां +छियासठवीं ६६वीं +सड़सठवां ६७वां +सड़सठवीं ६७वीं +अड़सठवां ६८वां +अड़सठवीं ६८वीं +उनहत्तरवां ६९वां +उनहत्तरवीं ६९वीं +सत्तरवां ७०वां +सत्तरवीं ७०वीं +इकहत्तरवां ७१वां +इकहत्तरवीं ७१वीं +बहत्तरवां ७२वां +बहत्तरवीं ७२वीं +तिहत्तरवां ७३वां +तिहत्तरवीं ७३वीं +चौहत्तरवां ७४वां +चौहत्तरवीं ७४वीं +पचहत्तरवां ७५वां +पचहत्तरवीं ७५वीं +छिहत्तरवां ७६वां +छिहत्तरवीं ७६वीं +सतहत्तरवां ७७वां +सतहत्तरवीं ७७वीं +अठहत्तरवां ७८वां +अठहत्तरवीं ७८वीं +उनासीवां ७९वां +उनासीवीं ७९वीं +अस्सीवां ८०वां +अस्सीवीं ८०वीं diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/special_characters.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/special_characters.tsv new file mode 100644 index 000000000..bcef2cfdd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/special_characters.tsv @@ -0,0 +1,2 @@ +हाइफ़न - +बटा / \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/address/states.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/address/states.tsv new file mode 100644 index 000000000..531e70da4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/address/states.tsv @@ -0,0 +1,36 @@ +आंध्र प्रदेश +अरुणाचल प्रदेश +असम +बिहार +छत्तीसगढ़ +गोवा +गुजरात +हरियाणा +हिमाचल प्रदेश +झारखंड +कर्नाटक +केरल +मध्य प्रदेश +महाराष्ट्र +मणिपुर +मेघालय +मिज़ोरम +नागालैंड +ओडिशा +पंजाब +राजस्थान +सिक्किम +तमिलनाडु +तेलंगाना +त्रिपुरा 
+उत्तर प्रदेश +उत्तराखंड +पश्चिम बंगाल +अंडमान और निकोबार द्वीप समूह +चंडीगढ़ +दादरा और नगर हवेली और दमन और दीव +दिल्ली +जम्मू और कश्मीर +लद्दाख +लक्षद्वीप +पुडुचेरी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv index da69e23eb..9369023e0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -1,3 +1,5 @@ ई.पू. ईसा पूर्व ई. ईस्वी ई. ईसवी +वर्ष पूर्व वर्ष पूर्व +शताब्दी शताब्दी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/chemical_formulas.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/chemical_formulas.tsv new file mode 100644 index 000000000..eb02c6276 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/chemical_formulas.tsv @@ -0,0 +1,21 @@ +ग्लूकोज C6H12O6 +जिंक सलफेट ZnSO4 +बेरियम हाइड्रॉक्साइड Ba(OH)2 +मैग्नीशियम ब्रोमाइड MgBr2 +हेक्सेन C6H14 +नाइट्रोजन N2 +ओजोन O3 +पानी H2O +अमोनिया NH3 +नमक NaCl +मेथेन CH4 +इथेनॉल C2H5OH +कार्बन डाइऑक्साइड CO2 +हाइड्रोक्लोरिक एसिड HCl +कैल्शियम कार्बोनेट CaCO3 +जंग Fe2O3 +बेंजीन C6H6 +सल्फ्यूरिक एसिड H2SO4 +नाइट्रिक एसिड HNO3 +एसिटिक एसिड CH3COOH +फॉर्मिक एसिड HCOOH diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/common_words.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/common_words.tsv new file mode 100644 index 000000000..4c2fb94f5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/common_words.tsv @@ -0,0 +1,169 @@ +Users यूज़र्स +User 
यूज़र +Desktop डेस्कटॉप +Downloads डाउनलोड्स +Documents डॉक्युमेंट्स +Music म्यूज़िक +Pictures पिक्चर्स +Videos वीडियोज़ +Audio ऑडियो +file फ़ाइल +chapter चैप्टर +python पाइथन +work वर्क +school स्कूल +images इमेजेज़ +about अबाउट +blog ब्लॉग +index इंडेक्स +login लॉगिन +register रजिस्टर +search सर्च +tags टैग्स +category केटेगरी +categories केटेगरीज़ +post पोस्ट +posts पोस्ट्स +page पेज +pages पेजेस +admin एडमिन +app ऐप +help हेल्प +terms टर्म्स +privacy प्राइवेसी +contact कॉन्टैक्ट +main मेन +explore एक्सप्लोर +wiki विकी +docs डॉक्स +download डाउनलोड +upload अपलोड +uploads अपलोड्स +photos फ़ोटोज़ +video वीडियो +master मास्टर +blob ब्लॉब +tree ट्री +tests टेस्ट्स +test टेस्ट +config कॉन्फ़िग +settings सेटिंग्स +profile प्रोफ़ाइल +account अकाउंट +web वेब +email ई मेल +laptop लैपटॉप +mobile मोबाइल +phone फोन +phones फोन्स +online ऑनलाइन +courses कोर्सेज़ +learn लर्न +learning लर्निंग +university यूनिवर्सिटी +academy अकेडमी +domain डोमेन +domains डोमेन्स +analysis अनैलिसिस +play प्ले +maps मैप्स +drive ड्राइव +cloud क्लाउड +services सर्विसेज़ +india इंडिया +apache अपाची +kernel कर्नल +bin बिन +var वार +home होम +activate एक्टिवेट +sites साइट्स +available अवेलेबल +enabled इनेबल्ड +backups बैकअप्स +temp टेम्प +secure सेक्योर +real रियल +data डेटा +survey सर्वे +files फाइल्स +sample सैंपल +templates टेम्पलेट्स +impress इंप्रेस +office ऑफिस +libreoffice लिब्रे ऑफिस +express एक्सप्रेस +scribe स्क्राइब +transcription ट्रांसक्रिप्शन +software सॉफ्टवेयर +or ओ आर +teams टीम्स +green ग्रीन +brown ब्राउन +white व्हाइट +black ब्लैक +homepage होमपेज +content कंटेन्ट +default डिफ़ॉल्ट +foodhealth फूड हेल्थ +workflows वर्कफ्लोज़ +world वर्ल्ड +list लिस्ट +and एंड +LICENSE लाइसेंस +tag टैग +blogs ब्लॉग्स +bath बाथ +ward वार्ड +banks बैंक्स +dean डीन +rice राइस +honda होंडा +ford फोर्ड +house हाउस +bharat भरत +rich रिच +cook कुक +lane लेन +knight नाइट +moody मूडी +wise वाइज़ +shields शील्ड्स +puppy पप्पी +recipe रेसिपी +hall हॉल +mason मेसन +king किंग +fry फ्राई +flowers फ्लावर्स 
+assam आसाम +grace ग्रेस +bishop बिशप +woods वुड्स +brewer ब्रूअर +cannon कैनन +saute सौटे +pope पोप +robin रॉबिन +price प्राइस +address एड्रेस +venv वेन्व +SAMPLE एस ए एम पी एल ई +hotmail हॉटमेल +ExpressScribeTranscriptionSoftware ई एक्स पी आर ई एस एस एस सी आर आई बी ई टी आर ए एन एस सी आर आई पी टी आई ओ एन एस ओ एफ टी डब्ल्यू ए आर ई +Phones पी एच ओ एन ई एस +TXR20820d90fb1d3327447009e701166f29 टी एक्स आर दो शून्य आठ दो शून्य डी नौ शून्य एफ बी एक डी तीन तीन दो सात चार चार सात शून्य शून्य नौ ई सात शून्य एक एक छह छह एफ दो नौ +poetry पोएट्री +food फूड +health हेल्थ +any ए एन वाई +Hipo एच आई पी ओ +README आर ई ए डी एम ई +coursera कोर्सेरा +anuj ए एन यू जे +Main एम ए आई एन +google गूगल +blogger ब्लॉगर +travis टी आर ए वी आई एस +frank एफ आर ए एन के +Page पी ए जी ई \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_glyphs.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_glyphs.tsv new file mode 100644 index 000000000..dfc93217a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_glyphs.tsv @@ -0,0 +1,10 @@ +1 १ +2 २ +3 ३ +4 ४ +5 ५ +6 ६ +7 ७ +8 ८ +9 ९ +0 ० \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_words.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_words.tsv new file mode 100644 index 000000000..f85014305 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/digit_words.tsv @@ -0,0 +1,10 @@ +एक 1 +दो 2 +तीन 3 +चार 4 +पाँच 5 +छह 6 +सात 7 +आठ 8 +नौ 9 +शून्य 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/domain.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/domain.tsv new file mode 100644 index 000000000..c6e2a7cbe --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/domain.tsv @@ -0,0 +1,89 @@ 
+com कॉम +org ऑर्ग +net नेट +edu ई डी यू +gov जी ओ वी +biz बिज़ +info इन्फो +in इन +io आई ओ +ai ए आई +uk यू के +us यू एस +ru आर यू +de डी ई +jp जे पी +cn सी एन +au ए यू +ca सी ए +br बी आर +ac ए सी +res आर ई एस +nic एन आई सी +ernet ई आर नेट +int आई एन टी +tv टी वी +me एम ई +tech टेक +dev डी ई वी +app ऐप +xyz एक्स वाई ज़ेड +online ऑनलाइन +store स्टोर +blog ब्लॉग +site साइट +pro प्रो +photos फ़ोटोज़ +gov जी ओ वी +ac ए सी +nic एन आई सी +sims सिम्स +pope पोप +Zoom ज़ेड ओ ओ एम +Coursera सी ओ यू आर एस ई आर ए +Screenshot एस सी आर ई ई एन एस एच ओ टी +Microsoft एम आई सी आर ओ एस ओ एफ टी +Audacity ए यू डी ए सी आई टी वाई +Teams टी ई ए एम एस +hotmail हॉटमेल +Mobile एम ओ बी आई एल ई +Phones पी एच ओ एन ई एस +com कॉम +org ऑर्ग +net नेट +edu ई डी यू +gov जी ओ वी +biz बिज़ +info इन्फो +in इन +io आई ओ +ai ए आई +uk यू के +us यू एस +ru आर यू +de डी ई +fr एफ आर +jp जे पी +cn सी एन +au ए यू +ca सी ए +br बी आर +ac ए सी +res आर ई एस +nic एन आई सी +ernet ई आर नेट +int आई एन टी +tv टी वी +me एम ई +tech टेक +dev डी ई वी +app ऐप +xyz एक्स वाई ज़ेड +online ऑनलाइन +store स्टोर +blog ब्लॉग +site साइट +pro प्रो +photos फ़ोटोज़ +sims सिम्स +com COM \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/letters.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/letters.tsv new file mode 100644 index 000000000..2f4ad7963 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/letters.tsv @@ -0,0 +1,54 @@ +a ए +b बी +c सी +d डी +e ई +f एफ +f एफ़ +g जी +h एच +i आई +i आइ +j जे +k के +l एल +m एम +n एन +o ओ +p पी +q क्यू +r आर +s एस +t टी +u यू +v वी +w डब्ल्यू +x एक्स +y वाई +z ज़ेड +A ए +B बी +C सी +D डी +E ई +F एफ +G जी +H एच +I आई +J जे +K के +L एल +M एम +N एन +O ओ +P पी +Q क्यू +R आर +S एस +T टी +U यू +V वी +W डब्ल्यू +X एक्स +Y वाई +Z ज़ेड \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/server_name.tsv 
b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/server_name.tsv new file mode 100644 index 000000000..e9de1badd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/server_name.tsv @@ -0,0 +1,153 @@ +gmail जीमेल +yahoo याहू +hotmail हॉटमेल +outlook आउटलुक +live लाइव +google गूगल +microsoft माइक्रोसॉफ्ट +facebook फ़ेसबुक +twitter ट्विटर +instagram इंस्टाग्राम +linkedin लिंक्डइन +youtube यूट्यूब +amazon अमेज़ोन +wikipedia विकिपीडिया +github गिटहब +reddit रेडिट +netflix नेटफ्लिक्स +spotify स्पॉटिफाई +apple एप्पल +samsung सैमसंग +Zoom ज़ेड ओ ओ एम +Coursera सी ओ यू आर एस ई आर ए +Screenshot एस सी आर ई ई एन एस एच ओ टी +Microsoft एम आई सी आर ओ एस ओ एफ टी +Audacity ए यू डी ए सी आई टी वाई +Teams टी ई ए एम एस +Mobile एम ओ बी आई एल ई +Phones पी एच ओ एन ई एस +nvidia एनविडिया +intel इंटेल +adobe अडोब +wordpress वर्डप्रेस +blogger ब्लॉगर +mentalfloss मेंटल फ्लॉस +placekitten प्लेस किटन +dummyimage डमी इमेज +reliablesoft रिलाएबल सॉफ्ट +ebay ई बे +moz मोज़ +mozilla मॉज़िला +genius जीनियस +groupon ग्रुप ऑन +gutenberg गुटेनबर्ग +recipepuppy रेसिपी पप्पी +buyagift बाय अ गिफ्ट +researchgate रिसर्च गेट +afternic आफ्टर निक +hipolabs हिपो लैब्स +licindia एल आई सी इंडिया +placeimg प्लेस आई एम जी +codecademy कोड कैडेमी +skillshop स्किलशॉप +skillshare स्किल शेयर +udemy यूडेमी +masterclass मास्टरक्लास +amity एमिटी +sharda शारदा +universities यूनिवर्सिटीज़ +mcdonald मैक्डॉनल्ड +southmountaincc साउथ माउन्टेन सी सी +academyart अकेडमी आर्ट +bryanuniversity ब्रायन यूनिवर्सिटी +centralaz सेंट्रल ए ज़ेड +alaska अलास्का +phoenix फीनिक्स +phoenixcollege फीनिक्स कॉलेज +maricopa मैरीकोपा +prescott प्रेसकॉट +azwestern ए ज़ेड वेस्टर्न +poetry पोएट्री +harvard हावर्ड +bamu बी ए एम यू +garcia गारशिया +torres टॉरेस +robinson रॉबिन्सन +picsum पी आई सी एस यू एम +web वेब +laptop लैपटॉप +desktop डेस्कटॉप +email ई मेल +gaangulii जी ए ए एन जी यू एल आई आई +alii ए एल आई आई +paattil पी ए ए टी टी आई एल +laal एल ए ए एल +raamshrmaa आर ए ए एम एस एच आर एम ए ए 
+vphaadaar वी पी एच ए ए डी ए ए आर +dddhaal डी डी डी एच ए ए एल +mngt एम एन जी टी +dyaal डी वाई ए ए एल +kidd के आई डी डी +kline के एल आई एन ई +bnaa बी एन ए ए +dve डी वी ई +chaabraa सी एच ए ए बी आर ए ए +gaaykvaadd जी ए ए वाई के वी ए ए डी डी +jmaant जे एम ए ए एन टी +raamllaa आर ए ए एम एल एल ए ए +shirole एस एच आई आर ओ एल ई +saayaa एस ए ए वाई ए ए +loknaattyon एल ओ के एन ए ए टी टी वाई ओ एन +ddhiingraa डी डी एच आई आई एन जी आर ए ए +mjuumdaar एम जे यू यू एम डी ए ए आर +bjaaj बी जे ए ए जे +duube डी यू यू बी ई +vikaavi वी आई के ए ए वी आई +gaavit जी ए ए वी आई टी +sinh एस आई एन एच +baird बी ए आई आर डी +harvey एच ए आर वी ई वाई +holder एच ओ एल डी ई आर +mallik एम ए एल एल आई के +badami बी ए डी ए एम आई +maan एम ए ए एन +ortega ओ आर टी ई जी ए +osborne ओ एस बी ओ आर एन ई +perez पी ई आर ई ज़ेड +forbes एफ ओ आर बी ई एस +agraval ए जी आर ए वी ए एल +kunda के यू एन डी ए +frederick एफ आर ई डी ई आर आई सी के +fitzgerald एफ आई टी ज़ेड जी ई आर ए एल डी +aahuujaa ए ए एच यू यू जे ए ए +aacaary ए ए सी ए ए आर वाई +baalaasubrmnniym बी ए ए एल ए ए एस यू बी आर एम एन एन आई वाई एम +krssnnmuurti के आर एस एस एन एन एम यू यू आर टी आई +mllik एम एल एल आई के +baadaamii बी ए ए डी ए ए एम आई आई +congress सी ओ एन जी आर ई एस एस +powell पी ओ डब्ल्यू ई एल एल +russo आर यू एस एस ओ +mason मेसन +rich रिच +ruiz आर यू आई ज़ेड +baamu बी ए एम यू +bieap बी आई ई ए पी +aasc ए ए एस सी +lucero एल यू सी ई आर ओ +stevenson एस टी ई वी ई एन एस ओ एन +sharma एस एच ए आर एम ए +green ग्रीन +turner टी यू आर एन ई आर +wallace डब्ल्यू ए एल एल ए सी ई +agrvaal ए जी आर वी ए ए एल +kunnddaa के यू एन एन डी डी ए ए +shriiviml एस एच आर आई आई वी आई एम एल +baalkrssnnn बी ए ए एल के आर एस एस एन एन एन +shrivimal एस एच आर आई वी आई एम ए एल +ahluvaaliyaa ए एच एल यू वी ए ए एल आई वाई ए ए +naam एन ए ए एम +TXR टी एक्स आर +Hipo एच आई पी ओ +sanchez सानशेज़ +crawford क्रॉफोर्ड diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/electronic/special_codes.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/special_codes.tsv new file mode 
100644 index 000000000..28d07e974 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/electronic/special_codes.tsv @@ -0,0 +1,24 @@ +ज़ेड एक्स आठ शून्य एक नौ आठ शून्य ZX80 1980 +बी ए एस ई तीन दो Base32 +बी ई सी एच तीन दो Bech32 +सी ए एन ओ एन ए सात पाँच Canon A75 +आर ई डी एम आई एच चार चार चार जी REDMI H44 4G +आई एन एन ओ टी ई एक IN NOTE1 +ए एस सी आई आई आठ पाँच Ascii85 +एम के हाइफ़न एक ए Mk-1A +एक नौ तीन पाँच आठ एम नौ 1 9358M9 +छह दो दो छह शून्य एम छह 6 2260M6 +सात नौ नौ छह पाँच एम छह 7 9965M6 +डॉट टी आर ए वी आई एस डॉट वाई एम एल .travis.yml +एच आई पी ओ Hipo +अग्नि द्वितीय अग्नि-2 +कोविड नौटीन कोविड-19 +पृथ्वी चार पृथ्वी-4 +ब्रह्मोस हाइफ़न एक ब्रह्मोस-1 +भास्कर हाइफ़न दो भास्कर-II +श्रेणी हाइफ़न दो श्रेणी-II +रोहिणी आर एस हाइफ़न एक रोहिणी आर एस-I +ऑडी आठ शून्य बटा नौ शून्य बी चार ऑडी 80/90 B4 +ऑडी आठ शून्य बटा नौ शून्य बी चार ऑडी 80/90 B4 +पाँच जी 5G +लैपटॉप हाइफ़न चार तीन डॉट सानशेज़ हाइफ़न क्रॉफोर्ड laptop-43.sanchez-crawford diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv index a1ab32da0..d472e15df 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv @@ -263,6 +263,7 @@ yr सालों yr वर्ष yr वर्षों hp हॉर्स पावर +hp हॉर्सपॉवर hp हॉर्सपावर hp अश्वशक्ति hp अश्वशक्त @@ -284,4 +285,11 @@ mi/s मील प्रति सेकेंड mi/h मील प्रति घंटा mi/h मील प्रति घंटे mi/m मील प्रति मिनट -₹/ac रुपए प्रति एकड़ \ No newline at end of file +₹/ac रुपए प्रति एकड़ +w हफ़्ते +w हफ़्ता +w सप्ताह +सदियां सदियां +सदियाँ सदियाँ +सदियों सदियों +सदी सदी diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv index 0ca503bb1..3ee478688 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv 
+++ b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv @@ -22,7 +22,6 @@ km बोस्निया और हर्जेगोविना का म p बोत्सवाना पुला r$ ब्राजीलियाई रियाल £ ब्रिटिश पाउंड -£ पाउंड b$ ब्रुनेई डॉलर лв बुल्गारियाई लेव fbu बुरुंडी फ्रैंक @@ -179,4 +178,4 @@ bs. वेनेजुएलन बोलिवार ₺ लीरा ₦ नाइरा ¢ सेंट्स -¢ सेंट \ No newline at end of file +¢ सेंट diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv new file mode 100644 index 000000000..f56b256e6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv @@ -0,0 +1,231 @@ +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पन्द्रह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१६ सतरह +१७ अठारह +१७ अट्ठारह +१८ उन्नीस +१८ उनीस +१९ बीस +२० इक्कीस +२० इकीस +२० ईकीस +२१ बाईस +२१ बाइस +२२ तेईस +२२ तेइस +२३ चौबीस +२४ पच्चीस +२४ पचीस +२५ छब्बीस +२५ छबीस +२६ सत्ताईस +२६ सत्ताइस +२६ सताईस +२६ सताइस +२७ अट्ठाईस +२७ अट्ठाइस +२७ अठाईस +२७ अठाइस +२८ उनतीस +२८ उन्तीस +२९ तीस +३० इकतीस +३० इकतिस +३० इकत्तीस +३० इकत्तिस +३१ बत्तीस +३१ बत्तिस +३१ बतीस +३१ बतिस +३२ तैंतीस +३२ तैंतिस +३२ तैंत्तीस +३२ तैंत्तिस +३२ तेतीस +३२ तेंतीस +३३ चौंतीस +३३ चौंतिस +३३ चौंत्तीस +३३ चौंत्तिस +३४ पैंतीस +३४ पैंतिस +३४ पैंत्तीस +३४ पैंत्तिस +३५ छत्तीस +३५ छत्तिस +३५ छतीस +३५ छतिस +३६ सैंतीस +३६ सैंतिस +३६ सैंत्तीस +३६ सैंत्तिस +३७ अड़तीस +३७ अड़तिस +३७ अड़त्तीस +३७ अड़त्तिस +३८ उनतालीस +३८ उनतालिस +३८ उनत्तालीस +३८ उनत्तालिस +३८ उन्तालीस +३८ उन्तालिस +३९ चालीस +४० इकतालीस +४० इकतालिस +४० इक्तालीस +४१ बयालीस +४१ बयालिस +४१ ब्यालीस +४२ तैंतालीस +४२ तैंतालिस +४३ चौवालीस +४३ चौवालिस +४३ चवालीस +४३ चवालिस +४३ चौंतालीस +४४ पैंतालीस +४४ पैंतालिस +४५ छियालीस +४५ छियालिस +४५ छयालीस +४६ सैंतालीस +४६ सैंतालिस +४६ सैतालिस +४७ अड़तालीस +४७ अड़तालिस +४८ उनचास +४९ पचास +५० इक्यावन +५० इकयावन +५१ बावन +५२ तिरपन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५५ छपन +५६ 
सत्तावन +५६ सतावन +५७ अट्ठावन +५७ अठावन +५८ उनसठ +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६१ बासट +६२ तिरसठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६५ छयासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६८ उनहतर +६९ सत्तर +६९ सतर +७० इकहत्तर +७० इकहतर +७० इक्हत्तर +७० इकत्तर +७१ बहत्तर +७१ बहतर +७२ तिहत्तर +७२ तिहतर +७३ चौहत्तर +७३ चौहतर +७४ पचहत्तर +७४ पचहतर +७४ पिछत्तर +७४ पिछतर +७५ छिहत्तर +७५ छिहतर +७५ छियत्तर +७६ सतहत्तर +७६ सतहतर +७६ सतत्तर +७७ अठहत्तर +७७ अठहतर +७८ उन्यासी +७८ उन्यासि +७८ उनासी +७८ उनासि +७९ अस्सी +७९ अस्सि +८० इक्यासी +८० इक्यासि +८१ बयासी +८१ बयासि +८१ ब्यासी +८१ ब्यासि +८१ बिरासी +८२ तिरासी +८२ तिरासि +८२ तेरासी +८३ चौरासी +८३ चौरासि +८४ पचासी +८४ पचासि +८४ पिचयासी +८४ पिचयासि +८४ पिचासी +८५ छियासी +८५ छियासि +८६ सत्तासी +८६ सत्तासि +८६ सतासी +८६ सतासि +८७ अट्ठासी +८७ अट्ठासि +८७ अठासी +८७ अठासि +८८ नवासी +८८ नवासि +८९ नब्बे +९० इक्यानबे +९० इक्यानवे +९१ बानबे +९१ बानवे +९२ तिरानबे +९२ तिरानवे +९३ चौरानबे +९३ चौरानवे +९४ पंचानबे +९४ पंचानवे +९४ पचानवे +९४ पिचयानवे +९४ पिचयानबे +९४ पिच्यानवे +९४ पिच्यानबे +९५ छियानबे +९५ छियानवे +९६ सत्तानबे +९६ सत्तानवे +९७ अट्ठानबे +९७ अट्ठानवे +९७ अठानवे +९७ अठानबे +९८ निन्यान्बे +९८ निन्यानबे +९८ निन्यानवे +९८ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv index 91f656cfd..3968a1320 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -9,6 +9,7 @@ १७ सत्रह १७ सतरह १८ अठारह +१८ अठाहर १८ अट्ठारह १९ उन्नीस १९ उनीस @@ -216,4 +217,4 @@ ९९ निन्यान्बे ९९ निन्यानबे ९९ निन्यानवे -९९ निन्यान्वे \ No newline at end of file +९९ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py new file mode 100644 index 
000000000..e69de29bb diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv new file mode 100644 index 000000000..c2fb2992b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv @@ -0,0 +1,4 @@ +% प्रतिशत +% परसेंट +% फ़ीसदी +% फीसदी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv new file mode 100644 index 000000000..8373c52df --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv @@ -0,0 +1,15 @@ +mobile नंबर +mobile मोबाइल +mobile फोन +mobile कॉल +landline नंबर +landline मोबाइल +landline फोन +landline लैंडलाइन +landline कॉल +pincode पिन +pincode कोड +pincode पिनकोड +credit नंबर +credit कार्ड +credit क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv new file mode 100644 index 000000000..268d326b1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv @@ -0,0 +1 @@ +९१ नौ एक \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv new file mode 100644 index 000000000..6049cbf50 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv @@ -0,0 +1,9 @@ +१ one +२ two +३ three +४ four +५ five +६ six +७ seven +८ eight +९ nine \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv 
b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv new file mode 100644 index 000000000..769cbb603 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv @@ -0,0 +1 @@ +० zero \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv new file mode 100644 index 000000000..8bb4c67ca --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv @@ -0,0 +1,15 @@ +१२ एक +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv index f9eb081b9..8cfd0e19f 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -1,13 +1,7 @@ १/४ पाव -१/२ आधा -३/४ पौन -१:३० डेढ़ बजे -२:३० ढाई बजे -१.५ डेढ़ -२.५ ढाई कु. कुमारी स्मि. श्रीमती श्री. श्री श्री. श्रीमान मा. मास्टर -डॉ. डॉक्टर \ No newline at end of file +डॉ. 
डॉक्टर diff --git a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py index 96cbc58bb..b002efa52 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py @@ -32,6 +32,7 @@ graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +DEVANAGARI_DIGIT = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"] NEMO_HEX = pynini.union(*string.hexdigits).optimize() NEMO_NON_BREAKING_SPACE = u"\u00a0" @@ -63,6 +64,10 @@ MINUS = pynini.union("ऋणात्मक", "नकारात्मक").optimize() +def integer_to_devanagari(n: int) -> str: + return ''.join(DEVANAGARI_DIGIT[int(d)] for d in str(n)) + + def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py index 63b055bef..7fcdcf348 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py @@ -44,8 +44,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() graph_teens_and_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + graph_paune = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.graph_zero = graph_zero + self.graph_digit = graph_digit + self.graph_single_digit_with_zero = pynutil.insert("०") + graph_digit + self.graph_teens_and_ties = graph_teens_and_ties self.graph_two_digit = graph_teens_and_ties | (pynutil.insert("०") + graph_digit) graph_hundred = pynini.cross("सौ", "") + delete_hundred = pynutil.delete("सौ") delete_thousand = pynutil.delete("हज़ार") | pynutil.delete("हजार") graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("०")) graph_hundred_component += delete_space @@ -64,11 +70,93 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) graph_hundred_as_thousand += delete_space graph_hundred_as_thousand += self.graph_two_digit | pynutil.insert("००") + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_digit + + pynutil.insert("५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_digit + + pynutil.insert("२५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("पौने") + + 
delete_space + + graph_paune + + pynutil.insert("७५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("२५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) - self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand + graph_in_hundreds = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("२५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_paune + + pynutil.insert("७५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("ढाई") + delete_space + pynutil.insert("२५०", weight=-0.1) + delete_space + delete_hundred, + -0.1, + ) + self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand | graph_in_hundreds graph_teens_and_ties_component = pynini.union( - graph_teens_and_ties | pynutil.insert("00") + delete_space + (graph_digit | pynutil.insert("0")), + graph_teens_and_ties | pynutil.insert("००") + delete_space + (graph_digit | pynutil.insert("०")), ) graph_ties_component_at_least_one_none_zero_digit = 
self.graph_two_digit @ ( pynini.closure(NEMO_HI_DIGIT) + pynini.closure(NEMO_HI_DIGIT) @@ -139,6 +227,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_no_prefix = pynutil.add_weight( pynini.cross("सौ", "१००") | pynini.cross("हज़ार", "१०००") + | pynini.cross("हजार", "१०००") | pynini.cross("लाख", "१०००००") | pynini.cross("करोड़", "१०००००००"), 2, diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 817b1b86a..d019a7973 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -16,11 +16,13 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, NEMO_HI_DIGIT, GraphFst, delete_extra_space, delete_space, insert_space, + integer_to_devanagari, ) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -35,16 +37,22 @@ class DateFst(GraphFst): date: DateFst """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="classify") graph_year = pynutil.add_weight( pynini.compose(cardinal.graph_no_exception, pynini.closure(NEMO_HI_DIGIT, 1, 4)), 0.03 ) + cardinal_graph = cardinal.graph_no_exception month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) - graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() + + graph_date_days = cardinal.graph_digit | cardinal.graph_teens_and_ties + date_days = pynini.union(*[integer_to_devanagari(i) for i in range(1, 32)]).optimize() + graph_date_days = graph_date_days @ date_days + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() + graph_morph_features = pynini.string_file(get_abs_path("data/ordinals/morph_features.tsv")) self.day = pynutil.insert("day: \"") + 
graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") @@ -60,8 +68,10 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\" ") ) self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") - insert_comma = pynutil.insert(", ") - + self.ordinal_century = pynutil.insert("era: \"") + cardinal_graph + pynutil.insert("\" ") + self.morpho_graph = ( + pynutil.insert("morphosyntactic_features: \"") + graph_morph_features + pynutil.insert("\"") + ) graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day graph_month_day += pynutil.insert(" preserve_order: true") @@ -76,7 +86,11 @@ def __init__(self, cardinal: GraphFst): ) graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century graph_year_range = self.year_range + graph_year_range_century = self.year_range + delete_space + self.century + + graph_ordinal_century = self.ordinal_century + self.morpho_graph + delete_extra_space + self.century + graph_ordinal_century = self.ordinal_century + self.morpho_graph + delete_extra_space + self.century graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day graph_date_exceptions += pynutil.insert("preserve_order: true") @@ -91,6 +105,8 @@ def __init__(self, cardinal: GraphFst): | graph_day_month_year_century | graph_month_year_century | graph_year_range + | graph_year_range_century + | graph_ordinal_century | graph_date_exceptions ) final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/electronic.py new file mode 100644 index 000000000..58b33ce20 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/electronic.py @@ -0,0 +1,581 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + GraphFst, + delete_space, + delete_zero_or_one_space, + NEMO_SIGMA, +) +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class ElectronicFst(GraphFst): + """ + Finite state transducer for classifying electronic expressions in Hindi + inverse text normalization: converts spoken Hindi words into written + electronic forms such as email addresses, URLs, file paths, IP addresses, + domains, chemical formulas, and alphanumeric codes. + + e-mail: + e.g. कुमार एट जीमेल डॉट कॉम + -> tokens { electronic { username: "kumar" domain: "gmail.com" } } + URL: + e.g. एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश गूगल डॉट कॉम + -> tokens { electronic { domain: "https://google.com" } } + file path (Windows): + e.g. सी कोलन बैकवर्ड स्लैश यूजर्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप + -> tokens { electronic { path: "C:\\Users\\HP\\Desktop" } } + file path (Unix/Linux): + e.g. फॉरवर्ड स्लैश होम फॉरवर्ड स्लैश यूजर फॉरवर्ड स्लैश डॉक्युमेंट्स + -> tokens { electronic { path: "/home/user/documents" } } + IP address: + e.g. एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक + -> tokens { electronic { ip: "192.168.1.1" } } + chemical formula: + e.g. एन ए ओ एच + -> tokens { electronic { domain: "NaOH" } } + alphanumeric code: + e.g. 
ए बी सी एक दो तीन + -> tokens { electronic { domain: "ABC123" } } + + """ + + def __init__(self): + super().__init__(name="electronic", kind="classify") + digit_words = pynini.string_file( + get_abs_path("data/electronic/digit_words.tsv") + ) + digit_glyphs = pynini.string_file( + get_abs_path("data/electronic/digit_glyphs.tsv") + ).invert() + single_digit = ( + pynutil.add_weight(digit_glyphs, 0.8) + | pynutil.add_weight(digit_words, 0.50) + ) + digit_seq = ( + pynutil.add_weight(digit_glyphs + pynini.closure(digit_glyphs, 0), 0.8) + | pynutil.add_weight(digit_words + pynini.closure(delete_space + digit_words, 0), 0.9) + ) + + letter_map = pynini.string_file(get_abs_path("data/electronic/letters.tsv")).invert() + domain_map = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + server_map = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() + common_map = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).invert() + + try: + chem_named_map = pynini.string_file( + get_abs_path("data/electronic/chemical_formulas.tsv") + ).optimize() + except Exception: + chem_named_map = None + + special_codes_map = pynini.string_file( + get_abs_path("data/electronic/special_codes.tsv") + ).optimize() + + to_lower = pynini.cdrewrite( + pynini.string_map([(chr(c), chr(c + 32)) for c in range(ord('A'), ord('Z') + 1)]), + "", "", NEMO_SIGMA, + ) + to_upper = pynini.cdrewrite( + pynini.string_map([(chr(c + 32), chr(c)) for c in range(ord('A'), ord('Z') + 1)]), + "", "", NEMO_SIGMA, + ) + + def make_lower(fst): + return (fst @ to_lower).optimize() + + def make_upper(fst): + return (fst @ to_upper).optimize() + + letter_map_lower = make_lower(letter_map) + letter_map_upper = make_upper(letter_map) + common_map_lower = make_lower(common_map) + server_map_lower = make_lower(server_map) + + latin_run = pynini.closure( + pynini.union(*[pynini.accep(chr(c)) for c in range(ord('A'), ord('Z') + 1)]) + | 
pynini.union(*[pynini.accep(chr(c)) for c in range(ord('a'), ord('z') + 1)]), + 1, + ) + latin_run_lower = make_lower(latin_run) + + _drive_chars = pynini.union("C", "D", "E", "F", "G", "H", "I", "J") + drive_letter = pynini.compose(letter_map_upper, _drive_chars) + + def _backslash(): + return (pynutil.delete("बैकवर्ड") + delete_space + + pynutil.delete("स्लैश") + pynutil.insert("\\\\")) + seg_backslash = delete_space + _backslash() + delete_space + trail_backslash = delete_space + _backslash() + lead_backslash = _backslash() + delete_space + + def _unix_slash(): + return (pynutil.delete("फॉरवर्ड") + delete_space + + pynutil.delete("स्लैश") + pynutil.insert("/")) + unix_seg_slash = delete_space + _unix_slash() + delete_space + unix_lead_slash = _unix_slash() + delete_space + unix_trail_slash = delete_space + _unix_slash() + + url_slash = ( + delete_space + + pynutil.delete("फॉरवर्ड") + delete_space + pynutil.delete("स्लैश") + + pynutil.insert("/") + ) + + lit_slash_seg = pynini.cross(" / ", "/") + lit_hyphen_seg = pynini.cross(" - ", "-") + + dot = delete_space + (pynutil.delete("डॉट") | pynutil.delete("DOT")) + delete_space + pynutil.insert(".") + dot_end_safe = delete_space + (pynutil.delete("डॉट") | pynutil.delete("DOT")) + delete_zero_or_one_space + pynutil.insert(".") + hyphen = delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + delete_space + pynutil.insert("-") + underscore = delete_space + pynutil.delete("अंडर") + delete_space + pynutil.delete("स्कोर") + pynutil.insert("_") + at_sign = delete_space + pynutil.delete("एट") + delete_space + x_sep = delete_space + pynutil.delete("एक्स") + pynutil.insert("x") + literal_space = delete_space + pynutil.delete("स्पेस") + pynutil.insert(" ") + open_bracket = ( + delete_space + + pynutil.delete("ओपन") + delete_space + pynutil.delete("ब्रेकेट") + + pynutil.insert("(") + + delete_zero_or_one_space + ) + close_bracket = ( + delete_space + + pynutil.delete("क्लोज़") + delete_space + 
pynutil.delete("ब्रेकेट") + + pynutil.insert(")") + + delete_zero_or_one_space + ) + dollar_sign = delete_space + pynutil.delete("डॉलर") + pynutil.insert("$") + + lit_open_paren = ( + delete_space + + pynutil.delete("(") + + pynutil.insert("(") + + delete_zero_or_one_space + ) + lit_close_paren = ( + delete_space + + pynutil.delete(")") + + pynutil.insert(")") + ) + or_word = pynutil.delete("ओ") + delete_space + pynutil.delete("आर") + pynutil.insert("or") + and_as_letters = pynutil.delete("एंड") + pynutil.insert("and") + www_token = (pynutil.delete("डब्ल्यू") + delete_space + + pynutil.delete("डब्ल्यू") + delete_space + + pynutil.delete("डब्ल्यू") + pynutil.insert("www")) + v_prefix = pynutil.delete("वी") + pynutil.insert("v") + hp_token = pynutil.delete("एच") + delete_space + pynutil.delete("पी") + pynutil.insert("HP") + tilde_delete = pynutil.delete("~") | pynutil.delete("टिल्ड") + + single_token = ( + pynutil.add_weight(server_map, 0.90) + | pynutil.add_weight(common_map, 0.95) + | pynutil.add_weight(letter_map_lower, 1.00) + ) + token_seq = single_token + pynini.closure(delete_space + single_token, 0) + + path_atom = ( + pynutil.add_weight(hp_token, 0.76) + | pynutil.add_weight(www_token, 0.77) + | pynutil.add_weight(or_word, 0.80) + | pynutil.add_weight(and_as_letters, 0.84) + | pynutil.add_weight(common_map, 0.90) + | pynutil.add_weight(server_map, 0.92) + | pynutil.add_weight(digit_words, 0.94) + | pynutil.add_weight(digit_glyphs, 0.95) + | pynutil.add_weight(latin_run, 0.97) + | pynutil.add_weight(letter_map, 1.00) + ) + path_atom_lower = ( + pynutil.add_weight(common_map_lower, 0.90) + | pynutil.add_weight(server_map_lower, 0.92) + | pynutil.add_weight(digit_words, 0.94) + | pynutil.add_weight(digit_glyphs, 0.95) + | pynutil.add_weight(latin_run_lower, 0.97) + | pynutil.add_weight(letter_map_lower, 1.00) + ) + unix_path_atom = ( + pynutil.add_weight(www_token, 0.77) + | pynutil.add_weight(or_word, 0.80) + | pynutil.add_weight(and_as_letters, 0.84) + | 
pynutil.add_weight(pynini.cross("CI", "c"), 0.86) + | path_atom_lower + ) + + single_ext = ( + delete_space + pynutil.delete("डॉट") + pynutil.insert(".") + + delete_space + path_atom_lower + + pynini.closure(delete_space + path_atom_lower, 0) + ) + ext_hyphen = ( + delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + + pynutil.insert("-") + delete_space + + path_atom_lower + pynini.closure(delete_space + path_atom_lower, 0) + ) + file_ext = single_ext + pynini.closure(single_ext | ext_hyphen, 0) + + win_hyphen = ( + delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + + pynutil.insert("-") + delete_space + + path_atom + pynini.closure(delete_space + path_atom, 0) + ) + win_underscore = ( + delete_space + pynutil.delete("अंडर") + delete_space + + pynutil.delete("स्कोर") + pynutil.insert("_") + ) + path_segment = ( + path_atom + + pynini.closure( + pynutil.add_weight(delete_space + path_atom, 1.0) + | pynutil.add_weight(win_hyphen, 1.0) + | pynutil.add_weight(win_underscore, 1.0) + | pynutil.add_weight(literal_space, 1.0) + | pynutil.add_weight(open_bracket, 1.0) + | pynutil.add_weight(close_bracket, 1.0) + | pynutil.add_weight(lit_open_paren, 1.0) + | pynutil.add_weight(lit_close_paren, 1.0) + , 0) + + pynini.closure(file_ext, 0, 1) + ) + + unix_hyphen = ( + delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + + pynutil.insert("-") + delete_space + + unix_path_atom + pynini.closure(delete_space + unix_path_atom, 0) + ) + unix_underscore = ( + delete_space + pynutil.delete("अंडर") + delete_space + + pynutil.delete("स्कोर") + pynutil.insert("_") + + delete_space + unix_path_atom + + pynini.closure(delete_space + unix_path_atom, 0) + ) + version_seg = ( + v_prefix + unix_path_atom + + pynini.closure( + delete_space + pynutil.delete("डॉट") + pynutil.insert(".") + + delete_space + unix_path_atom + + pynini.closure(delete_space + unix_path_atom, 0) + , 0) + ) + dollar_var = ( + dollar_sign + delete_space + + 
unix_path_atom + pynini.closure(delete_space + unix_path_atom, 0) + ) + unix_segment = ( + ( + pynutil.add_weight(version_seg, 0.85) + | pynutil.add_weight(dollar_var, 0.87) + | pynutil.add_weight(unix_path_atom, 1.00) + ) + + pynini.closure( + pynutil.add_weight(delete_space + unix_path_atom, 1.0) + | pynutil.add_weight(unix_hyphen, 1.0) + | pynutil.add_weight(unix_underscore, 1.0) + , 0) + + pynini.closure(file_ext, 0, 1) + ) + + windows_path_fst = ( + pynutil.insert("path: \"") + + drive_letter + delete_space + pynutil.delete("कोलन") + pynutil.insert(":") + + seg_backslash + path_segment + + pynini.closure(seg_backslash + path_segment, 0) + + pynini.closure(trail_backslash, 0, 1) + + pynutil.insert("\"") + ) + unc_path_fst = ( + pynutil.insert("path: \"") + + lead_backslash + path_segment + + pynini.closure(seg_backslash + path_segment, 0) + + pynini.closure(trail_backslash, 0, 1) + + pynutil.insert("\"") + ) + unix_abs_path_fst = ( + pynutil.insert("path: \"") + + unix_lead_slash + unix_segment + + pynini.closure(unix_seg_slash + unix_segment, 0) + + pynini.closure(unix_trail_slash, 0, 1) + + pynutil.insert("\"") + ) + unix_rel_path_fst = ( + pynutil.insert("path: \"") + + unix_segment + unix_seg_slash + unix_segment + + pynini.closure(unix_seg_slash + unix_segment, 0) + + pynini.closure(unix_trail_slash, 0, 1) + + pynutil.insert("\"") + ) + tilde_path_fst = ( + pynutil.insert("path: \"") + + tilde_delete + unix_seg_slash + unix_segment + + pynini.closure(unix_seg_slash + unix_segment, 0) + + pynini.closure(unix_trail_slash, 0, 1) + + pynutil.insert("\"") + ) + + lit_seg = ( + unix_path_atom + + pynini.closure( + pynutil.add_weight(delete_space + unix_path_atom, 1.0) + | pynutil.add_weight(unix_hyphen, 1.0) + | pynutil.add_weight(lit_hyphen_seg, 1.0) + , 0) + + pynini.closure(file_ext, 0, 1) + ) + literal_rel_path_fst = ( + pynutil.insert("path: \"") + + lit_seg + + lit_slash_seg + lit_seg + + pynini.closure(lit_slash_seg + lit_seg, 0) + + 
pynini.closure(pynini.cross(" /", "/"), 0, 1) + + pynutil.insert("\"") + ) + + host_prefix_map = pynini.string_map([ + ("एस आर वी", "srv"), ("डी बी", "db"), ("एल टी", "lt"), + ("वेब", "web"), ("लैपटॉप", "laptop"), ("डेस्कटॉप", "desktop"), + ("ई मेल", "email"), + ]) + + domain_single = ( + pynutil.add_weight(server_map_lower, 0.90) + | pynutil.add_weight(common_map_lower, 0.95) + | pynutil.add_weight(letter_map_lower, 1.00) + ) + domain_token_seq = domain_single + pynini.closure(delete_space + domain_single, 0) + + digit_then_letter = ( + digit_seq + + pynini.closure(delete_space + letter_map_lower, 0) + ) + + first_label = ( + pynutil.add_weight(host_prefix_map, 0.80) + | pynutil.add_weight(digit_seq + delete_space + letter_map_lower, 0.85) + | pynutil.add_weight(digit_seq, 0.87) + | pynutil.add_weight(domain_token_seq, 1.00) + ) + domain_body = first_label + pynini.closure( + hyphen + ( + pynutil.add_weight(digit_then_letter, 0.85) + | pynutil.add_weight(digit_seq | domain_token_seq, 1.0) + ), 0 + ) + compound_tld = domain_map + pynini.closure(dot_end_safe + domain_map, 0, 2) + full_domain = pynini.closure(domain_body + dot, 0, 4) + domain_body + dot + compound_tld + full_domain_bare = pynini.closure(domain_body + dot, 0, 4) + domain_body + + uname_atom = ( + pynutil.add_weight(and_as_letters, 0.84) + | pynutil.add_weight(digit_words, 0.88) + | pynutil.add_weight(digit_glyphs, 0.88) + | pynutil.add_weight(server_map, 0.90) + | pynutil.add_weight(common_map, 0.95) + | pynutil.add_weight(letter_map_lower, 0.84) + ) + uname_sep = ( + (delete_space + pynutil.delete("डॉट") + delete_space + pynutil.insert(".")) + | (delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + delete_space + pynutil.insert("-")) + | (delete_space + pynutil.delete("अंडर") + delete_space + pynutil.delete("स्कोर") + pynutil.insert("_")) + ) + username = uname_atom + pynini.closure((uname_sep + uname_atom) | (delete_space + uname_atom), 0) + email_fst = ( + pynutil.insert("username: 
\"") + username + pynutil.insert("\"") + + at_sign + + pynutil.insert("domain: \"") + domain_body + dot + compound_tld + pynutil.insert("\"") + ) + + ip_octet = single_digit + pynini.closure(delete_space + single_digit, 0, 2) + ip_fst = ( + pynutil.insert("ip: \"") + + ip_octet + dot + ip_octet + dot + ip_octet + dot + ip_octet + + pynutil.insert("\"") + ) + + path_atom_url = ( + pynutil.add_weight(digit_seq + x_sep + delete_space + digit_seq, 0.75) + | pynutil.add_weight(digit_seq + delete_space + letter_map_lower + delete_space + digit_seq, 0.80) + | pynutil.add_weight(digit_words, 0.88) + | pynutil.add_weight(digit_glyphs, 0.89) + | pynutil.add_weight(digit_seq, 0.90) + | pynutil.add_weight(token_seq, 1.00) + ) + + inline_domain_seg = ( + pynini.closure(token_seq + dot, 0, 2) + token_seq + dot + domain_map + + pynini.closure(dot + domain_map, 0, 1) + ) + + path_segment_url = ( + path_atom_url + + pynini.closure(hyphen + (digit_seq | token_seq), 0) + + pynini.closure(underscore + token_seq, 0) + + pynini.closure(dot + token_seq, 0, 1) + ) + + slash_with_word = url_slash + ( + pynutil.add_weight(pynutil.insert(".") + pynutil.delete("डॉट") + delete_space + token_seq, 0.90) + | pynutil.add_weight(inline_domain_seg, 0.95) + | pynutil.add_weight(path_segment_url, 1.00) + ) + + www_as_path_seg = ( + www_token + dot + full_domain + + pynini.closure(slash_with_word, 0) + ) + + slash_with_word = url_slash + ( + pynutil.add_weight(delete_space + digit_seq + x_sep + delete_space + digit_seq, 0.20) + | pynutil.add_weight(delete_space + digit_seq, 0.30) + | pynutil.add_weight(delete_space + pynutil.insert(".") + pynutil.delete("डॉट") + delete_space + token_seq, 0.90) + | pynutil.add_weight(delete_space + inline_domain_seg, 0.95) + | pynutil.add_weight(delete_space + www_as_path_seg, 0.97) + | pynutil.add_weight(delete_space + path_segment_url, 1.00) + ) + + hash_frag_body = token_seq + pynini.closure(hyphen + token_seq, 0) + hash_frag = ( + delete_space + + 
pynutil.delete("हैशटैग") + + pynutil.insert("#") + + delete_space + + hash_frag_body + ) + + domain_and_path = ( + full_domain + + pynini.closure(slash_with_word, 0) + + pynini.closure(url_slash, 0, 1) + + pynini.closure(hash_frag, 0, 1) + ) + domain_and_path_bare = ( + full_domain_bare + + pynini.closure(slash_with_word, 0) + + pynini.closure(url_slash, 0, 1) + + pynini.closure(hash_frag, 0, 1) + ) + + https_prefix = ( + pynutil.delete("एच") + delete_space + pynutil.delete("टी") + delete_space + + pynutil.delete("टी") + delete_space + pynutil.delete("पी") + delete_space + + pynutil.delete("एस") + delete_space + pynutil.delete("कोलन") + delete_space + + pynutil.delete("फॉरवर्ड") + delete_space + pynutil.delete("स्लैश") + delete_space + + pynutil.delete("फॉरवर्ड") + delete_space + pynutil.delete("स्लैश") + + pynutil.insert("https://") + ) + http_prefix = ( + pynutil.delete("एच") + delete_space + pynutil.delete("टी") + delete_space + + pynutil.delete("टी") + delete_space + pynutil.delete("पी") + delete_space + + pynutil.delete("कोलन") + delete_space + + pynutil.delete("फॉरवर्ड") + delete_space + pynutil.delete("स्लैश") + delete_space + + pynutil.delete("फॉरवर्ड") + delete_space + pynutil.delete("स्लैश") + + pynutil.insert("http://") + ) + protocol = pynutil.add_weight(https_prefix, 1.0) | pynutil.add_weight(http_prefix, 1.01) + + url_fst = (pynutil.insert("domain: \"") + protocol + delete_space + + pynini.closure(www_token + dot, 0, 1) + domain_and_path + pynutil.insert("\"")) + www_fst = (pynutil.insert("domain: \"") + www_token + dot + + domain_and_path + pynutil.insert("\"")) + plain_fst = pynutil.insert("domain: \"") + domain_and_path + pynutil.insert("\"") + + url_fst_bare = ( + pynutil.insert("domain: \"") + protocol + delete_space + + pynini.closure(www_token + dot, 0, 1) + + domain_and_path_bare + pynutil.insert("\"") + ) + www_fst_bare = ( + pynutil.insert("domain: \"") + www_token + dot + + domain_and_path_bare + pynutil.insert("\"") + ) + + chem_token = ( 
+ pynutil.add_weight(digit_glyphs, 0.90) + | pynutil.add_weight(letter_map, 1.00) + ) + chem_more = pynini.closure( + pynutil.add_weight(delete_space + chem_token, 1.0) + | pynutil.add_weight(open_bracket, 1.0) + | pynutil.add_weight(close_bracket, 1.0) + | pynutil.add_weight(delete_space + pynutil.delete("इनदो") + pynutil.insert("("), 1.0) + | pynutil.add_weight(delete_space + pynutil.delete("बाय") + pynutil.insert(")"), 1.0) + | pynutil.add_weight(delete_space + (pynutil.delete("माइनस") | pynutil.delete("–")) + pynutil.insert("−"), 1.0) + , 0) + + chem_spelled_fst = pynutil.insert("domain: \"") + ( + chem_token + delete_space + chem_token + chem_more + ) + pynutil.insert("\"") + + alnum_phrase_fst = pynutil.insert("domain: \"") + special_codes_map + pynutil.insert("\"") + + alnum_token = ( + pynutil.add_weight(digit_glyphs, 0.77) + | pynutil.add_weight(digit_words, 0.10) + | pynutil.add_weight(letter_map_upper, 0.84) + ) + alnum_run = alnum_token + delete_space + alnum_token + pynini.closure(delete_space + alnum_token, 0) + + alnum_hyphen_ext = ( + delete_space + (pynutil.delete("हाइफ़न") | pynutil.delete("हाइफन")) + + pynutil.insert("-") + delete_space + + alnum_token + pynini.closure(delete_space + alnum_token, 0) + ) + alnum_body_start = ( + alnum_run + | (alnum_token + alnum_hyphen_ext) + ) + alnum_body = alnum_body_start + pynini.closure( + pynutil.add_weight(alnum_hyphen_ext, 1.0) + | pynutil.add_weight( + delete_space + + (pynutil.delete("डॉट") | pynutil.delete("DOT") | pynutil.delete("प्वाइंट")) + + pynutil.insert(".") + delete_space + + alnum_token + pynini.closure(delete_space + alnum_token, 0), 1.0) + | pynutil.add_weight( + delete_space + pynutil.delete("स्पेस") + pynutil.insert(" ") + + delete_space + alnum_token + pynini.closure(delete_space + alnum_token, 0), 1.0) + | pynutil.add_weight(lit_open_paren + alnum_token + pynini.closure(delete_space + alnum_token, 0), 1.0) + | pynutil.add_weight(lit_close_paren + alnum_token + 
pynini.closure(delete_space + alnum_token, 0), 1.0) + | pynutil.add_weight(lit_close_paren, 1.0) + , 0) + alnum_letterdigit_fst = pynutil.insert("domain: \"") + alnum_body + pynutil.insert("\"") + + chem_fst = pynutil.add_weight( + pynutil.insert("domain: \"") + chem_named_map + pynutil.insert("\""), 0.04 + ) if chem_named_map is not None else pynini.accep("") + + graph = ( + pynutil.add_weight(ip_fst, 1.00) + | pynutil.add_weight(email_fst, 1.05) + | pynutil.add_weight(windows_path_fst, 1.06) + | pynutil.add_weight(unc_path_fst, 1.07) + | pynutil.add_weight(url_fst, 0.10) + | pynutil.add_weight(www_fst, 0.11) + | pynutil.add_weight(url_fst_bare, 0.50) + | pynutil.add_weight(www_fst_bare, 0.51) + | pynutil.add_weight(unix_abs_path_fst, 15.00) + | pynutil.add_weight(tilde_path_fst, 1.12) + | pynutil.add_weight(unix_rel_path_fst, 15.00) + | pynutil.add_weight(literal_rel_path_fst, 1.15) + | pynutil.add_weight(alnum_phrase_fst, 0.05) + | pynutil.add_weight(chem_spelled_fst, 1.18) + | pynutil.add_weight(alnum_letterdigit_fst, 0.90) + | pynutil.add_weight(plain_fst, 1.30) + | chem_fst + ) + + self.fst = self.add_tokens(graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py index 1e44f59e8..970bf7313 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py @@ -16,13 +16,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.text_normalization.en.utils import load_labels +from nemo_text_processing.text_normalization.hi.graph_utils import ( INPUT_CASED, 
INPUT_LOWER_CASED, MIN_NEG_WEIGHT, MINUS, - NEMO_DIGIT, + NEMO_HI_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, @@ -30,7 +31,6 @@ delete_extra_space, delete_space, ) -from nemo_text_processing.text_normalization.en.utils import load_labels class FractionFst(GraphFst): @@ -59,9 +59,99 @@ def __init__(self, cardinal: GraphFst): numerator = pynutil.insert("numerator: \"") + graph_cardinal + pynutil.insert("\"") denominator = pynutil.insert(" denominator: \"") + graph_cardinal + pynutil.insert("\"") + graph_fraction = numerator + delete_bata + denominator + graph_mixed_fraction = integer + delete_extra_space + pynutil.delete("सही") + delete_space + graph_fraction + + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.001, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + integer + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.1, + ) + + graph_aadha_and_saade_only = ( + pynini.union(pynutil.delete("आधा") | pynutil.delete("साढ़े")) + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\"") + ) + 
graph_sava_only = ( + pynutil.delete("सवा") + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + graph_paune_only = ( + pynini.union(pynutil.delete("पौन") | pynutil.delete("पौना")) + + delete_space + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + + graph_tihaai = ( + numerator + delete_space + pynutil.delete("तिहाई") + delete_space + pynutil.insert(" denominator: \"३\"") + ) + graph_chauthaai = ( + numerator + delete_space + pynutil.delete("चौथाई") + delete_space + pynutil.insert(" denominator: \"४\"") + ) + + graph_quarterly_exceptions = ( + graph_saade + | graph_sava + | graph_paune + | graph_dedh + | graph_dhaai + | graph_aadha_and_saade_only + | graph_sava_only + | graph_paune_only + | graph_tihaai + | graph_chauthaai + ) - graph = graph_fraction + graph = graph_fraction | graph_mixed_fraction | graph_quarterly_exceptions self.graph = graph.optimize() self.final_graph_wo_negative = graph optional_graph_negative = pynini.closure( diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py index 15d8e4eb8..59227a436 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py @@ -17,11 +17,12 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_CHAR, + NEMO_WHITE_SPACE, GraphFst, convert_space, delete_extra_space, delete_space, - insert_space, ) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -31,11 +32,11 @@ class MeasureFst(GraphFst): Finite state transducer for classifying measure e.g. ऋण बारह किलोग्राम -> measure { decimal { negative: "true" integer_part: "१२" fractional_part: "५०"} units: "kg" } e.g. 
ऋण बारह किलोग्राम -> measure { cardinal { negative: "true" integer_part: "१२"} units: "kg" } + e.g. सात शून्य शून्य ओक स्ट्रीट -> measure { units: "address" cardinal { integer: "७०० ओक स्ट्रीट" } preserve_order: true } Args: cardinal: CardinalFst decimal: DecimalFst - measure: MeasureFst """ def __init__(self, cardinal: GraphFst, decimal: GraphFst): @@ -51,7 +52,60 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) measurements_graph = pynini.string_file(get_abs_path("data/measure/measurements.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.measurements = pynutil.insert("units: \"") + measurements_graph + pynutil.insert("\" ") + graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + graph_integer_paune = pynutil.insert("integer_part: \"") + paune_graph + pynutil.insert("\"") + + graph_saade_single_digit = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_sava_single_digit = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"२५\""), + 0.1, + ) + graph_paune_single_digit = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_integer_paune + + delete_space + + pynutil.insert(" fractional_part: \"७५\""), + 1, + ) + graph_dedh_single_digit = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_dhaai_single_digit = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 1, + ) + + graph_exceptions = ( + graph_saade_single_digit + | graph_sava_single_digit + | 
graph_paune_single_digit + | graph_dedh_single_digit + | graph_dhaai_single_digit + ) graph_measurements = ( pynutil.insert("decimal { ") @@ -71,8 +125,129 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + self.measurements ) + graph_quarterly_measurements = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + graph_exceptions + + pynutil.insert(" }") + + delete_extra_space + + self.measurements + ) + graph_exception_bai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + delete_space + + pynini.cross("बाई", "x") + + delete_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space + self.measurements) + ) - graph = graph_measurements + # Shared digit word -> Devanagari digit mapping + num_word = ( + ( + pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + | pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")) + | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")) + ) + .invert() + .optimize() + ) + + delete_one_space = pynutil.delete(" ") + + # Structured address: state/city + pincode + states = pynini.string_file(get_abs_path("data/address/states.tsv")) + cities = pynini.string_file(get_abs_path("data/address/cities.tsv")) + state_city_names = pynini.union(states, cities).optimize() + + pincode = num_word + pynini.closure(delete_one_space + num_word, 5, 5) + + structured_pattern = ( + state_city_names + + pynini.closure(pynini.accep(",") + pynini.accep(" ") + state_city_names, 0, 1) + + pynini.accep(" ") + + pincode + ).optimize() + + structured_address_graph = ( + pynutil.insert('units: "address" cardinal { integer: "') + + convert_space(structured_pattern) + + pynutil.insert('" } preserve_order: true') + ) + structured_address_graph = pynutil.add_weight(structured_address_graph, 1.0).optimize() + + # 
Address: digit/special/ordinal conversion with context keywords + special_word = pynini.string_file(get_abs_path("data/address/special_characters.tsv")) + ordinal_word = pynini.string_file(get_abs_path("data/address/ordinals.tsv")) + context_keywords_fsa = pynini.string_file(get_abs_path("data/address/context_cues.tsv")) + + digit_passthrough = pynini.string_file(get_abs_path("data/address/digit_passthrough.tsv")).optimize() + digit_unit = pynini.union(num_word, digit_passthrough).optimize() + + all_digit_inputs = pynini.project(digit_unit, "input").optimize() + all_ordinal_inputs = pynini.project(ordinal_word, "input").optimize() + + non_space_non_comma = pynini.difference( + NEMO_CHAR, pynini.union(NEMO_WHITE_SPACE, pynini.accep(",")) + ).optimize() + any_word = pynini.closure(non_space_non_comma, 1).optimize() + + text_word = pynini.difference(any_word, pynini.union(all_digit_inputs, all_ordinal_inputs)).optimize() + + digit_block = digit_unit + pynini.closure(pynutil.add_weight(delete_one_space + digit_unit, -1.0)) + + connector = delete_one_space + special_word + delete_one_space + + matchable = pynini.union( + pynutil.add_weight(digit_block, -0.1), + pynutil.add_weight(ordinal_word, -0.2), + pynutil.add_weight(text_word, 0.1), + ).optimize() + + chain = matchable + pynini.closure(pynutil.add_weight(connector + matchable, -0.5)) + + opt_comma = pynini.closure(pynini.accep(","), 0, 1) + element = chain + opt_comma + address_content = element + pynini.closure(pynini.accep(" ") + element) + + # Context detection: keyword must appear as a complete word in the input + any_char = pynini.union( + pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE), + NEMO_WHITE_SPACE, + ).optimize() + sigma_star = pynini.closure(any_char).optimize() + + word_sep = pynini.union(pynini.accep(" "), pynini.accep(",")).optimize() + + input_pattern = pynini.union( + context_keywords_fsa + word_sep + sigma_star, + sigma_star + pynini.accep(" ") + context_keywords_fsa, + sigma_star + 
pynini.accep(" ") + context_keywords_fsa + word_sep + sigma_star, + context_keywords_fsa, + ).optimize() + + address_graph = pynini.compose(input_pattern, address_content).optimize() + + address_graph = ( + pynutil.insert('units: "address" cardinal { integer: "') + + convert_space(address_graph) + + pynutil.insert('" } preserve_order: true') + ) + address_graph = pynutil.add_weight(address_graph, 1.05).optimize() + + graph = ( + graph_measurements + | graph_quarterly_measurements + | graph_exception_bai + | address_graph + | structured_address_graph + ) self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py index 7fa59ee26..e79c9d0b3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py @@ -21,7 +21,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class MoneyFst(GraphFst): @@ -40,14 +40,22 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.graph_no_exception + cardinal_single_and_double_digit_graph = cardinal.graph_digit | cardinal.graph_teens_and_ties decimal_graph = decimal.final_graph_wo_negative currency_graph = pynini.string_file(get_abs_path("data/money/currency.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.integer_quarterly_measures = pynutil.insert("integer_part: \"") + cardinal_single_and_double_digit_graph + self.integer_paune = pynutil.insert("integer_part: \"") + paune_graph self.paise = 
pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\"") self.fraction = decimal_graph self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") aur = pynutil.delete("और") + delete_hundred = pynutil.delete("सौ") + delete_lakh = pynutil.delete("लाख") + delete_hazar = pynutil.delete("हजार") | pynutil.delete("हज़ार") + delete_crore = pynutil.delete("करोड़") | pynutil.delete("करोड़") graph_currency_decimal = self.fraction + delete_extra_space + self.currency graph_currency_cardinal = self.integer + delete_extra_space + self.currency @@ -60,8 +68,186 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + pynutil.delete(currency_graph) ) + # cases for saade,sava with teens and ties + graph_saade_teens_ties = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_sava_teens_ties = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"२५\"") + + delete_extra_space + + self.currency + ) + graph_dedh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_dhaai = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + + graph_exceptions_teens_ties = graph_saade_teens_ties | graph_sava_teens_ties | graph_dedh | graph_dhaai + + # cases for saade,sava,paune,dedh and dhaai with hundreds and thousands + graph_exceptions = self.integer + delete_extra_space + self.currency + + # exceptions with lakhs + 
graph_saade_lakh = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_sava_lakh = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_paune_lakh = ( + pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dedh_lakh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dhaai_lakh = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + + graph_exceptions_lakhs = ( + graph_saade_lakh | graph_sava_lakh | graph_paune_lakh | graph_dedh_lakh | graph_dhaai_lakh + ) + + # exceptions with crores + graph_saade_crore = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_sava_crore = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + 
delete_crore + + delete_extra_space + + self.currency + ) + graph_paune_crore = ( + pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dhaai_crore = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dedh_crore = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + + graph_exceptions_crores = ( + graph_saade_crore | graph_sava_crore | graph_paune_crore | graph_dedh_crore | graph_dhaai_crore + ) + + graph_quarterly_measures = ( + graph_exceptions_teens_ties | graph_exceptions | graph_exceptions_lakhs | graph_exceptions_crores + ) - graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa + graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py index d6f4d59ac..473055891 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py @@ -83,5 +83,6 @@ def __init__(self, cardinal: GraphFst): rule = pynini.cdrewrite(morpho_graph, pynini.closure(NEMO_HI_DIGIT), pynini.union("[EOS]", " "), NEMO_SIGMA) final_graph = pynutil.insert("integer: \"") + graph @ rule - final_graph = self.add_tokens(final_graph) + 
self.final_graph = self.add_tokens(final_graph) + final_graph = self.final_graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py new file mode 100644 index 000000000..c191866b3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + GraphFst, + delete_space, +) +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class PercentageFst(GraphFst): + """ + Finite state transducer for classifying percentages + e.g. 
बीस प्रतिशत -> percentage { integer: "२०" percent: "%" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal): + super().__init__(name="percentage", kind="classify") + + graph_percent_symbol = pynini.string_file( + get_abs_path("data/percentage/percent_symbol.tsv") + ).invert() + + integer_graph = cardinal.graph_no_exception + + final_graph = ( + pynutil.insert("integer: \"") + + integer_graph + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" percent: \"") + + graph_percent_symbol + + pynutil.insert("\"") + ) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 386f1353d..ad584b58b 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -15,144 +15,146 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_CHAR, + NEMO_WHITE_SPACE, + GraphFst, + delete_space, +) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path +shunya = ( + pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")).invert() +) +digit_without_shunya = ( + pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + | pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")).invert() +) +digit = digit_without_shunya | shunya -class TelephoneFst(GraphFst): - """ - Finite state transducer for classifying telephone numbers, e.g. - e.g. 
प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } - Args: - Cardinal: CardinalFst - """ +def get_context(keywords: list): + keywords = pynini.union(*keywords) - def __init__(self, cardinal: GraphFst): - super().__init__(name="telephone", kind="classify") + # Load Hindi digits from TSV files + hindi_digits = ( + pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + ).project("output") - hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() - hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + # Load English digits from TSV files + english_digits = ( + pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")) + | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")) + ).project("output") - english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + all_digits = hindi_digits | english_digits - country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() - country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - country_code_graph_single_digits |= pynini.string_file( - get_abs_path("data/telephone/eng_to_hindi_digit.tsv") - ).invert() + non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE)) + word = pynini.closure(non_digit_char, 1) + NEMO_WHITE_SPACE + window = pynini.closure(word, 0, 5) + before = (keywords + window).optimize() + after = (window + keywords).optimize() - country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file( - get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") - ).invert() + return before, after - self.hindi_digit = ( - pynutil.insert("number_part: \"") - + 
pynini.closure(hindi_digit_graph + delete_space, 0, 9) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.english_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 9) - + english_digit_graph - + delete_space - + pynutil.insert("\" ") - ) - self.country_code_with_single_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) - + pynutil.insert("\" ") - ) - self.country_code_with_double_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) - + pynutil.insert("\" ") - ) - self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits +def generate_context_graph(context_keywords, length): + context_before, context_after = get_context(context_keywords) + digits = pynini.closure(digit + delete_space, length - 1, length - 1) + digit - # two, three, four-digit extension code with zero - self.city_code_hindi = ( - pynutil.insert("extension: \"") - + pynini.closure(hindi_digit_graph + delete_space, 2, 5) - + pynutil.insert("\" ") - ) - self.city_code_english = ( - pynutil.insert("extension: \"") - + pynini.closure(english_digit_graph + delete_space, 2, 5) - + pynutil.insert("\" ") - ) + graph_after_context = digits + NEMO_WHITE_SPACE + context_after + graph_before_context = context_before + NEMO_WHITE_SPACE + digits + graph_without_context = digits - self.city_extension = self.city_code_hindi | self.city_code_english + return ( + pynutil.insert("number_part: \"") + + (graph_before_context | graph_after_context | graph_without_context) + + pynutil.insert("\" ") + ).optimize() - # 7-digit landline graph in hindi and english digits - self.landline_hindi = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 7, 7) - + pynutil.insert("\" ") - ) - self.landline_english = ( - pynutil.insert("number_part: \"") - + 
pynini.closure(english_digit_graph + delete_space, 7, 7) - + pynutil.insert("\" ") - ) - self.landline = self.landline_hindi | self.landline_english +def generate_pincode(context_keywords): + return generate_context_graph(context_keywords, 6) - self.pincode_in_hindi = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 5) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.pincode_in_english = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 5) - + english_digit_graph - + pynutil.insert("\" ") - ) - self.credit_card_last_digits_hindi = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 3) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.credit_card_last_digits_english = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 3) - + english_digit_graph - + pynutil.insert("\" ") - ) +def generate_credit(context_keywords): + return generate_context_graph(context_keywords, 4) - delete_plus = pynini.union( - pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") - ) - delete_zero = pynini.union( - pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") - ) +def generate_mobile(context_keywords): + context_before, context_after = get_context(context_keywords) - graph_number_with_hindi_digit = ( - delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit - ) - graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit + country_code = pynini.cross("प्लस", "+") + pynini.closure(delete_space + digit, 2, 2) + NEMO_WHITE_SPACE + graph_country_code = ( + pynutil.insert("country_code: \"") + + (context_before + NEMO_WHITE_SPACE) ** (0, 1) + + country_code + + pynutil.insert("\" ") + ) + + number_part = digit_without_shunya + delete_space + 
pynini.closure(digit + delete_space, 8, 8) + digit + graph_number = ( + pynutil.insert("number_part: \"") + + number_part + + pynini.closure(NEMO_WHITE_SPACE + context_after, 0, 1) + + pynutil.insert("\" ") + ) + + graph = (graph_country_code + graph_number) | graph_number + return graph.optimize() + + +def generate_telephone(context_keywords): + context_before, context_after = get_context(context_keywords) + + landline = shunya + delete_space + pynini.closure(digit + delete_space, 9, 9) + digit + landline_with_context_before = context_before + NEMO_WHITE_SPACE + landline + landline_with_context_after = landline + NEMO_WHITE_SPACE + context_after + + return ( + pynutil.insert("number_part: \"") + + (landline | landline_with_context_before | landline_with_context_after) + + pynutil.insert("\" ") + ) + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying telephone numbers, e.g. + e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } + Args: + Cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="classify") + + # Load context cues from TSV file + context_cues = pynini.string_file(get_abs_path("data/telephone/context_cues.tsv")) + + # Extract keywords for each category + mobile_keywords = pynini.compose(pynutil.delete("mobile"), context_cues).project("output").optimize() + + landline_keywords = pynini.compose(pynutil.delete("landline"), context_cues).project("output").optimize() - graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline + pincode_keywords = pynini.compose(pynutil.delete("pincode"), context_cues).project("output").optimize() - graph_pincode = self.pincode_in_hindi | self.pincode_in_english + credit_keywords = pynini.compose(pynutil.delete("credit"), context_cues).project("output").optimize() - graph_credit_card_last_digits = self.credit_card_last_digits_hindi | 
self.credit_card_last_digits_english + # Convert FSTs to keyword lists for generate_* functions + mobile = generate_mobile([mobile_keywords]) + landline = generate_telephone([landline_keywords]) + pincode = generate_pincode([pincode_keywords]) + credit = generate_credit([credit_keywords]) graph = ( - graph_number_with_hindi_digit - | graph_number_with_english_digit - | graph_landline_with_extension - | graph_pincode - | graph_credit_card_last_digits + pynutil.add_weight(mobile, 0.7) + | pynutil.add_weight(landline, 0.8) + | pynutil.add_weight(credit, 0.9) + | pynutil.add_weight(pincode, 1) ) - final_graph = self.add_tokens(graph) - self.fst = final_graph + self.final = graph.optimize() + self.fst = self.add_tokens(self.final) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py index 6bfc51af7..942b5022b 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py @@ -15,7 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, + GraphFst, + delete_extra_space, + delete_space, + insert_space, + integer_to_devanagari, +) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -29,12 +36,15 @@ class TimeFst(GraphFst): time: TimeFst """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") - hour_graph = pynini.string_file(get_abs_path("data/time/hour.tsv")).invert() - minute_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() - second_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() + hour_graph = cardinal.graph_digit | 
cardinal.graph_teens_and_ties + time_hours = pynini.union(*[integer_to_devanagari(i) for i in range(1, 25)]).optimize() + hour_graph = hour_graph @ time_hours + + cardinal_graph = cardinal.graph_single_digit_with_zero | cardinal.graph_teens_and_ties + paune_hour_graph = pynini.string_file(get_abs_path("data/time/hour_for_paune.tsv")).invert() delete_baje = pynini.union( pynutil.delete("बजके") | pynutil.delete("बजकर") | pynutil.delete("बजे") | pynutil.delete("घंटा") @@ -44,8 +54,9 @@ def __init__(self): delete_second = pynutil.delete("सेकंड") self.hour = pynutil.insert("hours: \"") + hour_graph + pynutil.insert("\" ") - self.minute = pynutil.insert("minutes: \"") + minute_graph + pynutil.insert("\" ") - self.second = pynutil.insert("seconds: \"") + second_graph + pynutil.insert("\" ") + self.paune_hour = pynutil.insert("hours: \"") + paune_hour_graph + pynutil.insert("\" ") + self.minute = pynutil.insert("minutes: \"") + cardinal_graph + pynutil.insert("\" ") + self.second = pynutil.insert("seconds: \"") + cardinal_graph + pynutil.insert("\" ") # hour minute second graph_hms = ( @@ -63,17 +74,20 @@ def __init__(self): ) # hour minute and hour minute without "baje and minat" - graph_hm = ( + graph_hm = pynutil.add_weight( self.hour + delete_space + pynini.closure(delete_baje, 0, 1) + delete_space + self.minute - + pynini.closure(delete_space + delete_minute, 0, 1) + + pynini.closure(delete_space + delete_minute, 0, 1), + 0.01, ) # hour second - graph_hs = self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second + graph_hs = pynutil.add_weight( + self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second, 0.01 + ) # minute second graph_ms = ( @@ -83,7 +97,61 @@ def __init__(self): # hour graph_hour = self.hour + delete_space + delete_baje - graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + 
self.hour + + delete_space + + pynutil.insert(" minutes: \"३०\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.hour + + delete_space + + pynutil.insert(" minutes: \"१५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + self.paune_hour + + delete_space + + pynutil.insert(" minutes: \"४५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + delete_baje + + pynutil.insert("hours: \"१\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + delete_baje + + pynutil.insert("hours: \"२\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_quarterly_measures = ( + graph_dedh + | graph_dhaai + | ((graph_saade | graph_sava | graph_paune) + pynini.closure(delete_space + delete_baje)) + ) + + graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index b3fcb0c2d..aef975b8e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -26,6 +26,7 @@ generator_main, ) from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.electronic import ElectronicFst from nemo_text_processing.inverse_text_normalization.hi.taggers.date 
import DateFst from nemo_text_processing.inverse_text_normalization.hi.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.fraction import FractionFst @@ -79,9 +80,9 @@ def __init__( decimal_graph = decimal.fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst - date = DateFst(cardinal) + date = DateFst(cardinal, ordinal) date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal) time_graph = time.fst measure = MeasureFst(cardinal, decimal) measure_graph = measure.fst @@ -89,6 +90,8 @@ def __init__( money_graph = money.fst telephone = TelephoneFst(cardinal) telephone_graph = telephone.fst + electronic = ElectronicFst() + electronic_graph = electronic.fst punct_graph = PunctuationFst().fst whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst @@ -103,6 +106,7 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(electronic_graph, 0.5) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(whitelist_graph, 1.01) ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index eacfb5765..7a5c10c4c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -30,7 +30,7 @@ class DateFst(GraphFst): date { day: "५" month: "जनवरी" year: "२०१२" preserve_order: true } -> ५ जनवरी २०१२ """ - def __init__(self): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="verbalize") month = ( pynutil.delete("month:") @@ -61,6 +61,21 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) + era = ( + pynutil.delete("era:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + 
) + morpho_features = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + graph_fy = year graph_fy |= period + delete_space + year @@ -100,6 +115,11 @@ def __init__(self): # year range graph_year_range = year + # ordinal century + graph_ordinal_century = era + delete_space + morpho_features + delete_extra_space + period + + # graph_ordinal_range = graph_ordinal + delete_extra_space + period + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -122,6 +142,7 @@ def __init__(self): | graph_dmyc | graph_myc | graph_year_range + | graph_ordinal_century ) + delete_space + optional_preserve_order diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/electronic.py new file mode 100644 index 000000000..883fd7909 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/electronic.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class ElectronicFst(GraphFst): + """ + ITN verbalizer for electronic expressions. + All fields pass content through unchanged. 
+ """ + + def __init__(self): + super().__init__(name="electronic", kind="verbalize") + + def field_graph(field_name: str) -> pynini.Fst: + return ( + pynutil.delete(f"{field_name}:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + ip_graph = field_graph("ip") + domain_graph = field_graph("domain") + username_graph = field_graph("username") + path_graph = field_graph("path") + + email_graph = ( + username_graph + + pynutil.insert("@") + + delete_space + + domain_graph + ) + + # email before domain (both use domain: field) + graph = ip_graph | email_graph | path_graph | domain_graph + + self.fst = self.delete_tokens(graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py index 0fa7e97bd..45b5832b5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py @@ -16,6 +16,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py index 1fc9ba373..dc8592ebf 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py @@ -23,6 +23,7 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. 
measure { negative: "true" cardinal { integer: "१२" } units: "kg" } -> -१२ kg measure { decimal { negative: "true" integer_part: "१२" fractional_part: "५०"} units: "kg" } -> -१२.५० kg + measure { units: "address" cardinal { integer: "७०० ओक स्ट्रीट" } preserve_order: true } -> ७०० ओक स्ट्रीट Args: decimal: DecimalFst @@ -32,11 +33,13 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="verbalize") optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1) + + # Exclude "address" from regular unit matching so address path handles it unit = ( pynutil.delete("units:") + delete_space + pynutil.delete("\"") - + pynini.closure(NEMO_CHAR - " ", 1) + + pynini.difference(pynini.closure(NEMO_CHAR - " ", 1), pynini.accep("address")) + pynutil.delete("\"") + delete_space ) @@ -52,7 +55,31 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_decimal = ( pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") ) + graph_exception_bai = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit + graph |= graph_exception_bai + pynini.closure(delete_space + pynutil.insert(" ") + unit) + + # Address verbalizer: units: "address" cardinal { integer: "..." 
} preserve_order: true + preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space + address_graph = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete("\"address\"") + + delete_space + + graph_cardinal + + delete_space + + pynini.closure(preserve_order) + ) + graph |= address_graph + delete_tokens = self.delete_tokens(graph) self.decimal = graph_decimal self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py index d6c4e0025..94f280798 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py @@ -40,6 +40,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - + self.numbers = graph + graph = graph.optimize() delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py new file mode 100644 index 000000000..2267a8761 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class PercentageFst(GraphFst): + """ + Finite state transducer for verbalizing percentage + e.g. percentage { integer: "२०" percent: "%" } -> २०% + """ + + def __init__(self): + super().__init__(name="percentage", kind="verbalize") + + integer_part = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + percent_part = ( + pynutil.delete("percent:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + graph = integer_part + delete_space + percent_part + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index 3f4b4de1f..fb2e5d618 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -32,7 +32,6 @@ def __init__(self, cardinal: GraphFst): number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") optional_country_code = pynini.closure( pynutil.delete("country_code: \"") - + pynutil.insert("+") + delete_space + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 165fe7a7e..ff8aa8661 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -15,6 +15,7 @@ from 
nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.electronic import ElectronicFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.fraction import FractionFst @@ -38,15 +39,17 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst decimal = DecimalFst() decimal_graph = decimal.fst fraction_graph = FractionFst().fst - date_graph = DateFst().fst + date_graph = DateFst(cardinal, ordinal).fst time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst telephone_graph = TelephoneFst(cardinal).fst + electronic_graph = ElectronicFst().fst word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst @@ -62,5 +65,6 @@ def __init__(self): | measure_graph | money_graph | telephone_graph + | electronic_graph ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py index 17dfebf64..acaa71d87 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import 
NEMO_CHAR, GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst @@ -42,4 +42,11 @@ def __init__(self): + pynutil.delete("}") ) graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space - self.fst = graph + # Remove space before sentence-ending punctuation: "CODE ." -> "CODE." + remove_space_before_punct = pynini.cdrewrite( + pynutil.delete(" "), + "", + pynini.union(".", "।", ",", "!", "?", ";", ":"), + pynini.closure(NEMO_CHAR), + ) + self.fst = graph @ remove_space_before_punct diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index 2e9626d9e..1e3e811af 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -14,7 +14,10 @@ from typing import Dict -from cdifflib import CSequenceMatcher +try: + from cdifflib import CSequenceMatcher +except ImportError: + from difflib import SequenceMatcher as CSequenceMatcher from nemo_text_processing.utils.logging import logger diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..66c2fb7d8 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt @@ -0,0 +1,113 @@ +ए ए एस सी डॉट एन आई सी डॉट इन फॉरवर्ड स्लैश~aasc.nic.in/ +ए आर एन ओ एल डी सी ए टी एच वाई एट एम आई एल एल ई आर हाइफ़न सी ए आर आर डॉट कॉम~arnoldcathy@miller-carr.com +ए डी एच आई आई एस एच एट जीमेल डॉट कॉम~adhiish@gmail.com +ए एल आई आई हाइफ़न जी ए ए एन जी यू एल आई आई डॉट कॉम~alii-gaangulii.com +एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश ए आर ओ डी डी डी एच ए ए डॉट नेट 
फॉरवर्ड स्लैश~https://arodddhaa.net/ +एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश ए एल आई आई हाइफ़न एन यू यू आर ए ए एन आई आई डॉट कॉम फॉरवर्ड स्लैश टैग्स फॉरवर्ड स्लैश अबाउट फॉरवर्ड स्लैश~https://alii-nuuraanii.com/tags/about/ +एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश ई एन डॉट विकिपीडिया डॉट ऑर्ग फॉरवर्ड स्लैश विकी फॉरवर्ड स्लैश~https://en.wikipedia.org/wiki/ +एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश एम ए वाई एस हाइफ़न ई डी वार्ड एस डॉट कॉम फॉरवर्ड स्लैश~https://mays-edwards.com/ +एच टी टी पी कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश एम एच ए ए डी ई वी डॉट ऑर्ग फॉरवर्ड स्लैश अबाउट डॉट एच टी एम एल~http://mhaadev.org/about.html +सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप बैकवर्ड स्लैश ऑडियो अंडर स्कोर फ़ाइल अंडर स्कोर दो डॉट एम पी तीन~C:\Users\HP\Desktop\Audio_file_2.mp3 +सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप बैकवर्ड स्लैश चैप्टर स्पेस एक स्पेस पाइथन डॉट पी डी एफ~C:\Users\HP\Desktop\chapter 1 python.pdf +होम / लिब्रे ऑफिस - इंप्रेस - टेम्पलेट्स - मास्टर /~home/libreoffice-impress-templates-master/ +होम / डेस्कटॉप~home/desktop +फॉरवर्ड स्लैश ई टी सी फॉरवर्ड स्लैश सी ए हाइफ़न सी ई आर टी आई एफ आई सी ए टी ई एस डॉट सी ओ एन एफ डॉट डी पी के जी हाइफ़न ओ एल डी~/etc/ca-certificates.conf.dpkg-old +बैकअप्स / टेम्प~backups/temp +डब्ल्यू डब्ल्यू डब्ल्यू डॉट ए एल यू एन आई वी डॉट ऑर्ग फॉरवर्ड स्लैश~www.aluniv.org/ +डब्ल्यू डब्ल्यू डब्ल्यू डॉट ए एम यू डॉट ए सी डॉट इन~www.amu.ac.in +लैपटॉप हाइफ़न चार तीन डॉट सानशेज़ हाइफ़न क्रॉफोर्ड~laptop-43.sanchez-crawford +~ फॉरवर्ड स्लैश वर्क फॉरवर्ड स्लैश ओ एस अंडर स्कोर सी ओ ओ आर डी अंडर स्कोर ओ आर डी आई एन ए एन सी ई अंडर स्कोर सर्वे डॉट एच~~/work/os_coord_ordinance_survey.h +आई ए तीन दो~IA32 +आई ई एल एफ शून्य शून्य चार~IELF004 +आई ई नौ जे~IE9J +आई एन छह तीन आठ नौ शून्य नौ शून्य~IN6389090 +आठ सात बी एफ जे यू आठ नौ छह शून्य वी एफ~87BFJU8960VF +आठ नौ सात छह एन एच जे ए के~8976NHJAK +आठ यू~8U +आर एस दो~RS2 +आर ज़ेड सी टी तीन एक चार आठ जी सात टी~RZCT3148G7T +आर तीन आठ शून्य~R380 
+आर तीन एक शून्य~R310 +आर दो पाँच शून्य एस प्रो~R250S प्रो +ई आठ जे तीन~E8J3 +ई ई ई बी दो एक आठ~EEEB218 +ई ई ए डी शून्य पाँच एक~EEAD051 +ए ई ए डी आठ पाँच शून्य~AEAD850 +ए ए डी ई चार शून्य चार~AADE404 +ए एक शून्य एक आठ एस~A1018S +एक दो शून्य एम एम~120MM +एक्स ए पाँच तीन~XA53 +एक्स आठ छह~X86 +एक्स दो पाँच~X25 +एच ए जे डी एफ पाँच चार तीन आठ जे~HAJDF5438J +एन एफ डी एल दो तीन एक~NFDL231 +एन ओ आठ आठ~NO88 +एन नौ पाँच~N95 +एफ ई ए ज़ेड एफ आठ छह दो शून्य जे~FEAZF8620J +एफ ए ए एच एफ आठ छह चार तीन के~FAAHF8643K +एफ आठ~F8 +एम एम एक्स दो दो शून्य नौ शून्य आठ~MMX220908 +एम ओ यू दो शून्य शून्य दो शून्य~MOU20020 +एम के दो~MK2 +एल ए तीन छह~LA36 +एल ए तीन तीन~LA33 +एल तीन जी ए एक~L3GA1 +एल वाई छह चार~LY64 +एस आई एस दो तीन दो के दो~SIS232K2 +एस जे चार छह नौ~SJ469 +एस जे यू टी ए सात पाँच चार नौ एल~SJUTA7549L +एस पाँच तीन~S53 +सी एच तीन सी ओ ओ –~CH3COO– +ऑडी आठ शून्य बटा नौ शून्य बी चार~ऑडी 80/90 B4 +ऑडी ए छह~ऑडी A6 +ऑपपो एन्को एक्स दो~ऑपपो एन्को X2 +ऑपपो एम दो~ऑपपो M2 +ओ डी एम ई आठ एम एफ~ODME8MF +ओ डी शून्य दो सात छह तीन पाँच~OD027635 +ओ डी नौ तीन~OD93 +ओ पाँच डी पी एक~O5DP1 +के एन डब्ल्यू तीन तीन चार~KNW334 +के एल एम आठ एक~KLM81 +के डी एच ई आठ तीन एन नौ~KDHE83N9 +के डी तीन आठ चार~KD384 +सी ए एन ओ एन ए सात पाँच~Canon A75 +सी छह एच एक दो ओ छह~C6H12O6 +छह एस~6S +ज़ेड एन एस ओ चार~ZnSO4 +जी छह सी ओ आठ~G6CO8 +जी दो सी एम नौ~G2CM9 +जी एस ए टी हाइफ़न एक आठ~GSAT-18 +जे आठ एफ छह~J8F6 +जे आठ तीन डी के~J83DK +जे आर सात चार~JR74 +जे एक ओ पी नौ~J1OP9 +ज़ेड एक्स आठ शून्य एक नौ आठ शून्य~ZX80 1980 +ज़ेड एक्स आठ एक~ZX81 +ज़ेड एम आठ छह पाँच तीन आठ~ZM86538 +टी दो आठ एस~T28S +सी सात एच आठ~C7H8 +डब्ल्यू डब्ल्यू डब्ल्यू दो नौ शून्य एक एफ एन~WWW2901FN +डब्ल्यू डब्ल्यू डब्ल्यू सात सात आठ~WWW778 +डी आठ जे डी जे डब्ल्यू~D8JDJW +डी आठ जे शून्य डी के सात डी~D8J0DK7D +डी आठ जे सात डी एच तीन~D8J7DH3 +पाँच सी~5C +पी आर तीन दो दो एक सात सात~PR322177 +पी एम दो प्वाइंट पाँच~PM2.5 +पी के छह छह सात आठ~PK6678 +पी दो एम सी दो~P2MC2 +पी नौ पाँच~P95 +बी ई ई सी चार तीन शून्य~BEEC430 +बी ई ए डी आठ दो सात~BEAD827 +यू आठ जे 
डब्ल्यू~U8JW +यू आठ तीन जे एस~U83JS +यू ई आठ चार~UE84 +वाई आर सात तीन एच~YR73H +वाई छह पाँच~Y65 +वी क्यू सात शून्य एक जे~VQ701J +वी छह~V6 +वी दो तीन~V23 +टी हाइफ़न एक्स चार शून्य शून्य~T-X400 +टी हाइफ़न सात दो~T-72 +दो पाँच पाँच डॉट एक छह आठ डॉट चार छह डॉट एक सात पाँच~255.168.46.175 +आठ नौ डॉट तीन सात डॉट दो एक दो डॉट एक पाँच~89.37.212.15 diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt new file mode 100644 index 000000000..622cbd791 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt @@ -0,0 +1,12 @@ +बीस प्रतिशत~२०% +पचास प्रतिशत~५०% +दस प्रतिशत~१०% +सौ प्रतिशत~१००% +पच्चीस प्रतिशत~२५% +पाँच प्रतिशत~५% +तीन प्रतिशत~३% +सत्तर परसेंट~७०% +एक प्रतिशत~१% +शून्य प्रतिशत~०% +पाँच सौ फ़ीसदी~५००% +तेरह प्रतिशत~१३% \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_electronic.py b/tests/nemo_text_processing/hi/test_electronic.py new file mode 100644 index 000000000..6e23f7f39 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_electronic.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestElectronic: + """ + ITN Electronic test suite — Hindi. 
+ + Covers: email · https/http/www URL · plain domain · Windows path · + Unix path · IP address · alphanumeric codes · + chemical formulas (named + subscript) · + subdomain patterns (srv- db- lt- web- laptop- desktop- email-) + + Test cases: hi/data_inverse_text_normalization/test_cases_electronic.txt + Format per line: spoken_hindi~expected_written_form + """ + + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand( + parse_test_case_file('hi/data_inverse_text_normalization/test_cases_electronic.txt') + ) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected, ( + f"\nInput: {test_input}" + f"\nExpected: {expected}" + f"\nGot: {pred}" + ) diff --git a/tests/nemo_text_processing/hi/test_percentage.py b/tests/nemo_text_processing/hi/test_percentage.py new file mode 100644 index 000000000..cec684241 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_percentage.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestPercentage: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index a365a834d..07b7d403b 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -6,15 +6,30 @@ runtest () { input=$1 cd /workspace/sparrowhawk/documentation/grammars + # per-case timeout (seconds): bounds any input that causes a lattice blow-up + # so the suite can never hang forever. Override with CASE_TIMEOUT env var. + : ${CASE_TIMEOUT:=20} + total=$(wc -l < "$input") + n=0 + # read test file while read testcase; do + n=$((n+1)) IFS='~' read spoken written <<< $testcase - denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # run with a timeout; pipefail makes $? report timeout's exit status (124) instead of tail's 0, so a timed-out case is marked and fails loudly + denorm_pred=$(set -o pipefail; echo $spoken | timeout ${CASE_TIMEOUT} normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + if [ $?
-eq 124 ]; then + denorm_pred="<>" + fi # trim white space written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + # progress to stderr so you can see it moving (not part of test output) + echo "[$n/$total] '$spoken' -> '$denorm_pred'" >&2 + # input expected actual assertEquals "$spoken" "$written" "$denorm_pred" done < "$input" @@ -78,6 +93,11 @@ testITNWhiteList() { runtest $input } +testITNElectronic() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_electronic.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 5326784e9..0e06d5945 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str): Prepares tests pairs for ITN and TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: for line in f: components = line.strip("\n").split("~") spoken = components[0] diff --git a/tools/text_processing_deployment/Dockerfile b/tools/text_processing_deployment/Dockerfile index be6fedcda..e36f438e5 100644 --- a/tools/text_processing_deployment/Dockerfile +++ b/tools/text_processing_deployment/Dockerfile @@ -16,7 +16,10 @@ # Dockerfile for C++ (inverse) text normalization backend Sparrowhawk https://github.com/google/sparrowhawk # set base image (host OS) -FROM continuumio/miniconda3 +# Pinned to 4.6.14 (Debian stretch / g++ 6.3). The default `latest` tag now ships +# g++ 14, which cannot compile openfst 1.7.9's headers (fst/fst.h) used by +# Sparrowhawk's ./configure. g++ 6 compiles them cleanly. 
+FROM continuumio/miniconda3:4.6.14 # set the working directory in the container @@ -24,17 +27,23 @@ WORKDIR /workspace # install dependencies RUN echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list -RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall build-essential pkg-config git make wget +# stretch is EOL: its archive Release file is expired, so apt needs +# Check-Valid-Until=false and --allow-unauthenticated to install anything. +RUN apt-get -o Acquire::Check-Valid-Until=false update && \ + apt-get install -y --allow-unauthenticated build-essential pkg-config git make wget ca-certificates +# the old base ships outdated CA certs; skip TLS verification for the github/ +# wget downloads below (ephemeral build container, fixed open-source URLs). +RUN git config --global http.sslVerify false RUN conda install conda-build -y RUN conda install -c conda-forge thrax=1.3.4 -y RUN git clone https://github.com/google/re2 RUN cd re2 && git checkout tags/2022-02-01 && make && make install -RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz +RUN wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz RUN tar xzvf protobuf-2.5.0.tar.gz RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig RUN printf "# Conda lib path \n/opt/conda/lib" > /etc/ld.so.conf.d/conda.so.conf ENV CPPFLAGS="-I/opt/conda/include" ENV LDFLAGS="-L/opt/conda/lib" -RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig +RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get -o Acquire::Check-Valid-Until=false install -y --allow-unauthenticated autoconf && bash autoreconf && ./configure && make && make 
install && ldconfig RUN git clone https://github.com/kward/shunit2.git RUN echo "DONE" diff --git a/tools/text_processing_deployment/diag_base.sh b/tools/text_processing_deployment/diag_base.sh new file mode 100644 index 000000000..4a1642319 --- /dev/null +++ b/tools/text_processing_deployment/diag_base.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Test whether base miniconda3:4.6.14 (Debian stretch / g++6) compiles openfst +# 1.7.9's fst/fst.h. Stretch is EOL, so archive.debian.org has an expired Release +# file -> apt needs Check-Valid-Until=false + allow-unauthenticated to install g++. + +echo "########## Base: continuumio/miniconda3:4.6.14 (Debian stretch / g++6) ##########" +docker run --rm continuumio/miniconda3:4.6.14 bash -c ' + echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list + echo "--- apt update ---" + apt-get -o Acquire::Check-Valid-Until=false update 2>&1 | tail -3 + echo "--- install build-essential ---" + apt-get install -y --allow-unauthenticated build-essential 2>&1 | tail -3 + echo "--- installing thrax (conda) ---" + conda install -c conda-forge thrax=1.3.4 -y >/dev/null 2>&1 + echo "--- g++ version ---" + g++ --version | head -1 + echo "--- header present? ---" + ls -la /opt/conda/include/fst/fst.h 2>&1 + printf "#include \nint main(){return 0;}\n" > /tmp/t.cpp + echo "--- compile test ---" + if g++ -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>/tmp/err; then + echo "RESULT: COMPILE_OK (base 4.6.14 fixes the build)" + else + echo "RESULT: COMPILE_FAIL" + head -10 /tmp/err + fi +' diff --git a/tools/text_processing_deployment/diag_fst.sh b/tools/text_processing_deployment/diag_fst.sh new file mode 100644 index 000000000..d70157845 --- /dev/null +++ b/tools/text_processing_deployment/diag_fst.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Diagnostic: why does sparrowhawk's ./configure fail with "fst/fst.h not found"? 
+# The header exists at /opt/conda/include/fst/fst.h, so we test whether it +# actually COMPILES (autoconf reports "not found" when the test-compile fails). + +set -x +docker run --rm continuumio/miniconda3 bash -c ' + conda install -c conda-forge thrax=1.3.4 -y >/dev/null 2>&1 + apt-get update >/dev/null 2>&1 + apt-get install -y g++ >/dev/null 2>&1 + printf "#include \nint main(){return 0;}\n" > /tmp/t.cpp + echo "=== g++ version ===" + g++ --version | head -1 + echo "=== PLAIN (default std) ===" + g++ -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 + echo "=== CPP14 ===" + g++ -std=c++14 -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 + echo "=== CPP17 ===" + g++ -std=c++17 -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 + echo "=== DONE ===" +' diff --git a/v2_helper_test1.py b/v2_helper_test1.py new file mode 100644 index 000000000..d7791753a --- /dev/null +++ b/v2_helper_test1.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +import asyncio +import os +import re +import sys +import time +import argparse +from pathlib import Path + +# Force line buffering for stdout +sys.stdout.reconfigure(line_buffering=True) + +# Auto-detect the Hindi test dir relative to this script's location, +# so it works on any machine regardless of username/OS. +REPO_ROOT = Path(__file__).resolve().parent +HINDI_TEST_DIR = str(REPO_ROOT / "tests" / "nemo_text_processing" / "hi") + os.sep + +# Use the current interpreter's pytest so the right conda env is picked up. 
+PYTEST = [sys.executable, "-m", "pytest"] + + +async def run_subprocess(cmd): + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + env=os.environ.copy(), + ) + stdout, _ = await process.communicate() + output = stdout.decode(errors="replace") if stdout else "" + return process.returncode, output + + +async def collect_test_units(): + """Collect pytest nodeids and group into runnable units (classes or functions).""" + candidate_cmds = [ + PYTEST + [HINDI_TEST_DIR, "--collect-only", "-q"], + PYTEST + [HINDI_TEST_DIR, "--collect-only"], + ] + output = "" + for cmd in candidate_cmds: + returncode, out = await run_subprocess(cmd) + if returncode == 0 and out.strip(): + output = out + break + if not output: + return [] + + units = set() + for raw_line in output.splitlines(): + line = raw_line.strip() + if not line or line.startswith("<") or line.startswith("collected "): + continue + parts = line.split("::") + if len(parts) >= 3: + unit = f"{parts[0]}::{parts[1]}" + elif len(parts) == 2: + unit = line + else: + continue + units.add(unit) + + return sorted(units) + + +async def run_test_unit(nodeid, sem, basetemp_dir, timeout=None): + """Run a class or function nodeid and return results.""" + async with sem: + cmd = PYTEST + [ + nodeid, + "--cpu", + "--disable-warnings", + "--tb=line", + "-q", + "--cache-clear", + f"--basetemp={basetemp_dir}", + ] + + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + + start_t = time.time() + try: + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + env=env, + ) + + try: + if timeout: + stdout, _ = await asyncio.wait_for(process.communicate(), timeout=timeout) + else: + stdout, _ = await process.communicate() + except asyncio.TimeoutError: + process.kill() + await process.communicate() + duration = time.time() - start_t + return { + "unit": nodeid, + "returncode": -1, + 
"output": f"\n\n[X] Timeout after {timeout}s (killed)", + "duration": duration, + } + + duration = time.time() - start_t + output = stdout.decode(errors="replace") if stdout else "" + return { + "unit": nodeid, + "returncode": process.returncode, + "output": output, + "duration": duration, + } + except Exception as e: + return { + "unit": nodeid, + "returncode": -1, + "output": str(e), + "duration": time.time() - start_t, + } + + +def parse_summary_line(output: str): + for line in output.splitlines(): + s = line.strip() + if ("passed" in s or "failed" in s or "skipped" in s) and s.startswith("===") and s.endswith("==="): + return s.strip("=").strip() + return None + + +def parse_test_counts(summary_line): + """Parse counts from pytest summary line like '62 passed, 3 failed in 84.85s'.""" + counts = {"passed": 0, "failed": 0, "skipped": 0, "error": 0} + if not summary_line: + return counts + for key in counts: + match = re.search(rf"(\d+)\s+{key}", summary_line) + if match: + counts[key] = int(match.group(1)) + return counts + + +async def run_all_tests(limit=None, timeout=None): + """Collect tests and run classes/functions in parallel.""" + units = await collect_test_units() + + if not units: + test_dir = Path(HINDI_TEST_DIR) + units = [str(p) for p in sorted(test_dir.glob("test_*.py"))] + if not units: + print(f"[X] No tests found in {HINDI_TEST_DIR}") + return 1 + print(f"[!] Falling back to per-file parallelization ({len(units)} files)") + + if limit: + print(f"[!] 
Limiting to first {limit} units for verification.") + units = units[:limit] + + max_parallel_env = os.environ.get("TEST_PARALLELISM") + try: + default_parallel = 2 + max_parallel = int(max_parallel_env) if max_parallel_env else default_parallel + except ValueError: + max_parallel = 4 + + max_parallel = max(1, min(max_parallel, len(units))) + + print(f"[>] Running {len(units)} units in parallel (concurrency={max_parallel})\n") + start_time = time.time() + + sem = asyncio.Semaphore(max_parallel) + base_root = (REPO_ROOT / ".pytest_basetemp").absolute() + base_root.mkdir(parents=True, exist_ok=True) + + def safe_name(s: str) -> str: + return s.replace("/", "_").replace("\\", "_").replace(":", "_") + + unit_to_temp = {u: str(base_root / f"bt_{safe_name(u)}") for u in units} + + tasks = [run_test_unit(unit, sem, unit_to_temp[unit], timeout=timeout) for unit in units] + + print("\n" + "=" * 70) + print("Test Results by Unit (Streaming)\n") + + completed_results = [] + for task in asyncio.as_completed(tasks): + result = await task + completed_results.append(result) + + unit = result["unit"] + output = result["output"] + returncode = result["returncode"] + duration = result["duration"] + + summary_line = parse_summary_line(output) + status = "[OK]" if returncode == 0 else "[X]" + + line_info = f"{status} {unit} ({duration:.2f}s)" + if summary_line: + print(f"{line_info}\n {summary_line}", flush=True) + else: + tail = "\n".join([l for l in output.splitlines()[-20:] if l.strip()]) + if returncode != 0 and tail: + print(f"{line_info}\n Last lines:\n {tail}", flush=True) + else: + print(f"{line_info}", flush=True) + + elapsed_time = time.time() - start_time + + failed_units = [] + total_passed = total_failed = total_skipped = total_error = 0 + + for result in completed_results: + if result["returncode"] != 0: + failed_units.append(result["unit"]) + counts = parse_test_counts(parse_summary_line(result["output"])) + total_passed += counts["passed"] + total_failed += 
counts["failed"] + total_skipped += counts["skipped"] + total_error += counts["error"] + + print("=" * 70) + print(f"\nTotal time: {elapsed_time:.2f} seconds") + + print(f"\nGrand Total: {total_passed} passed, {total_failed} failed", end="") + if total_skipped > 0: + print(f", {total_skipped} skipped", end="") + if total_error > 0: + print(f", {total_error} error", end="") + print() + + if failed_units: + print(f"\n[X] {len(failed_units)} unit(s) failed:") + for u in failed_units: + print(f" - {u}") + return 1 + else: + print("\n[OK] All tests passed!") + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Parallel Pytest Runner") + parser.add_argument("--limit", type=int, default=None, help="Limit the number of units to run") + parser.add_argument("--timeout", type=int, default=None, help="Timeout in seconds per unit") + args = parser.parse_args() + + exit_code = asyncio.run(run_all_tests(limit=args.limit, timeout=args.timeout)) + sys.exit(exit_code) From 5bd93ca6e1224b746f5862a53b8d75950637d125 Mon Sep 17 00:00:00 2001 From: Mayuri S Date: Fri, 5 Jun 2026 17:11:53 +0530 Subject: [PATCH 2/5] chore: remove laptop setup guide and temp diag scripts Signed-off-by: Mayuri S --- NEW_LAPTOP_SETUP.md | 134 ------------------ tools/text_processing_deployment/diag_base.sh | 27 ---- tools/text_processing_deployment/diag_fst.sh | 21 --- 3 files changed, 182 deletions(-) delete mode 100644 NEW_LAPTOP_SETUP.md delete mode 100644 tools/text_processing_deployment/diag_base.sh delete mode 100644 tools/text_processing_deployment/diag_fst.sh diff --git a/NEW_LAPTOP_SETUP.md b/NEW_LAPTOP_SETUP.md deleted file mode 100644 index f66e32a61..000000000 --- a/NEW_LAPTOP_SETUP.md +++ /dev/null @@ -1,134 +0,0 @@ -# New Laptop Setup — Hindi ITN Electronic Grammar + Sparrowhawk Tests - -Complete from-scratch guide to get this project running on a fresh Windows laptop, -including Docker (via Docker Desktop) and the Sparrowhawk ITN test pipeline. 
- -> All the "WSL" commands run **inside the Ubuntu terminal**, not Windows PowerShell. -> Steps marked "(Windows)" run in **Windows PowerShell**. - ---- - -## 1. Install WSL2 + Ubuntu (Windows PowerShell, as Administrator) - -```powershell -wsl --install -wsl --set-default-version 2 -``` - -Reboot when prompted. On first launch, Ubuntu asks you to create a username + password — set those (remember the password; it's your `sudo` password). - -Verify: -```powershell -wsl --status # should show Default Version: 2 -``` - ---- - -## 2. Install Docker Desktop (Windows) - -We use **Docker Desktop** (not the native WSL engine) because it handles WSL -networking/DNS/MTU automatically — avoids the TLS-timeout / DNS issues. - -1. Download from https://www.docker.com/products/docker-desktop/ → **Windows AMD64** -2. Run the installer → keep **"Use WSL 2 instead of Hyper-V"** CHECKED -3. Launch Docker Desktop (whale icon in the system tray) -4. **Settings → General** → confirm **"Use the WSL 2 based engine"** is checked -5. **Settings → Resources → WSL Integration** → toggle **ON** for your **Ubuntu** distro -6. Click **Apply & Restart** - -Verify (in a FRESH Ubuntu terminal): -```bash -docker context ls # "desktop-linux" should be current (*) -docker run hello-world # should pull + print "Hello from Docker!" -``` - -> Note: with Docker Desktop you do NOT run `sudo service docker start`. -> The daemon runs on the Windows side — just keep Docker Desktop open. - ---- - -## 3. Get the code (WSL Ubuntu) - -```bash -sudo apt update && sudo apt install -y git -git clone https://github.com/mayuris-00/NeMo-text-processing.git -cd NeMo-text-processing - -# Your latest WIP work is on this branch: -git checkout hi-itn-electronic-backup-2026-06-04 -``` - -Branches: -- `hi-itn-electronic-nvidia-base` — main working branch (NVIDIA-original base) -- `hi-itn-electronic-backup-2026-06-04` — WIP backup (electronic grammar work) - ---- - -## 4. 
Set up Python + pynini (WSL) - -`pynini` only builds on Linux — that's why grammar export happens in WSL. - -```bash -sudo apt install -y python3 python3-pip -pip3 install pynini==2.1.5 -pip3 install nemo_text_processing -``` - -(If you prefer an isolated env: `python3 -m venv ~/ntp-venv && source ~/ntp-venv/bin/activate` -before the `pip3 install` lines.) - ---- - -## 5. Run the Sparrowhawk ITN test (WSL) - -```bash -cd ~/NeMo-text-processing/tools/text_processing_deployment - -bash export_grammars.sh \ - --GRAMMARS=itn_grammars \ - --LANGUAGE=hi \ - --INPUT_CASE=lower_cased \ - --MODE=test -``` - -What this does (chain): -1. `pynini_export.py` compiles the Hindi ITN grammars → `.far` files -2. `docker/build.sh` builds the `sparrowhawk` image (~30 min the FIRST time; - it compiles protobuf/re2/sparrowhawk from source) -3. `docker/launch.sh` runs the container → executes - `test_sparrowhawk_inverse_text_normalization.sh` → runs all the ITN tests, - including `testITNElectronic` (your electronic test cases) - -### Just the electronic test cases -The electronic cases live in: -`tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_electronic.txt` - -The `testITNElectronic` function in -`tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh` -runs them. - ---- - -## Troubleshooting - -- **`docker run hello-world` TLS timeout** → Docker Desktop not started, or WSL - integration off. Open Docker Desktop, re-check Settings → Resources → WSL Integration. -- **Sparrowhawk build fails on a `git clone` / download** → network blip; just - re-run the `export_grammars.sh` command (Docker caches completed layers). -- **`pynini` install fails** → make sure you're in WSL/Ubuntu, not Windows Python. -- **Rebuild the image from scratch** → add `FORCE_REBUILD=True` to the export command. -- **Re-use existing `.far` files (skip recompile)** → the script auto-detects them; - to force overwrite, leave `OVERWRITE_CACHE=True` (default). 
- ---- - -## Quick reference — daily workflow - -```bash -# 1. Make sure Docker Desktop is running (Windows) -# 2. In WSL: -cd ~/NeMo-text-processing -git checkout hi-itn-electronic-backup-2026-06-04 -cd tools/text_processing_deployment -bash export_grammars.sh --GRAMMARS=itn_grammars --LANGUAGE=hi --INPUT_CASE=lower_cased --MODE=test -``` diff --git a/tools/text_processing_deployment/diag_base.sh b/tools/text_processing_deployment/diag_base.sh deleted file mode 100644 index 4a1642319..000000000 --- a/tools/text_processing_deployment/diag_base.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Test whether base miniconda3:4.6.14 (Debian stretch / g++6) compiles openfst -# 1.7.9's fst/fst.h. Stretch is EOL, so archive.debian.org has an expired Release -# file -> apt needs Check-Valid-Until=false + allow-unauthenticated to install g++. - -echo "########## Base: continuumio/miniconda3:4.6.14 (Debian stretch / g++6) ##########" -docker run --rm continuumio/miniconda3:4.6.14 bash -c ' - echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list - echo "--- apt update ---" - apt-get -o Acquire::Check-Valid-Until=false update 2>&1 | tail -3 - echo "--- install build-essential ---" - apt-get install -y --allow-unauthenticated build-essential 2>&1 | tail -3 - echo "--- installing thrax (conda) ---" - conda install -c conda-forge thrax=1.3.4 -y >/dev/null 2>&1 - echo "--- g++ version ---" - g++ --version | head -1 - echo "--- header present? 
---" - ls -la /opt/conda/include/fst/fst.h 2>&1 - printf "#include \nint main(){return 0;}\n" > /tmp/t.cpp - echo "--- compile test ---" - if g++ -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>/tmp/err; then - echo "RESULT: COMPILE_OK (base 4.6.14 fixes the build)" - else - echo "RESULT: COMPILE_FAIL" - head -10 /tmp/err - fi -' diff --git a/tools/text_processing_deployment/diag_fst.sh b/tools/text_processing_deployment/diag_fst.sh deleted file mode 100644 index d70157845..000000000 --- a/tools/text_processing_deployment/diag_fst.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Diagnostic: why does sparrowhawk's ./configure fail with "fst/fst.h not found"? -# The header exists at /opt/conda/include/fst/fst.h, so we test whether it -# actually COMPILES (autoconf reports "not found" when the test-compile fails). - -set -x -docker run --rm continuumio/miniconda3 bash -c ' - conda install -c conda-forge thrax=1.3.4 -y >/dev/null 2>&1 - apt-get update >/dev/null 2>&1 - apt-get install -y g++ >/dev/null 2>&1 - printf "#include \nint main(){return 0;}\n" > /tmp/t.cpp - echo "=== g++ version ===" - g++ --version | head -1 - echo "=== PLAIN (default std) ===" - g++ -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 - echo "=== CPP14 ===" - g++ -std=c++14 -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 - echo "=== CPP17 ===" - g++ -std=c++17 -I/opt/conda/include /tmp/t.cpp -o /tmp/t 2>&1 | head -15 - echo "=== DONE ===" -' From 933155b3190517d23b62105c9f6f635c46de2cb5 Mon Sep 17 00:00:00 2001 From: Mayuri S Date: Fri, 5 Jun 2026 17:11:53 +0530 Subject: [PATCH 3/5] chore: remove scratch helper script Signed-off-by: Mayuri S --- v2_helper_test1.py | 250 --------------------------------------------- 1 file changed, 250 deletions(-) delete mode 100644 v2_helper_test1.py diff --git a/v2_helper_test1.py b/v2_helper_test1.py deleted file mode 100644 index d7791753a..000000000 --- a/v2_helper_test1.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python3 -import 
asyncio -import os -import re -import sys -import time -import argparse -from pathlib import Path - -# Force line buffering for stdout -sys.stdout.reconfigure(line_buffering=True) - -# Auto-detect the Hindi test dir relative to this script's location, -# so it works on any machine regardless of username/OS. -REPO_ROOT = Path(__file__).resolve().parent -HINDI_TEST_DIR = str(REPO_ROOT / "tests" / "nemo_text_processing" / "hi") + os.sep - -# Use the current interpreter's pytest so the right conda env is picked up. -PYTEST = [sys.executable, "-m", "pytest"] - - -async def run_subprocess(cmd): - process = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.STDOUT, - env=os.environ.copy(), - ) - stdout, _ = await process.communicate() - output = stdout.decode(errors="replace") if stdout else "" - return process.returncode, output - - -async def collect_test_units(): - """Collect pytest nodeids and group into runnable units (classes or functions).""" - candidate_cmds = [ - PYTEST + [HINDI_TEST_DIR, "--collect-only", "-q"], - PYTEST + [HINDI_TEST_DIR, "--collect-only"], - ] - output = "" - for cmd in candidate_cmds: - returncode, out = await run_subprocess(cmd) - if returncode == 0 and out.strip(): - output = out - break - if not output: - return [] - - units = set() - for raw_line in output.splitlines(): - line = raw_line.strip() - if not line or line.startswith("<") or line.startswith("collected "): - continue - parts = line.split("::") - if len(parts) >= 3: - unit = f"{parts[0]}::{parts[1]}" - elif len(parts) == 2: - unit = line - else: - continue - units.add(unit) - - return sorted(units) - - -async def run_test_unit(nodeid, sem, basetemp_dir, timeout=None): - """Run a class or function nodeid and return results.""" - async with sem: - cmd = PYTEST + [ - nodeid, - "--cpu", - "--disable-warnings", - "--tb=line", - "-q", - "--cache-clear", - f"--basetemp={basetemp_dir}", - ] - - env = os.environ.copy() - 
env["PYTHONUNBUFFERED"] = "1" - - start_t = time.time() - try: - process = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.STDOUT, - env=env, - ) - - try: - if timeout: - stdout, _ = await asyncio.wait_for(process.communicate(), timeout=timeout) - else: - stdout, _ = await process.communicate() - except asyncio.TimeoutError: - process.kill() - await process.communicate() - duration = time.time() - start_t - return { - "unit": nodeid, - "returncode": -1, - "output": f"\n\n[X] Timeout after {timeout}s (killed)", - "duration": duration, - } - - duration = time.time() - start_t - output = stdout.decode(errors="replace") if stdout else "" - return { - "unit": nodeid, - "returncode": process.returncode, - "output": output, - "duration": duration, - } - except Exception as e: - return { - "unit": nodeid, - "returncode": -1, - "output": str(e), - "duration": time.time() - start_t, - } - - -def parse_summary_line(output: str): - for line in output.splitlines(): - s = line.strip() - if ("passed" in s or "failed" in s or "skipped" in s) and s.startswith("===") and s.endswith("==="): - return s.strip("=").strip() - return None - - -def parse_test_counts(summary_line): - """Parse counts from pytest summary line like '62 passed, 3 failed in 84.85s'.""" - counts = {"passed": 0, "failed": 0, "skipped": 0, "error": 0} - if not summary_line: - return counts - for key in counts: - match = re.search(rf"(\d+)\s+{key}", summary_line) - if match: - counts[key] = int(match.group(1)) - return counts - - -async def run_all_tests(limit=None, timeout=None): - """Collect tests and run classes/functions in parallel.""" - units = await collect_test_units() - - if not units: - test_dir = Path(HINDI_TEST_DIR) - units = [str(p) for p in sorted(test_dir.glob("test_*.py"))] - if not units: - print(f"[X] No tests found in {HINDI_TEST_DIR}") - return 1 - print(f"[!] 
Falling back to per-file parallelization ({len(units)} files)") - - if limit: - print(f"[!] Limiting to first {limit} units for verification.") - units = units[:limit] - - max_parallel_env = os.environ.get("TEST_PARALLELISM") - try: - default_parallel = 2 - max_parallel = int(max_parallel_env) if max_parallel_env else default_parallel - except ValueError: - max_parallel = 4 - - max_parallel = max(1, min(max_parallel, len(units))) - - print(f"[>] Running {len(units)} units in parallel (concurrency={max_parallel})\n") - start_time = time.time() - - sem = asyncio.Semaphore(max_parallel) - base_root = (REPO_ROOT / ".pytest_basetemp").absolute() - base_root.mkdir(parents=True, exist_ok=True) - - def safe_name(s: str) -> str: - return s.replace("/", "_").replace("\\", "_").replace(":", "_") - - unit_to_temp = {u: str(base_root / f"bt_{safe_name(u)}") for u in units} - - tasks = [run_test_unit(unit, sem, unit_to_temp[unit], timeout=timeout) for unit in units] - - print("\n" + "=" * 70) - print("Test Results by Unit (Streaming)\n") - - completed_results = [] - for task in asyncio.as_completed(tasks): - result = await task - completed_results.append(result) - - unit = result["unit"] - output = result["output"] - returncode = result["returncode"] - duration = result["duration"] - - summary_line = parse_summary_line(output) - status = "[OK]" if returncode == 0 else "[X]" - - line_info = f"{status} {unit} ({duration:.2f}s)" - if summary_line: - print(f"{line_info}\n {summary_line}", flush=True) - else: - tail = "\n".join([l for l in output.splitlines()[-20:] if l.strip()]) - if returncode != 0 and tail: - print(f"{line_info}\n Last lines:\n {tail}", flush=True) - else: - print(f"{line_info}", flush=True) - - elapsed_time = time.time() - start_time - - failed_units = [] - total_passed = total_failed = total_skipped = total_error = 0 - - for result in completed_results: - if result["returncode"] != 0: - failed_units.append(result["unit"]) - counts = 
parse_test_counts(parse_summary_line(result["output"])) - total_passed += counts["passed"] - total_failed += counts["failed"] - total_skipped += counts["skipped"] - total_error += counts["error"] - - print("=" * 70) - print(f"\nTotal time: {elapsed_time:.2f} seconds") - - print(f"\nGrand Total: {total_passed} passed, {total_failed} failed", end="") - if total_skipped > 0: - print(f", {total_skipped} skipped", end="") - if total_error > 0: - print(f", {total_error} error", end="") - print() - - if failed_units: - print(f"\n[X] {len(failed_units)} unit(s) failed:") - for u in failed_units: - print(f" - {u}") - return 1 - else: - print("\n[OK] All tests passed!") - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Parallel Pytest Runner") - parser.add_argument("--limit", type=int, default=None, help="Limit the number of units to run") - parser.add_argument("--timeout", type=int, default=None, help="Timeout in seconds per unit") - args = parser.parse_args() - - exit_code = asyncio.run(run_all_tests(limit=args.limit, timeout=args.timeout)) - sys.exit(exit_code) From 12e49f0ba82b1aaa330e836c8eecf5bc8062957f Mon Sep 17 00:00:00 2001 From: Mayuri S Date: Fri, 5 Jun 2026 17:12:44 +0530 Subject: [PATCH 4/5] chore(hi): remove percentage class (out of scope for electronic PR) Signed-off-by: Mayuri S --- .../hi/data/percentage/__init__.py | 0 .../hi/data/percentage/percent_symbol.tsv | 4 --- .../test_cases_percentage.txt | 12 -------- .../hi/test_percentage.py | 29 ------------------- 4 files changed, 45 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv delete mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt delete mode 100644 tests/nemo_text_processing/hi/test_percentage.py diff --git 
a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv deleted file mode 100644 index c2fb2992b..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv +++ /dev/null @@ -1,4 +0,0 @@ -% प्रतिशत -% परसेंट -% फ़ीसदी -% फीसदी \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt deleted file mode 100644 index 622cbd791..000000000 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt +++ /dev/null @@ -1,12 +0,0 @@ -बीस प्रतिशत~२०% -पचास प्रतिशत~५०% -दस प्रतिशत~१०% -सौ प्रतिशत~१००% -पच्चीस प्रतिशत~२५% -पाँच प्रतिशत~५% -तीन प्रतिशत~३% -सत्तर परसेंट~७०% -एक प्रतिशत~१% -शून्य प्रतिशत~०% -पाँच सौ फ़ीसदी~५००% -तेरह प्रतिशत~१३% \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_percentage.py b/tests/nemo_text_processing/hi/test_percentage.py deleted file mode 100644 index cec684241..000000000 --- a/tests/nemo_text_processing/hi/test_percentage.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from parameterized import parameterized -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestPercentage: - inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file From d4c9cf6a63829fb7e87c8ffc597f09b3e5839af2 Mon Sep 17 00:00:00 2001 From: Mayuri S Date: Mon, 8 Jun 2026 11:34:30 +0530 Subject: [PATCH 5/5] chore(hi): remove leftover percentage tagger and verbalizer Signed-off-by: Mayuri S --- .../hi/taggers/percentage.py | 54 ------------------- .../hi/verbalizers/percentage.py | 53 ------------------ 2 files changed, 107 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py deleted file mode 100644 index c191866b3..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 
2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - GraphFst, - delete_space, -) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path - - -class PercentageFst(GraphFst): - """ - Finite state transducer for classifying percentages - e.g. बीस प्रतिशत -> percentage { integer: "२०" percent: "%" } - - Args: - cardinal: CardinalFst - """ - - def __init__(self, cardinal): - super().__init__(name="percentage", kind="classify") - - graph_percent_symbol = pynini.string_file( - get_abs_path("data/percentage/percent_symbol.tsv") - ).invert() - - integer_graph = cardinal.graph_no_exception - - final_graph = ( - pynutil.insert("integer: \"") - + integer_graph - + pynutil.insert("\"") - + delete_space - + pynutil.insert(" percent: \"") - + graph_percent_symbol - + pynutil.insert("\"") - ) - - final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py deleted file mode 100644 index 2267a8761..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) - - -class PercentageFst(GraphFst): - """ - Finite state transducer for verbalizing percentage - e.g. percentage { integer: "२०" percent: "%" } -> २०% - """ - - def __init__(self): - super().__init__(name="percentage", kind="verbalize") - - integer_part = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) - - percent_part = ( - pynutil.delete("percent:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) - - graph = integer_part + delete_space + percent_part - - delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file