|
| 1 | +# Arabic S2R using the 3rd-party ArabicTransliterator library: |
| 2 | +# https://github.com/MTG/ArabicTransliterator |
| 3 | + |
| 4 | +--- |
| 5 | +general: |
| 6 | + name: Arabic |
| 7 | + parents: |
| 8 | + - _ignore_base |
| 9 | + description: > |
| 10 | + Version 1.0 (2025-11-29) - Arabic language R2S using a conversion table; S2R using a 3rd party library. |
| 11 | + case_sensitive: false |
| 12 | + |
| 13 | +roman_to_script: |
| 14 | + map: |
| 15 | + |
| 16 | + # Punctuation marks: |
| 17 | + "*": "\u066D" |
| 18 | + ",": "\u060C" |
| 19 | + ";": "\u061B" |
| 20 | + "?": "\u061F" |
| 21 | + |
| 22 | + # Exceptions for specific words |
| 23 | + |
| 24 | + # Allah |
| 25 | + "%alla\u0304h%": "\uFDF2" |
| 26 | + "alla\u0304h": "\u0627\u0644\u0644\u0647" |
| 27 | + |
| 28 | + # Qur'an |
| 29 | + "qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646" |
| 30 | + |
| 31 | + # lillah |
| 32 | + "lilla\u0304h": "\u0644\u0644\u0647" |
| 33 | + |
| 34 | + # billah |
| 35 | + "billa\u0304h": "\u0628\u0644\u0644\u0647" |
| 36 | + |
| 37 | + # Rahman |
| 38 | + "rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646" |
| 39 | + |
| 40 | + # Ruwat |
| 41 | + "ruwa\u0304t": "\u0631\u0648\u0627\u0629" |
| 42 | + |
| 43 | + # Hadha |
| 44 | + "ha\u0304dha\u0304": "\u0647\u0630\u0627" |
| 45 | + |
| 46 | + # Hadhihi |
| 47 | + "ha\u0304dhi\u0304hi": "\u0647\u0630\u0647" |
| 48 | + |
| 49 | + # dhalika |
| 50 | + "dha\u0304lika": "\u0630\u0644\u0643" |
| 51 | + |
| 52 | + # Ibn when it appears in the middle of a name sequence |
| 53 | + "ibn": "\u0628\u0646" |
| 54 | + |
| 55 | + # H[dot below]aya[macron]t |
| 56 | + "h\u0323aya\u0304t": "\u062D\u064A\u0627\u0629" |
| 57 | + |
| 58 | + # "sh[dot below]" as in "Ishaq" |
| 59 | + "sh\u0323": "\u0633\u062D" |
| 60 | + |
| 61 | + # "s[prime]h" combos |
| 62 | + "s\u02B9h": "\u0633\u0647" |
| 63 | + |
| 64 | + # "th[dot below]" |
| 65 | + "th\u0323": "\u062A\u062D" |
| 66 | + |
| 67 | + # "dh[dot below]" |
| 68 | + "dh\u0323": "\u062F\u062D" |
| 69 | + |
| 70 | + # La-hu |
| 71 | + "la-hu": "\u0644\u0647" |
| 72 | + |
| 73 | + # Mi'ah |
| 74 | + "mi\u02BEah": "\u0645\u0627\u0626\u0629" |
| 75 | + "mi\u02BCah": "\u0645\u0627\u0626\u0629" |
| 76 | + |
| 77 | + # Mi'at |
| 78 | + "mi\u02BEat": "\u0645\u0627\u0626\u0629" |
| 79 | + "mi\u02BCat": "\u0645\u0627\u0626\u0629" |
| 80 | + |
| 81 | + # Numbers (I have set these to Hindi, i.e. Arabic-Indic, numerals. Note that Persian and Urdu |
| 82 | + # technically use \u06F0-\u06F9. This needs further discussion with PSD, |
| 83 | + # as RLIN21 used Hindi numerals, while Connexion and Voyager do not.) |
| 84 | + |
| 85 | + # Edition statements with Latin number |
| 86 | + "al-t\u0323ab\u02BBah 1": "\u0627\u0644\u0637\u0628\u0639\u0629 1" |
| 87 | + "al-t\u0323ab\u02BBah 2": "\u0627\u0644\u0637\u0628\u0639\u0629 2" |
| 88 | + "al-t\u0323ab\u02BBah 3": "\u0627\u0644\u0637\u0628\u0639\u0629 3" |
| 89 | + "al-t\u0323ab\u02BBah 4": "\u0627\u0644\u0637\u0628\u0639\u0629 4" |
| 90 | + "al-t\u0323ab\u02BBah 5": "\u0627\u0644\u0637\u0628\u0639\u0629 5" |
| 91 | + "al-t\u0323ab\u02BBah 6": "\u0627\u0644\u0637\u0628\u0639\u0629 6" |
| 92 | + "al-t\u0323ab\u02BBah 7": "\u0627\u0644\u0637\u0628\u0639\u0629 7" |
| 93 | + "al-t\u0323ab\u02BBah 8": "\u0627\u0644\u0637\u0628\u0639\u0629 8" |
| 94 | + "al-t\u0323ab\u02BBah 9": "\u0627\u0644\u0637\u0628\u0639\u0629 9" |
| 95 | + |
| 96 | + # Use basic Arabic-Indic digits \u0660-\u0669 |
| 97 | + "0": "\u0660" |
| 98 | + "1": "\u0661" |
| 99 | + "2": "\u0662" |
| 100 | + "3": "\u0663" |
| 101 | + "4": "\u0664" |
| 102 | + "5": "\u0665" |
| 103 | + "6": "\u0666" |
| 104 | + "7": "\u0667" |
| 105 | + "8": "\u0668" |
| 106 | + "9": "\u0669" |
| 107 | + |
| 108 | + # Hyphenated prefixes: |
| 109 | + "wa-": "\u0648" |
| 110 | + "bi-": "\u0628" |
| 111 | + "al-": "\u0627\u0644" |
| 112 | + "lil-": "\u0644\u0644" |
| 113 | + "li-": "\u0644" |
| 114 | + "la\u0304-": "\u0644" |
| 115 | + "fi\u0304-": "\u0641\u064A" |
| 116 | + "ka-": "\u0643" |
| 117 | + |
| 118 | + # Vowels and vowel/consonant combinations - ta-marbutah at end of word |
| 119 | + "ah%": "\u0629" |
| 120 | + "at%": "\u0629" |
| 121 | + |
| 122 | + # tanwin at end of word |
| 123 | + "an%": "\u0627" |
| 124 | + |
| 125 | + # ayn-alif combo |
| 126 | + "\u02BBa\u0304\u02BE%": "\u0639\u0627\u0621" |
| 127 | + "\u02BBa\u0304\u02BC%": "\u0639\u0627\u0621" |
| 128 | + |
| 129 | + "\u02BBa\u0304": "\u0639\u0627" |
| 130 | + |
| 131 | + "\u02BBi\u0304y": "\u0639\u064A" |
| 132 | + "\u02BBi\u0304": "\u0639\u064A" |
| 133 | + |
| 134 | + "\u02BBu\u0304": "\u0639\u0648" |
| 135 | + "\u02BBu": "\u0639" |
| 136 | + |
| 137 | + "%\u02BBa": "\u0639" |
| 138 | + # "\u02BBa%": "\u0639" |
| 139 | + |
| 140 | + # alif and hamzas for all occasions |
| 141 | + |
| 142 | + # Is truncation necessary? It seems to work fine with it. |
| 143 | + |
| 144 | + "i\u0304\u02BEah%": "\u064A\u0626\u0629" |
| 145 | + "i\u0304\u02BCah%": "\u064A\u0626\u0629" |
| 146 | + |
| 147 | + "i\u0304\u02BEat%": "\u064A\u0626\u0629" |
| 148 | + "i\u0304\u02BCat%": "\u064A\u0626\u0629" |
| 149 | + |
| 150 | + "i\u02BEa\u0304%": "\u0626\u0627" |
| 151 | + "i\u02BCa\u0304%": "\u0626\u0627" |
| 152 | + |
| 153 | + "i\u02BE": "\u0626%" |
| 154 | + "i\u02BC": "\u0626%" |
| 155 | + "a\u0304\u02BEa\u0304": "\u0627\u0621\u0627" |
| 156 | + "a\u0304\u02BCa\u0304": "\u0627\u0621\u0627" |
| 157 | + |
| 158 | + "a\u02BE": "\u0623" |
| 159 | + "a\u02BC": "\u0623" |
| 160 | + "\u02BEi": "\u0626" |
| 161 | + "\u02BCi": "\u0626" |
| 162 | + "\u02BEa\u0304": "\u0622" |
| 163 | + "\u02BCa\u0304": "\u0622" |
| 164 | + "\u02BEa": "\u0623" |
| 165 | + "\u02BCa": "\u0623" |
| 166 | + |
| 167 | + "y\u02BCah": "\u064A\u0626\u0629" |
| 168 | + "y\u02BEah": "\u064A\u0626\u0629" |
| 169 | + |
| 170 | + "y\u02BCat": "\u064A\u0626\u0629" |
| 171 | + "y\u02BEat": "\u064A\u0626\u0629" |
| 172 | + |
| 173 | + # A |
| 174 | + |
| 175 | + "a\u0304\u02BCi\u0304": "\u0627\u0626\u064A" |
| 176 | + "a\u0304\u02BEi\u0304": "\u0627\u0626\u064A" |
| 177 | + |
| 178 | + "a\u0304\u02BCi": "\u0627\u0626" |
| 179 | + "a\u0304\u02BEi": "\u0627\u0626" |
| 180 | + "a\u0304\u02BC": "\u0627\u0621" |
| 181 | + "a\u0304\u02BE": "\u0627\u0621" |
| 182 | + "%a\u0304": "\u0622" |
| 183 | + "a\u0304": "\u0627" |
| 184 | + |
| 185 | + # These next two lines were intended to convert to alif-ayn when it is at |
| 186 | + # the beginning of a word, definite or indefinite (i.e. |
| 187 | + # al-a[ayn]ma[macron]l or [space]a[ayn]ma[macron]l) |
| 188 | + "%a\u02BB": "\u0623\u0639" |
| 189 | + "a\u02BB": "\u0639" |
| 190 | + "a\u0301": "\u0649" |
| 191 | + |
| 192 | + "ayy": "\u064A" |
| 193 | + "%a": "\u0623" |
| 194 | + "a": "" |
| 195 | + |
| 196 | + # I - Capital I at beginning of word is usually alif hamzah-below. |
| 197 | + |
| 198 | + "%i\u0304": "\u064A" |
| 199 | + "i\u0304y": "\u064A" |
| 200 | + "iy": "\u064A" |
| 201 | + "i\u0304": "\u064A" |
| 202 | + "%\u02BBi": "\u0639" |
| 203 | + |
| 204 | + # "i\u02BB": "\u0625\u0639" |
| 205 | + |
| 206 | + "i\u02BE": "\u0626" |
| 207 | + "i\u02BC": "\u0627\u0626" |
| 208 | + |
| 209 | + "%i": "\u0625" |
| 210 | + "i": "" |
| 211 | + |
| 212 | + # U |
| 213 | + |
| 214 | + "u\u0304\u02BE": "\u0624" |
| 215 | + "u\u0304\u02BC": "\u0624" |
| 216 | + "%u\u0304w": "\u0623\u0648" |
| 217 | + "%u\u0304": "\u0623\u0648" |
| 218 | + "u\u0304w": "\u0648" |
| 219 | + "u\u0304": "\u0648" |
| 220 | + "u\u02BE": "\u0624" |
| 221 | + "u\u02BC": "\u0624" |
| 222 | + |
| 223 | + "%u": "\u0623" |
| 224 | + "u": "" |
| 225 | + |
| 226 | + # Consonants, with tashdid added |
| 227 | + |
| 228 | + "bb": "\u0628" |
| 229 | + "b": "\u0628" |
| 230 | + "thth": "\u062B" |
| 231 | + "th": "\u062B" |
| 232 | + "t\u0323t\u0323": "\u0637" |
| 233 | + "t\u0323": "\u0637" |
| 234 | + "tt": "\u062A" |
| 235 | + "t": "\u062A" |
| 236 | + "J": "\u062C" |
| 237 | + "jj": "\u062C" |
| 238 | + "j": "\u062C" |
| 239 | + "h\u0323h\u0323": "\u062D" |
| 240 | + "h\u0323": "\u062D" |
| 241 | + "hh": "\u0647" |
| 242 | + "h": "\u0647" |
| 243 | + "Kh": "\u062E" |
| 244 | + "khkh": "\u062E" |
| 245 | + "kh": "\u062E" |
| 246 | + "kk": "\u0643" |
| 247 | + "k": "\u0643" |
| 248 | + "dhdh": "\u0630" |
| 249 | + "dh": "\u0630" |
| 250 | + "d\u0323d\u0323": "\u0636" |
| 251 | + "d\u0323": "\u0636" |
| 252 | + "dd": "\u062F" |
| 253 | + "d": "\u062F" |
| 254 | + "rr": "\u0631" |
| 255 | + "r": "\u0631" |
| 256 | + "z\u0323z\u0323": "\u0638" |
| 257 | + "z\u0323": "\u0638" |
| 258 | + "zz": "\u0632" |
| 259 | + "z": "\u0632" |
| 260 | + "shsh": "\u0634" |
| 261 | + "sh": "\u0634" |
| 262 | + "s\u0323s\u0323": "\u0635" |
| 263 | + "s\u0323": "\u0635" |
| 264 | + "ss": "\u0633" |
| 265 | + "s": "\u0633" |
| 266 | + "ghgh": "\u063A" |
| 267 | + "gh": "\u063A" |
| 268 | + "ff": "\u0641" |
| 269 | + "f": "\u0641" |
| 270 | + "qq": "\u0642" |
| 271 | + "q": "\u0642" |
| 272 | + "ll": "\u0644" |
| 273 | + "l": "\u0644" |
| 274 | + "mm": "\u0645" |
| 275 | + "m": "\u0645" |
| 276 | + "nn": "\u0646" |
| 277 | + "n": "\u0646" |
| 278 | + "ww": "\u0648" |
| 279 | + "w": "\u0648" |
| 280 | + "yy": "\u064A" |
| 281 | + "y": "\u064A" |
| 282 | + |
| 283 | + # non-Arabic consonants: |
| 284 | + "p": "\u067E" |
| 285 | + "ch": "\u0686" |
| 286 | + "v": "\u06A4" |
| 287 | + "g": "\u06AF" |
| 288 | + |
| 289 | + # Diacritic characters: |
| 290 | + # ain (\u0639) - not transliterated alone: |
| 291 | + "\u02BB": "\u0639" |
| 292 | + # hamza - not romanized |
| 293 | + # "\u0621" |
| 294 | + # hamza (alone in final position) |
| 295 | + "\u02BE%": "\u0621" |
| 296 | + "\u02BC%": "\u0621" |
| 297 | + |
| 298 | + # Do not know what, if anything, is needed here: |
| 299 | + # tatweel: |
| 300 | + # "\u0640" |
| 301 | + # fathatan: |
| 302 | + # "\u064B" |
| 303 | + # dammatan: |
| 304 | + # "\u064C" |
| 305 | + # kasratan: |
| 306 | + # "\u064D" |
| 307 | + # fatha: |
| 308 | + # "\u064E" |
| 309 | + # damma: |
| 310 | + # "\u064F" |
| 311 | + # kasra: |
| 312 | + # "\u0650" |
| 313 | + # shadda: |
| 314 | + # "\u0651" |
| 315 | + # sukun: |
| 316 | + # "\u0652" |
| 317 | + # superscript alef: |
| 318 | + # "\u0670" |
| 319 | + # alef wasla |
| 320 | + # "\u0671" |
| 321 | + |
| 322 | + |
| 323 | +script_to_roman: |
| 324 | + hooks: |
| 325 | + post_config: |
| 326 | + - |
| 327 | + - arabic.arabic_romanizer.s2r_post_config |
0 commit comments