Skip to content

Commit dfd1c04

Browse files
authored
Merge pull request #285 from lcnetdev/main
Update test from main.
2 parents a160e3e + 4ac0851 commit dfd1c04

66 files changed

Lines changed: 1954 additions & 296 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

scriptshifter/tables/data/_chinese_base.yml

Lines changed: 135 additions & 49 deletions
Large diffs are not rendered by default.

scriptshifter/tables/data/_ignore_base.yml

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,24 @@ general:
88
roman_to_script:
99
ignore:
1010
- "At head of title"
11-
- "at head of title"
1211
- "Colophon"
13-
- "colophon"
12+
- "Colophon"
1413
- "Cover title"
14+
- "On cover"
15+
- "S.l."
16+
- "Spine title"
17+
- "and one other"
18+
- "at head of title"
19+
- "colophon"
20+
- "cover title"
1521
- "date of publication not identified"
22+
- "et al."
23+
- "on cover"
1624
- "place of publication not identified"
1725
- "publisher not identified"
18-
- "and one other"
19-
- "and others"
20-
- "et al."
26+
- "s.l."
27+
- "s.n."
28+
- "spine title"
2129
ignore_ptn:
2230
- "and ([a-z0-9]+ )?others"
2331

@@ -29,17 +37,22 @@ roman_to_script:
2937
# dedicated U+2160÷U+216F (uppercase Roman
3038
# numerals) and/or U+2170÷U+217F (lower case Roman
3139
# numerals) ranges to avoid this ambiguity.
32-
- "I{2,3}\\b"
33-
- "I(V|X)\\b"
34-
- "LI{1,3}\\b"
35-
- "LI?(V|X)\\b"
36-
- "L(V|X{1,3})I{,3}\\b"
37-
- "LX{1,3}I?V\\b"
38-
- "LX{1,3}VI{,3}\\b"
39-
- "VI{1,3}\\b"
40-
- "X{1,3}I{1,3}\\b"
41-
- "X{1,3}I(V|X)\\b"
42-
- "X{1,3}VI{,3}\\b"
40+
- "M{,3}(CM)?C?D?C{1,3}L?X{,3}I{,3}\\b"
41+
- "M{1,3}(CM)?C?D?C{,3}L?X{,3}I{,3}\\b"
42+
- "M{,3}(CM)?C?D?C{1,3}L?X{,3}I[VX]\\b"
43+
- "M{1,3}(CM)?C?D?C{,3}L?X{,3}I[VX]\\b"
44+
45+
# May not be prefixed by M, D, C, L. Cannot use for single digits.
46+
- "M{,3}(CM)?C?D?C{,3}I(I{,2}V|X)\\b"
47+
- "M{,3}(CM)?C?D?C{,3}LI{1,3}\\b"
48+
- "M{,3}(CM)?C?D?C{,3}LI?[VX]\\b"
49+
- "M{,3}(CM)?C?D?C{,3}L(V|X{1,3})I{,3}\\b"
50+
- "M{,3}(CM)?C?D?C{,3}LX{1,3}I?[VX]\\b"
51+
- "M{,3}(CM)?C?D?C{,3}LX{1,3}VI{,3}\\b"
52+
- "M{,3}(CM)?C?D?C{,3}VI{1,3}\\b"
53+
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?I{1,3}\\b"
54+
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?I[VX]\\b"
55+
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?VI{,3}\\b"
4356

4457
# MARC sub-field markers.
4558
- "[\u2021\u01C2\\$][0-9a-z]\\b"
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
# Arabic S2R using the 3rd-party ArabicTransliterator library:
2+
# https://github.com/MTG/ArabicTransliterator
3+
4+
---
5+
general:
6+
name: Arabic
7+
parents:
8+
- _ignore_base
9+
description: >
10+
Version 1.0 (2025-11-29) - Arabic language R2S using a conversion table; S2R using a 3rd party library.
11+
case_sensitive: false
12+
13+
roman_to_script:
14+
map:
15+
16+
# Punctuation marks:
17+
"*": "\u066D"
18+
",": "\u060C"
19+
";": "\u061B"
20+
"?": "\u061F"
21+
22+
# Exceptions for specific words
23+
24+
# Allah
25+
"%alla\u0304h%": "\uFDF2"
26+
"alla\u0304h": "\u0627\u0644\u0644\u0647"
27+
28+
# Qur'an
29+
"qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646"
30+
31+
# lillah
32+
"lilla\u0304h": "\u0644\u0644\u0647"
33+
34+
# billah
35+
"billa\u0304h": "\u0628\u0644\u0644\u0647"
36+
37+
# Rahman
38+
"rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646"
39+
40+
# Ruwat
41+
"ruwa\u0304t": "\u0631\u0648\u0627\u0629"
42+
43+
# Hadha
44+
"ha\u0304dha\u0304": "\u0647\u0630\u0627"
45+
46+
# Hadhihi
47+
"ha\u0304dhi\u0304hi": "\u0647\u0630\u0647"
48+
49+
# dhalika
50+
"dha\u0304lika": "\u0630\u0644\u0643"
51+
52+
# Ibn when it appears in the middle of a name sequence
53+
"ibn": "\u0628\u0646"
54+
55+
# H[dot below]aya[macron]t
56+
"h\u0323aya\u0304t": "\u062D\u064A\u0627\u0629"
57+
58+
# "sh[dot below]" as in "Ishaq"
59+
"sh\u0323": "\u0633\u062D"
60+
61+
# "s[prime]h" combos
62+
"s\u02B9h": "\u0633\u0647"
63+
64+
# "th[dot below]"
65+
"th\u0323": "\u062A\u062D"
66+
67+
# "dh[dot below]"
68+
"dh\u0323": "\u062F\u062D"
69+
70+
# La-hu
71+
"la-hu": "\u0644\u0647"
72+
73+
# Mi'ah
74+
"mi\u02BEah": "\u0645\u0627\u0626\u0629"
75+
"mi\u02BCah": "\u0645\u0627\u0626\u0629"
76+
77+
# Mi'at
78+
"mi\u02BEat": "\u0645\u0627\u0626\u0629"
79+
"mi\u02BCat": "\u0645\u0627\u0626\u0629"
80+
81+
# Numbers (I have set these to Hindi numbers. Note that Persian and Urdu
82+
# will technically use \u06F0-\u06F9. This needs further discussion with PSD
83+
# as RLIN21 used Hindi numbers, whereas Connexion and Voyager do not.)
84+
85+
# Edition statements with Latin number
86+
"al-t\u0323ab\u02BBah 1": "\u0627\u0644\u0637\u0628\u0639\u0629 1"
87+
"al-t\u0323ab\u02BBah 2": "\u0627\u0644\u0637\u0628\u0639\u0629 2"
88+
"al-t\u0323ab\u02BBah 3": "\u0627\u0644\u0637\u0628\u0639\u0629 3"
89+
"al-t\u0323ab\u02BBah 4": "\u0627\u0644\u0637\u0628\u0639\u0629 4"
90+
"al-t\u0323ab\u02BBah 5": "\u0627\u0644\u0637\u0628\u0639\u0629 5"
91+
"al-t\u0323ab\u02BBah 6": "\u0627\u0644\u0637\u0628\u0639\u0629 6"
92+
"al-t\u0323ab\u02BBah 7": "\u0627\u0644\u0637\u0628\u0639\u0629 7"
93+
"al-t\u0323ab\u02BBah 8": "\u0627\u0644\u0637\u0628\u0639\u0629 8"
94+
"al-t\u0323ab\u02BBah 9": "\u0627\u0644\u0637\u0628\u0639\u0629 9"
95+
96+
# Use basic Arabic-Indic digits \u0660-\u0669
97+
"0": "\u0660"
98+
"1": "\u0661"
99+
"2": "\u0662"
100+
"3": "\u0663"
101+
"4": "\u0664"
102+
"5": "\u0665"
103+
"6": "\u0666"
104+
"7": "\u0667"
105+
"8": "\u0668"
106+
"9": "\u0669"
107+
108+
# Hyphenated prefixes:
109+
"wa-": "\u0648"
110+
"bi-": "\u0628"
111+
"al-": "\u0627\u0644"
112+
"lil-": "\u0644\u0644"
113+
"li-": "\u0644"
114+
"la\u0304-": "\u0644"
115+
"fi\u0304-": "\u0641\u064A"
116+
"ka-": "\u0643"
117+
118+
# Vowels and vowel/consonant combinations - ta-marbutah at end of word
119+
"ah%": "\u0629"
120+
"at%": "\u0629"
121+
122+
# tanwin at end of word
123+
"an%": "\u0627"
124+
125+
# ayn-alif combo
126+
"\u02BBa\u0304\u02BE%": "\u0639\u0627\u0621"
127+
"\u02BBa\u0304\u02BC%": "\u0639\u0627\u0621"
128+
129+
"\u02BBa\u0304": "\u0639\u0627"
130+
131+
"\u02BBi\u0304y": "\u0639\u064A"
132+
"\u02BBi\u0304": "\u0639\u064A"
133+
134+
"\u02BBu\u0304": "\u0639\u0648"
135+
"\u02BBu": "\u0639"
136+
137+
"%\u02BBa": "\u0639"
138+
# "\u02BBa%": "\u0639"
139+
140+
# alif and hamzas for all occasions
141+
142+
# Is truncation necessary? It seems to work fine with it.
143+
144+
"i\u0304\u02BEah%": "\u064A\u0626\u0629"
145+
"i\u0304\u02BCah%": "\u064A\u0626\u0629"
146+
147+
"i\u0304\u02BEat%": "\u064A\u0626\u0629"
148+
"i\u0304\u02BCat%": "\u064A\u0626\u0629"
149+
150+
"i\u02BEa\u0304%": "\u0626\u0627"
151+
"i\u02BCa\u0304%": "\u0626\u0627"
152+
153+
"i\u02BE": "\u0626%"
154+
"i\u02BC": "\u0626%"
155+
"a\u0304\u02BEa\u0304": "\u0627\u0621\u0627"
156+
"a\u0304\u02BCa\u0304": "\u0627\u0621\u0627"
157+
158+
"a\u02BE": "\u0623"
159+
"a\u02BC": "\u0623"
160+
"\u02BEi": "\u0626"
161+
"\u02BCi": "\u0626"
162+
"\u02BEa\u0304": "\u0622"
163+
"\u02BCa\u0304": "\u0622"
164+
"\u02BEa": "\u0623"
165+
"\u02BCa": "\u0623"
166+
167+
"y\u02BCah": "\u064A\u0626\u0629"
168+
"y\u02BEah": "\u064A\u0626\u0629"
169+
170+
"y\u02BCat": "\u064A\u0626\u0629"
171+
"y\u02BEat": "\u064A\u0626\u0629"
172+
173+
# A
174+
175+
"a\u0304\u02BCi\u0304": "\u0627\u0626\u064A"
176+
"a\u0304\u02BEi\u0304": "\u0627\u0626\u064A"
177+
178+
"a\u0304\u02BCi": "\u0627\u0626"
179+
"a\u0304\u02BEi": "\u0627\u0626"
180+
"a\u0304\u02BC": "\u0627\u0621"
181+
"a\u0304\u02BE": "\u0627\u0621"
182+
"%a\u0304": "\u0622"
183+
"a\u0304": "\u0627"
184+
185+
# These next two lines were intended to convert to alif-ayn when it is at
186+
# the beginning of a word, definite or indefinite (i.e.
187+
# "al-a[ayn]ma[macron]l" or "[space]a[ayn]ma[macron]l").
188+
"%a\u02BB": "\u0623\u0639"
189+
"a\u02BB": "\u0639"
190+
"a\u0301": "\u0649"
191+
192+
"ayy": "\u064A"
193+
"%a": "\u0623"
194+
"a": ""
195+
196+
# I - Capital I at beginning of word is usually alif hamzah-below.
197+
198+
"%i\u0304": "\u064A"
199+
"i\u0304y": "\u064A"
200+
"iy": "\u064A"
201+
"i\u0304": "\u064A"
202+
"%\u02BBi": "\u0639"
203+
204+
# "i\u02BB": "\u0625\u0639"
205+
206+
"i\u02BE": "\u0626"
207+
"i\u02BC": "\u0627\u0626"
208+
209+
"%i": "\u0625"
210+
"i": ""
211+
212+
# U
213+
214+
"u\u0304\u02BE": "\u0624"
215+
"u\u0304\u02BC": "\u0624"
216+
"%u\u0304w": "\u0623\u0648"
217+
"%u\u0304": "\u0623\u0648"
218+
"u\u0304w": "\u0648"
219+
"u\u0304": "\u0648"
220+
"u\u02BE": "\u0624"
221+
"u\u02BC": "\u0624"
222+
223+
"%u": "\u0623"
224+
"u": ""
225+
226+
# Consonants, with tashdid added
227+
228+
"bb": "\u0628"
229+
"b": "\u0628"
230+
"thth": "\u062B"
231+
"th": "\u062B"
232+
"t\u0323t\u0323": "\u0637"
233+
"t\u0323": "\u0637"
234+
"tt": "\u062A"
235+
"t": "\u062A"
236+
"J": "\u062C"
237+
"jj": "\u062C"
238+
"j": "\u062C"
239+
"h\u0323h\u0323": "\u062D"
240+
"h\u0323": "\u062D"
241+
"hh": "\u0647"
242+
"h": "\u0647"
243+
"Kh": "\u062E"
244+
"khkh": "\u062E"
245+
"kh": "\u062E"
246+
"kk": "\u0643"
247+
"k": "\u0643"
248+
"dhdh": "\u0630"
249+
"dh": "\u0630"
250+
"d\u0323d\u0323": "\u0636"
251+
"d\u0323": "\u0636"
252+
"dd": "\u062F"
253+
"d": "\u062F"
254+
"rr": "\u0631"
255+
"r": "\u0631"
256+
"z\u0323z\u0323": "\u0638"
257+
"z\u0323": "\u0638"
258+
"zz": "\u0632"
259+
"z": "\u0632"
260+
"shsh": "\u0634"
261+
"sh": "\u0634"
262+
"s\u0323s\u0323": "\u0635"
263+
"s\u0323": "\u0635"
264+
"ss": "\u0633"
265+
"s": "\u0633"
266+
"ghgh": "\u063A"
267+
"gh": "\u063A"
268+
"ff": "\u0641"
269+
"f": "\u0641"
270+
"qq": "\u0642"
271+
"q": "\u0642"
272+
"ll": "\u0644"
273+
"l": "\u0644"
274+
"mm": "\u0645"
275+
"m": "\u0645"
276+
"nn": "\u0646"
277+
"n": "\u0646"
278+
"ww": "\u0648"
279+
"w": "\u0648"
280+
"yy": "\u064A"
281+
"y": "\u064A"
282+
283+
# non-Arabic consonants:
284+
"p": "\u067E"
285+
"ch": "\u0686"
286+
"v": "\u06A4"
287+
"g": "\u06AF"
288+
289+
# Diacritic characters:
290+
# ain (\u0639) - not transliterated alone:
291+
"\u02BB": "\u0639"
292+
# hamza - not romanized
293+
# "\u0621"
294+
# hamza (alone in final position)
295+
"\u02BE%": "\u0621"
296+
"\u02BC%": "\u0621"
297+
298+
# Do not know what, if anything, is needed here:
299+
# tatweel:
300+
# "\u0640"
301+
# fathatan:
302+
# "\u064B"
303+
# dammatan:
304+
# "\u064C"
305+
# kasratan:
306+
# "\u064D"
307+
# fatha:
308+
# "\u064E"
309+
# damma:
310+
# "\u064F"
311+
# kasra:
312+
# "\u0650"
313+
# shadda:
314+
# "\u0651"
315+
# sukun:
316+
# "\u0652"
317+
# superscript alef:
318+
# "\u0670"
319+
# alef wasla
320+
# "\u0671"
321+
322+
323+
script_to_roman:
324+
hooks:
325+
post_config:
326+
-
327+
- arabic.arabic_romanizer.s2r_post_config

0 commit comments

Comments
 (0)