Skip to content

Commit ea7b8eb

Browse files
committed
Add support for ambiguous-width characters
Add a new ambiguous_width parameter to the wcwidth() function, which selects the width reported for ambiguous-width characters. It defaults to 1 and can be set to 2 where needed. While at it, move table generation to a new generate-tables Lua script, which handles input and character attributes better.
1 parent 1b044ba commit ea7b8eb

9 files changed

Lines changed: 1248 additions & 593 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ With them, it generates the following files:
136136

137137
* [wcwidth/widetab.lua](./wcwidth/widetab.lua)
138138
* [wcwidth/zerotab.lua](./wcwidth/zerotab.lua)
139+
* [wcwidth/ambitab.lua](./wcwidth/ambitab.lua)
139140

140141
The most current version of `wcwidth` uses the following versions of the above
141142
Unicode Standard release files:

README.md.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ With them, it generates the following files:
136136

137137
* [wcwidth/widetab.lua](./wcwidth/widetab.lua)
138138
* [wcwidth/zerotab.lua](./wcwidth/zerotab.lua)
139+
* [wcwidth/ambitab.lua](./wcwidth/ambitab.lua)
139140

140141
The most current version of `wcwidth` uses the following versions of the above
141142
Unicode Standard release files:

generate-tables

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#! /usr/bin/env lua
2+
-- vim:set ft=lua:sw=3:ts=3:
3+
4+
-- Usage message; the single %s placeholder receives the script name.
local help_text = [[Usage: %s [-w<path>] [-z<path>] [-a<path>] [-p<path>] [--help | -h]

-w<path> Path to wide characters table (default: widetab.lua)
-z<path> Path to zero-width characters table (default: zerotab.lua)
-a<path> Path to ambiguous width characters table (default: ambitab.lua)
-p<path> Path prefix to prepend to all above paths (default: .)
-h, --help Show this help text.

]]

-- Current value of every path option, keyed by its two-character flag.
-- The initial values are the defaults advertised in the help text.
local option_values = {
   ["-w"] = "widetab.lua",
   ["-z"] = "zerotab.lua",
   ["-a"] = "ambitab.lua",
   ["-p"] = ".",
}

-- Writes `message` followed by the usage text to stderr, then exits with
-- status 1. Never returns.
local function usage_error(message)
   io.stderr:write(message, "\n\n")
   io.stderr:write(help_text:format(arg[0]))
   os.exit(1)
end

for _, item in ipairs(arg) do
   if item == "-h" or item == "--help" then
      io.stdout:write(help_text:format(arg[0]))
      os.exit(0)
   end

   -- The value is glued to its flag ("-wfoo.lua"): the first two characters
   -- name the option, the remainder is the path.
   local flag, value = item:sub(1, 2), item:sub(3)
   if option_values[flag] == nil then
      usage_error(("Unrecognized command line option: %s"):format(item))
   elseif value == "" then
      -- A bare flag used to be accepted and silently produced an empty path,
      -- which only failed later when the output file was opened.
      usage_error(("Missing path for command line option: %s"):format(item))
   end
   option_values[flag] = value
end

-- Expose the results under the names used by the rest of the script.
local path_prefix = option_values["-p"]
local wide_tab_path = option_values["-w"]
local zero_tab_path = option_values["-z"]
local ambi_tab_path = option_values["-a"]
40+
41+
-- Line formats recognized in the input. Both are anchored at the start of
-- the line and capture hexadecimal code points followed, after a semicolon,
-- by an alphanumeric attribute; whatever follows the attribute is ignored.
local pat_range = "^(%x+)%.%.(%x+)%s*;%s*(%w+)"
local pat_rune = "^(%x+)%s*;%s*(%w+)"

-- Result lists. Each element is a { first, last } pair of code points.
local wide_tab, zero_tab, ambi_tab = {}, {}, {}

-- Attribute values that route an entry into each of the lists above.
local wide_attrs = { F = true, W = true }
local zero_attrs = {
   Cf = true, Mc = true, Me = true, Mn = true, Zl = true, Zp = true,
}
local ambi_attrs = { A = true }

-- Read the default input line by line. Lines matching neither pattern, and
-- entries whose attribute is in none of the sets, are dropped.
for line in io.lines() do
   -- Try the single code point form first, then fall back to the range form.
   local first_hex, last_hex, attribute
   first_hex, attribute = line:match(pat_rune)
   if first_hex then
      last_hex = first_hex
   else
      first_hex, last_hex, attribute = line:match(pat_range)
   end

   if first_hex then
      local first = tonumber(first_hex, 16)
      local last = tonumber(last_hex, 16)
      -- The lookups are tried in order wide, zero, ambiguous; the result is
      -- nil when the attribute belongs to none of them.
      local target = (wide_attrs[attribute] and wide_tab)
         or (zero_attrs[attribute] and zero_tab)
         or (ambi_attrs[attribute] and ambi_tab)
      if target then
         table.insert(target, { first, last })
      end
   end
end
93+
94+
-- Ordering predicate for table.sort(): `a` goes before `b` when its first
-- element (the start of the range) is strictly smaller.
local function tab_sort_compare(a, b)
   local a_start, b_start = a[1], b[1]
   return a_start < b_start
end
97+
98+
-- Order every table by ascending range start.
for _, tab in ipairs({ wide_tab, zero_tab, ambi_tab }) do
   table.sort(tab, tab_sort_compare)
end
101+
102+
-- Template for one emitted entry: a tab, then both values of the pair in
-- uppercase hexadecimal, each followed by a comma.
local tab_dump_line_format = "\t0x%X, 0x%X,\n"

-- Writes `tab` to `out` as Lua source: a header comment, then a `return`
-- of a table constructor holding one line per element of `tab`.
--   tab: sequence of two-element arrays of numbers.
--   out: object with a file-like write() method.
local function tab_dump(tab, out)
   out:write("-- Automatically generated, do not edit\n", "return {\n")
   for index = 1, #tab do
      local range = tab[index]
      out:write(string.format(tab_dump_line_format, range[1], range[2]))
   end
   out:write("}\n")
end
111+
112+
-- Writes `tab` to the file `file_name` under `path_prefix`.
-- io.open() returns nil plus a message on failure; wrapping it in assert()
-- raises that message instead of letting tab_dump() fail on a nil handle.
-- The handle is closed explicitly so buffered data is flushed and a failed
-- close is reported rather than ignored.
local function tab_write(tab, file_name)
   local out = assert(io.open(path_prefix .. "/" .. file_name, "w"))
   tab_dump(tab, out)
   assert(out:close())
end

tab_write(wide_tab, wide_tab_path)
tab_write(zero_tab, zero_tab_path)
tab_write(ambi_tab, ambi_tab_path)

spec/wcwidth_spec.lua

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,15 @@ local function test_phrase(input, expected_lengths, expected_total_length)
2525
end
2626

2727
--
28-
-- Many test cases are from:
29-
-- https://github.com/jquast/wcwidth/blob/master/wcwidth/tests/test_core.py
28+
-- A number of test cases are from:
29+
-- https://github.com/jquast/wcwidth/blob/master/tests/
3030
--
3131
describe("wcwidth()", function ()
32+
it("reports double-width for Katakana Ko", function ()
   -- KATAKANA LETTER KO (U+30B3). The literal must hold the character:
   -- with an empty string utf8.codepoint() raises an "out of range" error.
   local input = "コ"
   local rune = utf8.codepoint(input)
   assert.equal(2, wcwidth(rune))
end)
3237
it("handles a mix of Japanese and ASCII", function ()
3338
-- Given a phrase of 5 and 3 Katakana ideographs, joined with 3 English
3439
-- ASCII punctuation characters, totaling 11, this phrase consumes 19
@@ -54,16 +59,21 @@ describe("wcwidth()", function ()
5459
-- Phrase cafe + COMBINING ACUTE ACCENT is café of length 4.
5560
test_phrase("cafe" .. utf8.char(0x0301), { 1, 1, 1, 1, 0 }, 4)
5661
end)
57-
it("handles a combining enclosing", function ()
58-
-- CYRILLIC CAPITAL LETTER A + COMBINING CYRILLIC HUNDRED THOUSANDS SIGN is А҈ of length 1.
59-
test_phrase(utf8.char(0x0410, 0x0488), { 1, 0 }, 1)
62+
it("handles a combining enclosing character", function ()
63+
-- CAPITAL LETTER A + COMBINING ENCLOSING CIRCLE has length 1.
64+
test_phrase(utf8.char(0x41, 0x20DD), { 1, 0 }, 1)
6065
end)
61-
it("handles combining spaces", function ()
62-
-- Balinese kapal (ship) is ᬓᬨᬮ᭄ of length 4.
63-
test_phrase(utf8.char(0x1B13, 0x1B28, 0x1B2E, 0x1B44), { 1, 1, 1, 1 }, 4)
66+
it("handles multiple combining characters", function ()
67+
-- A + acute + grave
68+
test_phrase(utf8.char(0x41, 0x0301, 0x0300), { 1, 0, 0 }, 1)
6469
end)
6570
it("handles a 👍 emoji", function ()
6671
test_phrase("two 👍", { 1, 1, 1, 1, 2 }, 6)
6772
end)
73+
it("can report ambiguous-width char as either 1 or 2", function ()
74+
assert.equal(1, wcwidth(0x451))
75+
assert.equal(1, wcwidth(0x451, 1))
76+
assert.equal(2, wcwidth(0x451, 2))
77+
end)
6878
end)
6979

update-tables

Lines changed: 10 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -22,78 +22,20 @@ ZERO_FILE="${srcdir}/${ZERO_URL##*/}"
2222
[[ -r ${WIDE_FILE} ]] || wget -c -O "${WIDE_FILE}" "${WIDE_URL}"
2323
[[ -r ${ZERO_FILE} ]] || wget -c -O "${ZERO_FILE}" "${ZERO_URL}"
2424

25-
# Pad hex values to be 8 characters wide, and prepend "0x" to them.
26-
u32hex () {
27-
local -i len=${#1}
28-
local -i nzeros=$(( 8 - len ))
29-
echo -n '0x'
30-
for (( ; nzeros > 0 ; nzeros-- )) ; do
31-
echo -n '0'
32-
done
33-
echo -n "$1"
34-
}
35-
36-
format_range () {
37-
if [[ $1 = *..* ]] ; then
38-
u32hex "${1%..*}"
39-
echo -n ','
40-
u32hex "${1#*..}"
41-
echo ','
42-
else
43-
u32hex "${1}"
44-
echo -n ','
45-
u32hex "${1}"
46-
echo ','
47-
fi
48-
}
49-
50-
zero_table () {
51-
local tmpfile="${TMPDIR:-/tmp}/update-tables-$$-${RANDOM}"
52-
echo "-- Autogenerated from ${ZERO_FILE##*/}"
53-
echo 'return {'
54-
while read -r line ; do
55-
if [[ -z ${line} || ${line} = \#* ]] ; then
56-
read line rest <<< "${line}"
57-
if [[ ${rest} = *.txt ]] ; then
58-
ZERO_VER=${rest}
59-
elif [[ ${rest} = Date:\ 20* || ${rest} = ©\ 20* ]] ; then
60-
ZERO_VER="${ZERO_VER}, ${rest}"
61-
fi
62-
continue
63-
fi
64-
read -a items <<< "${line}"
65-
if [[ ${items[2]} == Mn || ${items[2]} == Me ]] ; then
66-
format_range "${items[0]}"
67-
fi
68-
done > "${tmpfile}"
69-
sort "${tmpfile}"
70-
rm "${tmpfile}"
71-
echo '}'
72-
}
73-
74-
wide_table () {
75-
local tmpfile="${TMPDIR:-/tmp}/update-tables-$$-${RANDOM}"
76-
echo "-- Autogenerated from ${WIDE_FILE##*/}"
77-
echo 'return {'
25+
parse_file_version () {
26+
local version rest line
7827
while read -r line ; do
7928
if [[ -z ${line} || ${line} = \#* ]] ; then
8029
read -r line rest <<< "${line}"
8130
if [[ ${rest} = *.txt ]] ; then
82-
WIDE_VER=${rest}
31+
version=${rest}
8332
elif [[ ${rest} = Date:\ 20* || ${rest} = ©\ 20* ]] ; then
84-
WIDE_VER="${WIDE_VER}, ${rest}"
33+
version="${version}, ${rest}"
8534
fi
8635
continue
8736
fi
88-
read -r -a items <<< "${line}"
89-
case ${items[2]} in
90-
F | W | A )
91-
format_range "${items[0]}" ;;
92-
esac
93-
done > "${tmpfile}"
94-
sort "${tmpfile}"
95-
rm "${tmpfile}"
96-
echo '}'
37+
done
38+
echo "${version}"
9739
}
9840

9941
make_readme () {
@@ -109,6 +51,8 @@ make_readme () {
10951
-e "s+@@LUAROCKS_VER@@+${V}+g"
11052
}
11153

112-
wide_table < "${WIDE_FILE}" > "${srcdir}/wcwidth/widetab.lua"
113-
zero_table < "${ZERO_FILE}" > "${srcdir}/wcwidth/zerotab.lua"
54+
WIDE_VER=$(parse_file_version < "${WIDE_FILE}")
55+
ZERO_VER=$(parse_file_version < "${ZERO_FILE}")
11456
make_readme < "${srcdir}/README.md.in" > "${srcdir}/README.md"
57+
58+
cat "${WIDE_FILE}" "${ZERO_FILE}" | "${srcdir}/generate-tables" -p"${srcdir}/wcwidth"

0 commit comments

Comments
 (0)