-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutf8.go
More file actions
30 lines (25 loc) · 772 Bytes
/
utf8.go
File metadata and controls
30 lines (25 loc) · 772 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package gnlib
import (
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
// FixUtf8 returns s with every byte that is not part of a valid UTF-8
// encoding replaced by U+FFFD (one replacement rune per offending byte), and
// with the result normalized to Unicode Normalization Form C (NFC).
//
// A U+FFFD that is already correctly encoded in s is kept as is. The empty
// string is returned unchanged.
func FixUtf8(s string) string {
	// Fast path: decoding and re-encoding a valid string reproduces it
	// byte for byte, so the intermediate rune slice would be pure overhead.
	if utf8.ValidString(s) {
		return norm.NFC.String(s)
	}

	// RuneCountInString counts each invalid byte as a single rune, which is
	// exactly how many elements the loop below appends, so the slice never
	// has to grow.
	runes := make([]rune, 0, utf8.RuneCountInString(s))

	// Ranging over a string decodes it rune by rune; for an invalid sequence
	// the language yields U+FFFD and advances by a single byte.
	for _, r := range s {
		runes = append(runes, r)
	}
	return norm.NFC.String(string(runes))
}