-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtoken.go
More file actions
114 lines (92 loc) · 2.05 KB
/
token.go
File metadata and controls
114 lines (92 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package xim
import (
"fmt"
"strings"
)
// bigram is an ordered pair of adjacent runes. It is comparable, so it can be
// used directly as a map key.
type bigram struct {
	a, b rune
}

// String returns the two runes of the bigram concatenated as a string.
//
// The receiver is a value so that both bigram and *bigram satisfy
// fmt.Stringer and the method can be called on non-addressable values.
func (bg bigram) String() string {
	return fmt.Sprintf("%c%c", bg.a, bg.b)
}
// Biunigrams returns the distinct bigram tokens of s followed by the distinct
// unigram tokens of s. Spaces never appear in a token. Within each group the
// order follows map iteration and is therefore unspecified.
func Biunigrams(s string) []string {
	out := make([]string, 0, 32)

	// Two-rune tokens first.
	pairs := toBigrams(s)
	for p := range pairs {
		out = append(out, fmt.Sprintf("%c%c", p.a, p.b))
	}

	// Then the single-rune tokens.
	singles := toUnigrams(s)
	for r := range singles {
		out = append(out, string(r))
	}

	return out
}
// Bigrams returns the distinct bigram tokens of s: every pair of adjacent
// runes in which neither rune is a space. The order of the result follows map
// iteration and is therefore unspecified.
func Bigrams(s string) []string {
	pairs := toBigrams(s)
	out := make([]string, 0, 32)
	for p := range pairs {
		token := fmt.Sprintf("%c%c", p.a, p.b)
		out = append(out, token)
	}
	return out
}
// Prefixes returns the prefix tokens of s: for every non-empty word of s
// (words are separated by single spaces), each leading run of its runes, from
// the first rune alone up to the whole word. Each distinct token is returned
// once; callers should not rely on the order of the result.
func Prefixes(s string) []string {
return tokenize(s, false)
}
// Suffixes returns the suffix tokens of s: for every non-empty word of s
// (words are separated by single spaces), each trailing run of its runes, from
// the last rune alone up to the whole word. Each distinct token is returned
// once; callers should not rely on the order of the result.
func Suffixes(s string) []string {
return tokenize(s, true)
}
// tokenize splits s on single spaces and, for every non-empty word, produces
// all of its rune-prefixes (isSuffix == false) or rune-suffixes
// (isSuffix == true), from length one up to the whole word.
//
// Each distinct token is returned exactly once. Tokens appear in order of
// first occurrence: words left to right, and within a word from the shortest
// token to the longest, so the result is deterministic for a given input.
// The returned slice is never nil.
func tokenize(s string, isSuffix bool) []string {
	seen := make(map[string]struct{})
	tokens := make([]string, 0, 32)
	runes := make([]rune, 0, 64) // scratch buffer reused across words

	for _, w := range strings.Split(s, " ") {
		if w == "" {
			// Consecutive, leading or trailing spaces yield empty words.
			continue
		}

		runes = runes[:0]
		for _, r := range w {
			runes = append(runes, r)
		}

		for n := 1; n <= len(runes); n++ {
			var tok string
			if isSuffix {
				tok = string(runes[len(runes)-n:])
			} else {
				tok = string(runes[:n])
			}
			if _, dup := seen[tok]; dup {
				continue
			}
			seen[tok] = struct{}{}
			tokens = append(tokens, tok)
		}
	}
	return tokens
}
// reverse returns s with its runes in reverse order. It works rune by rune,
// not byte by byte, so multi-byte characters stay intact.
func reverse(s string) string {
	src := []rune(s)
	n := len(src)
	flipped := make([]rune, n)
	for i, r := range src {
		// The rune at position i lands at the mirrored position.
		flipped[n-1-i] = r
	}
	return string(flipped)
}
// toBigrams returns the set of distinct pairs of adjacent runes in value,
// skipping every pair in which either rune is a space.
func toBigrams(value string) map[bigram]struct{} {
	pairs := map[bigram]struct{}{}

	// Seeding the "previous" rune with a space means the first rune of value
	// never forms a pair with a non-existent predecessor.
	last := ' '
	for _, cur := range value {
		if last != ' ' && cur != ' ' {
			pairs[bigram{a: last, b: cur}] = struct{}{}
		}
		last = cur
	}
	return pairs
}
// toUnigrams returns the set of distinct runes that occur in value, excluding
// the space character.
func toUnigrams(value string) map[rune]struct{} {
	set := map[rune]struct{}{}
	for _, r := range value {
		if r != ' ' {
			set[r] = struct{}{}
		}
	}
	return set
}