-
Notifications
You must be signed in to change notification settings - Fork 0
/
dajare.go
151 lines (133 loc) · 3.47 KB
/
dajare.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package dajareGo
import (
"strings"
ipaneologd "github.com/ikawaha/kagome-dict-ipa-neologd"
"github.com/ikawaha/kagome-dict/dict"
"github.com/ikawaha/kagome/v2/tokenizer"
)
// Result is the value returned by IsDajare.
type Result struct {
	// IsDajare is true when the sentence was judged to be a dajare.
	IsDajare bool
	// DajareWordIndex lists the indexes (into Tokens) of every word that has
	// a similar reading but a different meaning elsewhere in the sentence.
	DajareWordIndex []int
	// Sentence pairs the whole sentence's surface text with its syllables.
	Sentence Token
	// Tokens holds the tokens of the given sentence.
	Tokens []Token
}

// Token pairs a surface string with its Syllables.
type Token struct {
	Surface   string
	Syllables Syllables
}
// t is the package-level tokenizer that IsDajare uses for morphological
// analysis. It stays nil until Init or SetCustomDictionary assigns it.
var t *tokenizer.Tokenizer
// Init builds the package tokenizer from the bundled ipaneologd dictionary
// (with the OmitBosEos option) and stores it for later use.
// You must call this first, before IsDajare.
//
// It returns the error reported by tokenizer.New, in which case the
// previously stored tokenizer (if any) is left unchanged.
func Init() error {
	created, err := tokenizer.New(ipaneologd.Dict(), tokenizer.OmitBosEos())
	if err != nil {
		return err
	}
	t = created
	return nil
}
// SetCustomDictionary replaces the package tokenizer with one built from the
// given Kagome dictionary (with the OmitBosEos option).
// See https://github.com/ikawaha/kagome#dictionaries for available dictionaries.
//
// It returns the error reported by tokenizer.New, in which case the
// previously stored tokenizer (if any) is left unchanged.
func SetCustomDictionary(dict *dict.Dict) error {
	created, err := tokenizer.New(dict, tokenizer.OmitBosEos())
	if err != nil {
		return err
	}
	t = created
	// NOTE(review): the result is discarded; presumably this is a warm-up
	// call for the freshly built tokenizer - confirm with the author.
	IsDajare("")
	return nil
}
// IsDajare checks if a sentence is Dajare.
//
// The sentence is normalized with preNormalizer, tokenized with the package
// tokenizer (which must have been set by Init or SetCustomDictionary), and
// every token that has a reading is recorded in Result.Tokens. Tokens without
// a dictionary pronunciation are kept only when their surface is written
// purely in kana, in which case the surface itself (converted to katakana) is
// used as the reading; other tokens contribute to Result.Sentence.Surface
// only.
//
// Each recorded token whose pronLen is at least 3 is then passed to analyze;
// the indexes (into Result.Tokens) of the tokens it accepts are collected in
// Result.DajareWordIndex, and Result.IsDajare is true when there is at least
// one.
func IsDajare(s string) Result {
	// Normalize before morphological analysis.
	s = preNormalizer(s)

	var (
		r       Result
		surface strings.Builder
	)
	for _, token := range t.Tokenize(s) {
		surface.WriteString(token.Surface)

		pron, ok := token.Pronunciation()
		if !ok {
			// No pronunciation available: fall back to the surface
			// itself when it consists solely of kana.
			pron, ok = guessKanaReading(token.Surface)
		}
		if !ok {
			continue
		}

		syllables := NewSyllables(pron)
		r.Tokens = append(r.Tokens, Token{Surface: token.Surface, Syllables: syllables})
		r.Sentence.Syllables = append(r.Sentence.Syllables, syllables...)
	}
	r.Sentence.Surface = surface.String()

	// Analyze if the string is Dajare.
	for i := range r.Tokens {
		// Tokens with a reading shorter than 3 are too short to judge.
		// (A former "pronLen() == 2" branch that merged such a token with
		// its predecessor sat behind this guard and could never run.)
		if r.Tokens[i].pronLen() < 3 {
			continue
		}
		if r.analyze(r.Tokens[i]) {
			r.IsDajare = true
			r.DajareWordIndex = append(r.DajareWordIndex, i)
		}
	}
	return r
}

// guessKanaReading returns surface converted to katakana when every rune of
// surface is hiragana (U+3041..U+3094) or katakana (U+30A1..U+30FF). It
// returns ok == false as soon as any other rune is found. An empty surface
// yields ("", true).
func guessKanaReading(surface string) (reading string, ok bool) {
	var b strings.Builder
	for _, c := range surface {
		switch {
		case 0x3041 <= c && c <= 0x3094:
			// Hiragana: the matching katakana is 0x60 code points higher.
			b.WriteRune(c + 0x0060)
		case 0x30A1 <= c && c <= 0x30FF:
			// Katakana: already in the target form.
			b.WriteRune(c)
		default:
			return "", false
		}
	}
	return b.String(), true
}
// analyze reports whether the reading of t shows up in the sentence at least
// twice (as counted by fuzzyCount over the sentence's syllables) and more
// often than t's surface text occurs in the sentence's surface, i.e. whether
// the sentence has a similar reading with a different spelling.
func (r *Result) analyze(t Token) bool {
	// NOTE(review): the trailing 1 is passed through to fuzzyCount as-is;
	// presumably it is the allowed mismatch tolerance - confirm.
	readingHits := fuzzyCount(r.Sentence.Syllables, t.Syllables, 1)
	if readingHits < 2 {
		return false
	}
	return strings.Count(r.Sentence.Surface, t.Surface) < readingHits
}
// pronLen returns the pronunciation length of the token, computed as the sum
// of length()-1 over all of its syllables.
func (t *Token) pronLen() int {
	total := 0
	for _, syl := range t.Syllables {
		total += syl.length() - 1
	}
	return total
}
// union returns a new Token whose Surface is t.Surface followed by
// new.Surface and whose Syllables are t's syllables followed by new's.
//
// The receiver's syllable slice is capped to its own length before appending,
// so the append can never write into spare capacity of t's backing array and
// silently overwrite syllables that another slice sharing that array holds.
func (t Token) union(new Token) Token {
	own := t.Syllables[:len(t.Syllables):len(t.Syllables)]
	return Token{
		Surface:   t.Surface + new.Surface,
		Syllables: append(own, new.Syllables...),
	}
}