-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokens.go
79 lines (76 loc) · 2.03 KB
/
tokens.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package corpustools
import (
"bufio"
"io"
"log"
"os"
"regexp"
"strings"
"unicode"
)
// TokensFromFile reads the text file named filename one line at a time and
// returns, in file order, every token that TokenizeLine produces for each
// line. lowerCase and returnChars are passed through to TokenizeLine
// unchanged.
//
// Lines longer than the 16 KiB read buffer are reassembled from their
// fragments before being tokenized. The returned slice is never nil; a file
// with no lines yields an empty slice.
//
// The signature has no error return, so a failure to open or read the file
// terminates the program through log.Fatal.
func TokensFromFile(filename string, lowerCase bool, returnChars bool) (tokens []string) {
	tokens = make([]string, 0)

	// Open the file for reading.
	fh, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer fh.Close()

	bfr := bufio.NewReaderSize(fh, 1024*16)

	// pending collects the fragments of a line that did not fit in the
	// reader's buffer. ReadLine's result is only valid until the next read,
	// so fragments are copied here rather than retained.
	var pending []byte
	for {
		chunk, isPrefix, err := bfr.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}

		if isPrefix || len(pending) > 0 {
			pending = append(pending, chunk...)
			if isPrefix {
				// More of this line is still to come.
				continue
			}
			chunk = pending
		}

		tokens = append(tokens, TokenizeLine(string(chunk), lowerCase, returnChars)...)
		pending = pending[:0]
	}

	// A final unterminated line whose length is an exact multiple of the
	// buffer size is reported only as prefix fragments followed by EOF.
	if len(pending) > 0 {
		tokens = append(tokens, TokenizeLine(string(pending), lowerCase, returnChars)...)
	}
	return tokens
}
// tokenBoundaryRE matches the empty string at every ASCII word boundary.
// It is compiled once at package scope instead of on every call.
var tokenBoundaryRE = regexp.MustCompile(`\b`)

// tokenOKChars is the set of runes that character mode emits verbatim:
// the lower-case ASCII letters, the digits, space, ",", ";", ":" and ".",
// plus the upper-case form of each of those.
var tokenOKChars = func() map[rune]bool {
	ok := make(map[rune]bool)
	for _, rn := range "abcdefghijklmnopqrstuvwxyz0123456789 ,;:." {
		ok[rn] = true
		ok[unicode.ToUpper(rn)] = true
	}
	return ok
}()

// TokenizeLine converts a string (e.g. a line from a file) into a slice of
// tokens.
//
// If lowerCase is true the line is lower-cased before anything else happens.
//
// If returnChars is true, one token is produced per rune of the line: the
// rune itself when it is in the acceptable set (ASCII letters, digits, space,
// ",", ";", ":" and "."), and the placeholder "XXX" otherwise.
//
// If returnChars is false, the line is split into "words": a space is
// inserted at every word boundary and the result is split on spaces. Runs of
// word characters and runs of the characters between them therefore become
// separate tokens ("a--b" gives "a", "--", "b"). Empty tokens are never
// returned, so an empty or all-space line yields no tokens. Only the space
// character separates tokens; other whitespace such as a tab is kept as
// token content.
func TokenizeLine(line string, lowerCase bool, returnChars bool) (tokens []string) {
	// Lower case everything if required.
	if lowerCase {
		line = strings.ToLower(line)
	}

	// Character mode: one token per rune.
	if returnChars {
		// len(line) counts bytes, which is an upper bound on the rune count.
		tokens = make([]string, 0, len(line))
		for _, rn := range line {
			if tokenOKChars[rn] {
				tokens = append(tokens, string(rn))
			} else {
				tokens = append(tokens, "XXX")
			}
		}
		return tokens
	}

	// Word mode: put a space at every word boundary, then split on spaces.
	// FieldsFunc treats runs of spaces as one separator and drops empty
	// fields, so no separate space-collapsing pass is needed.
	line = tokenBoundaryRE.ReplaceAllString(line, " ")
	return strings.FieldsFunc(line, func(r rune) bool { return r == ' ' })
}