Skip to content

Commit 8632d65

Browse files
committed
make all words lower case, add unit test
1 parent 406978a commit 8632d65

File tree

2 files changed

+8
-6
lines changed

2 files changed

+8
-6
lines changed

example-golang/app/extract.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"io"
88
"os"
99
"sort"
10+
"strings"
1011
"sync"
1112
"unicode"
1213
"unicode/utf8"
@@ -17,7 +18,7 @@ import (
1718

1819
const (
1920
filePerm = 0644
20-
initialDictSize = 1e4
21+
InitialDictSize = 10000
2122
)
2223

2324
// splitWordsUnicode splits data into words, using the Unicode Letter character class.
@@ -109,7 +110,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
109110

110111
// One of the possible optimisations here is to split the file into chunks and process
111112
// each chunk individually.
112-
words, err := collectWords(fd, initialDictSize)
113+
words, err := collectWords(fd, InitialDictSize)
113114
if err != nil {
114115
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
115116
return
@@ -160,7 +161,7 @@ func collectWords(r io.Reader, sizeHint int) ([]string, error) {
160161
words := make([]string, 0, sizeHint)
161162

162163
for scanner.Scan() {
163-
word := scanner.Text()
164+
word := strings.ToLower(scanner.Text())
164165
hash := memHashString(word)
165166
if _, ok := dict[hash]; ok {
166167
continue // duplicate detected
@@ -197,9 +198,9 @@ func writeResults(w io.Writer, words []string) error {
197198
return nil
198199
}
199200

200-
func ExtractUniqueWords(content string, lang string) ([]string, error) {
201+
func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
201202
r := strings.NewReader(content)
202-
words, err := collectWords(r)
203+
words, err := collectWords(r, sizeHint)
203204
if err != nil {
204205
_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
205206
return nil, err

example-golang/app_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ import (
1212
func Test_ExtractUniqueWords(t *testing.T) {
1313
text := "ćma cześć ser. śmiech!żółw zebra-łuk len Ćma Żółw ser"
1414
expected := []string{"cześć", "ćma", "len", "łuk", "ser", "śmiech", "zebra", "żółw"}
15-
given, err := app.ExtractUniqueWords(text, "pl")
15+
initialDictSize := app.InitialDictSize
16+
given, err := app.ExtractUniqueWords(text, "pl", initialDictSize)
1617
if err != nil {
1718
_, _ = fmt.Fprintf(os.Stderr, `ExtractUniqueWords error: %s`, err)
1819
return

0 commit comments

Comments
 (0)