77 "io"
88 "os"
99 "sort"
10+ "strings"
1011 "sync"
1112 "unicode"
1213 "unicode/utf8"
@@ -17,7 +18,7 @@ import (
1718
1819const (
1920 filePerm = 0644
20- initialDictSize = 1e4
21+ InitialDictSize = 10000
2122)
2223
2324// splitWordsUnicode splits data into words, using the Unicode Letter character class.
@@ -109,7 +110,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
109110
110111 // One possible optimisation here is to split the file into chunks and process
111112 // each chunk individually.
112- words , err := collectWords (fd , initialDictSize )
113+ words , err := collectWords (fd , InitialDictSize )
113114 if err != nil {
114115 _ , _ = fmt .Fprintf (os .Stderr , `extract: reading input "%s": %s` , src , err )
115116 return
@@ -160,7 +161,7 @@ func collectWords(r io.Reader, sizeHint int) ([]string, error) {
160161 words := make ([]string , 0 , sizeHint )
161162
162163 for scanner .Scan () {
163- word := scanner .Text ()
164+ word := strings . ToLower ( scanner .Text () )
164165 hash := memHashString (word )
165166 if _ , ok := dict [hash ]; ok {
166167 continue // duplicate detected
@@ -197,9 +198,9 @@ func writeResults(w io.Writer, words []string) error {
197198 return nil
198199}
199200
200- func ExtractUniqueWords (content string , lang string ) ([]string , error ) {
201+ func ExtractUniqueWords (content string , lang string , sizeHint int ) ([]string , error ) {
201202 r := strings .NewReader (content )
202- words , err := collectWords (r )
203+ words , err := collectWords (r , sizeHint )
203204 if err != nil {
204205 _ , _ = fmt .Fprintf (os .Stderr , `collectWords error: %s` , err )
205206 return nil , err
0 commit comments