@@ -11,11 +11,15 @@ import (
1111 "sync"
1212 "unicode"
1313 "unicode/utf8"
14+ "unsafe"
1415
1516 "github.com/tidwall/collate"
1617)
1718
18- const filePerm = 0644
const (
	// filePerm is a Unix permission mode (rw-r--r--). Presumably it is applied
	// to the result files this package writes; the use site is not shown here.
	filePerm = 0644

	// InitialDictSize is the default size hint handed to collectWords, which
	// uses it to preallocate its deduplication map and its result slice.
	InitialDictSize = 10000
)
1923
2024// splitWordsUnicode splits data into words, using Unicode Letter character class.
2125// It works similarly to the regular expression "[^\p{L}]+". This is what was used
@@ -77,6 +81,20 @@ func isLatin(r rune) bool {
7781 return ('a' <= r && r <= 'z' ) || ('A' <= r && r <= 'Z' )
7882}
7983
// memhash is the Go runtime's internal memory hash, reached via go:linkname.
// It hashes the s bytes starting at p, using h as the seed. The result is a
// uintptr, so it carries only 32 bits on 32-bit platforms.
//
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// stringStruct mirrors the in-memory layout of a Go string value (data pointer
// followed by length) so that a string's bytes can be passed to memhash
// without copying them.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// memHashString returns a 64-bit hash of str computed with runtime.memhash.
//
// On 64-bit platforms this is exactly memhash with seed 0. On 32-bit platforms
// a single memhash call yields only 32 bits, which makes collisions likely
// once the hash is used as a map key for tens of thousands of words. There
// the string is hashed a second time with a different seed and the two halves
// are combined into a full 64-bit value.
//
// The values are only meaningful within the running process and must not be
// persisted.
func memHashString(str string) uint64 {
	ss := (*stringStruct)(unsafe.Pointer(&str))
	size := uintptr(ss.len)

	lo := memhash(ss.str, 0, size)
	if unsafe.Sizeof(lo) == 8 {
		// uintptr is already 64 bits wide: nothing to widen.
		return uint64(lo)
	}

	// 32-bit platform: derive the upper half from an independent seed.
	hi := memhash(ss.str, 1, size)
	return uint64(hi)<<32 | uint64(lo)
}
97+
8098func extract (src , dst , lang string , sortResults bool , sem <- chan empty , wg * sync.WaitGroup ) {
8199 defer func () {
82100 <- sem
@@ -92,7 +110,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
92110
93111 // One of the possible optimisations here is to split file in chunks and process
94112 // each chunk individually.
95- words , err := collectWords (fd )
113+ words , err := collectWords (fd , InitialDictSize )
96114 if err != nil {
97115 _ , _ = fmt .Fprintf (os .Stderr , `extract: reading input "%s": %s` , src , err )
98116 return
@@ -125,21 +143,32 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
125143 return
126144 }
127145
128- // _, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
146+ _ , _ = fmt .Fprintf (os .Stdout , "Saved %s\n " , dst )
129147}
130- func collectWords (r io.Reader ) ([]string , error ) {
131- scanner := bufio .NewScanner (r )
132148
133- scanner .Split (splitWordsUnicode )
149+ func collectWords (r io.Reader , sizeHint int ) ([]string , error ) {
150+ scanner := bufio .NewScanner (r )
151+ scanner .Split (splitWords )
152+
153+ // map[uint64]empty should take less memory than map[string]empty and avoid
154+ // GC checks.
155+ //
156+ // sizeHint is used to preallocate the map[uint64]empty and the []string slice
157+ // so that they skip the initial reallocations as they grow. It is a "magic"
158+ // number which should be neither too big nor too small. Ideally, it should be
159+ // approximated from the text.
160+ dict := make (map [uint64 ]empty , sizeHint )
161+ words := make ([]string , 0 , sizeHint )
134162
135- dict := make (map [string ]empty )
136163 for scanner .Scan () {
137164 word := strings .ToLower (scanner .Text ())
138- if _ , ok := dict [word ]; ok {
165+ hash := memHashString (word )
166+ if _ , ok := dict [hash ]; ok {
139167 continue // duplicate detected
140168 }
141169
142- dict [word ] = empty {}
170+ dict [hash ] = empty {}
171+ words = append (words , word )
143172
144173 // Theoretically, if sorting is not needed, we can write right here and
145174 // skip words slice preparation below.
@@ -148,17 +177,6 @@ func collectWords(r io.Reader) ([]string, error) {
148177 return nil , err
149178 }
150179
151- // This is expensive ...
152- words := make ([]string , len (dict ))
153-
154- var i int
155- for w := range dict {
156- words [i ] = w
157- i ++
158-
159- delete (dict , w )
160- }
161-
162180 return words , nil
163181}
164182
@@ -180,9 +198,9 @@ func writeResults(w io.Writer, words []string) error {
180198 return nil
181199}
182200
183- func ExtractUniqueWords (content string , lang string ) ([]string , error ) {
201+ func ExtractUniqueWords (content string , lang string , sizeHint int ) ([]string , error ) {
184202 r := strings .NewReader (content )
185- words , err := collectWords (r )
203+ words , err := collectWords (r , sizeHint )
186204 if err != nil {
187205 _ , _ = fmt .Fprintf (os .Stderr , `collectWords error: %s` , err )
188206 return nil , err
0 commit comments