@@ -10,11 +10,15 @@ import (
1010 "sync"
1111 "unicode"
1212 "unicode/utf8"
13+ "unsafe"
1314
1415 "github.com/tidwall/collate"
1516)
1617
17- const filePerm = 0644
18+ const (
19+ filePerm = 0644 // Unix permission bits rw-r--r--; presumably applied to the written output files — confirm at the use site
20+ initialDictSize = 1e4 // size hint handed to collectWords, which uses it as the initial capacity of its dedup map and result slice
21+ )
1822
1923// splitWordsUnicode splits data into words, using Unicode Letter character class.
2024// It works similar to the regular expression "[^\p{L}]+". This is what was used
@@ -76,6 +80,20 @@ func isLatin(r rune) bool {
7680 return ('a' <= r && r <= 'z' ) || ('A' <= r && r <= 'Z' )
7781}
7882
// memhash is the Go runtime's internal memory hash (runtime.memhash), pulled
// into this package with go:linkname. It hashes the s bytes starting at p,
// mixing in the seed h, and returns a pointer-sized result.
//
// The declaration has no body on purpose: the implementation lives in the
// runtime. Being a runtime internal, it is not covered by the Go compatibility
// promise and may change between Go releases.
//
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// stringStruct mirrors the in-memory layout of a Go string header (data
// pointer followed by byte length), so that a string's bytes can be handed to
// memhash without copying them.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// memHashString returns a 64-bit hash of str computed with the runtime hasher.
//
// Equal strings always produce equal hashes within one process. The reverse
// does not hold: two different strings can collide, so callers that use the
// hash as the only identity of a string accept a small chance of confusing
// distinct strings.
//
// The runtime keys its hash with per-process data, so the values are
// presumably not stable across runs and should not be persisted.
func memHashString(str string) uint64 {
	ss := (*stringStruct)(unsafe.Pointer(&str))
	size := uintptr(ss.len)

	lo := memhash(ss.str, 0, size)
	if unsafe.Sizeof(lo) == 8 {
		// 64-bit platform: the hasher already yields 64 significant bits.
		return uint64(lo)
	}

	// 32-bit platform: a single memhash call yields only 32 bits, which makes
	// collisions likely after a few tens of thousands of distinct strings.
	// Run a second hasher with a different seed and combine both halves, the
	// same scheme hash/maphash uses on 32-bit targets.
	hi := memhash(ss.str, 1, size)
	return uint64(hi)<<32 | uint64(lo)
}
96+
7997func extract (src , dst , lang string , sortResults bool , sem <- chan empty , wg * sync.WaitGroup ) {
8098 defer func () {
8199 <- sem
@@ -91,7 +109,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
91109
92110 // One of the possible optimisations here is to split file in chunks and process
93111 // each chunk individually.
94- words , err := collectWords (fd )
112+ words , err := collectWords (fd , initialDictSize )
95113 if err != nil {
96114 _ , _ = fmt .Fprintf (os .Stderr , `extract: reading input "%s": %s` , src , err )
97115 return
@@ -127,18 +145,29 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
127145 _ , _ = fmt .Fprintf (os .Stdout , "Saved %s\n " , dst )
128146}
129147
130- func collectWords (r io.Reader ) ([]string , error ) {
148+ func collectWords (r io.Reader , sizeHint int ) ([]string , error ) {
131149 scanner := bufio .NewScanner (r )
132- scanner .Split (splitWordsUnicode )
150+ scanner .Split (splitWords )
151+
152+ // map[uint64]empty should take less memory than map[string]empty and, holding
153+ // no pointers, should not need to be scanned by the GC.
154+ //
155+ // sizeHint is used to preallocate the map[uint64]empty and the []string slice
156+ // and to skip the initial reallocations as they grow. It is a "magic" number
157+ // which should be neither too big nor too small. Ideally, it should be
158+ // approximated from the text.
159+ dict := make (map [uint64 ]empty , sizeHint )
160+ words := make ([]string , 0 , sizeHint )
133161
134- dict := make (map [string ]empty )
135162 for scanner .Scan () {
136163 word := scanner .Text ()
137- if _ , ok := dict [word ]; ok {
164+ hash := memHashString (word )
165+ if _ , ok := dict [hash ]; ok {
138166 continue // duplicate detected
139167 }
140168
141- dict [word ] = empty {}
169+ dict [hash ] = empty {}
170+ words = append (words , word )
142171
143172 // Theoretically, if sorting is not needed, we can write right here and
144173 // skip collecting the words into the slice above.
@@ -147,17 +176,6 @@ func collectWords(r io.Reader) ([]string, error) {
147176 return nil , err
148177 }
149178
150- // This is expensive ...
151- words := make ([]string , len (dict ))
152-
153- var i int
154- for w := range dict {
155- words [i ] = w
156- i ++
157-
158- delete (dict , w )
159- }
160-
161179 return words , nil
162180}
163181
0 commit comments