@@ -6,28 +6,25 @@ import (
66 "fmt"
77 "io"
88 "os"
9- "sort"
109 "sync"
1110 "unicode"
1211 "unicode/utf8"
13- "unsafe"
1412
15- "github.com/tidwall/collate"
13+ "github.com/cespare/xxhash/v2"
14+ "golang.org/x/text/collate"
15+ "golang.org/x/text/language"
1616)
1717
18- const (
19- filePerm = 0644
20- initialDictSize = 1e4
21- )
18+ const filePerm = 0644
2219
23- // splitWordsUnicode splits data into words, using Unicode Letter character class.
20+ // splitWordsFunc splits data into words, using Unicode Letter character class.
2421// It works similarly to splitting on the regular expression "[^\p{L}]+", which
2522// is what the original code used. The Unicode function has a slight overhead,
2623// but handles UTF-8 correctly.
2724//
2825// Rust and Python versions split text on "[\W\d]+" - runs of non-word characters
2926// or digits - so, unlike this function, they keep "_" inside words.
30- func splitWordsUnicode (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
27+ func splitWordsFunc (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
3128 var start int
3229 var r rune
3330 for width := 0 ; start < len (data ); start += width {
@@ -49,52 +46,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
4946 return start , nil , nil
5047}
5148
52- // splitWords splits data into words similar to the "[\W\d]+" regular expression.
53- func splitWords (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
54- var start int
55- var r rune
56- for width := 0 ; start < len (data ); start += width {
57- if r , width = utf8 .DecodeRune (data [start :]); isLatin (r ) {
58- break
59- }
60- }
61-
62- for width , i := 0 , start ; i < len (data ); i += width {
63- if r , width = utf8 .DecodeRune (data [i :]); ! isLatin (r ) {
64- return i + width , data [start :i ], nil
65- }
66- }
67-
68- if atEOF && len (data ) > start {
69- return len (data ), data [start :], nil
70- }
71-
72- return start , nil , nil
73- }
74-
75- func isLatin (r rune ) bool {
76- if r >= 0x80 || r == 0x00 {
77- return false
78- }
79-
80- return ('a' <= r && r <= 'z' ) || ('A' <= r && r <= 'Z' )
81- }
82-
83- //go:noescape
84- //go:linkname memhash runtime.memhash
85- func memhash (p unsafe.Pointer , h , s uintptr ) uintptr
86-
87- type stringStruct struct {
88- str unsafe.Pointer
89- len int
90- }
91-
92- func memHashString (str string ) uint64 {
93- ss := (* stringStruct )(unsafe .Pointer (& str ))
94- return uint64 (memhash (ss .str , 0 , uintptr (ss .len )))
95- }
96-
97- func extract (src , dst , lang string , sortResults bool , sem <- chan empty , wg * sync.WaitGroup ) {
49+ func extract (src , dst string , sortResults bool , tag language.Tag , sem <- chan empty , wg * sync.WaitGroup ) {
9850 defer func () {
9951 <- sem
10052 wg .Done ()
@@ -109,17 +61,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
10961
11062 // One of the possible optimisations here is to split file in chunks and process
11163 // each chunk individually.
112- words , err := collectWords (fd , initialDictSize )
64+ words , err := collectWords (fd )
11365 if err != nil {
11466 _ , _ = fmt .Fprintf (os .Stderr , `extract: reading input "%s": %s` , src , err )
11567 return
11668 }
11769
11870 if sortResults {
119- less := collate .IndexString (lang )
120- sort .Slice (words , func (i , j int ) bool {
121- return less (words [i ], words [j ])
122- })
71+ collator := collate .New (tag )
72+ collator .SortStrings (words )
12373 }
12474
12575 wd , err := os .OpenFile (dst , os .O_WRONLY | os .O_CREATE | os .O_TRUNC , filePerm )
@@ -145,23 +95,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
14595 _ , _ = fmt .Fprintf (os .Stdout , "Saved %s\n " , dst )
14696}
14797
148- func collectWords (r io.Reader , sizeHint int ) ([]string , error ) {
98+ func collectWords (r io.Reader ) ([]string , error ) {
14999 scanner := bufio .NewScanner (r )
150- scanner .Split (splitWords )
100+ scanner .Split (splitWordsFunc )
151101
152102 // map[uint64]empty should take less memory than map[string]empty and avoid
153103 // GC checks.
154- //
155- // sizeHint is used to preallocate map[string]empty and []string slice and skip
156- // initial reallocation when they should grow. It is a "magic" number which
157- // should not be too big or too small. Ideally, it should be approximated from
158- // the text.
159- dict := make (map [uint64 ]empty , sizeHint )
160- words := make ([]string , 0 , sizeHint )
104+ dict := make (map [uint64 ]empty )
105+ words := make ([]string , 0 )
161106
162107 for scanner .Scan () {
163108 word := scanner .Text ()
164- hash := memHashString (word )
109+ hash := xxhash . Sum64String (word )
165110 if _ , ok := dict [hash ]; ok {
166111 continue // duplicate detected
167112 }
0 commit comments