@@ -6,29 +6,26 @@ import (
66 "fmt"
77 "io"
88 "os"
9- "sort"
109 "strings"
1110 "sync"
1211 "unicode"
1312 "unicode/utf8"
14- "unsafe"
1513
16- "github.com/tidwall/collate"
14+ "github.com/cespare/xxhash/v2"
15+ "golang.org/x/text/collate"
16+ "golang.org/x/text/language"
1717)
1818
19- const (
20- filePerm = 0644
21- InitialDictSize = 10000
22- )
19+ const filePerm = 0644
2320
24- // splitWordsUnicode splits data into words, using Unicode Letter character class.
21+ // splitWordsFunc splits data into words, using Unicode Letter character class.
2522// It works similar to the regular expression "[^\p{L}]+". This is what was used
2623// in the original code. Unicode function has slight overhead, but handles UTF-8
2724// correctly.
2825//
2926// Rust and Python versions split text according to "[\W\d]+" - anything that is
30- // not a word or a digit. TODO: comfirm if some words contain digits
31- func splitWordsUnicode (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
27+ // not a word or a digit. WTF?
28+ func splitWordsFunc (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
3229 var start int
3330 var r rune
3431 for width := 0 ; start < len (data ); start += width {
@@ -50,52 +47,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
5047 return start , nil , nil
5148}
5249
53- // splitWords splits data into words similar to the "[\W\d]+" regular expression.
54- func splitWords (data []byte , atEOF bool ) (advance int , token []byte , err error ) {
55- var start int
56- var r rune
57- for width := 0 ; start < len (data ); start += width {
58- if r , width = utf8 .DecodeRune (data [start :]); isLatin (r ) {
59- break
60- }
61- }
62-
63- for width , i := 0 , start ; i < len (data ); i += width {
64- if r , width = utf8 .DecodeRune (data [i :]); ! isLatin (r ) {
65- return i + width , data [start :i ], nil
66- }
67- }
68-
69- if atEOF && len (data ) > start {
70- return len (data ), data [start :], nil
71- }
72-
73- return start , nil , nil
74- }
75-
76- func isLatin (r rune ) bool {
77- if r >= 0x80 || r == 0x00 {
78- return false
79- }
80-
81- return ('a' <= r && r <= 'z' ) || ('A' <= r && r <= 'Z' )
82- }
83-
84- //go:noescape
85- //go:linkname memhash runtime.memhash
86- func memhash (p unsafe.Pointer , h , s uintptr ) uintptr
87-
88- type stringStruct struct {
89- str unsafe.Pointer
90- len int
91- }
92-
93- func memHashString (str string ) uint64 {
94- ss := (* stringStruct )(unsafe .Pointer (& str ))
95- return uint64 (memhash (ss .str , 0 , uintptr (ss .len )))
96- }
97-
98- func extract (src , dst , lang string , sortResults bool , sem <- chan empty , wg * sync.WaitGroup ) {
50+ func extract (src , dst string , sortResults bool , tag language.Tag , sem <- chan empty , wg * sync.WaitGroup ) {
9951 defer func () {
10052 <- sem
10153 wg .Done ()
@@ -110,17 +62,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
11062
11163 // One of the possible optimisations here is to split file in chunks and process
11264 // each chunk individually.
113- words , err := collectWords (fd , lang , InitialDictSize )
65+ words , err := collectWords (fd )
11466 if err != nil {
11567 _ , _ = fmt .Fprintf (os .Stderr , `extract: reading input "%s": %s` , src , err )
11668 return
11769 }
11870
11971 if sortResults {
120- less := collate .IndexString (lang )
121- sort .Slice (words , func (i , j int ) bool {
122- return less (words [i ], words [j ])
123- })
72+ collator := collate .New (tag )
73+ collator .SortStrings (words )
12474 }
12575
12676 wd , err := os .OpenFile (dst , os .O_WRONLY | os .O_CREATE | os .O_TRUNC , filePerm )
@@ -146,28 +96,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
14696 _ , _ = fmt .Fprintf (os .Stdout , "Saved %s\n " , dst )
14797}
14898
149- func collectWords (r io.Reader , lang string , sizeHint int ) ([]string , error ) {
99+ func collectWords (r io.Reader ) ([]string , error ) {
150100 scanner := bufio .NewScanner (r )
151- ascii := []string {"en" , "la" , "eo" } // English, Latin, Esperanto
152- if stringInSlice (lang , ascii ) {
153- scanner .Split (splitWords )
154- } else {
155- scanner .Split (splitWordsUnicode )
156- }
101+ scanner .Split (splitWordsFunc )
157102
158103 // map[uint64]empty should take less memory than map[string]empty and avoid
159104 // GC checks.
160- //
161- // sizeHint is used to preallocate map[string]empty and []string slice and skip
162- // initial reallocation when they should grow. It is a "magic" number which
163- // should not be too big or too small. Ideally, it should be approximated from
164- // the text.
165- dict := make (map [uint64 ]empty , sizeHint )
166- words := make ([]string , 0 , sizeHint )
105+ dict := make (map [uint64 ]empty )
106+ words := make ([]string , 0 )
167107
168108 for scanner .Scan () {
169109 word := strings .ToLower (scanner .Text ())
170- hash := memHashString (word )
110+ hash := xxhash . Sum64String (word )
171111 if _ , ok := dict [hash ]; ok {
172112 continue // duplicate detected
173113 }
@@ -202,27 +142,3 @@ func writeResults(w io.Writer, words []string) error {
202142
203143 return nil
204144}
205-
206- func ExtractUniqueWords (content string , lang string , sizeHint int ) ([]string , error ) {
207- r := strings .NewReader (content )
208- words , err := collectWords (r , lang , sizeHint )
209-
210- if err != nil {
211- _ , _ = fmt .Fprintf (os .Stderr , `collectWords error: %s` , err )
212- return nil , err
213- }
214- less := collate .IndexString (lang )
215- sort .Slice (words , func (i , j int ) bool {
216- return less (words [i ], words [j ])
217- })
218- return words , nil
219- }
220-
221- func stringInSlice (a string , list []string ) bool {
222- for _ , b := range list {
223- if b == a {
224- return true
225- }
226- }
227- return false
228- }
0 commit comments