Skip to content

Commit ca5bb38

Browse files
committed
Code clean up
1 parent 4232926 commit ca5bb38

File tree

5 files changed

+33
-111
lines changed

5 files changed

+33
-111
lines changed

example-golang/app/app.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
4444
dst := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")
4545

4646
wg.Add(1)
47-
go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
47+
go extract(src, dst, sortResults, spec.Tag, sem, &wg)
4848
}
4949

5050
wg.Wait()

example-golang/app/extract.go

Lines changed: 15 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,25 @@ import (
66
"fmt"
77
"io"
88
"os"
9-
"sort"
109
"sync"
1110
"unicode"
1211
"unicode/utf8"
13-
"unsafe"
1412

15-
"github.com/tidwall/collate"
13+
"github.com/cespare/xxhash/v2"
14+
"golang.org/x/text/collate"
15+
"golang.org/x/text/language"
1616
)
1717

18-
const (
19-
filePerm = 0644
20-
initialDictSize = 1e4
21-
)
18+
const filePerm = 0644
2219

23-
// splitWordsUnicode splits data into words, using Unicode Letter character class.
20+
// splitWordsFunc splits data into words, using Unicode Letter character class.
2421
// It works similarly to the regular expression "[^\p{L}]+". This is what was used
2522
// in the original code. The Unicode function has a slight overhead, but handles
2623
// UTF-8 correctly.
2724
//
2825
// Rust and Python versions split text according to "[\W\d]+" - any run of
2926
// non-word characters or digits. WTF?
30-
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
27+
func splitWordsFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
3128
var start int
3229
var r rune
3330
for width := 0; start < len(data); start += width {
@@ -49,52 +46,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
4946
return start, nil, nil
5047
}
5148

52-
// splitWords splits data into words similar to the "[\W\d]+" regular expression.
53-
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
54-
var start int
55-
var r rune
56-
for width := 0; start < len(data); start += width {
57-
if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
58-
break
59-
}
60-
}
61-
62-
for width, i := 0, start; i < len(data); i += width {
63-
if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
64-
return i + width, data[start:i], nil
65-
}
66-
}
67-
68-
if atEOF && len(data) > start {
69-
return len(data), data[start:], nil
70-
}
71-
72-
return start, nil, nil
73-
}
74-
75-
func isLatin(r rune) bool {
76-
if r >= 0x80 || r == 0x00 {
77-
return false
78-
}
79-
80-
return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
81-
}
82-
83-
//go:noescape
84-
//go:linkname memhash runtime.memhash
85-
func memhash(p unsafe.Pointer, h, s uintptr) uintptr
86-
87-
type stringStruct struct {
88-
str unsafe.Pointer
89-
len int
90-
}
91-
92-
func memHashString(str string) uint64 {
93-
ss := (*stringStruct)(unsafe.Pointer(&str))
94-
return uint64(memhash(ss.str, 0, uintptr(ss.len)))
95-
}
96-
97-
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
49+
func extract(src, dst string, sortResults bool, tag language.Tag, sem <-chan empty, wg *sync.WaitGroup) {
9850
defer func() {
9951
<-sem
10052
wg.Done()
@@ -109,17 +61,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
10961

11062
// One possible optimisation here is to split the file into chunks and process
11163
// each chunk individually.
112-
words, err := collectWords(fd, initialDictSize)
64+
words, err := collectWords(fd)
11365
if err != nil {
11466
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
11567
return
11668
}
11769

11870
if sortResults {
119-
less := collate.IndexString(lang)
120-
sort.Slice(words, func(i, j int) bool {
121-
return less(words[i], words[j])
122-
})
71+
collator := collate.New(tag)
72+
collator.SortStrings(words)
12373
}
12474

12575
wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
@@ -145,23 +95,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
14595
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
14696
}
14797

148-
func collectWords(r io.Reader, sizeHint int) ([]string, error) {
98+
func collectWords(r io.Reader) ([]string, error) {
14999
scanner := bufio.NewScanner(r)
150-
scanner.Split(splitWords)
100+
scanner.Split(splitWordsFunc)
151101

152102
// map[uint64]empty should take less memory than map[string]empty and avoid
153103
// GC checks.
154-
//
155-
// sizeHint is used to preallocate map[string]empty and []string slice and skip
156-
// initial reallocation when they should grow. It is a "magic" number which
157-
// should not be too big or too small. Ideally, it should be approximated from
158-
// the text.
159-
dict := make(map[uint64]empty, sizeHint)
160-
words := make([]string, 0, sizeHint)
104+
dict := make(map[uint64]empty)
105+
words := make([]string, 0)
161106

162107
for scanner.Scan() {
163108
word := scanner.Text()
164-
hash := memHashString(word)
109+
hash := xxhash.Sum64String(word)
165110
if _, ok := dict[hash]; ok {
166111
continue // duplicate detected
167112
}

example-golang/app/spec.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ import (
44
"fmt"
55
"os"
66

7+
"golang.org/x/text/language"
78
"gopkg.in/yaml.v3"
89
)
910

1011
type MetaConfig struct {
11-
Lang string `yaml:"lang"`
12-
Code string `yaml:"code"`
13-
Label string `yaml:"label"`
12+
Lang string `yaml:"lang"`
13+
Code string `yaml:"code"`
14+
Label string `yaml:"label"`
15+
Tag language.Tag `yaml:"-"`
1416
}
1517

1618
func ReadSpec(filepath string) (*MetaConfig, error) {
@@ -24,5 +26,11 @@ func ReadSpec(filepath string) (*MetaConfig, error) {
2426
return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
2527
}
2628

29+
t, err := language.Parse(config.Lang)
30+
if err != nil {
31+
return nil, fmt.Errorf(`spec: invalid language code "%s": %w`, config.Code, err)
32+
}
33+
34+
config.Tag = t
2735
return &config, nil
2836
}

example-golang/go.mod

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,7 @@ module github.com/hipertracker/word_extractor
33
go 1.17
44

55
require (
6-
github.com/bmatcuk/doublestar v1.3.4
7-
github.com/stretchr/testify v1.7.0
8-
github.com/thoas/go-funk v0.9.1
9-
github.com/tidwall/collate v1.0.0
6+
github.com/cespare/xxhash/v2 v2.1.2
7+
golang.org/x/text v0.3.7
108
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
119
)
12-
13-
require (
14-
github.com/davecgh/go-spew v1.1.0 // indirect
15-
github.com/pmezard/go-difflib v1.0.0 // indirect
16-
github.com/tidwall/gjson v1.3.4 // indirect
17-
github.com/tidwall/match v1.0.1 // indirect
18-
github.com/tidwall/pretty v1.0.0 // indirect
19-
golang.org/x/text v0.3.2 // indirect
20-
)

example-golang/go.sum

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,9 @@
1-
github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
2-
github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
3-
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
4-
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5-
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6-
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7-
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
8-
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
9-
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
10-
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
11-
github.com/thoas/go-funk v0.9.1 h1:O549iLZqPpTUQ10ykd26sZhzD+rmR5pWhuElrhbC20M=
12-
github.com/thoas/go-funk v0.9.1/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
13-
github.com/tidwall/collate v1.0.0 h1:xgvwO2UunUoXx3NS3UqHBX63l248ZApqo7mUe3NHy6I=
14-
github.com/tidwall/collate v1.0.0/go.mod h1:S56qxEr2ALVCaGY41npreOJ5lBIILSrxYLgEpxoHVIk=
15-
github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw=
16-
github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
17-
github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
18-
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
19-
github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
20-
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
21-
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
22-
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
1+
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
2+
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
3+
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
4+
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
235
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
246
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
257
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
26-
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
27-
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
288
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
299
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

0 commit comments

Comments
 (0)