Skip to content

Commit b4b3e69

Browse files
committed
improve golang code
1 parent 4fa4111 commit b4b3e69

File tree

6 files changed

+32
-145
lines changed

6 files changed

+32
-145
lines changed

example-golang/app/app.go

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
4646
dst := filepath.Join(outDir, spec.Lang+"-"+spec.Code+".txt")
4747

4848
wg.Add(1)
49-
go extract(src, dst, langMap[spec.Lang], sortResults, sem, &wg)
49+
go extract(src, dst, sortResults, spec.Tag, sem, &wg)
5050
}
5151

5252
wg.Wait()
@@ -65,29 +65,3 @@ func clearOutput(outDir string) error {
6565

6666
return nil
6767
}
68-
69-
var langMap = map[string]string{
70-
"en": "ENGLISH_CI", // The first language is used as fallback.
71-
"la": "ENGLISH_CI", // Latin
72-
"eo": "ENGLISH_CI", // Esperanto
73-
"ar": "ARABIC_CI",
74-
"cz": "CZECH_CI",
75-
"da": "DANISH_CI", // ?
76-
"de": "GERMAN_CI",
77-
"el": "GREEK_CI",
78-
"es": "SPANISH_CI",
79-
"fi": "FINNISH_CI",
80-
"fr": "FRENCH_CI",
81-
"he": "HEBREW_CI",
82-
"hr": "CROATIAN_CI",
83-
"hu": "HUNGARIAN_CI",
84-
"it": "ITALIAN_CI",
85-
"lt": "LITHUANIAN_CI",
86-
"nl": "DUTCH_CI",
87-
"pl": "POLISH_CI",
88-
"pt": "PORTUGUESE_CI",
89-
"ru": "RUSSIAN_CI",
90-
"sk": "SLOVAK_CI",
91-
"sv": "SWEDISH_CI",
92-
"uk": "UKRAINIAN_CI",
93-
}

example-golang/app/extract.go

Lines changed: 16 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,26 @@ import (
66
"fmt"
77
"io"
88
"os"
9-
"sort"
109
"strings"
1110
"sync"
1211
"unicode"
1312
"unicode/utf8"
14-
"unsafe"
1513

16-
"github.com/tidwall/collate"
14+
"github.com/cespare/xxhash/v2"
15+
"golang.org/x/text/collate"
16+
"golang.org/x/text/language"
1717
)
1818

19-
const (
20-
filePerm = 0644
21-
InitialDictSize = 10000
22-
)
19+
const filePerm = 0644
2320

24-
// splitWordsUnicode splits data into words, using Unicode Letter character class.
21+
// splitWordsFunc splits data into words, using Unicode Letter character class.
2522
// It works similar to the regular expression "[^\p{L}]+". This is what was used
2623
// in the original code. Unicode function has slight overhead, but handles UTF-8
2724
// correctly.
2825
//
2926
// Rust and Python versions split text according to "[\W\d]+" - anything that is
30-
// not a word or a digit. TODO: confirm if some words contain digits
31-
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
27+
// not a word character, or is a digit (rationale unclear).
28+
func splitWordsFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
3229
var start int
3330
var r rune
3431
for width := 0; start < len(data); start += width {
@@ -50,52 +47,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
5047
return start, nil, nil
5148
}
5249

53-
// splitWords splits data into words similar to the "[\W\d]+" regular expression.
54-
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
55-
var start int
56-
var r rune
57-
for width := 0; start < len(data); start += width {
58-
if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
59-
break
60-
}
61-
}
62-
63-
for width, i := 0, start; i < len(data); i += width {
64-
if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
65-
return i + width, data[start:i], nil
66-
}
67-
}
68-
69-
if atEOF && len(data) > start {
70-
return len(data), data[start:], nil
71-
}
72-
73-
return start, nil, nil
74-
}
75-
76-
func isLatin(r rune) bool {
77-
if r >= 0x80 || r == 0x00 {
78-
return false
79-
}
80-
81-
return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
82-
}
83-
84-
//go:noescape
85-
//go:linkname memhash runtime.memhash
86-
func memhash(p unsafe.Pointer, h, s uintptr) uintptr
87-
88-
type stringStruct struct {
89-
str unsafe.Pointer
90-
len int
91-
}
92-
93-
func memHashString(str string) uint64 {
94-
ss := (*stringStruct)(unsafe.Pointer(&str))
95-
return uint64(memhash(ss.str, 0, uintptr(ss.len)))
96-
}
97-
98-
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
50+
func extract(src, dst string, sortResults bool, tag language.Tag, sem <-chan empty, wg *sync.WaitGroup) {
9951
defer func() {
10052
<-sem
10153
wg.Done()
@@ -110,17 +62,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
11062

11163
// One of the possible optimisations here is to split file in chunks and process
11264
// each chunk individually.
113-
words, err := collectWords(fd, lang, InitialDictSize)
65+
words, err := collectWords(fd)
11466
if err != nil {
11567
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
11668
return
11769
}
11870

11971
if sortResults {
120-
less := collate.IndexString(lang)
121-
sort.Slice(words, func(i, j int) bool {
122-
return less(words[i], words[j])
123-
})
72+
collator := collate.New(tag)
73+
collator.SortStrings(words)
12474
}
12575

12676
wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
@@ -146,28 +96,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
14696
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
14797
}
14898

149-
func collectWords(r io.Reader, lang string, sizeHint int) ([]string, error) {
99+
func collectWords(r io.Reader) ([]string, error) {
150100
scanner := bufio.NewScanner(r)
151-
ascii := []string{"en", "la", "eo"} // English, Latin, Esperanto
152-
if stringInSlice(lang, ascii) {
153-
scanner.Split(splitWords)
154-
} else {
155-
scanner.Split(splitWordsUnicode)
156-
}
101+
scanner.Split(splitWordsFunc)
157102

158103
// map[uint64]empty should take less memory than map[string]empty and avoid
159104
// GC checks.
160-
//
161-
// sizeHint is used to preallocate map[string]empty and []string slice and skip
162-
// initial reallocation when they should grow. It is a "magic" number which
163-
// should not be too big or too small. Ideally, it should be approximated from
164-
// the text.
165-
dict := make(map[uint64]empty, sizeHint)
166-
words := make([]string, 0, sizeHint)
105+
dict := make(map[uint64]empty)
106+
words := make([]string, 0)
167107

168108
for scanner.Scan() {
169109
word := strings.ToLower(scanner.Text())
170-
hash := memHashString(word)
110+
hash := xxhash.Sum64String(word)
171111
if _, ok := dict[hash]; ok {
172112
continue // duplicate detected
173113
}
@@ -202,27 +142,3 @@ func writeResults(w io.Writer, words []string) error {
202142

203143
return nil
204144
}
205-
206-
func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
207-
r := strings.NewReader(content)
208-
words, err := collectWords(r, lang, sizeHint)
209-
210-
if err != nil {
211-
_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
212-
return nil, err
213-
}
214-
less := collate.IndexString(lang)
215-
sort.Slice(words, func(i, j int) bool {
216-
return less(words[i], words[j])
217-
})
218-
return words, nil
219-
}
220-
221-
func stringInSlice(a string, list []string) bool {
222-
for _, b := range list {
223-
if b == a {
224-
return true
225-
}
226-
}
227-
return false
228-
}

example-golang/app/spec.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ import (
44
"fmt"
55
"os"
66

7+
"golang.org/x/text/language"
78
"gopkg.in/yaml.v3"
89
)
910

1011
type MetaConfig struct {
11-
Lang string `yaml:"lang"`
12-
Code string `yaml:"code"`
13-
Label string `yaml:"label"`
12+
Lang string `yaml:"lang"`
13+
Code string `yaml:"code"`
14+
Label string `yaml:"label"`
15+
Tag language.Tag `yaml:"-"`
1416
}
1517

1618
func ReadSpec(filepath string) (*MetaConfig, error) {
@@ -24,5 +26,11 @@ func ReadSpec(filepath string) (*MetaConfig, error) {
2426
return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
2527
}
2628

29+
t, err := language.Parse(config.Lang)
30+
if err != nil {
31+
return nil, fmt.Errorf(`spec: invalid language code "%s": %w`, config.Code, err)
32+
}
33+
34+
config.Tag = t
2735
return &config, nil
2836
}

example-golang/go.mod

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,13 @@ go 1.17
44

55
require (
66
github.com/bmatcuk/doublestar v1.3.4
7+
github.com/cespare/xxhash/v2 v2.1.2
78
github.com/stretchr/testify v1.7.0
8-
github.com/tidwall/collate v1.0.0
9+
golang.org/x/text v0.3.2
910
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
1011
)
1112

1213
require (
1314
github.com/davecgh/go-spew v1.1.0 // indirect
1415
github.com/pmezard/go-difflib v1.0.0 // indirect
15-
github.com/tidwall/gjson v1.3.4 // indirect
16-
github.com/tidwall/match v1.0.1 // indirect
17-
github.com/tidwall/pretty v1.0.0 // indirect
18-
golang.org/x/text v0.3.2 // indirect
1916
)

example-golang/go.sum

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
11
github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
22
github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
3+
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
4+
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
35
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
46
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
57
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
68
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
79
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
810
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
911
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
10-
github.com/tidwall/collate v1.0.0 h1:xgvwO2UunUoXx3NS3UqHBX63l248ZApqo7mUe3NHy6I=
11-
github.com/tidwall/collate v1.0.0/go.mod h1:S56qxEr2ALVCaGY41npreOJ5lBIILSrxYLgEpxoHVIk=
12-
github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw=
13-
github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
14-
github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
15-
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
16-
github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
17-
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
1812
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
1913
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
2014
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

example-golang/main.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ func main() {
2424

2525
defaultNumWorkers := runtime.NumCPU()
2626

27-
// In the original Go code, results were always sorted, unlike in Rust or
28-
// Python implementations. Sorting is turned off, by default.
2927
var sortResults bool
3028
var numWorkers int
3129

0 commit comments

Comments
 (0)