
Commit a0ea2ce

update Golang example
1 parent 3b5c81b commit a0ea2ce

12 files changed: 360 additions, 170 deletions


README.md

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ The following results are for 123 unique utf-8 Bible text files in 23 languages
 5. Elixir 1.13.2 = 7.82s
 6. Ruby 3.1.0 = 8.31s

-Golang 1.17 = UNDER REFACTORING, stay tuned
+Golang 1.17.6 = UNDER REFACTORING, stay tuned
 </pre>

 ### Conclusion

example-golang/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+/coverage.out

example-golang/.tool-versions

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-golang 1.18beta1
+golang 1.17.6

example-golang/Makefile

Lines changed: 10 additions & 2 deletions

@@ -8,14 +8,22 @@ build:
 run: build
 	./${BINARY_NAME}

+run-sort: build
+	./${BINARY_NAME} -n 10 -s
+
 test:
+	@go test ./... -v
+
+coverage:
 	@go test ./... -v -coverprofile=coverage.out
-
+
+
 cover: test
 	@go tool cover -html=coverage.out

 clean:
 	@go clean
 	rm -f coverage.out
 	rm -f ./${BINARY_NAME}
-	rm -rf ./words
+	rm -rf ./words
+

example-golang/README.md

Lines changed: 8 additions & 1 deletion

@@ -4,5 +4,12 @@

 ```
 make build
-GOGC=2000 ./main
+./main -n 8
 ```
+
+<pre>
+Usage of ./main:
+  -n int
+    	Number of workers to run (zero to match the number of available CPUs) (default 10)
+  -s	Sort results
+</pre>
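
The usage text above comes from Go's standard flag package. main.go itself is not part of this commit, so the following is only a hypothetical sketch of how the -n and -s flags might be wired to app.Run; the input pattern and output directory are placeholders, not paths taken from this repository.

// Hypothetical main.go wiring for the flags shown in the usage text above.
// Not part of this commit; the input pattern and output directory are placeholders.
package main

import (
	"flag"
	"log"
	"runtime"

	"wordextractor/app"
)

func main() {
	numWorkers := flag.Int("n", 10, "Number of workers to run (zero to match the number of available CPUs)")
	sortResults := flag.Bool("s", false, "Sort results")
	flag.Parse()

	if *numWorkers <= 0 {
		*numWorkers = runtime.NumCPU()
	}

	// "data/*/*.yaml" and "words" are placeholder paths.
	if err := app.Run("data/*/*.yaml", "words", *numWorkers, *sortResults); err != nil {
		log.Fatal(err)
	}
}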

example-golang/app/app.go

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
package app

import (
	"fmt"
	"os"
	"path/filepath"
	"sync"
)

const dirPerms = 0755

type empty struct{}

// Run extracts unique words from the list of files and saves them to the outDir.
// No error handling, no context cancellation is implemented to match implementations
// in other languages.
func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
	files, err := filepath.Glob(srcDir)
	if err != nil {
		return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
	}

	if err = clearOutput(outDir); err != nil {
		return err
	}

	// This is a very basic semaphore implementation. Counting unique words from
	// a stream of data is IO, memory and CPU expensive. The semaphore lets up to
	// numWorkers workers run concurrently and, by default, this number matches
	// the number of CPUs.
	sem := make(chan empty, numWorkers)

	var wg sync.WaitGroup
	var spec *MetaConfig

	for _, file := range files {
		sem <- empty{}

		if spec, err = ReadSpec(file); err != nil {
			return err
		}

		src := file[:len(file)-3] + "txt"
		dst := filepath.Join(outDir, spec.Lang+"-"+spec.Code+".txt")

		wg.Add(1)
		go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
	}

	wg.Wait()
	close(sem)

	return nil
}

func clearOutput(outDir string) error {
	if err := os.RemoveAll(outDir); err != nil {
		return fmt.Errorf(`app: cleaning previous results in "%s": %w`, outDir, err)
	}
	if err := os.MkdirAll(outDir, dirPerms); err != nil {
		return fmt.Errorf(`app: creating output directory "%s": %w`, outDir, err)
	}

	return nil
}
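
The buffered channel in Run is the standard Go channel-as-semaphore idiom for bounding concurrency. A minimal standalone sketch of the same pattern, for illustration only (not part of this commit):

// Channel-as-semaphore: at most numWorkers goroutines run at the same time.
package main

import (
	"fmt"
	"sync"
)

func main() {
	const numWorkers = 3

	sem := make(chan struct{}, numWorkers)
	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		sem <- struct{}{} // blocks while all numWorkers slots are taken
		wg.Add(1)

		go func(n int) {
			defer func() {
				<-sem // release the slot
				wg.Done()
			}()
			fmt.Println("processing item", n)
		}(i)
	}

	wg.Wait()
	close(sem)
}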

example-golang/app/extract.go

Lines changed: 195 additions & 0 deletions

@@ -0,0 +1,195 @@
package app

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"os"
	"sort"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"

	"github.com/tidwall/collate"
)

const filePerm = 0644

// splitWordsUnicode splits data into words using the Unicode Letter character class.
// It works similarly to the regular expression "[^\p{L}]+", which is what was used
// in the original code. The Unicode functions have a slight overhead, but handle
// UTF-8 correctly.
//
// The Rust and Python versions split text according to "[\W\d]+" - anything that is
// not a word character or is a digit. WTF?
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
	var start int
	var r rune
	for width := 0; start < len(data); start += width {
		if r, width = utf8.DecodeRune(data[start:]); unicode.IsLetter(r) {
			break
		}
	}

	for width, i := 0, start; i < len(data); i += width {
		if r, width = utf8.DecodeRune(data[i:]); !unicode.IsLetter(r) {
			return i + width, data[start:i], nil
		}
	}

	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}

	return start, nil, nil
}

// splitWords splits data into words similar to the "[\W\d]+" regular expression.
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	var start int
	var r rune
	for width := 0; start < len(data); start += width {
		if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
			break
		}
	}

	for width, i := 0, start; i < len(data); i += width {
		if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
			return i + width, data[start:i], nil
		}
	}

	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}

	return start, nil, nil
}

func isLatin(r rune) bool {
	if r >= 0x80 || r == 0x00 {
		return false
	}

	return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
}

func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
	defer func() {
		<-sem
		wg.Done()
	}()

	fd, err := os.Open(src)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening source file "%s" for reading: %s`, src, err)
		return
	}
	defer fd.Close()

	// One of the possible optimisations here is to split the file into chunks and
	// process each chunk individually.
	words, err := collectWords(fd)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
		return
	}

	if sortResults {
		less := collate.IndexString(lang)
		sort.Slice(words, func(i, j int) bool {
			return less(words[i], words[j])
		})
	}

	wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening destination file "%s" for writing: %s`, dst, err)
		return
	}
	defer wd.Close()

	// Writing word by word can result in too many writes and, hence, is slow.
	// Let's add some steroids ...
	wr := bufio.NewWriter(wd)

	if err = writeResults(wr, words); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}
	if err = wr.Flush(); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}

	// _, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
}

func collectWords(r io.Reader) ([]string, error) {
	scanner := bufio.NewScanner(r)

	scanner.Split(splitWordsUnicode)

	dict := make(map[string]empty)
	for scanner.Scan() {
		word := strings.ToLower(scanner.Text())
		if _, ok := dict[word]; ok {
			continue // duplicate detected
		}

		dict[word] = empty{}

		// Theoretically, if sorting is not needed, we can write right here and
		// skip the words slice preparation below.
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	// This is expensive ...
	words := make([]string, len(dict))

	var i int
	for w := range dict {
		words[i] = w
		i++

		delete(dict, w)
	}

	return words, nil
}

func writeResults(w io.Writer, words []string) error {
	// This is to preallocate memory once for the "string => []byte + \n" conversion
	// and reuse it on every iteration.
	var buf bytes.Buffer
	for _, word := range words {
		buf.WriteString(word)
		buf.WriteRune('\n')

		if _, err := buf.WriteTo(w); err != nil {
			return err
		}

		buf.Reset()
	}

	return nil
}

// ExtractUniqueWords tokenizes content into unique words sorted with the
// collation rules for lang.
func ExtractUniqueWords(content string, lang string) ([]string, error) {
	r := strings.NewReader(content)
	words, err := collectWords(r)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
		return nil, err
	}

	less := collate.IndexString(lang)
	sort.Slice(words, func(i, j int) bool {
		return less(words[i], words[j])
	})

	return words, nil
}
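
collectWords plugs splitWordsUnicode into bufio.Scanner as a custom SplitFunc, so each call to Scan yields one word rather than one line. A self-contained sketch of the same mechanism, using a simplified split function rather than the one from this commit:

// Demonstration of a custom bufio.SplitFunc: tokens are maximal runs of
// Unicode letters. Simplified illustration, not part of this commit.
package main

import (
	"bufio"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

func letters(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip any leading non-letters.
	start := 0
	for start < len(data) {
		r, w := utf8.DecodeRune(data[start:])
		if unicode.IsLetter(r) {
			break
		}
		start += w
	}

	// Emit the run of letters that follows.
	for i := start; i < len(data); {
		r, w := utf8.DecodeRune(data[i:])
		if !unicode.IsLetter(r) {
			return i + w, data[start:i], nil
		}
		i += w
	}

	// At EOF, flush whatever is left; otherwise ask the scanner for more data.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	return start, nil, nil
}

func main() {
	sc := bufio.NewScanner(strings.NewReader("ćma, cześć; żółw!"))
	sc.Split(letters)
	for sc.Scan() {
		fmt.Println(sc.Text()) // ćma, cześć, żółw - one word per line
	}
}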

example-golang/app/spec.go

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
package app

import (
	"fmt"
	"os"

	"gopkg.in/yaml.v3"
)

// MetaConfig describes a single source text: its language, short code and
// human-readable label.
type MetaConfig struct {
	Lang  string `yaml:"lang"`
	Code  string `yaml:"code"`
	Label string `yaml:"label"`
}

// ReadSpec reads and parses a YAML spec file into a MetaConfig.
func ReadSpec(filepath string) (*MetaConfig, error) {
	p, err := os.ReadFile(filepath)
	if err != nil {
		return nil, fmt.Errorf(`spec: reading YAML file "%s": %w`, filepath, err)
	}

	var config MetaConfig
	if err = yaml.Unmarshal(p, &config); err != nil {
		return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
	}

	return &config, nil
}
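
A hypothetical caller of ReadSpec; the path and the field values shown in the comment are placeholders rather than files from this repository.

// Hypothetical usage of ReadSpec; not part of this commit.
package main

import (
	"fmt"
	"log"

	"wordextractor/app"
)

func main() {
	// A spec file is expected to contain the three yaml-tagged fields of
	// MetaConfig, roughly:
	//
	//	lang: pl
	//	code: xyz
	//	label: Example translation
	spec, err := app.ReadSpec("data/pl/xyz.yaml") // placeholder path
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(spec.Lang, spec.Code, spec.Label)
}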

example-golang/app_test.go

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
package main

import (
	"testing"
	"wordextractor/app"

	"github.com/stretchr/testify/assert"
)

func Test_ExtractUniqueWords(t *testing.T) {
	text := "ćma cześć ser. śmiech!żółw zebra-łuk len Ćma Żółw ser"
	expected := []string{"cześć", "ćma", "len", "łuk", "ser", "śmiech", "zebra", "żółw"}

	given, err := app.ExtractUniqueWords(text, "pl")
	if err != nil {
		// Fail the test instead of silently returning on error.
		t.Fatalf("ExtractUniqueWords error: %s", err)
	}

	assert.Equal(t, expected, given, "text should be tokenized into unique words")
}

example-golang/go.mod

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
-module github.com/hipertracker/word_extractor
+module wordextractor

-go 1.18
+go 1.17

 require (
 	github.com/bmatcuk/doublestar v1.3.4
