Skip to content

Commit c1f3c66

Browse files
committed
Golang example refactoring.
1 parent 45358d2 commit c1f3c66

File tree

9 files changed

+303
-196
lines changed

9 files changed

+303
-196
lines changed

example-golang/.tool-versions

Lines changed: 0 additions & 1 deletion
This file was deleted.

example-golang/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44

55
```
66
make build
7-
GOGC=2000 ./main
7+
./main [-n=NUMBER_OF_WORKERS, integer] [-s]
88
```

example-golang/app/app.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package app
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"path/filepath"
7+
"sync"
8+
)
9+
10+
// dirPerms is the permission mode passed to os.MkdirAll when the output
// directory is created.
const dirPerms = 0755
11+
12+
// empty is a zero-size placeholder type. It is used as the element type of the
// semaphore channel in Run and as the value type of the word set in
// collectWords.
type empty struct{}
13+
14+
// Run extracts unique words from the list of files and saves them to the outDir.
15+
// No error handling, no context cancellation is implemented to match implementations
16+
// in other languages.
17+
func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
18+
files, err := filepath.Glob(srcDir)
19+
if err != nil {
20+
return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
21+
}
22+
23+
if err = clearOutput(outDir); err != nil {
24+
return err
25+
}
26+
27+
// This is a very basic semaphore implementation. Counting unique words from
28+
// a stream of data is IO, memory and CPU expensive. Semaphore lets to run
29+
// up to the numWorkers or workers concurrently and, by default, this number
30+
// matches the number of CPUs.
31+
sem := make(chan empty, numWorkers)
32+
33+
var wg sync.WaitGroup
34+
var spec *MetaConfig
35+
36+
for _, file := range files {
37+
sem <- empty{}
38+
39+
if spec, err = ReadSpec(file); err != nil {
40+
return err
41+
}
42+
43+
src := file[:len(file)-3] + "txt"
44+
dst := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")
45+
46+
wg.Add(1)
47+
go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
48+
}
49+
50+
wg.Wait()
51+
close(sem)
52+
53+
return nil
54+
}
55+
56+
func clearOutput(outDir string) error {
57+
if err := os.RemoveAll(outDir); err != nil {
58+
return fmt.Errorf(`app: cleaning previous results in "%s": %w`, outDir, err)
59+
}
60+
if err := os.MkdirAll(outDir, dirPerms); err != nil {
61+
return fmt.Errorf(`app: creating output directory "%s": %w`, outDir, err)
62+
}
63+
64+
return nil
65+
}

example-golang/app/extract.go

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
package app
2+
3+
import (
4+
"bufio"
5+
"bytes"
6+
"fmt"
7+
"io"
8+
"os"
9+
"sort"
10+
"sync"
11+
"unicode"
12+
"unicode/utf8"
13+
14+
"github.com/tidwall/collate"
15+
)
16+
17+
// filePerm is the permission mode passed to os.OpenFile when a destination
// file is created.
const filePerm = 0644
18+
19+
// splitWordsUnicode is a bufio.SplitFunc that splits data into words, using the
// Unicode Letter character class. It works similar to the regular expression
// "[^\p{L}]+" used as a separator. This is what was used in the original code.
// The Unicode function has a slight overhead, but handles UTF-8 correctly.
//
// Rust and Python versions split text according to "[\W\d]+" - anything that is
// not a word character, or is a digit.
//
// It returns the number of bytes to advance and the next word, or a nil token
// when more data is needed. The returned error is always nil.
//
// A multi-byte rune can be cut in half by the end of the buffer. Such a tail
// must not be decoded yet: it would come out as utf8.RuneError, which is not a
// letter, and the word would be split (or lose its first letter) at the buffer
// boundary. Unless atEOF is set, the function therefore stops in front of an
// incomplete rune and asks for more data.
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading non-letters.
	start := 0
	for start < len(data) {
		if !atEOF && !utf8.FullRune(data[start:]) {
			// Consume the separators seen so far, keep the partial rune.
			return start, nil, nil
		}

		r, width := utf8.DecodeRune(data[start:])
		if unicode.IsLetter(r) {
			break
		}

		start += width
	}

	// Scan until the first non-letter, which marks the end of the word.
	for i := start; i < len(data); {
		if !atEOF && !utf8.FullRune(data[i:]) {
			// The word may continue in the next chunk; keep it buffered.
			return start, nil, nil
		}

		r, width := utf8.DecodeRune(data[i:])
		if !unicode.IsLetter(r) {
			return i + width, data[start:i], nil
		}

		i += width
	}

	// At EOF whatever is left is the final word.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}

	// Request more data.
	return start, nil, nil
}
47+
48+
// splitWords splits data into words similar to the "[\W\d]+" regular expression.
49+
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
50+
var start int
51+
var r rune
52+
for width := 0; start < len(data); start += width {
53+
if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
54+
break
55+
}
56+
}
57+
58+
for width, i := 0, start; i < len(data); i += width {
59+
if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
60+
return i + width, data[start:i], nil
61+
}
62+
}
63+
64+
if atEOF && len(data) > start {
65+
return len(data), data[start:], nil
66+
}
67+
68+
return start, nil, nil
69+
}
70+
71+
// isLatin reports whether r is an ASCII letter, that is 'a'-'z' or 'A'-'Z'.
// Every other rune, including NUL and anything outside of ASCII, is rejected.
func isLatin(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}
78+
79+
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
80+
defer func() {
81+
<-sem
82+
wg.Done()
83+
}()
84+
85+
fd, err := os.Open(src)
86+
if err != nil {
87+
_, _ = fmt.Fprintf(os.Stderr, `extract: opening source file "%s" for reading: %s`, src, err)
88+
return
89+
}
90+
defer fd.Close()
91+
92+
// One of the possible optimisations here is to split file in chunks and process
93+
// each chunk individually.
94+
words, err := collectWords(fd)
95+
if err != nil {
96+
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
97+
return
98+
}
99+
100+
if sortResults {
101+
less := collate.IndexString(lang)
102+
sort.Slice(words, func(i, j int) bool {
103+
return less(words[i], words[j])
104+
})
105+
}
106+
107+
wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
108+
if err != nil {
109+
_, _ = fmt.Fprintf(os.Stderr, `extract: opening destination file "%s" for writing: %s`, src, err)
110+
return
111+
}
112+
defer fd.Close()
113+
114+
// Writing word by word can result in too many writes, hence, it is slow.
115+
// Let's add some steroids ...
116+
wr := bufio.NewWriter(wd)
117+
118+
if err = writeResults(wr, words); err != nil {
119+
_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
120+
return
121+
}
122+
if err = wr.Flush(); err != nil {
123+
_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
124+
return
125+
}
126+
127+
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
128+
}
129+
130+
func collectWords(r io.Reader) ([]string, error) {
131+
scanner := bufio.NewScanner(r)
132+
scanner.Split(splitWordsUnicode)
133+
134+
dict := make(map[string]empty)
135+
for scanner.Scan() {
136+
word := scanner.Text()
137+
if _, ok := dict[word]; ok {
138+
continue // duplicate detected
139+
}
140+
141+
dict[word] = empty{}
142+
143+
// Theoretically, if sorting is not needed, we can write right here and
144+
// skip words slice preparation below.
145+
}
146+
if err := scanner.Err(); err != nil {
147+
return nil, err
148+
}
149+
150+
// This is expensive ...
151+
words := make([]string, len(dict))
152+
153+
var i int
154+
for w := range dict {
155+
words[i] = w
156+
i++
157+
158+
delete(dict, w)
159+
}
160+
161+
return words, nil
162+
}
163+
164+
// writeResults writes every word followed by a newline to w, one write per
// word, and stops at the first error, which is returned unchanged.
func writeResults(w io.Writer, words []string) error {
	// A single buffer is reused for the "string => []byte + \n" conversion of
	// every word, so its backing memory is not reallocated per iteration.
	var line bytes.Buffer

	for i := 0; i < len(words); i++ {
		line.Reset()
		line.WriteString(words[i])
		line.WriteByte('\n')

		_, err := line.WriteTo(w)
		if err != nil {
			return err
		}
	}

	return nil
}

example-golang/app/spec.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package app
2+
3+
import (
4+
"fmt"
5+
"os"
6+
7+
"gopkg.in/yaml.v3"
8+
)
9+
10+
// MetaConfig holds the fields decoded from a YAML spec file by ReadSpec.
type MetaConfig struct {
	// Lang is read from the "lang" key.
	Lang string `yaml:"lang"`

	// Code is read from the "code" key. Run uses it to build the name of the
	// output file.
	Code string `yaml:"code"`

	// Label is read from the "label" key.
	Label string `yaml:"label"`
}
15+
16+
func ReadSpec(filepath string) (*MetaConfig, error) {
17+
p, err := os.ReadFile(filepath)
18+
if err != nil {
19+
return nil, fmt.Errorf(`spec: reading YAML file "%s": %w`, filepath, err)
20+
}
21+
22+
var config MetaConfig
23+
if err = yaml.Unmarshal(p, &config); err != nil {
24+
return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
25+
}
26+
27+
return &config, nil
28+
}

example-golang/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/hipertracker/word_extractor
22

3-
go 1.18
3+
go 1.17
44

55
require (
66
github.com/bmatcuk/doublestar v1.3.4

0 commit comments

Comments
 (0)