Skip to content

Commit eafd1b1

Browse files
author
Jaroslaw Zabiello
committed
add golang example
1 parent d350d1f commit eafd1b1

File tree

8 files changed

+308
-0
lines changed

8 files changed

+308
-0
lines changed

example-golang/Makefile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Name of the executable produced by `make build`.
BINARY_NAME=main

# None of these targets produce a file with the same name as the target, so
# declare them phony; otherwise a stray file or directory called e.g. `test`
# or `build` would make the corresponding rule a silent no-op.
.PHONY: all build run test cover clean

all: clean test build

# Build a stripped binary (-s -w drop the symbol table and DWARF data).
build:
	@go build -o ${BINARY_NAME} -ldflags "-s -w" *.go

run: build
	./${BINARY_NAME}

# Run all tests verbosely and record a coverage profile.
test:
	@go test ./... -v -coverprofile=coverage.out

# Open the HTML coverage report for the profile written by `test`.
cover: test
	@go tool cover -html=coverage.out

# Remove build artifacts, the coverage profile and the generated word lists.
clean:
	@go clean
	rm -f coverage.out
	rm -f ./${BINARY_NAME}
	rm -rf ./words

example-golang/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
2+
```
3+
make build
4+
GOGC=2000 ./main
5+
```
6+
7+
MacOS 12.2
8+
Go 1.18beta1
9+
MBP 16" M1Max 10 cores
10+
Total files: 123
11+
Total size: 504 MB
12+
Total time: 7.2166 s
13+
14+
```
15+
go run .
16+
```
17+
18+
Total time: 8.8602 s

example-golang/go.mod

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
module github.com/hipertracker/word_extractor
2+
3+
go 1.18
4+
5+
require (
6+
github.com/bmatcuk/doublestar v1.3.4
7+
github.com/stretchr/testify v1.7.0
8+
github.com/thoas/go-funk v0.9.1
9+
github.com/tidwall/collate v1.0.0
10+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
11+
)
12+
13+
require (
14+
github.com/davecgh/go-spew v1.1.0 // indirect
15+
github.com/pmezard/go-difflib v1.0.0 // indirect
16+
github.com/tidwall/gjson v1.3.4 // indirect
17+
github.com/tidwall/match v1.0.1 // indirect
18+
github.com/tidwall/pretty v1.0.0 // indirect
19+
golang.org/x/text v0.3.2 // indirect
20+
)

example-golang/go.sum

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
2+
github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
3+
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
4+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
8+
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
9+
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
10+
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
11+
github.com/thoas/go-funk v0.9.1 h1:O549iLZqPpTUQ10ykd26sZhzD+rmR5pWhuElrhbC20M=
12+
github.com/thoas/go-funk v0.9.1/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
13+
github.com/tidwall/collate v1.0.0 h1:xgvwO2UunUoXx3NS3UqHBX63l248ZApqo7mUe3NHy6I=
14+
github.com/tidwall/collate v1.0.0/go.mod h1:S56qxEr2ALVCaGY41npreOJ5lBIILSrxYLgEpxoHVIk=
15+
github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw=
16+
github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
17+
github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
18+
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
19+
github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
20+
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
21+
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
22+
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
23+
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
24+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
25+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
26+
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
27+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
28+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
29+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

example-golang/main

3.6 MB
Binary file not shown.

example-golang/main.go

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"io/ioutil"
6+
"os"
7+
"regexp"
8+
"sort"
9+
"strings"
10+
"sync"
11+
"time"
12+
13+
"github.com/bmatcuk/doublestar"
14+
"github.com/thoas/go-funk"
15+
"github.com/tidwall/collate"
16+
)
17+
18+
// Pair couples the text file to read with the file its extracted words are
// written to. It is the message passed from loadYaml to main over a channel.
type Pair struct {
	// SrcPath is the path of the input text file.
	SrcPath string
	// Dstpath is the path of the output word-list file.
	Dstpath string
}
22+
23+
var (
	// srcPath is the glob pattern selecting the YAML metadata files to process.
	srcPath = "../data/??/**/*.yml"

	// outdir is the directory the extracted word lists are written to.
	outdir = "words"

	// wg is a package-level WaitGroup.
	// NOTE(review): the functions visible in this file declare their own
	// local WaitGroup, so this one appears to be unused - confirm before removing.
	wg sync.WaitGroup
)
27+
28+
// func mainOld() {
29+
// paths, _ := doublestar.Glob(srcPath)
30+
31+
// clearResults()
32+
// runWithChannels(paths)
33+
34+
// clearResults()
35+
// runWithWaitGroups(paths)
36+
// }
37+
38+
func main() {
39+
var wg sync.WaitGroup
40+
wg.Add(1)
41+
42+
go func() {
43+
defer wg.Done()
44+
45+
t := time.Now()
46+
defer timeTrack(t)
47+
48+
paths, _ := doublestar.Glob(srcPath)
49+
50+
ch1 := make(chan Pair, len(paths))
51+
ch2 := make(chan string, len(paths))
52+
53+
clearResults()
54+
55+
for _, yamlPath := range paths {
56+
go loadYaml(ch1, yamlPath)
57+
}
58+
59+
for range paths {
60+
pair := <-ch1
61+
go loadText(ch2, pair.SrcPath, pair.Dstpath, true)
62+
}
63+
for range paths {
64+
fmt.Printf("Saved %s\n", <-ch2)
65+
}
66+
}()
67+
wg.Wait()
68+
}
69+
70+
func loadYaml(ch chan Pair, path string) {
71+
meta := GetYAML(path)
72+
srcPath := strings.Replace(path, ".yml", ".txt", -1)
73+
dstPath := fmt.Sprintf("%s/extracted-words-for-%s.txt", outdir, meta.Code)
74+
ch <- Pair{srcPath, dstPath}
75+
}
76+
77+
func loadText(ch2 chan string, srcPath string, dstPath string, sorting bool) {
78+
content, err := ioutil.ReadFile(srcPath)
79+
if err != nil {
80+
panic(err)
81+
}
82+
words := extractUniqueWords(content)
83+
if sorting {
84+
words = sortWords(words, "POLISH_CI")
85+
}
86+
text := strings.Join(words, "\n")
87+
for err := ioutil.WriteFile(dstPath, []byte(text), 0644); err != nil; {
88+
panic(err)
89+
}
90+
ch2 <- dstPath
91+
}
92+
93+
func clearResults() {
94+
os.RemoveAll(outdir)
95+
os.Mkdir(outdir, 0777)
96+
}
97+
98+
func runWithChannels(paths []string) {
99+
var ch = make(chan string)
100+
t := time.Now()
101+
defer timeTrack(t)
102+
for _, path := range paths {
103+
go func(yamlPath string) {
104+
ch <- parseFile(yamlPath, false)
105+
}(path)
106+
}
107+
for range paths {
108+
<-ch
109+
}
110+
}
111+
112+
func runWithWaitGroups(paths []string) {
113+
var wg sync.WaitGroup
114+
t := time.Now()
115+
defer timeTrack(t)
116+
for _, path := range paths {
117+
wg.Add(1)
118+
go func(yamlPath string) {
119+
parseFile(yamlPath, false)
120+
wg.Done()
121+
}(path)
122+
}
123+
wg.Wait()
124+
}
125+
126+
func parseFile(path string, sorting bool) string {
127+
// load YAML file
128+
meta := GetYAML(path)
129+
outfilepath := fmt.Sprintf("%s/extracted-words-for-%s.txt", outdir, meta.Code)
130+
131+
// load text file
132+
filepath := strings.Replace(path, ".yml", ".txt", -1)
133+
content, err := ioutil.ReadFile(filepath)
134+
if err != nil {
135+
panic(err)
136+
}
137+
138+
words := extractUniqueWords(content)
139+
140+
// sort unique words
141+
if sorting {
142+
words = sortWords(words, "POLISH_CI")
143+
}
144+
145+
text := strings.Join(words, "\n")
146+
for err := ioutil.WriteFile(outfilepath, []byte(text), 0644); err != nil; {
147+
panic(err)
148+
}
149+
return outfilepath
150+
}
151+
152+
// timeTrack prints the time elapsed since start to standard output. It is
// meant to be deferred right after capturing the start time.
func timeTrack(start time.Time) {
	elapsed := time.Since(start)
	fmt.Println("Total timing: ", elapsed)
}
155+
156+
func extractUniqueWords(content []byte) []string {
157+
text := strings.ToLower(string(content))
158+
re := regexp.MustCompile(`[^\p{L}]+`)
159+
tokens := re.Split(text, -1)
160+
return funk.UniqString(tokens)
161+
}
162+
163+
func sortWords(words []string, lang string) []string {
164+
less := collate.IndexString(lang)
165+
sort.SliceStable(words, func(i, j int) bool {
166+
return less(words[i], words[j])
167+
})
168+
return words
169+
}

example-golang/main_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package main
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func Test_extractUniqueWords(t *testing.T) {
10+
text := "ćma cześć ser. śmiech!żółw zebra-łuk len Ćma Żółw ser"
11+
expected := []string{"ćma", "cześć", "ser", "śmiech", "żółw", "zebra", "łuk", "len"}
12+
given := extractUniqueWords([]byte(text))
13+
assert.Equal(t, expected, given, "text should be tokenized into unique words")
14+
}
15+
16+
func Test_sortWords(t *testing.T) {
17+
words := []string{"ćma", "cześć", "ser", "śmiech", "żółw", "zebra", "łuk", "len"}
18+
expected := []string{"cześć", "ćma", "len", "łuk", "ser", "śmiech", "zebra", "żółw"}
19+
given := sortWords(words, "POLISH_CI")
20+
assert.Equal(t, expected, given, "words should be sorted out using Polish grammar rules")
21+
}

example-golang/yaml.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package main
2+
3+
import (
4+
"io/ioutil"
5+
6+
"gopkg.in/yaml.v3"
7+
)
8+
9+
// metaConfig holds the fields decoded from one YAML metadata file.
type metaConfig struct {
	// Lang is the language entry of the metadata file.
	Lang string
	// Code is the identifier callers embed in the output file name.
	Code string
	// Label is the label entry of the metadata file.
	Label string
}
14+
15+
func (m *metaConfig) Parse(data []byte) error {
16+
return yaml.Unmarshal(data, m)
17+
}
18+
19+
func GetYAML(filepath string) metaConfig {
20+
data, err := ioutil.ReadFile(filepath)
21+
if err != nil {
22+
panic(err)
23+
}
24+
config := metaConfig{}
25+
err = config.Parse(data)
26+
if err != nil {
27+
panic(err)
28+
}
29+
return config
30+
}

0 commit comments

Comments
 (0)