Skip to content

Commit efbc06d

Browse files
committed
Golang improvements.
1 parent f5d14c5 commit efbc06d

File tree

8 files changed

+179
-42
lines changed

8 files changed

+179
-42
lines changed

words_extractor_go/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ words
33
.history
44
/fast_words
55
/.DS_Store
6+
sort-me-out

words_extractor_go/Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
all: build
2+
3+
build:
4+
go build -o sort-me-out *.go
5+
6+
run: build
7+
./sort-me-out
8+
9+
test:
10+
go test ./... -v -race -cover

words_extractor_go/content.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
package main
22

33
import (
4+
"bufio"
45
"bytes"
5-
"io/ioutil"
6+
"os"
67
"regexp"
78
"strconv"
89
"strings"
910
)
1011

1112
func getRows(metaPath string) ListOfStrings {
1213
path := strings.Replace(metaPath, ".yml", ".txt", -1)
13-
data, err := ioutil.ReadFile(path)
14-
if err != nil {
15-
panic(err)
16-
}
17-
rows := strings.Split(string(data), "\n")
18-
if rows[len(rows)-1] == "" {
19-
rows = rows[:len(rows)-1]
14+
data, _ := os.Open(path)
15+
defer data.Close()
16+
17+
scanner := bufio.NewScanner(data)
18+
scanner.Split(bufio.ScanLines)
19+
var txtlines []string
20+
for scanner.Scan() {
21+
txtlines = append(txtlines, scanner.Text())
2022
}
21-
return rows
23+
return txtlines
2224
}
2325

2426
func (arr ListOfStrings) toString() string {

words_extractor_go/go.mod

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
module github.com/hipertracker/words_extractor
2+
3+
go 1.16
4+
5+
require (
6+
github.com/jfcg/sorty v1.0.15
7+
github.com/stretchr/testify v1.7.0
8+
github.com/thoas/go-funk v0.8.0
9+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
10+
)

words_extractor_go/go.sum

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
2+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3+
github.com/jfcg/opt v0.2.4 h1:EBYw7LO5/9ux4PR+3AQB8DTP37URrN1J2uUo/6ELKlE=
4+
github.com/jfcg/opt v0.2.4/go.mod h1:KVF8GWz/SDWSHnYY80Tghcs4zPcquIvirV40naQhFVU=
5+
github.com/jfcg/sixb v0.8.2 h1:87Ybxpk3J5+Xr4+37EdCaOk+W+7RPiDPgtnX+q+O04E=
6+
github.com/jfcg/sixb v0.8.2/go.mod h1:3afTV2ig2p+URYUtxV4oTfEVv5/FJiIqxmrh9mGNc3A=
7+
github.com/jfcg/sorty v1.0.15 h1:QRQCLV6wxOemaWErr1LUDgKaYVMSgga69MkppnnsXEE=
8+
github.com/jfcg/sorty v1.0.15/go.mod h1:uakexn86cGdrQfPi8NPom9sDdMjs+0KwdLfYominI7Y=
9+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
10+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
11+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
12+
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
13+
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
14+
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
15+
github.com/thoas/go-funk v0.8.0 h1:JP9tKSvnpFVclYgDM0Is7FD9M4fhPvqA0s0BsXmzSRQ=
16+
github.com/thoas/go-funk v0.8.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
17+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
18+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
19+
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
20+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
21+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
22+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

words_extractor_go/main.go

Lines changed: 19 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,23 @@ package main
33
import (
44
"fmt"
55
"io/ioutil"
6-
"sort"
6+
"strings"
7+
"time"
78

8-
"github.com/tidwall/collate"
9+
"github.com/jfcg/sorty"
10+
"github.com/thoas/go-funk"
11+
)
12+
13+
// resultsArray collects the words produced by its extractWords method.
type resultsArray struct {
14+
// Results holds the extracted words; extractWords appends to this slice.
Results []string
15+
}
16+
17+
// res is the package-level word accumulator: main fills it through
// res.extractWords and then de-duplicates, sorts and joins res.Results.
// NOTE(review): nothing visible in this diff resets res between input files,
// so words from earlier files may carry over into later outputs — confirm
// against the full body of main.
var (
18+
res resultsArray
919
)
1020

1121
func main() {
22+
t1 := time.Now()
1223
folder := "./words"
1324
prepareFolder(folder, "*.txt")
1425

@@ -17,34 +28,14 @@ func main() {
1728
filename := "słowa - " + meta.Label + ".txt"
1829
fmt.Println("Parsing...", filename)
1930

20-
// set: extracted unique words normalized to lowercase
21-
set := make(map[string]void)
22-
extractWords(getRows(path).toString(), set)
23-
delete(set, "")
24-
25-
// convert map[string]void to []string
26-
var words []string
27-
for word := range set {
28-
words = append(words, word)
29-
}
31+
res.extractWords(getRows(path).toString())
32+
res.Results = funk.UniqString(res.Results)
33+
sorty.SortS(res.Results)
34+
data := strings.Join(res.Results, "\n")
3035

31-
// sortArray(words, "POLISH_CI")
32-
33-
var data []byte
34-
for _, word := range words {
35-
bytes := []byte(word + "\n")
36-
data = append(data, bytes...)
37-
}
38-
39-
for err := ioutil.WriteFile(folder+"/"+filename, data, 0644); err != nil; {
36+
for err := ioutil.WriteFile(folder+"/"+filename, []byte(data), 0644); err != nil; {
4037
panic(err)
4138
}
4239
}
43-
}
44-
45-
func sortArray(arr []string, lang string) {
46-
less := collate.IndexString(lang)
47-
sort.SliceStable(arr, func(i, j int) bool {
48-
return less(arr[i], arr[j])
49-
})
40+
fmt.Println("Total timing: ", time.Since(t1))
5041
}

words_extractor_go/utils.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package main
33
import (
44
"os"
55
"path/filepath"
6-
"regexp"
76
"strings"
87
)
98

@@ -26,10 +25,19 @@ func getYamlFilepaths(root string) []string {
2625
return result
2726
}
2827

29-
func extractWords(s string, set map[string]void) {
30-
re := regexp.MustCompile("[^\\p{L}]+")
31-
for _, word := range re.Split(s, -1) {
32-
set[strings.ToLower(word)] = member
28+
// removeCharacters returns input with every rune that occurs in characters
// deleted; all remaining runes keep their original order.
//
// characters is treated as a set of runes, not as a substring: each rune of
// input is dropped if it appears anywhere in characters. An empty characters
// string returns input unchanged.
func removeCharacters(input string, characters string) string {
	return strings.Map(func(r rune) rune {
		if strings.ContainsRune(characters, r) {
			return -1 // a negative result tells strings.Map to drop the rune
		}
		return r
	}, input)
}
37+
38+
func (r *resultsArray) extractWords(s string) {
39+
for _, word := range strings.Fields(s) {
40+
r.Results = append(r.Results, strings.ToLower(removeCharacters(word, ".:,;()!?'-_")))
3341
}
3442
}
3543

words_extractor_go/utils_test.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package main
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func Test_removeCharacters(t *testing.T) {
10+
type args struct {
11+
input string
12+
characters string
13+
}
14+
tests := []struct {
15+
name string
16+
args args
17+
want string
18+
}{
19+
{
20+
name: "Characters removal: suffix",
21+
args: args{
22+
input: "Załoenie;!",
23+
characters: ";!",
24+
},
25+
want: "Załoenie",
26+
},
27+
{
28+
name: "Characters removal: prefix",
29+
args: args{
30+
input: ",#Załoenie",
31+
characters: ";!-,#",
32+
},
33+
want: "Załoenie",
34+
},
35+
{
36+
name: "Characters removal: both",
37+
args: args{
38+
input: "-!Załoenie;!",
39+
characters: ";!-",
40+
},
41+
want: "Załoenie",
42+
},
43+
}
44+
assert := assert.New(t)
45+
for _, tt := range tests {
46+
t.Run(tt.name, func(t *testing.T) {
47+
got := removeCharacters(tt.args.input, tt.args.characters)
48+
assert.Equal(tt.want, got, "Unexpected result in test: "+tt.name)
49+
})
50+
}
51+
}
52+
53+
func Test_resultsArray_extractWords(t *testing.T) {
54+
type fields struct {
55+
Results []string
56+
}
57+
type args struct {
58+
s string
59+
}
60+
tests := []struct {
61+
name string
62+
fields fields
63+
args args
64+
wants []string
65+
}{
66+
{
67+
name: "Simple sentence",
68+
args: args{
69+
s: "Within this tutorial, we are going to look at how you can effectively read and write to files within your filesystem using the go programming language.",
70+
},
71+
wants: []string{"within", "this", "tutorial", "we", "are", "going", "to", "look", "at", "how", "you", "can", "effectively", "read", "and", "write", "to", "files", "within", "your", "filesystem", "using", "the", "go", "programming", "language"},
72+
},
73+
{
74+
name: "Multiline sentence",
75+
args: args{
76+
s: `The UK has recorded another five COVID deaths and 2,047 more cases in the latest daily figures.
77+
78+
It compares with seven deaths and 1,907 cases this time last week, while the latest seven-day rolling average is 11.3 and 2,080.`,
79+
},
80+
wants: []string{"the", "uk", "has", "recorded", "another", "five", "covid", "deaths", "and", "2047", "more", "cases", "in", "the", "latest", "daily", "figures", "it", "compares", "with", "seven", "deaths", "and", "1907", "cases", "this", "time", "last", "week", "while", "the", "latest", "sevenday", "rolling", "average", "is", "113", "and", "2080"},
81+
},
82+
}
83+
assert := assert.New(t)
84+
for _, tt := range tests {
85+
t.Run(tt.name, func(t *testing.T) {
86+
r := &resultsArray{
87+
Results: tt.fields.Results,
88+
}
89+
r.extractWords(tt.args.s)
90+
assert.Equal(tt.wants, r.Results, "Unexpected result in test: "+tt.name)
91+
})
92+
}
93+
}

0 commit comments

Comments
 (0)