Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Golang improvements.
  • Loading branch information
lukaszraczylo committed May 8, 2021
commit efbc06daddef97413ba77b3b4f076a320e51c8c2
1 change: 1 addition & 0 deletions words_extractor_go/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ words
.history
/fast_words
/.DS_Store
sort-me-out
10 changes: 10 additions & 0 deletions words_extractor_go/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Default target: just builds the binary.
all: build

# Compile every Go file in this directory into the sort-me-out executable.
build:
go build -o sort-me-out *.go

# Rebuild first (via the build prerequisite), then execute the binary.
run: build
./sort-me-out

# Run all package tests verbosely, with the race detector and coverage enabled.
test:
go test ./... -v -race -cover
20 changes: 11 additions & 9 deletions words_extractor_go/content.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package main

import (
"bufio"
"bytes"
"io/ioutil"
"os"
"regexp"
"strconv"
"strings"
)

func getRows(metaPath string) ListOfStrings {
path := strings.Replace(metaPath, ".yml", ".txt", -1)
data, err := ioutil.ReadFile(path)
if err != nil {
panic(err)
}
rows := strings.Split(string(data), "\n")
if rows[len(rows)-1] == "" {
rows = rows[:len(rows)-1]
data, _ := os.Open(path)
defer data.Close()

scanner := bufio.NewScanner(data)
scanner.Split(bufio.ScanLines)
var txtlines []string
for scanner.Scan() {
txtlines = append(txtlines, scanner.Text())
}
return rows
return txtlines
}

func (arr ListOfStrings) toString() string {
Expand Down
10 changes: 10 additions & 0 deletions words_extractor_go/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module github.com/hipertracker/words_extractor

go 1.16

require (
github.com/jfcg/sorty v1.0.15
github.com/stretchr/testify v1.7.0
github.com/thoas/go-funk v0.8.0
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
)
22 changes: 22 additions & 0 deletions words_extractor_go/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jfcg/opt v0.2.4 h1:EBYw7LO5/9ux4PR+3AQB8DTP37URrN1J2uUo/6ELKlE=
github.com/jfcg/opt v0.2.4/go.mod h1:KVF8GWz/SDWSHnYY80Tghcs4zPcquIvirV40naQhFVU=
github.com/jfcg/sixb v0.8.2 h1:87Ybxpk3J5+Xr4+37EdCaOk+W+7RPiDPgtnX+q+O04E=
github.com/jfcg/sixb v0.8.2/go.mod h1:3afTV2ig2p+URYUtxV4oTfEVv5/FJiIqxmrh9mGNc3A=
github.com/jfcg/sorty v1.0.15 h1:QRQCLV6wxOemaWErr1LUDgKaYVMSgga69MkppnnsXEE=
github.com/jfcg/sorty v1.0.15/go.mod h1:uakexn86cGdrQfPi8NPom9sDdMjs+0KwdLfYominI7Y=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/thoas/go-funk v0.8.0 h1:JP9tKSvnpFVclYgDM0Is7FD9M4fhPvqA0s0BsXmzSRQ=
github.com/thoas/go-funk v0.8.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
47 changes: 19 additions & 28 deletions words_extractor_go/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,23 @@ package main
import (
"fmt"
"io/ioutil"
"sort"
"strings"
"time"

"github.com/tidwall/collate"
"github.com/jfcg/sorty"
"github.com/thoas/go-funk"
)

// resultsArray collects the words produced by extractWords.
type resultsArray struct {
// Results holds the extracted words in the order they were appended.
Results []string
}

var (
// res is the package-level accumulator that main fills, de-duplicates and sorts.
res resultsArray
)

func main() {
t1 := time.Now()
folder := "./words"
prepareFolder(folder, "*.txt")

Expand All @@ -17,34 +28,14 @@ func main() {
filename := "słowa - " + meta.Label + ".txt"
fmt.Println("Parsing...", filename)

// set: extracted unique words normalized to lowercase
set := make(map[string]void)
extractWords(getRows(path).toString(), set)
delete(set, "")

// convert map[string]void to []string
var words []string
for word := range set {
words = append(words, word)
}
res.extractWords(getRows(path).toString())
res.Results = funk.UniqString(res.Results)
sorty.SortS(res.Results)
data := strings.Join(res.Results, "\n")

// sortArray(words, "POLISH_CI")

var data []byte
for _, word := range words {
bytes := []byte(word + "\n")
data = append(data, bytes...)
}

for err := ioutil.WriteFile(folder+"/"+filename, data, 0644); err != nil; {
for err := ioutil.WriteFile(folder+"/"+filename, []byte(data), 0644); err != nil; {
panic(err)
}
}
}

func sortArray(arr []string, lang string) {
less := collate.IndexString(lang)
sort.SliceStable(arr, func(i, j int) bool {
return less(arr[i], arr[j])
})
fmt.Println("Total timing: ", time.Since(t1))
}
18 changes: 13 additions & 5 deletions words_extractor_go/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package main
import (
"os"
"path/filepath"
"regexp"
"strings"
)

Expand All @@ -26,10 +25,19 @@ func getYamlFilepaths(root string) []string {
return result
}

func extractWords(s string, set map[string]void) {
re := regexp.MustCompile("[^\\p{L}]+")
for _, word := range re.Split(s, -1) {
set[strings.ToLower(word)] = member
// removeCharacters returns a copy of input with every rune that occurs in
// characters dropped. All remaining runes keep their original order.
func removeCharacters(input string, characters string) string {
	return strings.Map(func(c rune) rune {
		if strings.ContainsRune(characters, c) {
			// A negative result tells strings.Map to omit this rune.
			return -1
		}
		return c
	}, input)
}

// extractWords splits s on whitespace, strips the punctuation characters
// ".:,;()!?'-_" from each token, lowercases what is left and appends it to
// r.Results.
//
// Tokens that consist solely of punctuation (for example "-" or "...") reduce
// to an empty string and are skipped, so no empty word is ever recorded.
func (r *resultsArray) extractWords(s string) {
	for _, token := range strings.Fields(s) {
		word := strings.ToLower(removeCharacters(token, ".:,;()!?'-_"))
		if word == "" {
			continue
		}
		r.Results = append(r.Results, word)
	}
}

Expand Down
93 changes: 93 additions & 0 deletions words_extractor_go/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package main

import (
"testing"

"github.com/stretchr/testify/assert"
)

func Test_removeCharacters(t *testing.T) {
type args struct {
input string
characters string
}
tests := []struct {
name string
args args
want string
}{
{
name: "Characters removal: suffix",
args: args{
input: "Załoenie;!",
characters: ";!",
},
want: "Załoenie",
},
{
name: "Characters removal: prefix",
args: args{
input: ",#Załoenie",
characters: ";!-,#",
},
want: "Załoenie",
},
{
name: "Characters removal: both",
args: args{
input: "-!Załoenie;!",
characters: ";!-",
},
want: "Załoenie",
},
}
assert := assert.New(t)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := removeCharacters(tt.args.input, tt.args.characters)
assert.Equal(tt.want, got, "Unexpected result in test: "+tt.name)
})
}
}

func Test_resultsArray_extractWords(t *testing.T) {
type fields struct {
Results []string
}
type args struct {
s string
}
tests := []struct {
name string
fields fields
args args
wants []string
}{
{
name: "Simple sentence",
args: args{
s: "Within this tutorial, we are going to look at how you can effectively read and write to files within your filesystem using the go programming language.",
},
wants: []string{"within", "this", "tutorial", "we", "are", "going", "to", "look", "at", "how", "you", "can", "effectively", "read", "and", "write", "to", "files", "within", "your", "filesystem", "using", "the", "go", "programming", "language"},
},
{
name: "Multiline sentence",
args: args{
s: `The UK has recorded another five COVID deaths and 2,047 more cases in the latest daily figures.

It compares with seven deaths and 1,907 cases this time last week, while the latest seven-day rolling average is 11.3 and 2,080.`,
},
wants: []string{"the", "uk", "has", "recorded", "another", "five", "covid", "deaths", "and", "2047", "more", "cases", "in", "the", "latest", "daily", "figures", "it", "compares", "with", "seven", "deaths", "and", "1907", "cases", "this", "time", "last", "week", "while", "the", "latest", "sevenday", "rolling", "average", "is", "113", "and", "2080"},
},
}
assert := assert.New(t)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
r := &resultsArray{
Results: tt.fields.Results,
}
r.extractWords(tt.args.s)
assert.Equal(tt.wants, r.Results, "Unexpected result in test: "+tt.name)
})
}
}