Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions words_extractor_go/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ words
.history
/fast_words
/.DS_Store
sort-me-out
10 changes: 10 additions & 0 deletions words_extractor_go/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Build, run and test helpers for the words extractor.
#
# None of these targets produces a file named after itself, so they are
# declared phony; otherwise a stray file or directory called "build" or
# "test" would make the matching target look up to date and be skipped.
.PHONY: all build run test

# Default target: compile the binary.
all: build

# Compile the Go files in this directory into the sort-me-out executable.
build:
	@go build -o sort-me-out *.go

# Rebuild, then execute the binary.
run: build
	@./sort-me-out

# Run every test verbosely with the race detector and coverage enabled.
test:
	@go test ./... -v -race -cover
20 changes: 11 additions & 9 deletions words_extractor_go/content.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package main

import (
"bufio"
"bytes"
"io/ioutil"
"os"
"regexp"
"strconv"
"strings"
)

// getRows maps metaPath to its sibling text file by replacing every ".yml"
// with ".txt" and returns that file's contents as a list of lines.
//
// NOTE(review): this span looks like a rendered diff with the +/- markers
// stripped, so two implementations are interleaved below (an
// ioutil.ReadFile/strings.Split variant and a bufio.Scanner variant) and the
// text does not compile as shown — confirm which lines belong to the final
// version before relying on it.
func getRows(metaPath string) ListOfStrings {
path := strings.Replace(metaPath, ".yml", ".txt", -1)
// Variant 1: read the whole file at once and panic on any read error.
data, err := ioutil.ReadFile(path)
if err != nil {
panic(err)
}
rows := strings.Split(string(data), "\n")
// Drop the empty element that Split yields after a trailing newline.
if rows[len(rows)-1] == "" {
rows = rows[:len(rows)-1]
// Variant 2: stream the file line by line.
// NOTE(review): the error from os.Open is discarded and scanner.Err() is
// never consulted after the loop, so an unreadable file or an over-long
// line would presumably yield a short or empty result without any
// diagnostic — verify.
data, _ := os.Open(path)
defer data.Close()

scanner := bufio.NewScanner(data)
// ScanLines yields one token per line with the line terminator removed.
scanner.Split(bufio.ScanLines)
var txtlines []string
for scanner.Scan() {
txtlines = append(txtlines, scanner.Text())
}
return rows
return txtlines
}

func (arr ListOfStrings) toString() string {
Expand Down
13 changes: 13 additions & 0 deletions words_extractor_go/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module github.com/hipertracker/words_extractor

go 1.16

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/jfcg/sorty v1.0.15
github.com/kr/pretty v0.1.0 // indirect
github.com/stretchr/testify v1.7.0
github.com/thoas/go-funk v0.8.0
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
)
29 changes: 29 additions & 0 deletions words_extractor_go/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jfcg/opt v0.2.4 h1:EBYw7LO5/9ux4PR+3AQB8DTP37URrN1J2uUo/6ELKlE=
github.com/jfcg/opt v0.2.4/go.mod h1:KVF8GWz/SDWSHnYY80Tghcs4zPcquIvirV40naQhFVU=
github.com/jfcg/sixb v0.8.2 h1:87Ybxpk3J5+Xr4+37EdCaOk+W+7RPiDPgtnX+q+O04E=
github.com/jfcg/sixb v0.8.2/go.mod h1:3afTV2ig2p+URYUtxV4oTfEVv5/FJiIqxmrh9mGNc3A=
github.com/jfcg/sorty v1.0.15 h1:QRQCLV6wxOemaWErr1LUDgKaYVMSgga69MkppnnsXEE=
github.com/jfcg/sorty v1.0.15/go.mod h1:uakexn86cGdrQfPi8NPom9sDdMjs+0KwdLfYominI7Y=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/thoas/go-funk v0.8.0 h1:JP9tKSvnpFVclYgDM0Is7FD9M4fhPvqA0s0BsXmzSRQ=
github.com/thoas/go-funk v0.8.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
51 changes: 23 additions & 28 deletions words_extractor_go/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,28 @@ package main
import (
"fmt"
"io/ioutil"
"sort"
"strings"
"time"

"github.com/tidwall/collate"
"github.com/jfcg/sorty"
"github.com/thoas/go-funk"
)

// resultsArray collects extracted words; its extractWords method appends to
// Results.
type resultsArray struct {
// Results holds the collected words in the order they were appended.
Results []string
}

var (
// res is the package-level accumulator that main fills through
// extractWords, then de-duplicates, sorts and joins for output.
// NOTE(review): nothing in the visible lines resets res.Results between
// input files, so words may carry over from one file to the next — confirm.
res resultsArray
)

// timeTrack prints the time elapsed since start to standard output.
// main defers it right after taking its start timestamp, so the figure
// reported covers the whole run.
func timeTrack(start time.Time) {
	elapsed := time.Since(start)
	fmt.Println("Total timing: ", elapsed)
}

func main() {
t1 := time.Now()
defer timeTrack(t1)
folder := "./words"
prepareFolder(folder, "*.txt")

Expand All @@ -17,34 +33,13 @@ func main() {
filename := "słowa - " + meta.Label + ".txt"
fmt.Println("Parsing...", filename)

// set: extracted unique words normalized to lowercase
set := make(map[string]void)
extractWords(getRows(path).toString(), set)
delete(set, "")
res.extractWords(getRows(path).toString())
res.Results = funk.UniqString(res.Results)
sorty.SortS(res.Results)
data := strings.Join(res.Results, "\n")

// convert map[string]void to []string
var words []string
for word := range set {
words = append(words, word)
}

// sortArray(words, "POLISH_CI")

var data []byte
for _, word := range words {
bytes := []byte(word + "\n")
data = append(data, bytes...)
}

for err := ioutil.WriteFile(folder+"/"+filename, data, 0644); err != nil; {
for err := ioutil.WriteFile(folder+"/"+filename, []byte(data), 0644); err != nil; {
panic(err)
}
}
}

// sortArray sorts arr in place with the comparison function that
// collate.IndexString returns for lang. The sort is stable, so elements the
// comparison treats as equal keep their relative order.
func sortArray(arr []string, lang string) {
	before := collate.IndexString(lang)
	byCollation := func(a, b int) bool {
		return before(arr[a], arr[b])
	}
	sort.SliceStable(arr, byCollation)
}
3 changes: 2 additions & 1 deletion words_extractor_go/meta.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package main

import (
"gopkg.in/yaml.v3"
"io/ioutil"

"gopkg.in/yaml.v3"
)

type metaConfig struct {
Expand Down
18 changes: 13 additions & 5 deletions words_extractor_go/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package main
import (
"os"
"path/filepath"
"regexp"
"strings"
)

Expand All @@ -26,10 +25,19 @@ func getYamlFilepaths(root string) []string {
return result
}

func extractWords(s string, set map[string]void) {
re := regexp.MustCompile("[^\\p{L}]+")
for _, word := range re.Split(s, -1) {
set[strings.ToLower(word)] = member
// removeCharacters returns a copy of input from which every rune that occurs
// in characters has been deleted. All other runes are kept in their original
// order.
func removeCharacters(input string, characters string) string {
	return strings.Map(func(c rune) rune {
		if strings.ContainsRune(characters, c) {
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
		return c
	}, input)
}

// extractWords splits s on whitespace, strips the punctuation characters
// .:,;()!?'-_ from each token, lowercases what remains and appends it to
// r.Results.
//
// Tokens that consist solely of stripped punctuation (for example a
// free-standing "-") are skipped, so no empty word is ever recorded.
func (r *resultsArray) extractWords(s string) {
	const punctuation = ".:,;()!?'-_"
	for _, token := range strings.Fields(s) {
		word := strings.ToLower(removeCharacters(token, punctuation))
		if word == "" {
			// Nothing but punctuation: there is no word to keep.
			continue
		}
		r.Results = append(r.Results, word)
	}
}

Expand Down
93 changes: 93 additions & 0 deletions words_extractor_go/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package main

import (
"testing"

"github.com/stretchr/testify/assert"
)

func Test_removeCharacters(t *testing.T) {
type args struct {
input string
characters string
}
tests := []struct {
name string
args args
want string
}{
{
name: "Characters removal: suffix",
args: args{
input: "Załoenie;!",
characters: ";!",
},
want: "Załoenie",
},
{
name: "Characters removal: prefix",
args: args{
input: ",#Załoenie",
characters: ";!-,#",
},
want: "Załoenie",
},
{
name: "Characters removal: both",
args: args{
input: "-!Załoenie;!",
characters: ";!-",
},
want: "Załoenie",
},
}
assert := assert.New(t)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := removeCharacters(tt.args.input, tt.args.characters)
assert.Equal(tt.want, got, "Unexpected result in test: "+tt.name)
})
}
}

func Test_resultsArray_extractWords(t *testing.T) {
type fields struct {
Results []string
}
type args struct {
s string
}
tests := []struct {
name string
fields fields
args args
wants []string
}{
{
name: "Simple sentence",
args: args{
s: "Within this tutorial, we are going to look at how you can effectively read and write to files within your filesystem using the go programming language.",
},
wants: []string{"within", "this", "tutorial", "we", "are", "going", "to", "look", "at", "how", "you", "can", "effectively", "read", "and", "write", "to", "files", "within", "your", "filesystem", "using", "the", "go", "programming", "language"},
},
{
name: "Multiline sentence",
args: args{
s: `The UK has recorded another five COVID deaths and 2,047 more cases in the latest daily figures.

It compares with seven deaths and 1,907 cases this time last week, while the latest seven-day rolling average is 11.3 and 2,080.`,
},
wants: []string{"the", "uk", "has", "recorded", "another", "five", "covid", "deaths", "and", "2047", "more", "cases", "in", "the", "latest", "daily", "figures", "it", "compares", "with", "seven", "deaths", "and", "1907", "cases", "this", "time", "last", "week", "while", "the", "latest", "sevenday", "rolling", "average", "is", "113", "and", "2080"},
},
}
assert := assert.New(t)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
r := &resultsArray{
Results: tt.fields.Results,
}
r.extractWords(tt.args.s)
assert.Equal(tt.wants, r.Results, "Unexpected result in test: "+tt.name)
})
}
}