diff --git a/words_extractor_go/.gitignore b/words_extractor_go/.gitignore index a5fb87a..96a80b1 100644 --- a/words_extractor_go/.gitignore +++ b/words_extractor_go/.gitignore @@ -3,3 +3,4 @@ words .history /fast_words /.DS_Store +sort-me-out \ No newline at end of file diff --git a/words_extractor_go/Makefile b/words_extractor_go/Makefile new file mode 100644 index 0000000..d992c5a --- /dev/null +++ b/words_extractor_go/Makefile @@ -0,0 +1,10 @@ +all: build + +build: + @go build -o sort-me-out *.go + +run: build + @./sort-me-out + +test: + @go test ./... -v -race -cover \ No newline at end of file diff --git a/words_extractor_go/content.go b/words_extractor_go/content.go index f4bdaeb..dc17d96 100644 --- a/words_extractor_go/content.go +++ b/words_extractor_go/content.go @@ -1,8 +1,9 @@ package main import ( + "bufio" "bytes" - "io/ioutil" + "os" "regexp" "strconv" "strings" @@ -10,15 +11,16 @@ import ( func getRows(metaPath string) ListOfStrings { path := strings.Replace(metaPath, ".yml", ".txt", -1) - data, err := ioutil.ReadFile(path) - if err != nil { - panic(err) - } - rows := strings.Split(string(data), "\n") - if rows[len(rows)-1] == "" { - rows = rows[:len(rows)-1] + data, _ := os.Open(path) + defer data.Close() + + scanner := bufio.NewScanner(data) + scanner.Split(bufio.ScanLines) + var txtlines []string + for scanner.Scan() { + txtlines = append(txtlines, scanner.Text()) } - return rows + return txtlines } func (arr ListOfStrings) toString() string { diff --git a/words_extractor_go/go.mod b/words_extractor_go/go.mod new file mode 100644 index 0000000..9a3e62b --- /dev/null +++ b/words_extractor_go/go.mod @@ -0,0 +1,13 @@ +module github.com/hipertracker/words_extractor + +go 1.16 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/jfcg/sorty v1.0.15 + github.com/kr/pretty v0.1.0 // indirect + github.com/stretchr/testify v1.7.0 + github.com/thoas/go-funk v0.8.0 + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b +) diff --git a/words_extractor_go/go.sum b/words_extractor_go/go.sum new file mode 100644 index 0000000..7cd7d22 --- /dev/null +++ b/words_extractor_go/go.sum @@ -0,0 +1,29 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/jfcg/opt v0.2.4 h1:EBYw7LO5/9ux4PR+3AQB8DTP37URrN1J2uUo/6ELKlE= +github.com/jfcg/opt v0.2.4/go.mod h1:KVF8GWz/SDWSHnYY80Tghcs4zPcquIvirV40naQhFVU= +github.com/jfcg/sixb v0.8.2 h1:87Ybxpk3J5+Xr4+37EdCaOk+W+7RPiDPgtnX+q+O04E= +github.com/jfcg/sixb v0.8.2/go.mod h1:3afTV2ig2p+URYUtxV4oTfEVv5/FJiIqxmrh9mGNc3A= +github.com/jfcg/sorty v1.0.15 h1:QRQCLV6wxOemaWErr1LUDgKaYVMSgga69MkppnnsXEE= +github.com/jfcg/sorty v1.0.15/go.mod h1:uakexn86cGdrQfPi8NPom9sDdMjs+0KwdLfYominI7Y= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/thoas/go-funk v0.8.0 h1:JP9tKSvnpFVclYgDM0Is7FD9M4fhPvqA0s0BsXmzSRQ= +github.com/thoas/go-funk v0.8.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/words_extractor_go/main.go b/words_extractor_go/main.go index 0b7d042..e7fb131 100644 --- a/words_extractor_go/main.go +++ b/words_extractor_go/main.go @@ -3,12 +3,28 @@ package main import ( "fmt" "io/ioutil" - "sort" + "strings" + "time" - "github.com/tidwall/collate" + "github.com/jfcg/sorty" + "github.com/thoas/go-funk" ) +type resultsArray struct { + Results []string +} + +var ( + res resultsArray +) + +func timeTrack(start time.Time) { + fmt.Println("Total timing: ", time.Since(start)) +} + func main() { + t1 := time.Now() + defer timeTrack(t1) folder := "./words" prepareFolder(folder, "*.txt") @@ -17,34 +33,13 @@ func main() { filename := "słowa - " + meta.Label + ".txt" fmt.Println("Parsing...", filename) - // set: extracted unique words normalized to lowercase - set := make(map[string]void) - extractWords(getRows(path).toString(), set) - delete(set, "") + res.extractWords(getRows(path).toString()) + res.Results = funk.UniqString(res.Results) + sorty.SortS(res.Results) + data := strings.Join(res.Results, "\n") - // convert map[string]void to []string - var words []string - for word := range set { - words = append(words, word) - } - - // sortArray(words, "POLISH_CI") - - var data []byte - for _, word := range words { - bytes := []byte(word + "\n") - data = append(data, bytes...) - } - - for err := ioutil.WriteFile(folder+"/"+filename, data, 0644); err != nil; { + for err := ioutil.WriteFile(folder+"/"+filename, []byte(data), 0644); err != nil; { panic(err) } } } - -func sortArray(arr []string, lang string) { - less := collate.IndexString(lang) - sort.SliceStable(arr, func(i, j int) bool { - return less(arr[i], arr[j]) - }) -} diff --git a/words_extractor_go/meta.go b/words_extractor_go/meta.go index 3031438..775ef2f 100644 --- a/words_extractor_go/meta.go +++ b/words_extractor_go/meta.go @@ -1,8 +1,9 @@ package main import ( - "gopkg.in/yaml.v3" "io/ioutil" + + "gopkg.in/yaml.v3" ) type metaConfig struct { diff --git a/words_extractor_go/utils.go b/words_extractor_go/utils.go index 3743aca..f1dd652 100644 --- a/words_extractor_go/utils.go +++ b/words_extractor_go/utils.go @@ -3,7 +3,6 @@ package main import ( "os" "path/filepath" - "regexp" "strings" ) @@ -26,10 +25,19 @@ func getYamlFilepaths(root string) []string { return result } -func extractWords(s string, set map[string]void) { - re := regexp.MustCompile("[^\\p{L}]+") - for _, word := range re.Split(s, -1) { - set[strings.ToLower(word)] = member +func removeCharacters(input string, characters string) string { + filter := func(r rune) rune { + if strings.IndexRune(characters, r) < 0 { + return r + } + return -1 + } + return strings.Map(filter, input) +} + +func (r *resultsArray) extractWords(s string) { + for _, word := range strings.Fields(s) { + r.Results = append(r.Results, strings.ToLower(removeCharacters(word, ".:,;()!?'-_"))) } } diff --git a/words_extractor_go/utils_test.go b/words_extractor_go/utils_test.go new file mode 100644 index 0000000..2e2f6ee --- /dev/null +++ b/words_extractor_go/utils_test.go @@ -0,0 +1,93 @@ +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_removeCharacters(t *testing.T) { + type args struct { + input string + characters string + } + tests := []struct { + name string + args args + want string + }{ + { + name: "Characters removal: suffix", + args: args{ + input: "Załoenie;!", + characters: ";!", + }, + want: "Załoenie", + }, + { + name: "Characters removal: prefix", + args: args{ + input: ",#Załoenie", + characters: ";!-,#", + }, + want: "Załoenie", + }, + { + name: "Characters removal: both", + args: args{ + input: "-!Załoenie;!", + characters: ";!-", + }, + want: "Załoenie", + }, + } + assert := assert.New(t) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := removeCharacters(tt.args.input, tt.args.characters) + assert.Equal(tt.want, got, "Unexpected result in test: "+tt.name) + }) + } +} + +func Test_resultsArray_extractWords(t *testing.T) { + type fields struct { + Results []string + } + type args struct { + s string + } + tests := []struct { + name string + fields fields + args args + wants []string + }{ + { + name: "Simple sentence", + args: args{ + s: "Within this tutorial, we are going to look at how you can effectively read and write to files within your filesystem using the go programming language.", + }, + wants: []string{"within", "this", "tutorial", "we", "are", "going", "to", "look", "at", "how", "you", "can", "effectively", "read", "and", "write", "to", "files", "within", "your", "filesystem", "using", "the", "go", "programming", "language"}, + }, + { + name: "Multiline sentence", + args: args{ + s: `The UK has recorded another five COVID deaths and 2,047 more cases in the latest daily figures. + + It compares with seven deaths and 1,907 cases this time last week, while the latest seven-day rolling average is 11.3 and 2,080.`, + }, + wants: []string{"the", "uk", "has", "recorded", "another", "five", "covid", "deaths", "and", "2047", "more", "cases", "in", "the", "latest", "daily", "figures", "it", "compares", "with", "seven", "deaths", "and", "1907", "cases", "this", "time", "last", "week", "while", "the", "latest", "sevenday", "rolling", "average", "is", "113", "and", "2080"}, + }, + } + assert := assert.New(t) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &resultsArray{ + Results: tt.fields.Results, + } + r.extractWords(tt.args.s) + assert.Equal(tt.wants, r.Results, "Unexpected result in test: "+tt.name) + }) + } +}