diff --git a/words_extractor_go/.gitignore b/words_extractor_go/.gitignore index 96a80b1..a5fb87a 100644 --- a/words_extractor_go/.gitignore +++ b/words_extractor_go/.gitignore @@ -3,4 +3,3 @@ words .history /fast_words /.DS_Store -sort-me-out \ No newline at end of file diff --git a/words_extractor_go/Makefile b/words_extractor_go/Makefile deleted file mode 100644 index d992c5a..0000000 --- a/words_extractor_go/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -all: build - -build: - @go build -o sort-me-out *.go - -run: build - @./sort-me-out - -test: - @go test ./... -v -race -cover \ No newline at end of file diff --git a/words_extractor_go/content.go b/words_extractor_go/content.go index dc17d96..f4bdaeb 100644 --- a/words_extractor_go/content.go +++ b/words_extractor_go/content.go @@ -1,9 +1,8 @@ package main import ( - "bufio" "bytes" - "os" + "io/ioutil" "regexp" "strconv" "strings" @@ -11,16 +10,15 @@ import ( func getRows(metaPath string) ListOfStrings { path := strings.Replace(metaPath, ".yml", ".txt", -1) - data, _ := os.Open(path) - defer data.Close() - - scanner := bufio.NewScanner(data) - scanner.Split(bufio.ScanLines) - var txtlines []string - for scanner.Scan() { - txtlines = append(txtlines, scanner.Text()) + data, err := ioutil.ReadFile(path) + if err != nil { + panic(err) + } + rows := strings.Split(string(data), "\n") + if rows[len(rows)-1] == "" { + rows = rows[:len(rows)-1] } - return txtlines + return rows } func (arr ListOfStrings) toString() string { diff --git a/words_extractor_go/go.mod b/words_extractor_go/go.mod deleted file mode 100644 index 9a3e62b..0000000 --- a/words_extractor_go/go.mod +++ /dev/null @@ -1,13 +0,0 @@ -module github.com/hipertracker/words_extractor - -go 1.16 - -require ( - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/jfcg/sorty v1.0.15 - github.com/kr/pretty v0.1.0 // indirect - github.com/stretchr/testify v1.7.0 - github.com/thoas/go-funk v0.8.0 - gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect - gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b -) diff --git a/words_extractor_go/go.sum b/words_extractor_go/go.sum deleted file mode 100644 index 7cd7d22..0000000 --- a/words_extractor_go/go.sum +++ /dev/null @@ -1,29 +0,0 @@ -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/jfcg/opt v0.2.4 h1:EBYw7LO5/9ux4PR+3AQB8DTP37URrN1J2uUo/6ELKlE= -github.com/jfcg/opt v0.2.4/go.mod h1:KVF8GWz/SDWSHnYY80Tghcs4zPcquIvirV40naQhFVU= -github.com/jfcg/sixb v0.8.2 h1:87Ybxpk3J5+Xr4+37EdCaOk+W+7RPiDPgtnX+q+O04E= -github.com/jfcg/sixb v0.8.2/go.mod h1:3afTV2ig2p+URYUtxV4oTfEVv5/FJiIqxmrh9mGNc3A= -github.com/jfcg/sorty v1.0.15 h1:QRQCLV6wxOemaWErr1LUDgKaYVMSgga69MkppnnsXEE= -github.com/jfcg/sorty v1.0.15/go.mod h1:uakexn86cGdrQfPi8NPom9sDdMjs+0KwdLfYominI7Y= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/thoas/go-funk v0.8.0 h1:JP9tKSvnpFVclYgDM0Is7FD9M4fhPvqA0s0BsXmzSRQ= -github.com/thoas/go-funk v0.8.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/words_extractor_go/main.go b/words_extractor_go/main.go index e7fb131..0b7d042 100644 --- a/words_extractor_go/main.go +++ b/words_extractor_go/main.go @@ -3,28 +3,12 @@ package main import ( "fmt" "io/ioutil" - "strings" - "time" + "sort" - "github.com/jfcg/sorty" - "github.com/thoas/go-funk" + "github.com/tidwall/collate" ) -type resultsArray struct { - Results []string -} - -var ( - res resultsArray -) - -func timeTrack(start time.Time) { - fmt.Println("Total timing: ", time.Since(start)) -} - func main() { - t1 := time.Now() - defer timeTrack(t1) folder := "./words" prepareFolder(folder, "*.txt") @@ -33,13 +17,34 @@ func main() { filename := "słowa - " + meta.Label + ".txt" fmt.Println("Parsing...", filename) - res.extractWords(getRows(path).toString()) - res.Results = funk.UniqString(res.Results) - sorty.SortS(res.Results) - data := strings.Join(res.Results, "\n") + // set: extracted unique words normalized to lowercase + set := make(map[string]void) + extractWords(getRows(path).toString(), set) + delete(set, "") - for err := ioutil.WriteFile(folder+"/"+filename, []byte(data), 0644); err != nil; { + // convert map[string]void to []string + var words []string + for word := range set { + words = append(words, word) + } + + // sortArray(words, "POLISH_CI") + + var data []byte + for _, word := range words { + bytes := []byte(word + "\n") + data = append(data, bytes...) + } + + for err := ioutil.WriteFile(folder+"/"+filename, data, 0644); err != nil; { panic(err) } } } + +func sortArray(arr []string, lang string) { + less := collate.IndexString(lang) + sort.SliceStable(arr, func(i, j int) bool { + return less(arr[i], arr[j]) + }) +} diff --git a/words_extractor_go/meta.go b/words_extractor_go/meta.go index 775ef2f..3031438 100644 --- a/words_extractor_go/meta.go +++ b/words_extractor_go/meta.go @@ -1,9 +1,8 @@ package main import ( - "io/ioutil" - "gopkg.in/yaml.v3" + "io/ioutil" ) type metaConfig struct { diff --git a/words_extractor_go/utils.go b/words_extractor_go/utils.go index f1dd652..3743aca 100644 --- a/words_extractor_go/utils.go +++ b/words_extractor_go/utils.go @@ -3,6 +3,7 @@ package main import ( "os" "path/filepath" + "regexp" "strings" ) @@ -25,19 +26,10 @@ func getYamlFilepaths(root string) []string { return result } -func removeCharacters(input string, characters string) string { - filter := func(r rune) rune { - if strings.IndexRune(characters, r) < 0 { - return r - } - return -1 - } - return strings.Map(filter, input) -} - -func (r *resultsArray) extractWords(s string) { - for _, word := range strings.Fields(s) { - r.Results = append(r.Results, strings.ToLower(removeCharacters(word, ".:,;()!?'-_"))) +func extractWords(s string, set map[string]void) { + re := regexp.MustCompile("[^\\p{L}]+") + for _, word := range re.Split(s, -1) { + set[strings.ToLower(word)] = member } } diff --git a/words_extractor_go/utils_test.go b/words_extractor_go/utils_test.go deleted file mode 100644 index 2e2f6ee..0000000 --- a/words_extractor_go/utils_test.go +++ /dev/null @@ -1,93 +0,0 @@ -package main - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func Test_removeCharacters(t *testing.T) { - type args struct { - input string - characters string - } - tests := []struct { - name string - args args - want string - }{ - { - name: "Characters removal: suffix", - args: args{ - input: "Załoenie;!", - characters: ";!", - }, - want: "Załoenie", - }, - { - name: "Characters removal: prefix", - args: args{ - input: ",#Załoenie", - characters: ";!-,#", - }, - want: "Załoenie", - }, - { - name: "Characters removal: both", - args: args{ - input: "-!Załoenie;!", - characters: ";!-", - }, - want: "Załoenie", - }, - } - assert := assert.New(t) - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := removeCharacters(tt.args.input, tt.args.characters) - assert.Equal(tt.want, got, "Unexpected result in test: "+tt.name) - }) - } -} - -func Test_resultsArray_extractWords(t *testing.T) { - type fields struct { - Results []string - } - type args struct { - s string - } - tests := []struct { - name string - fields fields - args args - wants []string - }{ - { - name: "Simple sentence", - args: args{ - s: "Within this tutorial, we are going to look at how you can effectively read and write to files within your filesystem using the go programming language.", - }, - wants: []string{"within", "this", "tutorial", "we", "are", "going", "to", "look", "at", "how", "you", "can", "effectively", "read", "and", "write", "to", "files", "within", "your", "filesystem", "using", "the", "go", "programming", "language"}, - }, - { - name: "Multiline sentence", - args: args{ - s: `The UK has recorded another five COVID deaths and 2,047 more cases in the latest daily figures. - - It compares with seven deaths and 1,907 cases this time last week, while the latest seven-day rolling average is 11.3 and 2,080.`, - }, - wants: []string{"the", "uk", "has", "recorded", "another", "five", "covid", "deaths", "and", "2047", "more", "cases", "in", "the", "latest", "daily", "figures", "it", "compares", "with", "seven", "deaths", "and", "1907", "cases", "this", "time", "last", "week", "while", "the", "latest", "sevenday", "rolling", "average", "is", "113", "and", "2080"}, - }, - } - assert := assert.New(t) - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - r := &resultsArray{ - Results: tt.fields.Results, - } - r.extractWords(tt.args.s) - assert.Equal(tt.wants, r.Results, "Unexpected result in test: "+tt.name) - }) - } -}