Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Code clean up
  • Loading branch information
romanatnews committed Feb 7, 2022
commit ca5bb387febb608f2746f44f0d94a1351e1e4980
2 changes: 1 addition & 1 deletion example-golang/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
dst := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")

wg.Add(1)
go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
go extract(src, dst, sortResults, spec.Tag, sem, &wg)
}

wg.Wait()
Expand Down
85 changes: 15 additions & 70 deletions example-golang/app/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,25 @@ import (
"fmt"
"io"
"os"
"sort"
"sync"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/tidwall/collate"
"github.com/cespare/xxhash/v2"
"golang.org/x/text/collate"
"golang.org/x/text/language"
)

// Defaults used while extracting words and writing the result files.
const (
// filePerm is the permission mode passed to os.OpenFile when an output file
// is created.
filePerm = 0644
// initialDictSize is the size hint handed to collectWords, which uses it to
// preallocate its dictionary map and its word slice.
initialDictSize = 1e4
)
const filePerm = 0644

// splitWordsUnicode splits data into words, using Unicode Letter character class.
// splitWordsFunc splits data into words, using Unicode Letter character class.
// It works similarly to splitting on the regular expression "[^\p{L}]+", which
// is what the original code used. The Unicode-aware check has a slight
// overhead, but handles UTF-8 correctly.
//
// The Rust and Python versions split text on "[\W\d]+" instead, i.e. on runs of
// non-word characters and digits, so their results can differ from this one.
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
func splitWordsFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
var start int
var r rune
for width := 0; start < len(data); start += width {
Expand All @@ -49,52 +46,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
return start, nil, nil
}

// splitWords is a bufio.SplitFunc that yields maximal runs of runes accepted by
// isLatin. It approximates splitting on the "[\W\d]+" regular expression.
//
// Leading separators are skipped. A word is returned as soon as the rune that
// terminates it has been seen, and that terminator is consumed together with
// the word. A trailing word with no terminator is returned only when atEOF is
// true; otherwise only the skipped prefix is consumed and more data is
// requested by returning a nil token.
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Find the offset of the first rune that belongs to a word.
	begin := 0
	for begin < len(data) {
		ch, size := utf8.DecodeRune(data[begin:])
		if isLatin(ch) {
			break
		}
		begin += size
	}

	// Walk forward until a rune that ends the word shows up.
	for end := begin; end < len(data); {
		ch, size := utf8.DecodeRune(data[end:])
		if !isLatin(ch) {
			return end + size, data[begin:end], nil
		}
		end += size
	}

	// No terminator in this buffer: emit the tail only if no more input follows.
	if atEOF && begin < len(data) {
		return len(data), data[begin:], nil
	}

	return begin, nil, nil
}

// isLatin reports whether r is an ASCII letter, i.e. in 'a'..'z' or 'A'..'Z'.
// Every other rune, including NUL, digits, punctuation and anything at or
// above 0x80, yields false.
func isLatin(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}

// memhash is a body-less declaration that the linkname directive below binds
// to the runtime's unexported runtime.memhash. In this file it is called by
// memHashString with a data pointer as p, 0 as h and a byte length as s.
// NOTE(review): h is presumably the hash seed — confirm against the runtime
// source for the targeted Go version.
//
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// stringStruct is the two-word view that memHashString overlays on a string
// value in order to read its data pointer and its length.
// NOTE(review): this assumes the layout matches the runtime's in-memory
// representation of a string — confirm for the targeted Go version.
type stringStruct struct {
// str is read as the pointer to the string's bytes.
str unsafe.Pointer
// len is read as the string's length in bytes.
len int
}

// memHashString returns a hash of str computed by memhash, widened to uint64.
//
// The string value is reinterpreted as a stringStruct so that its own data
// pointer and length can be handed to memhash directly. memhash returns a
// uintptr, so the result carries no more bits than uintptr has on the target
// platform.
func memHashString(str string) uint64 {
// View the string through stringStruct to get at its pointer and length.
ss := (*stringStruct)(unsafe.Pointer(&str))
// The second argument is fixed at 0, so equal strings always hash equally.
return uint64(memhash(ss.str, 0, uintptr(ss.len)))
}

func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
func extract(src, dst string, sortResults bool, tag language.Tag, sem <-chan empty, wg *sync.WaitGroup) {
defer func() {
<-sem
wg.Done()
Expand All @@ -109,17 +61,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync

// One of the possible optimisations here is to split file in chunks and process
// each chunk individually.
words, err := collectWords(fd, initialDictSize)
words, err := collectWords(fd)
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
return
}

if sortResults {
less := collate.IndexString(lang)
sort.Slice(words, func(i, j int) bool {
return less(words[i], words[j])
})
collator := collate.New(tag)
collator.SortStrings(words)
}

wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
Expand All @@ -145,23 +95,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
}

func collectWords(r io.Reader, sizeHint int) ([]string, error) {
func collectWords(r io.Reader) ([]string, error) {
scanner := bufio.NewScanner(r)
scanner.Split(splitWords)
scanner.Split(splitWordsFunc)

// map[uint64]empty should take less memory than map[string]empty and avoid
// GC checks.
//
// sizeHint is used to preallocate map[string]empty and []string slice and skip
// initial reallocation when they should grow. It is a "magic" number which
// should not be too big or too small. Ideally, it should be approximated from
// the text.
dict := make(map[uint64]empty, sizeHint)
words := make([]string, 0, sizeHint)
dict := make(map[uint64]empty)
words := make([]string, 0)

for scanner.Scan() {
word := scanner.Text()
hash := memHashString(word)
hash := xxhash.Sum64String(word)
if _, ok := dict[hash]; ok {
continue // duplicate detected
}
Expand Down
14 changes: 11 additions & 3 deletions example-golang/app/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ import (
"fmt"
"os"

"golang.org/x/text/language"
"gopkg.in/yaml.v3"
)

// MetaConfig is the spec that ReadSpec decodes from a YAML file.
//
// Lang, Code and Label are read from the "lang", "code" and "label" YAML keys.
// Tag is excluded from YAML (yaml:"-"); ReadSpec fills it in by parsing Lang
// with language.Parse.
type MetaConfig struct {
Lang string `yaml:"lang"`
Code string `yaml:"code"`
Label string `yaml:"label"`
Lang string `yaml:"lang"`
Code string `yaml:"code"`
Label string `yaml:"label"`
Tag language.Tag `yaml:"-"`
}

func ReadSpec(filepath string) (*MetaConfig, error) {
Expand All @@ -24,5 +26,11 @@ func ReadSpec(filepath string) (*MetaConfig, error) {
return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
}

t, err := language.Parse(config.Lang)
if err != nil {
return nil, fmt.Errorf(`spec: invalid language code "%s": %w`, config.Code, err)
}

config.Tag = t
return &config, nil
}
15 changes: 2 additions & 13 deletions example-golang/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,7 @@ module github.com/hipertracker/word_extractor
go 1.17

require (
github.com/bmatcuk/doublestar v1.3.4
github.com/stretchr/testify v1.7.0
github.com/thoas/go-funk v0.9.1
github.com/tidwall/collate v1.0.0
github.com/cespare/xxhash/v2 v2.1.2
golang.org/x/text v0.3.7
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
)

require (
github.com/davecgh/go-spew v1.1.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/tidwall/gjson v1.3.4 // indirect
github.com/tidwall/match v1.0.1 // indirect
github.com/tidwall/pretty v1.0.0 // indirect
golang.org/x/text v0.3.2 // indirect
)
28 changes: 4 additions & 24 deletions example-golang/go.sum
Original file line number Diff line number Diff line change
@@ -1,29 +1,9 @@
github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/thoas/go-funk v0.9.1 h1:O549iLZqPpTUQ10ykd26sZhzD+rmR5pWhuElrhbC20M=
github.com/thoas/go-funk v0.9.1/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q=
github.com/tidwall/collate v1.0.0 h1:xgvwO2UunUoXx3NS3UqHBX63l248ZApqo7mUe3NHy6I=
github.com/tidwall/collate v1.0.0/go.mod h1:S56qxEr2ALVCaGY41npreOJ5lBIILSrxYLgEpxoHVIk=
github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw=
github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=