diff --git a/example-golang/.tool-versions b/example-golang/.tool-versions
deleted file mode 100644
index ea45b6a..0000000
--- a/example-golang/.tool-versions
+++ /dev/null
@@ -1 +0,0 @@
-golang 1.18beta1
diff --git a/example-golang/README.md b/example-golang/README.md
index d2020fa..7eb88d6 100644
--- a/example-golang/README.md
+++ b/example-golang/README.md
@@ -4,5 +4,5 @@
 
 ```
 make build
-GOGC=2000 ./main
+./main [-n=NUMBER_OF_WORKERS, integer] [-s]
 ```
diff --git a/example-golang/app/app.go b/example-golang/app/app.go
new file mode 100644
index 0000000..8688617
--- /dev/null
+++ b/example-golang/app/app.go
@@ -0,0 +1,68 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+)
+
+const dirPerms = 0755
+
+type empty struct{}
+
+// Run extracts unique words from the list of files and saves them to the outDir.
+// No error handling, no context cancellation is implemented to match implementations
+// in other languages.
+func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
+	// NOTE(review): filepath.Glob does not support the recursive "**" pattern
+	// that doublestar.Glob handled; "**" matches at most one path segment here.
+	// Confirm the data layout or switch back to doublestar.Glob.
+	files, err := filepath.Glob(srcDir)
+	if err != nil {
+		return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
+	}
+
+	if err = clearOutput(outDir); err != nil {
+		return err
+	}
+
+	// This is a very basic semaphore implementation. Counting unique words from
+	// a stream of data is IO, memory and CPU expensive. The semaphore lets us run
+	// up to numWorkers workers concurrently and, by default, this number
+	// matches the number of CPUs.
+	sem := make(chan empty, numWorkers)
+
+	var wg sync.WaitGroup
+	var spec *MetaConfig
+
+	for _, file := range files {
+		sem <- empty{}
+
+		if spec, err = ReadSpec(file); err != nil {
+			return err
+		}
+
+		src := file[:len(file)-3] + "txt"
+		dst := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")
+
+		wg.Add(1)
+		go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
+	}
+
+	wg.Wait()
+	close(sem)
+
+	return nil
+}
+
+func clearOutput(outDir string) error {
+	if err := os.RemoveAll(outDir); err != nil {
+		return fmt.Errorf(`app: cleaning previous results in "%s": %w`, outDir, err)
+	}
+	if err := os.MkdirAll(outDir, dirPerms); err != nil {
+		return fmt.Errorf(`app: creating output directory "%s": %w`, outDir, err)
+	}
+
+	return nil
+}
diff --git a/example-golang/app/extract.go b/example-golang/app/extract.go
new file mode 100644
index 0000000..7ff88f9
--- /dev/null
+++ b/example-golang/app/extract.go
@@ -0,0 +1,198 @@
+package app
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"sync"
+	"unicode"
+	"unicode/utf8"
+	"unsafe"
+
+	"github.com/tidwall/collate"
+)
+
+const (
+	filePerm        = 0644
+	initialDictSize = 1e4
+)
+
+// splitWordsUnicode splits data into words, using Unicode Letter character class.
+// It works similar to the regular expression "[^\p{L}]+". This is what was used
+// in the original code. Unicode function has slight overhead, but handles UTF-8
+// correctly.
+//
+// Rust and Python versions split text according to "[\W\d]+" - anything that is
+// not a word or a digit. WTF?
+func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	var start int
+	var r rune
+	for width := 0; start < len(data); start += width {
+		if r, width = utf8.DecodeRune(data[start:]); unicode.IsLetter(r) {
+			break
+		}
+	}
+
+	for width, i := 0, start; i < len(data); i += width {
+		if r, width = utf8.DecodeRune(data[i:]); !unicode.IsLetter(r) {
+			return i + width, data[start:i], nil
+		}
+	}
+
+	if atEOF && len(data) > start {
+		return len(data), data[start:], nil
+	}
+
+	return start, nil, nil
+}
+
+// splitWords splits data into words similar to the "[\W\d]+" regular expression.
+func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	var start int
+	var r rune
+	for width := 0; start < len(data); start += width {
+		if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
+			break
+		}
+	}
+
+	for width, i := 0, start; i < len(data); i += width {
+		if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
+			return i + width, data[start:i], nil
+		}
+	}
+
+	if atEOF && len(data) > start {
+		return len(data), data[start:], nil
+	}
+
+	return start, nil, nil
+}
+
+func isLatin(r rune) bool {
+	if r >= 0x80 || r == 0x00 {
+		return false
+	}
+
+	return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
+}
+
+//go:noescape
+//go:linkname memhash runtime.memhash
+func memhash(p unsafe.Pointer, h, s uintptr) uintptr
+
+type stringStruct struct {
+	str unsafe.Pointer
+	len int
+}
+
+func memHashString(str string) uint64 {
+	ss := (*stringStruct)(unsafe.Pointer(&str))
+	return uint64(memhash(ss.str, 0, uintptr(ss.len)))
+}
+
+func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
+	defer func() {
+		<-sem
+		wg.Done()
+	}()
+
+	fd, err := os.Open(src)
+	if err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, `extract: opening source file "%s" for reading: %s`, src, err)
+		return
+	}
+	defer fd.Close()
+
+	// One of the possible optimisations here is to split file in chunks and process
+	// each chunk individually.
+	words, err := collectWords(fd, initialDictSize)
+	if err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
+		return
+	}
+
+	if sortResults {
+		less := collate.IndexString(lang)
+		sort.Slice(words, func(i, j int) bool {
+			return less(words[i], words[j])
+		})
+	}
+
+	wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
+	if err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, `extract: opening destination file "%s" for writing: %s`, dst, err)
+		return
+	}
+	defer wd.Close()
+
+	// Writing word by word can result in too many writes, hence, it is slow.
+	// Let's add some steroids ...
+	wr := bufio.NewWriter(wd)
+
+	if err = writeResults(wr, words); err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
+		return
+	}
+	if err = wr.Flush(); err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
+		return
+	}
+
+	_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
+}
+
+func collectWords(r io.Reader, sizeHint int) ([]string, error) {
+	scanner := bufio.NewScanner(r)
+	scanner.Split(splitWords)
+
+	// map[uint64]empty should take less memory than map[string]empty and avoid
+	// GC checks.
+	//
+	// sizeHint is used to preallocate map[string]empty and []string slice and skip
+	// initial reallocation when they should grow. It is a "magic" number which
+	// should not be too big or too small. Ideally, it should be approximated from
+	// the text.
+	dict := make(map[uint64]empty, sizeHint)
+	words := make([]string, 0, sizeHint)
+
+	for scanner.Scan() {
+		word := scanner.Text()
+		hash := memHashString(word)
+		if _, ok := dict[hash]; ok {
+			continue // duplicate detected
+		}
+
+		dict[hash] = empty{}
+		words = append(words, word)
+
+		// Theoretically, if sorting is not needed, we can write right here and
+		// skip words slice preparation below.
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return words, nil
+}
+
+func writeResults(w io.Writer, words []string) error {
+	// This is to preallocate memory once for "string => []byte + \n" conversion
+	// and reuse it on every iteration.
+	var buf bytes.Buffer
+	for _, word := range words {
+		buf.WriteString(word)
+		buf.WriteRune('\n')
+
+		if _, err := buf.WriteTo(w); err != nil {
+			return err
+		}
+
+		buf.Reset()
+	}
+
+	return nil
+}
diff --git a/example-golang/app/spec.go b/example-golang/app/spec.go
new file mode 100644
index 0000000..a8d7318
--- /dev/null
+++ b/example-golang/app/spec.go
@@ -0,0 +1,28 @@
+package app
+
+import (
+	"fmt"
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+type MetaConfig struct {
+	Lang  string `yaml:"lang"`
+	Code  string `yaml:"code"`
+	Label string `yaml:"label"`
+}
+
+func ReadSpec(filepath string) (*MetaConfig, error) {
+	p, err := os.ReadFile(filepath)
+	if err != nil {
+		return nil, fmt.Errorf(`spec: reading YAML file "%s": %w`, filepath, err)
+	}
+
+	var config MetaConfig
+	if err = yaml.Unmarshal(p, &config); err != nil {
+		return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
+	}
+
+	return &config, nil
+}
diff --git a/example-golang/go.mod b/example-golang/go.mod
index 5f1fbc2..be826ad 100644
--- a/example-golang/go.mod
+++ b/example-golang/go.mod
@@ -1,6 +1,6 @@
 module github.com/hipertracker/word_extractor
 
-go 1.18
+go 1.17
 
 require (
 	github.com/bmatcuk/doublestar v1.3.4
diff --git a/example-golang/main.go b/example-golang/main.go
index 99614c7..b9e7da3 100644
--- a/example-golang/main.go
+++ b/example-golang/main.go
@@ -1,169 +1,55 @@
 package main
 
 import (
+	"flag"
 	"fmt"
-	"io/ioutil"
 	"os"
-	"regexp"
-	"sort"
-	"strings"
-	"sync"
+	"path/filepath"
+	"runtime"
 	"time"
 
-	"github.com/bmatcuk/doublestar"
-	"github.com/thoas/go-funk"
-	"github.com/tidwall/collate"
+	"github.com/hipertracker/word_extractor/app"
 )
 
-type Pair struct {
-	SrcPath string
-	Dstpath string
-}
-
-var srcPath = "../data/??/**/*.yml"
-var outdir = "words"
-
-var wg sync.WaitGroup
-
-// func mainOld() {
-// 	paths, _ := doublestar.Glob(srcPath)
-
-// 	clearResults()
-// 	runWithChannels(paths)
-
-// 	clearResults()
-// 	runWithWaitGroups(paths)
-// }
+const (
+	srcPath = "../data/??/**/*.yml"
+	outDir  = "words"
+)
 
 func main() {
-	var wg sync.WaitGroup
-	wg.Add(1)
-
-	go func() {
-		defer wg.Done()
-
-		t := time.Now()
-		defer timeTrack(t)
-
-		paths, _ := doublestar.Glob(srcPath)
-
-		ch1 := make(chan Pair, len(paths))
-		ch2 := make(chan string, len(paths))
-
-		clearResults()
-
-		for _, yamlPath := range paths {
-			go loadYaml(ch1, yamlPath)
-		}
-
-		for range paths {
-			pair := <-ch1
-			go loadText(ch2, pair.SrcPath, pair.Dstpath, true)
-		}
-		for range paths {
-			fmt.Printf("Saved %s\n", <-ch2)
-		}
-	}()
-	wg.Wait()
-}
-
-func loadYaml(ch chan Pair, path string) {
-	meta := GetYAML(path)
-	srcPath := strings.Replace(path, ".yml", ".txt", -1)
-	dstPath := fmt.Sprintf("%s/extracted-words-for-%s.txt", outdir, meta.Code)
-	ch <- Pair{srcPath, dstPath}
-}
-
-func loadText(ch2 chan string, srcPath string, dstPath string, sorting bool) {
-	content, err := ioutil.ReadFile(srcPath)
+	wd, err := os.Getwd()
 	if err != nil {
-		panic(err)
-	}
-	words := extractUniqueWords(content)
-	if sorting {
-		words = sortWords(words, "POLISH_CI")
+		_, _ = fmt.Fprintf(os.Stderr, "Cannot get working directory: %s", err)
+		os.Exit(1)
 	}
-	text := strings.Join(words, "\n")
-	for err := ioutil.WriteFile(dstPath, []byte(text), 0644); err != nil; {
-		panic(err)
-	}
-	ch2 <- dstPath
-}
 
-func clearResults() {
-	os.RemoveAll(outdir)
-	os.Mkdir(outdir, 0777)
-}
+	defaultNumWorkers := runtime.NumCPU()
 
-func runWithChannels(paths []string) {
-	var ch = make(chan string)
-	t := time.Now()
-	defer timeTrack(t)
-	for _, path := range paths {
-		go func(yamlPath string) {
-			ch <- parseFile(yamlPath, false)
-		}(path)
-	}
-	for range paths {
-		<-ch
-	}
-}
-
-func runWithWaitGroups(paths []string) {
-	var wg sync.WaitGroup
-	t := time.Now()
-	defer timeTrack(t)
-	for _, path := range paths {
-		wg.Add(1)
-		go func(yamlPath string) {
-			parseFile(yamlPath, false)
-			wg.Done()
-		}(path)
-	}
-	wg.Wait()
-}
+	// In the original Go code, results were always sorted, unlike in Rust or
+	// Python implementations. Sorting is turned off, by default.
+	var sortResults bool
+	var numWorkers int
 
-func parseFile(path string, sorting bool) string {
-	// load YAML file
-	meta := GetYAML(path)
-	outfilepath := fmt.Sprintf("%s/extracted-words-for-%s.txt", outdir, meta.Code)
+	flag.IntVar(&numWorkers, "n", defaultNumWorkers, "Number of workers to run (zero to match the number of available CPUs)")
+	flag.BoolVar(&sortResults, "s", sortResults, "Sort results")
+	flag.Parse()
 
-	// load text file
-	filepath := strings.Replace(path, ".yml", ".txt", -1)
-	content, err := ioutil.ReadFile(filepath)
-	if err != nil {
-		panic(err)
+	if numWorkers <= 0 {
+		numWorkers = defaultNumWorkers
 	}
-	words := extractUniqueWords(content)
 
+	t := time.Now()
 
-	// sort unique words
-	if sorting {
-		words = sortWords(words, "POLISH_CI")
+	var exitCode int
+	if err = app.Run(filepath.Join(wd, srcPath), filepath.Join(wd, outDir), numWorkers, sortResults); err != nil {
+		_, _ = fmt.Fprintf(os.Stderr, "Error: %s", err)
+		exitCode = 1
 	}
-	text := strings.Join(words, "\n")
-	for err := ioutil.WriteFile(outfilepath, []byte(text), 0644); err != nil; {
-		panic(err)
-	}
-	return outfilepath
+
+	timeTrack(t)
+	os.Exit(exitCode)
 }
 
 func timeTrack(start time.Time) {
 	fmt.Println("Total timing: ", time.Since(start))
 }
-
-func extractUniqueWords(content []byte) []string {
-	text := strings.ToLower(string(content))
-	re := regexp.MustCompile(`[^\p{L}]+`)
-	tokens := re.Split(text, -1)
-	return funk.UniqString(tokens)
-}
-
-func sortWords(words []string, lang string) []string {
-	less := collate.IndexString(lang)
-	sort.SliceStable(words, func(i, j int) bool {
-		return less(words[i], words[j])
-	})
-	return words
-}
diff --git a/example-golang/main_test.go b/example-golang/main_test.go
deleted file mode 100644
index f1e0a1c..0000000
--- a/example-golang/main_test.go
+++ /dev/null
@@ -1,21 +0,0 @@
-package main
-
-import (
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func Test_extractUniqueWords(t *testing.T) {
-	text := "ćma cześć ser. śmiech!żółw zebra-łuk len Ćma Żółw ser"
-	expected := []string{"ćma", "cześć", "ser", "śmiech", "żółw", "zebra", "łuk", "len"}
-	given := extractUniqueWords([]byte(text))
-	assert.Equal(t, expected, given, "text should be tokenized into unique words")
-}
-
-func Test_sortWords(t *testing.T) {
-	words := []string{"ćma", "cześć", "ser", "śmiech", "żółw", "zebra", "łuk", "len"}
-	expected := []string{"cześć", "ćma", "len", "łuk", "ser", "śmiech", "zebra", "żółw"}
-	given := sortWords(words, "POLISH_CI")
-	assert.Equal(t, expected, given, "words should be sorted out using Polish grammar rules")
-}
diff --git a/example-golang/yaml.go b/example-golang/yaml.go
deleted file mode 100644
index a2298da..0000000
--- a/example-golang/yaml.go
+++ /dev/null
@@ -1,30 +0,0 @@
-package main
-
-import (
-	"io/ioutil"
-
-	"gopkg.in/yaml.v3"
-)
-
-type metaConfig struct {
-	Lang  string
-	Code  string
-	Label string
-}
-
-func (m *metaConfig) Parse(data []byte) error {
-	return yaml.Unmarshal(data, m)
-}
-
-func GetYAML(filepath string) metaConfig {
-	data, err := ioutil.ReadFile(filepath)
-	if err != nil {
-		panic(err)
-	}
-	config := metaConfig{}
-	err = config.Parse(data)
-	if err != nil {
-		panic(err)
-	}
-	return config
-}