Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion example-golang/.tool-versions

This file was deleted.

2 changes: 1 addition & 1 deletion example-golang/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@

```
make build
GOGC=2000 ./main
./main [-n=NUMBER_OF_WORKERS, integer] [-s]
```
65 changes: 65 additions & 0 deletions example-golang/app/app.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package app

import (
"fmt"
"os"
"path/filepath"
"sync"
)

// dirPerms is the permission mode used for created output directories (rwxr-xr-x).
const dirPerms = 0755

// empty is a zero-size placeholder used for set-style map values and
// semaphore tokens.
type empty struct{}

// Run extracts unique words from the list of files and saves them to the outDir.
// No error handling, no context cancellation is implemented to match implementations
// in other languages.
func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
	matches, err := filepath.Glob(srcDir)
	if err != nil {
		return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
	}

	if err = clearOutput(outDir); err != nil {
		return err
	}

	// A very basic counting semaphore. Counting unique words from a stream of
	// data is IO-, memory- and CPU-expensive; the semaphore lets at most
	// numWorkers workers run concurrently (by default that number matches the
	// number of CPUs).
	tokens := make(chan empty, numWorkers)

	var workers sync.WaitGroup

	for _, specFile := range matches {
		tokens <- empty{} // acquire a slot before spawning the next worker

		spec, err := ReadSpec(specFile)
		if err != nil {
			return err
		}

		// The word list lives next to the spec file, with a ".txt" extension
		// swapped in for the spec's 3-letter extension.
		txtFile := specFile[:len(specFile)-3] + "txt"
		outFile := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")

		workers.Add(1)
		go extract(txtFile, outFile, "POLISH_CI", sortResults, tokens, &workers)
	}

	workers.Wait()
	close(tokens)

	return nil
}

// clearOutput wipes any previous results and recreates outDir empty.
func clearOutput(outDir string) error {
	err := os.RemoveAll(outDir)
	if err != nil {
		return fmt.Errorf(`app: cleaning previous results in "%s": %w`, outDir, err)
	}

	err = os.MkdirAll(outDir, dirPerms)
	if err != nil {
		return fmt.Errorf(`app: creating output directory "%s": %w`, outDir, err)
	}

	return nil
}
198 changes: 198 additions & 0 deletions example-golang/app/extract.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
package app

import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"sort"
"sync"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/tidwall/collate"
)

const (
	// filePerm is the permission mode for created result files (rw-r--r--).
	filePerm = 0644
	// initialDictSize preallocates the dedup map and word slice in
	// collectWords; a "magic" size hint, not derived from the input.
	initialDictSize = 1e4
)

// splitWordsUnicode is a bufio.SplitFunc that tokenizes data into words made
// of Unicode letters. It works like splitting on the regular expression
// "[^\p{L}]+", which is what the original code used. The unicode checks add
// slight overhead but handle UTF-8 correctly.
//
// For comparison, the Rust and Python versions split text on "[\W\d]+" -
// anything that is not a word character or a digit - a questionable choice.
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip the run of non-letter runes preceding the next word.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if unicode.IsLetter(r) {
			break
		}
		start += width
	}

	// Consume letter runes until the word ends.
	for i := start; i < len(data); {
		r, width := utf8.DecodeRune(data[i:])
		if !unicode.IsLetter(r) {
			// The terminating rune is consumed together with the token.
			return i + width, data[start:i], nil
		}
		i += width
	}

	if atEOF && len(data) > start {
		// Final word runs to the end of the input.
		return len(data), data[start:], nil
	}

	// No terminator seen yet: ask the Scanner for more data. The skipped
	// non-letter prefix can safely be dropped.
	return start, nil, nil
}

// splitWords is a bufio.SplitFunc that tokenizes data into runs of ASCII
// letters, similar to splitting on the "[\W\d]+" regular expression.
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// latin reports whether r is an ASCII letter. This inlines isLatin; its
	// r >= 0x80 and r == 0x00 guards are implied by the range checks.
	latin := func(r rune) bool {
		return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
	}

	// Drop the separator run before the word.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if latin(r) {
			break
		}
		start += width
	}

	// Scan the word itself.
	for i := start; i < len(data); {		
		r, width := utf8.DecodeRune(data[i:])
		if !latin(r) {
			// The word plus its terminating rune are consumed.
			return i + width, data[start:i], nil
		}
		i += width
	}

	if atEOF && len(data) > start {
		// Final word runs to the end of the input.
		return len(data), data[start:], nil
	}

	// No terminator seen yet: request more input.
	return start, nil, nil
}

// isLatin reports whether r is an ASCII (basic latin) letter. Runes outside
// the ASCII range and NUL are rejected up front as a fast path.
func isLatin(r rune) bool {
	switch {
	case r >= 0x80, r == 0x00:
		return false
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}

// memhash is the Go runtime's internal memory hash function, pulled in via
// go:linkname. It is fast and seeded randomly at process start, so its values
// are not stable across runs or Go versions.
//
// NOTE(review): linking into the runtime is unsupported API and may break on
// future Go releases.
//
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// stringStruct mirrors the runtime's string header layout (data pointer +
// length) so a string's bytes can be handed to memhash without copying.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// memHashString returns the runtime memhash of str's bytes with seed 0.
// Hashes are only comparable within a single process run (see memhash).
func memHashString(str string) uint64 {
	ss := (*stringStruct)(unsafe.Pointer(&str))
	return uint64(memhash(ss.str, 0, uintptr(ss.len)))
}

// extract reads the source text file src, collects its unique words and writes
// them to dst, optionally sorted using the collation rules for lang. Errors are
// reported to stderr rather than returned, to match the implementations in
// other languages. On exit it releases one semaphore slot and marks wg done.
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
	defer func() {
		<-sem // release the slot acquired by the caller
		wg.Done()
	}()

	fd, err := os.Open(src)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening source file "%s" for reading: %s`, src, err)
		return
	}
	defer fd.Close()

	// One of the possible optimisations here is to split file in chunks and process
	// each chunk individually.
	words, err := collectWords(fd, initialDictSize)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
		return
	}

	if sortResults {
		less := collate.IndexString(lang)
		sort.Slice(words, func(i, j int) bool {
			return less(words[i], words[j])
		})
	}

	wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
	if err != nil {
		// BUG FIX: report dst (the file that failed to open), not src.
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening destination file "%s" for writing: %s`, dst, err)
		return
	}
	// BUG FIX: close the destination file; the original deferred fd.Close()
	// a second time and never closed wd.
	defer wd.Close()

	// Writing word by word can result in too many writes, hence, it is slow.
	// Buffer the output and flush once at the end.
	wr := bufio.NewWriter(wd)

	if err = writeResults(wr, words); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}
	if err = wr.Flush(); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}

	_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
}

func collectWords(r io.Reader, sizeHint int) ([]string, error) {
scanner := bufio.NewScanner(r)
scanner.Split(splitWords)

// map[uint64]empty should take less memory than map[string]empty and avoid
// GC checks.
//
// sizeHint is used to preallocate map[string]empty and []string slice and skip
// initial reallocation when they should grow. It is a "magic" number which
// should not be too big or too small. Ideally, it should be approximated from
// the text.
dict := make(map[uint64]empty, sizeHint)
words := make([]string, 0, sizeHint)

for scanner.Scan() {
word := scanner.Text()
hash := memHashString(word)
if _, ok := dict[hash]; ok {
continue // duplicate detected
}

dict[hash] = empty{}
words = append(words, word)

// Theoretically, if sorting is not needed, we can write right here and
// skip words slice preparation below.
}
if err := scanner.Err(); err != nil {
return nil, err
}

return words, nil
}

func writeResults(w io.Writer, words []string) error {
// This is to preallocate memory once for "string => []byte + \n" conversion
// and reuse it on every iteration.
var buf bytes.Buffer
for _, word := range words {
buf.WriteString(word)
buf.WriteRune('\n')

if _, err := buf.WriteTo(w); err != nil {
return err
}

buf.Reset()
}

return nil
}
28 changes: 28 additions & 0 deletions example-golang/app/spec.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package app

import (
"fmt"
"os"

"gopkg.in/yaml.v3"
)

// MetaConfig holds the metadata stored in a word-list spec YAML file.
type MetaConfig struct {
	Lang string `yaml:"lang"` // language name
	Code string `yaml:"code"` // short code; used in output file names (see Run)
	Label string `yaml:"label"` // human-readable label
}

// ReadSpec reads and parses the spec YAML file at path into a MetaConfig.
// The parameter was renamed from "filepath" to avoid shadowing the
// well-known stdlib package name.
func ReadSpec(path string) (*MetaConfig, error) {
	p, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf(`spec: reading YAML file "%s": %w`, path, err)
	}

	var config MetaConfig
	if err = yaml.Unmarshal(p, &config); err != nil {
		return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, path, err)
	}

	return &config, nil
}
2 changes: 1 addition & 1 deletion example-golang/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/hipertracker/word_extractor

go 1.18
go 1.17

require (
github.com/bmatcuk/doublestar v1.3.4
Expand Down
Loading