Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion example-golang/.tool-versions

This file was deleted.

2 changes: 1 addition & 1 deletion example-golang/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@

```
make build
GOGC=2000 ./main
./main [-n=NUMBER_OF_WORKERS, integer] [-s]
```
65 changes: 65 additions & 0 deletions example-golang/app/app.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package app

import (
"fmt"
"os"
"path/filepath"
"sync"
)

// dirPerms is the permission mode used for created output directories (rwxr-xr-x).
const dirPerms = 0755

// empty is a zero-size placeholder used for set-style map values and
// semaphore tokens.
type empty struct{}

// Run extracts unique words from the list of files and saves them to the outDir.
// No error handling, no context cancellation is implemented to match implementations
// in other languages.
func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
	matches, err := filepath.Glob(srcDir)
	if err != nil {
		return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
	}

	if err = clearOutput(outDir); err != nil {
		return err
	}

	// A very basic counting semaphore. Counting unique words from a stream of
	// data is IO-, memory- and CPU-expensive; the semaphore lets at most
	// numWorkers workers run concurrently (by default that number matches the
	// number of CPUs).
	tokens := make(chan empty, numWorkers)

	var workers sync.WaitGroup

	for _, specFile := range matches {
		tokens <- empty{} // acquire a slot before spawning the next worker

		spec, err := ReadSpec(specFile)
		if err != nil {
			return err
		}

		// The word list lives next to the spec file, with a ".txt" extension
		// swapped in for the spec's 3-letter extension.
		txtFile := specFile[:len(specFile)-3] + "txt"
		outFile := filepath.Join(outDir, "extracted-words-for-"+spec.Code+".txt")

		workers.Add(1)
		go extract(txtFile, outFile, "POLISH_CI", sortResults, tokens, &workers)
	}

	workers.Wait()
	close(tokens)

	return nil
}

// clearOutput wipes any previous results and recreates outDir empty.
func clearOutput(outDir string) error {
	err := os.RemoveAll(outDir)
	if err != nil {
		return fmt.Errorf(`app: cleaning previous results in "%s": %w`, outDir, err)
	}

	err = os.MkdirAll(outDir, dirPerms)
	if err != nil {
		return fmt.Errorf(`app: creating output directory "%s": %w`, outDir, err)
	}

	return nil
}
198 changes: 198 additions & 0 deletions example-golang/app/extract.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
package app

import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"sort"
"sync"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/tidwall/collate"
)

const (
	// filePerm is the permission mode for created result files (rw-r--r--).
	filePerm = 0644
	// initialDictSize preallocates the dedup map and word slice in
	// collectWords; a "magic" size hint, not derived from the input.
	initialDictSize = 1e4
)

// splitWordsUnicode is a bufio.SplitFunc that tokenizes data into words made
// of Unicode letters. It works like splitting on the regular expression
// "[^\p{L}]+", which is what the original code used. The unicode checks add
// slight overhead but handle UTF-8 correctly.
//
// For comparison, the Rust and Python versions split text on "[\W\d]+" -
// anything that is not a word character or a digit - a questionable choice.
func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip the run of non-letter runes preceding the next word.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if unicode.IsLetter(r) {
			break
		}
		start += width
	}

	// Consume letter runes until the word ends.
	for i := start; i < len(data); {
		r, width := utf8.DecodeRune(data[i:])
		if !unicode.IsLetter(r) {
			// The terminating rune is consumed together with the token.
			return i + width, data[start:i], nil
		}
		i += width
	}

	if atEOF && len(data) > start {
		// Final word runs to the end of the input.
		return len(data), data[start:], nil
	}

	// No terminator seen yet: ask the Scanner for more data. The skipped
	// non-letter prefix can safely be dropped.
	return start, nil, nil
}

// splitWords is a bufio.SplitFunc that tokenizes data into runs of ASCII
// letters, similar to splitting on the "[\W\d]+" regular expression.
func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// latin reports whether r is an ASCII letter. This inlines isLatin; its
	// r >= 0x80 and r == 0x00 guards are implied by the range checks.
	latin := func(r rune) bool {
		return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
	}

	// Drop the separator run before the word.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if latin(r) {
			break
		}
		start += width
	}

	// Scan the word itself.
	for i := start; i < len(data); {		
		r, width := utf8.DecodeRune(data[i:])
		if !latin(r) {
			// The word plus its terminating rune are consumed.
			return i + width, data[start:i], nil
		}
		i += width
	}

	if atEOF && len(data) > start {
		// Final word runs to the end of the input.
		return len(data), data[start:], nil
	}

	// No terminator seen yet: request more input.
	return start, nil, nil
}

// isLatin reports whether r is an ASCII (basic latin) letter. Runes outside
// the ASCII range and NUL are rejected up front as a fast path.
func isLatin(r rune) bool {
	switch {
	case r >= 0x80, r == 0x00:
		return false
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}

// memhash is the Go runtime's internal memory hash function, pulled in via
// go:linkname. It is fast and seeded randomly at process start, so its values
// are not stable across runs or Go versions.
//
// NOTE(review): linking into the runtime is unsupported API and may break on
// future Go releases.
//
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// stringStruct mirrors the runtime's string header layout (data pointer +
// length) so a string's bytes can be handed to memhash without copying.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// memHashString returns the runtime memhash of str's bytes with seed 0.
// Hashes are only comparable within a single process run (see memhash).
func memHashString(str string) uint64 {
	ss := (*stringStruct)(unsafe.Pointer(&str))
	return uint64(memhash(ss.str, 0, uintptr(ss.len)))
}

// extract reads the source text file src, collects its unique words and writes
// them to dst, optionally sorted using the collation rules for lang. Errors are
// reported to stderr rather than returned, to match the implementations in
// other languages. On exit it releases one semaphore slot and marks wg done.
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
	defer func() {
		<-sem // release the slot acquired by the caller
		wg.Done()
	}()

	fd, err := os.Open(src)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening source file "%s" for reading: %s`, src, err)
		return
	}
	defer fd.Close()

	// One of the possible optimisations here is to split file in chunks and process
	// each chunk individually.
	words, err := collectWords(fd, initialDictSize)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
		return
	}

	if sortResults {
		less := collate.IndexString(lang)
		sort.Slice(words, func(i, j int) bool {
			return less(words[i], words[j])
		})
	}

	wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
	if err != nil {
		// BUG FIX: report dst (the file that failed to open), not src.
		_, _ = fmt.Fprintf(os.Stderr, `extract: opening destination file "%s" for writing: %s`, dst, err)
		return
	}
	// BUG FIX: close the destination file; the original deferred fd.Close()
	// a second time and never closed wd.
	defer wd.Close()

	// Writing word by word can result in too many writes, hence, it is slow.
	// Buffer the output and flush once at the end.
	wr := bufio.NewWriter(wd)

	if err = writeResults(wr, words); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}
	if err = wr.Flush(); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, `extract: writing results "%s": %s`, dst, err)
		return
	}

	_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
}

func collectWords(r io.Reader, sizeHint int) ([]string, error) {
scanner := bufio.NewScanner(r)
scanner.Split(splitWords)

// map[uint64]empty should take less memory than map[string]empty and avoid
// GC checks.
//
// sizeHint is used to preallocate map[string]empty and []string slice and skip
// initial reallocation when they should grow. It is a "magic" number which
// should not be too big or too small. Ideally, it should be approximated from
// the text.
dict := make(map[uint64]empty, sizeHint)
words := make([]string, 0, sizeHint)

for scanner.Scan() {
word := scanner.Text()
hash := memHashString(word)
if _, ok := dict[hash]; ok {
continue // duplicate detected
}

dict[hash] = empty{}
words = append(words, word)

// Theoretically, if sorting is not needed, we can write right here and
// skip words slice preparation below.
}
if err := scanner.Err(); err != nil {
return nil, err
}

return words, nil
}

func writeResults(w io.Writer, words []string) error {
// This is to preallocate memory once for "string => []byte + \n" conversion
// and reuse it on every iteration.
var buf bytes.Buffer
for _, word := range words {
buf.WriteString(word)
buf.WriteRune('\n')

if _, err := buf.WriteTo(w); err != nil {
return err
}

buf.Reset()
}

return nil
}
28 changes: 28 additions & 0 deletions example-golang/app/spec.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package app

import (
"fmt"
"os"

"gopkg.in/yaml.v3"
)

// MetaConfig holds the metadata stored in a word-list spec YAML file.
type MetaConfig struct {
	Lang string `yaml:"lang"` // language name
	Code string `yaml:"code"` // short code; used in output file names (see Run)
	Label string `yaml:"label"` // human-readable label
}

// ReadSpec reads and parses the spec YAML file at path into a MetaConfig.
// The parameter was renamed from "filepath" to avoid shadowing the
// well-known stdlib package name.
func ReadSpec(path string) (*MetaConfig, error) {
	p, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf(`spec: reading YAML file "%s": %w`, path, err)
	}

	var config MetaConfig
	if err = yaml.Unmarshal(p, &config); err != nil {
		return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, path, err)
	}

	return &config, nil
}
2 changes: 1 addition & 1 deletion example-golang/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/hipertracker/word_extractor

go 1.18
go 1.17

require (
github.com/bmatcuk/doublestar v1.3.4
Expand Down
Loading