Improve Go example: use x/text collation and xxhash, unify word splitting

hipertracker · hipertracker · commit b4b3e697c0d2 · 2022-02-20T01:44:44.000Z
diff --git a/example-golang/app/app.go b/example-golang/app/app.go
@@ -46,7 +46,7 @@ func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
 		dst := filepath.Join(outDir, spec.Lang+"-"+spec.Code+".txt")
 
 		wg.Add(1)
-		go extract(src, dst, langMap[spec.Lang], sortResults, sem, &wg)
+		go extract(src, dst, sortResults, spec.Tag, sem, &wg)
 	}
 
 	wg.Wait()
@@ -65,29 +65,3 @@ func clearOutput(outDir string) error {
 
 	return nil
 }
-
-var langMap = map[string]string{
-	"en": "ENGLISH_CI", // The first language is used as fallback.
-	"la": "ENGLISH_CI", // Latin
-	"eo": "ENGLISH_CI", // Esperanto
-	"ar": "ARABIC_CI",
-	"cz": "CZECH_CI",
-	"da": "DANISH_CI", // ?
-	"de": "GERMAN_CI",
-	"el": "GREEK_CI",
-	"es": "SPANISH_CI",
-	"fi": "FINNISH_CI",
-	"fr": "FRENCH_CI",
-	"he": "HEBREW_CI",
-	"hr": "CROATIAN_CI",
-	"hu": "HUNGARIAN_CI",
-	"it": "ITALIAN_CI",
-	"lt": "LITHUANIAN_CI",
-	"nl": "DUTCH_CI",
-	"pl": "POLISH_CI",
-	"pt": "PORTUGUESE_CI",
-	"ru": "RUSSIAN_CI",
-	"sk": "SLOVAK_CI",
-	"sv": "SWEDISH_CI",
-	"uk": "UKRAINIAN_CI",
-}
diff --git a/example-golang/app/extract.go b/example-golang/app/extract.go
@@ -6,29 +6,26 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"sort"
 	"strings"
 	"sync"
 	"unicode"
 	"unicode/utf8"
-	"unsafe"
 
-	"github.com/tidwall/collate"
+	"github.com/cespare/xxhash/v2"
+	"golang.org/x/text/collate"
+	"golang.org/x/text/language"
 )
 
-const (
-	filePerm        = 0644
-	InitialDictSize = 10000
-)
+const filePerm = 0644
 
-// splitWordsUnicode splits data into words, using Unicode Letter character class.
+// splitWordsFunc splits data into words, using Unicode Letter character class.
 // It works similar to the regular expression "[^\p{L}]+". This is what was used
 // in the original code. Unicode function has slight overhead, but handles UTF-8
 // correctly.
 //
 // Rust and Python versions split text according to "[\W\d]+" - anything that is
-// not a word or a digit. TODO: comfirm if some words contain digits
-func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err error) {
+// not a word or a digit. TODO: confirm whether some words contain digits.
+func splitWordsFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
 	var start int
 	var r rune
 	for width := 0; start < len(data); start += width {
@@ -50,52 +47,7 @@ func splitWordsUnicode(data []byte, atEOF bool) (advance int, token []byte, err
 	return start, nil, nil
 }
 
-// splitWords splits data into words similar to the "[\W\d]+" regular expression.
-func splitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
-	var start int
-	var r rune
-	for width := 0; start < len(data); start += width {
-		if r, width = utf8.DecodeRune(data[start:]); isLatin(r) {
-			break
-		}
-	}
-
-	for width, i := 0, start; i < len(data); i += width {
-		if r, width = utf8.DecodeRune(data[i:]); !isLatin(r) {
-			return i + width, data[start:i], nil
-		}
-	}
-
-	if atEOF && len(data) > start {
-		return len(data), data[start:], nil
-	}
-
-	return start, nil, nil
-}
-
-func isLatin(r rune) bool {
-	if r >= 0x80 || r == 0x00 {
-		return false
-	}
-
-	return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
-}
-
-//go:noescape
-//go:linkname memhash runtime.memhash
-func memhash(p unsafe.Pointer, h, s uintptr) uintptr
-
-type stringStruct struct {
-	str unsafe.Pointer
-	len int
-}
-
-func memHashString(str string) uint64 {
-	ss := (*stringStruct)(unsafe.Pointer(&str))
-	return uint64(memhash(ss.str, 0, uintptr(ss.len)))
-}
-
-func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
+func extract(src, dst string, sortResults bool, tag language.Tag, sem <-chan empty, wg *sync.WaitGroup) {
 	defer func() {
 		<-sem
 		wg.Done()
@@ -110,17 +62,15 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
 
 	// One of the possible optimisations here is to split file in chunks and process
 	// each chunk individually.
-	words, err := collectWords(fd, lang, InitialDictSize)
+	words, err := collectWords(fd)
 	if err != nil {
 		_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
 		return
 	}
 
 	if sortResults {
-		less := collate.IndexString(lang)
-		sort.Slice(words, func(i, j int) bool {
-			return less(words[i], words[j])
-		})
+		collator := collate.New(tag)
+		collator.SortStrings(words)
 	}
 
 	wd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, filePerm)
@@ -146,28 +96,18 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
 	_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
 }
 
-func collectWords(r io.Reader, lang string, sizeHint int) ([]string, error) {
+func collectWords(r io.Reader) ([]string, error) {
 	scanner := bufio.NewScanner(r)
-	ascii := []string{"en", "la", "eo"} // English, Latin, Esperanto
-	if stringInSlice(lang, ascii) {
-		scanner.Split(splitWords)
-	} else {
-		scanner.Split(splitWordsUnicode)
-	}
+	scanner.Split(splitWordsFunc)
 
 	// map[uint64]empty should take less memory than map[string]empty and avoid
 	// GC checks.
-	//
-	// sizeHint is used to preallocate map[string]empty and []string slice and skip
-	// initial reallocation when they should grow. It is a "magic" number which
-	// should not be too big or too small. Ideally, it should be approximated from
-	// the text.
-	dict := make(map[uint64]empty, sizeHint)
-	words := make([]string, 0, sizeHint)
+	dict := make(map[uint64]empty)
+	words := make([]string, 0)
 
 	for scanner.Scan() {
 		word := strings.ToLower(scanner.Text())
-		hash := memHashString(word)
+		hash := xxhash.Sum64String(word)
 		if _, ok := dict[hash]; ok {
 			continue // duplicate detected
 		}
@@ -202,27 +142,3 @@ func writeResults(w io.Writer, words []string) error {
 
 	return nil
 }
-
-func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
-	r := strings.NewReader(content)
-	words, err := collectWords(r, lang, sizeHint)
-
-	if err != nil {
-		_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
-		return nil, err
-	}
-	less := collate.IndexString(lang)
-	sort.Slice(words, func(i, j int) bool {
-		return less(words[i], words[j])
-	})
-	return words, nil
-}
-
-func stringInSlice(a string, list []string) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
diff --git a/example-golang/app/spec.go b/example-golang/app/spec.go
@@ -4,13 +4,15 @@ import (
 	"fmt"
 	"os"
 
+	"golang.org/x/text/language"
 	"gopkg.in/yaml.v3"
 )
 
 type MetaConfig struct {
-	Lang  string `yaml:"lang"`
-	Code  string `yaml:"code"`
-	Label string `yaml:"label"`
+	Lang  string       `yaml:"lang"`
+	Code  string       `yaml:"code"`
+	Label string       `yaml:"label"`
+	Tag   language.Tag `yaml:"-"`
 }
 
 func ReadSpec(filepath string) (*MetaConfig, error) {
@@ -24,5 +26,11 @@ func ReadSpec(filepath string) (*MetaConfig, error) {
 		return nil, fmt.Errorf(`spec: parsing YAML file "%s": %w`, filepath, err)
 	}
 
+	t, err := language.Parse(config.Lang)
+	if err != nil {
+		return nil, fmt.Errorf(`spec: invalid language code "%s": %w`, config.Code, err)
+	}
+
+	config.Tag = t
 	return &config, nil
 }
diff --git a/example-golang/go.mod b/example-golang/go.mod
@@ -4,16 +4,13 @@ go 1.17
 
 require (
 	github.com/bmatcuk/doublestar v1.3.4
+	github.com/cespare/xxhash/v2 v2.1.2
 	github.com/stretchr/testify v1.7.0
-	github.com/tidwall/collate v1.0.0
+	golang.org/x/text v0.3.2
 	gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
 )
 
 require (
 	github.com/davecgh/go-spew v1.1.0 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
-	github.com/tidwall/gjson v1.3.4 // indirect
-	github.com/tidwall/match v1.0.1 // indirect
-	github.com/tidwall/pretty v1.0.0 // indirect
-	golang.org/x/text v0.3.2 // indirect
 )
diff --git a/example-golang/go.sum b/example-golang/go.sum
@@ -1,20 +1,14 @@
 github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
 github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
+github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
+github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/tidwall/collate v1.0.0 h1:xgvwO2UunUoXx3NS3UqHBX63l248ZApqo7mUe3NHy6I=
-github.com/tidwall/collate v1.0.0/go.mod h1:S56qxEr2ALVCaGY41npreOJ5lBIILSrxYLgEpxoHVIk=
-github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw=
-github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
-github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
-github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
-github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
-github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
 golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/example-golang/main.go b/example-golang/main.go
@@ -24,8 +24,6 @@ func main() {
 
 	defaultNumWorkers := runtime.NumCPU()
 
-	// In the original Go code, results where always sorted, unlike in Rust or
-	// Python implementations. Sorting is turned off, by default.
 	var sortResults bool
 	var numWorkers int