Skip to content

Commit 151b08b

Browse files
committed
Merge branch 'roman-kulish-golang-example-refactor'
2 parents 754902d + b3a7186 commit 151b08b

File tree

4 files changed

+42
-54
lines changed

4 files changed

+42
-54
lines changed

example-golang/.tool-versions

Lines changed: 0 additions & 1 deletion
This file was deleted.

example-golang/app/extract.go

Lines changed: 40 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,15 @@ import (
1111
"sync"
1212
"unicode"
1313
"unicode/utf8"
14+
"unsafe"
1415

1516
"github.com/tidwall/collate"
1617
)
1718

18-
const filePerm = 0644
19+
const (
20+
filePerm = 0644
21+
InitialDictSize = 10000
22+
)
1923

2024
// splitWordsUnicode splits data into words, using Unicode Letter character class.
2125
// It works similarly to the regular expression "[^\p{L}]+". This is what was used
@@ -77,6 +81,20 @@ func isLatin(r rune) bool {
7781
return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
7882
}
7983

84+
//go:noescape
85+
//go:linkname memhash runtime.memhash
86+
func memhash(p unsafe.Pointer, h, s uintptr) uintptr
87+
88+
type stringStruct struct {
89+
str unsafe.Pointer
90+
len int
91+
}
92+
93+
func memHashString(str string) uint64 {
94+
ss := (*stringStruct)(unsafe.Pointer(&str))
95+
return uint64(memhash(ss.str, 0, uintptr(ss.len)))
96+
}
97+
8098
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
8199
defer func() {
82100
<-sem
@@ -92,7 +110,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
92110

93111
// One possible optimisation here is to split the file into chunks and process
94112
// each chunk individually.
95-
words, err := collectWords(fd)
113+
words, err := collectWords(fd, InitialDictSize)
96114
if err != nil {
97115
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
98116
return
@@ -125,21 +143,32 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
125143
return
126144
}
127145

128-
// _, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
146+
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
129147
}
130-
func collectWords(r io.Reader) ([]string, error) {
131-
scanner := bufio.NewScanner(r)
132148

133-
scanner.Split(splitWordsUnicode)
149+
func collectWords(r io.Reader, sizeHint int) ([]string, error) {
150+
scanner := bufio.NewScanner(r)
151+
scanner.Split(splitWords)
152+
153+
// map[uint64]empty should take less memory than map[string]empty and avoid
154+
// GC pointer scanning; note that distinct words with colliding hashes are dropped.
155+
//
156+
// sizeHint is used to preallocate map[uint64]empty and []string slice and skip
157+
// initial reallocation when they should grow. It is a "magic" number which
158+
// should not be too big or too small. Ideally, it should be approximated from
159+
// the text.
160+
dict := make(map[uint64]empty, sizeHint)
161+
words := make([]string, 0, sizeHint)
134162

135-
dict := make(map[string]empty)
136163
for scanner.Scan() {
137164
word := strings.ToLower(scanner.Text())
138-
if _, ok := dict[word]; ok {
165+
hash := memHashString(word)
166+
if _, ok := dict[hash]; ok {
139167
continue // duplicate detected
140168
}
141169

142-
dict[word] = empty{}
170+
dict[hash] = empty{}
171+
words = append(words, word)
143172

144173
// Theoretically, if sorting is not needed, we can write right here and
145174
// skip building the words slice altogether.
@@ -148,17 +177,6 @@ func collectWords(r io.Reader) ([]string, error) {
148177
return nil, err
149178
}
150179

151-
// This is expensive ...
152-
words := make([]string, len(dict))
153-
154-
var i int
155-
for w := range dict {
156-
words[i] = w
157-
i++
158-
159-
delete(dict, w)
160-
}
161-
162180
return words, nil
163181
}
164182

@@ -180,9 +198,9 @@ func writeResults(w io.Writer, words []string) error {
180198
return nil
181199
}
182200

183-
func ExtractUniqueWords(content string, lang string) ([]string, error) {
201+
func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
184202
r := strings.NewReader(content)
185-
words, err := collectWords(r)
203+
words, err := collectWords(r, sizeHint)
186204
if err != nil {
187205
_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
188206
return nil, err

example-golang/app_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ import (
1212
func Test_ExtractUniqueWords(t *testing.T) {
1313
text := "ćma cześć ser. śmiech!żółw zebra-łuk len Ćma Żółw ser"
1414
expected := []string{"cześć", "ćma", "len", "łuk", "ser", "śmiech", "zebra", "żółw"}
15-
given, err := app.ExtractUniqueWords(text, "pl")
15+
initialDictSize := app.InitialDictSize
16+
given, err := app.ExtractUniqueWords(text, "pl", initialDictSize)
1617
if err != nil {
1718
_, _ = fmt.Fprintf(os.Stderr, `ExtractUniqueWords error: %s`, err)
1819
return

example-golang/yaml.go

Lines changed: 0 additions & 30 deletions
This file was deleted.

0 commit comments

Comments
 (0)