Skip to content

Commit 4232926

Browse files
committed
Removed extra loop when copying unique words from map to strings slice.
Replaced unique words map with more efficient map[uint64]empty and added runtime.memhash algo to generate map keys from strings.
1 parent c1f3c66 commit 4232926

File tree

1 file changed

+36
-18
lines changed

1 file changed

+36
-18
lines changed

example-golang/app/extract.go

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,15 @@ import (
1010
"sync"
1111
"unicode"
1212
"unicode/utf8"
13+
"unsafe"
1314

1415
"github.com/tidwall/collate"
1516
)
1617

17-
const filePerm = 0644
18+
const (
19+
filePerm = 0644
20+
initialDictSize = 1e4
21+
)
1822

1923
// splitWordsUnicode splits data into words, using Unicode Letter character class.
2024
// It works similar to the regular expression "[^\p{L}]+". This is what was used
@@ -76,6 +80,20 @@ func isLatin(r rune) bool {
7680
return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
7781
}
7882

83+
//go:noescape
84+
//go:linkname memhash runtime.memhash
85+
func memhash(p unsafe.Pointer, h, s uintptr) uintptr
86+
87+
type stringStruct struct {
88+
str unsafe.Pointer
89+
len int
90+
}
91+
92+
func memHashString(str string) uint64 {
93+
ss := (*stringStruct)(unsafe.Pointer(&str))
94+
return uint64(memhash(ss.str, 0, uintptr(ss.len)))
95+
}
96+
7997
func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync.WaitGroup) {
8098
defer func() {
8199
<-sem
@@ -91,7 +109,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
91109

92110
// One of the possible optimisations here is to split file in chunks and process
93111
// each chunk individually.
94-
words, err := collectWords(fd)
112+
words, err := collectWords(fd, initialDictSize)
95113
if err != nil {
96114
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
97115
return
@@ -127,18 +145,29 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
127145
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
128146
}
129147

130-
func collectWords(r io.Reader) ([]string, error) {
148+
func collectWords(r io.Reader, sizeHint int) ([]string, error) {
131149
scanner := bufio.NewScanner(r)
132-
scanner.Split(splitWordsUnicode)
150+
scanner.Split(splitWords)
151+
152+
// map[uint64]empty should take less memory than map[string]empty and avoid
153+
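// GC checks. Caveat: two distinct words whose hashes collide are treated as duplicates, so only the first one is kept.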
154+
//
155+
// sizeHint is used to preallocate map[uint64]empty and []string slice and skip
156+
// initial reallocation when they should grow. It is a "magic" number which
157+
// should not be too big or too small. Ideally, it should be approximated from
158+
// the text.
159+
dict := make(map[uint64]empty, sizeHint)
160+
words := make([]string, 0, sizeHint)
133161

134-
dict := make(map[string]empty)
135162
for scanner.Scan() {
136163
word := scanner.Text()
137-
if _, ok := dict[word]; ok {
164+
hash := memHashString(word)
165+
if _, ok := dict[hash]; ok {
138166
continue // duplicate detected
139167
}
140168

141-
dict[word] = empty{}
169+
dict[hash] = empty{}
170+
words = append(words, word)
142171

143172
// Theoretically, if sorting is not needed, we can write right here and
144173
// skip words slice preparation below.
@@ -147,17 +176,6 @@ func collectWords(r io.Reader) ([]string, error) {
147176
return nil, err
148177
}
149178

150-
// This is expensive ...
151-
words := make([]string, len(dict))
152-
153-
var i int
154-
for w := range dict {
155-
words[i] = w
156-
i++
157-
158-
delete(dict, w)
159-
}
160-
161179
return words, nil
162180
}
163181

0 commit comments

Comments
 (0)