Skip to content

Commit 3092e88

Browse files
committed
fix bug in Go not reading recursive folders, fix results, fix sorting rules for ASCII and non-ASCII text files
1 parent d23a65f commit 3092e88

File tree

4 files changed

+25
-8
lines changed

4 files changed

+25
-8
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ The following results are for 123 unique utf-8 Bible text files in 23 languages
1414
* Machine: MacBook Pro 16" 64GB 2TB M1Max 10 cores.
1515

1616
<pre>
17-
1. Golang 1.17.6 = 0.47s (with sorting: 2.25s)
18-
2. Rust 1.58 = 1.14s (with sorting: 1.62s) with tokio (previous: 1.34s, with sorting: 1.79s)
17+
1. Rust 1.58 = 1.14s (with sorting: 1.62s) with tokio (previous: 1.34s, with sorting: 1.79s)
18+
2. Golang 1.17.6 = 1.34s (with sorting: 6.55s)
1919
3. Python 3.10.2 = 2.80s (with multiprocessing)
2020
4. Julia 1.7.1 = 4.522s
2121
5. Crystal 1.3.2 = 5.72s

example-golang/app/app.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"os"
66
"path/filepath"
77
"sync"
8+
9+
"github.com/bmatcuk/doublestar"
810
)
911

1012
const dirPerms = 0755
@@ -15,7 +17,7 @@ type empty struct{}
1517
// No error handling or context cancellation is implemented, to match the implementations
1618
// in other languages.
1719
func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
18-
files, err := filepath.Glob(srcDir)
20+
files, err := doublestar.Glob(srcDir)
1921
if err != nil {
2022
return fmt.Errorf(`app: getting list of files "%s": %w`, srcDir, err)
2123
}
@@ -44,6 +46,7 @@ func Run(srcDir, outDir string, numWorkers int, sortResults bool) error {
4446
dst := filepath.Join(outDir, spec.Lang+"-"+spec.Code+".txt")
4547

4648
wg.Add(1)
49+
// TODO: add more collations
4750
go extract(src, dst, "POLISH_CI", sortResults, sem, &wg)
4851
}
4952

example-golang/app/extract.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
110110

111111
// One possible optimisation here is to split the file into chunks and process
112112
// each chunk individually.
113-
words, err := collectWords(fd, InitialDictSize)
113+
words, err := collectWords(fd, lang, InitialDictSize)
114114
if err != nil {
115115
_, _ = fmt.Fprintf(os.Stderr, `extract: reading input "%s": %s`, src, err)
116116
return
@@ -146,9 +146,14 @@ func extract(src, dst, lang string, sortResults bool, sem <-chan empty, wg *sync
146146
_, _ = fmt.Fprintf(os.Stdout, "Saved %s\n", dst)
147147
}
148148

149-
func collectWords(r io.Reader, sizeHint int) ([]string, error) {
149+
func collectWords(r io.Reader, lang string, sizeHint int) ([]string, error) {
150150
scanner := bufio.NewScanner(r)
151-
scanner.Split(splitWords)
151+
ascii := []string{"en", "la"}
152+
if stringInSlice(lang, ascii) {
153+
scanner.Split(splitWords)
154+
} else {
155+
scanner.Split(splitWordsUnicode)
156+
}
152157

153158
// map[uint64]empty should take less memory than map[string]empty and avoid
154159
// GC checks.
@@ -200,7 +205,8 @@ func writeResults(w io.Writer, words []string) error {
200205

201206
func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
202207
r := strings.NewReader(content)
203-
words, err := collectWords(r, sizeHint)
208+
words, err := collectWords(r, lang, sizeHint)
209+
204210
if err != nil {
205211
_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
206212
return nil, err
@@ -211,3 +217,12 @@ func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, er
211217
})
212218
return words, nil
213219
}
220+
221+
// stringInSlice reports whether a is equal to any element of list.
// It compares elements with == in order and returns true on the first match;
// it returns false when no element matches, including when list is empty.
func stringInSlice(a string, list []string) bool {
222+
for _, b := range list {
223+
if b == a {
224+
return true
225+
}
226+
}
227+
return false
228+
}

example-golang/main.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"path/filepath"
88
"runtime"
99
"time"
10-
1110
"wordextractor/app"
1211
)
1312

0 commit comments

Comments
 (0)