Skip to content

Commit 5583562

Browse files
authored
Merge branch 'master' into golang-example-refactor
2 parents bc8ecd5 + b4b3e69 commit 5583562

File tree

9 files changed

+102
-55
lines changed

9 files changed

+102
-55
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@
1313
/**/main
1414
/**/target
1515
/**/.idea
16-
/**/.venv
16+
/**/.venv
17+
/data.full/

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ The following results are for 123 unique utf-8 Bible text files in 23 languages
1515

1616
<pre>
1717
1. Rust 1.58 = 1.14s (with sorting: 1.59s) with Tokio (previous: 1.34s, with sorting: 1.79s)
18-
2. Golang 1.17.6 = 1.34s (with sorting: 6.55s)
18+
2. Golang 1.17.6 = 1.34s (with sorting: 4.56s)
1919
3. Python 3.10.2 = 2.80s (with multiprocessing)
2020
4. Julia 1.7.1 = 4.522
2121
5. Crystal 1.3.2 = 5.72s
2222
6. Elixir 1.13.2 = 7.82s
23-
7. Ruby 3.1.0 = 8.31s (with Parallel)
23+
7. Ruby 3.1.0 = 10.44s (with Parallel), with sorting: 10.51s
2424
</pre>
2525

2626
### Conclusion
@@ -31,7 +31,7 @@ The new optimized Golang code version is very fast, slower than Rust but faster
3131

3232
* Python = has a great implementation of [ICU](https://icu.unicode.org/related) library however it does not support arm64/M1 platform, hence I couldn't use it in this comparison.
3333

34-
* Ruby = same as Python, no ICU for M1.
34+
* Ruby = can sort Unicode text, but without collation because it can't use ICU on arm64/M1.
3535

3636
* Elixir = same as Python, no ICU for M1.
3737

@@ -46,3 +46,9 @@ The new optimized Golang code version is very fast, slower than Rust but faster
4646
[@romanatnews](https://github.com/romanatnews) (Golang example refactoring)
4747

4848
[@pan93412](https://github.com/pan93412) (Rust example refactoring using Tokio runtime)
49+
50+
## CHANGES
51+
52+
2022-02-08
53+
54+
Added an improved version of the Ruby code that correctly reads only the plain text to tokenize (it ignores the sigla in each verse) and uses the correct regular expression for extracting words. The code is a little slower, but it works almost as expected (almost, because on arm64/M1 it can't use ICU).

example-golang/.tool-versions

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
golang 1.17.7

example-golang/app/extract.go

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"io"
88
"os"
9+
"strings"
910
"sync"
1011
"unicode"
1112
"unicode/utf8"
@@ -105,7 +106,7 @@ func collectWords(r io.Reader) ([]string, error) {
105106
words := make([]string, 0)
106107

107108
for scanner.Scan() {
108-
word := scanner.Text()
109+
word := strings.ToLower(scanner.Text())
109110
hash := xxhash.Sum64String(word)
110111
if _, ok := dict[hash]; ok {
111112
continue // duplicate detected
@@ -141,27 +142,3 @@ func writeResults(w io.Writer, words []string) error {
141142

142143
return nil
143144
}
144-
145-
func ExtractUniqueWords(content string, lang string, sizeHint int) ([]string, error) {
146-
r := strings.NewReader(content)
147-
words, err := collectWords(r, lang, sizeHint)
148-
149-
if err != nil {
150-
_, _ = fmt.Fprintf(os.Stderr, `collectWords error: %s`, err)
151-
return nil, err
152-
}
153-
less := collate.IndexString(lang)
154-
sort.Slice(words, func(i, j int) bool {
155-
return less(words[i], words[j])
156-
})
157-
return words, nil
158-
}
159-
160-
func stringInSlice(a string, list []string) bool {
161-
for _, b := range list {
162-
if b == a {
163-
return true
164-
}
165-
}
166-
return false
167-
}

example-golang/go.mod

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,14 @@ module wordextractor
33
go 1.17
44

55
require (
6+
github.com/bmatcuk/doublestar v1.3.4
67
github.com/cespare/xxhash/v2 v2.1.2
7-
golang.org/x/text v0.3.7
8+
github.com/stretchr/testify v1.7.0
9+
golang.org/x/text v0.3.2
810
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
911
)
12+
13+
require (
14+
github.com/davecgh/go-spew v1.1.0 // indirect
15+
github.com/pmezard/go-difflib v1.0.0 // indirect
16+
)

example-golang/go.sum

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
1+
github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
2+
github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
13
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
24
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
3-
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
4-
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
5+
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
6+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
7+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
8+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
9+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
10+
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
11+
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
12+
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
13+
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
514
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
615
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
716
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
17+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
818
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
919
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

example-golang/main.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ func main() {
2424

2525
defaultNumWorkers := runtime.NumCPU()
2626

27-
// In the original Go code, results were always sorted, unlike in Rust or
28-
// Python implementations. Sorting is turned off, by default.
2927
var sortResults bool
3028
var numWorkers int
3129

example-ruby/README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,17 @@
22

33
## setup and run
44

5+
Syntax
6+
57
```
6-
ruby words.rb
8+
❯ ruby words.rb -h
9+
Usage: ruby words.rb [options]
10+
-n [NUM] Number of cores to run (default 10)
11+
-s Sort results
712
```
813

14+
Run
15+
16+
```
17+
ruby words.rb -s
18+
```

example-ruby/words.rb

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,68 @@
11
require 'yaml'
2-
require 'yaml'
32
require 'parallel'
43
require 'etc'
54
require 'fileutils'
5+
require 'optparse'
66

7-
outdir = 'words'
8-
9-
FileUtils.rm_rf(outdir)
10-
Dir.mkdir(outdir)
7+
class WordsExtractor
8+
def initialize(cores: Etc.nprocessors, sorting: false, outdir: 'words', source: '../data/??/**/*.yml')
9+
@cores = cores
10+
@sorting = sorting
11+
@outdir = outdir
12+
@source = source
13+
end
1114

12-
t = Time.now
15+
def clear_output
16+
FileUtils.rm_rf(@outdir)
17+
Dir.mkdir(@outdir)
18+
end
1319

14-
sorted = false
20+
def get_words(filepath)
21+
IO.readlines(filepath).map do |line|
22+
line.strip.downcase.split(' ')[2...-1].join(' ').split(/[^\p{L}]+/).uniq
23+
end.flatten.uniq
24+
end
1525

16-
paths = Dir['../data/??/**/*.yml']
17-
count = paths.count
26+
def save_words(words:, meta:, yaml_path:, count:, i:)
27+
outpath = "#{@outdir}/#{meta['lang']}-#{meta['code']}.txt"
28+
puts(format('[%3d/%d] %s/%s', i, count, yaml_path, outpath))
29+
File.write(outpath, words.join("\n"))
30+
end
1831

19-
sizes = Parallel.map_with_index(paths, in_processes: Etc.nprocessors) do |yaml_path, i|
20-
meta = YAML.load_file(yaml_path)
21-
filepath = yaml_path.gsub('.yml', '.txt')
22-
words = IO.read(filepath).downcase.strip.split(/[^\p{word}]+/).uniq
23-
words = words.sort if sorted
24-
outpath = "#{outdir}/#{meta['lang']}-#{meta['code']}.txt"
25-
puts(format('[%3d/%d] %s/%s', i, count, yaml_path, outpath))
26-
File.write(outpath, words.join("\n"))
27-
File.size(filepath)
32+
def run
33+
print "Running using #{@cores} processes"
34+
print ' with sorting' if @sorting
35+
puts '...'
36+
clear_output
37+
start = Time.now
38+
paths = Dir[@source]
39+
count = paths.count
40+
sizes = Parallel.map_with_index(paths, in_processes: @cores) do |yaml_path, i|
41+
meta = YAML.load_file(yaml_path)
42+
filepath = yaml_path.gsub('.yml', '.txt')
43+
words = get_words(filepath)
44+
words.sort! if @sorting
45+
save_words(words:, meta:, yaml_path:, count:, i:)
46+
File.size(filepath)
47+
end
48+
puts "Total size: #{(sizes.sum / 1024.0 / 1024).round} MB"
49+
puts "Total time: #{Time.now - start} s"
50+
end
2851
end
2952

30-
puts "Total size: #{(sizes.sum / 1024.0 / 1024).round} MB"
31-
puts "Total time: #{Time.now - t} s"
53+
if __FILE__ == $PROGRAM_NAME
54+
cores = Etc.nprocessors
55+
options = { s: false, n: cores }
56+
OptionParser.new do |opts|
57+
opts.banner = "Usage: ruby #{__FILE__} [options]"
58+
opts.on('-n [NUM]', OptionParser::DecimalInteger, "Number of cores to run (default #{cores})") do |val|
59+
options[:n] = if val.negative? || val > cores
60+
cores
61+
else
62+
val
63+
end
64+
end
65+
opts.on('-s', 'Sort results') { |v| options[:s] = v }
66+
end.parse!
67+
WordsExtractor.new(cores: options[:n], sorting: options[:s]).run
68+
end

0 commit comments

Comments
 (0)