updates

hipertracker · hipertracker · commit d904a9ecae12 · 2021-05-28T17:40:32.000+01:00
diff --git a/README.md b/README.md
@@ -8,8 +8,8 @@ Text source: 79.4MB in 30 files
 - Rust 1.51.0 with sorting: 7s, without sorting: 5s (no parallelism)
 - Go 1.16.4 (parallel) with sorting: 7.32s, without sorting: 4.06s
 - Python 3.9.5 with sorting: 10s, without sorting 8.32s (no multiprocessing)
+- Crystal 1.0.0 with sorting: 17s, without sorting: 7s (non-optimized sort, no parallelism)
 - Go 1.16.4 with sorting: 21s, without sorting: 11s (no parallelism)
-- Crystal 1.0.0 with sorting: 35s, without sorting: 7s (non optimized sort, no parallelism)
 - Elixir 1.12 (parallel) with sorting: 33s (without release build)
 
 
diff --git a/words_extractor_cr/src/fast_words_cr.cr b/words_extractor_cr/src/fast_words_cr.cr
@@ -19,17 +19,17 @@ module FastWordsCr
   def self.worker(path, outpath)
     text = File.read(path.gsub(".yml", ".txt")).gsub("\n", " ").downcase
 
-    # 35sec
-    # words_json = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_json.downcase
-    # words = Array(String).from_json(words_json).sort { |x, y| self.word_cmp(x, y) }
+    # 17sec
+    words_json = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_json.downcase
+    words = Array(String).from_json(words_json).sort { |x, y| self.word_cmp(x, y) }
 
-    # 35s
+    # 19s
     # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a.sort do |x, y|
     #   self.word_cmp(x, y)
     # end
 
     # 7s (no sort)
-    words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a
+    # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a
 
     meta = File.open(path) { |file| YAML.parse(file) }
     filepath = %Q(#{outpath}/słowa - #{meta["label"]}.txt)
@@ -45,8 +45,8 @@ module FastWordsCr
   end
 
   def self.word_cmp(str1 : String, str2 : String, charset = "aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż")
-    tokens1 = str1.downcase.split("")
-    tokens2 = str2.downcase.split("")
+    tokens1 = str1.downcase.chars
+    tokens2 = str2.downcase.chars
     tokens1.each_with_index do |s1, i|
       return 1 unless tokens2[i]?
       idx1 = charset.index(s1) || -1