Skip to content

Commit 2972255

Browse files
committed
improve Crystal code
1 parent d904a9e commit 2972255

File tree

1 file changed

+15
-20
lines changed

1 file changed

+15
-20
lines changed

words_extractor_cr/src/fast_words_cr.cr

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,36 +5,32 @@ require "yaml"
55

66
module FastWordsCr
77
VERSION = "0.1.0"
8+
CHARSET = "aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż"
89

910
def self.main(outpath = "words")
# Entry point: calls `prepare_folder` on the output folder, then runs `worker`
# once for every `.yml` file matched by the recursive glob under ../data/pl.
# outpath - folder the generated word lists are written into (default "words").
# Files are handled one after another inside the `each` loop; the `spawn` and
# `Fiber.yield` lines are commented out, so no fibers are started here.
# NOTE(review): `prepare_folder` is not fully visible in this view; presumably
# it clears previously generated "*.txt" files - confirm.
# NOTE(review): rows marked `-` / `+` are the removed / added sides of the
# commit; on the added side `with_sorting` is hard-coded to true and forwarded
# to `worker`.
11+
with_sorting = true
1012
prepare_folder(outpath, "*.txt")
1113
Dir.glob("../data/pl/**/*.yml").each do |path|
1214
# spawn do
13-
worker(path, outpath)
15+
worker(path, outpath, with_sorting)
1416
# end
1517
end
1618
# Fiber.yield
1719
end
1820

19-
def self.worker(path, outpath)
21+
def self.worker(path, outpath, with_sorting)
# Builds the word list for one data file and writes it to a text file.
# path         - path of a `.yml` metadata file; the text itself is read from
#                the same path with ".yml" replaced by ".txt".
# outpath      - folder the result file is written into.
# with_sorting - when truthy, the unique words are converted to an array and
#                sorted with `word_cmp`; otherwise the set is joined as-is.
2022
text = File.read(path.gsub(".yml", ".txt")).gsub("\n", " ").downcase
# Newlines are turned into spaces and the text is lowercased before splitting.
2123

22-
# 17sec
23-
words_json = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_json.downcase
24-
words = Array(String).from_json(words_json).sort { |x, y| self.word_cmp(x, y) }
24+
words = text.split(/[^\p{L}]+/).to_set
# Splits on runs of non-letter characters; the set drops duplicate words.
# NOTE(review): the removed version subtracted Set{""}; this line does not, so
# an empty string may end up in the output when the text starts or ends with
# a non-letter character - confirm against String#split behaviour.
2525

26-
# 19s
27-
# words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a.sort do |x, y|
28-
# self.word_cmp(x, y)
29-
# end
30-
31-
# 7s (no sort)
32-
# words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a
26+
if with_sorting
27+
words = words.to_a.sort {|x, y| self.word_cmp(x, y)}
28+
end
3329

3430
meta = File.open(path) { |file| YAML.parse(file) }
# The parsed YAML supplies the "label" value used in the output file name.
35-
filepath = %Q(#{outpath}/słowa - #{meta["label"]}.txt)
36-
puts filepath
31+
filepath = %Q(#{outpath}/extracted-words-for-#{meta["label"]}.txt)
3732
File.write(filepath, words.join("\n"))
# One word per line; the path is printed only after the write call.
33+
puts "Saved #{filepath}"
3834
end
3935

4036
def self.prepare_folder(folder : String, pattern : String)
@@ -44,13 +40,12 @@ module FastWordsCr
4440
end
4541
end
4642

47-
def self.word_cmp(str1 : String, str2 : String, charset = "aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż")
48-
tokens1 = str1.downcase.chars
49-
tokens2 = str2.downcase.chars
50-
tokens1.each_with_index do |s1, i|
43+
def self.word_cmp(str1 : String, str2 : String)
44+
tokens2 = str2.chars
45+
str1.chars.each_with_index do |s1, i|
5146
return 1 unless tokens2[i]?
52-
idx1 = charset.index(s1) || -1
53-
idx2 = charset.index(tokens2[i]) || -1
47+
idx1 = CHARSET.index(s1) || -1
48+
idx2 = CHARSET.index(tokens2[i]) || -1
5449
return 1 if idx1 > idx2
5550
return -1 if idx1 < idx2
5651
end

0 commit comments

Comments
 (0)