@@ -19,17 +19,17 @@ module FastWordsCr
1919 def self.worker (path, outpath)
2020 text = File .read(path.gsub(" .yml" , " .txt" )).gsub(" \n " , " " ).downcase
2121
22- # 35sec
23- # words_json = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_json.downcase
24- # words = Array(String).from_json(words_json).sort { |x, y| self.word_cmp(x, y) }
22+ # 17sec
23+ words_json = (text.split(/[^\p {L}] +/ ).to_set - Set {" " }).to_json.downcase
24+ words = Array (String ).from_json(words_json).sort { |x , y | self .word_cmp(x, y) }
2525
26- # 35s
26+ # 19s
2727 # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a.sort do |x, y|
2828 # self.word_cmp(x, y)
2929 # end
3030
3131 # 7s (no sort)
32- words = (text.split(/[^\p {L}] +/ ).to_set - Set {" " }).to_a
32+ # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a
3333
3434 meta = File .open(path) { |file | YAML .parse(file) }
3535 filepath = %Q( #{ outpath } /słowa - #{ meta[" label" ] } .txt)
@@ -45,8 +45,8 @@ module FastWordsCr
4545 end
4646
4747 def self.word_cmp (str1 : String , str2 : String , charset = " aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż" )
48- tokens1 = str1.downcase.split( " " )
49- tokens2 = str2.downcase.split( " " )
48+ tokens1 = str1.downcase.chars
49+ tokens2 = str2.downcase.chars
5050 tokens1.each_with_index do |s1 , i |
5151 return 1 unless tokens2[i]?
5252 idx1 = charset.index(s1) || -1
0 commit comments