@@ -5,36 +5,32 @@ require "yaml"
55
66module FastWordsCr
77 VERSION = " 0.1.0"
8+ CHARSET = " aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż"
89
# Entry point: prepares the output folder, then calls `worker` once for every
# YAML file matched by the glob below, passing `outpath` through to it.
# In this hunk the `-` line is the previous call and the `+` lines are its
# replacement: a `with_sorting` flag (hard-coded to true) is now handed to
# `worker` as a third argument.
910 def self.main (outpath = " words" )
11+ with_sorting = true
# NOTE(review): prepare_folder's body is not visible in this hunk — presumably
# it clears files matching the pattern inside `outpath`; verify.
1012 prepare_folder(outpath, " *.txt" )
1113 Dir .glob(" ../data/pl/**/*.yml" ).each do |path |
# The spawn / Fiber.yield lines are commented out, so `worker` is invoked
# directly inside the loop, one file after another.
1214 # spawn do
13- worker(path, outpath)
15+ worker(path, outpath, with_sorting )
1416 # end
1517 end
1618 # Fiber.yield
1719 end
1820
# Extracts the distinct words of one text file and writes them, joined by a
# newline separator, to a file under `outpath` whose name is built from the
# "label" entry of the YAML document at `path`.
#
#   path         - path of the .yml metadata file; the text itself is read from
#                  the same path with ".yml" replaced by ".txt"
#   outpath      - folder part of the output file path
#   with_sorting - when truthy, the word set is converted to an array and sorted
#                  with `word_cmp`; otherwise the set is written unsorted
#
# Lines marked `-` are the previous implementation (JSON round-trip followed by
# an unconditional sort); lines marked `+` are its replacement.
19- def self.worker (path, outpath)
21+ def self.worker (path, outpath, with_sorting )
# Read the sibling .txt file, replace newlines and lowercase the whole text
# once, before it is split into words.
2022 text = File .read(path.gsub(" .yml" , " .txt" )).gsub(" \n " , " " ).downcase
2123
22- # 17sec
23- words_json = (text.split(/[^\p {L}] +/ ).to_set - Set {" " }).to_json.downcase
24- words = Array (String ).from_json(words_json).sort { |x , y | self .word_cmp(x, y) }
# Split on runs of non-letter characters and deduplicate via a set.
# NOTE(review): the removed version subtracted Set{""} from this result and the
# added version does not, so an empty token may now reach the output when the
# text starts with a non-letter — confirm whether that is intended.
24+ words = text.split(/[^\p {L}] +/ ).to_set
2525
26- # 19s
27- # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a.sort do |x, y|
28- # self.word_cmp(x, y)
29- # end
30-
31- # 7s (no sort)
32- # words = (text.split(/[^\p{L}]+/).to_set - Set{""}).to_a
# Sorting is optional now; the timing notes in the removed comments above
# record the sorted variants as slower than the unsorted one.
26+ if with_sorting
27+ words = words.to_a.sort {|x , y | self .word_cmp(x, y)}
28+ end
3329
# Parse the YAML metadata only to obtain the label used in the output file name.
3430 meta = File .open(path) { |file | YAML .parse(file) }
35- filepath = %Q( #{ outpath } /słowa - #{ meta[" label" ] } .txt)
36- puts filepath
31+ filepath = %Q( #{ outpath } /extracted-words-for-#{ meta[" label" ] } .txt)
3732 File .write(filepath, words.join(" \n " ))
# The path is reported only after the write call has returned.
33+ puts " Saved #{ filepath } "
3834 end
3935
4036 def self.prepare_folder (folder : String , pattern : String )
@@ -44,13 +40,12 @@ module FastWordsCr
4440 end
4541 end
4642
47- def self.word_cmp (str1 : String , str2 : String , charset = " aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż" )
48- tokens1 = str1.downcase.chars
49- tokens2 = str2.downcase.chars
50- tokens1.each_with_index do |s1 , i |
43+ def self.word_cmp (str1 : String , str2 : String )
44+ tokens2 = str2.chars
45+ str1.chars.each_with_index do |s1 , i |
5146 return 1 unless tokens2[i]?
52- idx1 = charset .index(s1) || -1
53- idx2 = charset .index(tokens2[i]) || -1
47+ idx1 = CHARSET .index(s1) || -1
48+ idx2 = CHARSET .index(tokens2[i]) || -1
5449 return 1 if idx1 > idx2
5550 return -1 if idx1 < idx2
5651 end
0 commit comments