33require 'etc'
44require 'fileutils'
55require 'optparse'
6+ require 'uri'
7+ require 'ffi-icu'
68
79class WordsExtractor
8- def initialize ( cores : Etc . nprocessors , sorting : false , outdir : 'words' , source : '../data/??/**/*.yml' )
10+ def initialize ( outdir : , source : , cores : Etc . nprocessors , sorting : false )
911 @cores = cores
1012 @sorting = sorting
1113 @outdir = outdir
@@ -17,6 +19,35 @@ def clear_output
1719 Dir . mkdir ( @outdir )
1820 end
1921
# Maps a two-letter language code to the locale identifier used to build
# the collator that sorts that language's words.
#
# @param lang [String, nil] language code, e.g. 'de'
# @return [String, nil] the mapped locale identifier; for a code that is not
#   in the table, the code itself; nil only when +lang+ is nil
def get_collation(lang)
  locales = {
    'ar' => 'ar_SA', # Arabic, Saudi Arabia
    'cs' => 'cs_CZ', # Czech, Czech Republic
    'da' => 'da_DK', # Danish, Denmark
    'de' => 'de_DE', # German, Germany
    'el' => 'el_GR', # Greek, Greece
    'en' => 'en_EN', # English
    'eo' => 'eo',    # Esperanto, not country-specific
    'es' => 'es_ES', # Spanish, Spain
    'fi' => 'fi_FI', # Finnish, Finland
    'fr' => 'fr_FR', # French, France
    'he' => 'he_IL', # Hebrew, Israel
    'hr' => 'hr_HR', # Croatian, Croatia
    'hu' => 'hu_HU', # Hungarian, Hungary
    'it' => 'it_IT', # Italian, Italy
    'lt' => 'lt_LT', # Lithuanian, Lithuania
    'la' => 'en_EN', # Latin locale is the same as English
    'nl' => 'nl_NL', # Dutch, Netherlands
    'pl' => 'pl_PL', # Polish, Poland
    'pt' => 'pt_PT', # Portuguese, Portugal
    'ru' => 'ru_RU', # Russian, Russia
    'sk' => 'sk_SK', # Slovak, Slovakia
    'sv' => 'sv_SE', # Swedish, Sweden
    'uk' => 'uk_UA'  # Ukrainian, Ukraine
  }

  # An unmapped language used to produce nil, which the collator would
  # presumably treat as "use the process default locale" (verify against the
  # ICU ucol_open docs). Falling back to the bare language code, as the table
  # already does for Esperanto, keeps sorting tied to the text's language.
  locales.fetch(lang, lang)
end
50+
2051 def get_words ( filepath )
2152 IO . readlines ( filepath ) . map do |line |
2253 line . strip . downcase . split ( ' ' ) [ 2 ...-1 ] . join ( ' ' ) . split ( /[^\p {L}]/ ) . uniq . select { |s | s . size > 1 }
@@ -34,15 +65,21 @@ def run
3465 print ' with sorting' if @sorting
3566 puts '...'
3667 clear_output
68+
3769 start = Time . now
3870 paths = Dir [ @source ]
3971 count = paths . count
72+
4073 sizes = Parallel . map_with_index ( paths , in_processes : @cores ) do |yaml_path , i |
4174 meta = YAML . load_file ( yaml_path )
4275 filepath = yaml_path . gsub ( '.yml' , '.txt' )
4376 words = get_words ( filepath )
44- words . sort! if @sorting
77+ if @sorting
78+ collator = ICU ::Collation ::Collator . new ( get_collation ( meta [ 'lang' ] ) )
79+ words = words . sort { |a , b | collator . compare ( a , b ) }
80+ end
4581 save_words ( words :, meta :, yaml_path :, count :, i :)
82+
4683 File . size ( filepath )
4784 end
4885 puts "Total size: #{ ( sizes . sum / 1024.0 / 1024 ) . round } MB"
@@ -64,5 +101,10 @@ def run
64101 end
65102 opts . on ( '-s' , 'Sort results' ) { |v | options [ :s ] = v }
66103 end . parse!
67- WordsExtractor . new ( cores : options [ :n ] , sorting : options [ :s ] ) . run
104+ WordsExtractor . new (
105+ cores : options [ :n ] ,
106+ sorting : options [ :s ] ,
107+ outdir : 'words' ,
108+ source : '../data/??/**/*.yml'
109+ ) . run
68110end
0 commit comments