|
1 | 1 | require 'yaml' |
2 | | -require 'yaml' |
3 | 2 | require 'parallel' |
4 | 3 | require 'etc' |
5 | 4 | require 'fileutils' |
| 5 | +require 'optparse' |
6 | 6 |
|
7 | | -outdir = 'words' |
8 | | - |
9 | | -FileUtils.rm_rf(outdir) |
10 | | -Dir.mkdir(outdir) |
| 7 | +class WordsExtractor |
# Builds an extractor and stores its configuration.
#
# @param cores [Integer] number of worker processes (defaults to the
#   processor count reported by Etc.nprocessors)
# @param sorting [Boolean] whether each word list is sorted before saving
# @param outdir [String] directory the word lists are written into
# @param source [String] glob pattern selecting the YAML metadata files
def initialize(cores: Etc.nprocessors, sorting: false, outdir: 'words', source: '../data/??/**/*.yml')
  @cores, @sorting = cores, sorting
  @outdir, @source = outdir, source
end
11 | 14 |
|
12 | | -t = Time.now |
# Resets the output directory: removes it together with everything inside
# it, then recreates it empty.
#
# mkdir_p is used instead of Dir.mkdir so that a nested @outdir (for
# example "build/words") works even when its parent directories do not
# exist yet.
def clear_output
  FileUtils.rm_rf(@outdir)
  FileUtils.mkdir_p(@outdir)
end
13 | 19 |
|
14 | | -sorted = false |
# Collects the distinct lowercase words found in a text file.
#
# Every line is split on whitespace; the first two fields and the last
# field are discarded (presumably non-text columns -- confirm against the
# data files) and the remaining text is split on runs of non-letter
# characters.
#
# @param filepath [String] path of the text file to read
# @return [Array<String>] unique words in order of first appearance
def get_words(filepath)
  words = File.foreach(filepath).flat_map do |line|
    fields = line.strip.downcase.split(' ')
    # The slice is nil when a line has fewer than two fields (e.g. a blank
    # line); Array() turns that into "no words" instead of a NoMethodError.
    Array(fields[2...-1]).join(' ').split(/[^\p{L}]+/)
  end
  # Text starting with a non-letter makes the split emit a leading "",
  # which is not a word.
  words.reject(&:empty?).uniq
end
15 | 25 |
|
16 | | -paths = Dir['../data/??/**/*.yml'] |
17 | | -count = paths.count |
# Writes one word list to "<outdir>/<lang>-<code>.txt", one word per line,
# and prints a progress line to stdout.
#
# @param words [Array<String>] words to write
# @param meta [Hash] metadata providing the 'lang' and 'code' keys used to
#   build the output file name
# @param yaml_path [String] path of the metadata file, shown in the progress line
# @param count [Integer] total number of files being processed
# @param i [Integer] zero-based index of the current file
def save_words(words:, meta:, yaml_path:, count:, i:)
  outpath = File.join(@outdir, "#{meta['lang']}-#{meta['code']}.txt")
  # i is zero-based; show a 1-based position so the counter ends at count/count.
  puts(format('[%3d/%d] %s/%s', i + 1, count, yaml_path, outpath))
  File.write(outpath, words.join("\n"))
end
18 | 31 |
|
19 | | -sizes = Parallel.map_with_index(paths, in_processes: Etc.nprocessors) do |yaml_path, i| |
20 | | - meta = YAML.load_file(yaml_path) |
21 | | - filepath = yaml_path.gsub('.yml', '.txt') |
22 | | - words = IO.read(filepath).downcase.strip.split(/[^\p{word}]+/).uniq |
23 | | - words = words.sort if sorted |
24 | | - outpath = "#{outdir}/#{meta['lang']}-#{meta['code']}.txt" |
25 | | - puts(format('[%3d/%d] %s/%s', i, count, yaml_path, outpath)) |
26 | | - File.write(outpath, words.join("\n")) |
27 | | - File.size(filepath) |
# Runs the whole extraction: resets the output directory, processes every
# metadata file matched by @source in @cores worker processes, then prints
# the total input size and the elapsed time.
#
# For each "*.yml" metadata file, the text file with the same name but a
# ".txt" extension is read, its words are extracted (and sorted when
# @sorting is set) and then saved.
def run
  print "Running using #{@cores} processes"
  print ' with sorting' if @sorting
  puts '...'
  clear_output
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  paths = Dir[@source]
  count = paths.count
  sizes = Parallel.map_with_index(paths, in_processes: @cores) do |yaml_path, i|
    meta = YAML.load_file(yaml_path)
    # Swap only the trailing extension; a plain gsub would also rewrite
    # ".yml" occurring inside a directory name.
    filepath = yaml_path.sub(/\.yml\z/, '.txt')
    words = get_words(filepath)
    words.sort! if @sorting
    save_words(words:, meta:, yaml_path:, count:, i:)
    # The block's value is the input size in bytes, summed below.
    File.size(filepath)
  end
  puts "Total size: #{(sizes.sum / 1024.0 / 1024).round} MB"
  puts "Total time: #{Process.clock_gettime(Process::CLOCK_MONOTONIC) - start} s"
end
28 | 51 | end |
29 | 52 |
|
30 | | -puts "Total size: #{(sizes.sum / 1024.0 / 1024).round} MB" |
31 | | -puts "Total time: #{Time.now - t} s" |
# Command-line entry point, executed only when this file is run directly.
#
# Options:
#   -n [NUM]  number of worker processes; a missing, negative or too large
#             value falls back to the machine's processor count
#   -s        sort each word list before saving
if __FILE__ == $PROGRAM_NAME
  cores = Etc.nprocessors
  options = { s: false, n: cores }
  OptionParser.new do |opts|
    opts.banner = "Usage: ruby #{__FILE__} [options]"
    opts.on('-n [NUM]', OptionParser::DecimalInteger, "Number of cores to run (default #{cores})") do |val|
      # NUM is optional, so a bare `-n` yields nil; safe navigation makes
      # that case fall back to the default instead of raising NoMethodError.
      options[:n] = val&.between?(0, cores) ? val : cores
    end
    opts.on('-s', 'Sort results') { |v| options[:s] = v }
  end.parse!
  WordsExtractor.new(cores: options[:n], sorting: options[:s]).run
end
0 commit comments