11module words_extractor_jl
22
3+ using ArgParse
34using Distributed
45using YAML
56using Glob
67
78const outdir = " words"
89
9- function worker (yaml_path, i, count)
"""
    parse_commandline()

Parse the program's command-line arguments.

Declares a single flag, `-s` ("Sort results"), stored as `true` when present,
and returns whatever `parse_args` produces for those settings. The caller in
this file reads the flag back with the key `"s"`.
"""
function parse_commandline()
    settings = ArgParseSettings()
    # `@add_arg_table!` is the non-deprecated spelling (ArgParse >= 1.0); it
    # mutates `settings` in place, hence the bang.
    @add_arg_table! settings begin
        "-s"
        help = "Sort results"
        action = :store_true
    end
    return parse_args(settings)
end
19+
"""
    worker(yaml_path, sorting, i, count)

Extract the words for one input file and write them, one per line, to the
output path returned by `get_filepath(yaml_path)`. Afterwards print a progress
line of the form `[  i/count] path`.
"""
function worker(yaml_path, sorting, i, count)
    target = get_filepath(yaml_path)
    contents = join(get_words(yaml_path, sorting), "\n")
    write(target, contents)
    # Right-align the running index in a 3-character column.
    progress = lpad(i, 3, ' ')
    println("[$progress/$count] $target")
end
1526
16- function get_words (yaml_path)
17- text_path = replace (yaml_path, " .yml" => " .txt" )
18- text = read (text_path, String) |> lowercase
19- split (text, r" [\W\d ]+" ) |> Set |> collect
"""
    get_words(yaml_path, sorting = false)

Collect the distinct words of the text file that sits next to `yaml_path`
(same path with `.yml` replaced by `.txt`).

For every line the first two space-separated fields (the leading book
reference) are dropped; the remainder is lowercased and split on runs of
non-word characters and digits. Tokens of length 1 or less are discarded.

# Returns
- a sorted `Vector{String}` when `sorting` is `true`;
- a `Set{String}` otherwise.
"""
function get_words(yaml_path, sorting = false)
    text_path = replace(yaml_path, ".yml" => ".txt")
    # Deduplicate while reading instead of buffering every token in an
    # untyped vector first.
    unique_words = Set{String}()
    open(text_path) do file
        # `eachline` streams the file rather than loading all lines at once.
        for line in eachline(file)
            # Exclude the beginning book reference from the line: with
            # `limit = 3` the third field is everything after the second space.
            fields = split(line, " "; limit = 3)
            length(fields) < 3 && continue
            for token in split(lowercase(fields[3]), r"[\W\d]+")
                length(token) > 1 && push!(unique_words, token)
            end
        end
    end
    return sorting ? sort!(collect(unique_words)) : unique_words
end
2148
2249function get_filepath (path)
2653
"""
    rdir(dir::AbstractString, pat::Glob.FilenameMatch)
    rdir(dir::AbstractString, pat::AbstractString)

Walk `dir` recursively and return a `Vector{String}` with the joined path of
every file for which `occursin(pat, path)` holds. A string pattern is first
wrapped in `Glob.FilenameMatch`.
"""
function rdir(dir::AbstractString, pat::Glob.FilenameMatch)
    matches = String[]
    for (root, _, files) in walkdir(dir)
        for file in files
            candidate = joinpath(root, file)
            # The pattern is tested against the full joined path, not the bare name.
            occursin(pat, candidate) && push!(matches, candidate)
        end
    end
    return matches
end

function rdir(dir::AbstractString, pat::AbstractString)
    return rdir(dir, Glob.FilenameMatch(pat))
end
3764
3865function main ()
66+ parsed_args = parse_commandline ()
67+ sorting = parsed_args[" s" ]
68+
69+ addprocs ()
70+ println (string (" Workers " , nworkers ()))
71+ println (string (" Processing... using " , Threads. nthreads (), " threads" ))
72+ if sorting
73+ println (" with sorting" )
74+ end
3975 if ispath (outdir)
4076 rm (outdir, recursive = true )
4177 end
@@ -45,13 +81,10 @@ function main()
4581 i = 1
4682 Threads. @threads for path in paths
4783 # println("Spawn $path")
48- worker (path, i, count)
84+ worker (path, sorting, i, count)
4985 i += 1
5086 end
5187end
5288
53- addprocs ()
54- println (string (" Workers " , nworkers ()))
55- println (string (" Processing... using " , Threads. nthreads (), " threads" ))
5689@time main ()
5790end # module
0 commit comments