|
| 1 | +use crossbeam_utils::sync::WaitGroup; |
1 | 2 | use glob::glob; |
2 | | -use itertools::Itertools; |
3 | 3 | use lexical_sort::{natural_lexical_cmp, StringSort}; |
4 | | -use once_cell::sync::Lazy; |
5 | 4 | use regex::Regex; |
6 | | -use std::path::Path; |
7 | | -use tokio::fs; |
| 5 | +use std::collections::HashSet; |
| 6 | +use std::fs; |
| 7 | +use std::thread; |
| 8 | +use time::Instant; |
8 | 9 | use yaml_rust::YamlLoader; |
9 | 10 |
|
10 | | -const SORT: bool = false; |
11 | | -const OUTDIR: &str = "words_new"; |
12 | | -const FILE_DIR: &str = "../data/??/**/*.yml"; |
13 | | -static SEPARATOR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\W\d]+").unwrap()); |
14 | | - |
15 | | -async fn create_outdir() -> tokio::io::Result<()> { |
16 | | - fs::create_dir_all(OUTDIR).await |
17 | | -} |
18 | | - |
19 | | -async fn read_file(path: &Path) -> String { |
20 | | - let raw = fs::read_to_string(path).await.unwrap(); |
21 | | - raw.to_lowercase().replace('\n', " ") |
22 | | -} |
23 | | - |
24 | | -fn get_unique_token(src: &str) -> Vec<&str> { |
25 | | - let mut data = SEPARATOR_REGEX.split(src).unique().collect::<Vec<_>>(); |
26 | | - |
27 | | - if SORT { |
28 | | - data.string_sort_unstable(natural_lexical_cmp); |
| 11 | +fn main() -> std::io::Result<()> { |
| 12 | + let start = Instant::now(); |
| 13 | + let with_sorting = false; |
| 14 | + let outdir = "words"; |
| 15 | + fs::create_dir_all(outdir)?; |
| 16 | + let wg = WaitGroup::new(); |
| 17 | + let path = "../data/??/**/*.yml"; |
| 18 | + for entry in glob(path).expect("Failed to read glob pattern") { |
| 19 | + match entry { |
| 20 | + Ok(path) => { |
| 21 | + // let separator = Regex::new(r"[^\p{L}]+").unwrap(); |
| 22 | + let separator = Regex::new(r"[\W\d]+").unwrap(); |
| 23 | + let wg = wg.clone(); |
| 24 | + thread::spawn(move || { |
| 25 | + let filepath = path.to_str().unwrap().replace(".yml", ".txt"); |
| 26 | + // println!("{:?}", filepath); |
| 27 | + let text = fs::read_to_string(&filepath) |
| 28 | + .unwrap() |
| 29 | + .to_lowercase() |
| 30 | + .replace("\n", " "); |
| 31 | + let tokens: Vec<&str> = separator.split(&text).collect(); |
| 32 | + let unique_tokens: HashSet<&str> = tokens.into_iter().collect(); |
| 33 | + let mut words: Vec<&str>; |
| 34 | + if with_sorting { |
| 35 | + words = unique_tokens.into_iter().collect(); |
| 36 | + words.string_sort_unstable(natural_lexical_cmp); |
| 37 | + } else { |
| 38 | + words = unique_tokens.into_iter().collect(); |
| 39 | + } |
| 40 | + let yaml = fs::read_to_string(&path).unwrap(); |
| 41 | + let docs = YamlLoader::load_from_str(&yaml).unwrap(); |
| 42 | + let meta = &docs[0]; |
| 43 | + let out = format!( |
| 44 | + "{}/{}-{}.txt", |
| 45 | + outdir, |
| 46 | + meta["lang"].as_str().unwrap(), |
| 47 | + meta["code"].as_str().unwrap() |
| 48 | + ); |
| 49 | + if let Err(e) = fs::write(out, words.join("\n")) { |
| 50 | + println!("Writing error: {}", e.to_string()); |
| 51 | + } |
| 52 | + drop(wg); |
| 53 | + }); |
| 54 | + } |
| 55 | + Err(e) => println!("{:?}", e), |
| 56 | + } |
29 | 57 | } |
30 | | - |
31 | | - data |
32 | | -} |
33 | | - |
34 | | -async fn get_filename_from_meta(path: &Path) -> anyhow::Result<String> { |
35 | | - let yaml = fs::read_to_string(path).await?; |
36 | | - let docs = YamlLoader::load_from_str(&yaml)?; |
37 | | - let meta = &docs[0]; |
38 | | - |
39 | | - let label = meta["label"] |
40 | | - .as_str() |
41 | | - .ok_or_else(|| anyhow::anyhow!("label not found"))?; |
42 | | - |
43 | | - Ok(format!("{}/extracted-words-for-{}.txt", OUTDIR, label)) |
44 | | -} |
45 | | - |
46 | | -#[tokio::main] |
47 | | -async fn main() -> std::io::Result<()> { |
48 | | - let start = std::time::Instant::now(); |
49 | | - let path = glob(FILE_DIR).expect("failed to read glob pattern"); |
50 | | - |
51 | | - let submissions = path.map(|entry| { |
52 | | - tokio::spawn(async { |
53 | | - let yaml_path = entry.expect("should be existed"); |
54 | | - let txt_path = yaml_path.with_extension("txt"); |
55 | | - |
56 | | - let outdir_submission = |
57 | | - tokio::spawn(async { create_outdir().await.expect("unable to create outdir") }); |
58 | | - |
59 | | - let read_text_file_submission = tokio::spawn(async move { |
60 | | - let data = read_file(&txt_path).await; |
61 | | - let tokens = get_unique_token(&data); |
62 | | - |
63 | | - tokens.join("\n") |
64 | | - }); |
65 | | - |
66 | | - let filename_submission = tokio::spawn(async move { |
67 | | - get_filename_from_meta(&yaml_path) |
68 | | - .await |
69 | | - .expect("should be existed") |
70 | | - }); |
71 | | - |
72 | | - let (tokens, filename, _) = tokio::join!( |
73 | | - read_text_file_submission, |
74 | | - filename_submission, |
75 | | - outdir_submission, |
76 | | - ); |
77 | | - |
78 | | - fs::write( |
79 | | - filename.expect("failed to run filename"), |
80 | | - tokens.expect("failed to get tokens"), |
81 | | - ) |
82 | | - .await |
83 | | - .expect("failed to write"); |
84 | | - }) |
85 | | - }); |
86 | | - |
87 | | - futures::future::join_all(submissions).await; |
88 | | - |
89 | | - println!("{:?}", start.elapsed()); |
| 58 | + wg.wait(); |
| 59 | + let end = Instant::now(); |
| 60 | + println!("{:?} seconds.", end - start); |
90 | 61 | Ok(()) |
91 | 62 | } |
0 commit comments