Skip to content

Commit 9793fde

Browse files
committed
update Rust example and fixing Golang result
1 parent 57e5cf1 commit 9793fde

File tree

2 files changed

+96
-67
lines changed

2 files changed

+96
-67
lines changed

README.md

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,39 +14,35 @@ The following results are for 123 unique utf-8 Bible text files in 23 languages
1414
* Machine: MacBook Pro 16" 64GB 2TB M1Max 10 cores.
1515

1616
<pre>
17-
1. Rust 1.58 = 0.38s (with sorting: 1.40s)
18-
2. Golang 1.17.6 = 0.61s (with sorting: 5.03s)
19-
3. Python 3.10.2 = 2.80s
17+
1. Golang 1.17.6 = 0.61s (with sorting: 5.03s)
18+
2. Rust 1.58 = 1.14s (with sorting: 1.62s) with tokio (previous: 1.34s, with sorting: 1.79s)
19+
3. Python 3.10.2 = 2.80s (with multiprocessing)
2020
4. Julia 1.7.1 = 4.522s
2121
5. Crystal 1.3.2 = 5.72s
2222
6. Elixir 1.13.2 = 7.82s
23-
7. Ruby 3.1.0 = 8.31s
23+
7. Ruby 3.1.0 = 8.31s (with Parallel)
2424
</pre>
2525

2626
### Conclusion
2727

28-
Rust is the fastest language beyond doubt. The new optimized Golang code version is very fast, slower than Rust but faster than other languages. Golang is the only language at the moment with full mature i18n support for arm64/M1 platform.
29-
30-
* Rust = the current example uses [lexical-sort](https://lib.rs/crates/lexical-sort) which is not perfect. [There is no standard mature implementation of i18n in Rust](https://www.arewewebyet.org/topics/i18n/) at the moment.
28+
The new optimized Golang code version is the fastest of all languages without sorting; with sorting it is slower than Rust. Golang is the only language at the moment with full, mature i18n support for the arm64/M1 platform.
3129

30+
* Rust = the current example uses [lexical-sort](https://lib.rs/crates/lexical-sort) which is not perfect. [There is no standard mature implementation of i18n in Rust](https://www.arewewebyet.org/topics/i18n/) at the moment.
3231

3332
* Python = has a great implementation of the [ICU](https://icu.unicode.org/related) library; however, it does not support the arm64/M1 platform, hence I couldn't use it in this comparison.
3433

35-
3634
* Ruby = same as Python, no ICU for M1.
3735

38-
3936
* Elixir = same as Python, no ICU for M1.
4037

41-
4238
* Julia = I couldn't find a good i18n library supporting many languages.
4339

44-
4540
* Crystal = currently supports only Turkish collations. Probably because the language is young and does not have a large enough community or company behind it.
4641

47-
4842
* Golang = has rules for many languages. You can see the influence of a large company and community which makes Golang a mature solution. Sorting slowed the whole task down significantly, but the result is correct (in this case I only checked the results for the Polish language)
4943

5044
### Kudos
5145

5246
[@romanatnews](https://github.com/romanatnews) (Golang example refactoring)
47+
48+
[@pan93412](https://github.com/pan93412) (Rust example refactoring using Tokio runtime)

example-rust/src/main.rs

Lines changed: 88 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,95 @@
1-
use crossbeam_utils::sync::WaitGroup;
21
use glob::glob;
2+
use itertools::Itertools;
33
use lexical_sort::{natural_lexical_cmp, StringSort};
4+
use once_cell::sync::Lazy;
45
use regex::Regex;
5-
use std::collections::HashSet;
6-
use std::fs;
7-
use std::thread;
8-
use time::Instant;
6+
use std::path::Path;
7+
use tokio::fs;
98
use yaml_rust::YamlLoader;
109

11-
fn main() -> std::io::Result<()> {
12-
let start = Instant::now();
13-
let with_sorting = false;
14-
let outdir = "words";
15-
fs::create_dir_all(outdir)?;
16-
let wg = WaitGroup::new();
17-
let path = "../data/??/**/*.yml";
18-
for entry in glob(path).expect("Failed to read glob pattern") {
19-
match entry {
20-
Ok(path) => {
21-
// let separator = Regex::new(r"[^\p{L}]+").unwrap();
22-
let separator = Regex::new(r"[\W\d]+").unwrap();
23-
let wg = wg.clone();
24-
thread::spawn(move || {
25-
let filepath = path.to_str().unwrap().replace(".yml", ".txt");
26-
// println!("{:?}", filepath);
27-
let text = fs::read_to_string(&filepath)
28-
.unwrap()
29-
.to_lowercase()
30-
.replace("\n", " ");
31-
let tokens: Vec<&str> = separator.split(&text).collect();
32-
let unique_tokens: HashSet<&str> = tokens.into_iter().collect();
33-
let mut words: Vec<&str>;
34-
if with_sorting {
35-
words = unique_tokens.into_iter().collect();
36-
words.string_sort_unstable(natural_lexical_cmp);
37-
} else {
38-
words = unique_tokens.into_iter().collect();
39-
}
40-
let yaml = fs::read_to_string(&path).unwrap();
41-
let docs = YamlLoader::load_from_str(&yaml).unwrap();
42-
let meta = &docs[0];
43-
let out = format!(
44-
"{}/{}-{}.txt",
45-
outdir,
46-
meta["lang"].as_str().unwrap(),
47-
meta["code"].as_str().unwrap()
48-
);
49-
if let Err(e) = fs::write(out, words.join("\n")) {
50-
println!("Writing error: {}", e.to_string());
51-
}
52-
drop(wg);
53-
});
54-
}
55-
Err(e) => println!("{:?}", e),
56-
}
10+
const SORT: bool = true;
11+
const OUTDIR: &str = "words";
12+
const FILE_DIR: &str = "../data/??/**/*.yml";
13+
static SEPARATOR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\W\d]+").unwrap());
14+
15+
async fn create_outdir() -> tokio::io::Result<()> {
16+
fs::create_dir_all(OUTDIR).await
17+
}
18+
19+
async fn read_file(path: &Path) -> String {
20+
let raw = fs::read_to_string(path).await.unwrap();
21+
raw.to_lowercase().replace('\n', " ")
22+
}
23+
24+
fn get_unique_token(src: &str) -> Vec<&str> {
25+
let mut data = SEPARATOR_REGEX.split(src).unique().collect::<Vec<_>>();
26+
27+
if SORT {
28+
data.string_sort_unstable(natural_lexical_cmp);
5729
}
58-
wg.wait();
59-
let end = Instant::now();
60-
println!("{:?} seconds.", end - start);
61-
Ok(())
30+
31+
data
32+
}
33+
34+
async fn get_filename_from_meta(path: &Path) -> anyhow::Result<String> {
35+
let yaml = fs::read_to_string(path).await?;
36+
let docs = YamlLoader::load_from_str(&yaml)?;
37+
let meta = &docs[0];
38+
39+
let lang = meta["lang"]
40+
.as_str()
41+
.ok_or_else(|| anyhow::anyhow!("code not found"))?;
42+
43+
let code = meta["code"]
44+
.as_str()
45+
.ok_or_else(|| anyhow::anyhow!("code not found"))?;
46+
47+
Ok(format!("{}/{}-{}.txt", OUTDIR, lang, code))
6248
}
49+
50+
#[tokio::main]
51+
async fn main() -> std::io::Result<()> {
52+
let start = std::time::Instant::now();
53+
let path = glob(FILE_DIR).expect("failed to read glob pattern");
54+
55+
let submissions = path.map(|entry| {
56+
tokio::spawn(async {
57+
let yaml_path = entry.expect("should be existed");
58+
let txt_path = yaml_path.with_extension("txt");
59+
60+
let outdir_submission =
61+
tokio::spawn(async { create_outdir().await.expect("unable to create outdir") });
62+
63+
let read_text_file_submission = tokio::spawn(async move {
64+
let data = read_file(&txt_path).await;
65+
let tokens = get_unique_token(&data);
66+
67+
tokens.join("\n")
68+
});
69+
70+
let filename_submission = tokio::spawn(async move {
71+
get_filename_from_meta(&yaml_path)
72+
.await
73+
.expect("should be existed")
74+
});
75+
76+
let (tokens, filename, _) = tokio::join!(
77+
read_text_file_submission,
78+
filename_submission,
79+
outdir_submission,
80+
);
81+
82+
fs::write(
83+
filename.expect("failed to run filename"),
84+
tokens.expect("failed to get tokens"),
85+
)
86+
.await
87+
.expect("failed to write");
88+
})
89+
});
90+
91+
futures::future::join_all(submissions).await;
92+
93+
println!("{:?}", start.elapsed());
94+
Ok(())
95+
}

0 commit comments

Comments
 (0)