-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathwords.py
More file actions
80 lines (66 loc) · 2 KB
/
words.py
File metadata and controls
80 lines (66 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import glob
from typing import Tuple
import os
try:
from icu import Collator, Locale
i18nsorting = True
except ModuleNotFoundError:
# Not supported by M1
i18nsorting = False
import multiprocessing as mp
import os
import re
import shutil
import time
import yaml
def worker(path: str, outdir: str, sorting: bool = False) -> Tuple[str, int]:
    """Write the distinct words of one text file to ``outdir``.

    ``path`` names a ``.yml`` metadata file. The text itself is read from the
    sibling file with the same name and a ``.txt`` extension. The text is
    lower-cased and split on runs of non-word characters and digits; the
    distinct non-empty words are written one per line to
    ``{outdir}/{lang}-{code}.txt``, where ``lang`` and ``code`` are keys read
    from the metadata file.

    Args:
        path: Path to the ``.yml`` metadata file.
        outdir: Existing directory that receives the word list.
        sorting: When true, sort the words before writing. The ICU collator
            for ``pl_PL.UTF-8`` is used if the ``icu`` import succeeded;
            otherwise a notice is printed and the built-in code-point
            ordering is used instead.

    Returns:
        A ``(path, filesize)`` tuple, where ``filesize`` is the size in bytes
        of the ``.txt`` file that was read.
    """
    collator = None
    if sorting:
        if i18nsorting:
            collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
        else:
            # Only report the missing ICU support when it is actually missing.
            print("I18nN sorting not available")

    # Raw string: "\W" and "\d" are regex escapes, not string escapes.
    separator = re.compile(r"[\W\d]+")

    # Swap only the extension; str.replace would also rewrite any ".yml"
    # occurring earlier in the path (e.g. in a directory name).
    filepath = os.path.splitext(path)[0] + ".txt"
    filesize = os.path.getsize(filepath)

    # NOTE(review): files are opened with the platform default encoding;
    # confirm whether the data is UTF-8 and pass encoding="utf-8" if so.
    with open(filepath) as file:
        text = file.read().lower().rstrip()

    words = set(separator.split(text))
    # Splitting yields "" when the text starts or ends with a separator
    # (rstrip() only removes whitespace, not punctuation or digits).
    words.discard("")

    with open(path) as file:
        meta = yaml.safe_load(file)

    if sorting:
        if collator is not None:
            words = sorted(words, key=collator.getSortKey)
        else:
            words = sorted(words)

    with open(f"{outdir}/{meta['lang']}-{meta['code']}.txt", "w") as file:
        file.write("\n".join(words))
    return path, filesize
if __name__ == "__main__":
    # Script entry point: build one word list per metadata file found under
    # ``src_path``, fanning the work out over a process pool, then print a
    # per-file progress line and overall totals.
    t = time.time()
    outdir = "words"
    src_path = "../data/??/**/*.yml"

    # Resolve the inputs first, so that a wrong path fails before the
    # previous output directory is deleted and before any worker process
    # is started.
    paths = glob.glob(src_path, recursive=True)
    if not paths:
        raise Exception(f"WRONG PATH {src_path}")

    # Start from an empty output directory on every run.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)

    print("Processing")
    total_size = 0
    items_count = len(paths)

    # The context manager tears the pool down on exit, including when a
    # worker raises. All results are collected inside the block because
    # leaving it terminates the pool.
    with mp.Pool(mp.cpu_count()) as pool:
        results = [
            pool.apply_async(
                worker,
                kwds=dict(
                    path=path,
                    outdir=outdir,
                    sorting=False,
                ),
            )
            for path in paths
        ]
        for i, res in enumerate(results, start=1):
            # get() blocks until the task finishes and re-raises any
            # exception raised inside the worker.
            path, size = res.get()
            total_size += size
            print(f"[{i}/{items_count}] {path}")

    print(f"Total files: {items_count}")
    print(f"Total size: {round((total_size / 1024 / 1024))} MB")
    t = time.time() - t
    print(f"Total time: {t:.4f} s")