Skip to content

Commit ab161b7

Browse files
author
Jaroslaw Zabiello
committed
add python example
1 parent eafd1b1 commit ab161b7

File tree

3 files changed

+102
-0
lines changed

3 files changed

+102
-0
lines changed

example-python/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
```
2+
python -m venv .venv
3+
source .venv/bin/activate
4+
pip install -U pip
5+
pip install -r requirements.txt
6+
```
7+
8+
```
9+
python words.py
10+
```
11+
12+
MacOS 12.2
13+
Python 3.10.2
14+
MBP 16" M1Max 10 cores
15+
Total files: 123
16+
Total size: 504 MB
17+
Total time: 2.9403 s

example-python/requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# PyICU>=2.7.4
2+
PyYAML
3+
mypy
4+
black
5+
ipython

example-python/words.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import glob
2+
from typing import Tuple
3+
import os
4+
5+
try:
6+
from icu import Collator, Locale
7+
8+
i18nsorting = True
9+
except ModuleNotFoundError:
10+
# Not supported by M1
11+
i18nsorting = False
12+
13+
import multiprocessing as mp
14+
import os
15+
import re
16+
import shutil
17+
import time
18+
import yaml
19+
20+
21+
def worker(path: str, outdir: str, sorting: bool = False) -> Tuple[str, int]:
    """Write the unique words of one text file into ``outdir``.

    ``path`` names a YAML metadata file; the text that is processed is the
    sibling file with the same name and a ``.txt`` extension. The text is
    lower-cased and split on runs of non-word characters and digits, and the
    distinct words are written one per line to
    ``<outdir>/<lang>-<code>.txt``, where ``lang`` and ``code`` are keys read
    from the metadata.

    Args:
        path: Path to the ``.yml`` metadata file.
        outdir: Directory that receives the word list; it is not created here.
        sorting: When true and the ``icu`` module was importable, the words
            are ordered with an ICU collator for the ``pl_PL.UTF-8`` locale.
            When true but ``icu`` is missing, a notice is printed and the
            words are written unsorted.

    Returns:
        ``(path, size)``, where ``size`` is the size in bytes of the text
        file that was read.
    """
    collator = None
    if sorting:
        if i18nsorting:
            collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
        else:
            # Only report the missing feature when it really is missing.
            print("I18nN sorting not available")

    # Raw string: "\W" and "\d" are regex classes, not string escapes.
    separator = re.compile(r"[\W\d]+")
    # Swap only the extension; str.replace would rewrite every ".yml" in the path.
    filepath = os.path.splitext(path)[0] + ".txt"
    filesize = os.path.getsize(filepath)
    with open(filepath, encoding="utf-8") as file:
        text = file.read().lower().rstrip()

    words = set(separator.split(text))
    # re.split yields "" when the text starts or ends with a separator.
    words.discard("")

    with open(path, encoding="utf-8") as file:
        meta = yaml.safe_load(file)

    target = os.path.join(outdir, f"{meta['lang']}-{meta['code']}.txt")
    with open(target, "w", encoding="utf-8") as file:
        if collator is not None:
            ordered = sorted(words, key=collator.getSortKey)
        else:
            # Unsorted output follows set iteration order, which is arbitrary.
            ordered = list(words)
        file.write("\n".join(ordered))
    return path, filesize
40+
41+
42+
if __name__ == "__main__":
    # Benchmark driver: run `worker` over every metadata file matched by
    # `src_path` in a process pool, then report file count, total input size
    # and elapsed wall-clock time.
    started = time.perf_counter()  # monotonic clock, suited to measuring durations

    outdir = "words"
    src_path = "../data/??/**/*.yml"

    # Start every run from an empty output directory.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)

    print("Processing")
    paths = glob.glob(src_path, recursive=True)
    if not paths:
        # Fail before any worker process is started.
        raise FileNotFoundError(f"WRONG PATH {src_path}")

    total_size = 0
    items_count = len(paths)
    # The context manager tears the pool down on exit, so every result is
    # collected inside the `with` block.
    with mp.Pool(mp.cpu_count()) as pool:
        results = [
            pool.apply_async(
                worker,
                kwds=dict(
                    path=path,
                    outdir=outdir,
                    sorting=False,
                ),
            )
            for path in paths
        ]
        for i, res in enumerate(results, start=1):
            # get() blocks until the task finishes and re-raises worker errors.
            path, size = res.get()
            total_size += size
            print(f"[{i}/{items_count}] {path}")

    print(f"Total files: {items_count}")
    print(f"Total size: {round((total_size / 1024 / 1024))} MB")
    elapsed = time.perf_counter() - started
    print(f"Total time: {elapsed:.4f} s")
80+

0 commit comments

Comments
 (0)