Skip to content

Commit 8898e8a

Browse files
committed
add new code for Python and Go on M1
1 parent 0f7ab23 commit 8898e8a

File tree

7 files changed

+100
-122
lines changed

7 files changed

+100
-122
lines changed

words_extractor_go/main.go

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,14 @@ import (
1515
"github.com/tidwall/collate"
1616
)
1717

18+
// Pair is the result that processFile sends back for one processed input.
type Pair struct {
	// Path is the path of the YAML metadata file that was processed.
	Path string
	// Size is the size in bytes of the text file that accompanies Path.
	Size int64
}
22+
1823
func main() {
19-
var wg sync.WaitGroup
24+
// var wg sync.WaitGroup
25+
queue := make(chan Pair)
2026

2127
t := time.Now()
2228
defer timeTrack(t)
@@ -27,12 +33,43 @@ func main() {
2733

2834
fmt.Println("Parsing...")
2935

30-
paths, _ := doublestar.Glob("../data/pl/**/*.yml")
36+
paths, _ := doublestar.Glob("../data/**/*.yml")
37+
38+
total_size := int64(0)
39+
items_count := len(paths)
3140
for i, path := range paths {
32-
wg.Add(1)
33-
go worker(i, &wg, path, outdir, true)
41+
go processFile(queue, outdir, path, false)
42+
res := <-queue
43+
total_size += res.Size
44+
fmt.Printf("[%d/%d] %s\n", i+1, items_count, res.Path)
45+
}
46+
fmt.Printf("Total items: %d\n", items_count)
47+
fmt.Printf("Total size: %d MB\n", total_size/(1024*1024))
48+
}
49+
50+
func processFile(queue chan Pair, outdir string, path string, sorting bool) {
51+
meta := GetYAML(path)
52+
// load text file
53+
filepath := strings.Replace(path, ".yml", ".txt", -1)
54+
info, err := os.Stat(filepath)
55+
if err != nil {
56+
panic(err)
57+
}
58+
content, err := ioutil.ReadFile(filepath)
59+
if err != nil {
60+
panic(err)
61+
}
62+
// extract and sort unique words
63+
words := extractUniqueWords(content)
64+
if sorting {
65+
words = sortWords(words, "POLISH_CI")
66+
}
67+
text := strings.Join(words, "\n")
68+
outpath := fmt.Sprintf("%s/%s-%s.txt", outdir, meta.Lang, meta.Code)
69+
for err := ioutil.WriteFile(outpath, []byte(text), 0644); err != nil; {
70+
panic(err)
3471
}
35-
wg.Wait()
72+
queue <- Pair{path, info.Size()}
3673
}
3774

3875
func worker(id int, wg *sync.WaitGroup, path, outdir string, verbose bool) {

words_extractor_go/main_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ func Test_sortWords(t *testing.T) {
1919
given := sortWords(words, "POLISH_CI")
2020
assert.Equal(t, expected, given, "words should be sorted out using Polish grammar rules")
2121
}
22+
23+
// Total items: 123
24+
// Total size: 503 MB
25+
// Total timing: 36.606038042s

words_extractor_py/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
/words/
22
/.DS_Store
33
/venv/
4+
/.idea/
5+
/.venv/
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
PyICU==2.7.3
2-
PyYAML==5.4.1
1+
#PyICU>=2.7.4
2+
PyYAML>=5.4.1

words_extractor_py/words.py

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,42 @@
11
import glob
2-
from icu import Collator, Locale
2+
from typing import Tuple
3+
import os
4+
5+
try:
6+
from icu import Collator, Locale
7+
8+
i18nsorting = True
9+
except ModuleNotFoundError:
10+
i18nsorting = False
11+
12+
import multiprocessing as mp
313
import os
414
import re
515
import shutil
616
import time
717
import yaml
818

919

10-
def worker(path, collator, separator, outdir, with_sorting):
20+
def worker(path: str, outdir: str, sorting: bool = False) -> Tuple[str, int]:
    """Extract the unique words of the text file that accompanies ``path``.

    The text file is looked up next to the YAML metadata file, with the
    ``.yml`` extension replaced by ``.txt``. Its lower-cased content is split
    on runs of non-word characters and digits, and the unique pieces are
    written, one per line, to ``{outdir}/{lang}-{code}.txt`` where ``lang``
    and ``code`` are read from the metadata.

    Args:
        path: Path of the ``.yml`` metadata file.
        outdir: Directory the word list is written to.
        sorting: When true and PyICU is importable, sort the words with the
            ``pl_PL.UTF-8`` collator; when PyICU is missing a notice is
            printed and the words are written unsorted.

    Returns:
        A ``(path, filesize)`` tuple, where ``filesize`` is the size in bytes
        of the text file.
    """
    collator = None
    if sorting:
        if i18nsorting:
            collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
        else:
            # Only report the missing collator when it really is missing.
            print("I18n sorting not available")

    # Raw string: "\W" and "\d" are regex escapes, not string escapes.
    separator = re.compile(r"[\W\d]+")

    # Swap only the trailing extension, never a ".yml" inside a directory name.
    filepath = os.path.splitext(path)[0] + ".txt"
    filesize = os.path.getsize(filepath)
    with open(filepath, encoding="utf-8") as file:
        text = file.read().lower().rstrip()
    words = set(separator.split(text))

    with open(path, encoding="utf-8") as file:
        meta = yaml.safe_load(file)

    outpath = f"{outdir}/{meta['lang']}-{meta['code']}.txt"
    with open(outpath, "w", encoding="utf-8") as file:
        if collator is not None:
            words = sorted(words, key=collator.getSortKey)
        file.write("\n".join(words))
    return path, filesize
2240

2341

2442
if __name__ == "__main__":
@@ -29,16 +47,33 @@ def worker(path, collator, separator, outdir, with_sorting):
2947
shutil.rmtree(outdir)
3048
os.makedirs(outdir)
3149

32-
collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
33-
separator = re.compile("[\W\d]+")
50+
pool = mp.Pool(mp.cpu_count())
51+
3452
print("Processing")
35-
for path in glob.glob("../data/pl/**/*.yml", recursive=True):
36-
worker(
37-
path=path,
38-
collator=collator,
39-
separator=separator,
40-
outdir=outdir,
41-
with_sorting=True,
53+
results = []
54+
for path in glob.glob("../data/**/*.yml", recursive=True):
55+
res = pool.apply_async(
56+
worker,
57+
kwds=dict(
58+
path=path,
59+
outdir=outdir,
60+
sorting=False,
61+
),
4262
)
63+
results.append(res)
64+
total_size = 0
65+
items_count = len(results)
66+
for i, res in enumerate(results):
67+
path, size = res.get()
68+
total_size += size
69+
print(f"[{i+1}/{items_count}] {path}")
70+
print(f"Total files: {len(results)}")
71+
print(f"Total size: {round((total_size / 1024 / 1024))} MB")
72+
t = time.time() - t
73+
print(f"Total time: {t:.4f} s")
4374

44-
print("Total timing: ", time.time() - t)
75+
"""
76+
Total files: 123
77+
Total size: 504 MB
78+
Total time: 5.1153 s
79+
"""

words_extractor_py/words_parallel.py

Lines changed: 0 additions & 52 deletions
This file was deleted.

words_extractor_py/words_parallel2.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

0 commit comments

Comments
 (0)