Skip to content

Commit 79a9278

Browse files
committed
fix Python example to ignore book references from the beginning of every line
1 parent 53c7d3c commit 79a9278

File tree

1 file changed

+9
-13
lines changed

1 file changed

+9
-13
lines changed

example-python/words.py

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -19,22 +19,18 @@
1919

2020

2121
def worker(path: str, outdir: str, sorting: bool = False) -> Tuple[str, int]:
22-
# if sorting:
23-
# if i18nsorting:
24-
# collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
25-
# print("I18nN sorting not available")
26-
27-
separator = re.compile("[\W\d]+")
22+
if sorting and i18nsorting:
23+
collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
24+
25+
separator = re.compile("[\W\d]+") # also ignore Strong numbers (Python has no \p{L} pattern)
2826
filepath = path.replace(".yml", ".txt")
2927
filesize = os.path.getsize(filepath)
3028
with open(filepath) as file:
31-
text = file.read().lower().rstrip()
32-
words = set(re.split(separator, text))
33-
try:
34-
words.remove('')
35-
except KeyError:
36-
pass
37-
words = list(words)
29+
words = []
30+
for line in file.readlines():
31+
_line = ' '.join(line.strip().lower().split(' ')[2:-1]) # without book reference
32+
words += [w for w in set(re.split(separator, _line)) if w and len(w) > 1]
33+
words = list(set(words))
3834
with open(path) as file:
3935
meta = yaml.safe_load(file)
4036
with open(f"{outdir}/{meta['lang']}-{meta['code']}.txt", "w") as file:

0 commit comments

Comments
 (0)