Skip to content

Commit 881e14d

Browse files
committed
add Julia example
1 parent 2067fe1 commit 881e14d

File tree

4 files changed

+339
-0
lines changed

4 files changed

+339
-0
lines changed

words_extractor_jl/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/words/
2+
/.ipynb_checkpoints/
3+
/.history/

words_extractor_jl/Manifest.toml

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
# This file is machine-generated - editing it directly is not advised
2+
3+
[[ArgTools]]
4+
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
5+
6+
[[Artifacts]]
7+
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
8+
9+
[[Base64]]
10+
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
11+
12+
[[CSTParser]]
13+
deps = ["Tokenize"]
14+
git-tree-sha1 = "980055cab361b4bb77a3a158cec60375d0292a1a"
15+
uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
16+
version = "3.2.2"
17+
18+
[[CommonMark]]
19+
deps = ["Crayons", "JSON", "URIs"]
20+
git-tree-sha1 = "7632afc57f92720a01d9aedf23f413f4e5e21015"
21+
uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6"
22+
version = "0.8.1"
23+
24+
[[Compat]]
25+
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
26+
git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab"
27+
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
28+
version = "3.30.0"
29+
30+
[[Conda]]
31+
deps = ["JSON", "VersionParsing"]
32+
git-tree-sha1 = "299304989a5e6473d985212c28928899c74e9421"
33+
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
34+
version = "1.5.2"
35+
36+
[[Crayons]]
37+
git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d"
38+
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
39+
version = "4.0.4"
40+
41+
[[DataStructures]]
42+
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
43+
git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677"
44+
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
45+
version = "0.18.9"
46+
47+
[[Dates]]
48+
deps = ["Printf"]
49+
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
50+
51+
[[DelimitedFiles]]
52+
deps = ["Mmap"]
53+
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
54+
55+
[[Distributed]]
56+
deps = ["Random", "Serialization", "Sockets"]
57+
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
58+
59+
[[Downloads]]
60+
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
61+
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
62+
63+
[[FileWatching]]
64+
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
65+
66+
[[IJulia]]
67+
deps = ["Base64", "Conda", "Dates", "InteractiveUtils", "JSON", "Libdl", "Markdown", "MbedTLS", "Pkg", "Printf", "REPL", "Random", "SoftGlobalScope", "Test", "UUIDs", "ZMQ"]
68+
git-tree-sha1 = "d8b9c31196e1dd92181cd0f5760ca2d2ffb4ac0f"
69+
uuid = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
70+
version = "1.23.2"
71+
72+
[[InteractiveUtils]]
73+
deps = ["Markdown"]
74+
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
75+
76+
[[JLLWrappers]]
77+
deps = ["Preferences"]
78+
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
79+
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
80+
version = "1.3.0"
81+
82+
[[JSON]]
83+
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
84+
git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4"
85+
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
86+
version = "0.21.1"
87+
88+
[[JuliaFormatter]]
89+
deps = ["CSTParser", "CommonMark", "DataStructures", "Pkg", "Tokenize"]
90+
git-tree-sha1 = "b2b188c074dffa5fd86ac6f46c77364ac4d3863c"
91+
uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
92+
version = "0.14.4"
93+
94+
[[LibCURL]]
95+
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
96+
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
97+
98+
[[LibCURL_jll]]
99+
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
100+
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
101+
102+
[[LibGit2]]
103+
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
104+
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
105+
106+
[[LibSSH2_jll]]
107+
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
108+
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
109+
110+
[[Libdl]]
111+
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
112+
113+
[[LinearAlgebra]]
114+
deps = ["Libdl"]
115+
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
116+
117+
[[Logging]]
118+
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
119+
120+
[[Markdown]]
121+
deps = ["Base64"]
122+
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
123+
124+
[[MbedTLS]]
125+
deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
126+
git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
127+
uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
128+
version = "1.0.3"
129+
130+
[[MbedTLS_jll]]
131+
deps = ["Artifacts", "Libdl"]
132+
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
133+
134+
[[Mmap]]
135+
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
136+
137+
[[MozillaCACerts_jll]]
138+
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
139+
140+
[[NetworkOptions]]
141+
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
142+
143+
[[OrderedCollections]]
144+
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
145+
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
146+
version = "1.4.1"
147+
148+
[[Parsers]]
149+
deps = ["Dates"]
150+
git-tree-sha1 = "c8abc88faa3f7a3950832ac5d6e690881590d6dc"
151+
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
152+
version = "1.1.0"
153+
154+
[[Pkg]]
155+
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
156+
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
157+
158+
[[Preferences]]
159+
deps = ["TOML"]
160+
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
161+
uuid = "21216c6a-2e73-6563-6e65-726566657250"
162+
version = "1.2.2"
163+
164+
[[Printf]]
165+
deps = ["Unicode"]
166+
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
167+
168+
[[REPL]]
169+
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
170+
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
171+
172+
[[Random]]
173+
deps = ["Serialization"]
174+
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
175+
176+
[[SHA]]
177+
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
178+
179+
[[Serialization]]
180+
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
181+
182+
[[SharedArrays]]
183+
deps = ["Distributed", "Mmap", "Random", "Serialization"]
184+
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
185+
186+
[[Sockets]]
187+
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
188+
189+
[[SoftGlobalScope]]
190+
deps = ["REPL"]
191+
git-tree-sha1 = "986ec2b6162ccb95de5892ed17832f95badf770c"
192+
uuid = "b85f4697-e234-5449-a836-ec8e2f98b302"
193+
version = "1.1.0"
194+
195+
[[SparseArrays]]
196+
deps = ["LinearAlgebra", "Random"]
197+
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
198+
199+
[[Statistics]]
200+
deps = ["LinearAlgebra", "SparseArrays"]
201+
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
202+
203+
[[TOML]]
204+
deps = ["Dates"]
205+
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
206+
207+
[[Tar]]
208+
deps = ["ArgTools", "SHA"]
209+
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
210+
211+
[[Test]]
212+
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
213+
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
214+
215+
[[Tokenize]]
216+
git-tree-sha1 = "15318136d8b7a91a0e49916ec931cc51d5456ab2"
217+
uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
218+
version = "0.5.16"
219+
220+
[[URIs]]
221+
git-tree-sha1 = "97bbe755a53fe859669cd907f2d96aee8d2c1355"
222+
uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
223+
version = "1.3.0"
224+
225+
[[UUIDs]]
226+
deps = ["Random", "SHA"]
227+
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
228+
229+
[[Unicode]]
230+
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
231+
232+
[[VersionParsing]]
233+
git-tree-sha1 = "80229be1f670524750d905f8fc8148e5a8c4537f"
234+
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
235+
version = "1.2.0"
236+
237+
[[YAML]]
238+
deps = ["Base64", "Dates", "Printf"]
239+
git-tree-sha1 = "78c02bd295bbd0ca330f95e07ccdfcb69f6cbcd4"
240+
uuid = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
241+
version = "0.4.6"
242+
243+
[[ZMQ]]
244+
deps = ["FileWatching", "Sockets", "ZeroMQ_jll"]
245+
git-tree-sha1 = "fc68e8a3719166950a0f3e390a14c7302c48f8de"
246+
uuid = "c2297ded-f4af-51ae-bb23-16f91089e4e1"
247+
version = "1.2.1"
248+
249+
[[ZeroMQ_jll]]
250+
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "libsodium_jll"]
251+
git-tree-sha1 = "74a74a3896b63980734cc876da8a103454559fe8"
252+
uuid = "8f1865be-045e-5c20-9c9f-bfbfb0764568"
253+
version = "4.3.2+6"
254+
255+
[[Zlib_jll]]
256+
deps = ["Libdl"]
257+
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
258+
259+
[[libsodium_jll]]
260+
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
261+
git-tree-sha1 = "848ab3d00fe39d6fbc2a8641048f8f272af1c51e"
262+
uuid = "a9144af2-ca23-56d9-984f-0d03f7b5ccf8"
263+
version = "1.0.20+0"
264+
265+
[[nghttp2_jll]]
266+
deps = ["Artifacts", "Libdl"]
267+
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
268+
269+
[[p7zip_jll]]
270+
deps = ["Artifacts", "Libdl"]
271+
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

words_extractor_jl/Project.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
name = "words_extractor_jl"
2+
uuid = "ab5e5b3c-2775-42ba-a2f5-dc8ee1810597"
3+
authors = ["Jaroslaw Zabiello <[email protected]>"]
4+
version = "0.1.0"
5+
6+
[deps]
7+
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
8+
IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
9+
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
10+
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
module words_extractor_jl
2+
3+
using Distributed
4+
using Pipe
5+
using YAML
6+
7+
# Output directory (relative to the current working directory) that receives
# one word-list file per processed input. Declared `const` so the functions
# that read it see a concretely typed binding instead of an untyped global.
const folder = "words"
8+
9+
"""
    worker(yaml_path)

Process a single metadata file: resolve the output file with `get_filepath`,
gather the words with `get_words`, write them to the output file separated by
newlines, and print a confirmation line.
"""
function worker(yaml_path)
    destination = get_filepath(yaml_path)
    content = join(get_words(yaml_path), "\n")
    write(destination, content)
    println("Saved...", destination)
    return nothing
end
15+
16+
"""
    get_words(yaml_path)

Return the distinct lowercase words of the text file that sits next to
`yaml_path`, i.e. the same path with its extension replaced by `.txt`.

The text is lowercased and split on runs matching `[\\W\\d]+` (non-word
characters and digits). Empty tokens are discarded. The result is a vector of
unique substrings in unspecified order.
"""
function get_words(yaml_path)
    # Swap only the trailing extension; a plain `replace` would also rewrite
    # any ".yml" that happens to occur inside a directory name.
    text_path = string(first(splitext(yaml_path)), ".txt")
    text = lowercase(read(text_path, String))
    # `keepempty = false` drops the "" tokens that `split` yields when the
    # text starts or ends with a separator (e.g. a trailing newline).
    words = split(text, r"[\W\d]+"; keepempty = false)
    return collect(Set(words))
end
21+
22+
"""
    get_filepath(path)

Build the output file path for the YAML metadata file at `path`.

The file is loaded with `YAML.load_file` and its `"label"` entry is embedded
in a file name located under the module-level `folder`.
"""
function get_filepath(path)
    label = YAML.load_file(path)["label"]
    return string(folder, "/extracted-words-for-", label, ".txt")
end
26+
27+
"""
    walk(path, file_ext)

Recursively collect the paths of all files under `path` whose names end with
`file_ext`.

Returns a `Vector{String}`; each entry is the directory reported by `walkdir`
joined with the file name, in the order `walkdir` visits them. The vector is
empty when nothing matches.
"""
function walk(path, file_ext)
    # Typed accumulator: a bare `[]` would produce a `Vector{Any}`.
    matches = String[]
    for (root, _, files) in walkdir(path; topdown = true)
        for file in files
            endswith(file, file_ext) && push!(matches, joinpath(root, file))
        end
    end
    return matches
end
39+
40+
"""
    main()

Recreate the output directory `folder` from scratch, then run `worker` on
every `.yml` file found under `../data/pl/`, with the loop iterations spread
over the available threads via `Threads.@threads`.
"""
function main()
    # Start every run with an empty output directory.
    ispath(folder) && rm(folder, recursive = true)
    mkdir(folder)

    yaml_files = walk("../data/pl/", ".yml")
    Threads.@threads for path in yaml_files
        println("Spawn $path")
        worker(path)
    end
end
50+
51+
# addprocs()
52+
# println(string("Workers ", nworkers()))
53+
# Executed when the module is loaded: report the thread count, then run the
# whole extraction under `@time`.
println("Processing... using ", Threads.nthreads(), " threads")
@time main()
55+
end # module

0 commit comments

Comments
 (0)