forked from ietf-tools/datatracker
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtasks.py
More file actions
154 lines (128 loc) · 5.06 KB
/
tasks.py
File metadata and controls
154 lines (128 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright The IETF Trust 2024, All Rights Reserved
#
# Celery task definitions
#
import datetime
import io
import requests
from celery import shared_task
from django.conf import settings
from django.utils import timezone
from ietf.sync import iana
from ietf.sync import rfceditor
from ietf.utils import log
from ietf.utils.timezone import date_today
@shared_task
def rfc_editor_index_update_task(full_index=False):
    """Update document metadata from the RFC index

    Default is to examine only changes in the past 365 days. Call with full_index=True to update
    the full RFC index.

    According to comments on the original script, a year's worth took about 20s on production as of
    August 2022

    The original rfc-editor-index-update script had a long-disabled provision for running the
    rebuild_reference_relations scripts after the update. That has not been brought over
    at all because it should be implemented as its own task if it is needed.
    """
    # Only look at entries newer than this date; None means the whole index.
    skip_date = None if full_index else date_today() - datetime.timedelta(days=365)
    log.log(
        "Updating document metadata from RFC index going back to {since}, from {url}".format(
            since=skip_date if skip_date is not None else "the beginning",
            url=settings.RFC_EDITOR_INDEX_URL,
        )
    )
    try:
        response = requests.get(
            settings.RFC_EDITOR_INDEX_URL,
            timeout=30,  # seconds
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving RFC editor index: {exc}')
        return  # failed
    # Don't try to parse an HTTP error page as the index
    if response.status_code != 200:
        log.log(f'GET request returned status {response.status_code} retrieving RFC editor index')
        return  # failed
    rfc_index_xml = response.text
    index_data = rfceditor.parse_index(io.StringIO(rfc_index_xml))
    try:
        response = requests.get(
            settings.RFC_EDITOR_ERRATA_JSON_URL,
            timeout=30,  # seconds
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving RFC editor errata: {exc}')
        return  # failed
    # Don't try to parse an HTTP error page as errata JSON
    if response.status_code != 200:
        log.log(f'GET request returned status {response.status_code} retrieving RFC editor errata')
        return  # failed
    errata_data = response.json()
    # Sanity-check the payloads before applying them to the database
    if len(index_data) < rfceditor.MIN_INDEX_RESULTS:
        log.log("Not enough index entries, only %s" % len(index_data))
        return  # failed
    if len(errata_data) < rfceditor.MIN_ERRATA_RESULTS:
        log.log("Not enough errata entries, only %s" % len(errata_data))
        return  # failed
    for rfc_number, changes, doc, rfc_published in rfceditor.update_docs_from_rfc_index(
        index_data, errata_data, skip_older_than_date=skip_date
    ):
        for c in changes:
            log.log("RFC%s, %s: %s" % (rfc_number, doc.name, c))
@shared_task
def iana_changes_update_task():
    """Fetch recent changes from IANA and apply them to our document history

    Covers roughly the last 23 hours, split into requests no longer than the
    maximum interval the IANA server will accept.
    """
    # Compensate so that a change which happened right at our last request
    # boundary is not missed because our request interval is slightly off.
    CLOCK_SKEW_COMPENSATION = 5  # seconds

    # actually the interface accepts 24 hours, but then we get into
    # trouble with daylights savings - meh
    MAX_INTERVAL_ACCEPTED_BY_IANA = datetime.timedelta(hours=23)

    # Use the named constant rather than repeating the 23-hour literal so the
    # window and the per-request cap cannot drift apart.
    start = (
        timezone.now()
        - MAX_INTERVAL_ACCEPTED_BY_IANA
        + datetime.timedelta(seconds=CLOCK_SKEW_COMPENSATION)
    )
    end = start + MAX_INTERVAL_ACCEPTED_BY_IANA

    t = start
    while t < end:
        # the IANA server doesn't allow us to fetch more than a certain
        # period, so loop over the requested period and make multiple
        # requests if necessary
        text = iana.fetch_changes_json(
            settings.IANA_SYNC_CHANGES_URL, t, min(end, t + MAX_INTERVAL_ACCEPTED_BY_IANA)
        )
        log.log(f"Retrieved the JSON: {text}")

        changes = iana.parse_changes_json(text)
        added_events, warnings = iana.update_history_with_changes(
            changes, send_email=True
        )

        for e in added_events:
            log.log(
                f"Added event for {e.doc_id} {e.time}: {e.desc} (parsed json: {e.json})"
            )
        for w in warnings:
            log.log(f"WARNING: {w}")

        t += MAX_INTERVAL_ACCEPTED_BY_IANA
@shared_task
def iana_protocols_update_task():
    """Update RFC history entries from the IANA protocols page"""
    # Earliest date for which we have data suitable to update (was described as
    # "this needs to be the date where this tool is first deployed" in the original
    # iana-protocols-updates script)
    rfc_must_published_later_than = datetime.datetime(
        2012,
        11,
        26,
        tzinfo=datetime.timezone.utc,
    )
    try:
        response = requests.get(
            settings.IANA_SYNC_PROTOCOLS_URL,
            timeout=30,  # seconds
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving IANA protocols page: {exc}')
        return
    # Don't try to scrape an HTTP error page for RFC numbers
    if response.status_code != 200:
        log.log(f'GET request returned status {response.status_code} retrieving IANA protocols page')
        return
    rfc_numbers = iana.parse_protocol_page(response.text)

    def batched(l, n):
        """Split list l up in batches of max size n.

        For Python 3.12 or later, replace this with itertools.batched()
        """
        return (l[i:i + n] for i in range(0, len(l), n))

    # Process in batches to keep individual database transactions bounded
    for batch in batched(rfc_numbers, 100):
        updated = iana.update_rfc_log_from_protocol_page(
            batch,
            rfc_must_published_later_than,
        )
        for d in updated:
            log.log("Added history entry for %s" % d.display_name())