Add sync scripts for reading from IANA changes API, reading from the

OleLaursen · OleLaursen · commit 5282bd1d07b0 · 2012-09-17T15:54:22.000Z
protocols page (to see when references to newly published RFCs have
been updated) and parsing IANA review emails to be included as
comments
 - Legacy-Id: 4850
diff --git a/ietf/bin/iana-changes-updates b/ietf/bin/iana-changes-updates
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import os, sys, re, json, datetime, optparse
+import syslog
+
+# boilerplate
+basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+sys.path = [ basedir ] + sys.path
+
+from ietf import settings
+from django.core import management
+management.setup_environ(settings)
+
+
+from optparse import OptionParser
+
+# command line: optional explicit request period and a switch to mute emails
+parser = OptionParser()
+parser.add_option("-f", "--from", dest="start", metavar="YYYY-MM-DD HH:MM:SS",
+                  help="Start time, defaults to a little less than 23 hours ago")
+parser.add_option("-t", "--to", dest="end", metavar="YYYY-MM-DD HH:MM:SS",
+                  help="End time, defaults to 23 hours later than from")
+parser.add_option("", "--no-email", dest="send_email", action="store_false", default=True,
+                  help="Skip sending emails")
+options, args = parser.parse_args()
+
+# compensate so that we don't ask for something that happened just now and
+# then fail to get it back because our request interval is slightly off
+CLOCK_SKEW_COMPENSATION = 5 # seconds
+
+# actually the interface accepts 24 hours, but then we get into
+# trouble with daylight saving time - meh
+MAX_INTERVAL_ACCEPTED_BY_IANA = datetime.timedelta(hours=23)
+
+# the skew has to be added as a timedelta, datetime + int raises TypeError
+start = datetime.datetime.now() - MAX_INTERVAL_ACCEPTED_BY_IANA + datetime.timedelta(seconds=CLOCK_SKEW_COMPENSATION)
+if options.start:
+    start = datetime.datetime.strptime(options.start, "%Y-%m-%d %H:%M:%S")
+
+end = start + MAX_INTERVAL_ACCEPTED_BY_IANA
+if options.end:
+    end = datetime.datetime.strptime(options.end, "%Y-%m-%d %H:%M:%S")
+
+syslog.openlog(os.path.basename(__file__), syslog.LOG_PID, syslog.LOG_LOCAL0)
+
+
+from ietf.sync.iana import *
+
+syslog.syslog("Updating history log with new changes from IANA from %s, period %s - %s" % (CHANGES_URL, start, end))
+
+# the IANA server doesn't let us fetch more than a certain period at
+# once, so walk through the requested period in slices of at most the
+# accepted size and make one request per slice
+window_start = start
+while window_start < end:
+    window_end = min(end, window_start + MAX_INTERVAL_ACCEPTED_BY_IANA)
+
+    raw = fetch_changes_json(CHANGES_URL, window_start, window_end)
+    new_events, problems = update_history_with_changes(parse_changes_json(raw),
+                                                       send_email=options.send_email)
+
+    for event in new_events:
+        syslog.syslog("Added event for %s %s: %s" % (event.doc_id, event.time, event.desc))
+    for problem in problems:
+        syslog.syslog("WARNING: %s" % problem)
+
+    window_start += MAX_INTERVAL_ACCEPTED_BY_IANA
diff --git a/ietf/bin/iana-protocols-updates b/ietf/bin/iana-protocols-updates
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+import os, sys, re, json, datetime
+import syslog
+
+syslog.openlog(os.path.basename(__file__), syslog.LOG_PID, syslog.LOG_LOCAL0)
+
+# boilerplate
+basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+sys.path = [ basedir ] + sys.path
+
+from ietf import settings
+from django.core import management
+management.setup_environ(settings)
+
+
+from ietf.sync.iana import *
+
+def chunks(l, n):
+    """Return a generator over consecutive slices of list l holding at most n items each."""
+    return (l[start:start + n] for start in xrange(0, len(l), n))
+
+syslog.syslog("Updating history log with new RFC entries from IANA protocols page %s" % PROTOCOLS_URL)
+
+# FIXME: this needs to be the date where this tool is first deployed
+rfc_must_published_later_than = datetime.datetime(2012, 8, 30, 0, 0, 0)
+
+# pull the referenced RFC names off the protocols page, then update the
+# history log in batches of at most 100 names per call
+rfc_numbers = parse_protocol_page(fetch_protocol_page(PROTOCOLS_URL))
+for batch in chunks(rfc_numbers, 100):
+    updated_docs = update_rfc_log_from_protocol_page(batch, rfc_must_published_later_than)
+    for doc in updated_docs:
+        syslog.syslog("Added history entry for %s" % doc.display_name())
diff --git a/ietf/bin/iana-review-email b/ietf/bin/iana-review-email
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+import os, sys, re, json, datetime, optparse
+import syslog
+
+# boilerplate
+basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+sys.path = [ basedir ] + sys.path
+
+from ietf import settings
+from django.core import management
+management.setup_environ(settings)
+
+
+syslog.openlog(os.path.basename(__file__), syslog.LOG_PID, syslog.LOG_LOCAL0)
+
+from ietf.sync.iana import *
+
+# the raw review email arrives on stdin
+raw_email = sys.stdin.read()
+
+syslog.syslog("Reading IANA review email")
+
+doc_name, review_time, reviewer, comment = parse_review_email(raw_email)
+add_review_comment(doc_name, review_time, reviewer, comment)
+if reviewer.name == "(System)": # the person recorded for the comment is the system user
+    syslog.syslog("WARNING: person responsible for email does not have a IANA role")
diff --git a/ietf/sync/iana.py b/ietf/sync/iana.py
@@ -0,0 +1,260 @@
+import re, urllib2, json, email
+
+from django.utils.http import urlquote
+
+from ietf.doc.models import *
+from ietf.doc.utils import add_state_change_event
+from ietf.person.models import *
+from ietf.idrfc.mails import email_owner, email_state_changed, email_authors
+from ietf.utils.timezone import *
+
+PROTOCOLS_URL = "http://www.iana.org/protocols/"
+CHANGES_URL = "http://datatracker.dev.icann.org:8080/data-tracker/changes"
+
+def fetch_protocol_page(url):
+    """Fetch the page at url (the argument, not always PROTOCOLS_URL) and return its body."""
+    f = urllib2.urlopen(url)
+    text = f.read()
+    f.close()
+    return text
+def parse_protocol_page(text):
+    """Collect the RFCs referenced on the IANA protocols page.
+
+    Every "RFC <digits>" occurrence in text is turned into a document
+    name of the form rfcXXXX; duplicates are dropped and the unique
+    names are returned as a list (in no particular order)."""
+    prefix = "RFC "
+    names = set("rfc" + hit[len(prefix):] for hit in re.findall('RFC [0-9]+', text))
+    return list(names)
+
+def update_rfc_log_from_protocol_page(rfc_names, rfc_must_published_later_than):
+    """Record in the history log of each RFC that IANA now references it.
+
+    Handles the documents aliased by rfc_names that have no
+    "rfc_in_iana_registry" event yet and a "published_rfc" event at
+    rfc_must_published_later_than or later. Returns the documents
+    that got a new event."""
+    # the cutoff is there since we have a big bunch of old RFCs that
+    # we unfortunately don't have data for
+    candidates = Document.objects.filter(docalias__name__in=rfc_names)
+    candidates = candidates.exclude(docevent__type="rfc_in_iana_registry")
+    candidates = candidates.filter(docevent__type="published_rfc",
+                                   docevent__time__gte=rfc_must_published_later_than)
+
+    system = Person.objects.get(name="(System)")
+    updated = []
+    for doc in candidates.distinct():
+        notice = DocEvent(doc=doc, by=system, type="rfc_in_iana_registry")
+        notice.desc = "IANA registries were updated to include %s" % doc.display_name()
+        notice.save()
+        updated.append(doc)
+
+    return updated
+        
+    
+
+def fetch_changes_json(url, start, end):
+    """Fetch the raw response from url for the period start - end (both run through local_timezone_to_utc)."""
+    period = tuple(urlquote(local_timezone_to_utc(t).strftime("%Y-%m-%d %H:%M:%S")) for t in (start, end))
+    response = urllib2.urlopen(url + "?start=%s&end=%s" % period)
+    body = response.read()
+    response.close()
+    return body
+
+def parse_changes_json(text):
+    """Decode the JSON changes response from IANA into a list of changes.
+
+    Raises Exception if the response carries an error or if a change
+    lacks one of the fields doc, type or time. The document names are
+    cleaned up and the list is returned sorted by time, oldest first."""
+    draft_url_prefix = "http://www.ietf.org/internet-drafts/"
+    response = json.loads(text)
+    if "error" in response:
+        raise Exception("IANA server returned error: %s" % response["error"])
+    changes = response["changes"]
+    for change in changes:
+        for field in ('doc', 'type', 'time'): # rudimentary validation
+            if field not in change:
+                raise Exception('Error in response: Field %s missing in input: %s - %s' % (field, json.dumps(change), json.dumps(changes)))
+
+        name = change["doc"].strip()
+        if name.startswith(draft_url_prefix):
+            name = name[len(draft_url_prefix):]
+        change["doc"] = name
+
+    changes.sort(key=lambda c: c["time"])
+    return changes
+
+def update_history_with_changes(changes, send_email=True):
+    """Take parsed changes from IANA and apply them. Note that we
+    expect to get these chronologically sorted, otherwise the
+    change descriptions generated may not be right.
+
+    Only changes of type "iana_state" or "iana_review" are handled,
+    and emails about a new state are only sent if send_email is true.
+    Returns a tuple (added_events, warnings)."""
+    states = {} # state lookup: kind -> {state name used by IANA: State}
+    slookup = dict((s.slug, s)
+                   for s in State.objects.filter(type=StateType.objects.get(slug="draft-iana-action")))
+    states["action"] = {
+        "": slookup["newdoc"],
+        "In Progress": slookup["inprog"],
+        "Open": slookup["inprog"],
+        "pre-approval In Progress": slookup["inprog"],
+        "Waiting on Authors": slookup["waitauth"],
+        "Author": slookup["waitauth"],
+        "Waiting on ADs": slookup["waitad"],
+        "Waiting on AD": slookup["waitad"],
+        "AD": slookup["waitad"],
+        "Waiting on WGC": slookup["waitwgc"],
+        "WGC": slookup["waitwgc"],
+        "Waiting on RFC-Editor": slookup["waitrfc"],
+        "Waiting on RFC Editor": slookup["waitrfc"],
+        "RFC-Editor": slookup["waitrfc"],
+        "RFC-Ed-ACK": slookup["rfcedack"],
+        "RFC-Editor-ACK": slookup["rfcedack"],
+        "Completed": slookup["rfcedack"],
+        "On Hold": slookup["onhold"],
+        "No IC": slookup["noic"],
+    }
+
+    slookup = dict((s.slug, s)
+                  for s in State.objects.filter(type=StateType.objects.get(slug="draft-iana-review")))
+    states["review"] = {
+        "IANA Review Needed": slookup["need-rev"],
+        "IANA OK - Actions Needed": slookup["ok-act"],
+        "IANA OK - No Actions Needed": slookup["ok-noact"],
+        "IANA Not OK": slookup["not-ok"],
+        "Version Changed - Review Needed": slookup["changed"],
+        }
+
+    # so it turns out IANA has made a mistake and are including some
+    # wrong states, we'll have to skip those
+    wrong_action_states = ("Waiting on Reviewer", "Review Complete", "Last Call",
+                           "Last Call - Questions", "Evaluation", "Evaluation -  Questions",
+                           "With Reviewer", "IESG Notification Received", "Watiing on Last Call",
+                           "IANA Comments Submitted", "Waiting on Last Call")
+
+    system = Person.objects.get(name="(System)")
+
+    added_events = []
+    warnings = []
+
+    for c in changes:
+        docname = c['doc']
+        timestamp = datetime.datetime.strptime(c["time"], "%Y-%m-%d %H:%M:%S")
+        timestamp = utc_to_local_timezone(timestamp) # timestamps are in UTC
+
+        if c['type'] in ("iana_state", "iana_review"):
+            if c['type'] == "iana_state":
+                kind = "action"
+                if c["state"] in wrong_action_states:
+                    warnings.append("Wrong action state '%s' encountered in changes from IANA" % c["state"])
+                    continue
+            else:
+                kind = "review"
+
+            if c["state"] not in states[kind]:
+                warnings.append("Unknown IANA %s state %s (%s)" % (kind, c["state"], timestamp))
+                print "Unknown IANA %s state %s" % (kind, c["state"])
+                continue
+
+            state = states[kind][c["state"]]
+            state_type = "draft-iana-%s" % kind
+
+            # only a matching event on this very document means the change is already
+            # recorded - another document may well get the same state at the same time
+            e = StateDocEvent.objects.filter(type="changed_state", time=timestamp, state_type=state_type,
+                                             state=state, doc__docalias__name=docname)
+            if not e:
+                try:
+                    doc = Document.objects.get(docalias__name=docname)
+                except Document.DoesNotExist:
+                    warnings.append("Document %s not found" % docname)
+                    continue
+
+                # the naive way of extracting prev_state here means
+                # that we assume these changes are chronologically
+                # applied
+                prev_state = doc.get_state(state_type)
+                e = add_state_change_event(doc, system, prev_state, state, timestamp)
+                added_events.append(e)
+
+                if not StateDocEvent.objects.filter(doc=doc, time__gt=timestamp, state_type=state_type):
+                    save_document_in_history(doc)
+                    doc.set_state(state)
+                    if send_email:
+                        email_state_changed(None, doc, "IANA %s state changed to %s" % (kind, state.name))
+                        email_owner(None, doc, doc.ad, system, "IANA %s state changed to %s" % (kind, state.name))
+
+                if doc.time < timestamp:
+                    doc.time = timestamp
+                    doc.save()
+
+    return added_events, warnings
+
+
+def parse_review_email(text):
+    """Parse an IANA review email into (doc_name, review_time, by, comment).
+
+    Missing pieces degrade gracefully: doc_name becomes "", review_time
+    the current time and by the (System) person, while a missing IANA
+    last call marker leaves the comment unbounded on that side."""
+    msg = email.message_from_string(text)
+    # doc (a missing header is None, which re.search can't take)
+    doc_name = ""
+    m = re.search(r"<([^>]+)>", msg["Subject"] or "")
+    if m:
+        doc_name = m.group(1).lower()
+        if re.search(r"\.\w{3}$", doc_name): # strip off extension
+            doc_name = doc_name[:-4]
+        if re.search(r"-\d{2}$", doc_name): # strip off revision
+            doc_name = doc_name[:-3]
+
+    # date
+    review_time = datetime.datetime.now()
+    if "Date" in msg:
+        review_time = email_time_to_local_timezone(msg["Date"])
+
+    # by
+    by = None
+    m = re.search(r"\"(.*)\"", msg["From"] or "")
+    if m:
+        name = m.group(1).strip()
+        if name.endswith(" via RT"):
+            name = name[:-len(" via RT")]
+        try:
+            by = Person.objects.get(alias__name=name, role__group__acronym="iana")
+        except Person.DoesNotExist:
+            pass
+    if not by:
+        by = Person.objects.get(name="(System)")
+
+    # comment (find() returns -1 for a missing marker - don't slice with that)
+    body = msg.get_payload().decode('quoted-printable').replace("\r", "")
+    b = body.find("(BEGIN IANA LAST CALL COMMENTS)")
+    start = b + len("(BEGIN IANA LAST CALL COMMENTS)") if b != -1 else 0
+    e = body.find("(END IANA LAST CALL COMMENTS)", start)
+    stop = e if e != -1 else len(body)
+    comment = body[start:stop].strip()
+    # strip leading IESG:
+    if comment.startswith("IESG:"):
+        comment = comment[len("IESG:"):].lstrip()
+    # strip ending Thanks, followed by signature
+    m = re.compile(r"^Thanks,\n\n", re.MULTILINE).search(comment)
+    if m:
+        comment = comment[:m.start()].rstrip()
+
+    return doc_name, review_time, by, comment
+
+def add_review_comment(doc_name, review_time, by, comment):
+    """Save comment as an "iana_review" event on doc_name at review_time,
+    updating the event already stored for that document and time if any."""
+    lookup = dict(time=review_time, type="iana_review")
+    try:
+        event = DocEvent.objects.get(doc__name=doc_name, **lookup)
+    except DocEvent.DoesNotExist:
+        event = DocEvent(doc=Document.objects.get(name=doc_name), **lookup)
+
+    event.desc, event.by = comment, by
+    event.save()