Merged in [12461] from rjsparks@nostrum.com:

levkowetz · levkowetz · commit e4ce33923501 · 2016-12-05T21:03:49.000Z
Added migration to fetch text from reviews in the mail archives and populate the review documents. Fixes ietf-tools#2064. Will patch into production. - Legacy-Id: 12463 Note: SVN reference [12461] has been migrated to Git commit 63a9599
diff --git a/ietf/doc/migrations/0017_fill_review_document_contents.py b/ietf/doc/migrations/0017_fill_review_document_contents.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import debug     # pyflakes:ignore
+
+import contextlib
+import os
+import urllib2
+
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from django.db import migrations
+from django.conf import settings
+
+def get_filename(doc):
+    """Return the path of the text file that holds review document `doc`."""
+    # NOTE(review): these files appear to be created without a revision in
+    # the file name ('<name>.txt', not '<name>-<rev>.txt') -- confirm.
+    directory = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
+    return os.path.join(directory, '%s.txt' % (doc.name,))
+
+def forward(apps,schema_editor):
+    """Save the text of each archived review whose document file is missing."""
+    def mhonarc_body(page):
+        # MHonArc brackets the message body with these two HTML comments
+        head, tail = '<!--X-Body-of-Message-->', '<!--X-Body-of-Message-End-->'
+        start, end = page.find(head), page.find(tail)
+        if start < 0 or end < start:
+            return None
+        return BeautifulSoup(page[start+len(head):end],"lxml").get_text('\n\n') \
+                   .replace('FAQ at <\n\nhttp://wiki.tools','FAQ at <http://wiki.tools') \
+                   .replace('wiki/GenArtfaq\n\n>','wiki/GenArtfaq>')
+    def mailarchive_body(page):
+        div = BeautifulSoup(page,"lxml").find('div',{"id":"msg-payload"})
+        pre = div.find('pre') if div is not None else None
+        return pre.get_text('\n\n') if pre is not None else None
+    Document = apps.get_model('doc','Document')
+    for site, body_of, desc in (("www.ietf.org/mail-archive/web", mhonarc_body, "Pointers into Mhonarc"),
+                                ("mailarchive.ietf.org", mailarchive_body, "Pointers into Mailarchive")):
+        for doc in tqdm(Document.objects.filter(type='review',external_url__contains=site),desc=desc):
+            filename = get_filename(doc)
+            if not os.path.isfile(filename):
+                with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                    text = body_of(infile.read().decode('utf-8', 'ignore'))
+                if text is None:    # don't write garbage or abort the whole run
+                    tqdm.write("No message body found at %s" % doc.external_url)
+                else:
+                    with open(filename,'w') as outfile:
+                        outfile.write(text.encode('utf8'))
+
+    ## After this migration, we should figure out what to do with these stragglers:
+    ## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
+    ## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
+
+def reverse(apps,schema_editor):
+    """Do nothing: the files written by forward() are left in place."""
+
+class Migration(migrations.Migration):
+    """Data migration that runs forward() when applied and reverse() when unapplied.
+
+    It contains no schema operations and depends on doc migration 0016.
+    """
+
+    dependencies = [('doc', '0016_auto_20160927_0713')]
+
+    operations = [migrations.RunPython(forward, reverse)]
diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py
@@ -297,6 +297,20 @@ def add_events_message_info(events):
         e.in_reply_to = e.addedmessageevent.in_reply_to
 
 
+def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
+    """Return the text of `filename` decoded with `codec`, marked up unless `markup` is false.
+
+    `errors` is the decoder's error handling scheme.  When the file cannot be
+    read, an error string mentioning `key` is returned instead.
+    """
+    try:
+        with open(filename, 'rb') as f:
+            text = f.read().decode(codec, errors)
+    except IOError:
+        return "Error; cannot read ("+key+")"
+    return markup_txt.markup_unicode(text, split) if markup else text
+
+
 def get_document_content(key, filename, split=True, markup=True):
     try:
         with open(filename, 'rb') as f:
diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py
@@ -51,7 +51,7 @@
     can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
     needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
     get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
-    add_events_message_info)
+    add_events_message_info, get_unicode_document_content)
 from ietf.community.utils import augment_docs_with_tracking_info
 from ietf.group.models import Role
 from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
     if doc.type_id == "review":
         basename = "{}.txt".format(doc.name, doc.rev)
         pathname = os.path.join(doc.get_file_path(), basename)
-        content = get_document_content(basename, pathname, split=False)
+        content = get_unicode_document_content(basename, pathname, split=False)
 
         review_req = ReviewRequest.objects.filter(review=doc.name).first()
 
diff --git a/ietf/utils/markup_txt.py b/ietf/utils/markup_txt.py
@@ -71,3 +71,23 @@ def markup(content, split=True):
         #return (content1, content2)
     else:
         return "<pre>" + content + "</pre>\n"
+
+def markup_unicode(content, split=True):
+    """Return unicode `content` HTML-escaped and wrapped in <pre>...</pre>.
+
+    Line endings are normalized to LF, leading whitespace is stripped, runs
+    of blank lines are collapsed and tabs are expanded before escaping.  With
+    `split` true only the text up to the first newline at or after offset
+    5000 is returned (all of it when there is no such newline).
+    """
+    # normalize line endings to LF only
+    content = content.replace("\r\n", "\n").replace("\r", "\n")
+    # remove leading white space and runs of blank lines
+    content = re.sub("\n\n\n+", "\n\n", content.lstrip())
+    # expand tabs + escape
+    content = escape(content.expandtabs())
+    if split:
+        n = content.find("\n", 5000)
+        if n != -1:    # no newline past the cut-off means nothing to trim
+            content = content[:n+1]
+    return "<pre>" + content + "</pre>\n"
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,7 @@
 # -*- conf-mode -*-
 setuptools>=18.5			# Require this first, to prevent later errors
 #
+beautifulsoup4>=4.5.1
 bibtexparser>=0.6.2
 coverage>=4.0.1,!=4.0.2
 #cssselect>=0.6.1               # for PyQuery

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`# -*- conf-mode -*-`
`2`	`2`	`setuptools>=18.5 # Require this first, to prevent later errors`
`3`	`3`	`#`
	`4`	`+beautifulsoup4>=4.5.1`
`4`	`5`	`bibtexparser>=0.6.2`
`5`	`6`	`coverage>=4.0.1,!=4.0.2`
`6`	`7`	`#cssselect>=0.6.1 # for PyQuery`