Skip to content

Commit e4ce339

Browse files
committed
Merged in [12461] from rjsparks@nostrum.com:
Added migration to fetch text from reviews in the mail archives and populate the review documents. Fixes ietf-tools#2064. Will patch into production. - Legacy-Id: 12463 Note: SVN reference [12461] has been migrated to Git commit 63a9599
1 parent 7ad38ca commit e4ce339

5 files changed

Lines changed: 104 additions & 2 deletions

File tree

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import unicode_literals
3+
4+
import debug # pyflakes:ignore
5+
6+
import contextlib
7+
import os
8+
import urllib2
9+
10+
from bs4 import BeautifulSoup
11+
from tqdm import tqdm
12+
13+
from django.db import migrations
14+
from django.conf import settings
15+
16+
def get_filename(doc):
    """Return the path of the text file that holds the content of *doc*.

    The directory comes from ``settings.DOCUMENT_PATH_PATTERN`` formatted
    with the document; the file name is the document name plus ``.txt``.
    """
    # NOTE: these files are currently created without a revision in the
    # name (i.e. '<name>.txt' rather than '<name>-<rev>.txt').
    directory = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
    return os.path.join(directory, '%s.txt' % (doc.name,))
22+
23+
def _fetch_page(url):
    """Return the body of *url* decoded as UTF-8, dropping undecodable bytes."""
    with contextlib.closing(urllib2.urlopen(url)) as infile:
        return infile.read().decode('utf-8', 'ignore')


def _mhonarc_text(page):
    """Return the message text of a Mhonarc page, or None if it has no body markers."""
    start_marker = '<!--X-Body-of-Message-->'
    end_marker = '<!--X-Body-of-Message-End-->'
    start = page.find(start_marker)
    if start == -1:
        return None
    body_start = start + len(start_marker)
    end = page.find(end_marker, body_start)
    if end == -1:
        return None
    text = BeautifulSoup(page[body_start:end], "lxml").get_text('\n\n')
    # get_text() puts the separator around the link element in the GenArt FAQ
    # footer; stitch that URL back onto a single line.
    text = text.replace('FAQ at <\n\nhttp://wiki.tools', 'FAQ at <http://wiki.tools')
    text = text.replace('wiki/GenArtfaq\n\n>', 'wiki/GenArtfaq>')
    return text


def _mailarchive_text(page):
    """Return the message text of a Mailarchive page, or None if the payload is absent."""
    payload = BeautifulSoup(page, "lxml").find('div', {"id": "msg-payload"})
    if payload is None:
        return None
    pre = payload.find('pre')
    if pre is None:
        return None
    return pre.get_text('\n\n')


def _populate_review_files(docs, extract_text):
    """Write a text file for every document in *docs* that does not have one yet.

    *extract_text* maps the fetched archive page to the review text, or to
    None when the page does not have the expected structure; such documents
    are reported and skipped instead of being written with bogus content.
    """
    for doc in docs:
        filename = get_filename(doc)
        if os.path.isfile(filename):
            # Never overwrite content that is already on disk.
            continue
        text = extract_text(_fetch_page(doc.external_url))
        if text is None:
            tqdm.write("Skipping %s: no message text found at %s" % (doc.name, doc.external_url))
            continue
        with open(filename, 'w') as outfile:
            outfile.write(text.encode('utf8'))


def forward(apps,schema_editor):
    """Populate review document files with text fetched from the mail archives.

    For each review document whose external_url points into one of the two
    mail archives and whose file does not exist yet, the archived message is
    downloaded, reduced to its text and written to the document's file.
    """
    # for each qualifying document
    Document = apps.get_model('doc','Document')

    _populate_review_files(
        tqdm(Document.objects.filter(type='review',external_url__contains="www.ietf.org/mail-archive/web"),desc="Pointers into Mhonarc"),
        _mhonarc_text,
    )

    _populate_review_files(
        tqdm(Document.objects.filter(type='review',external_url__contains="mailarchive.ietf.org"),desc="Pointers into Mailarchive"),
        _mailarchive_text,
    )
51+
52+
## After this migration, we should figure out what to do with these stragglers:
53+
## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
54+
## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
55+
56+
def reverse(apps, schema_editor):
    """Do nothing when the migration is unapplied."""
    return None
58+
59+
class Migration(migrations.Migration):
    """Data migration: runs ``forward`` when applied and ``reverse`` when unapplied."""

    dependencies = [('doc', '0016_auto_20160927_0713')]

    operations = [migrations.RunPython(forward, reverse)]

ietf/doc/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,20 @@ def add_events_message_info(events):
297297
e.in_reply_to = e.addedmessageevent.in_reply_to
298298

299299

300+
def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
    """Read *filename* and return its decoded content.

    The file is read as bytes and decoded with *codec* using the *errors*
    policy. If the file cannot be read, an error string mentioning *key* is
    returned instead of raising. With *markup* true the text is passed
    through ``markup_txt.markup_unicode`` (together with *split*); otherwise
    the decoded text is returned unchanged.
    """
    try:
        with open(filename, 'rb') as handle:
            data = handle.read()
            text = data.decode(codec, errors)
    except IOError:
        return "Error; cannot read ("+key+")"

    if not markup:
        return text
    return markup_txt.markup_unicode(text, split)
312+
313+
300314
def get_document_content(key, filename, split=True, markup=True):
301315
try:
302316
with open(filename, 'rb') as f:

ietf/doc/views_doc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
5252
needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
5353
get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
54-
add_events_message_info)
54+
add_events_message_info, get_unicode_document_content)
5555
from ietf.community.utils import augment_docs_with_tracking_info
5656
from ietf.group.models import Role
5757
from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
582582
if doc.type_id == "review":
583583
basename = "{}.txt".format(doc.name, doc.rev)
584584
pathname = os.path.join(doc.get_file_path(), basename)
585-
content = get_document_content(basename, pathname, split=False)
585+
content = get_unicode_document_content(basename, pathname, split=False)
586586

587587
review_req = ReviewRequest.objects.filter(review=doc.name).first()
588588

ietf/utils/markup_txt.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,23 @@ def markup(content, split=True):
7171
#return (content1, content2)
7272
else:
7373
return "<pre>" + content + "</pre>\n"
74+
75+
def markup_unicode(content, split=True):
    """Return *content* HTML-escaped and wrapped in a ``<pre>`` element.

    Line endings are normalized to LF, leading white space is stripped,
    runs of blank lines are collapsed to a single blank line and tabs are
    expanded before escaping.

    With *split* true only the leading part of the text is returned: it is
    cut after the first newline found at or beyond offset 5000. Text that
    has no such newline is returned whole.
    """
    # normalize line endings to LF only
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    # remove leading white space
    content = content.lstrip()
    # remove runs of blank lines
    content = re.sub("\n\n\n+", "\n\n", content)

    # expand tabs + escape
    content = escape(content.expandtabs())

    if split:
        n = content.find("\n", 5000)
        # find() returns -1 when there is no newline past the cut-off;
        # slicing with that would drop the whole text, so only truncate
        # when a cut point actually exists.
        if n != -1:
            content = content[:n+1]
    return "<pre>" + content + "</pre>\n"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- conf-mode -*-
22
setuptools>=18.5 # Require this first, to prevent later errors
33
#
4+
beautifulsoup4>=4.5.1
45
bibtexparser>=0.6.2
56
coverage>=4.0.1,!=4.0.2
67
#cssselect>=0.6.1 # for PyQuery

0 commit comments

Comments
 (0)