Skip to content

Commit 2973b88

Browse files
committed
issue2550799: provide basic support for handling html only emails
Initial implementation and testing with the dehtml html converter is done. The use of beautifulsoup 4 is not tested. My test system breaks when running dehtml.py using beautiful soup. I don't get the failures when running under the test harness, but the text output is significantly different (different line breaks, number of newlines, etc.). The tests for dehtml need to be generated for beautiful soup and the expected output changed. Since I have a wonky install of beautiful soup, I don't trust my output as the standard to test against. Also, since beautiful soup is optional, the test harness needs to skip the beautifulsoup tests if import bs4 fails. Again, this is something outside of my expertise. I deleted the work I had done to implement that: I could not get it working and wanted to get this feature in, in some form.
1 parent f8b4786 commit 2973b88

File tree

6 files changed

+548
-21
lines changed

6 files changed

+548
-21
lines changed

CHANGES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,12 @@ Features:
216216
figure out the original client ip and protocol. None of the core
217217
roundup code uses these headers/env vars. These headers can be
218218
spoofed by bad proxies etc. so you have been warned.
219+
- issue2550799: provide basic support for handling html only emails
220+
Emails missing text/plain parts but with text/html parts can be
221+
converted into text. If this is done the email will no longer be
222+
bounced back to the sender with an error. Enable by configuring the
223+
convert_htmltotext option in your upgraded config.ini. (Initial
224+
patch by Igor Ippolitov merged with changes by John Rouillard.)
219225

220226
Fixed:
221227

roundup/configuration.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,19 @@ def str2value(self, value):
324324
else:
325325
raise OptionValueError(self, value, self.class_description)
326326

327+
class HtmlToTextOption(Option):

    """Select the module that converts html-only emails into text.

    Accepted settings are "beautifulsoup" (Beautiful Soup 4), "dehtml"
    (the internal code) and "none" (no html to text conversion).  The
    setting is matched case-insensitively.  If beautifulsoup is chosen
    but not available, dehtml will be used.
    """

    class_description = "Allowed values: beautifulsoup, dehtml, none"

    def str2value(self, value):
        """Return the lower-cased setting; raise OptionValueError if the
        value is not one of the allowed names."""
        normalized = value.lower()
        if normalized not in ("beautifulsoup", "dehtml", "none"):
            raise OptionValueError(self, value, self.class_description)
        return normalized
339+
327340
class EmailBodyOption(Option):
328341

329342
"""When to replace message body or strip quoting: always, never or for new items only"""
@@ -1012,6 +1025,14 @@ def str2value(self, value):
10121025
"multipart/alternative, and this option is set, all other\n"
10131026
"parts of the multipart/alternative are ignored. The default\n"
10141027
"is to keep all parts and attach them to the issue."),
1028+
(HtmlToTextOption, "convert_htmltotext", "none",
1029+
"If an email has only text/html parts, use this module\n"
1030+
"to convert the html to text. Choose from beautifulsoup 4,\n"
1031+
"dehtml - (internal code), or none to disable conversion.\n"
1032+
"If 'none' is selected, email without a text/plain part\n"
1033+
"will be returned to the user with a message. If\n"
1034+
"beautifulsoup is selected but not installed dehtml will\n"
1035+
"be used instead."),
10151036
(BooleanOption, "keep_real_from", "no",
10161037
"When handling emails ignore the Resent-From:-header\n"
10171038
"and use the original senders From:-header instead.\n"

roundup/dehtml.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
2+
class dehtml:
    """Factory for an html to text conversion function.

    After construction ``self.html2text`` is either ``None`` (when
    ``converter`` is "none") or a callable that takes an html string and
    returns its text content.

    converter -- "none" disables conversion.  "beautifulsoup" uses
                 Beautiful Soup 4 when it can be imported and falls back
                 to the built-in HTMLParser based converter when it
                 can not.  Any other value (normally "dehtml") selects
                 the built-in converter.
    """

    def __init__(self, converter):
        if converter == "none":
            self.html2text = None
            return

        if converter == "beautifulsoup":
            try:
                # Not as well tested as dehtml.
                from bs4 import BeautifulSoup
            except ImportError:
                # beautiful soup is not installed: use the fallback below.
                pass
            else:
                def html2text(html):
                    # Name the parser explicitly.  Without it bs4 picks
                    # whichever parser is installed on the host, so the
                    # generated text differs from system to system.
                    soup = BeautifulSoup(html, "html.parser")

                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()

                    return soup.get_text('\n', strip=True).encode('utf-8')

                self.html2text = html2text
                return

        # Built-in converter.  Try the Python 2 module names first and
        # fall back to the Python 3 ones.
        try:
            from HTMLParser import HTMLParser
            from htmlentitydefs import name2codepoint
        except ImportError:
            from html.parser import HTMLParser
            from html.entities import name2codepoint

        try:
            uchr = unichr
        except NameError:
            # Python 3: chr() already produces unicode characters
            uchr = chr

        class DumbHTMLParser(HTMLParser):
            # class attribute: the text collected so far
            text = ""

            # internal state variables
            _skip_data = False   # True while inside a script/style block
            _last_empty = False  # True if the last data kept was blank

            def _add_char(self, char):
                """Append a decoded character reference to the text."""
                try:
                    self.text = self.text + char
                except UnicodeError:
                    # the character can not be combined with the text
                    # collected so far: use a space as a placeholder
                    self.text = self.text + " "

            def handle_data(self, data):
                if self._skip_data:  # skip data if in script or style block
                    return

                if data.strip() == "":
                    # reduce multiple blank lines to 1
                    if self._last_empty:
                        return
                    self._last_empty = True
                else:
                    self._last_empty = False

                self.text = self.text + data

            def handle_starttag(self, tag, attrs):
                if tag == "p":
                    # start every paragraph on a new line
                    self.text = self.text + "\n"
                if tag in ("style", "script"):
                    self._skip_data = True

            def handle_endtag(self, tag):
                if tag in ("style", "script"):
                    self._skip_data = False

            def handle_entityref(self, name):
                if self._skip_data:
                    return
                try:
                    char = uchr(name2codepoint[name])
                except KeyError:
                    # unknown entity in malformed html: keep it verbatim
                    # instead of failing the whole conversion
                    char = "&%s;" % name
                self._add_char(char)

            def handle_charref(self, name):
                # numeric references such as &#8217; or &#x2019;
                if self._skip_data:
                    return
                try:
                    if name[:1] in ("x", "X"):
                        char = uchr(int(name[1:], 16))
                    else:
                        char = uchr(int(name))
                except (ValueError, OverflowError):
                    # not a usable code point: keep the reference verbatim
                    char = "&#%s;" % name
                self._add_char(char)

        def html2text(html):
            parser = DumbHTMLParser()
            parser.feed(html)
            parser.close()
            return parser.text

        self.html2text = html2text
79+
80+
if __name__ == "__main__":
    # Ad-hoc smoke test: run the sample document below through each
    # converter setting and print the result.  The single-argument
    # print(...) form is used so the output is the same whether print
    # is a statement (Python 2) or a function (Python 3).
    html = '''
<body>
<script>
this must not be in output
</script>
<style>
p {display:block}
</style>
<div class="header"><h1>Roundup</h1>
<div id="searchbox" style="display: none">
<form class="search" action="../search.html" method="get">
<input type="text" name="q" size="18" />
<input type="submit" value="Search" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
<li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
<li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
</ul>
<div class="section" id="prerequisites">
<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
<p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
It is highly recommended that users install the latest patch version
of python as these contain many fixes to serious bugs.</p>
<p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
installed for Roundup installation to work. Debian and derivatives, are
known to require this.</p>
<p>If you&#8217;re on windows, you will either need to be using the ActiveState python
distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
have to install the win32all package separately (get it from
<a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
</div>
</body>
'''

    # built-in converter
    html2text = dehtml("dehtml").html2text
    if html2text:
        print(html2text(html))

    try:
        # trap error seen if N_TOKENS not defined when run.
        html2text = dehtml("beautifulsoup").html2text
        if html2text:
            print(html2text(html))
    except NameError as e:
        print("captured error %s" % e)

    # "none" must disable conversion: no function is handed back
    html2text = dehtml("none").html2text
    if html2text:
        print("FAIL: Error, dehtml(none) is returning a function")
    else:
        print("PASS: dehtml(none) is returning None")
146+
147+

roundup/mailgw.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ def getbody(self):
382382
# Only if configured in [mailgw] unpack_rfc822
383383

384384
def extract_content(self, parent_type=None, ignore_alternatives=False,
385-
unpack_rfc822=False):
385+
unpack_rfc822=False, html2text=None):
386386
"""Extract the body and the attachments recursively.
387387
388388
If the content is hidden inside a multipart/alternative part,
@@ -392,24 +392,43 @@ def extract_content(self, parent_type=None, ignore_alternatives=False,
392392
content_type = self.gettype()
393393
content = None
394394
attachments = []
395+
html_part = False
395396

396397
if content_type == 'text/plain':
397398
content = self.getbody()
399+
elif content_type == 'text/html' and html2text:
400+
# if user allows html conversion run this.
401+
content = html2text(self.getbody())
402+
attachments.append(self.as_attachment())
403+
html_part = True
398404
elif content_type[:10] == 'multipart/':
399-
content_found = bool (content)
400-
ig = ignore_alternatives and not content_found
405+
content_found = False
406+
ig = ignore_alternatives
407+
html_part_found = False
401408
for part in self.getparts():
402-
new_content, new_attach = part.extract_content(content_type,
403-
not content and ig, unpack_rfc822)
409+
new_content, new_attach, html_part = part.extract_content(
410+
content_type, not content and ig, unpack_rfc822,
411+
html2text)
404412

405413
# If we haven't found a text/plain part yet, take this one,
406414
# otherwise make it an attachment.
407415
if not content:
408416
content = new_content
409417
cpart = part
418+
if html_part:
419+
html_part_found = True
410420
elif new_content:
411-
if content_found or content_type != 'multipart/alternative':
421+
if html_part:
422+
# attachment should be added elsewhere.
423+
pass
424+
elif content_found or content_type != 'multipart/alternative':
412425
attachments.append(part.text_as_attachment())
426+
elif html_part_found:
427+
# text/plain part found after html
428+
# save html as attachment
429+
attachments.append(cpart.as_attachment())
430+
content = new_content
431+
cpart = part
413432
else:
414433
# if we have found a text/plain in the current
415434
# multipart/alternative and find another one, we
@@ -425,20 +444,21 @@ def extract_content(self, parent_type=None, ignore_alternatives=False,
425444
attachments.extend(new_attach)
426445
if ig and content_type == 'multipart/alternative' and content:
427446
attachments = []
447+
html_part = False
428448
elif unpack_rfc822 and content_type == 'message/rfc822':
429449
s = cStringIO.StringIO(self.getbody())
430450
m = Message(s)
431451
ig = ignore_alternatives and not content
432-
new_content, attachments = m.extract_content(m.gettype(), ig,
433-
unpack_rfc822)
452+
new_content, attachments, html_part = m.extract_content(m.gettype(), ig,
453+
unpack_rfc822, html2text)
434454
attachments.insert(0, m.text_as_attachment())
435455
elif (parent_type == 'multipart/signed' and
436456
content_type == 'application/pgp-signature'):
437457
# ignore it so it won't be saved as an attachment
438458
pass
439459
else:
440460
attachments.append(self.as_attachment())
441-
return content, attachments
461+
return content, attachments, html_part
442462

443463
def text_as_attachment(self):
444464
"""Return first text/plain part as Message"""
@@ -1072,10 +1092,15 @@ def pgp_role():
10721092
def get_content_and_attachments(self):
10731093
''' get the attachments and first text part from the message
10741094
'''
1095+
from roundup.dehtml import dehtml
1096+
html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text
1097+
10751098
ig = self.config.MAILGW_IGNORE_ALTERNATIVES
1076-
self.content, self.attachments = self.message.extract_content(
1099+
self.message.instance = self.mailgw.instance
1100+
self.content, self.attachments, html_part = self.message.extract_content(
10771101
ignore_alternatives=ig,
1078-
unpack_rfc822=self.config.MAILGW_UNPACK_RFC822)
1102+
unpack_rfc822=self.config.MAILGW_UNPACK_RFC822,
1103+
html2text=html2text )
10791104

10801105

10811106
def create_files(self):

0 commit comments

Comments
 (0)