issue2550799: provide basic support for handling html only emails

rouilj · rouilj · commit 2973b8809420 · 2017-10-13T21:46:59.000-04:00
Initial implementation and testing with the dehtml HTML converter
are done.

The use of beautifulsoup 4 is not tested. My test system breaks when
running dehtml.py using beautiful soup. I don't get the failures when
running under the test harness, but the text output is significantly
different (different line breaks, number of newlines etc.)

The tests for dehtml need to be generated for beautiful soup and the
expected output changed. Since I have a wonky install of beautiful
soup, I don't trust my output as the standard to test against. Also,
since beautiful soup is optional, the test harness needs to skip the
beautifulsoup tests if import bs4 fails. Again, that is something
outside of my expertise. I deleted the work I had done to implement
that. I could not get it working and wanted to get this feature in,
in some form.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -216,6 +216,12 @@ Features:
   figure out the original client ip and protocol. None of the core
   roundup code uses these headers/env vars. These headers can be
   spoofed by bad proxies etc. so you have been warned.
+- issue2550799: provide basic support for handling html only emails
+  Emails missing text/plain parts but with text/html parts can be
+  converted into text. If this is done the email will no longer be
+  bounced back to the sender with an error. Enable by configuring the
+  convert_htmltotext option in your upgraded config.ini. (Initial
+  patch by Igor Ippolitov merged with changes by John Rouillard.)
 
 Fixed:
 
diff --git a/roundup/configuration.py b/roundup/configuration.py
@@ -324,6 +324,19 @@ def str2value(self, value):
         else:
             raise OptionValueError(self, value, self.class_description)
         
+class HtmlToTextOption(Option):
+
+    """What module should be used to convert emails with only text/html parts into text for display in roundup. Choose from beautifulsoup 4, dehtml (the internal code), or none to disable html to text conversion. If beautifulsoup is chosen but not available, dehtml will be used."""
+
+    class_description = "Allowed values: beautifulsoup, dehtml, none"
+
+    def str2value(self, value):
+        _val = value.lower()
+        if _val in ("beautifulsoup", "dehtml", "none"):
+            return _val
+        else:
+            raise OptionValueError(self, value, self.class_description)
+
 class EmailBodyOption(Option):
 
     """When to replace message body or strip quoting: always, never or for new items only"""
@@ -1012,6 +1025,14 @@ def str2value(self, value):
             "multipart/alternative, and this option is set, all other\n"
             "parts of the multipart/alternative are ignored. The default\n"
             "is to keep all parts and attach them to the issue."),
+        (HtmlToTextOption, "convert_htmltotext", "none",
+            "If an email has only text/html parts, use this module\n"
+            "to convert the html to text. Choose from beautifulsoup 4,\n"
+            "dehtml - (internal code), or none to disable conversion.\n"
+            "If 'none' is selected, email without a text/plain part\n"
+            "will be returned to the user with a message. If\n"
+            "beautifulsoup is selected but not installed dehtml will\n"
+            "be used instead."),
         (BooleanOption, "keep_real_from", "no",
             "When handling emails ignore the Resent-From:-header\n"
             "and use the original senders From:-header instead.\n"
diff --git a/roundup/dehtml.py b/roundup/dehtml.py
@@ -0,0 +1,147 @@
+
+class dehtml:
+    def __init__(self, converter):
+        if converter == "none":
+            self.html2text = None
+            return
+
+        try:
+            if converter == "beautifulsoup":
+                # Not as well tested as dehtml.
+                from bs4 import BeautifulSoup
+                def html2text(html):
+                    soup = BeautifulSoup(html)
+
+                    # kill all script and style elements
+                    for script in soup(["script", "style"]):
+                        script.extract()
+
+                    return soup.get_text('\n', strip=True).encode('utf-8')
+
+                self.html2text = html2text
+            else:
+                raise ImportError # use the dehtml fallback below
+        except ImportError:
+            # use the fallback below if beautiful soup is not installed.
+            from HTMLParser import HTMLParser
+            from htmlentitydefs import name2codepoint
+
+            class DumbHTMLParser(HTMLParser):
+                # class attribute
+                text=""
+
+                # internal state variable
+                _skip_data = False
+                _last_empty = False
+
+                def handle_data(self, data):
+                    if self._skip_data: # skip data if in script or style block
+                        return
+
+                    if ( data.strip() == ""):
+                        # reduce multiple blank lines to 1
+                        if ( self._last_empty ):
+                            return
+                        else:
+                            self._last_empty = True
+                    else:
+                        self._last_empty = False
+
+                    self.text=self.text + data
+
+                def handle_starttag(self, tag, attrs):
+                    if (tag == "p" ):
+                        self.text= self.text + "\n"
+                    if (tag  in ("style", "script")):
+                        self._skip_data = True
+
+                def handle_endtag(self, tag):
+                    if (tag  in ("style", "script")):
+                        self._skip_data = False
+
+                def handle_entityref(self, name):
+                    if self._skip_data:
+                        return
+                    c = unichr(name2codepoint[name])
+                    try:
+                        self.text= self.text + c
+                    except UnicodeEncodeError:
+                        # skip the character; no placeholder is emitted
+                        pass
+
+            def html2text(html):
+                parser = DumbHTMLParser()
+                parser.feed(html)
+                parser.close()
+                return parser.text
+
+            self.html2text = html2text
+
+if "__main__" == __name__:
+    html='''
+<body>
+<script>
+this must not be in output
+</script>
+<style>
+p {display:block}
+</style>
+    <div class="header"><h1>Roundup</h1>
+        <div id="searchbox" style="display: none">
+          <form class="search" action="../search.html" method="get">
+            <input type="text" name="q" size="18" />
+            <input type="submit" value="Search" />
+            <input type="hidden" name="check_keywords" value="yes" />
+            <input type="hidden" name="area" value="default" />
+          </form>
+        </div>
+        <script type="text/javascript">$('#searchbox').show(0);</script>
+    </div>
+       <ul class="current">
+<li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
+<li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
+<li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
+<li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
+<li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
+<li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
+<li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
+<li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
+<li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
+</ul>
+<div class="section" id="prerequisites">
+<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
+<p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
+anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
+It is highly recommended that users install the latest patch version
+of python as these contain many fixes to serious bugs.</p>
+<p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
+installed for Roundup installation to work. Debian and derivatives, are
+known to require this.</p>
+<p>If you&#8217;re on windows, you will either need to be using the ActiveState python
+distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
+have to install the win32all package separately (get it from
+<a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
+</div>
+</body>
+'''
+
+    html2text = dehtml("dehtml").html2text
+    if html2text:
+        print html2text(html)
+
+    try:
+        # trap error seen if N_TOKENS not defined when run.
+        html2text = dehtml("beautifulsoup").html2text
+        if html2text:
+            print html2text(html)
+    except NameError as e:
+        print "captured error %s"%e
+
+    html2text = dehtml("none").html2text
+    if html2text:
+        print "FAIL: Error, dehtml(none) is returning a function"
+    else:
+        print "PASS: dehtml(none) is returning None"
+
+
diff --git a/roundup/mailgw.py b/roundup/mailgw.py
@@ -382,7 +382,7 @@ def getbody(self):
     #   Only if configured in [mailgw] unpack_rfc822
 
     def extract_content(self, parent_type=None, ignore_alternatives=False,
-        unpack_rfc822=False):
+            unpack_rfc822=False, html2text=None):
         """Extract the body and the attachments recursively.
 
            If the content is hidden inside a multipart/alternative part,
@@ -392,24 +392,43 @@ def extract_content(self, parent_type=None, ignore_alternatives=False,
         content_type = self.gettype()
         content = None
         attachments = []
+        html_part = False
 
         if content_type == 'text/plain':
             content = self.getbody()
+        elif content_type == 'text/html' and html2text:
+            # if user allows html conversion run this.
+            content = html2text(self.getbody())
+            attachments.append(self.as_attachment())
+            html_part = True
         elif content_type[:10] == 'multipart/':
-            content_found = bool (content)
-            ig = ignore_alternatives and not content_found
+            content_found = False
+            ig = ignore_alternatives
+            html_part_found = False
             for part in self.getparts():
-                new_content, new_attach = part.extract_content(content_type,
-                    not content and ig, unpack_rfc822)
+                new_content, new_attach, html_part = part.extract_content(
+                     content_type, not content and ig, unpack_rfc822,
+                    html2text)
 
                 # If we haven't found a text/plain part yet, take this one,
                 # otherwise make it an attachment.
                 if not content:
                     content = new_content
                     cpart   = part
+                    if html_part:
+                        html_part_found = True
                 elif new_content:
-                    if content_found or content_type != 'multipart/alternative':
+                    if html_part:
+                        # attachment should be added elsewhere.
+                        pass
+                    elif content_found or content_type != 'multipart/alternative':
                         attachments.append(part.text_as_attachment())
+                    elif html_part_found:
+                        # text/plain part found after html
+                        # save html as attachment
+                        attachments.append(cpart.as_attachment())
+                        content = new_content
+                        cpart   = part
                     else:
                         # if we have found a text/plain in the current
                         # multipart/alternative and find another one, we
@@ -425,20 +444,21 @@ def extract_content(self, parent_type=None, ignore_alternatives=False,
                 attachments.extend(new_attach)
             if ig and content_type == 'multipart/alternative' and content:
                 attachments = []
+            html_part = False
         elif unpack_rfc822 and content_type == 'message/rfc822':
             s = cStringIO.StringIO(self.getbody())
             m = Message(s)
             ig = ignore_alternatives and not content
-            new_content, attachments = m.extract_content(m.gettype(), ig,
-                unpack_rfc822)
+            new_content, attachments, html_part = m.extract_content(m.gettype(), ig,
+                    unpack_rfc822, html2text)
             attachments.insert(0, m.text_as_attachment())
         elif (parent_type == 'multipart/signed' and
               content_type == 'application/pgp-signature'):
             # ignore it so it won't be saved as an attachment
             pass
         else:
             attachments.append(self.as_attachment())
-        return content, attachments
+        return content, attachments, html_part
 
     def text_as_attachment(self):
         """Return first text/plain part as Message"""
@@ -1072,10 +1092,15 @@ def pgp_role():
     def get_content_and_attachments(self):
         ''' get the attachments and first text part from the message
         '''
+        from roundup.dehtml import dehtml
+        html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text
+
         ig = self.config.MAILGW_IGNORE_ALTERNATIVES
-        self.content, self.attachments = self.message.extract_content(
+        self.message.instance = self.mailgw.instance
+        self.content, self.attachments, html_part = self.message.extract_content(
             ignore_alternatives=ig,
-            unpack_rfc822=self.config.MAILGW_UNPACK_RFC822)
+            unpack_rfc822=self.config.MAILGW_UNPACK_RFC822,
+            html2text=html2text )
         
 
     def create_files(self):
diff --git a/test/test_mailgw.py b/test/test_mailgw.py
diff --git a/test/test_multipart.py b/test/test_multipart.py