Be more tolerant when parsing RFC2047 encoded mail headers.

schlatterbeck · schlatterbeck · commit c000ce195019 · 2012-01-04T18:55:49.000+01:00
Use backported version of my proposed changes to email.header.decode_header in http://bugs.python.org/issue1079
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -51,6 +51,9 @@ Fixed:
   formating.
 - Fix xmlrpc URL parsing so that passwords may contain a ':' character
   (Ralf)
+- Be more tolerant when parsing RFC2047 encoded mail headers. Use
+  backported version of my proposed changes to
+  email.header.decode_header in http://bugs.python.org/issue1079
 
 
 2011-07-15: 1.4.19
diff --git a/roundup/anypy/email_.py b/roundup/anypy/email_.py
@@ -1,3 +1,7 @@
+import re
+import binascii
+from email import quoprimime, base64mime
+
 try:
     # Python 2.5+
     from email.parser import FeedParser
@@ -17,3 +21,115 @@ def feed(self, s):
             def close(self):
                 p = Parser()
                 return p.parsestr(''.join(self.content))
+
+# Match RFC2047 encoded-word strings of the form =?charset?q?Hello_World?=
+ecre = re.compile(r'''
+  =\?                   # literal =?
+  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
+  \?                    # literal ?
+  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
+  \?                    # literal ?
+  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
+  \?=                   # literal ?=
+  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
+
+
+# Fixed header parser, see my proposed patch and discussions:
+# http://bugs.python.org/issue1079 "decode_header does not follow RFC 2047"
+# http://bugs.python.org/issue1467619 "Header.decode_header eats up spaces"
+# This implements the decode_header specific parts of my proposed patch
+# backported to python2.X
+def decode_header(header):
+    """Decode a message header value without converting charset.
+
+    Returns a list of (string, charset) pairs containing each of the decoded
+    parts of the header.  Charset is None for non-encoded parts of the header,
+    otherwise a lower-case string containing the name of the character set
+    specified in the encoded string.
+
+    header may be a string that may or may not contain RFC2047 encoded words,
+    or it may be a Header object, which is rendered with str() first.
+
+    Whitespace between two encoded words is dropped, adjacent parts with the
+    same charset are merged, and unencoded parts from consecutive (folded)
+    lines are joined with a single space.  For example
+    'a =?utf-8?q?b?= =?utf-8?q?c?=' yields [('a ', None), ('bc', 'utf-8')].
+
+    An email.errors.HeaderParseError is raised when a base64 encoded word
+    cannot be decoded.
+    """
+    # Local import: the module header does not import email.errors.
+    from email.errors import HeaderParseError
+    # A Header object is rendered to its RFC2047 string form and parsed like
+    # any other string, as the python2 standard library decode_header does.
+    if hasattr(header, '_chunks'):
+        header = str(header)
+    # If no encoding, just return the header with no charset.
+    if not ecre.search(header):
+        return [(header, None)]
+    # First step is to parse all the encoded parts into triplets of the form
+    # (encoded_string, encoding, charset).  For unencoded strings, the last
+    # two parts will be None.
+    words = []
+    for line in header.splitlines():
+        # With three groups in ecre, split returns 1 + 4*k items:
+        # text, (charset, encoding, encoded, text) * k.  The text items
+        # may be empty strings.
+        parts = ecre.split(line)
+        # Leading (folding) whitespace of a line is not part of the text.
+        parts[0] = parts[0].lstrip()
+        for i in range(0, len(parts), 4):
+            if parts[i]:
+                words.append((parts[i], None, None))
+            if i + 1 < len(parts):
+                # Charset and encoding names are normalised to lower case.
+                charset, encoding, encoded = parts[i + 1:i + 4]
+                words.append((encoded, encoding.lower(), charset.lower()))
+    # Now remove words that consist of whitespace between two encoded
+    # strings.  The comprehension tests against the unmodified list, which
+    # is only rebound once the filtered copy is complete.
+    words = [w for n, w in enumerate(words)
+             if not (0 < n < len(words) - 1 and w[0].isspace()
+                     and words[n - 1][1] and words[n + 1][1])]
+
+    # The next step is to decode each encoded word by applying the reverse
+    # base64 or quopri transformation.  decoded_words is now a list of the
+    # form (decoded_word, charset).
+    decoded_words = []
+    for encoded_string, encoding, charset in words:
+        if encoding is None:
+            # This is an unencoded word.
+            word = encoded_string
+        elif encoding == 'q':
+            # Quoted-printable ("Q") encoded word.
+            word = quoprimime.header_decode(encoded_string)
+        elif encoding == 'b':
+            # Base64 ("B") encoded word.
+            # Postel's law: add missing padding
+            encoded_string += '=' * (-len(encoded_string) % 4)
+            try:
+                word = base64mime.decode(encoded_string)
+            except binascii.Error:
+                # Turn this into a higher level exception.
+                raise HeaderParseError('Base64 decoding error')
+        else:
+            # Cannot happen: ecre only matches "q" or "b".
+            raise AssertionError('Unexpected encoding: ' + encoding)
+        decoded_words.append((word, charset))
+    # Now collapse consecutive runs of words that share a charset.  Two
+    # unencoded words can only be adjacent across a line break, so they are
+    # joined with a single space.
+    collapsed = []
+    for word, charset in decoded_words:
+        if not collapsed or collapsed[-1][1] != charset:
+            # First word, or a charset change: start a new run.
+            collapsed.append((word, charset))
+            continue
+        # Same charset as the previous run: extend that run in place.
+        last_word = collapsed[-1][0]
+        if charset is None:
+            # The line break that separated the two words stood for a space.
+            last_word += ' '
+        collapsed[-1] = (last_word + word, charset)
+    return collapsed
+
diff --git a/roundup/mailgw.py b/roundup/mailgw.py
@@ -82,7 +82,7 @@ class node. Any parts of other types are each stored in separate files
 import time, random, sys, logging
 import traceback, rfc822
 
-from email.Header import decode_header
+from anypy.email_ import decode_header
 
 from roundup import configuration, hyperdb, date, password, rfc2822, exceptions
 from roundup.mailer import Mailer, MessageSendError
@@ -244,17 +244,14 @@ def getparts(self):
 
     def _decode_header_to_utf8(self, hdr):
         l = []
-        prev_encoded = False
         for part, encoding in decode_header(hdr):
             if encoding:
                 part = part.decode(encoding)
             # RFC 2047 specifies that between encoded parts spaces are
             # swallowed while at the borders from encoded to non-encoded
             # or vice-versa we must preserve a space. Multiple adjacent
-            # non-encoded parts should not occur.
-            if l and prev_encoded != bool(encoding):
-                l.append(' ')
-            prev_encoded = bool(encoding)
+            # non-encoded parts should not occur. All of this is now
+            # handled by the patched decode_header in roundup.anypy.email_
             l.append(part)
         return ''.join([s.encode('utf-8') for s in l])
 
diff --git a/test/test_mailgw.py b/test/test_mailgw.py
@@ -156,6 +156,15 @@ def tearDown(self):
             os.remove(SENDMAILDEBUG)
         self.db.close()
 
+    def _allowAnonymousSubmit(self):
+        """Give the anonymous role exactly the permissions listed below."""
+        security = self.db.security
+        wanted = (('Register', 'user'), ('Email Access', None),
+                  ('Create', 'issue'), ('Create', 'msg'))
+        # The role's previous permission list is replaced, not extended.
+        security.role['anonymous'].permissions = [
+            security.getPermission(name, klass) for name, klass in wanted]
+
     def _create_mailgw(self, message, args=()):
         class MailGW(self.instance.MailGW):
             def handle_message(self, message):
@@ -1924,13 +1933,7 @@ def testNewUserAuthorEncodedName(self):
 
 This is a test submission of a new issue.
 '''
-        p = [
-            self.db.security.getPermission('Register', 'user'),
-            self.db.security.getPermission('Email Access', None),
-            self.db.security.getPermission('Create', 'issue'),
-            self.db.security.getPermission('Create', 'msg'),
-        ]
-        self.db.security.role['anonymous'].permissions = p
+        self._allowAnonymousSubmit()
         self._handle_mail(message)
         m = set(self.db.user.list())
         new = list(m - l)[0]
@@ -1951,13 +1954,7 @@ def testNewUserAuthorMixedEncodedName(self):
 
 This is a test submission of a new issue.
 '''
-        p = [
-            self.db.security.getPermission('Register', 'user'),
-            self.db.security.getPermission('Email Access', None),
-            self.db.security.getPermission('Create', 'issue'),
-            self.db.security.getPermission('Create', 'msg'),
-        ]
-        self.db.security.role['anonymous'].permissions = p
+        self._allowAnonymousSubmit()
         self._handle_mail(message)
         title = self.db.issue.get('1', 'title')
         self.assertEquals(title, 'Test \xc3\x84\xc3\x96\xc3\x9c umlauts X1 X2')
@@ -1966,6 +1963,29 @@ def testNewUserAuthorMixedEncodedName(self):
         name = self.db.user.get(new, 'realname')
         self.assertEquals(name, 'Firstname \xc3\xa4\xc3\xb6\xc3\x9f Last')
 
+    def testNewUserAuthorMixedEncodedNameSpacing(self):
+        l = set(self.db.user.list())
+        # From: name is two adjacent encoded words with no space in between
+        message = '''Content-Type: text/plain;
+  charset="iso-8859-1"
+From: (=?utf-8?b?w6TDtsOf?==?utf-8?b?w6TDtsOf?=) <fubar@bork.bork.bork>
+To: issue_tracker@your.tracker.email.domain.example
+Message-Id: <dummy_test_message_id>
+Subject: [issue] Test (=?utf-8?b?w4TDlsOc?=) umlauts
+ X1
+
+This is a test submission of a new issue.
+'''
+        self._allowAnonymousSubmit()
+        self._handle_mail(message)
+        title = self.db.issue.get('1', 'title')
+        self.assertEquals(title, 'Test (\xc3\x84\xc3\x96\xc3\x9c) umlauts X1')
+        m = set(self.db.user.list())
+        new = list(m - l)[0]
+        name = self.db.user.get(new, 'realname')
+        self.assertEquals(name,
+            '(\xc3\xa4\xc3\xb6\xc3\x9f\xc3\xa4\xc3\xb6\xc3\x9f)')
+
     def testUnknownUser(self):
         l = set(self.db.user.list())
         message = '''Content-Type: text/plain;