working branch for fixing https://issues.roundup-tracker.org/issue2551008

rouilj · rouilj · commit 2ff9300c2ffe · 2019-06-03T20:50:58.000-04:00
diff --git a/roundup/anypy/email_.py b/roundup/anypy/email_.py
@@ -3,6 +3,9 @@
 import email
 from email import quoprimime, base64mime
 
+from roundup.anypy.strings import bs2b
+from email import charset as _charset
+
 if str == bytes:
     message_from_bytes = email.message_from_string
     message_from_binary_file = email.message_from_file
@@ -45,11 +48,12 @@ def decode_header(header):
     """
     # If it is a Header object, we can just return the encoded chunks.
     if hasattr(header, '_chunks'):
+        # no bs2b needed here: _charset._encode should already return bytes
         return [(_charset._encode(string, str(charset)), str(charset))
                     for string, charset in header._chunks]
     # If no encoding, just return the header with no charset.
     if not ecre.search(header):
-        return [(header, None)]
+        return [(bs2b(header), None)]
     # First step is to parse all the encoded parts into triplets of the form
     # (encoded_string, encoding, charset).  For unencoded strings, the last
     # two parts will be None.
@@ -113,14 +117,14 @@ def decode_header(header):
             last_word = word
             last_charset = charset
         elif charset != last_charset:
-            collapsed.append((last_word, last_charset))
+            collapsed.append((bs2b(last_word), last_charset))
             last_word = word
             last_charset = charset
         elif last_charset is None:
             BSPACE = b' '
             last_word += BSPACE + word
         else:
             last_word += word
-    collapsed.append((last_word, last_charset))
+    collapsed.append((bs2b(last_word), last_charset))
     return collapsed
 
diff --git a/roundup/mailgw.py b/roundup/mailgw.py
@@ -204,6 +204,13 @@ def _decode_header(self, hdr):
         for part, encoding in decode_header(hdr):
             if encoding:
                 part = part.decode(encoding)
+            else:
+                # if the encoding is unknown, try decoding with utf-8
+                # and fall back to iso-8859-1 if that fails
+                try:
+                    part = part.decode('utf-8')
+                except UnicodeDecodeError:
+                    part = part.decode('iso-8859-1')
             # RFC 2047 specifies that between encoded parts spaces are
             # swallowed while at the borders from encoded to non-encoded
             # or vice-versa we must preserve a space. Multiple adjacent
diff --git a/test/test_mailgw_roundupmessage.py b/test/test_mailgw_roundupmessage.py
@@ -75,6 +75,34 @@ class HeaderRoundupMessageTests(TestCase):
         This is a test submission of a new issue.
     """)
 
+    # From header's encoded word has an empty charset field (=??b?...)
+    # to trigger failure in mailgw.py:RoundupMessage::_decode_header
+    bad_msg_utf8 = message_from_string("""
+        Content-Type: text/plain;
+            charset="iso-8859-1"
+        From: =??b?SOKCrGxsbw=====?= <hello@example.com>
+        To: Issue Tracker <issue_tracker@example.com>
+        Cc: =?utf8?b?SOKCrGxsbw==?= <hello@example.com>,
+            Some User <some.user@example.com>
+        Message-Id: <dummy_test_message_id>
+        Subject: [issue] Testing...
+
+        This is a test submission of a new issue.
+    """)
+
+    bad_msg_iso_8859_1 = message_from_string("""
+        Content-Type: text/plain;
+            charset="iso-8859-1"
+        From: =??q?\x80SOKCrGxsbw=====?= <hello@example.com>
+        To: Issue Tracker <issue_tracker@example.com>
+        Cc: =?utf8?b?SOKCrGxsbw==?= <hello@example.com>,
+            Some User <some.user@example.com>
+        Message-Id: <dummy_test_message_id>
+        Subject: [issue] Testing...
+
+        This is a test submission of a new issue.
+    """)
+
     def test_get_plain_header(self):
         self.assertEqual(
             self.msg.get_header('to'),
@@ -85,6 +113,21 @@ def test_get_encoded_header(self):
             self.msg.get_header('from'),
             'H€llo <hello@example.com>')
 
+        # issue2551008 null encoding causes crash.
+        self.assertEqual(
+            self.bad_msg_utf8.get_header('from'),
+            'H€llo <hello@example.com>')
+
+        # the decoded value is not what the user wanted,
+        # but they should have created a valid header
+        # if they wanted the right outcome...
+        self.assertIn(
+            self.bad_msg_iso_8859_1.get_header('from'),
+            (
+                '\xc2\x80SOKCrGxsbw===== <hello@example.com>', # python 2
+                '\x80SOKCrGxsbw===== <hello@example.com>'      # python 3
+            ))
+
     def test_get_address_list(self):
         self.assertEqual(self.msg.get_address_list('cc'), [
             ('H€llo', 'hello@example.com'),