Skip to content

Commit c000ce1

Browse files
committed
Be more tolerant when parsing RFC2047 encoded mail headers.
Use backported version of my proposed changes to email.header.decode_header in http://bugs.python.org/issue1079
1 parent 7c5d46d commit c000ce1

File tree

4 files changed

+156
-20
lines changed

4 files changed

+156
-20
lines changed

CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ Fixed:
5151
formatting.
5252
- Fix xmlrpc URL parsing so that passwords may contain a ':' character
5353
(Ralf)
54+
- Be more tolerant when parsing RFC2047 encoded mail headers. Use
55+
backported version of my proposed changes to
56+
email.header.decode_header in http://bugs.python.org/issue1079
5457

5558

5659
2011-07-15: 1.4.19

roundup/anypy/email_.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import re
2+
import binascii
3+
from email import quoprimime, base64mime
4+
15
try:
26
# Python 2.5+
37
from email.parser import FeedParser
@@ -17,3 +21,115 @@ def feed(self, s):
1721
def close(self):
1822
p = Parser()
1923
return p.parsestr(''.join(self.content))
24+
25+
# Match encoded-word strings in the form =?charset?q?Hello_World?=
26+
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)


# Fixed header parser, see my proposed patch and discussions:
# http://bugs.python.org/issue1079 "decode_header does not follow RFC 2047"
# http://bugs.python.org/issue1467619 "Header.decode_header eats up spaces"
# This implements the decode_header specific parts of my proposed patch
# backported to python2.X
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the
    character set specified in the encoded string.

    header may be a string that may or may not contain RFC2047 encoded
    words, or it may be a Header object.

    Whitespace that separates two encoded words is dropped; unencoded text
    continued on a following line is joined with a single space.

    An email.errors.HeaderParseError is raised when a base64 encoded word
    cannot be decoded.
    """
    # Imported here so the block does not rely on names the module never
    # brought into scope (the original referenced them without importing).
    from email import charset as _charset
    from email.errors import HeaderParseError

    # If it is a Header object, we can just return the encoded chunks.
    if hasattr(header, '_chunks'):
        encode = getattr(_charset, '_encode', None)
        if encode is None:
            # NOTE(review): email.charset on Python 2 has no _encode helper;
            # presumably the chunks already hold encoded byte strings there
            # -- confirm against the targeted Python versions.
            return [(string, str(charset))
                    for string, charset in header._chunks]
        return [(encode(string, str(charset)), str(charset))
                for string, charset in header._chunks]

    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]

    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        # ecre has three groups, so split() returns
        # [text, charset, encoding, encoded, text, charset, ...].
        parts = ecre.split(line)
        for i in range(0, len(parts), 4):
            unencoded = parts[i]
            if i == 0:
                # Leading whitespace of a line is folding, not content.
                unencoded = unencoded.lstrip()
            if unencoded:
                words.append((unencoded, None, None))
            if i + 3 < len(parts):
                charset = parts[i + 1].lower()
                encoding = parts[i + 2].lower()
                words.append((parts[i + 3], encoding, charset))

    # Now drop words that consist of whitespace between two encoded strings.
    last = len(words) - 1
    words = [w for n, w in enumerate(words)
             if not (0 < n < last and words[n - 1][1] and words[n + 1][1]
                     and w[0].isspace())]

    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            paderr = len(encoded_string) % 4
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = base64mime.decode(encoded_string)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            decoded_words.append((word, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)

    # Collapse consecutive runs of words that share a charset.  Unencoded
    # runs only arise across folded lines, so they are joined with a space;
    # encoded runs are concatenated directly.
    collapsed = []
    for word, charset in decoded_words:
        if collapsed and collapsed[-1][1] == charset:
            previous = collapsed[-1][0]
            if charset is None:
                collapsed[-1] = (previous + ' ' + word, charset)
            else:
                collapsed[-1] = (previous + word, charset)
        else:
            collapsed.append((word, charset))
    return collapsed
135+

roundup/mailgw.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ class node. Any parts of other types are each stored in separate files
8282
import time, random, sys, logging
8383
import traceback, rfc822
8484

85-
from email.Header import decode_header
85+
from anypy.email_ import decode_header
8686

8787
from roundup import configuration, hyperdb, date, password, rfc2822, exceptions
8888
from roundup.mailer import Mailer, MessageSendError
@@ -244,17 +244,14 @@ def getparts(self):
244244

245245
def _decode_header_to_utf8(self, hdr):
246246
l = []
247-
prev_encoded = False
248247
for part, encoding in decode_header(hdr):
249248
if encoding:
250249
part = part.decode(encoding)
251250
# RFC 2047 specifies that between encoded parts spaces are
252251
# swallowed while at the borders from encoded to non-encoded
253252
# or vice-versa we must preserve a space. Multiple adjacent
254-
# non-encoded parts should not occur.
255-
if l and prev_encoded != bool(encoding):
256-
l.append(' ')
257-
prev_encoded = bool(encoding)
253+
# non-encoded parts should not occur. This is now
254+
# implemented in our patched decode_header method in anypy
258255
l.append(part)
259256
return ''.join([s.encode('utf-8') for s in l])
260257

test/test_mailgw.py

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ def tearDown(self):
156156
os.remove(SENDMAILDEBUG)
157157
self.db.close()
158158

159+
def _allowAnonymousSubmit(self):
    """Replace the 'anonymous' role's permissions with a fixed list.

    The list holds the permissions looked up for registering a user,
    'Email Access', and creating 'issue' and 'msg' items.
    """
    security = self.db.security
    wanted = (
        ('Register', 'user'),
        ('Email Access', None),
        ('Create', 'issue'),
        ('Create', 'msg'),
    )
    granted = [security.getPermission(name, klass) for name, klass in wanted]
    security.role['anonymous'].permissions = granted
167+
159168
def _create_mailgw(self, message, args=()):
160169
class MailGW(self.instance.MailGW):
161170
def handle_message(self, message):
@@ -1924,13 +1933,7 @@ def testNewUserAuthorEncodedName(self):
19241933
19251934
This is a test submission of a new issue.
19261935
'''
1927-
p = [
1928-
self.db.security.getPermission('Register', 'user'),
1929-
self.db.security.getPermission('Email Access', None),
1930-
self.db.security.getPermission('Create', 'issue'),
1931-
self.db.security.getPermission('Create', 'msg'),
1932-
]
1933-
self.db.security.role['anonymous'].permissions = p
1936+
self._allowAnonymousSubmit()
19341937
self._handle_mail(message)
19351938
m = set(self.db.user.list())
19361939
new = list(m - l)[0]
@@ -1951,13 +1954,7 @@ def testNewUserAuthorMixedEncodedName(self):
19511954
19521955
This is a test submission of a new issue.
19531956
'''
1954-
p = [
1955-
self.db.security.getPermission('Register', 'user'),
1956-
self.db.security.getPermission('Email Access', None),
1957-
self.db.security.getPermission('Create', 'issue'),
1958-
self.db.security.getPermission('Create', 'msg'),
1959-
]
1960-
self.db.security.role['anonymous'].permissions = p
1957+
self._allowAnonymousSubmit()
19611958
self._handle_mail(message)
19621959
title = self.db.issue.get('1', 'title')
19631960
self.assertEquals(title, 'Test \xc3\x84\xc3\x96\xc3\x9c umlauts X1 X2')
@@ -1966,6 +1963,29 @@ def testNewUserAuthorMixedEncodedName(self):
19661963
name = self.db.user.get(new, 'realname')
19671964
self.assertEquals(name, 'Firstname \xc3\xa4\xc3\xb6\xc3\x9f Last')
19681965

1966+
def testNewUserAuthorMixedEncodedNameSpacing(self):
1967+
l = set(self.db.user.list())
1968+
# From: name is two adjacent encoded words (umlauts) with no whitespace between them
1969+
message = '''Content-Type: text/plain;
1970+
charset="iso-8859-1"
1971+
From: (=?utf-8?b?w6TDtsOf?==?utf-8?b?w6TDtsOf?=) <[email protected]>
1972+
1973+
Message-Id: <dummy_test_message_id>
1974+
Subject: [issue] Test (=?utf-8?b?w4TDlsOc?=) umlauts
1975+
X1
1976+
1977+
This is a test submission of a new issue.
1978+
'''
1979+
self._allowAnonymousSubmit()
1980+
self._handle_mail(message)
1981+
title = self.db.issue.get('1', 'title')
1982+
self.assertEquals(title, 'Test (\xc3\x84\xc3\x96\xc3\x9c) umlauts X1')
1983+
m = set(self.db.user.list())
1984+
new = list(m - l)[0]
1985+
name = self.db.user.get(new, 'realname')
1986+
self.assertEquals(name,
1987+
'(\xc3\xa4\xc3\xb6\xc3\x9f\xc3\xa4\xc3\xb6\xc3\x9f)')
1988+
19691989
def testUnknownUser(self):
19701990
l = set(self.db.user.list())
19711991
message = '''Content-Type: text/plain;

0 commit comments

Comments
 (0)