1+ import re
2+ import binascii
3+ from email import quoprimime , base64mime
4+
15try :
26 # Python 2.5+
37 from email .parser import FeedParser
@@ -17,3 +21,115 @@ def feed(self, s):
def close(self):
    """Parse everything buffered so far and return the parser's result.

    The chunks collected in ``self.content`` are concatenated into one
    string and handed to a freshly created ``Parser`` via ``parsestr``.
    """
    return Parser().parsestr(''.join(self.content))
24+
# Match encoded-word strings in the form =?charset?q?Hello_World?=
#
# The pattern exposes three named groups -- "charset", "encoding" and
# "encoded" -- so ecre.split(text) returns a flat list of the form
# [unencoded, charset, encoding, encoded, unencoded, ...].
# re.IGNORECASE lets the encoding letter match in either case ("q"/"Q",
# "b"/"B").  The pattern contains no ^ or $ anchors, so re.MULTILINE does
# not change what it matches.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
35+
36+
37+ # Fixed header parser, see my proposed patch and discussions:
38+ # http://bugs.python.org/issue1079 "decode_header does not follow RFC 2047"
39+ # http://bugs.python.org/issue1467619 "Header.decode_header eats up spaces"
40+ # This implements the decode_header specific parts of my proposed patch
41+ # backported to python2.X
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the header,
    otherwise a lower-case string containing the name of the character set
    specified in the encoded string.

    header may be a string that may or may not contain RFC2047 encoded words,
    or it may be a Header object (detected by the presence of a ``_chunks``
    attribute).

    Whitespace that sits between two encoded words is dropped, and adjacent
    parts that share the same charset are merged into a single pair.

    An email.errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # NOTE(review): `_charset`, `HeaderParseError` and `BSPACE` are used below
    # but are not imported or defined in the part of this file visible during
    # review -- confirm they exist at module level.

    # If it is a Header object, we can just return the encoded chunks.
    if hasattr(header, '_chunks'):
        return [(_charset._encode(string, str(charset)), str(charset))
                for string, charset in header._chunks]
    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]

    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        parts = ecre.split(line)
        # ecre has three groups, so split() yields
        # [text, charset, encoding, encoded, text, ...]: walk it in strides
        # of four instead of repeatedly popping from the front of the list.
        for start in range(0, len(parts), 4):
            unencoded = parts[start]
            if start == 0:
                # Leading whitespace of each physical line is not content.
                unencoded = unencoded.lstrip()
            if unencoded:
                words.append((unencoded, None, None))
            if start + 1 < len(parts):
                charset = parts[start + 1].lower()
                encoding = parts[start + 2].lower()
                encoded = parts[start + 3]
                words.append((encoded, encoding, charset))

    # Now remove words that consist of whitespace between two encoded
    # strings.  The positions are computed on the unmodified list first, so
    # removing one entry cannot shift the neighbours of another.
    drop = set(
        n - 1
        for n, w in enumerate(words)
        if n > 1 and w[1] and words[n - 2][1] and words[n - 1][0].isspace())
    if drop:
        words = [w for n, w in enumerate(words) if n not in drop]

    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            paderr = len(encoded_string) % 4
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = base64mime.decode(encoded_string)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            else:
                decoded_words.append((word, charset))
        else:
            # ecre only admits "q" or "b", so this branch signals a
            # programming error rather than bad input.
            raise AssertionError('Unexpected encoding: ' + encoding)

    # Now collapse consecutive runs of similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if last_word is None:
            last_word = word
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = word
            last_charset = charset
        elif last_charset is None:
            # Two unencoded runs in a row come from different physical
            # lines; rejoin them with a single space.
            last_word += BSPACE + word
        else:
            last_word += word
    collapsed.append((last_word, last_charset))
    return collapsed
135+
0 commit comments