applied unicode patch

Andrey Lebedev · Andrey Lebedev · commit 397cc6f1ed60 · 2003-01-15T22:17:20.000Z
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -15,6 +15,9 @@ are given with the most recent entry first.
 - fix StringHTMLProperty hyperlinking
 - added mysql backend
 - fixes to CGI form handling (NEEDS BACKPORTING TO 0.5)
+- applied unicode patch. All data is stored in utf-8. Incoming messages are
+  converted from any encoding to utf-8, outgoing messages are encoded
+  according to rfc2822 (sf bug 568873)
 
 
 2003-??-?? 0.5.5
diff --git a/roundup/backends/back_anydbm.py b/roundup/backends/back_anydbm.py
@@ -15,7 +15,7 @@
 # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
 # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 # 
-#$Id: back_anydbm.py,v 1.96 2003-01-08 05:39:40 richard Exp $
+#$Id: back_anydbm.py,v 1.97 2003-01-15 22:17:19 kedder Exp $
 '''
 This module defines a backend that saves the hyperdatabase in a database
 chosen by anydbm. It is guaranteed to always be available in python
@@ -847,7 +847,7 @@ def create(self, **propvalues):
                             (self.classname, newid, key))
 
             elif isinstance(prop, String):
-                if type(value) != type(''):
+                if type(value) != type('') and type(value) != type(u''):
                     raise TypeError, 'new property "%s" not a string'%key
 
             elif isinstance(prop, Password):
@@ -1244,7 +1244,7 @@ class or a KeyError is raised.
                     journalvalues[propname] = tuple(l)
 
             elif isinstance(prop, String):
-                if value is not None and type(value) != type(''):
+                if value is not None and type(value) != type('') and type(value) != type(u''):
                     raise TypeError, 'new property "%s" not a string'%propname
 
             elif isinstance(prop, Password):
diff --git a/roundup/backends/rdbms_common.py b/roundup/backends/rdbms_common.py
@@ -1,4 +1,4 @@
-# $Id: rdbms_common.py,v 1.28 2003-01-12 23:53:20 richard Exp $
+# $Id: rdbms_common.py,v 1.29 2003-01-15 22:17:19 kedder Exp $
 ''' Relational database (SQL) backend common code.
 
 Basics:
@@ -1070,7 +1070,7 @@ def create(self, **propvalues):
                             (self.classname, newid, key))
 
             elif isinstance(prop, String):
-                if type(value) != type(''):
+                if type(value) != type('') and type(value) != type(u''):
                     raise TypeError, 'new property "%s" not a string'%key
 
             elif isinstance(prop, Password):
@@ -1432,7 +1432,7 @@ class or a KeyError is raised.
                     journalvalues[propname] = tuple(l)
 
             elif isinstance(prop, String):
-                if value is not None and type(value) != type(''):
+                if value is not None and type(value) != type('') and type(value) != type(u''):
                     raise TypeError, 'new property "%s" not a string'%propname
 
             elif isinstance(prop, Password):
diff --git a/roundup/mailgw.py b/roundup/mailgw.py
@@ -73,14 +73,16 @@ class node. Any parts of other types are each stored in separate files
 an exception, the original message is bounced back to the sender with the
 explanatory message given in the exception. 
 
-$Id: mailgw.py,v 1.106 2003-01-12 00:03:10 richard Exp $
+$Id: mailgw.py,v 1.107 2003-01-15 22:17:19 kedder Exp $
 '''
 
 import string, re, os, mimetools, cStringIO, smtplib, socket, binascii, quopri
 import time, random, sys
 import traceback, MimeWriter
 import hyperdb, date, password
 
+import rfc2822
+
 SENDMAILDEBUG = os.environ.get('SENDMAILDEBUG', '')
 
 class MailGWError(ValueError):
@@ -134,6 +136,10 @@ def getPart(self):
         s.seek(0)
         return Message(s)
 
+    def getheader(self, name, default=None):  # header value decoded to utf-8
+        hdr = mimetools.Message.getheader(self, name, default)
+        return hdr and rfc2822.decode_header(hdr)  # falsy value/default kept as is
+ 
 subject_re = re.compile(r'(?P<refwd>\s*\W?\s*(fw|fwd|re|aw)\W\s*)*'
     r'\s*(?P<quote>")?(\[(?P<classname>[^\d\s]+)(?P<nodeid>\d+)?\])?'
     r'\s*(?P<title>[^[]+)?"?(\[(?P<args>.+?)\])?', re.I)
@@ -339,7 +345,7 @@ def bounce_message(self, message, sendto, error,
         writer.addheader('MIME-Version', '1.0')
         part = writer.startmultipartbody('mixed')
         part = writer.nextpart()
-        body = part.startbody('text/plain')
+        body = part.startbody('text/plain; charset=utf-8')
         body.write('\n'.join(error))
 
         # attach the original message to the returned message
@@ -377,7 +383,19 @@ def get_part_data_decoded(self,part):
         else:
             # take it as text
             data = part.fp.read()
-        return data
+        
+        # Encode message to unicode
+        charset = rfc2822.unaliasCharset(part.getparam("charset"))
+        if charset:
+            # Do conversion only if charset specified
+            edata = unicode(data, charset).encode('utf-8')
+            # Convert from dos eol to unix
+            edata = edata.replace('\r\n', '\n')
+        else:
+            # Leave message content as is
+            edata = data
+                
+        return edata
 
     def handle_message(self, message):
         ''' message - a Message instance
diff --git a/roundup/rfc2822.py b/roundup/rfc2822.py
@@ -0,0 +1,160 @@
+import re
+from binascii import b2a_base64, a2b_base64
+
+ecre = re.compile(r'''
+  =\?                   # literal =?
+  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
+  \?                    # literal ?
+  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
+  \?                    # literal ?
+  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
+  \?=                   # literal ?=
+  ''', re.VERBOSE | re.IGNORECASE)  # one "=?charset?q-or-b?text?=" encoded word
+
+hqre = re.compile(r'^[-a-zA-Z0-9!*+/\[\]., ]+$')  # text encode_header() keeps as is
+
+def base64_decode(s, convert_eols=None):
+    """Decode a raw base64 string; an empty value is returned unchanged.
+
+    If convert_eols is set to a string value, all canonical email linefeeds,
+    e.g. "\\r\\n", in the decoded text will be converted to the value of
+    convert_eols.  os.linesep is a good choice for convert_eols if you are
+    decoding a text attachment.
+
+    This function does not parse a full MIME header value encoded with
+    base64 (like =?iso-8859-1?b?bmloISBuaWgh?=) -- please use the high
+    level email.Header class for that functionality.
+
+    Taken from 'email' module
+    """
+    if not s:
+        return s
+
+    dec = a2b_base64(s)
+    if convert_eols:
+        return dec.replace('\r\n', convert_eols)  # no CRLF constant exists here
+    return dec
+
+def unquote_match(match):
+    """Map a regex match on text like =AB to the character with code 0xAB.
+
+    Taken from 'email' module
+    """
+    hex_digits = match.group(0)[1:3]
+    return chr(int(hex_digits, 16))
+
+def qp_decode(s):
+    """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
+
+    "_" becomes a space and each "=XX" (two hex digits) becomes the character
+    with that code; any other "=" sequence is malformed and is left as is.
+    This does not parse a full header value (like
+    =?iso-8859-1?q?Hello_World?=). Taken from 'email' module.
+    """
+    s = s.replace('_', ' ')
+    # only real hex pairs: \w{2} also matched e.g. "=zz" and made int() raise
+    return re.sub(r'=[0-9a-fA-F]{2}', unquote_match, s)
+
+def _decode_header(header):
+    """Decode a message header value without converting charset.
+
+    Returns a list of (decoded_string, charset) pairs containing each of the
+    decoded parts of the header.  Charset is None for non-encoded parts of the
+    header, otherwise a lower-case string containing the name of the character
+    set specified in the encoded string.
+
+    Taken from 'email' module
+    """
+    # If nothing in the header is encoded, return it whole as one plain part
+    header = str(header)
+    if not ecre.search(header):
+        return [(header, None)]
+
+    decoded = []
+    dec = ''
+    for line in header.splitlines():
+        # This line might not have an encoding in it
+        if not ecre.search(line):
+            decoded.append((line, None))
+            continue
+
+        parts = ecre.split(line)  # [plain, charset, encoding, text, plain, ...]
+        while parts:
+            unenc = parts.pop(0)
+            if unenc:
+                if unenc.strip():  # whitespace-only plain text is dropped
+                    decoded.append((unenc, None))
+            if parts:
+                charset, encoding = [s.lower() for s in parts[0:2]]
+                encoded = parts[2]
+                dec = ''
+                if encoding == 'q':
+                    dec = qp_decode(encoded)
+                elif encoding == 'b':
+                    dec = base64_decode(encoded)
+                else:
+                    dec = encoded
+                # a piece in the same charset as the previous pair is appended to it
+                if decoded and decoded[-1][1] == charset:
+                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
+                else:
+                    decoded.append((dec, charset))
+            del parts[0:3]  # discard the charset/encoding/text triple just handled
+    return decoded
+
+def decode_header(hdr):
+    """ Decode an encoded header and return it as a utf-8 encoded string.
+    An empty or missing header gives None; undecodable bytes are replaced.
+    """
+    if not hdr:
+        return None
+    # parts that declare no charset are read as iso-8859-1
+    chunks = [unicode(text, unaliasCharset(cs) or 'iso-8859-1', 'replace')
+              for text, cs in _decode_header(hdr)]
+    return u"".join(chunks).encode('utf-8')
+
+def encode_header(header):
+    """ Return header unchanged if it is empty or made only of hqre
+    characters, else as one "=?utf-8?q?...?=" encoded word (space -> "_",
+    hqre characters verbatim, any other byte -> "=XX").  Unicode input is
+    converted to utf-8 first so the escapes match the declared charset.
+    """
+    # Return empty headers unchanged
+    if not header:
+        return header
+
+    # return the plain header if it needs no encoding
+    if hqre.match(header):
+        return header
+
+    charset = 'utf-8'
+    if type(header) == type(u''):
+        # ord() must see utf-8 bytes, not code points, to match the charset
+        header = header.encode(charset)
+    quoted = []
+    for c in header:
+        if c == ' ':
+            # Space may be represented as _ instead of =20 for readability
+            quoted.append('_')
+        elif hqre.match(c):
+            # These characters can be included verbatim
+            quoted.append(c)
+        else:
+            # Otherwise, replace with hex value like =E2
+            quoted.append("=%02X" % ord(c))
+    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
+
+def unaliasCharset(charset):
+    """Lower-case charset and turn "windows-" into "cp"; None if empty."""
+    if not charset:
+        return None
+    return charset.lower().replace("windows-", 'cp')
+
+def test():
+    """Smoke test: print encode_header() output for a plain ASCII header."""
+    print encode_header("Contrary, Mary")
+
+if __name__ == '__main__':
+    test()
+
+# vim: et
diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py
@@ -15,7 +15,7 @@
 # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
 # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 # 
-# $Id: roundupdb.py,v 1.77 2003-01-14 22:19:27 richard Exp $
+# $Id: roundupdb.py,v 1.78 2003-01-15 22:17:19 kedder Exp $
 
 __doc__ = """
 Extending hyperdb with types specific to issue-tracking.
@@ -24,6 +24,9 @@
 import re, os, smtplib, socket, time, random
 import MimeWriter, cStringIO
 import base64, quopri, mimetypes
+
+from rfc2822 import encode_header
+
 # if available, use the 'email' module, otherwise fallback to 'rfc822'
 try :
     from email.Utils import formataddr as straddr
@@ -243,9 +246,10 @@ def send_message(self, nodeid, msgid, note, sendto, from_address=None):
         # create the message
         message = cStringIO.StringIO()
         writer = MimeWriter.MimeWriter(message)
-        writer.addheader('Subject', '[%s%s] %s'%(cn, nodeid, title))
+        writer.addheader('Subject', '[%s%s] %s'%(cn, nodeid, encode_header(title)))
         writer.addheader('To', ', '.join(sendto))
-        writer.addheader('From', straddr((authname + from_tag, from_address)))
+        writer.addheader('From', straddr((encode_header(authname) + 
+            from_tag, from_address)))
         writer.addheader('Reply-To', straddr((self.db.config.TRACKER_NAME,
             from_address)))
         writer.addheader('Date', time.strftime("%a, %d %b %Y %H:%M:%S +0000",
@@ -267,7 +271,7 @@ def send_message(self, nodeid, msgid, note, sendto, from_address=None):
             part = writer.startmultipartbody('mixed')
             part = writer.nextpart()
             part.addheader('Content-Transfer-Encoding', 'quoted-printable')
-            body = part.startbody('text/plain')
+            body = part.startbody('text/plain; charset=utf-8')
             body.write(content_encoded)
             for fileid in message_files:
                 name = files.get(fileid, 'name')
@@ -295,7 +299,7 @@ def send_message(self, nodeid, msgid, note, sendto, from_address=None):
             writer.lastpart()
         else:
             writer.addheader('Content-Transfer-Encoding', 'quoted-printable')
-            body = writer.startbody('text/plain')
+            body = writer.startbody('text/plain; charset=utf-8')
             body.write(content_encoded)
 
         # now try to send the message
diff --git a/roundup/templates/classic/html/_generic.help b/roundup/templates/classic/html/_generic.help
@@ -1,6 +1,7 @@
 <html>
 <head>
 <link rel="stylesheet" type="text/css" href="_file/style.css">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8;">
 </head>
 <body class="body" marginwidth="0" marginheight="0">
 
diff --git a/roundup/templates/classic/html/page b/roundup/templates/classic/html/page
@@ -1,6 +1,7 @@
 <html metal:define-macro="icing">
 <head>
 <title metal:define-slot="head_title">title goes here</title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8;">
 
 <link rel="stylesheet" type="text/css" href="_file/style.css">
 
diff --git a/roundup/templates/minimal/html/_generic.help b/roundup/templates/minimal/html/_generic.help
@@ -1,5 +1,6 @@
 <html>
 <head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8;">
 <link rel="stylesheet" type="text/css" href="_file/style.css">
 </head>
 <body class="body" marginwidth="0" marginheight="0">
diff --git a/roundup/templates/minimal/html/page b/roundup/templates/minimal/html/page
@@ -1,6 +1,7 @@
 <html metal:define-macro="icing">
 <head>
 <title metal:define-slot="head_title">title goes here</title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8;">
 
 <link rel="stylesheet" type="text/css" href="_file/style.css">
 
diff --git a/test/test_mailgw.py b/test/test_mailgw.py