Skip to content

Commit 5861cef

Browse files
author
Richard Jones
committed
Allow customisation of regular expressions used in email parsing...
...thanks Bruno Damour
1 parent d4324c5 commit 5861cef

File tree

4 files changed

+33
-13
lines changed

4 files changed

+33
-13
lines changed

CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ Feature:
66
- Dates can now be in the year-range 1-9999 except for metakit which is
77
still limited to 1970-2038.
88
- Add simple anti-spam recipe to docs
9+
- Allow customisation of regular expressions used in email parsing, thanks
10+
Bruno Damour
911

1012
Fixed:
1113
- Handling of unset Link search in RDBMS backend

doc/index.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ Roch'e Compaan,
8181
Wil Cooley,
8282
Joe Cooper,
8383
Kelley Dagley,
84+
Bruno Damour,
8485
Toby Dickenson,
8586
Paul F. Dubois,
8687
Eric Earnst,

roundup/configuration.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Roundup Issue Tracker configuration support
22
#
3-
# $Id: configuration.py,v 1.40 2007-02-19 20:27:53 a1s Exp $
3+
# $Id: configuration.py,v 1.41 2007-03-26 04:04:42 richard Exp $
44
#
55
__docformat__ = "restructuredtext"
66

@@ -653,6 +653,20 @@ def str2value(self, value):
653653
"will match an issue for the interval after the issue's\n"
654654
"creation or last activity. The interval is a standard\n"
655655
"Roundup interval."),
656+
(Option, "refwd_re", "\s*\W?\s*(fw|fwd|re|aw|sv|ang)\W\s*",
657+
"Regular expression matching a single reply or forward\n"
658+
"prefix prepended by the mailer. This is explicitly\n"
659+
"stripped from the subject during parsing."),
660+
(Option, "origmsg_re", "^[>|\s]*-----\s?Original Message\s?-----$",
661+
"Regular expression matching start of an original message\n"
662+
"if quoted the in body."),
663+
(Option, "sign_re", "^[>|\s]*-- ?$",
664+
"Regular expression matching the start of a signature\n"
665+
"in the message body."),
666+
(Option, "eol_re", r"[\r\n]+",
667+
"Regular expression matching end of line."),
668+
(Option, "blankline_re", r"[\r\n]+\s*[\r\n]+",
669+
"Regular expression matching a blank line."),
656670
), "Roundup Mail Gateway options"),
657671
("nosy", (
658672
(RunDetectorOption, "messages_to_author", "no",

roundup/mailgw.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf-8 -*-
12
#
23
# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
34
# This module is free software, and you may redistribute it and/or modify
@@ -72,7 +73,7 @@ class node. Any parts of other types are each stored in separate files
7273
an exception, the original message is bounced back to the sender with the
7374
explanatory message given in the exception.
7475
75-
$Id: mailgw.py,v 1.184 2007-02-15 03:09:53 richard Exp $
76+
$Id: mailgw.py,v 1.185 2007-03-26 04:04:42 richard Exp $
7677
"""
7778
__docformat__ = 'restructuredtext'
7879

@@ -635,11 +636,14 @@ def handle_message(self, message):
635636
'argswhole'])
636637

637638
# Look for Re: et al. Used later on for MAILGW_SUBJECT_CONTENT_MATCH
638-
re_re = r'''(?P<refwd>(\s*\W?\s*(fw|fwd|re|aw|sv|ang)\W)+)\s*'''
639-
m = re.match(re_re, tmpsubject, re.IGNORECASE|re.VERBOSE)
639+
refwd_re = config['MAILGW_REFWD_RE'].decode('iso8859-1')
640+
re_re = r'''(?P<refwd>%s)*\s*''' % refwd_re
641+
m = re.match(re_re, tmpsubject, re.IGNORECASE|re.VERBOSE|re.UNICODE)
640642
if m:
641-
matches.update(m.groupdict())
642-
tmpsubject = tmpsubject[len(matches['refwd']):] # Consume Re:
643+
m = m.groupdict()
644+
if m['refwd']:
645+
matches.update(m)
646+
tmpsubject = tmpsubject[len(m['refwd']):] # Consume Re:
643647

644648
# Look for Leading "
645649
m = re.match(r'(?P<quote>\s*")', tmpsubject,
@@ -1005,10 +1009,14 @@ def handle_message(self, message):
10051009
# figure how much we should muck around with the email body
10061010
keep_citations = config['MAILGW_KEEP_QUOTED_TEXT']
10071011
keep_body = config['MAILGW_LEAVE_BODY_UNCHANGED']
1012+
blank_line = re.compile(r'%s' % config['MAILGW_BLANKLINE_RE'])
1013+
eol = re.compile(r'%s' % config['MAILGW_EOL_RE'])
1014+
signature = re.compile(r'%s' % config['MAILGW_SIGN_RE'])
1015+
original_msg = re.compile(r'%s' % config['MAILGW_ORIGMSG_RE'])
10081016

10091017
# parse the body of the message, stripping out bits as appropriate
10101018
summary, content = parseContent(content, keep_citations,
1011-
keep_body)
1019+
keep_body, blank_line, eol, signature, original_msg)
10121020
content = content.strip()
10131021

10141022
#
@@ -1209,12 +1217,7 @@ def uidFromAddress(db, address, create=1, **user_props):
12091217
else:
12101218
return 0
12111219

1212-
1213-
def parseContent(content, keep_citations, keep_body,
1214-
blank_line=re.compile(r'[\r\n]+\s*[\r\n]+'),
1215-
eol=re.compile(r'[\r\n]+'),
1216-
signature=re.compile(r'^[>|\s]*-- ?$'),
1217-
original_msg=re.compile(r'^[>|\s]*-----\s?Original Message\s?-----$')):
1220+
def parseContent(content, keep_citations, keep_body, blank_line, eol, signature, original_msg):
12181221
''' The message body is divided into sections by blank lines.
12191222
Sections where the second and all subsequent lines begin with a ">"
12201223
or "|" character are considered "quoting sections". The first line of

0 commit comments

Comments
 (0)