Skip to content

Commit 020e7f8

Browse files
committed
merging author parsing in plain parser. Closes ietf-tools#585.
- Legacy-Id: 2820
1 parent c0f0d2c commit 020e7f8

2 files changed

Lines changed: 267 additions & 0 deletions

File tree

ietf/submit/parsers/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class MetaDataDraft(object):
99
revision = None
1010
filename = None
1111
group = None
12+
authors = None
1213

1314

1415
class ParseInfo(object):
@@ -51,6 +52,7 @@ def parse(self):
5152
method()
5253
if self.parsed_info.errors:
5354
return self.parsed_info
55+
return self.parsed_info
5456

5557
def parse_critical_000_invalid_chars_in_filename(self):
5658
name = self.fd.name

ietf/submit/parsers/plain_parser.py

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,268 @@ def parse_critical_003_wg(self):
6868
self.parsed_info.add_error('Invalid WG ID: %s' % group_acronym)
6969
else:
7070
self.parsed_info.metadraft.wg = IETFWG.objects.get(pk=NONE_WG_PK)
71+
72+
def parse_critical_authors(self):
    """
    Extract the draft's author list and store it on the parsed metadata.

    The algorithm comes from
    http://svn.tools.ietf.org/svn/tools/ietfdb/branch/idsubmit/ietf/utils/draft.py

    Steps:

    1. Read the whole of ``self.fd.file`` and drop page headers/footers.
    2. Collect candidate names from the right-hand column of the first
       lines of the document (these candidates also include company names).
    3. For each candidate, scan the document backwards (never looking at
       the first 16 lines) for a line that starts with the name; that line
       is taken as the start of the author's address block.
    4. Walk the lines below it to weed out company names and to pick up an
       e-mail address.  Candidates without an address block are dropped.

    Side effects: on success ``self.parsed_info.metadraft.authors`` is set
    to the sorted list; otherwise an error is registered through
    ``self.parsed_info.add_error``.

    Returns the sorted list of author strings, each either ``"Name"`` or
    ``"Name <email>"``; the list is empty when no author was found.
    """

    def _stripheaders(rawlines):
        """
        Split ``rawlines`` into body text and pages.

        Returns ``(stripped, pages)``: ``stripped`` is the list of body
        lines without page headers/footers and with runs of blank lines
        collapsed to a single empty string; ``pages`` is a list holding
        the text of every page longer than five lines.
        """
        stripped = []
        pages = []
        page = []
        newpage = False
        sentence = False
        haveblank = False

        # ``newpage`` is passed in and handed back explicitly: assigning it
        # inside a nested function would only create a local variable and
        # leave the flag used by the main loop below untouched.
        def begpage(pages, page, newpage, line=None):
            if page and len(page) > 5:
                pages += ["\n".join(page)]
                page = []
                newpage = True
            if line:
                page += [line]
            return pages, page, newpage

        def endpage(pages, page, newpage, line):
            if line:
                page += [line]
            return begpage(pages, page, newpage)

        for line in rawlines:
            line = line.rstrip()
            # Page footer ("... [Page 3]").
            if re.search(r"\[?[Pp]age [0-9ivx]+\]?[ \t\f]*$", line, re.I):
                pages, page, newpage = endpage(pages, page, newpage, line)
                continue
            # Form feed.
            if re.search(r"\f", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage)
                continue
            # Running page headers in their various layouts.
            if re.search(r"^ *Internet.Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if re.search(r"^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if re.search(r"^RFC[ -]?[0-9]+.*( +)[12][0-9][0-9][0-9]$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            # Footer starting with the draft name and ending in a year.
            if re.search(r"^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
                pages, page, newpage = endpage(pages, page, newpage, line)
                continue
            if re.search(r".{60,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if newpage and re.search(r"^ *draft-[-a-z0-9_.]+ *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            # A line starting in the first column counts as a new sentence.
            if re.search(r"^[^ \t]+", line):
                sentence = True
            if re.search(r"[^ \t]", line):
                # First text line after a gap: emit one separating blank
                # line, but across a page break only when a sentence ended.
                if newpage:
                    if sentence:
                        stripped += [""]
                else:
                    if haveblank:
                        stripped += [""]
                haveblank = False
                sentence = False
                newpage = False
            if re.search(r"[.:]$", line):
                sentence = True
            if re.search(r"^[ \t]*$", line):
                # Blank lines are remembered, not copied to ``stripped``.
                haveblank = True
                page += [line]
                continue
            page += [line]
            stripped += [line]
        pages, page, newpage = begpage(pages, page, newpage)
        return stripped, pages

    self.fd.file.seek(0)
    raw_lines = self.fd.file.read().split("\n")
    draft_lines, _draft_pages = _stripheaders(raw_lines)

    # Nicknames that may appear on the first page, mapped to the formal
    # name that may be used in the address section.
    longform = {
        "Beth": "Elizabeth",
        "Bill": "William",
        "Bob": "Robert",
        "Dick": "Richard",
        "Fred": "Alfred",
        "Jerry": "Gerald",
        "Liz": "Elizabeth",
        "Lynn": "Carolyn",
        "Ned": "Edward",
        "Ted": "Edward",
    }
    aux = {
        "honor": r"(?:Dr\.?|Prof(?:\.?|essor)|Sir|Lady|Dame)",
        "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von)",
        "suffix": r"(jr|II|2nd|III|3rd|IV|4th)",
        "first": r"([A-Z][-A-Za-z]*)((\.?[- ]{1,2}[A-Za-z]+)*)",
        "last": r"([-A-Za-z']{2,})",
    }
    # First-page layouts: "First Last", "Last, First" and a bare "Last",
    # each preceded by at least six spaces.
    authformats = [
        r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)([, ]?(.+\.?|\(.+\.?|\)))?$" % aux,
        r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)([, ]([Ee]d\.?|\([Ee]d\.?\)))?$" % aux,
        r" {6}(%(last)s)$" % aux,
    ]
    # Side-by-side address columns are separated by a wide gap.  Splitting
    # on single spaces would break every multi-word name apart so that no
    # column could ever match an author pattern.
    column_sep = r"( {4,})"

    authors = []
    companies = set()

    # Collect first-page author information first
    have_blankline = False
    have_draftline = False
    prev_blankline = False
    for line in draft_lines[:15]:
        leading_space = len(line) - len(line.lstrip(" "))
        line_len = len(line.rstrip())
        trailing_space = max(72 - line_len, 0)
        # Truncate long lines at the first space past column 80:
        trunc_space = line.find(" ", 80)
        if line_len > 80 and trunc_space > -1:
            line = line[:trunc_space]
        if line_len > 60:
            # Look for centered title, break if found:
            if leading_space > 5 and abs(leading_space - trailing_space) < 5:
                break
            for authformat in authformats:
                match = re.search(authformat, line)
                if match:
                    authors.append(match.group(1))
        if line.strip() == "":
            if prev_blankline:
                break
            have_blankline = True
            prev_blankline = True
        else:
            prev_blankline = False
        if "draft-" in line:
            have_draftline = True
        if have_blankline and have_draftline:
            break

    found_pos = []
    # Column slice of the most recently located address block.  A string
    # slice cannot raise, so no exception handling is needed around it.
    beg, end = None, None
    for i in range(len(authors)):
        author = authors[i]
        # Entries are set to None once they turn out to be company names.
        if author is None:
            continue
        if "," in author:
            last, first = author.split(",", 1)
            author = "%s %s" % (first.strip(), last.strip())
        if " " not in author:
            if "." in author:
                first, last = author.rsplit(".", 1)
                first += "."
            else:
                # Only a last name is known: accept any capitalised first name.
                author = "[A-Z].+ " + author
                first, last = author.rsplit(" ", 1)
        else:
            first, last = author.rsplit(" ", 1)

        start = 0
        col = None
        for candidate in ("%s %s" % (first, last), "%s %s" % (last, first)):
            # Pattern for full author information search, based on first page author name:
            authpat = candidate
            # Permit expansion of first name
            authpat = re.sub(r"\. ", ".* ", authpat)
            authpat = re.sub(r"\.$", ".*", authpat)
            # Permit insertion of middle name or initial.  Plain string
            # replacement is used because the inserted text contains a
            # backslash escape that re.sub() would reject as a template.
            authpat = authpat.replace(" ", r"\S*( +[^ ]+)* +")
            # Permit expansion of double-name initials
            authpat = authpat.replace("-", ".*?-")
            # Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
            # a single-letter(latin) abbreviation:
            authpat = re.sub(r"^([A-Z])[A-Z]+\.\*", lambda m: m.group(1) + r"[-\w]+", authpat)
            authpat = r"^(?:%s ?)?(%s)( *\(.*\)|,( [A-Z][-A-Za-z0-9]*)?)?" % (aux["honor"], authpat)
            start = 0
            col = None

            # Find start of author info for this author (if any).
            # Scan from the end of the file, looking for a match to authpat
            found = False
            for j in range(len(draft_lines) - 1, 15, -1):
                line = draft_lines[j].strip()
                forms = [line] + [line.replace(short, longform[short]) for short in longform if short in line]
                for form in forms:
                    if not re.search(authpat, form):
                        continue
                    start = j
                    columns = re.split(column_sep, form)
                    # Find which column:
                    cols = [c for c in range(len(columns)) if re.search(authpat + r"$", columns[c].strip())]
                    if not cols:
                        continue
                    col = cols[0]
                    # A position already claimed by an earlier author is
                    # skipped; the scan goes on towards the document start.
                    if (start, col) in found_pos:
                        continue
                    found_pos.append((start, col))
                    beg = len("".join(columns[:col]))
                    if col >= len(columns) - 1:
                        end = None
                    else:
                        end = beg + len("".join(columns[col:col + 2]))
                    fullname = re.search(authpat, columns[col].strip()).group(1)
                    authors[i] = None if fullname in companies else fullname
                    found = True
                    break
                if found:
                    break
            if start and col is not None:
                break
        if not authors[i]:
            continue

        if start and col is not None:
            count = 0
            blanklines = 0
            for line in draft_lines[start + 1:]:
                # Break on the third blank line
                if not line:
                    blanklines += 1
                    if blanklines >= 3:
                        break
                    continue
                count += 1
                text = line.strip()
                authmatch = [a for a in authors[i + 1:] if a and a not in companies and re.search(r"(^|\W)" + re.sub(r"\.? ", ".* ", a) + r"(\W|$)", text)]
                if authmatch:
                    if count == 1 or (count == 2 and not blanklines):
                        # First line after an author -- this is a company
                        companies.update(authmatch)
                        companies.add(text)  # XXX fix this for columnized author list
                        for k in range(i + 1, len(authors)):
                            if authors[k] in companies:
                                authors[k] = None
                    elif "@" not in line:
                        # Reached the next author's name: stop here.
                        break

                column = line[beg:end].strip()
                column = re.sub(r" *\(at\) *", "@", column)
                column = re.sub(r" *\(dot\) *", ".", column)

                emailmatch = re.search(r"[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
                if emailmatch and "@" not in authors[i]:
                    email = emailmatch.group(0).lower()
                    authors[i] = "%s <%s>" % (authors[i], email)
        else:
            # No address block found: not treated as an author.
            authors[i] = None

    authors = [re.sub(r" +", " ", a) for a in authors if a is not None]
    if authors:
        authors.sort()
        self.parsed_info.metadraft.authors = authors
    else:
        self.parsed_info.add_error("Draft authors could not be found.")

    return authors

0 commit comments

Comments
 (0)