Skip to content

Commit 7467fa4

Browse files
committed
Tweaked the author extraction code to handle company names in the author list on the first page when those names contain a comma, for instance 'Foo Bar, Inc'.
- Legacy-Id: 4781
1 parent c90a26c commit 7467fa4

1 file changed

Lines changed: 33 additions & 19 deletions

File tree

ietf/utils/draft.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -450,8 +450,7 @@ def extract_authors(self):
450450
r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$",
451451
]
452452
companyformats = [
453-
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Inc\.?))$",
454-
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Ltd\.?))$",
453+
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?(Inc|Ltd|AB|S\.A)\.?))$",
455454
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(/([A-Za-z'][-A-Za-z0-9.& ']+))+)$",
456455
r" {6}([a-z0-9.-]+)$",
457456
r" {6}(([A-Za-z'][-A-Za-z0-9.&']+)( [A-Za-z'][-A-Za-z0-9.&']+)*)$",
@@ -503,6 +502,7 @@ def dotexp(s):
503502
for line in self.lines[:30]:
504503
self._docheader += line+"\n"
505504
author_on_line = False
505+
company_on_line = False
506506
_debug( "**" + line)
507507
leading_space = len(re.findall("^ *", line)[0])
508508
line_len = len(line.rstrip())
@@ -526,7 +526,7 @@ def dotexp(s):
526526
for lineformat, authformat in multiauthformats:
527527
match = re.search(lineformat, line)
528528
if match:
529-
_debug("Multiauth format: '%s'" % lineformat)
529+
_debug("a. Multiauth format: '%s'" % lineformat)
530530
author_list = re.findall(authformat, line)
531531
authors += [ a[0] for a in author_list ]
532532
companies += [ None for a in author_list ]
@@ -540,22 +540,28 @@ def dotexp(s):
540540
for lineformat in authcompanyformats:
541541
match = re.search(lineformat, line)
542542
if match:
543-
_debug("Line format: '%s'" % lineformat)
544-
author = match.group("author")
545-
company = match.group("company")
546-
authors += [ author, '']
547-
companies += [ None, company ]
548-
#_debug("\nLine: " + line)
549-
#_debug("Format: " + authformat)
550-
_debug("Author: '%s'" % author)
551-
_debug("Company: '%s'" % company)
552-
author_on_line = True
553-
break
543+
_debug("b. Line format: '%s'" % lineformat)
544+
maybe_company = match.group("company").strip(" ,.")
545+
# is the putative company name just a partial name, i.e., a part
546+
# that commonly occurs after a comma as part of a company name,
547+
# as in "Foo Bar, Inc."? If so, skip; else assume there's a
548+
# company name after the comma.
549+
if not maybe_company in ["Inc", "Ltd", "S.A", "AG", "AB", "N.V", ]:
550+
author = match.group("author")
551+
company = match.group("company")
552+
authors += [ author, '']
553+
companies += [ None, company ]
554+
#_debug("\nLine: " + line)
555+
#_debug("Format: " + authformat)
556+
_debug("Author: '%s'" % author)
557+
_debug("Company: '%s'" % company)
558+
author_on_line = True
559+
break
554560
if not author_on_line:
555561
for authformat in authformats:
556562
match = re.search(authformat, line)
557563
if match:
558-
_debug("Auth format: '%s'" % authformat)
564+
_debug("c. Auth format: '%s'" % authformat)
559565
author = match.group(1)
560566
authors += [ author ]
561567
companies += [ None ]
@@ -568,10 +574,11 @@ def dotexp(s):
568574
for authformat in companyformats:
569575
match = re.search(authformat, line)
570576
if match:
571-
_debug("Auth format: '%s'" % authformat)
577+
_debug("d. Company format: '%s'" % authformat)
572578
company = match.group(1)
573579
authors += [ "" ]
574580
companies += [ company ]
581+
company_on_line = True
575582
#_debug("\nLine: " + line)
576583
#_debug("Format: " + authformat)
577584
_debug("Company: '%s'" % company)
@@ -582,7 +589,7 @@ def dotexp(s):
582589
companies += [ "" ]
583590
if line.strip() == "":
584591
if prev_blankline and authors:
585-
_debug("Breaking for having found consecutive blank lines after author name")
592+
_debug("Breaking, having found consecutive blank lines after author name")
586593
break
587594
if authors:
588595
have_blankline = True
@@ -592,7 +599,7 @@ def dotexp(s):
592599
if "draft-" in line:
593600
have_draftline = True
594601
if have_blankline and have_draftline:
595-
_debug("Breaking for having found both blank line and draft-name line")
602+
_debug("Breaking, having found both blank line and draft-name line")
596603
break
597604

598605
# remove trailing blank entries in the author list:
@@ -607,6 +614,8 @@ def dotexp(s):
607614
#companies = [ None if a else '' for a in authors ]
608615
#_debug("B:companies : %s" % str(companies))
609616
#find authors' addresses section if it exists
617+
_debug("B:authors : %s" % str(authors))
618+
610619
last_line = len(self.lines)-1
611620
address_section_pos = last_line/2
612621
for i in range(last_line/2,last_line):
@@ -990,7 +999,12 @@ def _output(docname, fields, outfile=sys.stdout):
990999
else:
9911000
if opt_attributes:
9921001
def outputkey(key, fields):
993-
outfile.write("%-24s: %s\n" % ( key, fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" )))
1002+
field = fields[key]
1003+
if "\n" in field:
1004+
field = "\n" + field.rstrip()
1005+
else:
1006+
field = field.strip()
1007+
outfile.write("%-24s: %s\n" % ( key, field.replace("\\", "\\\\" ).replace("'", "\\x27" )))
9941008
else:
9951009
def outputkey(key, fields):
9961010
outfile.write(" %s='%s'" % ( key.lower(), fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" ).replace("\n", "\\n")))

0 commit comments

Comments
 (0)