Skip to content

Commit 4558595

Browse files
committed
Added support for reverse-order names (family name first, e.g., Japanese and Chinese names) with an uppercase family name in the draft submission author extraction.
- Legacy-Id: 4949
1 parent 61ad24d commit 4558595

1 file changed

Lines changed: 90 additions & 75 deletions

File tree

ietf/utils/draft.py

Lines changed: 90 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
import sys
4141
import time
4242

43-
version = "0.27"
43+
version = "0.28"
4444
program = os.path.basename(sys.argv[0])
4545
progdir = os.path.dirname(sys.argv[0])
4646

@@ -649,92 +649,107 @@ def dotexp(s):
649649
first, last = author.rsplit(" ", 1)
650650
else:
651651
first, last = author.rsplit(" ", 1)
652+
if "." in first and not ". " in first:
653+
first = first.replace(".", ". ").strip()
654+
652655
prefix_match = re.search(" %(prefix)s$" % aux, first)
653656
if prefix_match:
654657
prefix = prefix_match.group(1)
655658
first = first[:-len(prefix)].strip()
656659
last = prefix+" "+last
657660
_debug("First, Last: '%s' '%s'" % (first, last))
658661
for firstname, surname, casefixname in [ (first,last,last), (last,first,first), (first,last,last.upper()), (last,first,first.upper()), ]:
659-
author = "%s %s" % (firstname, casefixname)
660-
_debug("\nAuthors: "+str(authors))
661-
_debug("Author: "+author)
662-
663-
# Pattern for full author information search, based on first page author name:
664-
authpat = make_authpat(aux['honor'], firstname, casefixname, aux['suffix'])
665-
_debug("Authpat: " + authpat)
666-
start = 0
667-
col = None
668-
# Find start of author info for this author (if any).
669-
# Scan towards the front from the end of the file, looking for a match to authpath
670-
for j in range(last_line, address_section_pos, -1):
671-
line = self.lines[j]
672-
_debug( "Line: " + line)
673-
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
674-
for form in forms:
675-
try:
676-
if re.search(authpat, form.strip()) and not j in found_pos:
677-
_debug( "Match")
678-
679-
start = j
680-
found_pos += [ start ]
681-
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
682-
# The author info could be formatted in multiple columns...
683-
columns = re.split("( +| and )", form)
684-
# _debug( "Columns:" + str(columns))
685-
# Find which column:
686-
# _debug( "Col range:" + str(range(len(columns))))
687-
688-
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
689-
if cols:
690-
col = cols[0]
691-
if not (start, col) in found_pos:
692-
found_pos += [ (start, col) ]
693-
_debug( "Col: %d" % col)
694-
beg = len("".join(columns[:col]))
695-
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
696-
_debug( "Len: %d" % len(columns))
697-
if col == len(columns) or col == len(columns)-1:
698-
end = None
699-
_debug( "End1: %s" % end)
700-
else:
701-
end = beg + len("".join(columns[col:col+2]))
702-
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
703-
_debug( "Cut: '%s'" % form[beg:end])
704-
author_match = re.search(authpat, columns[col].strip()).group(1)
705-
_debug( "AuthMatch: '%s'" % (author_match,))
706-
if author_match in companies_seen:
707-
companies[i] = authors[i]
708-
authors[i] = None
709-
else:
710-
if casefixname in author_match:
711-
fullname = author_match.replace(casefixname, surname)
662+
for left, right in [(firstname, casefixname), (casefixname, firstname)]:
663+
author = "%s %s" % (left, right)
664+
_debug("\nAuthors: "+str(authors))
665+
_debug("Author: "+author)
666+
667+
# Pattern for full author information search, based on first page author name:
668+
authpat = make_authpat(aux['honor'], left, right, aux['suffix'])
669+
_debug("Authpat: " + authpat)
670+
start = 0
671+
col = None
672+
# Find start of author info for this author (if any).
673+
# Scan towards the front from the end of the file, looking for a match to authpath
674+
for j in range(last_line, address_section_pos, -1):
675+
line = self.lines[j]
676+
_debug( "Line: " + line)
677+
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
678+
for form in forms:
679+
try:
680+
if re.search(authpat, form.strip()) and not j in found_pos:
681+
_debug( "Match")
682+
683+
start = j
684+
found_pos += [ start ]
685+
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
686+
# The author info could be formatted in multiple columns...
687+
columns = re.split("( +| and )", form)
688+
# _debug( "Columns:" + str(columns))
689+
# Find which column:
690+
# _debug( "Col range:" + str(range(len(columns))))
691+
692+
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
693+
if cols:
694+
col = cols[0]
695+
if not (start, col) in found_pos:
696+
found_pos += [ (start, col) ]
697+
_debug( "Col: %d" % col)
698+
beg = len("".join(columns[:col]))
699+
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
700+
_debug( "Len: %d" % len(columns))
701+
if col == len(columns) or col == len(columns)-1:
702+
end = None
703+
_debug( "End1: %s" % end)
712704
else:
713-
fullname = author_match
714-
fullname = re.sub(" +", " ", fullname)
715-
given_names, surname = fullname.rsplit(None, 1)
716-
if " " in given_names:
717-
first, middle = given_names.split(None, 1)
705+
end = beg + len("".join(columns[col:col+2]))
706+
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
707+
_debug( "Cut: '%s'" % form[beg:end])
708+
author_match = re.search(authpat, columns[col].strip()).group(1)
709+
_debug( "AuthMatch: '%s'" % (author_match,))
710+
if re.search('\(.*\)$', author_match.strip()):
711+
author_match = author_match.rsplit('(',1)[0].strip()
712+
if author_match in companies_seen:
713+
companies[i] = authors[i]
714+
authors[i] = None
718715
else:
719-
first = given_names
720-
middle = None
721-
names = (first, middle, surname, suffix)
722-
if suffix:
723-
fullname = fullname+" "+suffix
724-
if not " ".join([ n for n in names if n ]) == fullname:
725-
_err("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
726-
authors[i] = (fullname, first, middle, surname, suffix)
727-
companies[i] = None
728-
#_debug( "Author: %s: %s" % (author_match, authors[author_match]))
729-
break
730-
except AssertionError, e:
731-
sys.stderr.write("filename: "+self.filename+"\n")
732-
sys.stderr.write("authpat: "+authpat+"\n")
733-
raise
716+
fullname = author_match
717+
#if casefixname in author_match:
718+
# fullname = author_match.replace(casefixname, surname)
719+
#else:
720+
# fullname = author_match
721+
fullname = re.sub(" +", " ", fullname)
722+
if left == firstname:
723+
given_names, surname = fullname.rsplit(None, 1)
724+
else:
725+
surname, given_names = fullname.split(None, 1)
726+
if " " in given_names:
727+
first, middle = given_names.split(None, 1)
728+
else:
729+
first = given_names
730+
middle = None
731+
names = (first, middle, surname, suffix)
732+
if suffix:
733+
fullname = fullname+" "+suffix
734+
parts = [ n for n in names if n ]
735+
revpt = [ n for n in names if n ]
736+
revpt.reverse()
737+
if not ((" ".join(parts) == fullname) or (" ".join(revpt) == fullname)):
738+
_err("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
739+
authors[i] = (fullname, first, middle, surname, suffix)
740+
companies[i] = None
741+
break
742+
except AssertionError, e:
743+
sys.stderr.write("filename: "+self.filename+"\n")
744+
sys.stderr.write("authpat: "+authpat+"\n")
745+
raise
746+
if start and col != None:
747+
break
734748
if start and col != None:
735749
break
736750
if start and col != None:
737751
break
752+
# End for:
738753
if not authors[i]:
739754
continue
740755
_debug("2: authors[%s]: %s" % (i, authors[i]))
@@ -763,7 +778,7 @@ def dotexp(s):
763778
# for a in authors:
764779
# if a and a not in companies_seen:
765780
# _debug("Search for: %s"%(r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"))
766-
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
781+
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub("[. ]+", ".* ", a)+"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
767782
if authmatch:
768783
_debug(" ? Other author or company ? : %s" % authmatch)
769784
_debug(" Line: "+line.strip())

0 commit comments

Comments
 (0)