4040import sys
4141import time
4242
43- version = "0.19 "
43+ version = "0.21 "
4444program = os .path .basename (sys .argv [0 ])
4545progdir = os .path .dirname (sys .argv [0 ])
4646
@@ -126,7 +126,6 @@ def __init__(self, text):
126126
127127 self .rawlines = self .text .split ("\n " )
128128 self .lines , self .pages = self ._stripheaders ()
129-
130129 # Some things (such as the filename) have to be on the first page. If
131130 # we didn't get back a set of pages, only one single page with the
132131 # whole document, then we need to do an enforced page split in order
@@ -403,11 +402,33 @@ def extract_authors(self):
403402 r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$" ,
404403 ]
405404
405+ address_section = r"^ *([0-9]+\.)? *(Author|Editor)('s|s'|s|\(s\)) (Address|Addresses|Information)"
406+
406407 ignore = [
407408 "Standards Track" , "Current Practice" , "Internet Draft" , "Working Group" ,
408409 "No Affiliation" ,
409410 ]
410- # group 12 34 5 6
411+
412+ def make_authpat (hon , first , last , suffix ):
413+ def dotexp (s ):
414+ s = re .sub ("\. " , ".* " , s )
415+ s = re .sub ("\.$" , ".*" , s )
416+ return s
417+ first = dotexp (first )
418+ last = dotexp (last )
419+ if " " in first :
420+ # if there's a middle part, let it be optional
421+ first , middle = first .split (" " , 1 )
422+ first = "%s( +%s)?" % (first , middle )
423+ # Some Chinese names are shown with double-letter (Latin) abbreviated given names, rather than
424+ # a single-letter (Latin) abbreviation:
425+ first = re .sub ("^([A-Z])[A-Z]+\.\*" , r"\1[-\w]+" , first )
426+
427+ # permit insertion of middle names between first and last, and
428+ # add possible honorific and suffix information
429+ authpat = "(?:^| and )(?:%(hon)s ?)?(%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon" :hon , "first" :first , "last" :last , "suffix" :suffix ,}
430+ return authpat
431+
411432 authors = []
412433 author_info = []
413434 companies = []
@@ -416,7 +437,7 @@ def extract_authors(self):
416437 have_blankline = False
417438 have_draftline = False
418439 prev_blankline = False
419- for line in self .lines [:15 ]:
440+ for line in self .lines [:30 ]:
420441 #_debug( "**" + line)
421442 leading_space = len (re .findall ("^ *" , line )[0 ])
422443 line_len = len (line .rstrip ())
@@ -466,6 +487,15 @@ def extract_authors(self):
466487 if have_blankline and have_draftline :
467488 break
468489
490+ # Find the authors' addresses section, if it exists
491+ last_line = len (self .lines )- 1
492+ address_section_pos = last_line / 2
493+ for i in range (last_line / 2 ,last_line ):
494+ line = self .lines [i ]
495+ if re .search (address_section , line ):
496+ address_section_pos = i
497+ break
498+
469499 found_pos = []
470500 for i in range (len (authors )):
471501 _debug ("1: authors[%s]: %s" % (i , authors [i ]))
@@ -500,40 +530,32 @@ def extract_authors(self):
500530 author = "%s %s" % (firstname , casefixname )
501531 _debug ("\n Authors: " + str (authors ))
502532 _debug ("Author: " + author )
533+
503534 # Pattern for full author information search, based on first page author name:
504- authpat = author
505- # Permit expansion of first name
506- authpat = re .sub ("\. " , ".* " , authpat )
507- authpat = re .sub ("\.$" , ".*" , authpat )
508- # Permit insertsion of middle name or initial
509- authpat = re .sub (" " , "\S*( +[^ ]+)* +" , authpat )
510- # Permit expansion of double-name initials
511- if not "[A-Z]" in authpat :
512- authpat = re .sub ("-" , ".*?-" , authpat )
513- # Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
514- # a single-letter(latin) abbreviation:
515- authpat = re .sub ("^([A-Z])[A-Z]+\.\*" , r"\1[-\w]+" , authpat )
516- authpat = "(?:^| and )(?:%s ?)?(%s)( *\(.*\)|,( [A-Z][-A-Za-z0-9]*)?| %s| [A-Z][a-z]+)?" % (aux ["honor" ], authpat , aux ["suffix" ])
535+ authpat = make_authpat (aux ['honor' ], firstname , casefixname , aux ['suffix' ])
517536 _debug ("Authpat: " + authpat )
518537 start = 0
519538 col = None
520539 # Find start of author info for this author (if any).
521540 # Scan from the end of the file, looking for a match to authpat
522541 # Scan towards the front from the end of the file, looking for a match to authpat
523- for j in range (len ( self . lines ) - 1 , 15 , - 1 ):
542+ for j in range (last_line , address_section_pos , - 1 ):
524543 line = self .lines [j ]
544+ _debug ( "Line: " + line )
525545 forms = [ line ] + [ line .replace (short , longform [short ]) for short in longform if short in line ]
526546 for form in forms :
527547 try :
528548 if re .search (authpat , form .strip ()) and not j in found_pos :
549+ _debug ( "Match" )
550+
529551 start = j
530552 found_pos += [ start ]
531553 _debug ( " ==> start %s, normalized '%s'" % (start , form .strip ()))
532554 # The author info could be formatted in multiple columns...
533555 columns = re .split ("( +| and )" , form )
534- # _debug( "Columns:" + columns; sys.stdout.flush( ))
556+ # _debug( "Columns:" + str(columns ))
535557 # Find which column:
536- #_debug( "Col range:" + range(len(columns)); sys.stdout.flush( ))
558+ # _debug( "Col range:" + str( range(len(columns))))
537559
538560 cols = [ c for c in range (len (columns )) if re .search (authpat + r"( and |, |$)" , columns [c ].strip ()) ]
539561 if cols :
@@ -571,7 +593,7 @@ def extract_authors(self):
571593 if suffix :
572594 fullname = fullname + " " + suffix
573595 if not " " .join ([ n for n in names if n ]) == fullname :
574- _err ("Author tuple doesn't match text in draft: %s: %s %s " % (authors [i ], names , fullname ))
596+ _err ("Author tuple doesn't match text in draft: %s, %s" % (authors [i ], fullname ))
575597 authors [i ] = (fullname , first , middle , surname , suffix )
576598 #_debug( "Author: %s: %s" % (author_match, authors[author_match]))
577599 break
@@ -726,7 +748,6 @@ def get_refs(self):
726748 refs .sort ()
727749 return normrefs , rfcrefs , refs
728750
729-
730751# ----------------------------------------------------------------------
731752
732753def getmeta (fn ):
@@ -736,7 +757,7 @@ def getmeta(fn):
736757
737758 if " " in fn or not fn .endswith (".txt" ):
738759 _warn ("Skipping unexpected draft name: '%s'" % (fn ))
739- return
760+ return {}
740761
741762 if os .path .exists (fn ):
742763 filename = fn
0 commit comments