merging author parsing in plain parser. Closes ietf-tools#585.

jminuscula · jminuscula · commit 020e7f89ff7c · 2011-02-08T10:42:09.000Z
- Legacy-Id: 2820
diff --git a/ietf/submit/parsers/base.py b/ietf/submit/parsers/base.py
@@ -9,6 +9,7 @@ class MetaDataDraft(object):
     revision = None
     filename = None
     group = None
+    authors = None
 
 
 class ParseInfo(object):
@@ -51,6 +52,7 @@ def parse(self):
                     method()
         if self.parsed_info.errors:
             return self.parsed_info
+        return self.parsed_info
 
     def parse_critical_000_invalid_chars_in_filename(self):
         name = self.fd.name
diff --git a/ietf/submit/parsers/plain_parser.py b/ietf/submit/parsers/plain_parser.py
@@ -68,3 +68,268 @@ def parse_critical_003_wg(self):
                     self.parsed_info.add_error('Invalid WG ID: %s' % group_acronym)
             else:
                 self.parsed_info.metadraft.wg = IETFWG.objects.get(pk=NONE_WG_PK)
+
+    def parse_critical_authors(self):
+        """
+        comes from http://svn.tools.ietf.org/svn/tools/ietfdb/branch/idsubmit/ietf/utils/draft.py
+        """
+        # Builds self.parsed_info.metadraft.authors: first-page names, each with "<email>" appended when one is found.
+        def _stripheaders(rawlines):  # returns (stripped, pages)
+            stripped = []  # body lines: matched header/footer lines dropped, blank runs collapsed to one ""
+            pages = []  # text of each completed page, lines joined with "\n"
+            page = []  # lines of the page currently being accumulated
+            line = ""
+            debug = False  # not referenced again in this method
+            newpage = False
+            sentence = False
+            haveblank = False
+
+            def endpage(pages, page, line):
+                if line:
+                    page += [ line ]
+                return begpage(pages, page)
+            def begpage(pages, page, line=None):
+                if page and len(page) > 5:  # pages of 5 lines or fewer are not flushed
+                    pages += [ "\n".join(page) ]
+                    page = []
+                    newpage = True  # NOTE(review): local to begpage; the loop's own newpage flag is not changed by this
+                if line:
+                    page += [ line ]
+                return pages, page
+
+            for line in rawlines:
+                line = line.rstrip()
+                if re.search("\[?[Pp]age [0-9ivx]+\]?[ \t\f]*$", line, re.I):
+                    pages, page = endpage(pages, page, line)  # footer such as "[Page 3]": line is kept in the page, not in stripped
+                    continue
+                if re.search("\f", line, re.I):
+                    pages, page = begpage(pages, page)  # form feed: the line itself is discarded
+                    continue
+                if re.search("^ *Internet.Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
+                    pages, page = begpage(pages, page, line)
+                    continue
+                if re.search("^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
+                    pages, page = begpage(pages, page, line)
+                    continue
+                if re.search("^RFC[ -]?[0-9]+.*(  +)[12][0-9][0-9][0-9]$", line, re.I):
+                    pages, page = begpage(pages, page, line)
+                    continue
+                if re.search("^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
+                    pages, page = endpage(pages, page, line)
+                    continue
+                if re.search(".{60,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
+                    pages, page = begpage(pages, page, line)
+                    continue
+                if newpage and re.search("^ *draft-[-a-z0-9_.]+ *$", line, re.I):
+                    pages, page = begpage(pages, page, line)
+                    continue
+                if re.search("^[^ \t]+", line):
+                    sentence = True  # line starts with a non-blank character
+                if re.search("[^ \t]", line):  # line has visible content
+                    if newpage:
+                        if sentence:
+                            stripped += [""]
+                    else:
+                        if haveblank:
+                            stripped += [""]  # one "" stands in for the blank line(s) seen since the last content line
+                    haveblank = False
+                    sentence = False
+                    newpage = False
+                if re.search("[.:]$", line):
+                    sentence = True  # line ends with "." or ":"
+                if re.search("^[ \t]*$", line):
+                    haveblank = True
+                    page += [ line ]
+                    continue
+                page += [ line ]
+                stripped += [ line ]
+            pages, page = begpage(pages, page)  # flush whatever is left as the last page
+            return stripped, pages
+            
+        self.fd.file.seek(0)
+        raw_lines = self.fd.file.read().split("\n")
+        draft_lines, draft_pages = _stripheaders(raw_lines)  # draft_pages is not used below
+        # Short given name -> long form; used below to try alternative spellings of candidate lines.
+        longform = {
+            "Beth": "Elizabeth",
+            "Bill": "William",
+            "Bob": "Robert",
+            "Dick": "Richard",
+            "Fred": "Alfred",
+            "Jerry": "Gerald",
+            "Liz": "Elizabeth",
+            "Lynn": "Carolyn",
+            "Ned": "Edward" ,
+            "Ted":"Edward",
+        }
+        aux = {  # regex fragments interpolated into authformats ("honor" is also used in authpat)
+            "honor" : r"(?:Dr\.?|Prof(?:\.?|essor)|Sir|Lady|Dame)",
+            "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von)",
+            "suffix": r"(jr|II|2nd|III|3rd|IV|4th)",
+            "first" : r"([A-Z][-A-Za-z]*)((\.?[- ]{1,2}[A-Za-z]+)*)",
+            "last"  : r"([-A-Za-z']{2,})",
+        }
+        authformats = [  # line layouts tried in order: "First Last", "Last, First", bare last name
+            r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)([, ]?(.+\.?|\(.+\.?|\)))?$" % aux,
+            r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)([, ]([Ee]d\.?|\([Ee]d\.?\)))?$" % aux,
+            r" {6}(%(last)s)$" % aux,
+        ]
+
+        authors = []
+        companies = []
+
+        # Collect first-page author information first (only the first 15 stripped lines are examined)
+        have_blankline = False
+        have_draftline = False
+        prev_blankline = False
+        for line in draft_lines[:15]:
+            leading_space = len(re.findall("^ *", line)[0])
+            line_len = len(line.rstrip())
+            trailing_space = line_len <= 72 and 72 - line_len or 0  # 72 - line_len when the line fits in 72 columns, otherwise 0
+            # Truncate long lines at the first space past column 80:
+            trunc_space = line.find(" ", 80)
+            if line_len > 80 and  trunc_space > -1:
+                line = line[:trunc_space]
+            if line_len > 60:  # only lines longer than 60 characters are tested against authformats
+                # Look for centered title, break if found:
+                if (leading_space > 5 and abs(leading_space - trailing_space) < 5):
+                    break
+                for authformat in authformats:
+                    match = re.search(authformat, line)
+                    if match:
+                        author = match.group(1)
+                        authors += [ author ]  # appended once per matching format
+            if line.strip() == "":
+                if prev_blankline:
+                    break
+                have_blankline = True
+                prev_blankline = True
+            else:
+                prev_blankline = False
+            if "draft-" in line:
+                have_draftline = True
+            if have_blankline and have_draftline:
+                break
+        # Second pass: look for each collected name again later in the draft and pick up an e-mail address.
+        found_pos = []  # (line index, column index) pairs already assigned to an author
+        for i in range(len(authors)):
+            author = authors[i]
+            if author == None:
+                continue
+            if "," in author:
+                last, first = author.split(",",1)
+                author = "%s %s" % (first.strip(), last.strip())
+            if not " " in author:
+                if "." in author:
+                    first, last = author.rsplit(".", 1)
+                    first += "."
+                else:
+                    author = "[A-Z].+ " + author  # bare last name: the first-name part becomes a regex fragment
+                    first, last = author.rsplit(" ", 1)
+            else:
+                first, last = author.rsplit(" ", 1)
+
+            for author in [ "%s %s"%(first,last), "%s %s"%(last,first), ]:
+                # Pattern for full author information search, based on first page author name:
+                authpat = author
+                # Permit expansion of first name
+                authpat = re.sub("\. ", ".* ", authpat)
+                authpat = re.sub("\.$", ".*", authpat)
+                # Permit insertion of middle name or initial
+                authpat = re.sub(" ", "\S*( +[^ ]+)* +", authpat)
+                # Permit expansion of double-name initials
+                authpat = re.sub("-", ".*?-", authpat)
+                # Some Chinese names are shown with double-letter (Latin) abbreviated given names, rather than
+                # a single-letter (Latin) abbreviation:
+                authpat = re.sub("^([A-Z])[A-Z]+\.\*", r"\1[-\w]+", authpat) 
+                authpat = "^(?:%s ?)?(%s)( *\(.*\)|,( [A-Z][-A-Za-z0-9]*)?)?" % (aux["honor"], authpat)
+                start = 0
+                col = None
+
+                # Find start of author info for this author (if any).
+                # Scan from the end of the file down to index 16, looking for a match to authpat
+                try:
+                    for j in range(len(draft_lines)-1, 15, -1):
+                        line = draft_lines[j].strip()
+                        forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
+                        for line in forms:
+                            if re.search(authpat, line):
+                                start = j
+                                columns = re.split("(    +)", line)  # 4+ spaces separate columns; the capturing group keeps the separators
+                                # Find which column:
+                                cols = [ c for c in range(len(columns)) if re.search(authpat+r"$", columns[c].strip()) ]
+                                if cols:
+                                    col = cols[0]
+                                    if not (start, col) in found_pos:
+                                        found_pos += [ (start, col) ]
+                                        beg = len("".join(columns[:col]))  # NOTE(review): offset within the stripped line; applied to unstripped lines below
+                                        if col == len(columns) or col == len(columns)-1:
+                                            end = None
+                                        else:
+                                            end = beg + len("".join(columns[col:col+2]))
+                                        author = re.search(authpat, columns[col].strip()).group(1)
+                                        if author in companies:
+                                            authors[i] = None
+                                        else:
+                                            authors[i] = author
+
+                                        raise StopIteration("Found Author")  # used only to leave the nested loops
+                except StopIteration:
+                    pass
+                if start and col != None:
+                    break
+            if not authors[i]:
+                continue
+
+            if start and col != None:
+                done = False  # not referenced again in this method
+                count = 0
+                keyword = False  # not referenced again in this method
+                blanklines = 0
+                for line in draft_lines[start+1:]:
+                    # Break on the third blank line
+                    if not line:
+                        blanklines += 1
+                        if blanklines >= 3:
+                            break
+                        else:
+                            continue
+                    else:
+                        count += 1
+                    authmatch = [ a for a in authors[i+1:] if a and not a in companies and re.search((r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"), line.strip()) ]
+                    if authmatch:
+                        if count == 1 or (count == 2 and not blanklines):
+                            # First line after an author -- this is a company
+                            companies += authmatch
+                            companies += [ line.strip() ] # XXX fix this for columnized author list
+                            companies = list(set(companies))
+                            for k in range(i+1, len(authors)):
+                                if authors[k] in companies:
+                                    authors[k] = None
+                        elif not "@" in line:
+                            break
+                        else:
+                            pass
+
+                    try:
+                        column = line[beg:end].strip()
+                    except:  # NOTE(review): bare except; presumably guards against beg/end being unset -- confirm
+                        column = line
+                    column = re.sub(" *\(at\) *", "@", column)  # undo "(at)" / "(dot)" obfuscation
+                    column = re.sub(" *\(dot\) *", ".", column)
+
+                    emailmatch = re.search("[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
+                    if emailmatch and not "@" in authors[i]:  # only the first address found is attached
+                        email = emailmatch.group(0).lower()
+                        authors[i] = "%s <%s>" % (authors[i], email)
+            else:
+                authors[i] = None  # no position was found for this name later in the draft
+
+        authors = [ re.sub(r" +"," ", a) for a in authors if a != None ]  # drop None entries, squeeze repeated spaces
+        if authors:
+            authors.sort()
+            self.parsed_info.metadraft.authors = authors
+        else:
+            self.parsed_info.errors.append("Draft authors could not be found.")
+
+        return authors