Moved unidecode_name from utils.text to person.name.

levkowetz · levkowetz · commit 2c1438c2401b · 2017-09-20T15:36:30.000Z
Modified UserFactory to use a new locale for each new user, instead of the
same locale for a whole test run.  This (almost) ensures the exercise of
code to deal with non-ascii names, something which would not happen if a
locale with ascii names was chosen at the start of a run.

Modified name.initials() to not use non-word characters as initials.

Modified unidecode_name() to do more normalization, to conform to the
conventions used in internet-drafts.

Added saving of the factory-boy random state in order to be able to re-run
a test suite with the same pseudo-random sequence as in a previous failed
run.

Fixed an issue with email formatting in test_api_submit_ok().

Modified the draft author extraction code to deal better with names with
embedded apostrophes.
 - Legacy-Id: 14141
diff --git a/ietf/nomcom/utils.py b/ietf/nomcom/utils.py
@@ -22,7 +22,7 @@
 from ietf.utils.pipe import pipe
 from ietf.utils.mail import send_mail_text, send_mail
 from ietf.utils.log import log
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 import debug                            # pyflakes:ignore
 
diff --git a/ietf/person/factories.py b/ietf/person/factories.py
@@ -13,20 +13,23 @@
 import debug                            # pyflakes:ignore
 
 from ietf.person.models import Person, Alias, Email
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 
 fake = faker.Factory.create()
 
+def random_faker():
+    return faker.Faker(random.sample(faker.config.AVAILABLE_LOCALES, 1)[0])
+
 class UserFactory(factory.DjangoModelFactory):
     class Meta:
         model = User
         django_get_or_create = ('username',)
-        exclude = ['locale', ]
+        exclude = ['faker', ]
 
-    locale = random.sample(faker.config.AVAILABLE_LOCALES, 1)[0]
-    first_name = factory.Faker('first_name', locale)
-    last_name = factory.Faker('last_name', locale)
+    faker = factory.LazyFunction(random_faker)
+    first_name = factory.LazyAttribute(lambda o: o.faker.first_name())
+    last_name = factory.LazyAttribute(lambda o: o.faker.last_name())
     email = factory.LazyAttributeSequence(lambda u, n: '%s.%s_%d@%s'%( slugify(unidecode(u.first_name)),
                                                 slugify(unidecode(u.last_name)), n, fake.domain_name()))
     username = factory.LazyAttribute(lambda u: u.email)
diff --git a/ietf/person/models.py b/ietf/person/models.py
@@ -20,7 +20,7 @@
 from ietf.utils.mail import send_mail_preformatted
 from ietf.utils.storage import NoLocationMigrationFileSystemStorage
 from ietf.utils.mail import formataddr
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 
 class PersonInfo(models.Model):
diff --git a/ietf/person/name.py b/ietf/person/name.py
@@ -1,7 +1,12 @@
 import re
+import unidecode
 
 import debug                            # pyflakes:ignore
 
+
+def name_particle_match(name):
+    return re.search(r" (af|al|Al|de|der|di|Di|du|el|El|Hadi|in 't|Le|st\.?|St\.?|ten|ter|van|van der|Van|von|von der|Von|zu) ", name)
+
 def name_parts(name):
     prefix, first, middle, last, suffix = u"", u"", u"", u"", u""
 
@@ -36,7 +41,7 @@ def name_parts(name):
             full = full.lower()         # adjust case for all-uppercase input
         # This is an incomplete list.  Adjust as needed to handle known ietf
         # participant names correctly:
-        particle = re.search(r" (af|al|Al|de|der|di|Di|du|el|El|Hadi|in 't|Le|st\.?|St\.?|ten|ter|van|van der|Van|von|von der|Von|zu) ", full)
+        particle = name_particle_match(full)
         if particle:
             pos = particle.start()
             parts = full[:pos].split() + [full[pos+1:]]
@@ -52,19 +57,63 @@ def name_parts(name):
     else:
         last = parts[0]
     return prefix, first, middle, last, suffix
-    
+
 def initials(name):
     prefix, first, middle, last, suffix = name_parts(name)
     given = first
     if middle:
         given += u" "+middle
-    initials = u" ".join([ n[0]+'.' for n in given.split() ])
+    # Don't use non-word characters as initials.
+    # Example: The Bulgarian transcribed name "'Rnest Balkanska" should not have an initial of "'".
+    given = re.sub('[^ .\w]', '', given)
+    initials = u" ".join([ n[0].upper()+'.' for n in given.split() ])
     return initials
 
 def plain_name(name):
     prefix, first, middle, last, suffix = name_parts(name)
     return u" ".join([first, last])
 
+def capfirst(s):
+    # Capitalize the first alphabetic character, skipping any leading
+    # non-alphabetic characters and leaving all other characters untouched:
+    letters = list(s)
+    for i,l in enumerate(letters):
+        if l.isalpha():
+            letters[i] = l.capitalize()
+            break
+    return ''.join(letters)
+
+def unidecode_name(uname):
+    """
+    unidecode() of cjk ideograms can produce strings which contain spaces.
+    Strip leading and trailing spaces, and reduce double-spaces to single.
+
+    For some other ranges, unidecode returns all-lowercase names; fix these
+    up with capitalization.
+    """
+    # Transliterate; if that changed the name, fix double spacing (else return it as-is)
+    name = unidecode.unidecode(uname)
+    if name == uname:
+        return name
+    name = name.strip().replace('  ', ' ')
+    # Fix all-upper and all-lower names:
+    # Check for name particles -- don't capitalize those
+    m = name_particle_match(name)
+    particle = m.group(1) if m else None
+    # Get the name parts
+    prefix, first, middle, last, suffix = name_parts(name)
+    # Capitalize names
+    first = capfirst(first)
+    middle = ' '.join([ capfirst(p) for p in middle.split() ])
+    last   = ' '.join([ capfirst(p) for p in last.split() ])
+    # Restore the particle, if any
+    if particle and last.startswith(capfirst(particle)+' '):
+        last = ' '.join([ particle, last[len(particle)+1:] ])
+    # Recombine the parts
+    parts = prefix, first, middle, last, suffix
+    name = ' '.join([ p for p in parts if p and p.strip() != '' ])
+    return name
+
 if __name__ == "__main__":
     import sys
     name = u" ".join(sys.argv[1:])
diff --git a/ietf/review/import_from_review_tool.py b/ietf/review/import_from_review_tool.py
@@ -25,7 +25,7 @@
 from ietf.doc.models import Document, DocAlias, ReviewRequestDocEvent, NewRevisionDocEvent, DocTypeName, State
 from ietf.utils.text import strip_prefix, xslugify
 from ietf.review.utils import possibly_advance_next_reviewer_for_team
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 parser = argparse.ArgumentParser()
 parser.add_argument("database", help="database must be included in settings")
diff --git a/ietf/settings.py b/ietf/settings.py
@@ -920,6 +920,9 @@ def skip_unreadable_post(record):
 
 STATS_NAMES_LIMIT = 25
 
+UTILS_TEST_RANDOM_STATE_FILE = '.factoryboy_random_state'
+
+
 # Put the production SECRET_KEY in settings_local.py, and also any other
 # sensitive or site-specific changes.  DO NOT commit settings_local.py to svn.
 from settings_local import *            # pyflakes:ignore pylint: disable=wildcard-import
diff --git a/ietf/stats/utils.py b/ietf/stats/utils.py
@@ -8,7 +8,7 @@
 from ietf.stats.models import AffiliationAlias, AffiliationIgnoredEnding, CountryAlias, MeetingRegistration
 from ietf.name.models import CountryName
 from ietf.person.models import Person, Email, Alias
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 
 def compile_affiliation_ending_stripping_regexp():
diff --git a/ietf/submit/tests.py b/ietf/submit/tests.py
@@ -1588,7 +1588,7 @@ def test_api_submit_bad_method(self):
 
     def test_api_submit_ok(self):
         r, author, name = self.post_submission('00')
-        expected = "Upload of %s OK, confirmation requests sent to:\n  %s" % (name, author.formatted_email())
+        expected = "Upload of %s OK, confirmation requests sent to:\n  %s" % (name, author.formatted_email().replace('\n',''))
         self.assertContains(r, expected, status_code=200)
 
     def test_api_submit_no_user(self):
diff --git a/ietf/submit/utils.py b/ietf/submit/utils.py
@@ -30,7 +30,7 @@
 from ietf.utils.accesstoken import generate_random_key
 from ietf.utils.draft import Draft
 from ietf.utils.mail import is_valid_email
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 
 def validate_submission(submission):
diff --git a/ietf/utils/draft.py b/ietf/utils/draft.py
@@ -509,8 +509,8 @@ def extract_authors(self):
             "honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
             "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
             "suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
-            "first" : r"([A-Z][-A-Za-z]*)(( ?\([A-Z][-A-Za-z]*\))?(\.?[- ]{1,2}[A-Za-z]+)*)",
-            "last"  : r"([-A-Za-z']{2,})",
+            "first" : r"([A-Z][-A-Za-z'`]*)(( ?\([A-Z][-A-Za-z'`]*\))?(\.?[- ]{1,2}[A-Za-z'`]+)*)",
+            "last"  : r"([-A-Za-z'`]{2,})",
             "months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
             "mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
             }
@@ -575,7 +575,7 @@ def dotexp(s):
 
             # permit insertion of middle names between first and last, and
             # add possible honorific and suffix information
-            authpat = r"(?:^| and )(?:%(hon)s ?)?(%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
+            authpat = r"(?:^| and )(?:%(hon)s ?)?([`']?%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
             return authpat
 
         authors = []
diff --git a/ietf/utils/test_data.py b/ietf/utils/test_data.py
@@ -20,7 +20,7 @@
 from ietf.person.models import Person, Email
 from ietf.group.utils import setup_default_community_list_for_group
 from ietf.review.models import (ReviewRequest, ReviewerSettings, ReviewResultName, ReviewTypeName, ReviewTeamSettings )
-from ietf.utils.text import unidecode_name
+from ietf.person.name import unidecode_name
 
 
 def create_person(group, role_name, name=None, username=None, email_address=None, password=None, is_staff=False, is_superuser=False):
diff --git a/ietf/utils/test_runner.py b/ietf/utils/test_runner.py
@@ -45,6 +45,7 @@
 import codecs
 import gzip
 import unittest
+import factory.random
 from fnmatch import fnmatch
 
 from coverage.report import Reporter
@@ -557,6 +558,18 @@ def setup_test_environment(self, **kwargs):
 
         maybe_create_svn_symlinks(settings)
 
+        if os.path.exists(settings.UTILS_TEST_RANDOM_STATE_FILE):
+            print "     Loading factory-boy random state from .random-state"
+            with open(settings.UTILS_TEST_RANDOM_STATE_FILE) as f:
+                s = json.load(f)
+                s[1] = tuple(s[1])      # random.setstate() won't accept a list in lieu of a tuple
+                factory.random.set_random_state(s)
+        else:
+            print "     Saving factory-boy random state to .random-state"
+            with open(settings.UTILS_TEST_RANDOM_STATE_FILE, 'w') as f:
+                s = factory.random.get_random_state()
+                json.dump(s, f)
+
         super(IetfTestRunner, self).setup_test_environment(**kwargs)
 
     def teardown_test_environment(self, **kwargs):
@@ -683,4 +696,7 @@ def run_tests(self, test_labels, extra_tests=[], **kwargs):
 
         save_test_results(failures, test_labels)
 
+        if not failures and os.path.exists(settings.UTILS_TEST_RANDOM_STATE_FILE):
+            os.unlink(settings.UTILS_TEST_RANDOM_STATE_FILE)
+
         return failures
diff --git a/ietf/utils/text.py b/ietf/utils/text.py
@@ -4,7 +4,6 @@
 import textwrap
 import types
 import unicodedata
-import unidecode
 
 from django.utils.functional import allow_lazy
 from django.utils import six
@@ -125,11 +124,3 @@ def isascii(text):
         return True
     except UnicodeEncodeError:
         return False
-        
-def unidecode_name(name):
-    """
-    unidecode() of cjk ideograms can produce strings which contain spaces.
-    Strip leading and trailing spaces, and reduce double-spaces to single.
-    """
-    return unidecode.unidecode(name).strip().replace('  ', ' ')
-