datatracker/ietf/person/name.py at dev/7.43.1.dev0 · https-github-com-bit/datatracker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Copyright The IETF Trust 2011-2020, All Rights Reserved
# -*- coding: utf-8 -*-


import re
import unidecode

import debug                            # pyflakes:ignore


def name_particle_match(name):
    return re.search(r" (af|al|Al|de|De|der|di|Di|du|el|El|Hadi|in 't|Le|st\.?|St\.?|ten|ter|van|van der|van 't|Van|von|von der|Von|zu) ", name)

def name_parts(name):
    prefix, first, middle, last, suffix = "", "", "", "", ""

    if not name.strip():
        return prefix, first, middle, last, suffix

    # if we got a name on the form "Some Name (Foo Bar)", get rid of
    # the paranthesized part
    name_with_paren_match = re.search(r"^([^(]+)\s*\(.*\)$", name)
    if name_with_paren_match:
        name = name_with_paren_match.group(1)

    parts = name.split()
    if len(parts) > 2 and parts[0] in ["M", "M.", "Sri", ] and "." not in parts[1]:
        prefix = parts[0];
        parts = parts[1:]
    prefix = []
    while len(parts) > 1 and parts[0] in ["Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr",
        "Dr.", "Doctor", "Prof", "Prof.", "Professor", "Sir", "Lady", "Dame",
        "Gen.", "Col.", "Maj.", "Capt.", "Lieut.", "Lt.", "Cmdr.", "Col.", ]:
        prefix.append(parts[0])
        parts = parts[1:]
    prefix = " ".join(prefix)
    if len(parts) > 2:
        if parts[-1] in ["Jr", "Jr.", "II", "2nd", "III", "3rd", "Ph.D."]:
            suffix = parts[-1]
            parts = parts[:-1]
    if len(parts) > 2:
        # Check if we have a surname with nobiliary particle
        full = " ".join(parts)
        if full.upper() == full:
            full = full.lower()         # adjust case for all-uppercase input
        # This is an incomplete list.  Adjust as needed to handle known ietf
        # participant names correctly:
        particle = name_particle_match(full)
        if particle:
            pos = particle.start()
            parts = full[:pos].split() + [full[pos+1:]]
    if len(parts) > 2:
        first = parts[0]
        last = parts[-1]
        middle = " ".join(parts[1:-1])
    elif len(parts) == 2:
        first, last = parts
    else:
        last = parts[0]
    if len(parts) >= 2:
        # Handle reverse-order names with uppercase surname correctly
        if len(first)>1 and re.search("^[A-Z-]+$", first):
            first, last = last, first.capitalize()
    # Handle exception for RFC Editor
    if (prefix, first, middle, last, suffix) == ('', 'Editor', '', 'Rfc', ''):
        first = 'RFC'
        last = 'Editor'
    return prefix, first, middle, last, suffix

def initials(name):
    prefix, first, middle, last, suffix = name_parts(name)
    given = first
    if middle:
        given += " "+middle
    # Don't use non-word characters as initials.
    # Example: The Bulgarian transcribed name "'Rnest Balkanska" should not have an initial of "'".
    given = re.sub(r'[^ .\w]', '', given)
    initials = " ".join([ n[0].upper()+'.' for n in given.split() ])
    return initials

def plain_name(name):
    prefix, first, middle, last, suffix = name_parts(name)
    return " ".join( n for n in (first, last) if n)

def capfirst(s):
    # Capitalize the first word character, skipping non-word characters and
    # leaving following word characters untouched:
    letters = list(s)
    for i,l in enumerate(letters):
        if l.isalpha():
            letters[i] = l.capitalize()
            break
    return ''.join(letters)

def unidecode_name(uname):
    """
    unidecode() of cjk ideograms can produce strings which contain spaces.
    Strip leading and trailing spaces, and reduce double-spaces to single.

    For some other ranges, unidecode returns all-lowercase names; fix these
    up with capitalization.
    """
    # Fix double spacing
    name = unidecode.unidecode(uname)
    if name == uname:
        return name
    name = re.sub('  +', ' ', name.strip().replace('@', '').replace('"', ''))
    name = re.sub(r'(\w)\.(\w)', r'\1\2', name)
    # Fix all-upper and all-lower names:
    # Check for name particles -- don't capitalize those
    m = name_particle_match(name)
    particle = m.group(1) if m else None
    # Get the name parts
    prefix, first, middle, last, suffix = name_parts(name)
    # Capitalize names
    first = first.title()
    middle = ' '.join([ capfirst(p) for p in middle.split() ])
    last   = ' '.join([ capfirst(p) for p in last.split() ])
    if len(last) == 1:
        last = (last+last).capitalize()
    # Restore the particle, if any
    if particle and last.startswith(capfirst(particle)+' '):
        last = ' '.join([ particle, last[len(particle)+1:] ])
    # Recombine the parts
    parts = prefix, first, middle, last, suffix
    name = ' '.join([ p for p in parts if p and p.strip() != '' ])
    name = re.sub('  +', ' ', name)
    return name

def normalize_name(s):
    # There is probably more to be done here, but we start by normalising
    # spaces:
    s = re.sub('  +', ' ', s)
    return s

if __name__ == "__main__":
    import sys
    name = " ".join(sys.argv[1:])
    print(name_parts(name))
    print(initials(name))