forked from adamlaska/datatracker
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathname.py
More file actions
141 lines (124 loc) · 4.96 KB
/
name.py
File metadata and controls
141 lines (124 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Copyright The IETF Trust 2011-2020, All Rights Reserved
# -*- coding: utf-8 -*-
import re
import unidecode
import debug # pyflakes:ignore
def name_particle_match(name):
    """
    Search *name* for a surname particle taken from a fixed list.

    The particle has to be surrounded by a single space on each side, so a
    particle at the very start or end of the string is not matched.

    Returns the ``re`` match object for the first hit (group 1 holds the
    particle without the surrounding spaces), or None when nothing matches.
    """
    # This is an incomplete list.  Alternatives are tried in the order given.
    particle_pattern = r" (af|al|Al|de|De|der|di|Di|du|el|El|Hadi|in 't|Le|st\.?|St\.?|ten|ter|van|van der|van 't|Van|von|von der|Von|zu) "
    return re.compile(particle_pattern).search(name)
def name_parts(name):
    """
    Split a personal name into ``(prefix, first, middle, last, suffix)``.

    All five elements are strings; elements that could not be determined are
    empty strings.  The steps are, in order:

    * a trailing parenthesized part ("Some Name (Foo Bar)") is discarded;
    * leading titles ("M.", "Sri", "Dr.", "Prof.", ...) are collected into
      the prefix, space-separated;
    * a trailing "Jr.", "III", "Ph.D." etc. becomes the suffix;
    * a surname particle found by name_particle_match() is kept together
      with the words that follow it as the last name;
    * a leading all-uppercase word of more than one character is treated as
      a surname written first, and is swapped with the last word.

    A blank name, or one with only whitespace in front of the parenthesized
    part, yields five empty strings.
    """
    prefix, first, middle, last, suffix = "", "", "", "", ""

    if not name.strip():
        return prefix, first, middle, last, suffix

    # If we got a name of the form "Some Name (Foo Bar)", get rid of the
    # parenthesized part.
    name_with_paren_match = re.search(r"^([^(]+)\s*\(.*\)$", name)
    if name_with_paren_match:
        name = name_with_paren_match.group(1)

    parts = name.split()
    if not parts:
        # Only whitespace preceded the parenthesized part (e.g. " (Foo)"),
        # so there is no name left to split.
        return prefix, first, middle, last, suffix

    # Collect every leading title into one list so that none of them is lost.
    prefixes = []
    # "M", "M." and "Sri" only count as a title when the following word has
    # no dot in it and at least two more words follow.
    if len(parts) > 2 and parts[0] in ("M", "M.", "Sri") and "." not in parts[1]:
        prefixes.append(parts[0])
        parts = parts[1:]
    titles = (
        "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr",
        "Dr.", "Doctor", "Prof", "Prof.", "Professor", "Sir", "Lady", "Dame",
        "Gen.", "Col.", "Maj.", "Capt.", "Lieut.", "Lt.", "Cmdr.",
    )
    # Always leave at least one word behind to serve as the name itself.
    while len(parts) > 1 and parts[0] in titles:
        prefixes.append(parts[0])
        parts = parts[1:]
    prefix = " ".join(prefixes)

    if len(parts) > 2 and parts[-1] in ("Jr", "Jr.", "II", "2nd", "III", "3rd", "Ph.D."):
        suffix = parts[-1]
        parts = parts[:-1]

    if len(parts) > 2:
        # Check if we have a surname with nobiliary particle
        full = " ".join(parts)
        if full.upper() == full:
            full = full.lower()         # adjust case for all-uppercase input
        particle = name_particle_match(full)
        if particle:
            # The match starts at the space in front of the particle; all
            # text after that space becomes a single (last name) element.
            pos = particle.start()
            parts = full[:pos].split() + [full[pos+1:]]

    if len(parts) > 2:
        first = parts[0]
        last = parts[-1]
        middle = " ".join(parts[1:-1])
    elif len(parts) == 2:
        first, last = parts
    else:
        last = parts[0]

    if len(parts) >= 2:
        # Handle reverse-order names with uppercase surname correctly
        if len(first) > 1 and re.search("^[A-Z-]+$", first):
            first, last = last, first.capitalize()

    # Handle exception for RFC Editor
    if (prefix, first, middle, last, suffix) == ('', 'Editor', '', 'Rfc', ''):
        first = 'RFC'
        last = 'Editor'

    return prefix, first, middle, last, suffix
def initials(name):
    """
    Return the initials of the given (first and middle) names in *name*.

    The name is split with name_parts(); each word of the first and middle
    names contributes its first character, uppercased and followed by a dot.
    The initials are joined with single spaces.
    """
    _prefix, first, middle, _last, _suffix = name_parts(name)
    given = first + " " + middle if middle else first
    # Don't use non-word characters as initials.
    # Example: The Bulgarian transcribed name "'Rnest Balkanska" should not have an initial of "'".
    given = re.sub(r'[^ .\w]', '', given)
    return " ".join(word[0].upper() + '.' for word in given.split())
def plain_name(name):
    """
    Return "first last" for *name*, as split by name_parts().

    Prefix, middle names and suffix are left out; an empty first or last
    name is skipped, so no stray space is produced.
    """
    _prefix, first, _middle, last, _suffix = name_parts(name)
    kept = [part for part in (first, last) if part]
    return " ".join(kept)
def capfirst(s):
    """
    Capitalize the first alphabetic character of *s*.

    Characters before it (anything for which str.isalpha() is false) are
    skipped, and every character after it is left untouched.  A string
    without any alphabetic character is returned unchanged.
    """
    for idx, ch in enumerate(s):
        if ch.isalpha():
            return s[:idx] + ch.capitalize() + s[idx + 1:]
    return s
def unidecode_name(uname):
    """
    Return an ASCII rendering of *uname*, tidied up to look like a name.

    unidecode() of cjk ideograms can produce strings which contain spaces.
    Strip leading and trailing spaces, and reduce double-spaces to single.
    For some other ranges, unidecode returns all-lowercase names; fix these
    up with capitalization.

    If unidecode() leaves the input unchanged, it is returned as is and none
    of the clean-up below is applied.
    """
    ascii_name = unidecode.unidecode(uname)
    if ascii_name == uname:
        return ascii_name

    # Drop '@' and '"' characters, trim the ends and fix double spacing
    cleaned = ascii_name.strip().replace('@', '').replace('"', '')
    cleaned = re.sub(' +', ' ', cleaned)
    # Remove a dot that sits directly between two word characters
    cleaned = re.sub(r'(\w)\.(\w)', r'\1\2', cleaned)

    # Fix all-upper and all-lower names:
    # Check for name particles -- don't capitalize those
    found = name_particle_match(cleaned)
    particle = found.group(1) if found else None

    # Get the name parts
    prefix, first, middle, last, suffix = name_parts(cleaned)

    # Capitalize names
    first = first.title()
    middle = ' '.join(capfirst(word) for word in middle.split())
    last = ' '.join(capfirst(word) for word in last.split())
    if len(last) == 1:
        # A one-character last name is doubled, e.g. "x" becomes "Xx"
        last = (last * 2).capitalize()

    # Restore the particle, if any, in its original spelling
    if particle and last.startswith(capfirst(particle) + ' '):
        last = ' '.join([particle, last[len(particle) + 1:]])

    # Recombine the parts, leaving out the empty ones
    pieces = [p for p in (prefix, first, middle, last, suffix) if p and p.strip() != '']
    return re.sub(' +', ' ', ' '.join(pieces))
def normalize_name(s):
    """
    Return *s* with every run of space characters reduced to one space.

    Only the space character itself is affected; leading and trailing
    spaces are collapsed as well, but not removed.
    """
    # There is probably more to be done here, but we start by normalising
    # spaces:
    space_run = re.compile(' +')
    return space_run.sub(' ', s)
if __name__ == "__main__":
    # Command-line helper: treat all arguments as one name and print first
    # its name_parts() tuple, then its initials().
    import sys
    name = " ".join(sys.argv[1:])
    for report in (name_parts, initials):
        print(report(name))