|
1 | | -import pprint |
| 1 | +import pprint |
| 2 | +import re |
| 3 | +from collections import defaultdict |
2 | 4 |
|
3 | 5 | from django.contrib import admin |
4 | 6 | from django.contrib.auth.models import User |
5 | | -from ietf.person.models import Person |
| 7 | +from ietf.person.models import Person, AffiliationAlias, AffiliationIgnoredEnding |
6 | 8 |
|
7 | 9 | def merge_persons(source,target,stream): |
8 | 10 |
|
@@ -86,3 +88,88 @@ def merge_persons(source,target,stream): |
86 | 88 | else: |
87 | 89 | print >>stream, "Deleting Person: {}({})".format(source.ascii,source.pk) |
88 | 90 | source.delete() |
| 91 | + |
| 92 | + |
def compile_affiliation_ending_stripping_regexp():
    """Build the regexp used to strip ignorable endings from affiliations.

    Every ``ending`` value stored in AffiliationIgnoredEnding is used as a
    regexp fragment.  Fragments that do not compile on their own are
    skipped, so that a malformed database row does not make the combined
    pattern fail to compile.

    Returns a compiled, case-insensitive pattern matching an optional
    comma, optional spaces, one of the endings and optional trailing
    spaces at the end of the string.
    """
    parts = []
    for ending_re in AffiliationIgnoredEnding.objects.values_list("ending", flat=True):
        try:
            re.compile(ending_re)
        except re.error:
            # An invalid fragment would break the combined regexp below,
            # so leave it out instead of appending it.
            continue

        parts.append(ending_re)

    re_str = ",? *({}) *$".format("|".join(parts))

    return re.compile(re_str, re.IGNORECASE)
| 106 | + |
| 107 | + |
def get_aliased_affiliations(affiliations):
    """Given non-unique sequence of affiliations, returns dictionary with
    aliases needed.

    We employ the following strategies, interleaved:

    - Stripping company endings like Inc., GmbH etc. from database

    - Looking up aliases stored directly in the database, like
      "Examplar International" -> "Examplar"

    - Case-folding so Examplar and EXAMPLAR are merged, with the
      winner being the spelling with most occurrences (so input should
      not be made unique) or most upper case letters in case of ties.
      Case folding can be overridden by the aliases in the database.

    The returned dictionary maps an affiliation as it appears in the
    input to the spelling it should be replaced with.  Affiliations that
    need no replacement are normally absent from it.
    """

    res = {}

    ending_re = compile_affiliation_ending_stripping_regexp()

    known_aliases = {
        alias.lower(): name
        for alias, name in AffiliationAlias.objects.values_list("alias", "name")
    }

    # case-folded name -> original spellings that normalise to it
    originals_by_folded = defaultdict(set)
    # case-folded name -> normalised spellings competing for it
    spellings_by_folded = defaultdict(set)
    # normalised spelling -> number of input occurrences
    spelling_count = defaultdict(int)

    for original_affiliation in affiliations:
        affiliation = original_affiliation

        # check aliases from DB
        name = known_aliases.get(affiliation.lower())
        if name is not None:
            affiliation = name
            res[original_affiliation] = affiliation

        # strip ending
        name = ending_re.sub("", affiliation)
        if name != affiliation:
            affiliation = name
            res[original_affiliation] = affiliation

        # check aliases from DB again, the stripped name may be an alias
        name = known_aliases.get(affiliation.lower())
        if name is not None:
            affiliation = name
            res[original_affiliation] = affiliation

        folded = affiliation.lower()
        originals_by_folded[folded].add(original_affiliation)
        spellings_by_folded[folded].add(affiliation)
        spelling_count[affiliation] += 1

    def spelling_sort_key(spelling):
        uppercase_letters = sum(1 for c in spelling if c.isupper())
        # the spelling itself is the last resort tie-breaker, so that the
        # result does not depend on set iteration order
        return (spelling_count[spelling], uppercase_letters, spelling)

    # Now we just need to pick the most popular uppercase/lowercase
    # spelling wherever several normalised spellings compete.  The winner
    # is chosen among the normalised spellings (the ones that were
    # counted), never among the raw input strings, so that results of
    # alias lookup and ending stripping are not undone.
    for folded in spellings_by_folded:
        spellings = spellings_by_folded[folded]
        if len(spellings) > 1:
            most_popular = max(spellings, key=spelling_sort_key)
            for original_affiliation in originals_by_folded[folded]:
                if original_affiliation != most_popular:
                    res[original_affiliation] = most_popular

    return res
| 173 | + |
| 174 | + |
| 175 | + |
0 commit comments