Skip to content

Commit ef251c6

Browse files
committed
Add author affiliation chart.
Also add a model for registering an alias for an affiliation, so that we can group affiliations that are considered the same for statistical purposes, and a model for registering unimportant endings like Inc. and GmbH. Affiliation grouping is done through three means: stripping uninteresting endings, merging entries that differ only in case, and applying aliases that map a case-insensitive alias to a name. Stripping endings and merging based on case seem to greatly reduce the number of manually maintained aliases needed. - Legacy-Id: 12785
1 parent 3954dc0 commit ef251c6

12 files changed

Lines changed: 284 additions & 11 deletions

File tree

ietf/doc/admin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ class BallotPositionDocEventAdmin(DocEventAdmin):
174174
admin.site.register(BallotPositionDocEvent, BallotPositionDocEventAdmin)
175175

176176
class DocumentAuthorAdmin(admin.ModelAdmin):
177-
list_display = ['id', 'document', 'person', 'email', 'order']
178-
search_fields = [ 'document__name', 'person__name', 'email__address', ]
177+
list_display = ['id', 'document', 'person', 'email', 'affiliation', 'order']
178+
search_fields = [ 'document__name', 'person__name', 'email__address', 'affiliation']
179179
admin.site.register(DocumentAuthor, DocumentAuthorAdmin)
180180

ietf/person/admin.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from django.contrib import admin
22

33

4-
from ietf.person.models import Email, Alias, Person
4+
from ietf.person.models import Email, Alias, Person, AffiliationAlias, AffiliationIgnoredEnding
55
from ietf.person.name import name_parts
66

77
class EmailAdmin(admin.ModelAdmin):
@@ -33,3 +33,13 @@ def plain_name(self, obj):
3333
# actions = None
3434
admin.site.register(Person, PersonAdmin)
3535

36+
class AffiliationAliasAdmin(admin.ModelAdmin):
    """Admin options for AffiliationAlias: list, filter and search on the
    alias and the name it maps to."""

    # Columns of the change list.
    list_display = ["alias", "name"]
    # Fields matched by the admin search box.
    search_fields = ["alias", "name"]
    # Sidebar filter on the target name.
    list_filter = ["name"]


admin.site.register(AffiliationAlias, AffiliationAliasAdmin)
41+
42+
class AffiliationIgnoredEndingAdmin(admin.ModelAdmin):
    """Admin options for AffiliationIgnoredEnding: list and search on the
    ending pattern."""

    # Fields matched by the admin search box.
    search_fields = ["ending"]
    # Columns of the change list.
    list_display = ["ending"]


admin.site.register(AffiliationIgnoredEnding, AffiliationIgnoredEndingAdmin)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import unicode_literals
3+
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
    """Schema migration adding the AffiliationAlias and
    AffiliationIgnoredEnding models to the person app."""

    dependencies = [
        ('person', '0014_auto_20160613_0751'),
    ]

    operations = [
        # alias -> name mapping used when grouping affiliations
        migrations.CreateModel(
            name='AffiliationAlias',
            fields=[
                ('id', models.AutoField(
                    auto_created=True,
                    primary_key=True,
                    serialize=False,
                    verbose_name='ID',
                )),
                ('alias', models.CharField(
                    max_length=255,
                    help_text=b'Note that aliases are matched without regarding case.',
                )),
                ('name', models.CharField(max_length=255)),
            ],
        ),
        # endings to be stripped from affiliations
        migrations.CreateModel(
            name='AffiliationIgnoredEnding',
            fields=[
                ('id', models.AutoField(
                    auto_created=True,
                    primary_key=True,
                    serialize=False,
                    verbose_name='ID',
                )),
                ('ending', models.CharField(max_length=255)),
            ],
        ),
    ]
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import unicode_literals
3+
4+
from django.db import migrations
5+
6+
def add_affiliation_info(apps, schema_editor):
    """Seed the affiliation alias and ignored-ending tables.

    Uses get_or_create throughout, so re-running the migration does not
    create duplicate rows.

    apps -- migration app registry, used to look up the historical models
    schema_editor -- unused; part of the RunPython callable signature
    """
    AffiliationAlias = apps.get_model("person", "AffiliationAlias")

    # Aliases are stored lower-case since they are matched without
    # regarding case.
    cisco_aliases = (
        "cisco",
        "cisco system",
        "cisco systems (india) private limited",
        "cisco systems india pvt",
    )
    for alias in cisco_aliases:
        AffiliationAlias.objects.get_or_create(alias=alias, name="Cisco Systems")

    AffiliationIgnoredEnding = apps.get_model("person", "AffiliationIgnoredEnding")

    # The endings are regexp fragments; raw strings keep the backslash
    # without relying on "\." being an unrecognised (and deprecated)
    # escape sequence. The stored values are unchanged.
    ignored_endings = (
        r"LLC\.?",
        r"Ltd\.?",
        r"Inc\.?",
        r"GmbH\.?",
    )
    for ending in ignored_endings:
        AffiliationIgnoredEnding.objects.get_or_create(ending=ending)
19+
20+
21+
class Migration(migrations.Migration):
    """Data migration running add_affiliation_info; the reverse
    direction is a no-op."""

    dependencies = [
        ('person', '0015_affiliationalias_affiliationignoredending'),
    ]

    operations = [
        migrations.RunPython(
            add_affiliation_info,
            reverse_code=migrations.RunPython.noop,
        ),
    ]

ietf/person/models.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,3 +241,26 @@ def email_address(self):
241241
return
242242
return self.address
243243

244+
245+
class AffiliationAlias(models.Model):
    """An alias that, for statistical purposes, is to be counted under
    name instead."""

    alias = models.CharField(
        max_length=255,
        help_text="Note that aliases are matched without regarding case.",
    )
    name = models.CharField(max_length=255)

    def __unicode__(self):
        return u"{alias} -> {name}".format(alias=self.alias, name=self.name)

    def save(self, *args, **kwargs):
        # Aliases are always stored lower-cased, whatever case was entered.
        lowered_alias = self.alias.lower()
        self.alias = lowered_alias
        super(AffiliationAlias, self).save(*args, **kwargs)
258+
259+
class AffiliationIgnoredEnding(models.Model):
    """An ending (as a regexp fragment) that is to be stripped from
    affiliations for statistical purposes."""

    ending = models.CharField(
        max_length=255,
        help_text="Regexp with ending, e.g. 'Inc\\.?' - remember to escape .!",
    )

    def __unicode__(self):
        # The pattern itself is the display value.
        return self.ending
266+

ietf/person/utils.py

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
import pprint
1+
import pprint
2+
import re
3+
from collections import defaultdict
24

35
from django.contrib import admin
46
from django.contrib.auth.models import User
5-
from ietf.person.models import Person
7+
from ietf.person.models import Person, AffiliationAlias, AffiliationIgnoredEnding
68

79
def merge_persons(source,target,stream):
810

@@ -86,3 +88,88 @@ def merge_persons(source,target,stream):
8688
else:
8789
print >>stream, "Deleting Person: {}({})".format(source.ascii,source.pk)
8890
source.delete()
91+
92+
93+
def compile_affiliation_ending_stripping_regexp():
    """Return a compiled, case-insensitive regexp that matches any of the
    endings stored in AffiliationIgnoredEnding at the end of a string,
    together with an optional preceding comma and surrounding spaces.

    Endings that are empty or are not valid regexps on their own are left
    out, so that one bad database entry cannot break the combined
    pattern. If no usable endings remain, the returned regexp never
    matches, i.e. substituting with it leaves every string untouched.
    """
    parts = []
    for ending_re in AffiliationIgnoredEnding.objects.values_list("ending", flat=True):
        if not ending_re:
            # an empty alternative would match everywhere
            continue

        try:
            re.compile(ending_re)
        except re.error:
            # invalid entry - skip it instead of poisoning the whole regexp
            continue

        parts.append(ending_re)

    if not parts:
        # "(?!)" is an empty negative lookahead, which can never succeed;
        # without this, ",? *() *$" would strip trailing commas and spaces
        # from every affiliation
        return re.compile(r"(?!)")

    re_str = ",? *({}) *$".format("|".join(parts))

    return re.compile(re_str, re.IGNORECASE)
106+
107+
108+
def get_aliased_affiliations(affiliations):
    """Given a non-unique sequence of affiliations, return a dictionary
    mapping each affiliation that needs an alias to the name it should be
    counted under. Affiliations that are fine as they are have no entry.

    We employ the following strategies, interleaved:

    - Stripping company endings like Inc., GmbH etc. from database

    - Looking up aliases stored directly in the database, like
      "Examplar International" -> "Examplar"

    - Case-folding so Examplar and EXAMPLAR are merged, with the
      winner being the one with most occurrences (so input should not
      be made unique) or most upper case letters in case of ties.
      Case folding can be overridden by the aliases in the database.

    Case folding is applied to the spellings that result from the alias
    lookup and ending stripping, so it never undoes those steps.

    affiliations -- iterable of affiliation strings; it is consumed once,
    and empty or None entries are ignored.
    """

    ending_re = compile_affiliation_ending_stripping_regexp()

    known_aliases = {
        alias.lower(): name
        for alias, name in AffiliationAlias.objects.values_list("alias", "name")
    }

    def apply_alias(affiliation):
        # aliases are matched without regarding case
        return known_aliases.get(affiliation.lower(), affiliation)

    # original spelling -> spelling after aliases and ending stripping
    normalized = {}
    # lower-cased spelling -> the normalized spellings that fold to it
    spellings_by_folded = defaultdict(set)
    # normalized spelling -> number of occurrences in the input
    spelling_count = defaultdict(int)

    for original_affiliation in affiliations:
        if not original_affiliation:
            # nothing to normalize (and None has no .lower())
            continue

        # check aliases from DB, strip ending, then check aliases again
        # since the stripped form may itself be a registered alias
        affiliation = apply_alias(original_affiliation)
        affiliation = ending_re.sub("", affiliation)
        affiliation = apply_alias(affiliation)

        normalized[original_affiliation] = affiliation
        spellings_by_folded[affiliation.lower()].add(affiliation)
        spelling_count[affiliation] += 1

    def affiliation_sort_key(affiliation):
        uppercase_letters = sum(1 for c in affiliation if c.isupper())
        # the spelling itself is the final tie-breaker so that the result
        # does not depend on set iteration order
        return (spelling_count[affiliation], uppercase_letters, affiliation)

    # pick the most popular uppercase/lowercase spelling for each group of
    # spellings that only differ in case
    most_popular = {
        folded: max(spellings, key=affiliation_sort_key)
        for folded, spellings in spellings_by_folded.items()
    }

    res = {}
    for original_affiliation, affiliation in normalized.items():
        winner = most_popular[affiliation.lower()]
        if winner != original_affiliation:
            res[original_affiliation] = winner

    return res
173+
174+
175+

ietf/static/ietf/css/ietf.css

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ table.simple-table td:last-child {
568568
width: 7em;
569569
}
570570

571-
.popover .docname {
571+
.document-stats .popover .element {
572572
padding-left: 1em;
573573
text-indent: -1em;
574574
}

ietf/static/ietf/js/document-stats.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@ $(document).ready(function () {
3030
if (stdNameRegExp.test(element))
3131
displayName = element.slice(0, 3).toUpperCase() + " " + element.slice(3);
3232

33-
html.push('<div class="docname"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
33+
html.push('<div class="element"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
3434
}
3535
else {
36-
html.push('<div>' + element + '</div>');
36+
html.push('<div class="element">' + element + '</div>');
3737
}
3838
});
3939

@@ -44,6 +44,7 @@ $(document).ready(function () {
4444
trigger: "focus",
4545
template: '<div class="popover" role="tooltip"><div class="arrow"></div><h3 class="popover-title"></h3><div class="popover-content"></div></div>',
4646
content: html.join(""),
47+
placement: "top",
4748
html: true
4849
}).on("click", function (e) {
4950
e.preventDefault();

ietf/stats/tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_document_stats(self):
2525
self.assertTrue(authors_url in r["Location"])
2626

2727
# check various stats types
28-
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents"]:
28+
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents", "author/affiliation"]:
2929
for document_type in ["", "rfc", "draft"]:
3030
for time_choice in ["", "5y"]:
3131
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type })

ietf/stats/views.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from ietf.person.models import Person
2626
from ietf.name.models import ReviewRequestStateName, ReviewResultName
2727
from ietf.doc.models import DocAlias, Document
28+
from ietf.person.utils import get_aliased_affiliations
2829
from ietf.ietfauth.utils import has_role
2930

3031
def stats_index(request):
@@ -351,7 +352,7 @@ def generate_canonical_names(docalias_qs):
351352
total_persons = person_qs.count()
352353

353354
if stats_type == "author/documents":
354-
stats_title = "Number of {}s for each author".format(doc_label)
355+
stats_title = "Number of {}s per author".format(doc_label)
355356

356357
bins = defaultdict(list)
357358

@@ -369,6 +370,38 @@ def generate_canonical_names(docalias_qs):
369370
"animation": False,
370371
})
371372

373+
elif stats_type == "author/affiliation":
374+
stats_title = "Number of {} authors per affiliation".format(doc_label)
375+
376+
bins = defaultdict(list)
377+
378+
# Since people don't write the affiliation names in the
379+
# same way, and we don't want to go back and edit them
380+
# either, we transform them here.
381+
382+
name_affiliation_set = set((name, affiliation)
383+
for name, affiliation in person_qs.values_list("name", "documentauthor__affiliation"))
384+
385+
aliases = get_aliased_affiliations(affiliation for _, affiliation in name_affiliation_set)
386+
387+
for name, affiliation in name_affiliation_set:
388+
bins[aliases.get(affiliation, affiliation)].append(name)
389+
390+
series_data = []
391+
for affiliation, names in sorted(bins.iteritems(), key=lambda t: t[0].lower()):
392+
percentage = len(names) * 100.0 / total_persons
393+
if affiliation:
394+
series_data.append((affiliation, len(names)))
395+
table_data.append((affiliation, percentage, names))
396+
397+
series_data.sort(key=lambda t: t[1], reverse=True)
398+
series_data = series_data[:30]
399+
400+
chart_data.append({
401+
"data": series_data,
402+
"animation": False,
403+
})
404+
372405

373406
return render(request, "stats/document_stats.html", {
374407
"chart_data": mark_safe(json.dumps(chart_data)),

0 commit comments

Comments
 (0)