Skip to content

Commit 6378594

Browse files
committed
Add word count and submit format statistics
- Legacy-Id: 12656
1 parent 34a9f36 commit 6378594

11 files changed

Lines changed: 337 additions & 28 deletions
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add an optional integer ``words`` field to ``dochistory`` and ``document``.

    Both fields are declared with ``null=True, blank=True``.
    """

    dependencies = [
        ('doc', '0019_auto_20161207_1036'),
    ]

    operations = [
        # History rows get the same field as the live document rows below.
        migrations.AddField(
            model_name='dochistory',
            name='words',
            field=models.IntegerField(null=True, blank=True),
        ),
        migrations.AddField(
            model_name='document',
            name='words',
            field=models.IntegerField(null=True, blank=True),
        ),
    ]

ietf/doc/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ class DocumentInfo(models.Model):
7575
abstract = models.TextField(blank=True)
7676
rev = models.CharField(verbose_name="revision", max_length=16, blank=True)
7777
pages = models.IntegerField(blank=True, null=True)
78+
words = models.IntegerField(blank=True, null=True)
7879
order = models.IntegerField(default=1, blank=True) # This is probably obviated by SessionPresentation.order
7980
intended_std_level = models.ForeignKey(IntendedStdLevelName, verbose_name="Intended standardization level", blank=True, null=True)
8081
std_level = models.ForeignKey(StdLevelName, verbose_name="Standardization level", blank=True, null=True)

ietf/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,8 @@ def skip_unreadable_post(record):
468468
INTERNET_ALL_DRAFTS_ARCHIVE_DIR = '/a/www/www6s/archive/id'
469469
MEETING_RECORDINGS_DIR = '/a/www/audio'
470470

471+
DOCUMENT_FORMAT_BLACKLIST = ["tar", "dtd", "p7s"]
472+
471473
# Mailing list info URL for lists hosted on the IETF servers
472474
MAILING_LIST_INFO_URL = "https://www.ietf.org/mailman/listinfo/%(list_addr)s"
473475
MAILING_LIST_ARCHIVE_URL = "https://mailarchive.ietf.org"

ietf/stats/backfill_data.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import sys, os, argparse
2+
3+
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
4+
sys.path = [ basedir ] + sys.path
5+
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
6+
7+
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
8+
if os.path.exists(virtualenv_activation):
9+
execfile(virtualenv_activation, dict(__file__=virtualenv_activation))
10+
11+
import django
12+
django.setup()
13+
14+
from django.conf import settings
15+
16+
from ietf.doc.models import Document
17+
from ietf.utils.draft import Draft
18+
19+
parser = argparse.ArgumentParser()
20+
parser.add_argument("--document", help="specific document name")
21+
parser.add_argument("--words", action="store_true", help="fill in word count")
22+
args = parser.parse_args()
23+
24+
25+
docs_qs = Document.objects.filter(type="draft")
26+
27+
if args.document:
28+
docs_qs = docs_qs.filter(docalias__name=args.document)
29+
30+
for doc in docs_qs.prefetch_related("docalias_set"):
31+
canonical_name = doc.name
32+
for n in doc.docalias_set.all():
33+
if n.name.startswith("rfc"):
34+
canonical_name = n.name
35+
36+
if canonical_name.startswith("rfc"):
37+
path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
38+
else:
39+
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
40+
41+
if not os.path.exists(path):
42+
print "skipping", doc.name, "no txt file found at", path
43+
continue
44+
45+
with open(path, 'r') as f:
46+
d = Draft(f.read(), path)
47+
48+
updates = {}
49+
50+
if args.words:
51+
words = d.get_wordcount()
52+
if words != doc.words:
53+
updates["words"] = words
54+
55+
if updates:
56+
Document.objects.filter(pk=doc.pk).update(**updates)
57+
print "updated", canonical_name
58+

ietf/stats/tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def test_document_stats(self):
3131
self.assertTrue(authors_all_url in r["Location"])
3232

3333
# check various stats types
34-
for stats_type in ["authors", "pages"]:
34+
for stats_type in ["authors", "pages", "words", "format"]:
3535
for document_type in ["all", "rfc", "draft"]:
3636
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type, "document_type": document_type })
3737
r = self.client.get(url)

ietf/stats/urls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@
55

66
urlpatterns = patterns('',
77
url("^$", ietf.stats.views.stats_index),
8-
url("^document/(?:(?P<stats_type>authors|pages|format|spectech)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
8+
url("^document/(?:(?P<stats_type>authors|pages|words|format|formlang)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
99
url("^review/(?:(?P<stats_type>completion|results|states|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),
1010
)

ietf/stats/views.py

Lines changed: 129 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
import datetime, itertools, json, calendar
1+
import datetime
2+
import itertools
3+
import json
4+
import calendar
5+
import os
6+
import re
27
from collections import defaultdict
38

49
from django.shortcuts import render
@@ -7,6 +12,7 @@
712
from django.http import HttpResponseRedirect, HttpResponseForbidden
813
from django.db.models import Count
914
from django.utils.safestring import mark_safe
15+
from django.conf import settings
1016

1117
import dateutil.relativedelta
1218

@@ -15,10 +21,11 @@
1521
ReviewRequestData,
1622
compute_review_request_stats,
1723
sum_raw_review_request_aggregations)
24+
from ietf.submit.models import Submission
1825
from ietf.group.models import Role, Group
1926
from ietf.person.models import Person
2027
from ietf.name.models import ReviewRequestStateName, ReviewResultName
21-
from ietf.doc.models import Document
28+
from ietf.doc.models import DocAlias
2229
from ietf.ietfauth.utils import has_role
2330

2431
def stats_index(request):
@@ -48,7 +55,6 @@ def generate_query_string(query_dict, overrides):
4855

4956
return query_part
5057

51-
5258
def document_stats(request, stats_type=None, document_type=None):
5359
def build_document_stats_url(stats_type_override=Ellipsis, document_type_override=Ellipsis, get_overrides={}):
5460
kwargs = {
@@ -60,10 +66,11 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
6066

6167
# statistics type - one of the tables or the chart
6268
possible_stats_types = [
63-
("authors", "Number of authors"),
69+
("authors", "Authors"),
6470
("pages", "Pages"),
65-
# ("format", "Format"),
66-
# ("spectech", "Specification techniques"),
71+
("words", "Words"),
72+
("format", "Format"),
73+
("formlang", "Formal languages"),
6774
]
6875

6976
possible_stats_types = [ (slug, label, build_document_stats_url(stats_type_override=slug))
@@ -85,13 +92,34 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
8592
return HttpResponseRedirect(build_document_stats_url(document_type_override=possible_document_types[0][0]))
8693

8794

95+
def put_into_bin(value, bin_size):
    """Map ``value`` to the bin of width ``bin_size`` that contains it.

    Returns a ``(lower_bound, label)`` tuple where ``lower_bound`` is the
    largest multiple of ``bin_size`` not greater than ``value`` and ``label``
    is the string "<lower_bound> - <lower_bound + bin_size - 1>".

    A ``value`` of None is passed through as ``(None, None)``.
    """
    if value is not None:
        whole_bins, _remainder = divmod(value, bin_size)
        lower_bound = whole_bins * bin_size
        upper_bound = lower_bound + bin_size - 1
        return (lower_bound, "{} - {}".format(lower_bound, upper_bound))

    return (value, value)
101+
102+
def generate_canonical_names(docalias_qs):
    """Yield one row for each run of consecutive rows with the same first column.

    ``docalias_qs`` is ordered by "document" and its rows are indexed as
    tuples. Within a run the first row is the initial choice; a later row
    replaces it if its first column starts with "rfc", or if it starts with
    "draft" while the current choice does not start with "rfc".

    NOTE(review): every caller visible in this diff passes
    ``values_list("name", ...)``, so the grouping key ``row[0]`` is the alias
    name rather than the document. If alias names are unique, each run holds a
    single row and every alias is yielded unchanged. Grouping on a document
    column looks like the intent -- confirm, and adjust the callers'
    ``values_list`` together with this key.
    """
    rows_by_document = docalias_qs.order_by("document")

    for _key, run in itertools.groupby(rows_by_document, lambda row: row[0]):
        best = None

        for row in run:
            if best is None:
                best = row
                continue

            name = row[0]
            if name.startswith("rfc") or (name.startswith("draft") and not best[0].startswith("rfc")):
                best = row

        yield best
115+
88116
# filter documents
89-
doc_qs = Document.objects.filter(type="draft")
117+
docalias_qs = DocAlias.objects.filter(document__type="draft")
90118

91119
if document_type == "rfc":
92-
doc_qs = doc_qs.filter(states__type="draft", states__slug="rfc")
120+
docalias_qs = docalias_qs.filter(document__states__type="draft", document__states__slug="rfc")
93121
elif document_type == "draft":
94-
doc_qs = doc_qs.exclude(states__type="draft", states__slug="rfc")
122+
docalias_qs = docalias_qs.exclude(document__states__type="draft", document__states__slug="rfc")
95123

96124
chart_data = []
97125
table_data = []
@@ -104,19 +132,20 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
104132
doc_label = "draft"
105133

106134
stats_title = ""
135+
bin_size = 1
107136

108137
if stats_type == "authors":
109138
stats_title = "Number of authors for each {}".format(doc_label)
110139

111-
groups = defaultdict(list)
140+
bins = defaultdict(list)
112141

113-
for name, author_count in doc_qs.values_list("name").annotate(Count("authors")).iterator():
114-
groups[author_count].append(name)
142+
for name, author_count in generate_canonical_names(docalias_qs.values_list("name").annotate(Count("document__authors"))):
143+
bins[author_count].append(name)
115144

116-
total_docs = sum(len(names) for author_count, names in groups.iteritems())
145+
total_docs = sum(len(names) for author_count, names in bins.iteritems())
117146

118147
series_data = []
119-
for author_count, names in sorted(groups.iteritems(), key=lambda t: t[0]):
148+
for author_count, names in sorted(bins.iteritems(), key=lambda t: t[0]):
120149
percentage = len(names) * 100.0 / total_docs
121150
series_data.append((author_count, percentage))
122151
table_data.append((author_count, percentage, names))
@@ -129,15 +158,15 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
129158
elif stats_type == "pages":
130159
stats_title = "Number of pages for each {}".format(doc_label)
131160

132-
groups = defaultdict(list)
161+
bins = defaultdict(list)
133162

134-
for name, pages in doc_qs.values_list("name", "pages"):
135-
groups[pages].append(name)
163+
for name, pages in generate_canonical_names(docalias_qs.values_list("name", "document__pages")):
164+
bins[pages].append(name)
136165

137-
total_docs = sum(len(names) for pages, names in groups.iteritems())
166+
total_docs = sum(len(names) for pages, names in bins.iteritems())
138167

139168
series_data = []
140-
for pages, names in sorted(groups.iteritems(), key=lambda t: t[0]):
169+
for pages, names in sorted(bins.iteritems(), key=lambda t: t[0]):
141170
percentage = len(names) * 100.0 / total_docs
142171
if pages is not None:
143172
series_data.append((pages, len(names)))
@@ -148,7 +177,86 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
148177
"animation": False,
149178
})
150179

180+
elif stats_type == "words":
181+
stats_title = "Number of words for each {}".format(doc_label)
182+
183+
bin_size = 500
184+
185+
bins = defaultdict(list)
186+
187+
for name, words in generate_canonical_names(docalias_qs.values_list("name", "document__words")):
188+
bins[put_into_bin(words, bin_size)].append(name)
189+
190+
total_docs = sum(len(names) for words, names in bins.iteritems())
191+
192+
series_data = []
193+
for (value, words), names in sorted(bins.iteritems(), key=lambda t: t[0][0]):
194+
percentage = len(names) * 100.0 / total_docs
195+
if words is not None:
196+
series_data.append((value, len(names)))
197+
198+
table_data.append((words, percentage, names))
199+
200+
chart_data.append({
201+
"data": series_data,
202+
"animation": False,
203+
})
204+
205+
elif stats_type == "format":
206+
stats_title = "Formats for each {}".format(doc_label)
207+
208+
bins = defaultdict(list)
209+
210+
# on new documents, we should have a Submission row with the file types
211+
submission_types = {}
151212

213+
for doc_name, file_types in Submission.objects.values_list("draft", "file_types").order_by("submission_date", "id"):
214+
submission_types[doc_name] = file_types
215+
216+
doc_names_with_missing_types = {}
217+
for canonical_name, rev, doc_name in generate_canonical_names(docalias_qs.values_list("name", "document__rev", "document__name")):
218+
types = submission_types.get(doc_name)
219+
if types:
220+
for dot_ext in types.split(","):
221+
bins[dot_ext.lstrip(".").upper()].append(canonical_name)
222+
223+
else:
224+
225+
if canonical_name.startswith("rfc"):
226+
filename = canonical_name
227+
else:
228+
filename = canonical_name + "-" + rev
229+
230+
doc_names_with_missing_types[filename] = canonical_name
231+
232+
# look up the remaining documents on disk
233+
for filename in itertools.chain(os.listdir(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR), os.listdir(settings.RFC_PATH)):
234+
t = filename.split(".", 1)
235+
if len(t) != 2:
236+
continue
237+
238+
basename, ext = t
239+
if any(ext.lower().endswith(blacklisted_ext.lower()) for blacklisted_ext in settings.DOCUMENT_FORMAT_BLACKLIST):
240+
continue
241+
242+
canonical_name = doc_names_with_missing_types.get(basename)
243+
244+
if canonical_name:
245+
bins[ext.upper()].append(canonical_name)
246+
247+
total_docs = sum(len(names) for fmt, names in bins.iteritems())
248+
249+
series_data = []
250+
for fmt, names in sorted(bins.iteritems(), key=lambda t: t[0]):
251+
percentage = len(names) * 100.0 / total_docs
252+
series_data.append((fmt, len(names)))
253+
254+
table_data.append((fmt, percentage, names))
255+
256+
chart_data.append({
257+
"data": series_data,
258+
"animation": False,
259+
})
152260

153261
return render(request, "stats/document_stats.html", {
154262
"chart_data": mark_safe(json.dumps(chart_data)),
@@ -159,6 +267,8 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
159267
"possible_document_types": possible_document_types,
160268
"document_type": document_type,
161269
"doc_label": doc_label,
270+
"bin_size": bin_size,
271+
"content_template": "stats/document_stats_{}.html".format(stats_type),
162272
})
163273

164274
@login_required

ietf/templates/stats/document_stats.html

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,7 @@ <h1>Document statistics</h1>
3535
</div>
3636
</div>
3737

38-
{% if stats_type == "authors" %}
39-
{% include "stats/document_stats_authors.html" %}
40-
{% elif stats_type == "pages" %}
41-
{% include "stats/document_stats_pages.html" %}
42-
{% endif %}
38+
{% include content_template %}
4339
{% endblock %}
4440

4541
{% block js %}

0 commit comments

Comments
 (0)