Add word count and submit format statistics

OleLaursen · OleLaursen · commit 637859403397 · 2017-01-16T11:36:38.000Z
- Legacy-Id: 12656
diff --git a/ietf/doc/migrations/0020_auto_20170112_0753.py b/ietf/doc/migrations/0020_auto_20170112_0753.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # adds a "words" field to the dochistory and document models
+
+    dependencies = [
+        ('doc', '0019_auto_20161207_1036'),  # the 'doc' app migration this one depends on
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='dochistory',
+            name='words',
+            field=models.IntegerField(null=True, blank=True),  # null=True/blank=True: the value is optional
+        ),
+        migrations.AddField(
+            model_name='document',
+            name='words',
+            field=models.IntegerField(null=True, blank=True),  # same field definition as for dochistory
+        ),
+    ]
diff --git a/ietf/doc/models.py b/ietf/doc/models.py
@@ -75,6 +75,7 @@ class DocumentInfo(models.Model):
     abstract = models.TextField(blank=True)
     rev = models.CharField(verbose_name="revision", max_length=16, blank=True)
     pages = models.IntegerField(blank=True, null=True)
+    words = models.IntegerField(blank=True, null=True)
     order = models.IntegerField(default=1, blank=True) # This is probably obviated by SessionPresentaion.order
     intended_std_level = models.ForeignKey(IntendedStdLevelName, verbose_name="Intended standardization level", blank=True, null=True)
     std_level = models.ForeignKey(StdLevelName, verbose_name="Standardization level", blank=True, null=True)
diff --git a/ietf/settings.py b/ietf/settings.py
@@ -468,6 +468,8 @@ def skip_unreadable_post(record):
 INTERNET_ALL_DRAFTS_ARCHIVE_DIR = '/a/www/www6s/archive/id'
 MEETING_RECORDINGS_DIR = '/a/www/audio'
 
+DOCUMENT_FORMAT_BLACKLIST = ["tar", "dtd", "p7s"]
+
 # Mailing list info URL for lists hosted on the IETF servers
 MAILING_LIST_INFO_URL = "https://www.ietf.org/mailman/listinfo/%(list_addr)s"
 MAILING_LIST_ARCHIVE_URL = "https://mailarchive.ietf.org"
diff --git a/ietf/stats/backfill_data.py b/ietf/stats/backfill_data.py
@@ -0,0 +1,58 @@
+import sys, os, argparse
+
+basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))  # two directories above this file
+sys.path = [ basedir ] + sys.path  # make basedir the first import location
+os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
+
+virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
+if os.path.exists(virtualenv_activation):
+    execfile(virtualenv_activation, dict(__file__=virtualenv_activation))  # run the activation script only when it exists
+
+import django
+django.setup()  # called before the ietf.* imports below
+
+from django.conf import settings
+
+from ietf.doc.models import Document
+from ietf.utils.draft import Draft
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--document", help="specific document name")
+parser.add_argument("--words", action="store_true", help="fill in word count")
+args = parser.parse_args()
+
+
+docs_qs = Document.objects.filter(type="draft")
+
+if args.document:
+    docs_qs = docs_qs.filter(docalias__name=args.document)  # limit the run to documents with this alias name
+
+for doc in docs_qs.prefetch_related("docalias_set"):
+    canonical_name = doc.name
+    for n in doc.docalias_set.all():
+        if n.name.startswith("rfc"):
+            canonical_name = n.name  # an alias starting with "rfc" overrides doc.name; the last such alias wins
+
+    if canonical_name.startswith("rfc"):
+        path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
+    else:
+        path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")  # drafts are looked up as <name>-<rev>.txt
+
+    if not os.path.exists(path):
+        print "skipping", doc.name, "no txt file found at", path
+        continue
+
+    with open(path, 'r') as f:
+        d = Draft(f.read(), path)
+
+        updates = {}  # field name -> new value; stays empty unless an option below fills it
+
+        if args.words:
+            words = d.get_wordcount()
+            if words != doc.words:
+                updates["words"] = words  # only record the count when it differs from the stored one
+
+        if updates:
+            Document.objects.filter(pk=doc.pk).update(**updates)
+            print "updated", canonical_name
+
diff --git a/ietf/stats/tests.py b/ietf/stats/tests.py
@@ -31,7 +31,7 @@ def test_document_stats(self):
         self.assertTrue(authors_all_url in r["Location"])
 
         # check various stats types
-        for stats_type in ["authors", "pages"]:
+        for stats_type in ["authors", "pages", "words", "format"]:
             for document_type in ["all", "rfc", "draft"]:
                 url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type, "document_type": document_type })
                 r = self.client.get(url)
diff --git a/ietf/stats/urls.py b/ietf/stats/urls.py
@@ -5,6 +5,6 @@
 
 urlpatterns = patterns('',
     url("^$", ietf.stats.views.stats_index),
-    url("^document/(?:(?P<stats_type>authors|pages|format|spectech)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
+    url("^document/(?:(?P<stats_type>authors|pages|words|format|formlang)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
     url("^review/(?:(?P<stats_type>completion|results|states|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),
 )
diff --git a/ietf/stats/views.py b/ietf/stats/views.py
@@ -1,4 +1,9 @@
-import datetime, itertools, json, calendar
+import datetime
+import itertools
+import json
+import calendar
+import os
+import re
 from collections import defaultdict
 
 from django.shortcuts import render
@@ -7,6 +12,7 @@
 from django.http import HttpResponseRedirect, HttpResponseForbidden
 from django.db.models import Count
 from django.utils.safestring import mark_safe
+from django.conf import settings
 
 import dateutil.relativedelta
 
@@ -15,10 +21,11 @@
                                ReviewRequestData,
                                compute_review_request_stats,
                                sum_raw_review_request_aggregations)
+from ietf.submit.models import Submission
 from ietf.group.models import Role, Group
 from ietf.person.models import Person
 from ietf.name.models import ReviewRequestStateName, ReviewResultName
-from ietf.doc.models import Document
+from ietf.doc.models import DocAlias
 from ietf.ietfauth.utils import has_role
 
 def stats_index(request):
@@ -48,7 +55,6 @@ def generate_query_string(query_dict, overrides):
 
     return query_part
 
-
 def document_stats(request, stats_type=None, document_type=None):
     def build_document_stats_url(stats_type_override=Ellipsis, document_type_override=Ellipsis, get_overrides={}):
         kwargs = {
@@ -60,10 +66,11 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
 
     # statistics type - one of the tables or the chart
     possible_stats_types = [
-        ("authors", "Number of authors"),
+        ("authors", "Authors"),
         ("pages", "Pages"),
-#        ("format", "Format"),
-#        ("spectech", "Specification techniques"),
+        ("words", "Words"),
+        ("format", "Format"),
+        ("formlang", "Formal languages"),
     ]
 
     possible_stats_types = [ (slug, label, build_document_stats_url(stats_type_override=slug))
@@ -85,13 +92,34 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
         return HttpResponseRedirect(build_document_stats_url(document_type_override=possible_document_types[0][0]))
     
 
+    def put_into_bin(value, bin_size):  # returns (bin start, "start - end" label); a None value gives (None, None)
+        if value is None:
+            return (value, value)  # keep the 2-tuple shape for a missing value
+
+        v = (value // bin_size) * bin_size  # round down to a multiple of bin_size
+        return (v, "{} - {}".format(v, v + bin_size - 1))  # label covers the inclusive range of the bin
+
+    def generate_canonical_names(docalias_qs):  # yields one chosen row per groupby() group: an "rfc..." name if present, else a "draft..." name, else the first row
+        for doc_id, ts in itertools.groupby(docalias_qs.order_by("document"), lambda t: t[0]):  # NOTE(review): the key is t[0]; callers in this view pass values_list("name", ...), so t[0] appears to be the alias name, not a document id - confirm that rows of one document really share a group
+            chosen = None
+            for t in ts:
+                if chosen is None:
+                    chosen = t  # the first row of the group is the provisional choice
+                else:
+                    if t[0].startswith("rfc"):
+                        chosen = t  # a row whose first element starts with "rfc" always replaces the current choice
+                    elif t[0].startswith("draft") and not chosen[0].startswith("rfc"):
+                        chosen = t  # a "draft..." row replaces anything except an "rfc..." choice
+
+            yield chosen
+
     # filter documents
-    doc_qs = Document.objects.filter(type="draft")
+    docalias_qs = DocAlias.objects.filter(document__type="draft")
 
     if document_type == "rfc":
-        doc_qs = doc_qs.filter(states__type="draft", states__slug="rfc")
+        docalias_qs = docalias_qs.filter(document__states__type="draft", document__states__slug="rfc")
     elif document_type == "draft":
-        doc_qs = doc_qs.exclude(states__type="draft", states__slug="rfc")
+        docalias_qs = docalias_qs.exclude(document__states__type="draft", document__states__slug="rfc")
 
     chart_data = []
     table_data = []
@@ -104,19 +132,20 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
         doc_label = "draft"
 
     stats_title = ""
+    bin_size = 1
 
     if stats_type == "authors":
         stats_title = "Number of authors for each {}".format(doc_label)
 
-        groups = defaultdict(list)
+        bins = defaultdict(list)
 
-        for name, author_count in doc_qs.values_list("name").annotate(Count("authors")).iterator():
-            groups[author_count].append(name)
+        for name, author_count in generate_canonical_names(docalias_qs.values_list("name").annotate(Count("document__authors"))):
+            bins[author_count].append(name)
 
-        total_docs = sum(len(names) for author_count, names in groups.iteritems())
+        total_docs = sum(len(names) for author_count, names in bins.iteritems())
 
         series_data = []
-        for author_count, names in sorted(groups.iteritems(), key=lambda t: t[0]):
+        for author_count, names in sorted(bins.iteritems(), key=lambda t: t[0]):
             percentage = len(names) * 100.0 / total_docs
             series_data.append((author_count, percentage))
             table_data.append((author_count, percentage, names))
@@ -129,15 +158,15 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
     elif stats_type == "pages":
         stats_title = "Number of pages for each {}".format(doc_label)
 
-        groups = defaultdict(list)
+        bins = defaultdict(list)
 
-        for name, pages in doc_qs.values_list("name", "pages"):
-            groups[pages].append(name)
+        for name, pages in generate_canonical_names(docalias_qs.values_list("name", "document__pages")):
+            bins[pages].append(name)
 
-        total_docs = sum(len(names) for pages, names in groups.iteritems())
+        total_docs = sum(len(names) for pages, names in bins.iteritems())
 
         series_data = []
-        for pages, names in sorted(groups.iteritems(), key=lambda t: t[0]):
+        for pages, names in sorted(bins.iteritems(), key=lambda t: t[0]):
             percentage = len(names) * 100.0 / total_docs
             if pages is not None:
                 series_data.append((pages, len(names)))
@@ -148,7 +177,86 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
             "animation": False,
         })
 
+    elif stats_type == "words":
+        stats_title = "Number of words for each {}".format(doc_label)
+
+        bin_size = 500
+
+        bins = defaultdict(list)
+
+        for name, words in generate_canonical_names(docalias_qs.values_list("name", "document__words")):
+            bins[put_into_bin(words, bin_size)].append(name)
+
+        total_docs = sum(len(names) for words, names in bins.iteritems())
+
+        series_data = []
+        for (value, words), names in sorted(bins.iteritems(), key=lambda t: t[0][0]):
+            percentage = len(names) * 100.0 / total_docs
+            if words is not None:
+                series_data.append((value, len(names)))
+
+            table_data.append((words, percentage, names))
+
+        chart_data.append({
+            "data": series_data,
+            "animation": False,
+        })
+
+    elif stats_type == "format":
+        stats_title = "Formats for each {}".format(doc_label)
+
+        bins = defaultdict(list)
+
+        # on new documents, we should have a Submission row with the file types
+        submission_types = {}
 
+        for doc_name, file_types in Submission.objects.values_list("draft", "file_types").order_by("submission_date", "id"):
+            submission_types[doc_name] = file_types
+
+        doc_names_with_missing_types = {}
+        for canonical_name, rev, doc_name in generate_canonical_names(docalias_qs.values_list("name", "document__rev", "document__name")):
+            types = submission_types.get(doc_name)
+            if types:
+                for dot_ext in types.split(","):
+                    bins[dot_ext.lstrip(".").upper()].append(canonical_name)
+
+            else:
+
+                if canonical_name.startswith("rfc"):
+                    filename = canonical_name
+                else:
+                    filename = canonical_name + "-" + rev
+
+                doc_names_with_missing_types[filename] = canonical_name
+
+        # look up the remaining documents on disk
+        for filename in itertools.chain(os.listdir(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR), os.listdir(settings.RFC_PATH)):
+            t = filename.split(".", 1)
+            if len(t) != 2:
+                continue
+
+            basename, ext = t
+            if any(ext.lower().endswith(blacklisted_ext.lower()) for blacklisted_ext in settings.DOCUMENT_FORMAT_BLACKLIST):
+                continue
+
+            canonical_name = doc_names_with_missing_types.get(basename)
+
+            if canonical_name:
+                bins[ext.upper()].append(canonical_name)
+
+        total_docs = sum(len(names) for fmt, names in bins.iteritems())
+
+        series_data = []
+        for fmt, names in sorted(bins.iteritems(), key=lambda t: t[0]):
+            percentage = len(names) * 100.0 / total_docs
+            series_data.append((fmt, len(names)))
+
+            table_data.append((fmt, percentage, names))
+
+        chart_data.append({
+            "data": series_data,
+            "animation": False,
+        })
 
     return render(request, "stats/document_stats.html", {
         "chart_data": mark_safe(json.dumps(chart_data)),
@@ -159,6 +267,8 @@ def build_document_stats_url(stats_type_override=Ellipsis, document_type_overrid
         "possible_document_types": possible_document_types,
         "document_type": document_type,
         "doc_label": doc_label,
+        "bin_size": bin_size,
+        "content_template": "stats/document_stats_{}.html".format(stats_type),
     })
 
 @login_required
diff --git a/ietf/templates/stats/document_stats.html b/ietf/templates/stats/document_stats.html
@@ -35,11 +35,7 @@ <h1>Document statistics</h1>
     </div>
   </div>
 
-  {% if stats_type == "authors" %}
-    {% include "stats/document_stats_authors.html" %}
-  {% elif stats_type == "pages" %}
-    {% include "stats/document_stats_pages.html" %}
-  {% endif %}
+  {% include content_template %}
 {% endblock %}
 
 {% block js %}
diff --git a/ietf/templates/stats/document_stats_format.html b/ietf/templates/stats/document_stats_format.html
diff --git a/ietf/templates/stats/document_stats_pages.html b/ietf/templates/stats/document_stats_pages.html
diff --git a/ietf/templates/stats/document_stats_words.html b/ietf/templates/stats/document_stats_words.html

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -5,6 +5,6 @@` |
| `5` | `5` |  |
| `6` | `6` | `urlpatterns = patterns('',` |
| `7` | `7` | `url("^$", ietf.stats.views.stats_index),` |
| `8` |  | `- url("^document/(?:(?P<stats_type>authors\|pages\|format\|spectech)/)?(?:(?P<document_type>all\|rfc\|draft)/)?$", ietf.stats.views.document_stats),` |
|  | `8` | `+ url("^document/(?:(?P<stats_type>authors\|pages\|words\|format\|formlang)/)?(?:(?P<document_type>all\|rfc\|draft)/)?$", ietf.stats.views.document_stats),` |
| `9` | `9` | `url("^review/(?:(?P<stats_type>completion\|results\|states\|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),` |
| `10` | `10` | `)` |