Add option to crawl as a logged-in user (--user).

larseggert · larseggert · commit 9b4e61049a70 · 2015-07-18T08:55:48.000Z
Add --pedantic option for vnu crawl, which stops the crawl on (most) errors.
Randomize the order in which URLs are crawled, so that repeated crawls don't
hit the same URLs in the same order.
Commit ready for merge.
 - Legacy-Id: 9765
diff --git a/bin/test-crawl b/bin/test-crawl
@@ -3,6 +3,7 @@
 import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
 import html5lib
 import debug    # pyflakes:ignore
+import random
 
 # Set up import path to find our own Django
 basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -17,13 +18,16 @@ parser = argparse.ArgumentParser(
 parser.add_argument('urls', metavar='URL', nargs='*',
                     help='One or more URLs to start the crawl from')
 parser.add_argument('--urls', '-u', dest='url_file',
-                    help='file with URLs to start the crawl from')
+                    help='File with URLs to start the crawl from')
 parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
-                    help='responses taking longer than this (in seconds) results in SLOW being printed')
-parser.add_argument('--settings', dest='settings', help='custom settings file')
-parser.add_argument('--logfile', dest='logfile', help='write to logfile')
+                    help='Responses taking longer than this (in seconds) results in SLOW being printed')
+parser.add_argument('--settings', dest='settings', help='Custom settings file')
+parser.add_argument('--logfile', dest='logfile', help='Write to logfile')
+parser.add_argument('--user', dest='user', help='Crawl logged in as this user')
 parser.add_argument('--vnu', action='store_true',
                     help='Use validator.nu instead of html5lib for HTML validation')
+parser.add_argument('--pedantic', action='store_true',
+                    help='Check all pages and stop the crawl on the first HTML validation issue')
 
 args = parser.parse_args()
 
@@ -95,18 +99,26 @@ def extract_tastypie_urls(content):
                     yield uri
 
 def check_html_valid(url, response, vnu):
-    global parser, validated_urls, doc_types, warnings
-    # derive a key for urls like this by replacing primary keys
+    global parser, validated_urls, doc_types, warnings, pedantic
+    # These URLs have known issues, skip them until those are fixed
+    if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
+        log("%s blacklisted; skipping HTML validation" % url)
+        return
     key = url
-    key = re.sub("/[0-9.]+/",   "/nnnn/", key)
-    key = re.sub("/.+@.+/",    "/x@x.org/", key)
-    key = re.sub("#.*$",       "", key)
-    key = re.sub("\?.*$",       "", key)
-    key = re.sub("/rfc[0-9]+/",   "/rfcnnnn/", key)
-    key = re.sub("/wg/[a-z0-9-]+/",   "/wg/foo/", key)
-    key = re.sub("/rg/[a-z0-9-]+/",   "/rg/foo/", key)
-    for slug in doc_types:
-        key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
+    if not vnu:
+        # derive a key for urls like this by replacing primary keys
+        key = re.sub("/[0-9.]+/",   "/nnnn/", key)
+        key = re.sub("/.+@.+/",    "/x@x.org/", key)
+        key = re.sub("#.*$",       "", key)
+        key = re.sub("\?.*$",       "", key)
+        key = re.sub("/rfc[0-9]+/",   "/rfcnnnn/", key)
+        key = re.sub("/wg/[a-z0-9-]+/",   "/wg/foo/", key)
+        key = re.sub("/rg/[a-z0-9-]+/",   "/rg/foo/", key)
+        key = re.sub("/ipr/[0-9]+/",   "/ipr/nnnn/", key)
+        key = re.sub("/draft-[a-z0-9-]+/",   "/draft-foo/", key)
+        for slug in doc_types:
+            key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
+
     if not key in validated_urls:
         if hasattr(response, "content"):
             content = response.content
@@ -124,7 +136,10 @@ def check_html_valid(url, response, vnu):
                 tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
                 tags.append("\n\t%s%s" %
                             (" " * m["hiliteStart"], "^" * m["hiliteLength"]))
-                warnings += 1
+                # disregard some HTML issues that are (usually) due to invalid
+                # database content
+                if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
+                    warnings += 1
         else:
             try:
                 parser.parse(content)
@@ -157,6 +172,8 @@ def get_referrers(url):
 
 slow_threshold = args.slow_threshold
 vnu = args.vnu
+pedantic = args.pedantic
+user = args.user
 
 visited = set()
 urls = {} # url -> referrer
@@ -197,20 +214,29 @@ logfile = None
 if args.logfile:
     logfile = open(args.logfile, "w")
 
-validated_urls = {}
-
 # --- Main ---
 
 if __name__ == "__main__":
+    if (user):
+        # log in as user, to have the respective HTML generated by the templates
+        response = client.post('/accounts/login/',
+                               {'username': user, 'password': 'password'},
+                               secure=True, follow=True)
+        if (response.status_code != 200):
+            log("Could not log in as %s, HTML response %d" %
+                (user, response.status_code))
+            sys.exit(1)
 
     while urls:
-        url, referrer = urls.popitem()
+        # popitem() only promises an arbitrary item; in practice its order is not random
+        url = random.choice(urls.keys())
+        referrer = urls.pop(url)
 
         visited.add(url)
 
         try:
             timestamp = datetime.datetime.now()
-            r = client.get(url)
+            r = client.get(url, secure=True, follow=True)
             elapsed = datetime.datetime.now() - timestamp
         except KeyboardInterrupt:
             log(" ... was fetching %s" % url)
@@ -279,6 +305,8 @@ if __name__ == "__main__":
                 log("\nElapsed  Visited  Queue Code   Time  Url  ...  Notes")
 
             log("%2d:%02d:%02d %7d %6d  %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
+            if ((errors or warnings) and pedantic):
+                sys.exit(1)
 
     if logfile:
         logfile.close()