Skip to content

Commit 9b4e610

Browse files
committed
Add option to crawl as a logged-in user (--user).
Add --pedantic option for vnu crawl, which stops the crawl on (most) errors. Randomize the order in which URLs are crawled, so that repeated crawls don't hit the same URLs in the same order. Commit ready for merge. - Legacy-Id: 9765
1 parent cbf4cc7 commit 9b4e610

1 file changed

Lines changed: 48 additions & 20 deletions

File tree

bin/test-crawl

Lines changed: 48 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
44
import html5lib
55
import debug # pyflakes:ignore
6+
import random
67

78
# Set up import path to find our own Django
89
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -17,13 +18,16 @@ parser = argparse.ArgumentParser(
1718
parser.add_argument('urls', metavar='URL', nargs='*',
1819
help='One or more URLs to start the crawl from')
1920
parser.add_argument('--urls', '-u', dest='url_file',
20-
help='file with URLs to start the crawl from')
21+
help='File with URLs to start the crawl from')
2122
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
22-
help='responses taking longer than this (in seconds) results in SLOW being printed')
23-
parser.add_argument('--settings', dest='settings', help='custom settings file')
24-
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
23+
help='Responses taking longer than this (in seconds) results in SLOW being printed')
24+
parser.add_argument('--settings', dest='settings', help='Custom settings file')
25+
parser.add_argument('--logfile', dest='logfile', help='Write to logfile')
26+
parser.add_argument('--user', dest='user', help='Crawl logged in as this user')
2527
parser.add_argument('--vnu', action='store_true',
2628
help='Use validator.nu instead of html5lib for HTML validation')
29+
parser.add_argument('--pedantic', action='store_true',
30+
help='Check all pages and stop the crawl on the first HTML validation issue')
2731

2832
args = parser.parse_args()
2933

@@ -95,18 +99,26 @@ def extract_tastypie_urls(content):
9599
yield uri
96100

97101
def check_html_valid(url, response, vnu):
98-
global parser, validated_urls, doc_types, warnings
99-
# derive a key for urls like this by replacing primary keys
102+
global parser, validated_urls, doc_types, warnings, pedantic
103+
# These URLs have known issues, skip them until those are fixed
104+
if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
105+
log("%s blacklisted; skipping HTML validation" % url)
106+
return
100107
key = url
101-
key = re.sub("/[0-9.]+/", "/nnnn/", key)
102-
key = re.sub("/.+@.+/", "/x@x.org/", key)
103-
key = re.sub("#.*$", "", key)
104-
key = re.sub("\?.*$", "", key)
105-
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
106-
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
107-
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
108-
for slug in doc_types:
109-
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
108+
if not vnu:
109+
# derive a key for urls like this by replacing primary keys
110+
key = re.sub("/[0-9.]+/", "/nnnn/", key)
111+
key = re.sub("/.+@.+/", "/x@x.org/", key)
112+
key = re.sub("#.*$", "", key)
113+
key = re.sub("\?.*$", "", key)
114+
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
115+
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
116+
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
117+
key = re.sub("/ipr/[0-9]+/", "/ipr/nnnn/", key)
118+
key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key)
119+
for slug in doc_types:
120+
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
121+
110122
if not key in validated_urls:
111123
if hasattr(response, "content"):
112124
content = response.content
@@ -124,7 +136,10 @@ def check_html_valid(url, response, vnu):
124136
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
125137
tags.append("\n\t%s%s" %
126138
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
127-
warnings += 1
139+
# disregard some HTML issues that are (usually) due to invalid
140+
# database content
141+
if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
142+
warnings += 1
128143
else:
129144
try:
130145
parser.parse(content)
@@ -157,6 +172,8 @@ def get_referrers(url):
157172

158173
slow_threshold = args.slow_threshold
159174
vnu = args.vnu
175+
pedantic = args.pedantic
176+
user = args.user
160177

161178
visited = set()
162179
urls = {} # url -> referrer
@@ -197,20 +214,29 @@ logfile = None
197214
if args.logfile:
198215
logfile = open(args.logfile, "w")
199216

200-
validated_urls = {}
201-
202217
# --- Main ---
203218

204219
if __name__ == "__main__":
220+
if (user):
221+
# log in as user, to have the respective HTML generated by the templates
222+
response = client.post('/accounts/login/',
223+
{'username': user, 'password': 'password'},
224+
secure=True, follow=True)
225+
if (response.status_code != 200):
226+
log("Could not log in as %s, HTML response %d" %
227+
(user, response.status_code))
228+
sys.exit(1)
205229

206230
while urls:
207-
url, referrer = urls.popitem()
231+
# popitem() removes an arbitrary item, not a random one, so choose the URL at random explicitly
232+
url = random.choice(urls.keys())
233+
referrer = urls.pop(url)
208234

209235
visited.add(url)
210236

211237
try:
212238
timestamp = datetime.datetime.now()
213-
r = client.get(url)
239+
r = client.get(url, secure=True, follow=True)
214240
elapsed = datetime.datetime.now() - timestamp
215241
except KeyboardInterrupt:
216242
log(" ... was fetching %s" % url)
@@ -279,6 +305,8 @@ if __name__ == "__main__":
279305
log("\nElapsed Visited Queue Code Time Url ... Notes")
280306

281307
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
308+
if ((errors or warnings) and pedantic):
309+
sys.exit(1)
282310

283311
if logfile:
284312
logfile.close()

0 commit comments

Comments
 (0)