Skip to content

Commit 8612ce9

Browse files
committed
Merged in [9765] from lars@netapp.com:
Add option to crawl as a logged-in user (--user). Add --pedantic option for vnu crawl, which stops the crawl on (most) errors. Randomize the order in which URLs are crawled, so that repeated crawls don't hit the same URLs in the same order.
- Legacy-Id: 9785
Note: SVN reference [9765] has been migrated to Git commit 9b4e610
1 parent 23bcde6 commit 8612ce9

1 file changed

Lines changed: 52 additions & 19 deletions

File tree

bin/test-crawl

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
44
import html5lib
55
import debug # pyflakes:ignore
6+
import random
67

78
# Set up import path to find our own Django
89
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -17,13 +18,19 @@ parser = argparse.ArgumentParser(
1718
parser.add_argument('urls', metavar='URL', nargs='*',
1819
help='One or more URLs to start the crawl from')
1920
parser.add_argument('--urls', '-u', dest='url_file',
20-
help='file with URLs to start the crawl from')
21+
help='File with URLs to start the crawl from')
2122
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
22-
help='responses taking longer than this (in seconds) results in SLOW being printed')
23-
parser.add_argument('--settings', dest='settings', help='custom settings file')
24-
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
23+
help='Responses taking longer than this (in seconds) results in SLOW being printed')
24+
parser.add_argument('--settings', help='Custom settings file')
25+
parser.add_argument('--logfile', help='Write to logfile')
26+
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
2527
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
2628
help='Use validator.nu instead of html5lib for HTML validation')
29+
parser.add_argument('--pedantic', action='store_true',
30+
help='Stop the crawl on the first HTML validation issue')
31+
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
32+
help='Run html 5 validation on all pages, without skipping similar urls. '
33+
'(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
2734

2835
args = parser.parse_args()
2936

@@ -96,17 +103,25 @@ def extract_tastypie_urls(content):
96103

97104
def check_html_valid(url, response, args):
98105
global parser, validated_urls, doc_types, warnings
99-
# derive a key for urls like this by replacing primary keys
106+
# These URLs have known issues, skip them until those are fixed
107+
if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
108+
log("%s blacklisted; skipping HTML validation" % url)
109+
return
100110
key = url
101-
key = re.sub("/[0-9.]+/", "/nnnn/", key)
102-
key = re.sub("/.+@.+/", "/x@x.org/", key)
103-
key = re.sub("#.*$", "", key)
104-
key = re.sub("\?.*$", "", key)
105-
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
106-
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
107-
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
108-
for slug in doc_types:
109-
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
111+
if not args.validate_all:
112+
# derive a key for urls like this by replacing primary keys
113+
key = re.sub("/[0-9.]+/", "/nnnn/", key)
114+
key = re.sub("/.+@.+/", "/x@x.org/", key)
115+
key = re.sub("#.*$", "", key)
116+
key = re.sub("\?.*$", "", key)
117+
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
118+
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
119+
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
120+
key = re.sub("/ipr/[0-9]+/", "/ipr/nnnn/", key)
121+
key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key)
122+
for slug in doc_types:
123+
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
124+
110125
if not key in validated_urls:
111126
if hasattr(response, "content"):
112127
content = response.content
@@ -124,7 +139,10 @@ def check_html_valid(url, response, args):
124139
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
125140
tags.append("\n\t%s%s" %
126141
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
127-
warnings += 1
142+
# disregard some HTML issues that are (usually) due to invalid
143+
# database content
144+
if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
145+
warnings += 1
128146
else:
129147
try:
130148
parser.parse(content)
@@ -140,6 +158,8 @@ def check_html_valid(url, response, args):
140158
def log(s):
141159
print(s)
142160
if logfile:
161+
if not type(s) is str:
162+
s = s.encode('utf-8')
143163
logfile.write(s)
144164
logfile.write('\n')
145165

@@ -157,6 +177,8 @@ def get_referrers(url):
157177

158178
slow_threshold = args.slow_threshold
159179

180+
181+
160182
visited = set()
161183
urls = {} # url -> referrer
162184
referrers = {}
@@ -196,20 +218,29 @@ logfile = None
196218
if args.logfile:
197219
logfile = open(args.logfile, "w")
198220

199-
validated_urls = {}
200-
201221
# --- Main ---
202222

203223
if __name__ == "__main__":
224+
if (args.user):
225+
# log in as user, to have the respective HTML generated by the templates
226+
response = client.post('/accounts/login/',
227+
{'username': args.user, 'password': 'password'},
228+
secure=True, follow=True)
229+
if (response.status_code != 200):
230+
log("Could not log in as %s, HTML response %d" %
231+
(args.user, response.status_code))
232+
sys.exit(1)
204233

205234
while urls:
206-
url, referrer = urls.popitem()
235+
# popitem() is documented to be random, but really isn't
236+
url = random.choice(urls.keys())
237+
referrer = urls.pop(url)
207238

208239
visited.add(url)
209240

210241
try:
211242
timestamp = datetime.datetime.now()
212-
r = client.get(url)
243+
r = client.get(url, secure=True, follow=True)
213244
elapsed = datetime.datetime.now() - timestamp
214245
except KeyboardInterrupt:
215246
log(" ... was fetching %s" % url)
@@ -278,6 +309,8 @@ if __name__ == "__main__":
278309
log("\nElapsed Visited Queue Code Time Url ... Notes")
279310

280311
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
312+
if ((errors or warnings) and args.pedantic):
313+
sys.exit(1)
281314

282315
if logfile:
283316
logfile.close()

0 commit comments

Comments
 (0)