Skip to content

Commit e32af56

Browse files
committed
Added HTML validation to the test crawler; it now reports HTML that fails validation with 'WARN' indications. Reorganized the code somewhat, collecting functions, globals, etc. into groups.
- Legacy-Id: 9549
1 parent 6055215 commit e32af56

1 file changed

Lines changed: 174 additions & 113 deletions

File tree

bin/test-crawl

Lines changed: 174 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
11
#!/usr/bin/env python
22

33
import os, sys, re, datetime, argparse, traceback, tempfile, json
4+
import html5lib
5+
import debug # pyflakes:ignore
46

5-
# args
7+
# Set up import path to find our own Django
8+
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
9+
if not basedir in sys.path:
10+
sys.path.insert(0, basedir)
11+
12+
# Parse args now, so we can use custom settings when importing django
613
parser = argparse.ArgumentParser(
714
description="""Perform a test crawl of the project. For each found URL, the HTTP
815
response status is printed. If it's not OK/redirect, FAIL is
@@ -18,45 +25,28 @@ parser.add_argument('--logfile', dest='logfile', help='write to logfile')
1825

1926
args = parser.parse_args()
2027

21-
# boilerplate
22-
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
23-
sys.path = [ basedir ] + sys.path
24-
25-
settings_module = args.settings or "ietf.settings"
26-
27-
os.environ.setdefault("DJANGO_SETTINGS_MODULE", settings_module)
28+
# Import Django, call setup()
29+
os.environ.setdefault("DJANGO_SETTINGS_MODULE", args.settings or "ietf.settings")
2830

2931
import django
3032
import django.test
3133

34+
django.setup()
35+
3236
# prevent memory from leaking when settings.DEBUG=True
3337
from django.db import connection
3438
class DontSaveQueries(object):
3539
def append(self, x):
3640
pass
3741
connection.queries = DontSaveQueries()
3842

39-
MAX_URL_LENGTH = 500
40-
41-
slow_threshold = args.slow_threshold
43+
from ietf.name.models import DocTypeName
4244

43-
initial_urls = []
44-
initial_urls.extend(args.urls)
45+
# --- Constants ---
4546

46-
if args.url_file:
47-
with open(args.url_file) as f:
48-
for line in f:
49-
line = line.partition("#")[0].strip()
50-
if line:
51-
initial_urls.append(line)
47+
MAX_URL_LENGTH = 500
5248

53-
if not initial_urls:
54-
initial_urls.append("/")
55-
initial_urls.append("/api/v1")
56-
57-
visited = set()
58-
urls = {} # url -> referrer
59-
referrers = {}
49+
# --- Functions ---
6050

6151
def strip_url(url):
6252
if url.startswith("http://testserver"):
@@ -102,20 +92,32 @@ def extract_tastypie_urls(content):
10292
uri = object_list[i]["resource_uri"]
10393
yield uri
10494

105-
django.setup()
106-
client = django.test.Client(Accept='text/html,text/plain,application/json')
107-
108-
for url in initial_urls:
109-
urls[url] = "[initial]"
110-
111-
errors = 0
112-
count = 0
113-
114-
start_time = datetime.datetime.now()
115-
116-
logfile = None
117-
if args.logfile:
118-
logfile = open(args.logfile, "w")
95+
def check_html_valid(url, response):
96+
global parser, validated_urls, doc_types, warnings
97+
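# Derive a key shared by all URLs of the same form, by replacing primary keys and other variable parts (numbers, email addresses, RFC numbers, WG acronyms, document names) with placeholders and dropping any fragment and query string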
98+
key = url
99+
key = re.sub("/[0-9.]+/", "/nnnn/", key)
100+
key = re.sub("/.+@.+/", "/x@x.org/", key)
101+
key = re.sub("#.*$", "", key)
102+
key = re.sub("\?.*$", "", key)
103+
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
104+
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
105+
for slug in doc_types:
106+
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
107+
if not key in validated_urls:
108+
if hasattr(response, "content"):
109+
content = response.content
110+
else:
111+
content = response.streaming_content
112+
try:
113+
validated_urls[key] = True
114+
parser.parse(content)
115+
except Exception:
116+
e = SyntaxWarning("ParseError")
117+
for err in parser.errors:
118+
pos, code, data = err
119+
tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
120+
warnings += 1
119121

120122
def log(s):
121123
print(s)
@@ -133,84 +135,143 @@ def get_referrers(url):
133135
ref_list.append(url)
134136
return ref_list
135137

136-
while urls:
137-
url, referrer = urls.popitem()
138+
# --- Globals ---
138139

139-
visited.add(url)
140+
slow_threshold = args.slow_threshold
140141

141-
try:
142-
timestamp = datetime.datetime.now()
143-
r = client.get(url)
144-
elapsed = datetime.datetime.now() - timestamp
145-
except KeyboardInterrupt:
146-
log(" ... was fetching %s" % url)
147-
sys.exit(1)
148-
except:
149-
log("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime.datetime.now() - timestamp).total_seconds(), url, (",\n\t".join(get_referrers(url)))))
150-
log("=============")
151-
log(traceback.format_exc())
152-
log("=============")
153-
errors += 1
154-
else:
155-
tags = []
156-
157-
if r.status_code in (301, 302):
158-
u = strip_url(r["Location"])
159-
if u not in visited and u not in urls:
160-
urls[u] = referrer # referrer is original referrer, not redirected url
161-
referrers[u] = referrer
162-
163-
elif r.status_code == 200:
164-
ctype = r["Content-Type"]
165-
if ";" in ctype:
166-
ctype = ctype[:ctype.index(";")]
167-
168-
if ctype == "text/html":
169-
try:
170-
for u in extract_html_urls(r.content):
171-
if u not in visited and u not in urls:
172-
urls[u] = url
173-
referrers[u] = url
174-
except:
175-
log("error extracting HTML urls from %s" % url)
176-
log("=============")
177-
log(traceback.format_exc())
178-
log("=============")
179-
elif ctype == "application/json":
180-
try:
181-
for u in extract_tastypie_urls(r.content):
182-
if u not in visited and u not in urls:
183-
urls[u] = url
184-
referrers[u] = url
185-
except:
186-
log("error extracting urls from %s" % url)
187-
log("=============")
188-
log(traceback.format_exc())
189-
log("=============")
190-
else:
191-
tags.append(u"FAIL for %s\n (from %s)" % (url, referrer))
192-
errors += 1
142+
visited = set()
143+
urls = {} # url -> referrer
144+
referrers = {}
145+
146+
initial_urls = []
147+
initial_urls.extend(args.urls)
148+
149+
if args.url_file:
150+
with open(args.url_file) as f:
151+
for line in f:
152+
line = line.partition("#")[0].strip()
153+
if line:
154+
initial_urls.append(line)
155+
156+
if not initial_urls:
157+
initial_urls.append("/")
158+
initial_urls.append("/api/v1")
159+
160+
for url in initial_urls:
161+
urls[url] = "[initial]"
162+
163+
parser = html5lib.HTMLParser(strict=True)
193164

194-
if elapsed.total_seconds() > slow_threshold:
195-
tags.append("SLOW")
165+
validated_urls = {}
196166

197-
acc_time = (timestamp - start_time).total_seconds()
198-
acc_secs = (timestamp - start_time).total_seconds()
199-
hrs = acc_secs // (60*60)
200-
min = (acc_secs % (60*60)) // 60
201-
sec = acc_secs % 60
167+
doc_types = [ t.slug for t in DocTypeName.objects.all() ]
202168

203-
if (len(visited) % 100) == 1:
204-
log("\nElapsed Visited Queue Code Time Url ... Notes")
169+
errors = 0
170+
warnings = 0
171+
count = 0
172+
173+
start_time = datetime.datetime.now()
174+
175+
client = django.test.Client(Accept='text/html,text/plain,application/json')
176+
177+
logfile = None
178+
if args.logfile:
179+
logfile = open(args.logfile, "w")
180+
181+
validated_urls = {}
205182

206-
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
183+
# --- Main ---
207184

208-
if logfile:
209-
logfile.close()
210-
sys.stderr.write("Output written to %s\n\n" % logfile.name)
185+
if __name__ == "__main__":
211186

212-
if errors > 0:
213-
sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
214-
sys.exit(1)
215-
else:
216-
sys.stderr.write("Found no errors.")
187+
while urls:
188+
url, referrer = urls.popitem()
189+
190+
visited.add(url)
191+
192+
try:
193+
timestamp = datetime.datetime.now()
194+
r = client.get(url)
195+
elapsed = datetime.datetime.now() - timestamp
196+
except KeyboardInterrupt:
197+
log(" ... was fetching %s" % url)
198+
sys.exit(1)
199+
except:
200+
log("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime.datetime.now() - timestamp).total_seconds(), url, (",\n\t".join(get_referrers(url)))))
201+
log("=============")
202+
log(traceback.format_exc())
203+
log("=============")
204+
errors += 1
205+
else:
206+
tags = []
207+
208+
if r.status_code in (301, 302):
209+
u = strip_url(r["Location"])
210+
if u not in visited and u not in urls:
211+
urls[u] = referrer # referrer is original referrer, not redirected url
212+
referrers[u] = referrer
213+
214+
elif r.status_code == 200:
215+
ctype = r["Content-Type"]
216+
if ";" in ctype:
217+
ctype = ctype[:ctype.index(";")]
218+
219+
if ctype == "text/html":
220+
try:
221+
for u in extract_html_urls(r.content):
222+
if u not in visited and u not in urls:
223+
urls[u] = url
224+
referrers[u] = url
225+
226+
check_html_valid(url, r)
227+
228+
except:
229+
log("error extracting HTML urls from %s" % url)
230+
log("=============")
231+
log(traceback.format_exc())
232+
log("=============")
233+
234+
elif ctype == "application/json":
235+
try:
236+
for u in extract_tastypie_urls(r.content):
237+
if u not in visited and u not in urls:
238+
urls[u] = url
239+
referrers[u] = url
240+
except:
241+
log("error extracting urls from %s" % url)
242+
log("=============")
243+
log(traceback.format_exc())
244+
log("=============")
245+
246+
else:
247+
tags.append(u"FAIL for %s\n (from %s)" % (url, referrer))
248+
errors += 1
249+
250+
if elapsed.total_seconds() > slow_threshold:
251+
tags.append("SLOW")
252+
253+
acc_time = (timestamp - start_time).total_seconds()
254+
acc_secs = (timestamp - start_time).total_seconds()
255+
hrs = acc_secs // (60*60)
256+
min = (acc_secs % (60*60)) // 60
257+
sec = acc_secs % 60
258+
259+
if (len(visited) % 100) == 1:
260+
log("\nElapsed Visited Queue Code Time Url ... Notes")
261+
262+
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
263+
264+
if logfile:
265+
logfile.close()
266+
sys.stderr.write("Output written to %s\n\n" % logfile.name)
267+
268+
if errors > 0:
269+
sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
270+
sys.exit(1)
271+
else:
272+
sys.stderr.write("Found no errors.")
273+
if warnings > 0:
274+
sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
275+
sys.exit(1)
276+
else:
277+
sys.stderr.write("Found no warnings.")

0 commit comments

Comments
 (0)