Skip to content

Commit 5826bcb

Browse files
committed
Add HTML5 validation based on validator.nu to test-crawl. Commit ready for merge.
- Legacy-Id: 9726
1 parent cfe7442 commit 5826bcb

2 files changed

Lines changed: 30 additions & 12 deletions

File tree

bin/test-crawl

Lines changed: 30 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22

3-
import os, sys, re, datetime, argparse, traceback, tempfile, json
3+
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
44
import html5lib
55
import debug # pyflakes:ignore
66

@@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
2222
help='responses taking longer than this (in seconds) results in SLOW being printed')
2323
parser.add_argument('--settings', dest='settings', help='custom settings file')
2424
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
25+
parser.add_argument('--vnu', action='store_true',
26+
help='Use validator.nu instead of html5lib for HTML validation')
2527

2628
args = parser.parse_args()
2729

@@ -92,7 +94,7 @@ def extract_tastypie_urls(content):
9294
uri = object_list[i]["resource_uri"]
9395
yield uri
9496

95-
def check_html_valid(url, response):
97+
def check_html_valid(url, response, vnu):
9698
global parser, validated_urls, doc_types, warnings
9799
# derive a key for urls like this by replacing primary keys
98100
key = url
@@ -110,15 +112,30 @@ def check_html_valid(url, response):
110112
content = response.content
111113
else:
112114
content = response.streaming_content
113-
try:
114-
validated_urls[key] = True
115-
parser.parse(content)
116-
except Exception:
117-
e = SyntaxWarning("ParseError")
118-
for err in parser.errors:
119-
pos, code, data = err
120-
tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
121-
warnings += 1
115+
validated_urls[key] = True
116+
if vnu:
117+
v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
118+
"--format", "json", "-"],
119+
stdin=subprocess.PIPE, stderr=subprocess.PIPE)
120+
for m in json.loads(v.communicate(content)[1])["messages"]:
121+
t = m["subType"] if m["type"] == "info" else m["type"]
122+
tags.append("\n%s\tLine %d: %s" %
123+
(t.upper(), m["lastLine"], m["message"]))
124+
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
125+
tags.append("\n\t%s%s" %
126+
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
127+
warnings += 1
128+
else:
129+
try:
130+
parser.parse(content)
131+
except Exception:
132+
e = SyntaxWarning("ParseError")
133+
for err in parser.errors:
134+
pos, code, data = err
135+
tags.append(u"WARN invalid html: Position %s: %s" %
136+
(pos, code))
137+
warnings += 1
138+
122139

123140
def log(s):
124141
print(s)
@@ -139,6 +156,7 @@ def get_referrers(url):
139156
# --- Globals ---
140157

141158
slow_threshold = args.slow_threshold
159+
vnu = args.vnu
142160

143161
visited = set()
144162
urls = {} # url -> referrer
@@ -224,7 +242,7 @@ if __name__ == "__main__":
224242
urls[u] = url
225243
referrers[u] = url
226244

227-
check_html_valid(url, r)
245+
check_html_valid(url, r, vnu)
228246

229247
except:
230248
log("error extracting HTML urls from %s" % url)

bin/vnu.jar

21.9 MB
Binary file not shown.

0 commit comments

Comments
 (0)