11#!/usr/bin/env python
22
3- import os , sys , re , datetime , argparse , traceback , tempfile , json
3+ import os , sys , re , datetime , argparse , traceback , tempfile , json , subprocess
44import html5lib
55import debug # pyflakes:ignore
66
@@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
2222 help = 'responses taking longer than this (in seconds) results in SLOW being printed' )
2323parser .add_argument ('--settings' , dest = 'settings' , help = 'custom settings file' )
2424parser .add_argument ('--logfile' , dest = 'logfile' , help = 'write to logfile' )
25+ parser .add_argument ('--vnu' , action = 'store_true' ,
26+ help = 'Use validator.nu instead of html5lib for HTML validation' )
2527
2628args = parser .parse_args ()
2729
@@ -92,7 +94,7 @@ def extract_tastypie_urls(content):
9294 uri = object_list [i ]["resource_uri" ]
9395 yield uri
9496
95- def check_html_valid (url , response ):
97+ def check_html_valid (url , response , vnu ):
9698 global parser , validated_urls , doc_types , warnings
9799 # derive a key for urls like this by replacing primary keys
98100 key = url
@@ -110,15 +112,30 @@ def check_html_valid(url, response):
110112 content = response .content
111113 else :
112114 content = response .streaming_content
113- try :
114- validated_urls [key ] = True
115- parser .parse (content )
116- except Exception :
117- e = SyntaxWarning ("ParseError" )
118- for err in parser .errors :
119- pos , code , data = err
120- tags .append (u"WARN invalid html: Position %s: %s" % (pos , code ))
121- warnings += 1
115+ validated_urls [key ] = True
116+ if vnu :
117+ v = subprocess .Popen (["java" , "-jar" , basedir + "/bin/vnu.jar" ,
118+ "--format" , "json" , "-" ],
119+ stdin = subprocess .PIPE , stderr = subprocess .PIPE )
120+ for m in json .loads (v .communicate (content )[1 ])["messages" ]:
121+ t = m ["subType" ] if m ["type" ] == "info" else m ["type" ]
122+ tags .append ("\n %s\t Line %d: %s" %
123+ (t .upper (), m ["lastLine" ], m ["message" ]))
124+ tags .append ("\n \t %s" % m ["extract" ].replace ('\n ' , ' ' ))
125+ tags .append ("\n \t %s%s" %
126+ (" " * m ["hiliteStart" ], "^" * m ["hiliteLength" ]))
127+ warnings += 1
128+ else :
129+ try :
130+ parser .parse (content )
131+ except Exception :
132+ e = SyntaxWarning ("ParseError" )
133+ for err in parser .errors :
134+ pos , code , data = err
135+ tags .append (u"WARN invalid html: Position %s: %s" %
136+ (pos , code ))
137+ warnings += 1
138+
122139
123140def log (s ):
124141 print (s )
@@ -139,6 +156,7 @@ def get_referrers(url):
139156# --- Globals ---
140157
141158slow_threshold = args .slow_threshold
159+ vnu = args .vnu
142160
143161visited = set ()
144162urls = {} # url -> referrer
@@ -224,7 +242,7 @@ if __name__ == "__main__":
224242 urls [u ] = url
225243 referrers [u ] = url
226244
227- check_html_valid (url , r )
245+ check_html_valid (url , r , vnu )
228246
229247 except :
230248 log ("error extracting HTML urls from %s" % url )
0 commit comments