33import os , sys , re , datetime , argparse , traceback , tempfile , json , subprocess
44import html5lib
55import debug # pyflakes:ignore
6+ import random
67
78# Set up import path to find our own Django
89basedir = os .path .abspath (os .path .join (os .path .dirname (__file__ ), "../" ))
@@ -17,13 +18,16 @@ parser = argparse.ArgumentParser(
1718parser .add_argument ('urls' , metavar = 'URL' , nargs = '*' ,
1819 help = 'One or more URLs to start the crawl from' )
1920parser .add_argument ('--urls' , '-u' , dest = 'url_file' ,
20- help = 'file with URLs to start the crawl from' )
21+ help = 'File with URLs to start the crawl from' )
2122parser .add_argument ('--slow' , dest = 'slow_threshold' , type = float , default = 1.0 ,
22- help = 'responses taking longer than this (in seconds) results in SLOW being printed' )
23- parser .add_argument ('--settings' , dest = 'settings' , help = 'custom settings file' )
24- parser .add_argument ('--logfile' , dest = 'logfile' , help = 'write to logfile' )
23+ help = 'Responses taking longer than this (in seconds) results in SLOW being printed' )
24+ parser .add_argument ('--settings' , dest = 'settings' , help = 'Custom settings file' )
25+ parser .add_argument ('--logfile' , dest = 'logfile' , help = 'Write to logfile' )
26+ parser .add_argument ('--user' , dest = 'user' , help = 'Crawl logged in as this user' )
2527parser .add_argument ('--vnu' , action = 'store_true' ,
2628 help = 'Use validator.nu instead of html5lib for HTML validation' )
29+ parser .add_argument ('--pedantic' , action = 'store_true' ,
30+ help = 'Check all pages and stop the crawl on the first HTML validation issue' )
2731
2832args = parser .parse_args ()
2933
@@ -95,18 +99,26 @@ def extract_tastypie_urls(content):
9599 yield uri
96100
97101def check_html_valid (url , response , vnu ):
98- global parser , validated_urls , doc_types , warnings
99- # derive a key for urls like this by replacing primary keys
102+ global parser , validated_urls , doc_types , warnings , pedantic
103+ # These URLs have known issues; skip HTML validation for them until those are fixed
104+ if re .search ('(/secr|admin/)|/doc/.*/edit/info/' , url ):
105+ log ("%s blacklisted; skipping HTML validation" % url )
106+ return
100107 key = url
101- key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
102- key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
103- key = re .sub ("#.*$" , "" , key )
104- key = re .sub ("\?.*$" , "" , key )
105- key = re .sub ("/rfc[0-9]+/" , "/rfcnnnn/" , key )
106- key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
107- key = re .sub ("/rg/[a-z0-9-]+/" , "/rg/foo/" , key )
108- for slug in doc_types :
109- key = re .sub ("/%s-.*/" % slug , "/%s-nnnn/" % slug , key )
108+ if not vnu :
109+ # Derive a shared key for similar URLs by replacing primary keys with placeholders
110+ key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
111+ key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
112+ key = re .sub ("#.*$" , "" , key )
113+ key = re .sub ("\?.*$" , "" , key )
114+ key = re .sub ("/rfc[0-9]+/" , "/rfcnnnn/" , key )
115+ key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
116+ key = re .sub ("/rg/[a-z0-9-]+/" , "/rg/foo/" , key )
117+ key = re .sub ("/ipr/[0-9]+/" , "/ipr/nnnn/" , key )
118+ key = re .sub ("/draft-[a-z0-9-]+/" , "/draft-foo/" , key )
119+ for slug in doc_types :
120+ key = re .sub ("/%s-.*/" % slug , "/%s-nnnn/" % slug , key )
121+
110122 if not key in validated_urls :
111123 if hasattr (response , "content" ):
112124 content = response .content
@@ -124,7 +136,10 @@ def check_html_valid(url, response, vnu):
124136 tags .append ("\n \t %s" % m ["extract" ].replace ('\n ' , ' ' ))
125137 tags .append ("\n \t %s%s" %
126138 (" " * m ["hiliteStart" ], "^" * m ["hiliteLength" ]))
127- warnings += 1
139+ # Do not count as warnings those HTML issues that are (usually) due to
140+ # invalid database content
141+ if not re .search ('Forbidden code point|Bad value|seamless|The first child' , m ["message" ]):
142+ warnings += 1
128143 else :
129144 try :
130145 parser .parse (content )
@@ -157,6 +172,8 @@ def get_referrers(url):
157172
158173slow_threshold = args .slow_threshold
159174vnu = args .vnu
175+ pedantic = args .pedantic
176+ user = args .user
160177
161178visited = set ()
162179urls = {} # url -> referrer
@@ -197,20 +214,29 @@ logfile = None
197214if args .logfile :
198215 logfile = open (args .logfile , "w" )
199216
200- validated_urls = {}
201-
202217# --- Main ---
203218
204219if __name__ == "__main__" :
220+ if (user ):
221+ # Log in as the given user so that the templates generate the HTML that user would see
222+ response = client .post ('/accounts/login/' ,
223+ {'username' : user , 'password' : 'password' },
224+ secure = True , follow = True )
225+ if (response .status_code != 200 ):
226+ log ("Could not log in as %s, HTML response %d" %
227+ (user , response .status_code ))
228+ sys .exit (1 )
205229
206230 while urls :
207- url , referrer = urls .popitem ()
231+ # popitem() returns an arbitrary item, which in practice is not random; pick the next URL at random instead
232+ url = random .choice (urls .keys ())
233+ referrer = urls .pop (url )
208234
209235 visited .add (url )
210236
211237 try :
212238 timestamp = datetime .datetime .now ()
213- r = client .get (url )
239+ r = client .get (url , secure = True , follow = True )
214240 elapsed = datetime .datetime .now () - timestamp
215241 except KeyboardInterrupt :
216242 log (" ... was fetching %s" % url )
@@ -279,6 +305,8 @@ if __name__ == "__main__":
279305 log ("\n Elapsed Visited Queue Code Time Url ... Notes" )
280306
281307 log ("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs ,min ,sec , len (visited ), len (urls ), r .status_code , elapsed .total_seconds (), url , " " .join (tags )))
308+ if ((errors or warnings ) and pedantic ):
309+ sys .exit (1 )
282310
283311 if logfile :
284312 logfile .close ()
0 commit comments