33import os , sys , re , datetime , argparse , traceback , tempfile , json , subprocess
44import html5lib
55import debug # pyflakes:ignore
6+ import random
67
78# Set up import path to find our own Django
89basedir = os .path .abspath (os .path .join (os .path .dirname (__file__ ), "../" ))
@@ -17,13 +18,19 @@ parser = argparse.ArgumentParser(
# --- Command-line options -------------------------------------------------
# Keyword arguments that merely restate argparse defaults (a dest derived
# from the long option name, default=None, default=False for store_true)
# are left out; the resulting namespace attributes are unchanged.

# Start URLs can be given directly on the command line ...
parser.add_argument(
    'urls',
    nargs='*',
    metavar='URL',
    help='One or more URLs to start the crawl from',
)
# ... and/or read from a file; the file name ends up in args.url_file.
parser.add_argument(
    '--urls', '-u',
    dest='url_file',
    help='File with URLs to start the crawl from',
)
# Threshold in seconds (float), stored as args.slow_threshold.
parser.add_argument(
    '--slow',
    dest='slow_threshold',
    default=1.0,
    type=float,
    help='Responses taking longer than this (in seconds) results in SLOW being printed',
)
parser.add_argument(
    '--settings',
    help='Custom settings file',
)
parser.add_argument(
    '--logfile',
    help='Write to logfile',
)
# args.user stays None unless the option is given.
parser.add_argument(
    '--user',
    help='Crawl logged in as this user',
)
# Boolean switches: each is False unless present on the command line.
parser.add_argument(
    '--validator-nu',
    action='store_true',
    help='Use validator.nu instead of html5lib for HTML validation',
)
parser.add_argument(
    '--pedantic',
    action='store_true',
    help='Stop the crawl on the first HTML validation issue',
)
parser.add_argument(
    '--validate-all',
    action='store_true',
    help=('Run html 5 validation on all pages, without skipping similar urls. '
          '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)'),
)

args = parser.parse_args()
2936
@@ -96,17 +103,25 @@ def extract_tastypie_urls(content):
96103
97104def check_html_valid (url , response , args ):
98105 global parser , validated_urls , doc_types , warnings
99- # derive a key for urls like this by replacing primary keys
106+ # These URLs have known issues, skip them until those are fixed
107+ if re .search ('(/secr|admin/)|/doc/.*/edit/info/' , url ):
108+ log ("%s blacklisted; skipping HTML validation" % url )
109+ return
100110 key = url
101- key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
102- key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
103- key = re .sub ("#.*$" , "" , key )
104- key = re .sub ("\?.*$" , "" , key )
105- key = re .sub ("/rfc[0-9]+/" , "/rfcnnnn/" , key )
106- key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
107- key = re .sub ("/rg/[a-z0-9-]+/" , "/rg/foo/" , key )
108- for slug in doc_types :
109- key = re .sub ("/%s-.*/" % slug , "/%s-nnnn/" % slug , key )
111+ if not args .validate_all :
112+ # derive a key for urls like this by replacing primary keys
113+ key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
114+ key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
115+ key = re .sub ("#.*$" , "" , key )
116+ key = re .sub ("\?.*$" , "" , key )
117+ key = re .sub ("/rfc[0-9]+/" , "/rfcnnnn/" , key )
118+ key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
119+ key = re .sub ("/rg/[a-z0-9-]+/" , "/rg/foo/" , key )
120+ key = re .sub ("/ipr/[0-9]+/" , "/ipr/nnnn/" , key )
121+ key = re .sub ("/draft-[a-z0-9-]+/" , "/draft-foo/" , key )
122+ for slug in doc_types :
123+ key = re .sub ("/%s-.*/" % slug , "/%s-nnnn/" % slug , key )
124+
110125 if not key in validated_urls :
111126 if hasattr (response , "content" ):
112127 content = response .content
@@ -124,7 +139,10 @@ def check_html_valid(url, response, args):
124139 tags .append ("\n \t %s" % m ["extract" ].replace ('\n ' , ' ' ))
125140 tags .append ("\n \t %s%s" %
126141 (" " * m ["hiliteStart" ], "^" * m ["hiliteLength" ]))
127- warnings += 1
142+ # disregard some HTML issues that are (usually) due to invalid
143+ # database content
144+ if not re .search ('Forbidden code point|Bad value|seamless|The first child' , m ["message" ]):
145+ warnings += 1
128146 else :
129147 try :
130148 parser .parse (content )
@@ -140,6 +158,8 @@ def check_html_valid(url, response, args):
def log(s):
    """Print *s* to stdout and, if a log file is open, append it there too.

    The module-level ``logfile`` (a file object, or a false value when no
    log file was requested) receives the message followed by a newline.

    s -- the message. Normally a string; bytes and arbitrary objects
         (numbers, exceptions, ...) are accepted as well and converted to
         the interpreter's native ``str`` before being written, so the
         write works on both Python 2 and Python 3.
    """
    print(s)
    if not logfile:
        return

    text_type = type(u"")
    if isinstance(s, text_type) and text_type is not str:
        # Python 2 ``unicode``: encode explicitly instead of relying on the
        # implicit ASCII encoding done by file.write().
        s = s.encode("utf-8")
    elif isinstance(s, bytes) and bytes is not str:
        # Python 3 ``bytes``: a text-mode file only accepts ``str``.
        s = s.decode("utf-8", "replace")
    elif not isinstance(s, str):
        # Anything else (int, exception, ...) is logged via its str() form.
        s = str(s)

    logfile.write(s)
    logfile.write("\n")
145165
@@ -157,6 +177,8 @@ def get_referrers(url):
157177
158178slow_threshold = args .slow_threshold
159179
180+
181+
160182visited = set ()
161183urls = {} # url -> referrer
162184referrers = {}
@@ -196,20 +218,29 @@ logfile = None
196218if args .logfile :
197219 logfile = open (args .logfile , "w" )
198220
199- validated_urls = {}
200-
201221# --- Main ---
202222
203223if __name__ == "__main__" :
224+ if (args .user ):
225+ # log in as user, to have the respective HTML generated by the templates
226+ response = client .post ('/accounts/login/' ,
227+ {'username' : args .user , 'password' : 'password' },
228+ secure = True , follow = True )
229+ if (response .status_code != 200 ):
230+ log ("Could not log in as %s, HTML response %d" %
231+ (args .user , response .status_code ))
232+ sys .exit (1 )
204233
205234 while urls :
206- url , referrer = urls .popitem ()
235+ # popitem() returns an arbitrary entry, which in practice is not random,
235+ # so pick the next URL at random explicitly
236+ url = random .choice (urls .keys ())
237+ referrer = urls .pop (url )
207238
208239 visited .add (url )
209240
210241 try :
211242 timestamp = datetime .datetime .now ()
212- r = client .get (url )
243+ r = client .get (url , secure = True , follow = True )
213244 elapsed = datetime .datetime .now () - timestamp
214245 except KeyboardInterrupt :
215246 log (" ... was fetching %s" % url )
@@ -278,6 +309,8 @@ if __name__ == "__main__":
278309 log ("\n Elapsed Visited Queue Code Time Url ... Notes" )
279310
280311 log ("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs ,min ,sec , len (visited ), len (urls ), r .status_code , elapsed .total_seconds (), url , " " .join (tags )))
312+ if ((errors or warnings ) and args .pedantic ):
313+ sys .exit (1 )
281314
282315 if logfile :
283316 logfile .close ()
0 commit comments