11#!/usr/bin/env python
22
33import os , sys , re , datetime , argparse , traceback , tempfile , json
4+ import html5lib
5+ import debug # pyflakes:ignore
46
5- # args
# Set up import path to find our own Django: the parent of the directory
# holding this script goes first on sys.path (unless already present) so
# that the project's own packages win over any installed copies.
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
if basedir not in sys.path:
    sys.path.insert(0, basedir)
11+
12+ # Parse args now, so we can use custom settings when importing django
613parser = argparse .ArgumentParser (
714 description = """Perform a test crawl of the project. For each found URL, the HTTP
815 response status is printed. If it's not OK/redirect, FAIL is
@@ -18,45 +25,28 @@ parser.add_argument('--logfile', dest='logfile', help='write to logfile')
1825
1926args = parser .parse_args ()
2027
21- # boilerplate
22- basedir = os .path .abspath (os .path .join (os .path .dirname (__file__ ), "../" ))
23- sys .path = [ basedir ] + sys .path
24-
25- settings_module = args .settings or "ietf.settings"
26-
27- os .environ .setdefault ("DJANGO_SETTINGS_MODULE" , settings_module )
28+ # Import Django, call setup()
29+ os .environ .setdefault ("DJANGO_SETTINGS_MODULE" , args .settings or "ietf.settings" )
2830
2931import django
3032import django .test
3133
34+ django .setup ()
35+
3236# prevent memory from leaking when settings.DEBUG=True
3337from django .db import connection
class DontSaveQueries(object):
    """Stand-in for a query log whose ``append`` discards every entry.

    It is assigned to ``connection.queries`` so that nothing accumulates
    there while the crawl runs (see the comment above this class about
    memory growth when settings.DEBUG=True).
    """

    def append(self, x):
        """Ignore *x*; nothing is stored."""
        return None
3741connection .queries = DontSaveQueries ()
3842
39- MAX_URL_LENGTH = 500
40-
41- slow_threshold = args .slow_threshold
43+ from ietf .name .models import DocTypeName
4244
43- initial_urls = []
44- initial_urls .extend (args .urls )
45+ # --- Constants ---
4546
46- if args .url_file :
47- with open (args .url_file ) as f :
48- for line in f :
49- line = line .partition ("#" )[0 ].strip ()
50- if line :
51- initial_urls .append (line )
47+ MAX_URL_LENGTH = 500
5248
53- if not initial_urls :
54- initial_urls .append ("/" )
55- initial_urls .append ("/api/v1" )
56-
57- visited = set ()
58- urls = {} # url -> referrer
59- referrers = {}
49+ # --- Functions ---
6050
6151def strip_url (url ):
6252 if url .startswith ("http://testserver" ):
@@ -102,20 +92,32 @@ def extract_tastypie_urls(content):
10292 uri = object_list [i ]["resource_uri" ]
10393 yield uri
10494
105- django .setup ()
106- client = django .test .Client (Accept = 'text/html,text/plain,application/json' )
107-
108- for url in initial_urls :
109- urls [url ] = "[initial]"
110-
111- errors = 0
112- count = 0
113-
114- start_time = datetime .datetime .now ()
115-
116- logfile = None
117- if args .logfile :
118- logfile = open (args .logfile , "w" )
def check_html_valid(url, response):
    """Run the response body through the HTML5 parser, once per kind of URL.

    Many crawled URLs differ only in a primary key (a number, an e-mail
    address, an RFC number, a group acronym, a document name).  To avoid
    validating the same template over and over, *url* is reduced to a
    generic key and the body is parsed only the first time a key is seen.

    Parameters:
        url      -- the URL that was fetched (a string).
        response -- the test-client response; either ``response.content``
                    or, when that attribute is missing,
                    ``response.streaming_content`` is parsed.

    Module-level state used:
        parser         -- the html5lib parser instance (read).
        doc_types      -- iterable of document-type slugs (read).
        validated_urls -- dict of keys already handled (updated in place).
        tags           -- note list for the URL currently being crawled;
                          one "WARN invalid html" entry is appended per
                          reported parse error.
        warnings       -- counter, incremented once per reported error.

    Returns None.
    """
    global warnings  # the only module-level name that is rebound here

    # Derive a key for urls like this by replacing primary keys.
    key = url
    key = re.sub(r"/[0-9.]+/", "/nnnn/", key)
    key = re.sub(r"/.+@.+/", "/x@x.org/", key)
    key = re.sub(r"#.*$", "", key)
    key = re.sub(r"\?.*$", "", key)
    key = re.sub(r"/rfc[0-9]+/", "/rfcnnnn/", key)
    key = re.sub(r"/wg/[a-z0-9-]+/", "/wg/foo/", key)
    for slug in doc_types:
        # Escape the slug so that it is always matched literally.
        key = re.sub("/%s-.*/" % re.escape(slug), "/%s-nnnn/" % slug, key)

    if key in validated_urls:
        return
    # Mark the key before parsing so that a failing page is reported once.
    validated_urls[key] = True

    if hasattr(response, "content"):
        content = response.content
    else:
        # streaming_content is an iterator of chunks; the parser needs the
        # whole document, so join the chunks first.
        content = b"".join(response.streaming_content)

    try:
        parser.parse(content)
    except Exception:
        # NOTE(review): the module-level parser is created with strict=True,
        # which presumably makes html5lib raise on the first parse error;
        # whatever it recorded in parser.errors is reported here.
        for pos, code, _data in parser.errors:
            tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
            warnings += 1
119121
120122def log (s ):
121123 print (s )
@@ -133,84 +135,143 @@ def get_referrers(url):
133135 ref_list .append (url )
134136 return ref_list
135137
136- while urls :
137- url , referrer = urls .popitem ()
138+ # --- Globals ---
138139
139- visited . add ( url )
140+ slow_threshold = args . slow_threshold
140141
141- try :
142- timestamp = datetime .datetime .now ()
143- r = client .get (url )
144- elapsed = datetime .datetime .now () - timestamp
145- except KeyboardInterrupt :
146- log (" ... was fetching %s" % url )
147- sys .exit (1 )
148- except :
149- log ("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime .datetime .now () - timestamp ).total_seconds (), url , (",\n \t " .join (get_referrers (url )))))
150- log ("=============" )
151- log (traceback .format_exc ())
152- log ("=============" )
153- errors += 1
154- else :
155- tags = []
156-
157- if r .status_code in (301 , 302 ):
158- u = strip_url (r ["Location" ])
159- if u not in visited and u not in urls :
160- urls [u ] = referrer # referrer is original referrer, not redirected url
161- referrers [u ] = referrer
162-
163- elif r .status_code == 200 :
164- ctype = r ["Content-Type" ]
165- if ";" in ctype :
166- ctype = ctype [:ctype .index (";" )]
167-
168- if ctype == "text/html" :
169- try :
170- for u in extract_html_urls (r .content ):
171- if u not in visited and u not in urls :
172- urls [u ] = url
173- referrers [u ] = url
174- except :
175- log ("error extracting HTML urls from %s" % url )
176- log ("=============" )
177- log (traceback .format_exc ())
178- log ("=============" )
179- elif ctype == "application/json" :
180- try :
181- for u in extract_tastypie_urls (r .content ):
182- if u not in visited and u not in urls :
183- urls [u ] = url
184- referrers [u ] = url
185- except :
186- log ("error extracting urls from %s" % url )
187- log ("=============" )
188- log (traceback .format_exc ())
189- log ("=============" )
190- else :
191- tags .append (u"FAIL for %s\n (from %s)" % (url , referrer ))
192- errors += 1
# Crawl bookkeeping: URLs already fetched, the pending queue, and for each
# discovered URL the page it was found on.
visited = set()
urls = {}  # url -> referrer
referrers = {}

# Seed URLs: first those given on the command line ...
initial_urls = list(args.urls)

# ... then those listed in the optional URL file.  Everything after a "#"
# on a line is ignored, as are lines that end up empty.
if args.url_file:
    with open(args.url_file) as f:
        candidates = (raw.partition("#")[0].strip() for raw in f)
        initial_urls.extend(entry for entry in candidates if entry)

# With no seeds at all, start from the site root and the API root.
if not initial_urls:
    initial_urls += ["/", "/api/v1"]

urls.update((seed, "[initial]") for seed in initial_urls)

# Strict HTML5 parser used for validating fetched pages.
parser = html5lib.HTMLParser(strict=True)
193164
194- if elapsed .total_seconds () > slow_threshold :
195- tags .append ("SLOW" )
165+ validated_urls = {}
196166
197- acc_time = (timestamp - start_time ).total_seconds ()
198- acc_secs = (timestamp - start_time ).total_seconds ()
199- hrs = acc_secs // (60 * 60 )
200- min = (acc_secs % (60 * 60 )) // 60
201- sec = acc_secs % 60
167+ doc_types = [ t .slug for t in DocTypeName .objects .all () ]
202168
203- if (len (visited ) % 100 ) == 1 :
204- log ("\n Elapsed Visited Queue Code Time Url ... Notes" )
# Running totals for the crawl.
errors = 0
warnings = 0
count = 0

# Reference point for the "elapsed" column of the crawl log.
start_time = datetime.datetime.now()

# In-process Django test client; no real HTTP server is involved.
client = django.test.Client(Accept='text/html,text/plain,application/json')

# Optional log file, opened only when --logfile was given.
logfile = None
if args.logfile:
    logfile = open(args.logfile, "w")

# validated_urls is initialised once, further up; the second, redundant
# initialisation that used to follow here has been removed.
205182
206- log ( "%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % ( hrs , min , sec , len ( visited ), len ( urls ), r . status_code , elapsed . total_seconds (), url , " " . join ( tags )))
183+ # --- Main ---
207184
208- if logfile :
209- logfile .close ()
210- sys .stderr .write ("Output written to %s\n \n " % logfile .name )
if __name__ == "__main__":

    # Crawl until the queue is empty.  Each iteration fetches one URL with
    # the test client, queues any new URLs found in the response, and logs
    # one line with status, timing and notes.
    while urls:
        url, referrer = urls.popitem()

        visited.add(url)

        # Taken before the try block so that it is always bound when the
        # failure handler below computes the elapsed time.
        timestamp = datetime.datetime.now()
        try:
            r = client.get(url)
            elapsed = datetime.datetime.now() - timestamp
        except KeyboardInterrupt:
            log(" ... was fetching %s" % url)
            sys.exit(1)
        except Exception:
            # The view raised: report it as a 500 with the referrer chain.
            log("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime.datetime.now() - timestamp).total_seconds(), url, (",\n \t ".join(get_referrers(url)))))
            log("=============")
            log(traceback.format_exc())
            log("=============")
            errors += 1
        else:
            # Notes for this URL's log line; check_html_valid() appends its
            # warnings to this module-level list as well.
            tags = []

            if r.status_code in (301, 302):
                u = strip_url(r["Location"])
                if u not in visited and u not in urls:
                    urls[u] = referrer  # referrer is original referrer, not redirected url
                    referrers[u] = referrer

            elif r.status_code == 200:
                ctype = r["Content-Type"]
                if ";" in ctype:
                    ctype = ctype[:ctype.index(";")]

                if ctype == "text/html":
                    try:
                        for u in extract_html_urls(r.content):
                            if u not in visited and u not in urls:
                                urls[u] = url
                                referrers[u] = url

                        check_html_valid(url, r)

                    except Exception:
                        log("error extracting HTML urls from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

                elif ctype == "application/json":
                    try:
                        for u in extract_tastypie_urls(r.content):
                            if u not in visited and u not in urls:
                                urls[u] = url
                                referrers[u] = url
                    except Exception:
                        log("error extracting urls from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

            else:
                tags.append(u"FAIL for %s\n (from %s)" % (url, referrer))
                errors += 1

            if elapsed.total_seconds() > slow_threshold:
                tags.append("SLOW")

            # Wall-clock time since the crawl started, split into h:m:s.
            acc_secs = (timestamp - start_time).total_seconds()
            hrs = acc_secs // (60 * 60)
            mins = (acc_secs % (60 * 60)) // 60
            sec = acc_secs % 60

            # Repeat the column header every 100 visited URLs.
            if (len(visited) % 100) == 1:
                log("\n Elapsed Visited Queue Code Time Url ... Notes")

            log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs, mins, sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))

    if logfile:
        logfile.close()
        sys.stderr.write("Output written to %s\n \n " % logfile.name)

    # Report both totals before deciding on the exit status, so that
    # warnings are still summarised when there were errors as well.
    if errors > 0:
        sys.stderr.write("Found %s errors, grep output for FAIL for details\n " % errors)
    else:
        sys.stderr.write("Found no errors.\n")
    if warnings > 0:
        sys.stderr.write("Found %s warnings, grep output for WARN for details\n " % warnings)
    else:
        sys.stderr.write("Found no warnings.\n")

    if errors > 0 or warnings > 0:
        sys.exit(1)
0 commit comments