22
33import os , sys , re , datetime , argparse , traceback , tempfile , json , subprocess
44import html5lib
5- import debug # pyflakes:ignore
65import random
76
87# Set up import path to find our own Django
@@ -33,6 +32,8 @@ parser.add_argument('--random', action='store_true',
3332parser .add_argument ('--validate-all' , dest = 'validate_all' , action = 'store_true' , default = False ,
3433 help = 'Run html 5 validation on all pages, without skipping similar urls. '
3534 '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)' )
35+ parser .add_argument ('-v' , '--verbose' , action = 'store_true' , default = False ,
36+ help = 'Be more verbose' )
3637
3738args = parser .parse_args ()
3839
@@ -44,6 +45,9 @@ import django.test
4445
4546django .setup ()
4647
48+ # This needs to come after we set up sys path to include the local django
49+ import debug # pyflakes:ignore
50+
4751# prevent memory from leaking when settings.DEBUG=True
4852from django .db import connection
4953class DontSaveQueries (object ):
@@ -59,6 +63,11 @@ MAX_URL_LENGTH = 500
5963
6064# --- Functions ---
6165
def note(s):
    """Write the message *s* followed by a newline to stderr in verbose mode.

    Reads the module-level ``args`` namespace produced by the argument
    parser; nothing is written unless ``args.verbose`` is true.
    """
    if args.verbose:
        sys.stderr.write(s)
        # Terminate with a bare newline; a trailing space here would indent
        # whatever is written to stderr next.
        sys.stderr.write('\n')
70+
6271def strip_url (url ):
6372 if url .startswith ("http://testserver" ):
6473 url = url [len ("http://testserver" ):]
@@ -105,26 +114,45 @@ def extract_tastypie_urls(content):
105114
106115def check_html_valid (url , response , args ):
107116 global parser , validated_urls , doc_types , warnings
108- # These URLs have known issues, skip them until those are fixed
109- if re .search ('(/secr|admin/)|/doc/.*/edit/info/' , url ):
110- log ("%s blacklisted; skipping HTML validation" % url )
111- return
112117 key = url
113118 if not args .validate_all :
114119 # derive a key for urls like this by replacing primary keys
115- key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
116- key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
117- key = re .sub ("#.*$" , "" , key )
118120 key = re .sub ("\?.*$" , "" , key )
121+ key = re .sub ("#.*$" , "" , key )
122+ key = re .sub ("/.+@.+/" , "/x@x.org/" , key )
123+ key = re .sub ("/[0-9.]+/" , "/nnnn/" , key )
124+ key = re .sub ("/[0-9.]+/" , "/mmmm/" , key )
125+ key = re .sub ("/ag/[a-z0-9-]+/" , "/ag/foo/" , key )
126+ key = re .sub ("/area/[a-z0-9-]+/" , "/area/foo/" , key )
127+ key = re .sub ("/bcp[0-9]+/" , "/bcpnnn/" , key )
128+ key = re .sub ("/conflict-review-[a-z0-9-]+/" , "/conflrev-foo/" , key )
129+ key = re .sub ("/dir/[a-z0-9-]+/" , "/dir/foo/" , key )
130+ key = re .sub ("/draft-[a-z0-9-]+/" , "/draft-foo/" , key )
131+ key = re .sub ("/group/[a-z0-9-]+/" , "/group/foo/" , key )
132+ key = re .sub ("/ipr/search/.*" , "/ipr/search/" , key )
133+ key = re .sub ("/release/[0-9dev.]+/" , "/release/n.n.n/" , key )
119134 key = re .sub ("/rfc[0-9]+/" , "/rfcnnnn/" , key )
120- key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
121135 key = re .sub ("/rg/[a-z0-9-]+/" , "/rg/foo/" , key )
122- key = re .sub ("/ipr/[0-9]+/" , "/ipr/nnnn/" , key )
123- key = re .sub ("/draft-[a-z0-9-]+/" , "/draft-foo/" , key )
136+ key = re .sub ("/secr/srec/nnnn/[0-9a-z-]+/" , "/secr/sreq/nn/bar/" , key )
137+ key = re .sub ("/state/[a-z0-9-]+/" , "/state/foo/" , key )
138+ key = re .sub ("/state/[a-z0-9-]+/[a-z0-9-]+/" , "/state/foo/bar/" , key )
139+ key = re .sub ("/status-change-[a-z0-9-]+/" , "/statchg-foo/" , key )
140+ key = re .sub ("/std[0-9]+/" , "/stdnnn/" , key )
141+ key = re .sub ("/submit/status/nnnn/[0-9a-f]+/" , "/submit/status/nnnn/bar/" , key )
142+ key = re .sub ("/team/[a-z0-9-]+/" , "/team/foo/" , key )
143+ key = re .sub ("/wg/[a-z0-9-]+/" , "/wg/foo/" , key )
144+
124145 for slug in doc_types :
125146 key = re .sub ("/%s-.*/" % slug , "/%s-nnnn/" % slug , key )
126147
127148 if not key in validated_urls :
149+ note ('Validate: %-32s: %s' % (url [:32 ], key ))
150+ # These URLs have known issues, skip them until those are fixed
151+ if re .search ('(/secr|admin/)|/doc/.*/edit/info/' , url ):
152+ log ("%s blacklisted; skipping HTML validation" % url )
153+ validated_urls [key ] = True
154+ return
155+
128156 if hasattr (response , "content" ):
129157 content = response .content
130158 else :
@@ -156,6 +184,14 @@ def check_html_valid(url, response, args):
156184 (pos , code ))
157185 warnings += 1
158186
def skip_url(url):
    """Return True if *url* matches one of the patterns to be skipped.

    The caller checks this before requesting a URL and moves on to the next
    one when it returns True.  Each pattern is a regular expression anchored
    at the start of *url*.
    """
    skip_patterns = (
        r"^/community/[0-9]+/remove_document/",
        r"^/community/personal/",
    )
    return any(re.search(pattern, url) for pattern in skip_patterns)
159195
160196def log (s ):
161197 print (s )
@@ -243,6 +279,9 @@ if __name__ == "__main__":
243279
244280 visited .add (url )
245281
282+ if skip_url (url ):
283+ continue
284+
246285 try :
247286 timestamp = datetime .datetime .now ()
248287 r = client .get (url , secure = True , follow = True )
@@ -298,7 +337,7 @@ if __name__ == "__main__":
298337 log ("=============" )
299338
300339 else :
301- tags .append (u"FAIL for %s \n (from %s)" % (url , referrer ))
340+ tags .append (u"FAIL (from %s)" % (referrer , ))
302341 errors += 1
303342
304343 if elapsed .total_seconds () > slow_threshold :
0 commit comments