Skip to content

Commit 8d1d0cd

Browse files
committed
Added a no-follow option to the test crawler, in order to be able to easily test a specific list of URLs.
- Legacy-Id: 16188
1 parent 2538a58 commit 8d1d0cd

1 file changed

Lines changed: 8 additions & 5 deletions

File tree

bin/test-crawl

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
2323
parser.add_argument('--settings', help='Custom settings file')
2424
parser.add_argument('--logfile', help='Write to logfile')
2525
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
26+
parser.add_argument('--no-follow', dest='follow', action='store_false', default=True,
27+
help='Do not follow URLs found in fetched pages, just check the given URLs')
2628
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
2729
help='Use validator.nu instead of html5lib for HTML validation')
2830
parser.add_argument('--pedantic', action='store_true',
@@ -384,7 +386,7 @@ if __name__ == "__main__":
384386

385387
if ctype == "text/html":
386388
try:
387-
if not skip_extract_from(url):
389+
if args.follow and not skip_extract_from(url):
388390
for u in extract_html_urls(r.content):
389391
if u not in visited and u not in urls:
390392
urls[u] = url
@@ -400,10 +402,11 @@ if __name__ == "__main__":
400402

401403
elif ctype == "application/json":
402404
try:
403-
for u in extract_tastypie_urls(r.content):
404-
if u not in visited and u not in urls:
405-
urls[u] = url
406-
referrers[u] = url
405+
if args.follow:
406+
for u in extract_tastypie_urls(r.content):
407+
if u not in visited and u not in urls:
408+
urls[u] = url
409+
referrers[u] = url
407410
except:
408411
log("error extracting urls from %s" % url)
409412
log("=============")

0 commit comments

Comments (0)