Skip to content

Commit 802f201

Browse files
committed
Modified the sanitizer and upload handler to also strip the content of some tags, and to produce valid files (if the content is otherwise valid).
- Legacy-Id: 14744
1 parent 6e5e50c commit 802f201

2 files changed

Lines changed: 33 additions & 10 deletions

File tree

ietf/secr/proceedings/utils.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,16 @@ def handle_upload_file(file,filename,meeting,subdir, request=None):
3636
destination = open(os.path.join(path,filename), 'wb+')
3737
if extension in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']:
3838
file.open()
39-
text = file.read()
39+
text = file.read().decode('utf-8')
4040
# Whole file sanitization; add back '<html>' (sanitize will remove it)
41-
clean = u"<html>\n%s\n</html>\n" % sanitize_html(text)
41+
clean = u"""<!DOCTYPE html>
42+
<html lang="en">
43+
<head><title>%s</title></head>
44+
<body>
45+
%s
46+
</body>
47+
</html>
48+
""" % (filename, sanitize_html(text))
4249
destination.write(clean.encode('utf8'))
4350
if request and clean != text:
4451
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "

ietf/utils/html.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,38 @@
33
"""Utilities for working with HTML."""
44
import bleach
55

6+
from html5lib.filters.base import Filter
7+
68
import debug # pyflakes:ignore
79

810
from django.utils.functional import keep_lazy
911
from django.utils import six
1012

11-
acceptable_elements = ('a', 'abbr', 'acronym', 'address', 'b', 'big',
13+
acceptable_tags = ('a', 'abbr', 'acronym', 'address', 'b', 'big',
1214
'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col',
1315
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
14-
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd',
16+
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'ins', 'kbd',
1517
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
1618
'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead',
1719
'tr', 'tt', 'u', 'ul', 'var')
1820

21+
strip_completely = ['style', 'script', ]
22+
23+
class StripFilter(Filter):
    """html5lib filter that drops some elements together with their content.

    Any element whose name is listed in the module-level ``strip_completely``
    is removed from the token stream: its start tag, every token nested
    inside it, and its end tag. All other tokens are passed through
    unchanged.
    """

    def __iter__(self):
        # Build the lookup set once per pass instead of once per token.
        strip_names = frozenset(strip_completely)
        open_tags = []
        # Number of currently open elements whose name is in strip_names;
        # tokens are suppressed while this is non-zero.
        strip_depth = 0
        for token in Filter.__iter__(self):
            token_type = token["type"]
            if token_type in ("EmptyTag", "StartTag"):
                open_tags.append(token["name"])
                if token["name"] in strip_names:
                    strip_depth += 1
            # Checked after the push and before the pop, so neither the
            # start tag nor the end tag of a stripped element is emitted.
            if not strip_depth:
                yield token
            # Guard against a stray end tag with nothing open, which would
            # otherwise raise IndexError from list.pop().
            if token_type in ("EmptyTag", "EndTag") and open_tags:
                if open_tags.pop() in strip_names:
                    strip_depth -= 1
33+
34+
# Leave the stripping of the strip_completely tags to StripFilter
35+
bleach_tags = list(set(acceptable_tags) | set(strip_completely))
36+
cleaner = bleach.sanitizer.Cleaner(tags=bleach_tags, filters=[StripFilter], strip=True)
37+
1938
def unescape(text):
2039
"""
2140
Returns the given text with ampersands, quotes and angle brackets decoded
@@ -27,13 +46,10 @@ def unescape(text):
2746

2847
def remove_tags(html, tags):
    """Return the given HTML sanitized, and with the given tags removed.

    The allowed set is ``acceptable_tags`` minus ``tags`` (compared
    lower-cased); the result is whatever ``bleach.clean`` produces for
    ``html`` with that allowed set.
    """
    disallowed = {t.lower() for t in tags}
    allowed = set(acceptable_tags) - disallowed
    return bleach.clean(html, tags=allowed)
# keep_lazy() accepts only the result classes and returns a decorator, so the
# function has to be passed to that decorator rather than to keep_lazy()
# itself; otherwise the name ends up bound to the decorator.
remove_tags = keep_lazy(six.text_type)(remove_tags)
3352

34-
def sanitize_html(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
35-
tags = list(set(tags) | set(t.lower() for t in extra) ^ set(t.lower for t in remove))
36-
return bleach.clean(html, tags=tags, strip=strip)
53+
def sanitize_html(html):
    """Return ``html`` after cleaning it with the module-level ``cleaner``."""
    sanitized = cleaner.clean(html)
    return sanitized
3755

38-
def clean_html(html):
39-
return bleach.clean(html)

0 commit comments

Comments
 (0)