Skip to content

Commit 2828683

Browse files
committed
Replaced html sanitization code that called html5lib directly with calls to bleach, and upgraded the requirements to let us use the latest html5lib and bleach.
- Legacy-Id: 14739
1 parent b92ad2f commit 2828683

4 files changed

Lines changed: 6 additions & 43 deletions

File tree

PLAN

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,9 @@ Planned work in rough order
99

1010
* Revisit the review tool, work through the accumulated tickets.
1111

12-
* Add sanitization of uploaded html documents.
13-
1412
* Introduce an API for Meetecho to use to associate recordings with sessions
1513
(and perhaps automate making copies of those videos)
1614

17-
* Upgrade html5lib to the latest release, the same for bleach which uses it.
18-
1915
* Reworked UI and refactored backend for the secretariat meeting scheduling
2016
tool.
2117

ietf/secr/proceedings/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import debug # pyflakes:ignore
99

10-
from ietf.utils.html import sanitize
10+
from ietf.utils.html import sanitize_html
1111

1212
def handle_upload_file(file,filename,meeting,subdir, request=None):
1313
'''
@@ -38,7 +38,7 @@ def handle_upload_file(file,filename,meeting,subdir, request=None):
3838
file.open()
3939
text = file.read()
4040
# Whole file sanitization; add back '<html>' (sanitize will remove it)
41-
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
41+
clean = u"<html>\n%s\n</html>\n" % sanitize_html(text)
4242
destination.write(clean.encode('utf8'))
4343
if request and clean != text:
4444
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "

ietf/utils/html.py

Lines changed: 2 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
22

33
"""Utilities for working with HTML."""
4-
import html5lib
54
import bleach
6-
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
75

86
import debug # pyflakes:ignore
97

@@ -26,36 +24,6 @@
2624
'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
2725
'width')
2826

29-
30-
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
31-
allowed_elements = acceptable_elements
32-
allowed_attributes = acceptable_attributes
33-
allowed_css_properties = ()
34-
allowed_css_keywords = ()
35-
allowed_svg_properties = ()
36-
37-
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
38-
def __init__(self, *args, **kwargs):
39-
tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)
40-
41-
def __iter__(self):
42-
for token in tokenizer.HTMLTokenizer.__iter__(self):
43-
token = self.sanitize_token(token)
44-
if token:
45-
yield token
46-
47-
def sanitize_html(html):
48-
"""Sanitizes an HTML fragment."""
49-
p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
50-
tree=treebuilders.getTreeBuilder("dom"))
51-
dom_tree = p.parseFragment(html)
52-
walker = treewalkers.getTreeWalker("dom")
53-
stream = walker(dom_tree)
54-
s = serializer.HTMLSerializer(omit_optional_tags=False,
55-
quote_attr_values=True)
56-
output_generator = s.serialize(stream)
57-
return u''.join(output_generator)
58-
5927
def unescape(text):
6028
"""
6129
Returns the given text with ampersands, quotes and angle brackets decoded
@@ -71,10 +39,9 @@ def remove_tags(html, tags):
7139
return bleach.clean(html, tags=allowed)
7240
remove_tags = keep_lazy(remove_tags, six.text_type)
7341

74-
def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
75-
tags = list(set(tags) | set(extra) ^ set(remove))
42+
def sanitize_html(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
43+
tags = list(set(tags) | set(t.lower() for t in extra) ^ set(t.lower() for t in remove))
7644
return bleach.clean(html, tags=tags, strip=strip)
7745

7846
def clean_html(html):
7947
return bleach.clean(html)
80-

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ anora>=0.1.2
55
argon2-cffi>=16.1.0 # For the Argon2 password hasher option
66
beautifulsoup4>=4.4
77
bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations.
8-
bleach>=1.5.0,<2.0.0
8+
bleach>=2.0.0
99
coverage>=4.0.1,!=4.0.2
1010
#cssselect>=0.6.1 # for PyQuery
1111
decorator>=3.4.0
@@ -23,7 +23,7 @@ factory-boy>=2.9.0
2323
google-api-python-client
2424
Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
2525
hashids>=1.1.0
26-
html5lib>=0.90,<0.99999999 # ietf.utils.html needs a rewrite for html5lib 1.x -- major code changes in sanitizer
26+
html5lib>=1.0.1
2727
httplib2>=0.10.3
2828
jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/.
2929
jwcrypto>=0.4.0 # for signed notifications

0 commit comments

Comments
 (0)