Skip to content

Commit 9341f96

Browse files
committed
Tweaked the document sanitizer to insert a charset meta tag after sanitization.
- Legacy-Id: 14832
1 parent c3e05fd commit 9341f96

2 files changed

Lines changed: 25 additions & 3 deletions

File tree

ietf/meeting/tests_views.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1720,7 +1720,7 @@ def test_upload_minutes_agenda(self):
17201720
self.assertTrue(q('form .has-error'))
17211721

17221722
# Test html sanitization
1723-
test_file = StringIO('<html><h1>Title</h1><section>Some text</section></html>')
1723+
test_file = StringIO('<html><head><title>Title</title></head><body><h1>Title</h1><section>Some text</section></body></html>')
17241724
test_file.name = "some.html"
17251725
r = self.client.post(url,dict(file=test_file))
17261726
self.assertEqual(r.status_code, 302)
@@ -1729,6 +1729,7 @@ def test_upload_minutes_agenda(self):
17291729
text = doc.text()
17301730
self.assertIn('Some text', text)
17311731
self.assertNotIn('<section>', text)
1732+
self.assertIn('charset="utf-8"', text)
17321733

17331734
test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.')
17341735
test_file.name = "not_really.txt"

ietf/utils/html.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
"""Utilities for working with HTML."""
44
import bleach
5+
import copy
6+
import lxml.etree
7+
import lxml.html
58
import lxml.html.clean
69

710
import debug # pyflakes:ignore
@@ -45,8 +48,26 @@ def sanitize_fragment(html):
4548
# ----------------------------------------------------------------------
4649
# Page cleaning
4750

48-
lxml_cleaner = lxml.html.clean.Cleaner(allow_tags=acceptable_tags,
49-
remove_unknown_tags=None, style=False, page_structure=False)
51+
52+
class Cleaner(lxml.html.clean.Cleaner):
    """lxml HTML cleaner that adds a charset ``<meta>`` tag after cleaning.

    The meta element is created only after the inherited cleaning pass has
    run, and is inserted as the first child of the document's ``<head>``
    when the cleaned tree has one.
    """

    # Value written to the ``charset`` attribute of the inserted meta tag.
    charset = 'utf-8'

    # Copied from lxml 4.2.0 and modified to insert charset meta:
    def clean_html(self, html):
        """Return a cleaned version of *html* carrying a charset meta tag.

        *html* may be a string, which is parsed with
        ``lxml.html.fromstring``; anything else is deep-copied, so the
        object passed in by the caller is not the one being cleaned.

        The cleaned tree is handed to ``lxml.html._transform_result``
        together with the type of the original input.
        """
        result_type = type(html)
        # NOTE(review): ``basestring`` is a Python 2 builtin; a compat alias
        # will be needed here if this module is ever run on Python 3.
        if isinstance(html, basestring):
            doc = lxml.html.fromstring(html)
        else:
            doc = copy.deepcopy(html)
        # Apply this cleaner to the working tree.
        self(doc)
        head = doc.find('head')
        # Identity test on purpose: ``find`` returns None when nothing
        # matches, and comparing an element to None with ``!=`` relies on
        # element comparison rather than asking "was anything found?".
        if head is not None:
            meta = lxml.etree.Element('meta', charset=self.charset)
            meta.tail = '\n'
            head.insert(0, meta)
        return lxml.html._transform_result(result_type, doc)
68+
69+
# The sanitized document is saved as utf-8 later, so the meta tag says utf-8.
lxml_cleaner = Cleaner(
    allow_tags=acceptable_tags,
    remove_unknown_tags=None,
    style=False,
    page_structure=False,
    charset='utf-8',
)
5071

5172
def sanitize_document(html):
    """Run *html* through the module-level ``lxml_cleaner`` and return the result."""
    cleaned = lxml_cleaner.clean_html(html)
    return cleaned

0 commit comments

Comments
 (0)