Skip to content

Commit b92ad2f

Browse files
committed
Added sanitization of uploaded html content for session agendas and minutes, and did some refactoring of the upload form classes.
- Legacy-Id: 14738
1 parent 27914a0 commit b92ad2f

6 files changed

Lines changed: 116 additions & 63 deletions

File tree

ietf/meeting/forms.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import datetime
44

55
from django import forms
6+
from django.conf import settings
7+
from django.core.exceptions import ValidationError
68
from django.db.models import Q
79
from django.forms import BaseInlineFormSet
810

@@ -17,6 +19,8 @@
1719
from ietf.message.models import Message
1820
from ietf.person.models import Person
1921
from ietf.utils.fields import DatepickerDateField, DurationField
22+
from ietf.utils.validators import ( validate_file_size, validate_mime_type,
23+
validate_file_extension, validate_no_html_frame)
2024

2125
# need to insert empty option for use in ChoiceField
2226
# countries.insert(0, ('', '-'*9 ))
@@ -305,3 +309,38 @@ def __init__(self, *args, **kwargs):
305309
super(InterimCancelForm, self).__init__(*args, **kwargs)
306310
self.fields['group'].widget.attrs['disabled'] = True
307311
self.fields['date'].widget.attrs['disabled'] = True
312+
313+
class FileUploadForm(forms.Form):
314+
file = forms.FileField(label='File to upload')
315+
316+
def __init__(self, *args, **kwargs):
317+
doc_type = kwargs.pop('doc_type')
318+
assert doc_type in settings.MEETING_VALID_UPLOAD_EXTENSIONS
319+
self.doc_type = doc_type
320+
self.extensions = settings.MEETING_VALID_UPLOAD_EXTENSIONS[doc_type]
321+
self.mime_types = settings.MEETING_VALID_UPLOAD_MIME_TYPES[doc_type]
322+
super(FileUploadForm, self).__init__(*args, **kwargs)
323+
label = '%s file to upload. ' % (self.doc_type.capitalize(), )
324+
if self.mime_types:
325+
label += 'Note that you can only upload files with these formats: %s.' % (', '.join(self.mime_types, ))
326+
self.fields['file'].label=label
327+
328+
def clean_file(self):
329+
file = self.cleaned_data['file']
330+
validate_file_size(file)
331+
ext = validate_file_extension(file, self.extensions)
332+
mime_type = None
333+
if self.mime_types:
334+
mime_type, encoding = validate_mime_type(file, self.mime_types)
335+
if mime_type != file.content_type:
336+
raise ValidationError('Upload Content-Type (%s) is different from the observed mime-type (%s)' % (file.content_type, mime_type))
337+
if mime_type in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS:
338+
if not ext in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS[mime_type]:
339+
raise ValidationError('Upload Content-Type (%s) does not match the extension (%s)' % (file.content_type, ext))
340+
if mime_type in ['text/html', ] or ext in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']:
341+
# We'll do html sanitization later, but for frames, we fail here,
342+
# as the sanitized version will most likely be useless.
343+
validate_no_html_frame(file)
344+
return file
345+
346+

ietf/meeting/tests_views.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,12 +1680,23 @@ def test_upload_minutes_agenda(self):
16801680
q = PyQuery(r.content)
16811681
self.assertTrue(q('form .has-error'))
16821682

1683+
# Test html sanitization
1684+
test_file = StringIO('<html><h1>Title</h1><section>Some text</section></html>')
1685+
test_file.name = "some.html"
1686+
r = self.client.post(url,dict(file=test_file))
1687+
self.assertEqual(r.status_code, 302)
1688+
doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document
1689+
self.assertEqual(doc.rev,'00')
1690+
text = doc.text()
1691+
self.assertIn('Some text', text)
1692+
self.assertNotIn('<section>', text)
1693+
16831694
test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.')
16841695
test_file.name = "not_really.txt"
16851696
r = self.client.post(url,dict(file=test_file,apply_to_all=False))
16861697
self.assertEqual(r.status_code, 302)
16871698
doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document
1688-
self.assertEqual(doc.rev,'00')
1699+
self.assertEqual(doc.rev,'01')
16891700
self.assertFalse(session2.sessionpresentation_set.filter(document__type_id=doctype))
16901701

16911702
r = self.client.get(url)
@@ -1697,7 +1708,7 @@ def test_upload_minutes_agenda(self):
16971708
r = self.client.post(url,dict(file=test_file,apply_to_all=True))
16981709
self.assertEqual(r.status_code, 302)
16991710
doc = Document.objects.get(pk=doc.pk)
1700-
self.assertEqual(doc.rev,'01')
1711+
self.assertEqual(doc.rev,'02')
17011712
self.assertTrue(session2.sessionpresentation_set.filter(document__type_id=doctype))
17021713

17031714
def test_upload_minutes_agenda_unscheduled(self):

ietf/meeting/views.py

Lines changed: 18 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,10 @@
6363
from ietf.utils.pipe import pipe
6464
from ietf.utils.pdf import pdf_pages
6565
from ietf.utils.text import xslugify
66-
from ietf.utils.validators import ( validate_file_size, validate_mime_type,
67-
validate_file_extension, validate_no_html_frame, get_mime_type)
66+
from ietf.utils.validators import get_mime_type
6867

6968
from .forms import (InterimMeetingModelForm, InterimAnnounceForm, InterimSessionModelForm,
70-
InterimCancelForm, InterimSessionInlineFormSet)
69+
InterimCancelForm, InterimSessionInlineFormSet, FileUploadForm)
7170

7271

7372
def get_menu_entries(request):
@@ -1117,14 +1116,13 @@ def add_session_drafts(request, session_id, num):
11171116
'form': form,
11181117
})
11191118

1120-
class UploadBlueSheetForm(forms.Form):
1121-
file = forms.FileField(label='Bluesheet scan to upload')
11221119

1123-
def clean_file(self):
1124-
file = self.cleaned_data['file']
1125-
validate_mime_type(file, settings.MEETING_VALID_BLUESHEET_MIME_TYPES)
1126-
validate_file_extension(file, settings.MEETING_VALID_BLUESHEET_EXTENSIONS)
1127-
return file
1120+
class UploadBlueSheetForm(FileUploadForm):
1121+
1122+
def __init__(self, *args, **kwargs):
1123+
kwargs['doc_type'] = 'bluesheets'
1124+
super(UploadBlueSheetForm, self).__init__(*args, **kwargs )
1125+
11281126

11291127
@role_required('Area Director', 'Secretariat', 'IRTF Chair', 'WG Chair', 'RG Chair')
11301128
def upload_session_bluesheets(request, session_id, num):
@@ -1193,25 +1191,15 @@ def upload_session_bluesheets(request, session_id, num):
11931191
})
11941192

11951193

1196-
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
1197-
# It should look at the contents of the files instead.
1198-
class UploadMinutesForm(forms.Form):
1199-
file = forms.FileField(label='Minutes file to upload. Note that you can only upload minutes in txt, html, or pdf formats.')
1194+
class UploadMinutesForm(FileUploadForm):
12001195
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False)
12011196

12021197
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
1203-
super(UploadMinutesForm, self).__init__(*args, **kwargs)
1198+
kwargs['doc_type'] = 'minutes'
1199+
super(UploadMinutesForm, self).__init__(*args, **kwargs )
12041200
if not show_apply_to_all_checkbox:
12051201
self.fields.pop('apply_to_all')
12061202

1207-
def clean_file(self):
1208-
file = self.cleaned_data['file']
1209-
validate_file_size(file)
1210-
ext = validate_file_extension(file, settings.MEETING_VALID_MINUTES_EXTENSIONS)
1211-
mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_MINUTES_MIME_TYPES)
1212-
if ext in ['.html', '.htm'] or mime_type in ['text/html', ]:
1213-
validate_no_html_frame(file)
1214-
return file
12151203

12161204
def upload_session_minutes(request, session_id, num):
12171205
# num is redundant, but we're dragging it along an artifact of where we are in the current URL structure
@@ -1301,26 +1289,15 @@ def upload_session_minutes(request, session_id, num):
13011289
})
13021290

13031291

1304-
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
1305-
# It should look at the contents of the files instead.
1306-
class UploadAgendaForm(forms.Form):
1307-
file = forms.FileField(label='Agenda file to upload. Note that you can only upload agendas in txt or html formats.')
1292+
class UploadAgendaForm(FileUploadForm):
13081293
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False)
13091294

13101295
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
1311-
super(UploadAgendaForm, self).__init__(*args, **kwargs)
1296+
kwargs['doc_type'] = 'agenda'
1297+
super(UploadAgendaForm, self).__init__(*args, **kwargs )
13121298
if not show_apply_to_all_checkbox:
13131299
self.fields.pop('apply_to_all')
13141300

1315-
def clean_file(self):
1316-
file = self.cleaned_data['file']
1317-
validate_file_size(file)
1318-
ext = validate_file_extension(file, settings.MEETING_VALID_AGENDA_EXTENSIONS)
1319-
mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_AGENDA_MIME_TYPES)
1320-
if ext in ['.html', '.htm'] or mime_type in ['text/html', ]:
1321-
validate_no_html_frame(file)
1322-
return file
1323-
13241301
def upload_session_agenda(request, session_id, num):
13251302
# num is redundant, but we're dragging it along an artifact of where we are in the current URL structure
13261303
session = get_object_or_404(Session,pk=session_id)
@@ -1399,7 +1376,7 @@ def upload_session_agenda(request, session_id, num):
13991376
e = NewRevisionDocEvent.objects.create(doc=doc,by=request.user.person,type='new_revision',desc='New revision available: %s'%doc.rev,rev=doc.rev)
14001377
doc.save_with_history([e])
14011378
# The way this function builds the filename it will never trigger the file delete in handle_file_upload.
1402-
handle_upload_file(file, filename, session.meeting, 'agenda')
1379+
handle_upload_file(file, filename, session.meeting, 'agenda', request)
14031380
return redirect('ietf.meeting.views.session_details',num=num,acronym=session.group.acronym)
14041381
else:
14051382
form = UploadAgendaForm(show_apply_to_all_checkbox, initial={'apply_to_all':session.type_id=='session'})
@@ -1412,23 +1389,16 @@ def upload_session_agenda(request, session_id, num):
14121389
})
14131390

14141391

1415-
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
1416-
# It should look at the contents of the files instead.
1417-
class UploadSlidesForm(forms.Form):
1392+
class UploadSlidesForm(FileUploadForm):
14181393
title = forms.CharField(max_length=255)
1419-
file = forms.FileField(label='Slides file to upload.')
14201394
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=False,required=False)
14211395

14221396
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
1423-
super(UploadSlidesForm, self).__init__(*args, **kwargs)
1397+
kwargs['doc_type'] = 'slides'
1398+
super(UploadSlidesForm, self).__init__(*args, **kwargs )
14241399
if not show_apply_to_all_checkbox:
14251400
self.fields.pop('apply_to_all')
14261401

1427-
def clean_file(self):
1428-
file = self.cleaned_data['file']
1429-
validate_file_size(file)
1430-
validate_file_extension(file, settings.MEETING_VALID_SLIDES_EXTENSIONS)
1431-
return file
14321402

14331403
def upload_session_slides(request, session_id, num, name):
14341404
# num is redundant, but we're dragging it along an artifact of where we are in the current URL structure

ietf/secr/proceedings/utils.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1+
12
import glob
23
import os
34

5+
from django.conf import settings
6+
from django.contrib import messages
7+
48
import debug # pyflakes:ignore
59

6-
def handle_upload_file(file,filename,meeting,subdir):
10+
from ietf.utils.html import sanitize
11+
12+
def handle_upload_file(file,filename,meeting,subdir, request=None):
713
'''
814
This function takes a file object, a filename and a meeting object and subdir as string.
915
It saves the file to the appropriate directory, get_materials_path() + subdir.
@@ -28,8 +34,19 @@ def handle_upload_file(file,filename,meeting,subdir):
2834
os.remove(f)
2935

3036
destination = open(os.path.join(path,filename), 'wb+')
31-
for chunk in file.chunks():
32-
destination.write(chunk)
37+
if extension in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']:
38+
file.open()
39+
text = file.read()
40+
# Whole file sanitization; add back '<html>' (sanitize will remove it)
41+
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
42+
destination.write(clean.encode('utf8'))
43+
if request and clean != text:
44+
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "
45+
"Your upload %s was changed by the sanitization; please check the "
46+
"resulting content. " % (filename, ))
47+
else:
48+
for chunk in file.chunks():
49+
destination.write(chunk)
3350
destination.close()
3451

3552
# unzip zipfile

ietf/settings.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -729,16 +729,26 @@ def skip_unreadable_post(record):
729729
MEETING_MATERIALS_DEFAULT_SUBMISSION_CUTOFF_DAYS = 26
730730
MEETING_MATERIALS_DEFAULT_SUBMISSION_CORRECTION_DAYS = 50
731731

732-
MEETING_VALID_AGENDA_EXTENSIONS = ['.txt','.html','.htm', '.md', ]
733-
MEETING_VALID_AGENDA_MIME_TYPES = ['text/plain', 'text/html', ]
734-
#
735-
MEETING_VALID_MINUTES_EXTENSIONS = ['.txt','.html','.htm', '.md', '.pdf', ]
736-
MEETING_VALID_MINUTES_MIME_TYPES = ['text/plain', 'text/html', 'application/pdf', ]
737-
#
738-
MEETING_VALID_SLIDES_EXTENSIONS = ('.doc','.docx','.pdf','.ppt','.pptx','.txt') # Note the removal of .zip
739-
#
740-
MEETING_VALID_BLUESHEET_EXTENSIONS = ['.pdf', '.txt', ]
741-
MEETING_VALID_BLUESHEET_MIME_TYPES = ['application/pdf', 'text/plain', ]
732+
MEETING_VALID_UPLOAD_EXTENSIONS = {
733+
'agenda': ['.txt','.html','.htm', '.md', ],
734+
'minutes': ['.txt','.html','.htm', '.md', '.pdf', ],
735+
'slides': ['.doc','.docx','.pdf','.ppt','.pptx','.txt', ], # Note the removal of .zip
736+
'bluesheets': ['.pdf', '.txt', ],
737+
}
738+
739+
MEETING_VALID_UPLOAD_MIME_TYPES = {
740+
'agenda': ['text/plain', 'text/html', ],
741+
'minutes': ['text/plain', 'text/html', 'application/pdf', ],
742+
'slides': None,
743+
'bluesheets': ['application/pdf', 'text/plain', ],
744+
}
745+
746+
MEETING_VALID_MIME_TYPE_EXTENSIONS = {
747+
'text/plain': ['.txt', '.md', ],
748+
'text/html': ['.html', '.htm'],
749+
'application/pdf': ['.pdf'],
750+
}
751+
742752

743753
INTERNET_DRAFT_DAYS_TO_EXPIRE = 185
744754

ietf/utils/html.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import bleach
66
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
77

8+
import debug # pyflakes:ignore
9+
810
from django.utils.functional import keep_lazy
911
from django.utils import six
1012

@@ -69,6 +71,10 @@ def remove_tags(html, tags):
6971
return bleach.clean(html, tags=allowed)
7072
remove_tags = keep_lazy(remove_tags, six.text_type)
7173

74+
def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
75+
tags = list(set(tags) | set(extra) ^ set(remove))
76+
return bleach.clean(html, tags=tags, strip=strip)
77+
7278
def clean_html(html):
7379
return bleach.clean(html)
7480

0 commit comments

Comments
 (0)