11# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
22
33"""Utilities for working with HTML."""
4- import html5lib
54import bleach
6- from html5lib import sanitizer , serializer , tokenizer , treebuilders , treewalkers
75
86import debug # pyflakes:ignore
97
2624 'span' , 'src' , 'start' , 'summary' , 'title' , 'type' , 'valign' , 'vspace' ,
2725 'width' )
2826
29-
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """html5lib sanitizer mixin configured with this module's whitelists.

    Tags and attributes are limited to the module-level
    ``acceptable_elements`` and ``acceptable_attributes`` collections, and
    the CSS / SVG property whitelists are set to empty tuples.
    """

    # No CSS properties, CSS keywords or SVG properties are whitelisted.
    allowed_css_properties = allowed_css_keywords = allowed_svg_properties = ()

    # Tag and attribute whitelists come from the module-level collections.
    allowed_elements = acceptable_elements
    allowed_attributes = acceptable_attributes
36-
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that passes every token through ``sanitize_token``."""

    def __init__(self, *args, **kwargs):
        # Construction is delegated untouched to the html5lib tokenizer.
        tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)

    def __iter__(self):
        """Yield sanitized tokens, skipping any that sanitize to a falsy value."""
        for raw_token in tokenizer.HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
46-
def sanitize_html(html):
    """Parse *html* as a fragment with the sanitizing tokenizer and
    return the re-serialized markup as a single string."""
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder)
    fragment = parser.parseFragment(html)

    # Walk the resulting DOM fragment and serialize it back to text.
    walk = treewalkers.getTreeWalker("dom")
    token_stream = walk(fragment)
    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return u''.join(html_serializer.serialize(token_stream))
58-
5927def unescape (text ):
6028 """
6129 Returns the given text with ampersands, quotes and angle brackets decoded
@@ -71,10 +39,9 @@ def remove_tags(html, tags):
7139 return bleach .clean (html , tags = allowed )
7240remove_tags = keep_lazy (remove_tags , six .text_type )
7341
def sanitize_html(html, tags=acceptable_elements, extra=(), remove=(), strip=True):
    """Sanitize an HTML fragment with bleach against a tag whitelist.

    The whitelist is ``tags`` plus ``extra`` minus ``remove``. Names in
    ``extra`` and ``remove`` are lower-cased before use; ``tags`` is used
    as given.

    Args:
        html: The markup to clean.
        tags: Base iterable of allowed tag names.
        extra: Additional tag names to allow on top of ``tags``.
        remove: Tag names to disallow, even if listed in ``tags`` or
            ``extra``.
        strip: Forwarded unchanged to ``bleach.clean``.

    Returns:
        The result of ``bleach.clean`` for the computed whitelist.
    """
    added = set(name.lower() for name in extra)
    dropped = set(name.lower() for name in remove)
    # Union first, then subtract. The previous ``a | b ^ c`` parsed as
    # ``a | (b ^ c)``, which *added* the names in ``remove`` to the
    # whitelist instead of taking them out.
    allowed = (set(tags) | added) - dropped
    return bleach.clean(html, tags=list(allowed), strip=strip)
7745
def clean_html(html):
    """Run *html* through ``bleach.clean`` with no extra arguments."""
    cleaned = bleach.clean(html)
    return cleaned
80-
0 commit comments