Skip to content

Commit 4e16054

Browse files
committed
prevent <newline tag mangling
Remove the charref and entityref handlers; change where the newline is inserted after a start tag. A test under Python 2 ended up with a newline between the opening '<' and the tag name. This was caused by a string that contained escaped &gt; and &lt; entities: embedded code block &lt;pre&gt\n\n<pre> python\nline 1\nline 2\n</pre> The code was mapping &lt; etc. back to < and >, which confused the parser as to where the tag really started. It interpreted the real pre tag as data and inserted a newline.
1 parent 25de331 commit 4e16054

File tree

1 file changed

+3
-18
lines changed

1 file changed

+3
-18
lines changed

test/html_norm.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class NormalizingHtmlParser(HTMLParser):
4040
4141
Note that using this rewrites all attributes parsed by HTMLParser
4242
into attr="value" form even though HTMLParser accepts other
43-
attribute specifiction forms.
43+
attribute specification forms.
4444
"""
4545

4646
debug = False # set to true to enable more verbose output
@@ -63,7 +63,7 @@ def handle_starttag(self, tag, attrs):
6363
if self.debug: print(" attr:", attr)
6464
self.current_normalized_string += ' %s="%s"' % attr
6565

66-
self.current_normalized_string += ">"
66+
self.current_normalized_string += ">\n"
6767

6868
if tag == 'pre':
6969
self.preserve_data = True
@@ -83,26 +83,11 @@ def handle_data(self, data):
8383
data = " ".join(data.strip().split())
8484

8585
if data:
86-
self.current_normalized_string += "\n%s" % data
86+
self.current_normalized_string += "%s" % data
8787

8888
def handle_comment(self, data):
8989
print("Comment :", data)
9090

91-
def handle_entityref(self, name):
92-
c = chr(name2codepoint[name])
93-
if self.debug: print("Named ent:", c)
94-
95-
self.current_normalized_string += "%s" % c
96-
97-
def handle_charref(self, name):
98-
if name.startswith('x'):
99-
c = chr(int(name[1:], 16))
100-
else:
101-
c = chr(int(name))
102-
if self.debug: print("Num ent :", c)
103-
104-
self.current_normalized_string += "%s" % c
105-
10691
def handle_decl(self, data):
10792
print("Decl :", data)
10893

0 commit comments

Comments
 (0)