Skip to content

Commit d380fbe

Browse files
committed
change test to use html normalizer when comparing html output.
An update to the Markdown2 parser changed its text output while keeping the same HTML semantics, which broke the test_string_markdown_code_block_attribute test. I hand-patched it to get the tests working, but it needed a better solution. Write a simple HTML normalizer using HTMLParser so that no third-party library (lxml, BeautifulSoup) is needed to clean up the test. Use the normalizer to parse both the expected result and the result returned by the various markdown libraries. Hopefully this will make the test less fragile. The normalizer can have multiple uses in template testing wherever HTML is compared. I expect to have to change html_norm.py to make test writing easier in the future.
1 parent c2eb9d4 commit d380fbe

File tree

2 files changed

+146
-16
lines changed

2 files changed

+146
-16
lines changed

test/html_norm.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""Minimal html parser/normalizer for use in test_templating.
2+
3+
When testing markdown -> html conversion libraries, there are
4+
gratuitous whitespace changes in generated output that break the
5+
tests. Use this to try to normalize the generated HTML into something
6+
that tries to preserve the semantic meaning allowing tests to stop
7+
breaking.
8+
9+
This is not a complete parsing engine. It supports the Roundup issue
10+
tracker unit tests so that no third party libraries are needed to run
11+
the tests. If you find it useful enjoy.
12+
13+
Ideally this would be done by hijacking in some way
14+
lxml.html.usedoctest to get a liberal parser that will ignore
15+
whitespace. But that means the user has to install lxml to run the
16+
tests. Similarly BeautifulSoup could be used to pretty print the html
17+
but again then BeautifulSoup would need to be installed to run the
18+
tests.
19+
20+
"""
21+
from html.parser import HTMLParser

try:
    from html.entities import name2codepoint  # python 3
except ImportError:
    from htmlentitydefs import name2codepoint  # python 2
27+
28+
29+
class NormalizingHtmlParser(HTMLParser):
    """Handle start/end tags and normalize whitespace in data.

    Doctype declarations and comments are dropped from the output.

    Implements a normalize method that takes input html and returns a
    normalized string, leaving the instance ready for another call to
    normalize for another string.

    Every start tag, end tag and non-empty run of text is emitted on
    its own line. Text outside of <pre> has runs of whitespace
    collapsed to a single space; text inside <pre> is kept verbatim.

    Note that using this rewrites all attributes parsed by HTMLParser
    into attr="value" form even though HTMLParser accepts other
    attribute specification forms. Attributes given without a value
    (e.g. <input disabled>) are emitted as the bare attribute name.
    """

    debug = False  # set to true to enable more verbose output

    current_normalized_string = ""  # accumulate result string
    preserve_data = False  # if inside pre preserve whitespace

    def handle_starttag(self, tag, attrs):
        """Put tag on a new line together with its attributes.

        tag is the tag name and attrs is the list of (name, value)
        pairs supplied by HTMLParser.

        Note valid attributes according to HTMLParser:
            attrs='single_quote'
            attrs=noquote
            attrs="double_quote"
        """
        if self.debug:
            print("Start tag:", tag)

        self.current_normalized_string += "\n<%s" % tag

        for attr in attrs:
            if self.debug:
                print(" attr:", attr)
            name, value = attr
            if value is None:
                # HTMLParser reports a valueless attribute with a value
                # of None; don't leak the literal string "None".
                self.current_normalized_string += ' %s' % name
            else:
                self.current_normalized_string += ' %s="%s"' % (name, value)

        self.current_normalized_string += ">"

        if tag == 'pre':
            self.preserve_data = True

    def handle_endtag(self, tag):
        """Put the closing tag on a new line; leaving <pre> re-enables
        whitespace normalization.
        """
        if self.debug:
            print("End tag :", tag)

        self.current_normalized_string += "\n</%s>" % tag

        if tag == 'pre':
            self.preserve_data = False

    def handle_data(self, data):
        """Append text on a new line, collapsing whitespace unless the
        text is inside <pre>. Text that is empty after normalization
        is dropped.
        """
        if self.debug:
            print("Data :", data)
        if not self.preserve_data:
            # normalize whitespace; split() also removes
            # leading/trailing whitespace
            data = " ".join(data.split())

        if data:
            self.current_normalized_string += "\n%s" % data

    def handle_comment(self, data):
        """Comments are stripped; only report them when debugging."""
        if self.debug:
            print("Comment :", data)

    def handle_entityref(self, name):
        """Append the character for a named entity (&name;).

        An unknown entity name is kept in its &name; form rather than
        raising KeyError.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            c = "&%s;" % name
        else:
            c = chr(codepoint)
        if self.debug:
            print("Named ent:", c)

        self.current_normalized_string += "%s" % c

    def handle_charref(self, name):
        """Append the character for a numeric reference.

        name is the text between '&#' and ';', either decimal digits
        or hex digits prefixed with x/X. A malformed reference is kept
        in its &#name; form rather than raising.
        """
        try:
            if name[:1] in ('x', 'X'):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            c = "&#%s;" % name
        if self.debug:
            print("Num ent :", c)

        self.current_normalized_string += "%s" % c

    def handle_decl(self, data):
        """Declarations (doctype) are stripped; only report them when
        debugging.
        """
        if self.debug:
            print("Decl :", data)

    def reset(self):
        """Wrapper around HTMLParser.reset that also clears
        self.current_normalized_string and resets self.preserve_data.
        """
        HTMLParser.reset(self)
        self.current_normalized_string = ""
        self.preserve_data = False

    def normalize(self, html):
        """Return the normalized form of html.

        The parser is reset afterwards so the instance can be reused
        for another string.
        """
        self.feed(html)
        # feed() may hold back trailing text (for example a possibly
        # incomplete character reference); close() forces it through
        # the handlers so it is not lost.
        self.close()
        result = self.current_normalized_string
        self.reset()
        return result
119+
120+
121+
if __name__ == "__main__":
    # Manual smoke test: run the module directly and eyeball the output.
    demo = NormalizingHtmlParser()

    # 1) everything on a single line, driven through feed()
    single_line_html = '<div class="markup"><p> paragraph text with whitespace\n and more space <pre><span class="f" data-attr="f">text more text</span></pre></div>'
    demo.feed(single_line_html)
    print("\n\ntest1", demo.current_normalized_string)
    demo.reset()

    # 2) same structure spread over several lines, driven through feed()
    multi_line_html = '''<div class="markup">
<p> paragraph text with whitespace\n and more space
<pre><span class="f" data-attr="f">text \n more text</span></pre>
</div>'''
    demo.feed(multi_line_html)
    print("\n\ntest2", demo.current_normalized_string)
    demo.reset()

    # 3) the one-shot normalize() helper, with an entity inside <pre>
    entity_html = '''<div class="markup">
<p> paragraph text with whitespace\n and more space
<pre><span class="f" data-attr="f">text \n more text &lt;</span></pre>
</div>'''
    print("\n\nnormalize", demo.normalize(entity_html))

test/test_templating.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from roundup.cgi.templating import *
66
from .test_actions import MockNull, true
7+
from .html_norm import NormalizingHtmlParser
78

89

910
import pytest
@@ -854,29 +855,20 @@ def test_string_markdown_code_block(self):
854855
self.assertEqual(p.markdown().strip().replace('\n\n', '\n'), u2s(u'<p>embedded code block &lt;pre&gt;</p>\n<pre><code>line 1\nline 2\n</code></pre>\n<p>new &lt;/pre&gt; paragraph</p>'))
855856

856857
def test_string_markdown_code_block_attribute(self):
857-
import sys
858-
859-
_py3 = sys.version_info[0] > 2
858+
parser = NormalizingHtmlParser()
860859

861860
''' also verify that embedded html is escaped '''
862861
p = StringHTMLProperty(self.client, 'test', '1', None, 'test', u2s(u'embedded code block <pre>\n\n``` python\nline 1\nline 2\n```\n\nnew </pre> paragraph'))
863-
m = p.markdown().strip()
862+
m = parser.normalize(p.markdown())
863+
parser.reset()
864864
print(m)
865865
if type(self) == MistuneTestCase:
866-
self.assertEqual(m.replace('\n\n','\n'), '<p>embedded code block &lt;pre&gt;</p>\n<pre><code class="lang-python">line 1\nline 2\n</code></pre>\n<p>new &lt;/pre&gt; paragraph</p>')
866+
self.assertEqual(m, parser.normalize('<p>embedded code block &lt;pre&gt;</p>\n<pre><code class="lang-python">line 1\nline 2\n</code></pre>\n<p>new &lt;/pre&gt; paragraph</p>'))
867867
elif type(self) == MarkdownTestCase:
868-
self.assertEqual(m.replace('\n\n','\n'), '<p>embedded code block &lt;pre&gt;</p>\n<pre><code class="language-python">line 1\nline 2\n</code></pre>\n<p>new &lt;/pre&gt; paragraph</p>')
868+
self.assertEqual(m, parser.normalize('<p>embedded code block &lt;pre&gt;</p>\n<pre><code class="language-python">line 1\nline 2\n</code></pre>\n<p>new &lt;/pre&gt; paragraph</p>'))
869869
else:
870-
test_output = m.replace('\n\n', '\n')
871-
if _py3:
872-
nl = "\n"
873-
else:
874-
nl = ""
875-
expected_result = '<p>embedded code block &lt;pre&gt;</p>\n<div class="codehilite">%(nl)s<pre><span></span><code><span class="n">line</span> <span class="mi">1</span>\n<span class="n">line</span> <span class="mi">2</span>\n</code></pre>%(nl)s</div>\n<p>new &lt;/pre&gt; paragraph</p>' % { 'nl': nl }
876-
if test_output != expected_result:
877-
print("test_output:", test_output)
878-
print("expected_result:", expected_result)
879-
self.assertEqual( test_output, expected_result)
870+
expected_result = parser.normalize('<p>embedded code block &lt;pre&gt;</p>\n<div class="codehilite"><pre><span></span><code><span class="n">line</span> <span class="mi">1</span>\n<span class="n">line</span> <span class="mi">2</span>\n</code></pre></div>\n<p>new &lt;/pre&gt; paragraph</p>')
871+
self.assertEqual(m, expected_result)
880872

881873
def test_markdown_return_text_on_exception(self):
882874
''' string is invalid markdown. missing end of fenced code block '''

0 commit comments

Comments
 (0)