Only print the first 100 lines of a long diff. New soup2text code for spacing associated with certain tags.

levkowetz · levkowetz · commit dd37257c0c21 · 2007-06-12T17:52:07.000Z
- Legacy-Id: 337
diff --git a/ietf/tests.py b/ietf/tests.py
@@ -178,7 +178,8 @@ def doUrlsTest(self, lst):
                                 print "OK   cmp %s" % (url)
                             else:
                                 contextlines = 0
-                                diff = "\n".join(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm=""))
+                                difflist = list(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm=""))
+                                diff = "\n".join(difflist)
                                 for chunk in self.diffchunks:
                                     #print "*** Checking for chunk:", chunk[:24]
                                     while re.search(chunk, diff):
@@ -201,7 +202,9 @@ def doUrlsTest(self, lst):
                                         print "OK   cmp %s" % (url)
                                     else:
                                         print "Diff:    %s" % (url)
-                                        print diff
+                                        print "\n".join(difflist[:100])
+                                        if len(difflist) > 100:
+                                            print "... (skipping %s lines of diff)" % (len(difflist)-100)
                                 else:
                                     print "OK   cmp %s" % (url)
                                     
diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py
@@ -7,7 +7,8 @@
 except:
     from BeautifulSoup import Tag, BeautifulSoup, NavigableString
 
-block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
+block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
+space_tags = ["th", "td", "br"]
 ignore_tags = ["head", "script", "style"]
 pre_tags = ["pre"]
 entities = [("&lt;", "<"),   ("&gt;", ">"),
@@ -86,7 +87,10 @@ def render(node, encoding='latin-1', pre=False):
                         blocks.append(child.text+"\n\n")
                         node.is_block = True
                     else:
-                        words.append(child.text)
+                        if child.text:
+                            if child.name in space_tags and not words[-1][-1] in [" ", "\t", "\n"]:
+                                words.append(" ")
+                            words.append(child.text)
         else:
             raise ValueError("Unexpected node type: '%s'" % child)
     if words:
@@ -111,7 +115,6 @@ def soup2text(html):
     # some preprocessing to handle common pathological cases
     html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
     html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
-    html = re.sub("([^ \t\n])(</t[hd].*?>)", r"\1 \2", html)
     soup = TextSoup(html)
     return str(soup)