77except :
88 from BeautifulSoup import Tag , BeautifulSoup , NavigableString
99
10- block_tags = ["[document]" , "html" , "body" , "div" , "blockquote" , "table" , "tr" , "p" , "pre" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , ]
10+ block_tags = ["[document]" , "html" , "body" , "div" , "blockquote" , "table" , "tr" , "p" , "pre" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "li" ]
11+ space_tags = ["th" , "td" , "br" ]
1112ignore_tags = ["head" , "script" , "style" ]
1213pre_tags = ["pre" ]
1314entities = [("<" , "<" ), (">" , ">" ),
@@ -86,7 +87,10 @@ def render(node, encoding='latin-1', pre=False):
8687 blocks .append (child .text + "\n \n " )
8788 node .is_block = True
8889 else :
89- words .append (child .text )
90+ if child .text :
91+ if child .name in space_tags and not words [- 1 ][- 1 ] in [" " , "\t " , "\n " ]:
92+ words .append (" " )
93+ words .append (child .text )
9094 else :
9195 raise ValueError ("Unexpected node type: '%s'" % child )
9296 if words :
@@ -111,7 +115,6 @@ def soup2text(html):
111115 # some preprocessing to handle common pathological cases
112116 html = re .sub ("<br */?>[ \t \n ]*(<br */?>)+" , "<p/>" , html )
113117 html = re .sub ("<br */?>([^\n ])" , r"<br />\n\1" , html )
114- html = re .sub ("([^ \t \n ])(</t[hd].*?>)" , r"\1 \2" , html )
115118 soup = TextSoup (html )
116119 return str (soup )
117120
0 commit comments