@@ -41,14 +41,15 @@ def unescape(text):
4141 text = text .replace (entity , char ) # replace ampersand last
4242 return text
4343
def para(words, pre, fill=True):
    """Join text fragments into a single paragraph string.

    The fragments in *words* are concatenated and passed through
    ``unescape``.  Unless *pre* is true, the result is then stripped of
    surrounding newlines and leading whitespace, and every run of tabs,
    newlines and spaces is collapsed into one space.

    Parameters:
        words: iterable of strings making up the paragraph.
        pre:   true for preformatted text, which is returned as-is after
               unescaping.
        fill:  when true (the default, matching the behaviour before this
               flag existed) the normalized text is re-wrapped with
               ``textwrap.fill``; when false it is left as one long line.

    Returns:
        The paragraph text as a string.
    """
    text = unescape("".join(words))
    if pre:
        # Preformatted text keeps its whitespace and line breaks.
        return text
    text = text.strip("\n").lstrip()
    text = re.sub("[\t\n ]+", " ", text)
    # NOTE(review): indentation was lost in the reviewed diff; wrapping is
    # assumed to apply only to non-preformatted text -- confirm.
    if fill:
        text = textwrap.fill(text)
    return text
5354
5455def normalize (str ):
@@ -60,7 +61,7 @@ def normalize(str):
6061 str = re .sub ("<\?[^>]*\?>" , "" , str )
6162 return str
6263
63- def render (node , encoding = 'latin-1' , pre = False ):
64+ def render (node , encoding = 'latin-1' , pre = False , fill = True , clean = True ):
6465 blocks = []
6566 words = []
6667 node .pre = pre or node .name in pre_tags
@@ -76,11 +77,11 @@ def render(node, encoding='latin-1', pre=False):
7677 if child .name in ignore_tags :
7778 pass
7879 else :
79- child = render (child , encoding , node .pre )
80+ child = render (child , encoding , node .pre , fill , clean )
8081 if child .text :
8182 if child .is_block :
8283 if words :
83- blocks .append (para (words , node .pre )+ "\n " )
84+ blocks .append (para (words , node .pre , fill )+ "\n " )
8485 words = []
8586 blocks .append (child .text + "\n \n " )
8687 node .is_block = True
@@ -94,22 +95,31 @@ def render(node, encoding='latin-1', pre=False):
9495 else :
9596 raise ValueError ("Unexpected node type: '%s'" % child )
9697 if words :
97- blocks .append (para (words , node .pre ))
98+ blocks .append (para (words , node .pre , fill ))
9899
99100 node .text = '' .join (blocks )
100101 return node
101102
class TextSoup(BeautifulSoup):
    """BeautifulSoup subclass whose parse tree can be rendered as text."""

    def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
        """Render this document through ``render`` and return its text.

        Parameters:
            encoding: forwarded unchanged to ``render``.
            pre:      forwarded to ``render`` as the initial preformatted
                      state.
            fill:     forwarded to ``render``; controls paragraph wrapping.
            clean:    forwarded to ``render``; when true, runs of spaces
                      and tabs in the result are additionally collapsed to
                      a single space and runs of two or more newlines are
                      collapsed to exactly two.

        Returns:
            The rendered text.
        """
        text = render(self, encoding, pre, fill, clean).text
        if clean:
            text = re.sub("[ \t]+", " ", text)
            text = re.sub("\n\n+", "\n\n", text)
        return text

    def __str__(self, encoding='latin-1',
                prettyPrint=False, indentLevel=0):
        """Return the document as unwrapped, whitespace-cleaned text.

        ``prettyPrint`` and ``indentLevel`` are accepted but not used here
        (presumably kept for signature compatibility with the base class
        -- confirm).
        """
        # Same render call and clean-up as before, without duplicating the
        # substitutions: pre and clean keep their defaults, fill is off.
        return self.as_text(encoding, fill=False)
111121
112- def soup2text (html ):
122+ def soup2text (html , encoding = 'latin-1' , pre = False , fill = True ):
113123 # Line ending normalization
114124 html = html .replace ("\r \n " , "\n " ).replace ("\r " , "\n " )
115125 # remove comments
@@ -118,7 +128,7 @@ def soup2text(html):
118128 html = re .sub ("<br */?>[ \t \n ]*(<br */?>)+" , "<p/>" , html )
119129 html = re .sub ("<br */?>([^\n ])" , r"<br />\n\1" , html )
120130 soup = TextSoup (html )
121- return str ( soup )
131+ return soup . as_text ( encoding , pre , fill )
122132
123133if __name__ == "__main__" :
124134 import sys
0 commit comments