11
22from __future__ import print_function
33from roundup .anypy .strings import u2s , uchr
4+
5+
46class dehtml :
57 def __init__ (self , converter ):
68 if converter == "none" :
@@ -11,6 +13,7 @@ def __init__(self, converter):
1113 if converter == "beautifulsoup" :
1214 # Not as well tested as dehtml.
1315 from bs4 import BeautifulSoup
16+
1417 def html2text (html ):
1518 soup = BeautifulSoup (html )
1619
@@ -22,62 +25,62 @@ def html2text(html):
2225
2326 self .html2text = html2text
2427 else :
25- raise ImportError # use
28+ raise ImportError
2629 except ImportError :
2730 # use the fallback below if beautiful soup is not installed.
2831 try :
2932 # Python 3+.
3033 from html .parser import HTMLParser
3134 from html .entities import name2codepoint
32- pyver = 3
35+ pyver = 3
3336 except ImportError :
3437 # Python 2.
3538 from HTMLParser import HTMLParser
3639 from htmlentitydefs import name2codepoint
37- pyver = 2
40+ pyver = 2
3841
class DumbHTMLParser(HTMLParser):
    """Minimal HTML-to-text converter used as the fallback parser.

    Text is accumulated in ``self.text`` as the parser callbacks fire:

    * character data is appended, with runs of whitespace-only chunks
      reduced to a single chunk;
    * every opening ``<p>`` tag appends a newline marker;
    * everything between ``<script>``/``<style>`` start and end tags
      is dropped;
    * named entity references are replaced by the character they
      stand for.
    """

    # class attribute; shadowed by an instance attribute on first append
    text = ""

    # internal state variables
    _skip_data = False    # True while inside a script or style block
    _last_empty = False   # True when the last chunk seen was blank

    def handle_data(self, data):
        """Append character data, collapsing consecutive blank chunks."""
        if self._skip_data:  # skip data in script or style block
            return

        if data.strip() == "":
            # reduce multiple blank lines to 1
            if self._last_empty:
                return
            self._last_empty = True
        else:
            self._last_empty = False

        self.text = self.text + data

    def handle_starttag(self, tag, attrs):
        """Start a new line for ``<p>``; begin skipping for script/style."""
        if tag == "p":
            self.text = self.text + "\n "
        if tag in ("style", "script"):
            self._skip_data = True

    def handle_endtag(self, tag):
        """Stop skipping data once a script or style block is closed."""
        if tag in ("style", "script"):
            self._skip_data = False

    def handle_entityref(self, name):
        """Append the character for the named entity ``&name;``.

        An entity name that is not present in ``name2codepoint`` is
        written out literally as ``&name;`` rather than raising
        ``KeyError`` and aborting the whole conversion.
        """
        if self._skip_data:
            return

        try:
            codepoint = name2codepoint[name]
        except KeyError:
            # unknown entity: keep the reference text as it was written
            self.text = self.text + "&%s;" % name
            return

        c = uchr(codepoint)
        try:
            self.text = self.text + c
        except UnicodeEncodeError:
            # print a space as a placeholder
            self.text = self.text + ' '
8184
8285 def html2text (html ):
8386 if pyver == 3 :
@@ -90,8 +93,9 @@ def html2text(html):
9093
9194 self .html2text = html2text
9295
96+
9397if "__main__" == __name__ :
94- html = '''
98+ html = '''
9599<body>
96100<script>
97101this must not be in output
@@ -152,12 +156,10 @@ def html2text(html):
152156 if html2text :
153157 print (html2text (html ))
154158 except NameError as e :
155- print ("captured error %s" % e )
159+ print ("captured error %s" % e )
156160
157161 html2text = dehtml ("none" ).html2text
158162 if html2text :
159163 print ("FAIL: Error, dehtml(none) is returning a function" )
160164 else :
161165 print ("PASS: dehtml(none) is returning None" )
162-
163-
0 commit comments