Skip to content

Commit 10ce0e0

Browse files
committed
'soup2text' is an HTML-to-text converter that uses the BeautifulSoup.py module. It converts HTML to plain, paragraph-filled, readable text.
- Legacy-Id: 277
1 parent 100f24b commit 10ce0e0

2 files changed

Lines changed: 84 additions & 0 deletions

File tree

ietf/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from log import log
33
from cache_foreign_key import FKAsOneToOne
44
from templated_form import makeTemplatedForm
5+
from soup2text import TextSoup, soup2text
56

67
# Alias: makeFormattingForm is bound to the same object as makeTemplatedForm
# (presumably kept for callers that still use the older name -- confirm).
makeFormattingForm = makeTemplatedForm
78

ietf/utils/soup2text.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env python
2+
3+
import re
4+
import textwrap
5+
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
6+
7+
# Tag names whose rendered text is emitted as a separate block (paragraph).
block_tags = [
    "[document]",
    "html",
    "body",
    "div",
    "blockquote",
    "table",
    "tr",
    "p",
    "pre",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
]

# Tag names whose content is dropped from the output entirely.
ignore_tags = ["head", "script", "style"]

# Tag names whose content keeps its whitespace (no collapsing or re-wrapping).
pre_tags = ["pre"]

# (entity, replacement) pairs, applied in list order by para().  "&amp;" is
# last so that a doubly-escaped sequence such as "&amp;lt;" ends up as "&lt;"
# rather than "<".
entities = [
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", '"'),
    ("&apos;", "'"),
    ("&nbsp;", " "),
    ("&amp;", "&"),
]
14+
15+
def para(words, pre):
    """Join *words* into a single paragraph of plain text.

    The fragments are joined with single spaces, and every entity listed in
    the module-level ``entities`` table is replaced by its literal character.

    words -- list of text fragments making up the paragraph
    pre   -- if true, the text is returned with its whitespace untouched;
             otherwise runs of whitespace are collapsed to one space and the
             paragraph is re-wrapped with ``textwrap.fill``

    Returns the resulting string.
    """
    paragraph = " ".join(words)
    for name, literal in entities:
        paragraph = paragraph.replace(name, literal)
    if pre:
        # Preformatted text keeps its original spacing and line breaks.
        return paragraph
    collapsed = re.sub("[\r\n\t ]+", " ", paragraph)
    return textwrap.fill(collapsed)
23+
24+
def render(node, encoding='latin-1', pre=False):
    """Render *node* and its descendants to plain text.

    The result is stored on the node itself rather than returned as a string:

    * ``node.text``     -- the rendered text of the subtree
    * ``node.pre``      -- true if the node is inside (or is) a tag listed in
                           ``pre_tags``
    * ``node.is_block`` -- true if the node is listed in ``block_tags`` or
                           contains a non-empty block-level child

    node     -- a BeautifulSoup tag (or the soup object itself)
    encoding -- encoding passed to ``NavigableString.__str__``
    pre      -- true when an ancestor already requested preformatted handling

    Returns *node*, annotated as described above.
    Raises ValueError if a child is neither a NavigableString nor a Tag.
    """
    blocks = []     # finished paragraphs / child blocks, in document order
    words = []      # inline fragments collected for the current paragraph
    node.pre = pre or node.name in pre_tags
    node.is_block = node.name in block_tags
    for child in node:
        if isinstance(child, NavigableString):
            fragment = child.__str__(encoding)
            if fragment and not node.pre:
                fragment = fragment.strip()
            # Drop empty strings and anything rendered as "<!..." or "<?..."
            # (presumably comments, declarations and processing instructions).
            if (fragment and not fragment.startswith("<!")
                    and not fragment.startswith("<?")):
                words.append(fragment)
        elif isinstance(child, Tag):
            if child.name in ignore_tags:
                continue
            child = render(child, encoding, node.pre)
            if not child.text:
                continue
            if child.is_block:
                # A block child ends the paragraph being collected.
                if words:
                    blocks.append(para(words, node.pre) + "\n")
                    words = []
                blocks.append(child.text + "\n\n")
                # A node containing a block is itself treated as a block.
                node.is_block = True
            else:
                words.append(child.text)
        else:
            # The original raised the undefined name ``ValueException``,
            # which surfaced as a NameError instead of the intended error.
            raise ValueError("Unexpected node type: '%s'" % child)
    if words:
        blocks.append(para(words, node.pre))

    node.text = ''.join(blocks)
    return node
57+
58+
class TextSoup(BeautifulSoup):
    """BeautifulSoup subclass whose string form is the rendered plain text."""

    def __str__(self, encoding='latin-1',
                prettyPrint=False, indentLevel=0):
        """Return the document rendered as plain text.

        The tree is rendered with ``render``; afterwards runs of spaces and
        tabs are squeezed to a single space and runs of two or more newlines
        to exactly two.  ``prettyPrint`` and ``indentLevel`` are accepted but
        not used here.
        """
        rendered = render(self, encoding).text
        rendered = re.sub("[ \t]+", " ", rendered)
        return re.sub("\n\n+", "\n\n", rendered)
67+
68+
def soup2text(html):
    """Convert the markup in *html* to plain text.

    Parses *html* with ``TextSoup`` and returns ``str()`` of the result.
    """
    return str(TextSoup(html))
71+
72+
if __name__ == "__main__":
    # Command-line use: every argument is either a URL (http, https or ftp)
    # or a local file name; each one is converted and printed as plain text.
    import sys
    import urllib2 as urllib
    for arg in sys.argv[1:]:
        if arg[:6] in ["http:/", "https:", "ftp://"]:
            source = urllib.urlopen(arg)
        else:
            source = open(arg)
        try:
            html = source.read()
        finally:
            # The original had ``file.close`` without parentheses, which only
            # looked up the method and never closed the handle.
            source.close()
        soup = TextSoup(html)
        # Parenthesised single-argument form prints the same under Python 2.
        print(str(soup))

0 commit comments

Comments
 (0)