Skip to content

Commit de9a7dd

Browse files
committed
Added the ability to pass fill and pre (preformatted) switches to the soup2text command
- Legacy-Id: 403
1 parent d8866a4 commit de9a7dd

1 file changed

Lines changed: 19 additions & 9 deletions

File tree

ietf/utils/soup2text.py

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -41,14 +41,15 @@ def unescape(text):
4141
text = text.replace(entity, char) # replace ampersand last
4242
return text
4343

44-
def para(words, pre):
44+
def para(words, pre, fill):
4545
text = "".join(words)
4646
text = unescape(text)
4747
if not pre:
4848
text = text.strip("\n")
4949
text = text.lstrip()
5050
text = re.sub("[\t\n ]+", " ", text)
51-
text = textwrap.fill(text)
51+
if fill:
52+
text = textwrap.fill(text)
5253
return text
5354

5455
def normalize(str):
@@ -60,7 +61,7 @@ def normalize(str):
6061
str = re.sub("<\?[^>]*\?>", "", str)
6162
return str
6263

63-
def render(node, encoding='latin-1', pre=False):
64+
def render(node, encoding='latin-1', pre=False, fill=True, clean=True):
6465
blocks = []
6566
words = []
6667
node.pre = pre or node.name in pre_tags
@@ -76,11 +77,11 @@ def render(node, encoding='latin-1', pre=False):
7677
if child.name in ignore_tags:
7778
pass
7879
else:
79-
child = render(child, encoding, node.pre)
80+
child = render(child, encoding, node.pre, fill, clean)
8081
if child.text:
8182
if child.is_block:
8283
if words :
83-
blocks.append(para(words, node.pre)+"\n")
84+
blocks.append(para(words, node.pre, fill)+"\n")
8485
words = []
8586
blocks.append(child.text+"\n\n")
8687
node.is_block = True
@@ -94,22 +95,31 @@ def render(node, encoding='latin-1', pre=False):
9495
else:
9596
raise ValueError("Unexpected node type: '%s'" % child)
9697
if words:
97-
blocks.append(para(words, node.pre))
98+
blocks.append(para(words, node.pre, fill))
9899

99100
node.text = ''.join(blocks)
100101
return node
101102

102103
class TextSoup(BeautifulSoup):
103104

105+
def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
106+
node = render(self, encoding, pre, fill, clean)
107+
str = node.text
108+
if clean:
109+
str = re.sub("[ \t]+", " ", str)
110+
str = re.sub("\n\n+", "\n\n", str)
111+
return str
112+
113+
104114
def __str__(self, encoding='latin-1',
105115
prettyPrint=False, indentLevel=0):
106-
node = render(self, encoding)
116+
node = render(self, encoding, fill=False)
107117
str = node.text
108118
str = re.sub("[ \t]+", " ", str)
109119
str = re.sub("\n\n+", "\n\n", str)
110120
return str
111121

112-
def soup2text(html):
122+
def soup2text(html, encoding='latin-1', pre=False, fill=True):
113123
# Line ending normalization
114124
html = html.replace("\r\n", "\n").replace("\r", "\n")
115125
# remove comments
@@ -118,7 +128,7 @@ def soup2text(html):
118128
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
119129
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
120130
soup = TextSoup(html)
121-
return str(soup)
131+
return soup.as_text(encoding, pre, fill)
122132

123133
if __name__ == "__main__":
124134
import sys

0 commit comments

Comments
 (0)