Skip to content

Commit 1cafcf3

Browse files
committed
Changed approach to space normalization in soup2text(). Plain whitespace stripping followed by reassembly caused too much information loss. Accompanying changes in generic diff files.
- Legacy-Id: 321
1 parent 49ee9f8 commit 1cafcf3

3 files changed

Lines changed: 20 additions & 17 deletions

File tree

ietf/utils/soup2text.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,23 @@ def unescape(text):
3939
text = text.replace(entity, char) # replace ampersand last
4040
return text
4141

42-
def para(words, pre):
43-
text = " ".join(words)
44-
# Fix occasional bad sentence end merges
45-
for i in range(1,len(words)):
46-
if words[i].startswith(". "):
47-
now = words[i-1]+" "+words[i]
48-
fix = words[i-1]+words[i]
49-
text = text.replace(now, fix)
42+
def para(words, pre):
43+
text = "".join(words)
5044
text = unescape(text)
5145
if not pre:
5246
text = re.sub("[\r\n\t ]+", " ", text)
5347
text = textwrap.fill(text)
5448
return text
5549

50+
def normalize(str):
51+
# Normalize whitespace at the beginning and end of the string
52+
str = re.sub("^[ \t\n]+", " ", str)
53+
str = re.sub("[ \t\n]+$", " ", str)
54+
# remove xml PIs and metainformation
55+
str = re.sub("<![^>]*>", "", str)
56+
str = re.sub("<\?[^>]*\?>", "", str)
57+
return str
58+
5659
def render(node, encoding='latin-1', pre=False):
5760
blocks = []
5861
words = []
@@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False):
6265
if isinstance(child, NavigableString):
6366
str = child.__str__(encoding)
6467
if str and not node.pre:
65-
str = str.strip()
66-
if str and not str.startswith("<!") and not str.startswith("<?"):
68+
str = normalize(str)
69+
if str:
6770
words.append(str)
6871
elif isinstance(child, Tag):
6972
if child.name in ignore_tags:
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
@@ -12,0 +12,5 @@
2-
+Did you find a bug? Let us know .
2+
+Did you find a bug? Let us know.
33
+
4-
+Any question or suggestion ?
4+
+Any question or suggestion?
55
+
6-
+This page produced by the IETF Secretariat for the IESG
6+
+This page produced by the IETF Secretariat for the IESG

test/diff/generic-diff_produced-by-2

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
@@ -27,0 +23,1 @@
2-
+Did you find a bug? Let us know .
3-
@@ -28,0 +25,5 @@
4-
+Any question or suggestion ?
1+
@@ -17,0 +17,1 @@
2+
+Did you find a bug? Let us know.
3+
@@ -18,0 +19,5 @@
4+
+Any question or suggestion?
55
+
66
+This page produced by the IETF Secretariat for the IESG
77
+

0 commit comments

Comments
 (0)