Skip to content

Commit 7f512b4

Browse files
committed
make soup2text convert numeric character codes (e.g., "&#39;") too.
- Legacy-Id: 306
1 parent 94734fa commit 7f512b4

1 file changed

Lines changed: 26 additions & 3 deletions

File tree

ietf/utils/soup2text.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,31 @@
1313
# Named character entities understood by unescape().  "&amp;" is kept last
# so that any code applying these pairs one after another with str.replace()
# does not turn "&amp;lt;" into "<".
entities = [("&lt;", "<"), ("&gt;", ">"),
            ("&quot;", '"'), ("&apos;", "'"),
            ("&nbsp;", " "),
            ("&amp;", "&"), ]  # ampersand last


def unescape(text):
    """Return *text* with HTML character references replaced by characters.

    Two kinds of reference are handled:

    * decimal numeric codes such as ``&#39;`` -- converted with ``chr()``
      when the code is below 256; larger codes are left in the text as-is;
    * the named entities listed in the module-level ``entities`` table.

    Anything else (unknown names, hexadecimal codes, a bare ``&#``) is
    returned unchanged.

    The text is scanned once, left to right, and the output of one
    replacement is never scanned again.  This means ``&#38;lt;`` yields the
    literal ``&lt;`` rather than being unescaped a second time into ``<``.
    """
    named = dict(entities)

    def _decode(match):
        # Replacement callback: map one "&...;" token to its character, or
        # hand the token back untouched when it is not something we know.
        token = match.group(0)
        digits = match.group(1)
        if digits is None:
            return named.get(token, token)
        code = int(digits)
        # Only single-byte codes are converted; higher code points are kept
        # in their escaped form.
        if code < 256:
            return chr(code)
        return token

    return re.sub(r"&(?:#(\d+)|\w+);", _decode, text)
1741

1842
def para(words, pre):
1943
text = " ".join(words)
@@ -23,8 +47,7 @@ def para(words, pre):
2347
now = words[i-1]+" "+words[i]
2448
fix = words[i-1]+words[i]
2549
text = text.replace(now, fix)
26-
for entity, char in entities:
27-
text = text.replace(entity, char)
50+
text = unescape(text)
2851
if not pre:
2952
text = re.sub("[\r\n\t ]+", " ", text)
3053
text = textwrap.fill(text)

0 commit comments

Comments
 (0)