Skip to content

Commit 9b78963

Browse files
committed
Fix occasional bad sentence end merges in ietf/utils/soup2text.py.
Remove some now unneeded exceptions from ietf/testurl.list - Legacy-Id: 302
1 parent ac288c2 commit 9b78963

2 files changed

Lines changed: 14 additions & 11 deletions

File tree

ietf/testurl.list

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,9 @@
33
200,404 /accounts/
44
200,302 /accounts/password_change/
55
200,302 /accounts/profile/
6-
200,404 /idtracker/status/
7-
200,404 /idtracker/last_call/
86
skip /my/
97
skip /idindex/
108
skip /idindex/showdocs/all/date/
119
skip /idindex/showdocs/all/name/
12-
200,404 /liaisons/
13-
200,404 /liaisons/managers/
14-
200,404 /mailinglists/area_lists/
15-
200,404 /mailinglists/nonwg_lists/
10+
1611

ietf/utils/soup2text.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22

33
import re
44
import textwrap
5-
from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
5+
try:
6+
from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
7+
except:
8+
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
69

710
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
811
ignore_tags = ["head", "script", "style"]
@@ -12,8 +15,14 @@
1215
("&nbsp;", " "),
1316
("&amp;", "&"), ]
1417

15-
def para(words, pre):
18+
def para(words, pre):
1619
text = " ".join(words)
20+
# Fix occasional bad sentence end merges
21+
for i in range(1,len(words)):
22+
if words[i].startswith(". "):
23+
now = words[i-1]+" "+words[i]
24+
fix = words[i-1]+words[i]
25+
text = text.replace(now, fix)
1726
for entity, char in entities:
1827
text = text.replace(entity, char)
1928
if not pre:
@@ -80,6 +89,5 @@ def soup2text(html):
8089
else:
8190
file = open(arg)
8291
html = file.read()
83-
file.close
84-
soup = TextSoup(html)
85-
print str(soup)
92+
file.close()
93+
print soup2text(html)

0 commit comments

Comments
 (0)