datatracker/ietf/utils/soup2text.py at 772a2242ad783fa9c17543d2903a086d3b148c83 · https-github-com-bit/datatracker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python

import re
import textwrap
try:
    from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
except:
    from BeautifulSoup import Tag, BeautifulSoup, NavigableString

# Tags whose rendered text is emitted as a block of its own by render().
block_tags = [
    "[document]", "html", "body", "div", "blockquote", "table", "tr",
    "p", "pre",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "li", "option",
]
# Inline tags that get a trailing space when their text does not already
# end in whitespace.
space_tags = [
    "th",
    "td",
]
# Inline tags that get a trailing newline when their (non-empty) text does
# not already end in whitespace.
break_tags = [
    "br",
]
# Tags that render() skips entirely, together with everything inside them.
ignore_tags = [
    "head",
    "script",
    "style",
]
# Tags whose content is treated as preformatted: no whitespace normalization
# and no line filling.
pre_tags = [
    "pre",
    "option",
]
# Named character entities understood by unescape(), as (entity, replacement)
# pairs.  unescape() decodes everything in one pass, so the order of this
# list is no longer significant; it is kept unchanged for other users.
entities = [("&lt;", "<"),   ("&gt;", ">"),
            ("&quot;", '"'), ("&apos;", "'"),
            ("&nbsp;", " "),
            ("&amp;", "&"), ]           # ampersand last

def unescape(text):
    """Decode character references in *text* and return the result.

    Two kinds of reference are decoded:

    * decimal references such as ``&#65;`` whose code is below 256; larger
      codes and malformed references are left exactly as they appear;
    * the named entities listed in the module-level ``entities`` table.

    All references are decoded in a single left-to-right pass, so the output
    of one replacement is never decoded a second time: ``&#38;lt;`` and
    ``&amp;lt;`` both yield the literal text ``&lt;``.
    """
    named = dict(entities)
    # Longest names first, so a short entity can never pre-empt a longer one
    # that starts with the same characters.
    names = sorted(named, key=len, reverse=True)
    pattern = "|".join([r"&#(\d+);"] + [re.escape(name) for name in names])

    def decode(match):
        digits = match.group(1)
        if digits is None:
            # Not a numeric reference, so it is one of the named entities.
            return named[match.group()]
        num = int(digits)
        if num < 256:
            return chr(num)
        # Code outside the single-byte range: keep the reference verbatim.
        return match.group()

    return re.sub(pattern, decode, text)

def para(words, pre, fill):
    """Join the text fragments in *words* into one paragraph string.

    The fragments are concatenated and passed through unescape().  When
    *pre* is true the result is returned as is.  Otherwise surrounding
    newlines and leading whitespace are stripped, every run of tabs,
    newlines and spaces is collapsed to a single space, and, if *fill* is
    true, the text is re-wrapped with textwrap.fill().
    """
    joined = unescape("".join(words))
    if pre:
        # Preformatted text keeps its whitespace untouched.
        return joined
    trimmed = joined.strip("\n").lstrip()
    flowed = re.sub("[\t\n ]+", " ", trimmed)
    return textwrap.fill(flowed) if fill else flowed

def normalize(str):
    """Return *str* with edge whitespace squeezed and markup declarations removed.

    Spaces and tabs at the very beginning and at the very end of the string
    are each reduced to a single space; ``<!...>`` declarations and
    ``<?...?>`` processing instructions are deleted.
    """
    substitutions = (
        # Normalize whitespace at the beginning and end of the string
        (r"^[ \t]+", " "),
        (r"[ \t]+$", " "),
        # remove xml PIs and metainformation
        (r"<![^>]*>", ""),
        (r"<\?[^>]*\?>", ""),
    )
    result = str
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def render(node, encoding='latin-1', pre=False, fill=True, clean=True):
    """Recursively render *node* to text and return the same *node*.

    The result is stored on the node itself: ``node.text`` holds the
    rendered text, ``node.pre`` tells whether the node is treated as
    preformatted, and ``node.is_block`` whether it is rendered as a block
    (its tag is in ``block_tags`` or it contains a block-level child with
    text).

    *encoding* is passed to ``__str__`` of each string child.  *pre* forces
    preformatted handling, which is also switched on by tags in
    ``pre_tags`` and inherited by descendants.  *fill* is handed to para().
    *clean* is not used here other than being passed on to the recursion.

    Raises ValueError for a child that is neither a NavigableString nor a
    Tag.
    """
    finished = []   # completed pieces of output, in document order
    pending = []    # inline fragments not yet turned into a paragraph
    node.pre = pre or node.name in pre_tags
    node.is_block = node.name in block_tags
    for child in node:
        if isinstance(child, NavigableString):
            fragment = child.__str__(encoding)
            if fragment and not node.pre:
                fragment = normalize(fragment)
            if fragment:
                pending.append(fragment)
            continue
        if not isinstance(child, Tag):
            raise ValueError("Unexpected node type: '%s'" % child)
        if child.name in ignore_tags:
            continue
        rendered = render(child, encoding, node.pre, fill, clean)
        if not rendered.text:
            # Children that produce no text contribute nothing at all.
            continue
        if rendered.is_block:
            # A block child ends the paragraph collected so far.
            if pending:
                finished.append(para(pending, node.pre, fill) + "\n")
                pending = []
            finished.append(rendered.text + "\n\n")
            node.is_block = True
            continue
        pending.append(rendered.text)
        if rendered.text[-1] in (" ", "\t", "\n"):
            continue
        # Inline text without trailing whitespace may need a separator.
        if rendered.name in space_tags:
            pending.append(" ")
        if rendered.name in break_tags:
            pending.append("\n")
    if pending:
        finished.append(para(pending, node.pre, fill))

    node.text = ''.join(finished)
    return node

class TextSoup(BeautifulSoup):
    """BeautifulSoup document that can be rendered as plain text via render()."""

    def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
        """Return the document rendered as text.

        The arguments are passed straight to render().  When *clean* is
        true, runs of spaces/tabs are collapsed to one space and runs of
        two or more newlines to exactly two.
        """
        rendered = render(self, encoding, pre, fill, clean).text
        if not clean:
            return rendered
        collapsed = re.sub("[ \t]+", " ", rendered)
        return re.sub("\n\n+", "\n\n", collapsed)


    def __str__(self, encoding='latin-1',
                prettyPrint=False, indentLevel=0):
        """Return the document rendered as unfilled, whitespace-cleaned text.

        *prettyPrint* and *indentLevel* are accepted but not used.
        """
        rendered = render(self, encoding, fill=False).text
        return re.sub("\n\n+", "\n\n", re.sub("[ \t]+", " ", rendered))

def soup2text(html, encoding='latin-1', pre=False, fill=True):
    """Convert the HTML string *html* to plain text.

    Line endings are normalized to ``\\n`` and a few regex rewrites are
    applied before the markup is parsed with TextSoup and rendered through
    its as_text() method, to which *encoding*, *pre* and *fill* are passed.
    """
    # Line ending normalization
    markup = html.replace("\r\n", "\n")
    markup = markup.replace("\r", "\n")
    rewrites = (
        # remove comments
        ("(?s)<!--.*?-->", ""),
        # some preprocessing to handle common pathological cases:
        # a run of line breaks becomes a paragraph break ...
        ("<br */?>[ \t\n]*(<br */?>)+", "<p/>"),
        # ... and any remaining line break is followed by a real newline
        ("<br */?>([^\n])", r"<br />\n\1"),
    )
    for pattern, replacement in rewrites:
        markup = re.sub(pattern, replacement, markup)
    return TextSoup(markup).as_text(encoding, pre, fill)

# Command-line use: print the text rendering of every file or URL given as
# an argument.
if __name__ == "__main__":
    import sys
    import urllib2 as urllib
    for arg in sys.argv[1:]:
        # Arguments that look like URLs are fetched; anything else is
        # opened as a local file.
        if arg[:6] in ["http:/", "https:", "ftp://"]:
            stream = urllib.urlopen(arg)
        else:
            stream = open(arg)
        # Close the handle even when reading fails.
        try:
            html = stream.read()
        finally:
            stream.close()
        # Single-argument call form prints the same under the print statement.
        print(soup2text(html))