|
| 1 | +############################################################################## |
| 2 | +# |
| 3 | +# Copyright (c) 2001, 2002 Zope Corporation and Contributors. |
| 4 | +# All Rights Reserved. |
| 5 | +# |
| 6 | +# This software is subject to the provisions of the Zope Public License, |
| 7 | +# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. |
| 8 | +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED |
| 9 | +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 10 | +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
| 11 | +# FOR A PARTICULAR PURPOSE |
| 12 | +# |
| 13 | +############################################################################## |
| 14 | +""" |
| 15 | +Parse HTML and compile to TALInterpreter intermediate code. |
| 16 | +""" |
| 17 | + |
| 18 | +import sys |
| 19 | +import string |
| 20 | + |
| 21 | +from TALGenerator import TALGenerator |
| 22 | +from TALDefs import ZOPE_METAL_NS, ZOPE_TAL_NS, METALError, TALError |
| 23 | +from HTMLParser import HTMLParser, HTMLParseError |
| 24 | + |
# Tables describing how HTML elements behave with respect to minimized
# attributes, empty content models and implicit end tags.  They are kept
# as plain lists/tuples so that they can still be concatenated with "+".

BOOLEAN_HTML_ATTRS = [
    # List of Boolean attributes in HTML that may be given in
    # minimized form (e.g. <img ismap> rather than <img ismap="">)
    # From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer"
    ]

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

BLOCK_CLOSING_TAG_MAP = {
    # Maps a start tag to the open elements that it implicitly closes
    # (e.g. a new <td> closes a still-open <td> or <th>).
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    }

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    # NOTE(review): HTML names the element "noframes"; "noframe" below
    # looks like a typo.  Confirm before changing it, since this list
    # influences which open tags get closed implicitly.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
    ]

# Elements whose implicit end tag is emitted *before* any trailing
# whitespace.  list() keeps the concatenation valid when keys() returns
# a view instead of a list.
TIGHTEN_IMPLICIT_CLOSE_TAGS = (PARA_LEVEL_HTML_TAGS
                               + list(BLOCK_CLOSING_TAG_MAP.keys()))
| 65 | + |
| 66 | + |
class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested.

    The message is built from the open tags that failed to match and
    the offending end tag; the end tag is also kept as ``self.endtag``.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        """Build the error message.

        tagstack -- sequence of currently open tag names (may be empty)
        endtag   -- name of the close tag that could not be matched
        position -- position value handed on to HTMLParseError
        """
        self.endtag = endtag
        if not tagstack:
            msg = 'No tags are open to match </%s>' % endtag
        elif len(tagstack) == 1:
            msg = ('Open tag <%s> does not match close tag </%s>'
                   % (tagstack[0], endtag))
        else:
            # Joining with '>, <' yields "<a>, <b>, <c>" once wrapped in
            # the surrounding angle brackets of the format string.
            open_tags = '>, <'.join(tagstack)
            msg = ('Open tags <%s> do not match close tag </%s>'
                   % (open_tags, endtag))
        HTMLParseError.__init__(self, msg, position)
| 82 | + |
class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    The offending tag name is available as ``self.tag`` and, so that
    handlers written against NestingError keep working, also as
    ``self.endtag``.
    """

    def __init__(self, tag, position=(None, None)):
        """Record `tag` and build the message.

        tag      -- name of the empty element that was given a close tag
        position -- position value handed on to HTMLParseError
        """
        self.tag = tag
        # NestingError.__init__ is deliberately bypassed below (its
        # message does not fit), so set the attribute it would have set.
        self.endtag = tag
        msg = 'Close tag </%s> should be removed' % tag
        HTMLParseError.__init__(self, msg, position)
| 90 | + |
class OpenTagError(NestingError):
    """Exception raised when a tag is not allowed in another tag.

    The rejected tag name is stored as ``self.tag``.  The constructor
    calls HTMLParseError.__init__ directly rather than
    NestingError.__init__, so no ``endtag`` attribute is set.
    """

    def __init__(self, tagstack, tag, position=(None, None)):
        """Build the error message.

        tagstack -- non-empty sequence of open tag names; only the last
                    (innermost) entry is reported
        tag      -- name of the start tag that is not allowed there
        position -- position value handed on to HTMLParseError
        """
        self.tag = tag
        enclosing = tagstack[-1]
        HTMLParseError.__init__(
            self,
            'Tag <%s> is not allowed in <%s>' % (tag, enclosing),
            position)
| 98 | + |
class HTMLTALParser(HTMLParser):
    """HTML parser that forwards what it reads to a TAL code generator.

    Parser events (start tags, end tags, text, comments, ...) are
    translated into calls on ``self.gen``.  While parsing, the instance
    maintains:

    tagstack -- names of the currently open elements, innermost last
    nsdict   -- prefix -> namespace URI mapping currently in scope
    nsstack  -- saved ``nsdict`` values, one entry per scan_xmlns() call
                that has not yet been undone by pop_xmlns()

    End tags that are missing from the source are supplied through
    implied_endtag().
    """

    # External API

    def __init__(self, gen=None):
        """Create a parser emitting into `gen`.

        When `gen` is None, a new ``TALGenerator(xml=0)`` is used.
        """
        HTMLParser.__init__(self)
        if gen is None:
            gen = TALGenerator(xml=0)
        self.gen = gen
        self.tagstack = []
        self.nsstack = []
        self.nsdict = {'tal': ZOPE_TAL_NS, 'metal': ZOPE_METAL_NS}

    def parseFile(self, file):
        """Read the file named `file` and parse its contents."""
        f = open(file)
        try:
            data = f.read()
        finally:
            # Close the file even when reading fails.
            f.close()
        self.parseString(data)

    def parseString(self, data):
        """Parse the complete document contained in `data`.

        Elements still open at the end of the input are closed with
        implied_endtag(..., 2).
        """
        self.feed(data)
        self.close()
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)
        # Every scan_xmlns() must have been matched by a pop_xmlns().
        assert self.nsstack == [], self.nsstack

    def getCode(self):
        """Return whatever the generator's getCode() returns."""
        return self.gen.getCode()

    def getWarnings(self):
        """Return the warnings collected while parsing (always empty)."""
        return ()

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        """Handle `<tag ...>`: close what it implies, then emit it."""
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict = self.process_ns(tag, attrs)
        self.tagstack.append(tag)
        self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                  self.getpos())
        if tag in EMPTY_HTML_TAGS:
            # Empty elements never get an end tag in the source (see
            # handle_endtag), so close them right away.
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        """Handle `<tag ... />`.

        The element is never pushed on ``tagstack``.  If it carries a
        non-empty TAL ``content`` attribute it is emitted as a separate
        start and end element; otherwise as a single start element with
        ``isend=1``.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict = self.process_ns(tag, attrs)
        if taldict.get("content"):
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      self.getpos())
            self.gen.emitEndElement(tag, implied=-1)
        else:
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      self.getpos(), isend=1)
        self.pop_xmlns()

    def handle_endtag(self, tag):
        """Handle `</tag>`.

        Raises EmptyTagError when `tag` is an empty HTML element and
        NestingError (from close_enclosed_tags) when `tag` is not open.
        """
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.gen.emitEndElement(tag)
        self.pop_xmlns()
        self.tagstack.pop()

    def close_para_tags(self, tag):
        """Implicitly close open elements that start tag `tag` ends.

        Nothing is closed for empty HTML elements.  Raises OpenTagError
        when a paragraph- or block-level tag is opened inside an open
        paragraph-level element other than <p> (i.e. a heading).
        """
        if tag in EMPTY_HTML_TAGS:
            return
        # Index into tagstack of the outermost element to close, or -1.
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            # Scan outermost -> innermost.  Remember the first closable
            # element, but forget it again whenever another block-level
            # element is opened after it: that element starts a new
            # nesting level that must not be closed from inside.
            for i in range(len(self.tagstack)):
                t = self.tagstack[i]
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS or tag in BLOCK_LEVEL_HTML_TAGS:
            # Scan innermost -> outermost, stopping at the nearest
            # enclosing block-level element.
            i = len(self.tagstack) - 1
            while i >= 0:
                closetag = self.tagstack[i]
                if closetag in BLOCK_LEVEL_HTML_TAGS:
                    break
                if closetag in PARA_LEVEL_HTML_TAGS:
                    if closetag != "p":
                        raise OpenTagError(self.tagstack, tag, self.getpos())
                    close_to = i
                i = i - 1
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        """Implicitly close every element opened inside the open `tag`.

        On return `tag` is the innermost open element.  Raises
        NestingError when `tag` is not open at all.
        """
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        """Emit an end tag for the innermost open element `tag`.

        `implied` is passed on to the generator.  Callers in this class
        use -1 for an empty element closed right after its start tag,
        1 for an element closed because of another tag, and 2 for an
        element still open at the end of the input.
        """
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        if tag in TIGHTEN_IMPLICIT_CLOSE_TAGS:
            # Pick out trailing whitespace from the program, and
            # insert the close tag before the whitespace.
            white = self.gen.unEmitWhitespace()
        else:
            white = None
        self.gen.emitEndElement(tag, isend=isend, implied=implied)
        if white:
            self.gen.emitRawText(white)
        self.tagstack.pop()
        self.pop_xmlns()

    def handle_charref(self, name):
        """Pass a character reference through unchanged."""
        self.gen.emitRawText("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass an entity reference through unchanged."""
        self.gen.emitRawText("&%s;" % name)

    def handle_data(self, data):
        """Pass character data through unchanged."""
        self.gen.emitRawText(data)

    def handle_comment(self, data):
        """Re-emit a comment with its delimiters restored."""
        self.gen.emitRawText("<!--%s-->" % data)

    def handle_decl(self, data):
        """Re-emit a declaration with its delimiters restored."""
        self.gen.emitRawText("<!%s>" % data)

    def handle_pi(self, data):
        """Re-emit a processing instruction as "<?" + data + ">"."""
        self.gen.emitRawText("<?%s>" % data)

    # Internal thingies

    def scan_xmlns(self, attrs):
        """Push the current namespace map and apply xmlns:* attributes.

        The current ``nsdict`` is always pushed on ``nsstack`` so that
        every call can be undone by exactly one pop_xmlns().  A copy is
        only made when `attrs` actually declares new prefixes.
        """
        nsnew = {}
        for key, value in attrs:
            if key[:6] == "xmlns:":
                nsnew[key[6:]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        """Restore the namespace map saved by the matching scan_xmlns()."""
        self.nsdict = self.nsstack.pop()

    def fixname(self, name):
        """Classify a tag or attribute name by its namespace prefix.

        Returns a tuple ``(name, base, ns)``:

        * prefix bound to the TAL namespace   -> (name, suffix, 'tal')
        * prefix bound to the METAL namespace -> (name, suffix, 'metal')
        * ``xmlns:X`` where X is bound to the TAL or METAL namespace
                                              -> (name, name, 'xmlns')
        * anything else                       -> (name, name, 0)
        """
        if ':' in name:
            prefix, suffix = name.split(':', 1)
            if prefix == 'xmlns':
                nsuri = self.nsdict.get(suffix)
                if nsuri in (ZOPE_TAL_NS, ZOPE_METAL_NS):
                    return name, name, prefix
            else:
                nsuri = self.nsdict.get(prefix)
                if nsuri == ZOPE_TAL_NS:
                    return name, suffix, 'tal'
                elif nsuri == ZOPE_METAL_NS:
                    return name, suffix, 'metal'
        return name, name, 0

    def process_ns(self, name, attrs):
        """Sort the attributes of element `name` by namespace.

        Returns ``(name, attrlist, taldict, metaldict)``.  `attrlist`
        holds one item per attribute: the original ``(key, value)`` pair,
        or ``(key, value, ns)`` when a namespace applies.  `taldict` and
        `metaldict` map unprefixed TAL/METAL attribute names to values.
        When the tag itself is in the TAL or METAL namespace, `taldict`
        also gets a ``'tal tag'`` entry naming that namespace.

        Raises TALError / METALError on a duplicate TAL / METAL
        attribute.
        """
        attrlist = []
        taldict = {}
        metaldict = {}
        name, namebase, namens = self.fixname(name)
        for item in attrs:
            key, value = item
            key, keybase, keyns = self.fixname(key)
            ns = keyns or namens # default to tag namespace
            if ns and ns != 'unknown':
                item = (key, value, ns)
            if ns == 'tal':
                if keybase in taldict:
                    raise TALError("duplicate TAL attribute " +
                                   repr(keybase), self.getpos())
                taldict[keybase] = value
            elif ns == 'metal':
                if keybase in metaldict:
                    raise METALError("duplicate METAL attribute " +
                                     repr(keybase), self.getpos())
                metaldict[keybase] = value
            attrlist.append(item)
        if namens in ('metal', 'tal'):
            taldict['tal tag'] = namens
        return name, attrlist, taldict, metaldict
0 commit comments