| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2003-2006 Edgewall Software |
|---|
| 4 | # All rights reserved. |
|---|
| 5 | # |
|---|
| 6 | # This software is licensed as described in the file COPYING, which |
|---|
| 7 | # you should have received as part of this distribution. The terms |
|---|
| 8 | # are also available at http://trac.edgewall.com/license.html. |
|---|
| 9 | # |
|---|
| 10 | # This software consists of voluntary contributions made by many |
|---|
| 11 | # individuals. For exact contribution history, see the revision |
|---|
| 12 | # history and logs, available at http://projects.edgewall.com/trac/. |
|---|
| 13 | |
|---|
| 14 | import htmlentitydefs |
|---|
| 15 | from HTMLParser import HTMLParser, HTMLParseError |
|---|
| 16 | import re |
|---|
| 17 | try: |
|---|
| 18 | frozenset |
|---|
| 19 | except NameError: |
|---|
| 20 | from sets import ImmutableSet as frozenset |
|---|
| 21 | from StringIO import StringIO |
|---|
| 22 | import sys |
|---|
| 23 | |
|---|
# Elements that never have content; they are serialized as `<tag />` and
# their end tags are suppressed.
_EMPTY_TAGS = frozenset(['br', 'hr', 'img', 'input'])

# HTML boolean (minimized) attributes: they are omitted when the value is
# false and rendered as name="name" otherwise. `readonly` belongs to this
# set as well; without it a false value was still written out as
# readonly="False", which user agents treat as "read-only".
_BOOLEAN_ATTRS = frozenset([
    'selected', 'checked', 'compact', 'declare', 'defer', 'disabled',
    'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap',
    'readonly',
])
|---|
| 28 | |
|---|
| 29 | |
|---|
class Markup(str):
    """Marks a string as being safe for inclusion in XML output without needing
    to be escaped.

    Strings are normally automatically escaped when added to the HDF.
    `Markup`-strings are however an exception. Use with care.

    (since Trac 0.9.3)
    """

    def __new__(self, text='', *args):
        """Create the markup string.

        If positional `args` are given, `text` is used as a `%` format string
        and every argument is escaped before it is substituted.
        """
        # NOTE: `self` is really the class being instantiated; the parameter
        # name is kept unchanged for backwards compatibility.
        if args:
            text %= tuple([Markup.escape(arg) for arg in args])
        return str.__new__(self, text)

    def __add__(self, other):
        """Concatenate, escaping `other` unless it is already `Markup`."""
        return Markup(str(self) + Markup.escape(other))

    def __mul__(self, num):
        """Repeat the markup `num` times; the result is still `Markup`."""
        return Markup(str(self) * num)

    def join(self, seq):
        """Join the items of `seq`, escaping every item that is not already
        `Markup`, using this string as the separator."""
        return Markup(str(self).join([Markup.escape(item) for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left
        intact, and the ampersand of unknown named entities is escaped so
        that the result stays well-formed.

        Numeric references that do not denote a representable character are
        left untouched instead of raising an exception.

        (Since Trac 0.10)
        """
        def _replace_entity(match):
            if match.group(1): # numeric entity
                ref = match.group(1)
                try:
                    # The pattern accepts both `x` and `X` as hex marker
                    if ref[0] in 'xX':
                        ref = int(ref[1:], 16)
                    else:
                        ref = int(ref, 10)
                    return unichr(ref).encode('utf-8')
                except (ValueError, OverflowError):
                    # Code point out of range: keep the original text
                    return match.group(0)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
                                               'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint).encode('utf-8')
                except KeyError:
                    if keepxmlentities:
                        # Not an entity XML knows about, so the ampersand
                        # has to be escaped to keep the output well-formed
                        return '&amp;%s;' % ref
                    else:
                        return ref
        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                             _replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and ").

        If the `quotes` parameter is set to `False`, the " character is left as
        is. Escaping quotes is generally only required for strings that are to
        be used in attribute values.

        Instances of `cls` are considered safe already and returned as is;
        any other object is converted with `str()` first.
        """
        if isinstance(text, cls):
            return text
        text = str(text)
        if not text:
            return cls()
        # The ampersand must be replaced first, otherwise the entities
        # produced by the other replacements would get escaped again.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &amp;, &lt;, &gt; and &#34; and returns a `str`."""
        if not self:
            return ''
        # `&amp;` is handled last so that e.g. `&amp;lt;` yields `&lt;`
        # rather than `<`.
        return str(self).replace('&#34;', '"') \
                        .replace('&gt;', '>') \
                        .replace('&lt;', '<') \
                        .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `str` with all entities and tags removed.

        If `keeplinebreaks` is false, newlines are replaced by spaces.
        """
        text = self.striptags().stripentities()
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Parse the text as HTML and return a cleaned up XHTML representation.

        This will remove any javascript code or other potentially dangerous
        elements.

        If the HTML cannot be parsed, an `HTMLParseError` will be raised by the
        underlying `HTMLParser` module, which should be handled by the caller of
        this function.
        """
        buf = StringIO()
        sanitizer = Sanitizer(buf)
        sanitizer.feed(self.stripentities(keepxmlentities=True))
        return Markup(buf.getvalue())
|---|
| 140 | |
|---|
| 141 | |
|---|
# Module-level shortcut for `Markup.escape`; the other classes in this module
# call it under this name.
escape = Markup.escape
|---|
| 143 | |
|---|
def unescape(text):
    """Return the unescaped form of `text`.

    `Markup` instances are converted by calling their `unescape()` method;
    any other object is handed back untouched.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
|---|
| 149 | |
|---|
| 150 | |
|---|
class Deuglifier(object):
    """Rewrites font markup into `<span>` elements.

    Subclasses are expected to provide a `rules()` class-level callable that
    returns a list of regular expression strings using named groups. The
    rules are combined into a single alternation which is compiled once per
    class and then shared by the instances.
    """

    def __new__(cls):
        # Compile lazily on first instantiation and cache on the class
        if not hasattr(cls, '_compiled_rules'):
            pattern = '(?:%s)' % '|'.join(cls.rules())
            cls._compiled_rules = re.compile(pattern)
        instance = object.__new__(cls)
        instance._compiled_rules = cls._compiled_rules
        return instance

    def format(self, indata):
        """Return `indata` with every rule match substituted by `replace`."""
        return self._compiled_rules.sub(self.replace, indata)

    def replace(self, fullmatch):
        """Return the replacement text for a single rule match.

        The first named group that matched something decides the result:
        `font` and `endfont` map to a plain opening/closing span, any other
        group name becomes the suffix of a `code-` CSS class.
        """
        for group_name, matched in fullmatch.groupdict().items():
            if not matched:
                continue
            if group_name == 'font':
                return '<span>'
            if group_name == 'endfont':
                return '</span>'
            return '<span class="code-%s">' % group_name
|---|
| 171 | |
|---|
| 172 | |
|---|
class Sanitizer(HTMLParser):
    """HTML parser that writes a cleaned up XHTML version of its input to the
    file-like object `out`.

    Only the elements in `safe_tags` and the attributes in `safe_attrs` are
    let through. An element that is not considered safe is dropped together
    with everything up to its matching end tag. URIs in attribute values and
    in inline styles must use one of the `safe_schemes` (`None` stands for
    "no scheme", i.e. a relative reference).
    """

    safe_tags = frozenset(['a', 'abbr', 'acronym', 'address', 'area',
        'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center',
        'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
        'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2',
        'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd',
        'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
        'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small',
        'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody',
        'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul',
        'var'])
    safe_attrs = frozenset(['abbr', 'accept', 'accept-charset',
        'accesskey', 'action', 'align', 'alt', 'axis', 'border',
        'cellpadding', 'cellspacing', 'char', 'charoff', 'charset',
        'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color',
        'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
        'for', 'frame', 'headers', 'height', 'href', 'hreflang',
        'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc',
        'maxlength', 'media', 'method', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows',
        'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex',
        'target', 'title', 'type', 'usemap', 'valign', 'value',
        'vspace', 'width'])
    uri_attrs = frozenset(['action', 'background', 'dynsrc', 'href',
        'lowsrc', 'src'])
    safe_schemes = frozenset(['file', 'ftp', 'http', 'https', 'mailto',
        None])

    def __init__(self, out):
        HTMLParser.__init__(self)
        self.out = out
        # Name of the unsafe element whose end tag we are waiting for; all
        # events are discarded while this is set.
        self.waiting_for = None

    def _get_scheme(self, text):
        """Return the lower-cased URI scheme of `text`, or `None` if there
        is no colon in it.

        Non-alphanumeric characters are removed from the scheme so that it
        cannot be disguised with embedded whitespace or control characters.
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()

    def _sanitize_css(self, style):
        """Return the inline style `style` without dangerous declarations.

        Declarations that contain `expression` or a `url(...)` with an
        unsafe scheme are removed. The checks are case-insensitive, because
        CSS keywords are. An empty string is returned if nothing is left.
        """
        decls = []
        for decl in style.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            lowered = decl.lower()
            is_evil = 'expression' in lowered
            if not is_evil:
                for m in re.finditer(r'url\s*\(([^)]+)', lowered):
                    if self._get_scheme(m.group(1)) not in self.safe_schemes:
                        is_evil = True
                        break
            if not is_evil:
                decls.append(decl)
        return '; '.join(decls)

    def handle_starttag(self, tag, attrs):
        if self.waiting_for:
            return
        if tag not in self.safe_tags:
            self.waiting_for = tag
            return
        self.out.write('<' + tag)

        for attrname, attrval in attrs:
            if attrname not in self.safe_attrs:
                continue
            if attrval is None:
                # Attribute given without a value, e.g. <input checked>.
                # Expand minimized boolean attributes the XHTML way and
                # treat anything else as an empty value.
                if attrname in _BOOLEAN_ATTRS:
                    attrval = attrname
                else:
                    attrval = ''
            if attrname in self.uri_attrs:
                # Don't allow URI schemes such as "javascript:"
                if self._get_scheme(attrval) not in self.safe_schemes:
                    continue
            elif attrname == 'style':
                # Remove dangerous CSS declarations from inline styles
                attrval = self._sanitize_css(attrval)
                if not attrval:
                    continue
            self.out.write(' ' + attrname + '="' + escape(attrval) + '"')

        if tag in _EMPTY_TAGS:
            self.out.write(' />')
        else:
            self.out.write('>')

    def handle_entityref(self, name):
        if not self.waiting_for:
            self.out.write('&%s;' % name)

    def handle_data(self, data):
        if not self.waiting_for:
            self.out.write(escape(data, quotes=False))

    def handle_endtag(self, tag):
        if self.waiting_for:
            # Only the end tag of the element being skipped ends the skipping
            if self.waiting_for == tag:
                self.waiting_for = None
            return
        if tag not in _EMPTY_TAGS:
            self.out.write('</' + tag + '>')
|---|
| 268 | |
|---|
| 269 | |
|---|
class FragmentMeta(type):
    """Metaclass that allows item access directly on the class.

    `SomeClass[nodes]` creates an instance by calling the class without
    arguments and returns the result of indexing that instance with `nodes`.
    """

    def __getitem__(cls, nodes):
        instance = cls()
        return instance[nodes]
|---|
| 274 | |
|---|
| 275 | |
|---|
class Fragment(object):
    """A flat list of child nodes (elements and text) without a tag of its
    own.

    Children are added with `append()` or with item access notation, which
    is also available on the class itself through the metaclass.
    """
    __metaclass__ = FragmentMeta
    __slots__ = ['children']

    def __init__(self):
        self.children = []

    def append(self, node):
        """Append an element or string as child node.

        The children of a plain fragment are merged into this one, elements
        are added as is, and any other value is added only if it is true.
        """
        if isinstance(node, Fragment) and not isinstance(node, Element):
            self.children.extend(node.children)
            return
        if isinstance(node, Element) or node:
            self.children.append(node)

    def __getitem__(self, nodes):
        """Add child nodes to the element and return the element itself."""
        if isinstance(nodes, (basestring, Fragment)):
            # Strings and fragments count as a single node
            nodes = [nodes]
        else:
            try:
                nodes = iter(nodes)
            except TypeError:
                # Not iterable: use the string representation as text node
                nodes = [str(nodes)]
        for node in nodes:
            self.append(node)
        return self

    def serialize(self):
        """Generator that yield tags and text nodes as strings."""
        for child in self.children:
            if not isinstance(child, Element):
                yield escape(child, quotes=False)
                continue
            for part in child.serialize():
                yield part

    def __str__(self):
        parts = self.serialize()
        return ''.join(parts)

    def __add__(self, other):
        combined = Fragment()
        combined.append(self)
        combined.append(other)
        return combined
|---|
| 319 | |
|---|
| 320 | |
|---|
class Element(Fragment):
    """Simple XHTML output generator based on the builder pattern.

    Construct XHTML elements by passing the tag name to the constructor:

    >>> print Element('strong')
    <strong></strong>

    Attributes can be specified using keyword arguments. The values of the
    arguments will be converted to strings and any special XML characters
    escaped:

    >>> print Element('textarea', rows=10, cols=60)
    <textarea rows="10" cols="60"></textarea>
    >>> print Element('span', title='1 < 2')
    <span title="1 &lt; 2"></span>
    >>> print Element('span', title='"baz"')
    <span title="&#34;baz&#34;"></span>

    The order in which attributes are rendered is undefined.

    If an attribute value evaluates to `None`, that attribute is not included
    in the output:

    >>> print Element('a', name=None)
    <a></a>

    Attribute names that conflict with Python keywords can be specified by
    appending an underscore:

    >>> print Element('div', class_='warning')
    <div class="warning"></div>

    While the tag names and attributes are not restricted to the XHTML language,
    some HTML characteristics such as boolean (minimized) attributes and empty
    elements get special treatment.

    For compatibility with HTML user agents, some XHTML elements need to be
    closed using a separate closing tag even if they are empty. For this, the
    close tag is only omitted for a small set of elements which are known to
    be safe for use as empty elements:

    >>> print Element('br')
    <br />

    Trying to add nested elements to such an element will cause an
    `AssertionError`:

    >>> Element('br')['Oops']
    Traceback (most recent call last):
        ...
    AssertionError: 'br' elements must not have content

    Boolean attributes such as "selected" or "checked" are omitted if the
    value evaluates to `False`. Otherwise, the name of the attribute is used
    for the value:

    >>> print Element('option', value=0, selected=False)
    <option value="0"></option>
    >>> print Element('option', selected='yeah')
    <option selected="selected"></option>

    Nested elements can be added to an element using item access notation:

    >>> print Element('ul')[Element('li'), Element('li')]
    <ul><li></li><li></li></ul>

    Text nodes can be nested in an element by adding strings instead of
    elements. Any special characters in the strings are escaped automatically:

    >>> print Element('em')['Hello world']
    <em>Hello world</em>
    >>> print Element('em')[42]
    <em>42</em>
    >>> print Element('em')['1 < 2']
    <em>1 &lt; 2</em>

    This technique also allows mixed content:

    >>> print Element('p')['Hello ', Element('b')['world']]
    <p>Hello <b>world</b></p>

    Elements can also be combined with other elements or strings using the
    addition operator, which results in a `Fragment` object that contains the
    operands:

    >>> print Element('br') + 'some text' + Element('br')
    <br />some text<br />
    """
    __slots__ = ['tagname', 'attr']

    def __init__(self, tagname_=None, **attr):
        Fragment.__init__(self)
        # Without an explicit tag name the `tagname` attribute has to be
        # provided by the class (as the generated per-tag subclasses do)
        if tagname_:
            self.tagname = tagname_
        self.attr = {}
        self(**attr)

    def __call__(self, **attr):
        """Set or update attributes and return the element itself."""
        self.attr.update(attr)
        return self

    def append(self, node):
        """Append an element or string as child node."""
        assert self.tagname not in _EMPTY_TAGS, \
            "'%s' elements must not have content" % self.tagname
        Fragment.append(self, node)

    def serialize(self):
        """Generator that yield tags and text nodes as strings."""
        starttag = ['<', self.tagname]
        for name, value in self.attr.items():
            if value is None:
                continue
            if name in _BOOLEAN_ATTRS:
                # Minimized attributes: dropped if false, name="name" if true
                if not value:
                    continue
                value = name
            else:
                # `class_` -> `class`, `accept_charset` -> `accept-charset`
                name = name.rstrip('_').replace('_', '-')
            starttag.append(' %s="%s"' % (name, escape(value)))

        if self.children or self.tagname not in _EMPTY_TAGS:
            starttag.append('>')
            yield Markup(''.join(starttag))
            for part in Fragment.serialize(self):
                yield part
            yield '</%s>' % self.tagname

        else:
            starttag.append(' />')
            yield Markup(''.join(starttag))
|---|
| 453 | |
|---|
| 454 | |
|---|
# Generate an `Element` subclass for every tag the sanitizer lets through and
# publish it as an attribute of this module under the upper-cased tag name.
for tagname in Sanitizer.safe_tags:
    klass = type(tagname.upper(), (Element,), {
        'tagname': tagname,
        '__doc__': 'HTML element <%s>' % tagname,
    })
    setattr(sys.modules[__name__], klass.__name__, klass)
# Don't leave the loop variables behind in the module namespace
del klass, tagname
|---|