view writer.py @ 15:0be0586104b7

Plug the leak properly (I hope).
author David Barts <n5jrn@me.com>
date Fri, 27 Dec 2019 13:35:51 -0800
parents 091c03f1b2e8
children
line wrap: on
line source

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# A simple HTML writer, so we can process HTML in a streamwise fashion
# via callbacks, which compared to a document tree tends to be all of:
# easier to program, uses less memory, and uses less processor time.

# I m p o r t s

import os, sys
import codecs
import io
import html

# V a r i a b l e s

# Only ASCII, ISO-8859-1, and encodings able to represent the whole of
# Unicode are supported; anything else is more trouble than it is worth.
# This table maps each supported codec (as returned by codecs.lookup) to
# the canonical name we report for it.
CODECS_TO_NAME = {
    codecs.lookup(name): name
    for name in (
        "US-ASCII", "ISO-8859-1",
        "UTF-8",
        "UTF-16", "UTF-16LE", "UTF-16BE",
        "UTF-32", "UTF-32LE", "UTF-32BE",
    )
}
# Highest code point that the limited encodings can carry.
_MAXCHAR = {"US-ASCII": 127, "ISO-8859-1": 255}

# HTML defines far more named entities than these, but we are pessimistic
# about what browsers "in the wild" understand, so only the names XML
# itself defines are used.
_QUOTE_ENTITIES = {'"': "quot", "'": "apos"}
_OTHER_ENTITIES = {"&": "amp", "<": "lt", ">": "gt"}

# C l a s s e s

class HtmlStreamWriter(object):
    """
    A simple HTML writer, intended to be used in a streamwise fashion.
    This class takes REASONABLE precautions against writing garbage, but
    does not check every last thing. It will happily write tags like
    "<garb<<age>>>" etc. if you feed it the right garbage in.

    Text data and attribute values are entity-escaped. Tag names,
    attribute names, comments, declarations, processing instructions and
    raw data are written exactly as supplied.
    """
    def __init__(self, stream, encoding):
        """
        Initialize this writer. An encoding is mandatory, even though we
        produce character output, because the encoding governs which
        characters we can send on for I/O without entity-escaping them.
        The supplied stream must accept str via its write() method.

        Raises ValueError if the encoding is unknown to Python or is not
        one of the encodings listed in CODECS_TO_NAME.
        """
        # Stream we're using is available to the caller as .stream
        self.stream = stream
        try:
            # A codec to use is available to the caller as .codec
            self.codec = codecs.lookup(encoding)
            # Normalized encoding name is available to the caller as .encoding
            self.encoding = CODECS_TO_NAME[self.codec]
        except LookupError as e:
            # LookupError covers both an unknown codec name and the
            # KeyError raised for a codec we do not support.
            raise ValueError("invalid encoding {0!r}".format(encoding)) from e
        # Characters above this code point get written as numeric
        # character references.
        self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)

    # html.escape drops the ball, badly. It is too optimistic about what
    # entity names are likely to be understood, and is too stupid to know
    # that ASCII streams need lots of things escaped.
    def _escape(self, string, quote=False):
        """
        Write string to the stream, replacing "&", "<" and ">" with named
        entities and any character the encoding cannot carry with a
        decimal character reference. If quote is true, single and double
        quotes are replaced with named entities as well.
        """
        pieces = []
        for ch in string:
            if quote and ch in _QUOTE_ENTITIES:
                pieces.append("&" + _QUOTE_ENTITIES[ch] + ";")
            elif ch in _OTHER_ENTITIES:
                pieces.append("&" + _OTHER_ENTITIES[ch] + ";")
            elif ord(ch) > self._maxchar:
                pieces.append("&#" + str(ord(ch)) + ";")
            else:
                pieces.append(ch)
        # One write per call instead of up to three per character.
        if pieces:
            self.stream.write("".join(pieces))

    def write_starttag(self, tag, attrs):
        """
        Write a start tag. attrs is an iterable of (name, value) pairs;
        see _writeattrs for how they are written.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write(">")

    def _writeattrs(self, attrs):
        """
        Write each (name, value) pair as ' name="value"', escaping the
        value. A value of None (the form html.parser uses for an
        attribute given without a value) is written as the bare name.
        """
        for k, v in attrs:
            self.stream.write(" ")
            self.stream.write(k)
            if v is None:
                # Nothing to escape or quote; iterating None would fail.
                continue
            self.stream.write("=\"")
            self._escape(v, quote=True)
            self.stream.write("\"")

    def write_endtag(self, tag):
        """
        Write an end tag.
        """
        self.stream.write("</")
        self.stream.write(tag)
        self.stream.write(">")

    def write_startendtag(self, tag, attrs):
        """
        Write a "start-end" (i.e. empty) tag. attrs is handled as for
        write_starttag.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write("/>")

    def write_data(self, data):
        """
        Write text data, entity-escaping it as needed.
        """
        self._escape(data)

    def write_raw_data(self, data):
        """
        Write raw data (e.g. style sheets, scripts, etc.) with no escaping.
        """
        self.stream.write(data)

    def write_charref(self, name):
        """
        Write character reference (normally not needed). A name that is a
        decimal number, or "x"/"X" followed by a hexadecimal number, is
        written as a numeric reference ("&#...;"); anything else is
        written as a named entity reference ("&...;").
        """
        # Hexadecimal references may use either case of the "x" prefix.
        if name.startswith(("x", "X")):
            digits, base = name[1:], 16
        else:
            digits, base = name, 10
        try:
            int(digits, base)
        except ValueError:
            is_number = False
        else:
            is_number = True
        self.stream.write("&")
        if is_number:
            self.stream.write("#")
        self.stream.write(name)
        self.stream.write(";")

    def write_comment(self, data):
        """
        Write a comment. The comment text is written as-is.
        """
        self.stream.write("<!--")
        self.stream.write(data)
        self.stream.write("-->")

    def write_decl(self, decl):
        """
        Write a declaration. The declaration text is written as-is.
        """
        self.stream.write("<!")
        self.stream.write(decl)
        self.stream.write(">")

    def write_pi(self, data):
        """
        Write a processing instruction. The text is written as-is
        between "<?" and ">".
        """
        self.stream.write("<?")
        self.stream.write(data)
        self.stream.write(">")