Mercurial > cgi-bin > hgweb.cgi > curlyq
diff writer.py @ 0:984876b6a095
Initial commit of first two classes.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 08:09:11 -0800 |
parents | |
children | 091c03f1b2e8 |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# A simple HTML writer, so we can process HTML in a streamwise fashion
# via callbacks, which compared to a document tree tends to be all of:
# easier to program, uses less memory, and uses less processor time.

# I m p o r t s

import os
import sys
import codecs
import io
import html

# V a r i a b l e s

# We only support ASCII, ISO-8859-1, and anything capable of encoding
# the full Unicode set. Anything else is too sticky a wicket to want
# to mess with.
# Maps each supported codec (as returned by codecs.lookup) to the
# canonical name we report for it.
_CODECS_TO_NAME = {
    codecs.lookup(_name): _name
    for _name in (
        "US-ASCII", "ISO-8859-1",
        "UTF-8",
        "UTF-16", "UTF-16LE", "UTF-16BE",
        "UTF-32", "UTF-32LE", "UTF-32BE",
    )
}
# Highest code point that may be written unescaped, for the encodings
# that cannot represent all of Unicode.
_MAXCHAR = {"US-ASCII": 127, "ISO-8859-1": 255}

# There are WAY more HTML entities than this, but we're pessimistic about
# what browsers "in the wild" support, so we stick to what XML supports.
_QUOTE_ENTITIES = {
    "\"": "quot",
    "'": "apos",
}
_OTHER_ENTITIES = {
    "&": "amp",
    "<": "lt",
    ">": "gt",
}

# C l a s s e s

class HtmlStreamWriter(object):
    """
    A simple HTML writer, intended to be used in a streamwise fashion.
    This class takes REASONABLE precautions against writing garbage, but
    does not check every last thing. It will happily write tags like
    "<garb<<age>>>" etc. if you feed it the right garbage in.
    """
    def __init__(self, stream, encoding):
        """
        Initialize this writer. An encoding is mandatory, even though we
        produce character output, because the encoding governs which
        characters we can send on for I/O without entity-escaping them.
        The supplied stream should be buffered or performance will suffer.

        stream   -- object with a write(str) method; output goes here.
        encoding -- name (or alias) of one of the supported encodings.

        Raises ValueError if the encoding is unknown or unsupported.
        """
        # Stream we're using is available to the caller as .stream
        self.stream = stream
        try:
            # A codec to use is available to the caller as .codec
            self.codec = codecs.lookup(encoding)
            # Normalized encoding name is available to the caller as .encoding
            self.encoding = _CODECS_TO_NAME[self.codec]
        except LookupError as e:
            # LookupError covers both an unknown codec name (codecs.lookup)
            # and a known-but-unsupported codec (KeyError from the table).
            raise ValueError("invalid encoding {0!r}".format(encoding)) from e
        # Code points above this must be written as numeric references.
        self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)

    # html.escape drops the ball, badly. It is too optimistic about what
    # entity names are likely to be understood, and is too stupid to know
    # that ASCII streams need lots of things escaped.
    def _escape(self, string, quote=False):
        """
        Write string to the stream, replacing markup characters with
        entity references and any character the encoding cannot represent
        with a decimal character reference. Quote characters are escaped
        only if quote is true (i.e. inside attribute values).
        """
        pieces = []
        for ch in string:
            if quote and ch in _QUOTE_ENTITIES:
                pieces.append("&" + _QUOTE_ENTITIES[ch] + ";")
            elif ch in _OTHER_ENTITIES:
                pieces.append("&" + _OTHER_ENTITIES[ch] + ";")
            elif ord(ch) > self._maxchar:
                pieces.append("&#" + str(ord(ch)) + ";")
            else:
                pieces.append(ch)
        # One write per call instead of up to three per character.
        if pieces:
            self.stream.write("".join(pieces))

    def write_starttag(self, tag, attrs):
        """
        Write a start tag. attrs is an iterable of (name, value) pairs.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write(">")

    def _writeattrs(self, attrs):
        """
        Write each (name, value) pair as ' name="value"', escaping the
        value. A value of None (which html.parser reports for attributes
        given without a value, e.g. "disabled") writes the bare name only.
        """
        write = self.stream.write
        for name, value in attrs:
            write(" ")
            write(name)
            if value is None:
                continue
            write("=\"")
            self._escape(value, quote=True)
            write("\"")

    def write_endtag(self, tag):
        """
        Write an end tag.
        """
        self.stream.write("</")
        self.stream.write(tag)
        self.stream.write(">")

    def write_startendtag(self, tag, attrs):
        """
        Write a "start-end" (i.e. empty) tag. attrs is an iterable of
        (name, value) pairs.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write("/>")

    def write_data(self, data):
        """
        Write text data, escaping it as needed.
        """
        self._escape(data)

    def write_raw_data(self, data):
        """
        Write raw data (e.g. style sheets, scripts, etc.) with no escaping.
        """
        self.stream.write(data)

    def write_charref(self, name):
        """
        Write character reference (normally not needed). If name is a
        decimal number, or "x"/"X" followed by a hexadecimal number, it
        is written as a numeric reference ("&#...;"); anything else is
        written as a named reference ("&...;").
        """
        # Both "x" and "X" introduce a hexadecimal reference in HTML.
        if name[:1] in ("x", "X"):
            digits, base = name[1:], 16
        else:
            digits, base = name, 10
        try:
            int(digits, base)
            is_number = True
        except ValueError:
            is_number = False
        self.stream.write("&")
        if is_number:
            self.stream.write("#")
        self.stream.write(name)
        self.stream.write(";")

    def write_comment(self, data):
        """
        Write a comment. The data is written as-is, without escaping.
        """
        self.stream.write("<!--")
        self.stream.write(data)
        self.stream.write("-->")

    def write_decl(self, decl):
        """
        Write a declaration. The text is written as-is, without escaping.
        """
        self.stream.write("<!")
        self.stream.write(decl)
        self.stream.write(">")

    def write_pi(self, data):
        """
        Write a processing instruction. The data is written as-is between
        "<?" and ">"; for an XML-style instruction the caller supplies the
        trailing "?" as part of data.
        """
        self.stream.write("<?")
        self.stream.write(data)
        self.stream.write(">")