# HG changeset patch
# User David Barts
# Date 1577376551 28800
# Node ID 984876b6a095ce3135c3d6495b513ba91cb47b4e
# Initial commit of first two classes.
#
# diff -r 000000000000 -r 984876b6a095 workspace.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/workspace.py Thu Dec 26 08:09:11 2019 -0800
# @@ -0,0 +1,167 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# A class that implements a workspace for curly-quoting a text. This is enough
# like a string that it can be accessed via subscripts and ranges, and enough
# like a TextIOBase object that it can be written to much like a stream.
# (However, a Workspace is neither a string nor a TextIOBase object.)
#
# The advantage of using UTF-16 (as we do here) is that all quotation marks
# of interest are represented in a single 16-bit value, so changing straight
# quotes to curly ones can be accomplished most easily.
#
# It was a deliberate design decision to return empty strings when reading
# out-of-range indices but to throw exceptions when attempting to write
# them, because both decisions made coding easier in other modules.

# I m p o r t s

import os, sys
import io
import codecs

# V a r i a b l e s

# C l a s s e s

class Workspace(object):
    """
    An in-memory text buffer stored as native-endian UTF-16.

    Subscripts, slices and len() count 16-bit code units; read() takes a
    count of code units as well. seek() and tell(), however, are passed
    straight through to the underlying byte buffer and therefore work in
    BYTE offsets (two per code unit).
    """
    # The most efficient 16-bit one on this platform
    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
    codec = codecs.lookup(encoding)
    # Errors should never happen; UTF-16 can represent all Unicode characters
    errors = 'strict'

    def __init__(self, initial_data=None):
        """
        Constructor. If initial_data (a string) is supplied, the workspace
        starts out holding it, with the stream position at the beginning.
        """
        self._length = 0
        if initial_data is not None:
            data = initial_data.encode(self.encoding, self.errors)
            self._fp = io.BytesIO(data)
            # Cached length is computed lazily on the first len() call.
            self._dirty = True
        else:
            self._fp = io.BytesIO()
            self._dirty = False

    def close(self):
        """
        Causes our buffer to be discarded and this workspace to become
        unusable.
        """
        self._fp.close()

    def flush(self):
        """
        Does nothing, but allowed.
        """
        pass

    def seek(self, offset, whence=io.SEEK_SET):
        """
        Seeks within the underlying byte buffer and returns the new
        position. Offsets are in bytes, i.e. twice the code-unit index.
        """
        return self._fp.seek(offset, whence)

    def tell(self):
        """
        Returns current position (a byte offset into the buffer).
        """
        return self._fp.tell()

    def read(self, nchars=None):
        """
        Read up to nchars 16-bit code units from the current position and
        return them as a string; None or a negative count reads to the end.
        XXX - might return replacement chars from surrogate fragments.
        """
        if nchars is not None and nchars >= 0:
            nchars *= 2
        return self._fp.read(nchars).decode(self.encoding, "replace")

    def write(self, string):
        """
        Write characters at the current position.
        """
        self._fp.write(string.encode(self.encoding, self.errors))
        # The buffer may have grown: invalidate the cached length so that
        # len() and the bounds checks in the subscript methods stay correct.
        self._dirty = True

    def __len__(self):
        """
        Length as a string (number of 16-bit code units).
        """
        if self._dirty:
            # Measure by seeking to the end, then restore the position.
            back = self.tell()
            self._length = self.seek(0, io.SEEK_END) // 2
            self.seek(back)
            self._dirty = False
        return self._length

    def __getitem__(self, key):
        """
        Direct access to a single character or range of characters. We do
        not support negative indices (a negative slice bound is clamped
        to 0). Out-of-range reads return "" rather than raising. Raises
        ValueError for a slice with a step and TypeError for any other
        kind of key.
        XXX - might return replacement chars from surrogate fragments.
        """
        if isinstance(key, int):
            if key < 0 or key >= len(self):
                return ""
            k2 = 2 * key
            span = slice(k2, k2 + 2)
        elif isinstance(key, slice):
            if key.step is not None:
                raise ValueError("__getitem__ does not support steps in slices")
            length = len(self)
            start = 0 if key.start is None else key.start
            stop = length if key.stop is None else key.stop
            # Clamp both bounds to [0, length]. Clamping start to length
            # (not length - 1) makes a slice that lies wholly past the end
            # come back empty instead of yielding the last character.
            start = max(0, min(length, start))
            stop = max(0, min(length, stop))
            if stop <= start:
                return ""
            span = slice(start * 2, stop * 2)
        else:
            raise TypeError("__getitem__ only supports integers and slices")
        return self.codec.decode(self._fp.getbuffer()[span], "replace")[0]

    def __setitem__(self, key, value):
        """
        Direct access to a single character. We do not support negative
        indices or replacing more than a single character at a time: only
        the first character of value is used, and an empty value is a
        no-op. Raises TypeError for a non-integer key, IndexError for an
        out-of-range one, and ValueError if the character does not fit in
        one 16-bit code unit.
        XXX - only works on characters in the BMP.
        """
        if not isinstance(key, int):
            raise TypeError("__setitem__ only supports integers")
        if key < 0 or key >= len(self):
            raise IndexError("index {0} out of range".format(key))
        if not value:
            return
        start = key * 2
        end = start + 2
        encoded = value[0].encode(self.encoding, self.errors)
        if len(encoded) != 2:
            raise ValueError("{0!r} not in BMP".format(value[0]))
        # Overwrite in place; the buffer size (and so the length) is unchanged.
        self._fp.getbuffer()[start:end] = encoded

    def __del__(self):
        """
        Equivalent to .close().
        """
        # __init__ may have raised before _fp was assigned (e.g. when
        # initial_data is not a string); there is nothing to close then.
        if getattr(self, "_fp", None) is not None:
            self.close()

    def getvalue(self):
        """
        Gets the string represented by this workspace.
        """
        return self.codec.decode(self._fp.getbuffer(), self.errors)[0]

    def __enter__(self):
        """
        Context manager.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context manager: close on exit. Exceptions are not suppressed.
        """
        self.close()
        return False

# diff -r 000000000000 -r 984876b6a095 writer.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/writer.py Thu Dec 26 08:09:11 2019 -0800
# @@ -0,0 +1,177 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# A simple HTML writer, so we can process HTML in a streamwise fashion
# via callbacks, which compared to a document tree tends to be all of:
# easier to program, uses less memory, and uses less processor time.

# I m p o r t s

import os, sys
import codecs
import io
import html

# V a r i a b l e s

# We only support ASCII, ISO-8859-1, and anything capable of encoding
# the full Unicode set. Anything else is too sticky a wicket to want
# to mess with.
_CODECS_TO_NAME = {}
for i in [ "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]:
    _CODECS_TO_NAME[codecs.lookup(i)] = i
del i
_MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 }

# There are WAY more HTML entities than this, but we're pessimistic about
# what browsers "in the wild" support, so we stick to what XML supports.
_QUOTE_ENTITIES = {
    "\"": "quot",
    "'": "apos"
}
_OTHER_ENTITIES = {
    "&": "amp",
    "<": "lt",
    ">": "gt"
}

# C l a s s e s

class HtmlStreamWriter(object):
    """
    A simple HTML writer, intended to be used in a streamwise fashion.
    This class takes REASONABLE precautions against writing garbage, but
    does not check every last thing. Tag and attribute names are written
    verbatim, so it will happily write malformed tags (e.g. a tag name
    containing angle brackets) if you feed it the right garbage in.
    """
    def __init__(self, stream, encoding):
        """
        Initialize this writer. An encoding is mandatory, even though we
        produce character output, because the encoding governs which
        characters we can send on for I/O without entity-escaping them.
        The supplied stream should be buffered or performance will suffer.
        Raises ValueError if the encoding is unknown or unsupported.
        """
        # Stream we're using is available to the caller as .stream
        self.stream = stream
        try:
            # A codec to use is available to the caller as .codec
            self.codec = codecs.lookup(encoding)
            # Normalized encoding name is available to the caller as .encoding
            self.encoding = _CODECS_TO_NAME[self.codec]
        except LookupError:
            # Covers both an unknown codec name (LookupError) and a known
            # but unsupported codec (KeyError, a LookupError subclass).
            raise ValueError("invalid encoding {0!r}".format(encoding))
        # Highest code point that may be written unescaped.
        self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)

    # html.escape drops the ball, badly. It is too optimistic about what
    # entity names are likely to be understood, and is too stupid to know
    # that ASCII streams need lots of things escaped.
    def _escape(self, string, quote=False):
        """
        Write string to the stream with &, < and > replaced by named
        entities, quotes too when quote is true, and any character above
        the encoding's limit replaced by a decimal character reference.
        """
        maxchar = self._maxchar
        pieces = []
        for ch in string:
            if quote and ch in _QUOTE_ENTITIES:
                pieces.append("&" + _QUOTE_ENTITIES[ch] + ";")
            elif ch in _OTHER_ENTITIES:
                pieces.append("&" + _OTHER_ENTITIES[ch] + ";")
            elif ord(ch) > maxchar:
                pieces.append("&#" + str(ord(ch)) + ";")
            else:
                pieces.append(ch)
        # One write per call instead of one per character.
        if pieces:
            self.stream.write("".join(pieces))

    def write_starttag(self, tag, attrs):
        """
        Write a start tag. attrs is an iterable of (name, value) pairs.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write(">")

    def _writeattrs(self, attrs):
        """
        Write each (name, value) pair as ' name="value"', escaping the
        value. A value of None (an attribute given without a value, as
        html.parser reports it) is written as the bare name.
        """
        for k, v in attrs:
            self.stream.write(" ")
            self.stream.write(k)
            if v is None:
                continue
            self.stream.write("=\"")
            self._escape(v, quote=True)
            self.stream.write("\"")

    def write_endtag(self, tag):
        """
        Write an end tag.
        """
        self.stream.write("</")
        self.stream.write(tag)
        self.stream.write(">")

    def write_startendtag(self, tag, attrs):
        """
        Write a "start-end" (i.e. empty) tag.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write("/>")

    def write_data(self, data):
        """
        Write text data, entity-escaping it as needed.
        """
        self._escape(data)

    def write_raw_data(self, data):
        """
        Write raw data (e.g. style sheets, scripts, etc.) with no escaping.
        """
        self.stream.write(data)

    def write_charref(self, name):
        """
        Write character reference (normally not needed). name is either
        an entity name ("amp"), a decimal number ("65") or a hexadecimal
        number prefixed with x or X ("x3e"); numeric forms get a "#".
        """
        is_number = bool(name) and all(c in "0123456789" for c in name)
        if not is_number and name[:1] in ("x", "X"):
            hexdigits = name[1:]
            is_number = bool(hexdigits) and all(
                c in "0123456789abcdefABCDEF" for c in hexdigits)
        self.stream.write("&")
        if is_number:
            self.stream.write("#")
        self.stream.write(name)
        self.stream.write(";")

    def write_comment(self, data):
        """
        Write a comment. The data is written verbatim.
        """
        self.stream.write("<!--")
        self.stream.write(data)
        self.stream.write("-->")

    def write_decl(self, decl):
        """
        Write a declaration (e.g. a DOCTYPE). The text is written verbatim.
        """
        self.stream.write("<!")
        self.stream.write(decl)
        self.stream.write(">")

    def write_pi(self, data):
        """
        Write a processing instruction. The data is written verbatim.
        """
        self.stream.write("<?")
        self.stream.write(data)
        self.stream.write(">")