changeset 0:984876b6a095

Initial commit of first two classes.
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 08:09:11 -0800
parents
children 173e86601dbc
files workspace.py writer.py
diffstat 2 files changed, 344 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workspace.py	Thu Dec 26 08:09:11 2019 -0800
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# A class that implements a workspace for curly-quoting a text. This is enough
+# like a string that it can be accessed via subscripts and ranges, and enough
+# like a TextIOBase object that it can be written to much like a stream.
+# (However, a Workspace is neither a string nor a TextIOBase object.)
+#
+# The advantage of using UTF-16 (as we do here) is that all quotation marks
+# of interest are represented in a single 16-bit value, so changing straight
+# quotes to curly ones can be accomplished most easily.
+#
+# It was a deliberate design decision to return empty strings when reading
+# out-of-range indices but to throw exceptions when attempting to write
+# them, because both decisions made coding easier in other modules.
+
+# I m p o r t s
+
+import os, sys
+import io
+import codecs
+
+# V a r i a b l e s
+
+# C l a s s e s
+
+class Workspace(object):
+    # Native-endian UTF-16 (LE or BE per sys.byteorder): the cheapest 16-bit form
+    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
+    codec = codecs.lookup(encoding)
+    # Errors should never happen; UTF-16 can represent all Unicode characters
+    errors = 'strict'
+
+    def __init__(self, initial_data=None):
+        """
+        Create a workspace, optionally preloaded with the text initial_data.
+        The stream position starts at the beginning either way.
+        """
+        has_text = initial_data is not None
+        raw = initial_data.encode(self.encoding, self.errors) if has_text else b""
+        self._fp = io.BytesIO(raw)
+        # Cached length; stays 0 until __len__ recomputes it from the buffer.
+        self._length = 0
+        # Only a preloaded buffer can disagree with that cached 0.
+        self._dirty = has_text
+
+    def close(self):
+        """
+        Discard the buffer; the workspace is unusable afterwards.
+        """
+        backing = self._fp
+        backing.close()
+
+    def flush(self):
+        """
+        Accepted for stream-like use; there is nothing to flush.
+        """
+        return None
+
+    def seek(self, offset, whence=io.SEEK_SET):
+        """Reposition the byte buffer; offsets count bytes, not characters."""
+        position = self._fp.seek(offset, whence)
+        # BytesIO.seek hands back the new absolute byte offset.
+        return position
+
+    def tell(self):
+        """Report the current offset into the byte buffer, in bytes."""
+        position = self._fp.tell()
+        # A byte offset: two bytes per UTF-16 code unit already passed.
+        return position
+
+    def read(self, nchars=None):
+        """
+        Read up to nchars UTF-16 code units (everything left if None or
+        negative). XXX - surrogate fragments decode to replacement chars.
+        """
+        wanted = nchars if nchars is None or nchars < 0 else nchars * 2
+        raw = self._fp.read(wanted)
+        return raw.decode(self.encoding, "replace")
+
+    def write(self, string):
+        """Write string at the current position, encoded as UTF-16."""
+        self._fp.write(string.encode(self.encoding, self.errors))
+        # The buffer may have grown, so the cached length must be recomputed.
+        self._dirty = True
+
+    def __len__(self):
+        """Length in UTF-16 code units; recomputed only when marked dirty."""
+        if not self._dirty:
+            return self._length
+        saved = self.tell()
+        # Seeking to the end yields the size in bytes, two per code unit.
+        self._length = self.seek(0, io.SEEK_END) // 2
+        self.seek(saved)
+        self._dirty = False
+        return self._length
+
+    def __getitem__(self, key):
+        """
+        Direct access to a single character or a range (no negative indices,
+        no steps). Out-of-range reads return "" rather than raising, which
+        is what's most useful for curling quotes.
+        XXX - might return replacement chars from surrogate fragments.
+        """
+        if isinstance(key, int):
+            if key < 0 or key >= len(self):
+                return ""
+            k2 = 2 * key  # byte offset: two bytes per UTF-16 code unit
+            key = slice(k2, k2 + 2)
+        elif isinstance(key, slice):
+            if key.step is not None:
+                raise ValueError("__getitem__ does not support steps in slices")
+            length = len(self)
+            start = 0 if key.start is None else key.start
+            stop = length if key.stop is None else key.stop
+            start = max(0, min(length - 1, start))  # clamp into the buffer
+            stop = max(0, min(length, stop))
+            if stop <= start:
+                return ""
+            key = slice(start * 2, stop * 2)  # character range -> byte range
+        else:
+            raise TypeError("__getitem__ only supports integers and slices")
+        return self.codec.decode(self._fp.getbuffer()[key], "replace")[0]
+
+    def __setitem__(self, key, value):
+        """
+        Overwrite the single character at index key with value[0]; an
+        empty value is ignored. No negative indices, no multi-character
+        replacement. XXX - only works on characters in the BMP.
+        """
+        if not isinstance(key, int):
+            raise TypeError("__setitem__ only supports integers")
+        if not 0 <= key < len(self):
+            raise IndexError("index {0} out of range".format(key))
+        if not value:
+            return
+        replacement = value[0]
+        unit = replacement.encode(self.encoding, self.errors)
+        if len(unit) != 2:
+            raise ValueError("{0!r} not in BMP".format(replacement))
+        offset = key * 2
+        self._fp.getbuffer()[offset:offset + 2] = unit
+
+    def __del__(self):
+        """Equivalent to .close(); tolerates a half-constructed instance."""
+        # __init__ may have raised before _fp was assigned (bad initial_data).
+        if getattr(self, "_fp", None) is not None:
+            self.close()
+
+    def getvalue(self):
+        """Return the whole buffer decoded to a string."""
+        decoded = self.codec.decode(self._fp.getbuffer(), self.errors)
+        # decode() returns (text, bytes consumed); only the text is wanted.
+        return decoded[0]
+
+    def __enter__(self):
+        """Context manager entry: the workspace itself is the managed value."""
+        workspace = self
+        # Nothing to acquire here; __init__ already created the buffer.
+        return workspace
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit: close the workspace."""
+        self.close()
+        # A false result tells the interpreter not to suppress any exception.
+        swallow_exception = False
+        return swallow_exception
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writer.py	Thu Dec 26 08:09:11 2019 -0800
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# A simple HTML writer, so we can process HTML in a streamwise fashion
+# via callbacks, which compared to a document tree tends to be all of:
+# easier to program, uses less memory, and uses less processor time.
+
+# I m p o r t s
+
+import os, sys
+import codecs
+import io
+import html
+
+# V a r i a b l e s
+
+# We only support ASCII, ISO-8859-1, and encodings able to represent all
+# of Unicode; anything else is too sticky a wicket to want to mess with.
+# Keys are codec objects, so any alias resolves to one canonical name.
+_CODECS_TO_NAME = { codecs.lookup(name): name for name in (
+    "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE",
+    "UTF-32", "UTF-32LE", "UTF-32BE" ) }
+# Highest code point each limited encoding can carry without escaping.
+_MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 }
+
+# HTML has WAY more entities than this, but we're pessimistic about what
+# browsers support, so we use XML's five. Quotes are only escaped in attributes.
+_QUOTE_ENTITIES = {
+    "\"": "quot",
+    "'": "apos"
+}
+_OTHER_ENTITIES = {
+    "&": "amp",
+    "<": "lt",
+    ">": "gt"
+}
+
+# C l a s s e s
+
+class HtmlStreamWriter(object):
+    """
+    A simple HTML writer, intended to be used in a streamwise fashion.
+    This class takes REASONABLE precautions against writing garbage, but
+    does not check every last thing. It will happily write tags like
+    "<garb<<age>>>" etc. if you feed it the right garbage in.
+    """
+    def __init__(self, stream, encoding):
+        """
+        Set up a writer that sends character output to stream. The encoding
+        is required even so, since it decides which characters may be
+        written as-is and which must be entity-escaped. Raises ValueError
+        for an unknown or unsupported encoding. Use a buffered stream.
+        """
+        # Public attributes: .stream, .codec, and .encoding (normalized name)
+        self.stream = stream
+        try:
+            self.codec = codecs.lookup(encoding)
+            self.encoding = canonical = _CODECS_TO_NAME[self.codec]
+        except LookupError:
+            # Covers an unknown codec and (via KeyError) an unsupported one.
+            raise ValueError("invalid encoding {0!r}".format(encoding))
+        # Encodings without a table entry can carry every character as-is.
+        self._maxchar = _MAXCHAR.get(canonical, 0x7fffffff)
+
+    # html.escape drops the ball, badly. It is too optimistic about what
+    # entity names are likely to be understood, and is too stupid to know
+    # that ASCII streams need lots of things escaped.
+    def _escape(self, string, quote=False):
+        """
+        Write string, turning markup characters into named entities (quotes
+        only if quote is true) and unencodable ones into numeric references.
+        """
+        for ch in string:
+            if quote and ch in _QUOTE_ENTITIES:
+                name = _QUOTE_ENTITIES[ch]
+            else:
+                name = _OTHER_ENTITIES.get(ch)
+            if name:
+                pieces = ("&", name, ";")
+            elif ord(ch) > self._maxchar:
+                # Beyond what the output encoding can carry as-is.
+                pieces = ("&#", str(ord(ch)), ";")
+            else:
+                pieces = (ch,)
+            for piece in pieces:
+                self.stream.write(piece)
+
+    def write_starttag(self, tag, attrs):
+        """Emit an opening tag: "<", the tag name, each attribute, ">".
+        attrs is an iterable of (name, value) pairs; tag is not validated."""
+        out = self.stream
+        out.write("<")
+        out.write(tag)
+        self._writeattrs(attrs)
+        out.write(">")
+
+    def _writeattrs(self, attrs):
+        for k, v in attrs:  # attrs: iterable of (name, value) pairs
+            self.stream.write(" " + k)
+            if v is not None:  # None = valueless attribute, e.g. <input disabled>
+                self.stream.write("=\"")
+                self._escape(v, quote=True)
+                self.stream.write("\"")
+
+    def write_endtag(self, tag):
+        """
+        Emit a closing tag: "</", the tag name, then ">". The name is
+        written as given, without validation.
+        """
+        for piece in ("</", tag, ">"):
+            self.stream.write(piece)
+
+    def write_startendtag(self, tag, attrs):
+        """Emit a "start-end" (i.e. empty) tag, closed with "/>".
+        attrs is an iterable of (name, value) pairs."""
+        out = self.stream
+        out.write("<")
+        out.write(tag)
+        self._writeattrs(attrs)
+        out.write("/>")
+
+    def write_data(self, data):
+        """
+        Emit text content, entity-escaping it as needed (quotes untouched).
+        """
+        self._escape(data, quote=False)
+
+    def write_raw_data(self, data):
+        """Emit data verbatim, unescaped (e.g. style sheets, scripts, etc.)."""
+        out = self.stream
+        # Deliberately bypasses _escape.
+        out.write(data)
+
+    def write_charref(self, name):
+        """
+        Write character reference (normally not needed). A decimal name
+        ("65") or a hexadecimal one with an "x" or "X" prefix ("x41",
+        "X41") is written as a numeric reference ("&#65;", "&#x41;");
+        any other name is written as a named entity ("&amp;").
+        """
+        # Split off the optional hex marker so both cases share one parse.
+        if name[:1] in ("x", "X"):
+            digits, base = name[1:], 16
+        else:
+            digits, base = name, 10
+        try:
+            int(digits, base)
+            is_number = True
+        except ValueError:
+            # Not a number in that base: treat it as an entity name.
+            is_number = False
+        self.stream.write("&")
+        if is_number:
+            self.stream.write("#")
+        self.stream.write(name)
+        self.stream.write(";")
+
+    def write_comment(self, data):
+        """
+        Emit a comment: "<!--", then data exactly as given, then "-->".
+        The data is not escaped or checked.
+        """
+        for piece in ("<!--", data, "-->"):
+            self.stream.write(piece)
+
+    def write_decl(self, decl):
+        """
+        Emit a declaration: "<!", then decl exactly as given, then ">"
+        (for example a DOCTYPE).
+        """
+        for piece in ("<!", decl, ">"):
+            self.stream.write(piece)
+
+    def write_pi(self, data):
+        """Write a processing instruction: "<?", then data, then ">"."""
+        for piece in ("<?", data, ">"):
+            self.stream.write(piece)