changeset 22:a771878f6cf4

Remove deadwood, update runes.py.
author David Barts <n5jrn@me.com>
date Mon, 30 Dec 2019 08:16:24 -0800
parents 35f29952b51e
children dc30266d4d5b
files curlyq runes.py writer.py
diffstat 3 files changed, 295 insertions(+), 243 deletions(-) [+]
line wrap: on
line diff
--- a/curlyq	Sat Dec 28 06:32:53 2019 -0800
+++ b/curlyq	Mon Dec 30 08:16:24 2019 -0800
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+# I m p o r t s
+
 import os, sys
 import argparse
 import codecs
 
 from curlers import TextCurler, HtmlCurler, uncurl
 from runes import Workspace
-from writer import CODECS_TO_NAME
 
 # V a r i a b l e s
 
@@ -18,11 +19,26 @@
 input_fp = None
 output_fp = None
 
+# Codecs we support
+CODECS_TO_NAME = {}
+for i in [ "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]:
+    CODECS_TO_NAME[codecs.lookup(i)] = i
+del i
+
+# C l a s s e s
+
+class SafeWorkspace(Workspace):
+    def __getitem__(self, key):
+        try:
+            return super().__getitem__(key)
+        except IndexError:
+            return ""
+
 # F u n c t i o n s
 
 def normal():
     global input_fp, output_fp, args
-    ws = Workspace()
+    ws = SafeWorkspace()
     curler = TextCurler(ws)
     while True:
         line = input_fp.readline()
@@ -37,7 +53,7 @@
 
 def flowed():
     global input_fp, output_fp, args
-    ws = Workspace()
+    ws = SafeWorkspace()
     curler = TextCurler(ws)
     while True:
         line = input_fp.readline()
@@ -51,7 +67,7 @@
 
 def html():
     global input_fp, output_fp
-    ws = Workspace(input_fp.read())
+    ws = SafeWorkspace(input_fp.read())
     curler = HtmlCurler(ws)
     if args.force: uncurl(ws)
     curler.feed()
@@ -59,7 +75,7 @@
 
 def do_uncurl():
     global input_fp, output_fp
-    ws = Workspace(input_fp.read())
+    ws = SafeWorkspace(input_fp.read())
     uncurl(ws)
     output_fp.write(str(ws))
 
@@ -90,7 +106,7 @@
 except LookupError as e:
     sys.stderr.write("{0}: {1!s}\n".format(MYNAME, e))
     sys.exit(2)
-if not CODECS_TO_NAME.get(codec, "").startswith("UTF-"):
+if codec not in CODECS_TO_NAME:
     sys.stderr.write("{0}: {1!r} output coding does not support Unicode\n".format(MYNAME, args.ocoding))
     sys.exit(1)
 del codec
--- a/runes.py	Sat Dec 28 06:32:53 2019 -0800
+++ b/runes.py	Mon Dec 30 08:16:24 2019 -0800
@@ -1,28 +1,43 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+# Something like Java's StringBuilder, but for Python. It needs more
+# thorough testing. This runs about 3x slower than io.StringIO, but
+# unlike that class allows for easy and meaningful random access via
+# subscripts.
+#
+# This implements two objects: Runes and Workspace. The former is a
+# relatively low-level object that deals mostly in UTF-16 rune values;
+# however, you can create a Runes object from a Python string, and you
+# can create a Python string from a Runes object. Aside from that, Runes
+# deals in numeric values, not Python strings. Workspace methods tend
+# to accept and return Python strings, which makes them more programmer-
+# friendly, at the cost of often having to fire up a codec to convert
+# things back and forth between Python strings and UTF-16 runes.
+
 # I m p o r t s
 
 import array
 import codecs
-import collections
-import struct
 import sys
 
 # C l a s s e s
 
 class Runes(object):
     """
-    A mutable, fixed-length sequence of UTF-16 runes. The attributes
-    encoding and codec contain the name of the encoding and the codec
-    used to generate the UTF-16. The attribute buffer contains the
-    buffer (an array of 16-bit unsigned integers) used to back this
-    object; modifications to that array will be reflected in this
-    object.
+    A mutable sequence of UTF-16 runes. The attributes encoding and
+    codec contain the name of the encoding and the codec used to
+    generate the UTF-16. The attribute buffer contains the buffer (an
+    array of 16-bit unsigned integers) used to back this object;
+    modifications to that array will be reflected in this object.
     """
     # The most efficient 16-bit one on this platform
     encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
     codec = codecs.lookup(encoding)
+    _ERRORS = 'surrogatepass'
+    _MIN_S = 0xd800 # lowest possible surrogate
+    _MID_S = 0xdc00 # high surrogate if <, low if >=
+    _MAX_S = 0xdfff # highest possible surrogate
 
     def __init__(self, based_on=None):
         if isinstance(based_on, array.array):
@@ -31,10 +46,9 @@
             else:
                 self.buffer = array.array('H', based_on)
         elif isinstance(based_on, str):
-            # A string should always be able to encode to runes.
-            self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0])
+            self.buffer = array.array('H', self.codec.encode(based_on, self._ERRORS)[0])
         elif based_on is None:
-            self.buffer = array.array('H', bytes())
+            self.buffer = array.array('H')
         elif isinstance(based_on, Runes):
             self.buffer = array.array('H', based_on.buffer)
         else:
@@ -45,8 +59,7 @@
         Convert this object to a string. We deliberately do not have a
         __repr__ method, to underscore that runes are not strings.
         """
-        # Runes might not always be able to decode to a string.
-        return self.codec.decode(self.buffer, 'replace')[0]
+        return self.codec.decode(self.buffer, self._ERRORS)[0]
 
     def __bytes__(self):
         return bytes(self.buffer)
@@ -54,22 +67,37 @@
     def __len__(self):
         return len(self.buffer)
 
+    def _checkindex(self, index, allow_equal=False):
+        ok = 0 <= index <= len(self) if allow_equal else 0 <= index < len(self)
+        if not ok:
+            raise IndexError("index {0} out of range".format(index))
+
+    def _checktype(self, other):
+        if not isinstance(other, Runes):
+            raise TypeError("Runes required")
+
     def __lt__(self, other):
+        self._checktype(other)
         return self.buffer < other.buffer
 
     def __le__(self, other):
+        self._checktype(other)
         return self.buffer <= other.buffer
 
     def __gt__(self, other):
+        self._checktype(other)
         return self.buffer > other.buffer
 
     def __ge__(self, other):
+        self._checktype(other)
         return self.buffer >= other.buffer
 
     def __eq__(self, other):
+        self._checktype(other)
         return self.buffer == other.buffer
 
     def __ne__(self, other):
+        self._checktype(other)
         return self.buffer != other.buffer
 
     def __hash__(self):
@@ -79,11 +107,12 @@
         return bool(self.buffer)
 
     def __getitem__(self, key):
-        ret = self.buffer[key]
-        if isinstance(ret, array.array):
-            return Runes(ret)
+        if isinstance(key, int):
+            return self.buffer[key]
+        elif isinstance(key, slice):
+            return Runes(self.buffer[key])
         else:
-            return ret
+            raise AssertionError("this shouldn't happen")
 
     def __setitem__(self, key, value):
         if isinstance(key, int):
@@ -94,14 +123,25 @@
         elif isinstance(value, Runes):
             self.buffer[key] = value.buffer
         else:
-            raise TypeError("runes required")
+            raise TypeError("Runes required")
 
     def __delitem__(self, key):
         del self.buffer[key]
 
     def clear(self):
+        """
+        Remove all data from our buffer. This merely marks the buffer as
+        empty; it does nothing to destroy its contents by overwriting.
+        """
         del self[:]
 
+    def zero(self):
+        """
+        Overwrite our buffer with zeroes.
+        """
+        for i in range(len(self.buffer)):
+            self.buffer[i] = 0
+
     def __iter__(self):
         return iter(self.buffer)
 
@@ -109,82 +149,255 @@
         return reversed(self.buffer)
 
     def append(self, value):
+        """
+        Append data to our buffer.
+        """
         if isinstance(value, int):
             self.buffer.append(value)
         elif isinstance(value, Runes):
             self.buffer.extend(value.buffer)
         else:
-            raise TypeError("integer or runes required")
+            raise TypeError("integer or Runes required")
 
     def __contains__(self, value):
         return value in self.buffer
 
-    def index(self, value):
-        return self.buffer.index(value)
+    @classmethod
+    def is_high_surrogate(cls, value):
+        """
+        Is value in the UTF-16 high surrogate range?
+        """
+        return cls._MIN_S <= value < cls._MID_S
+
+    @classmethod
+    def is_low_surrogate(cls, value):
+        """
+        Is value in the UTF-16 low surrogate range (_MID_S through _MAX_S)?
+        """
+        return cls._MID_S <= value <= cls._MAX_S
+
+    @classmethod
+    def is_surrogate(cls, value):
+        """
+        Is value in the UTF-16 surrogate range?
+        """
+        return cls._MIN_S <= value <= cls._MAX_S
 
-    def find(self, value):
+    def index(self, value, from_index=0):
+        """
+        Substring index at or after from_index (negative values are treated as 0), throws exception if not found.
+        """
+        self._checktype(value)
+        slimit = len(self)
+        rlimit = len(value)
+        for i in range(max(from_index, 0), slimit):  # a negative start would wrap around via negative array indexing
+            match = True
+            for j in range(rlimit):
+                k = i + j
+                if k >= slimit or value.buffer[j] != self.buffer[k]:
+                    match = False
+                    break
+            if match:
+                return i
+        raise ValueError("substring not found")
+
+    def find(self, value, from_index=0):
+        """
+        Substring index, returns -1 if not found.
+        """
         try:
-            return self.index(value)
+            return self.index(value, from_index)
+        except ValueError:
+            return -1
+
+    def rindex(self, value, from_index=None):
+        """
+        Reverse substring index, throws exception if not found.
+        """
+        self._checktype(value)
+        if from_index is None:
+            from_index = len(self) - 1
+        rfrom = len(value) - 1
+        for i in range(from_index, -1, -1):
+            match = True
+            for j in range(rfrom, -1, -1):
+                k = i - (rfrom - j)
+                if k < 0 or value.buffer[j] != self.buffer[k]:
+                    match = False
+                    break
+            if match:
+                return i - rfrom
+        raise ValueError("substring not found")
+
+    def rfind(self, value, from_index=None):
+        """
+        Reverse substring index, returns -1 if not found.
+        """
+        try:
+            return self.rindex(value, from_index)
         except ValueError:
             return -1
 
 class Workspace(Runes):
     """
-    A Runes object that acts a bit more string-like, in that __setitem__
-    also accepts a string as an argument and __getitem__ always returns
-    a string. We also return empty strings instead of throwing IndexError
-    when attempting to read out-of-range values, because that makes life
-    easier for us when curling quotes.
+    A Runes object (q.v.) that acts a bit more string-like.
     """
+
     def __setitem__(self, key, value):
         if isinstance(value, str):
             if isinstance(key, int):
-                Runes.__setitem__(self, key, self._ord(value))
+                value = ord(value)
+                if value > 0xffff:
+                    raise ValueError("character not in BMP")
+                super().__setitem__(key, value)
             else:
-                Runes.__setitem__(self, key, Runes(value))
+                super().__setitem__(key, Runes(value))
         else:
-            Runes.__setitem__(self, key, value)
+            super().__setitem__(key, value)
 
     def __getitem__(self, key):
-        view = memoryview(self.buffer)
-        try:
-            result = view[key]
-            if isinstance(result, int):
-                return chr(result)
-            if isinstance(result, memoryview):
-                ret = self.codec.decode(result, 'replace')[0]
+        if isinstance(key, int):
+            return chr(self.buffer[key])
+        elif isinstance(key, slice):
+            view = memoryview(self.buffer)
+            try:
+                result = view[key]
+                if not isinstance(result, memoryview):
+                    assert isinstance(result, int)
+                    return chr(result)
+                ret = self.codec.decode(result, self._ERRORS)[0]
                 result.release()
                 return ret
-            else:
-                raise AssertionError("this shouldn't happen")
-        except IndexError:
-            return ""
-        finally:
-            view.release()
+            finally:
+                view.release()
+        else:
+            raise AssertionError("this shouldn't happen")
+
+    def __contains__(self, value):
+        if isinstance(value, int):
+            return value in self.buffer
+        return self.find(value) != -1
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]
+
+    def __reversed__(self):
+        for i in range(len(self)-1, -1, -1):
+            yield self[i]
 
     def append(self, value):
+        """
+        Append string or runes to this item.
+        """
         if isinstance(value, str):
-            Runes.append(self, Runes(value))
-        else:
-            Runes.append(self, value)
+            value = Runes(value)
+        elif not isinstance(value, (int, Runes)):
+            raise TypeError("integer, string, or Runes required")
+        super().append(value)
 
-    def index(self, value):
+    def _runify(self, value):
         if isinstance(value, str):
-            return Runes.index(self, self._ord(value))
+            return Runes(value)
+        elif isinstance(value, Runes):
+            return value
         else:
-            return Runes.index(self, value)
+            raise TypeError("Runes or string required")
 
-    def find(self, value):
+    def index(self, value, from_index=0):
+        """
+        Substring index, throws exception if not found.
+        """
+        return super().index(self._runify(value), from_index)
+
+    def find(self, value, from_index=0):
+        """
+        Substring index, returns -1 if not found.
+        """
         try:
-            return self.index(value)
+            return self.index(value, from_index)
         except ValueError:
             return -1
 
-    def _ord(self, string):
-        length = len(string)
-        if length != 1:
-            raise ValueError("expected a character, but string of length {0} found".format(length))
-        raw = Runes(string)
-        if len(raw) != 1:
-            raise ValueError("character not in BMP")
-        return raw[0]
+    def rindex(self, value, from_index=None):
+        """
+        Reverse substring index, throws exception if not found.
+        """
+        return super().rindex(self._runify(value), from_index)
+
+    def rfind(self, value, from_index=None):
+        """
+        Reverse substring index, returns -1 if not found.
+        """
+        try:
+            return self.rindex(value, from_index)
+        except ValueError:
+            return -1
+
+    def _code_point_at(self, index):
+        self._checkindex(index, allow_equal=False)
+        v0 = self.buffer[index]
+        i1 = index + 1
+        v1 = None if i1 >= len(self.buffer) else self.buffer[i1]
+        if Runes.is_high_surrogate(v0) and v1 is not None and Runes.is_low_surrogate(v1):
+            return slice(index, index+2)
+        else:
+            return slice(index, i1)
+
+    def code_point_at(self, index):
+        """
+        Similar to java.lang.String.codePointAt, but returns a 1-character
+        string, not an int.
+        """
+        return self[self._code_point_at(index)]
+
+    def _code_point_before(self, index):
+        self._checkindex(index - 1, allow_equal=False)  # valid index is 1..len; index - 1 must name a real rune
+        i1 = index - 1
+        v1 = self.buffer[i1]
+        i2 = index - 2
+        v2 = None if i2 < 0 else self.buffer[i2]
+        if Runes.is_low_surrogate(v1) and v2 is not None and Runes.is_high_surrogate(v2):
+            return slice(i2, index)
+        else:
+            return slice(i1, index)
+
+    def code_point_before(self, index):
+        """
+        Similar to java.lang.String.codePointBefore but returns a 1-character
+        string, not an int.
+        """
+        return self[self._code_point_before(index)]
+
+    def code_point_count(self, begin=None, end=None):
+        """
+        Same behavior as java.lang.String.codePointCount (q.v.); raises IndexError on a bad range.
+        """
+        if begin is None: begin = 0
+        if end is None: end = len(self.buffer)
+        self._checkindex(begin, allow_equal=True)  # begin == len is a valid (empty) range, as in Java
+        self._checkindex(end, allow_equal=True)
+        if begin > end:
+            raise IndexError("invalid range (begin > end)")
+        i = begin
+        ret = 0
+        while i < end:
+            i = self._code_point_at(i).stop
+            ret += 1
+        return ret
+
+    def offset_by_code_points(self, index, offset):
+        """
+        Same behavior as java.lang.String.offsetByCodePoints (q.v.).
+        """
+        self._checkindex(index, allow_equal=True)
+        if offset < 0:
+            next = lambda i: self._code_point_before(i).start
+        else:
+            next = lambda i: self._code_point_at(i).stop
+        seen = 0
+        limit = abs(offset)
+        while seen < limit:
+            index = next(index)
+            seen += 1
+        return index
--- a/writer.py	Sat Dec 28 06:32:53 2019 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,177 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# A simple HTML writer, so we can process HTML in a streamwise fashion
-# via callbacks, which compared to a document tree tends to be all of:
-# easier to program, uses less memory, and uses less processor time.
-
-# I m p o r t s
-
-import os, sys
-import codecs
-import io
-import html
-
-# V a r i a b l e s
-
-# We only support ASCII, ISO-8859-1, and anything capable of encoding
-# the full Unicode set. Anything else is too sticky a wicket to want
-# to mess with.
-CODECS_TO_NAME = {}
-for i in [ "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]:
-    CODECS_TO_NAME[codecs.lookup(i)] = i
-del i
-_MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 }
-
-# There are WAY more HTML entities than this, but we're pessimistic about
-# what browsers "in the wild" support, so we stick to what XML supports.
-_QUOTE_ENTITIES = {
-    "\"": "quot",
-    "'": "apos"
-}
-_OTHER_ENTITIES = {
-    "&": "amp",
-    "<": "lt",
-    ">": "gt"
-}
-
-# C l a s s e s
-
-class HtmlStreamWriter(object):
-    """
-    A simple HTML writer, intended to be used in a streamwise fashion.
-    This class takes REASONABLE precautions against writing garbage, but
-    does not check every last thing. It will happily write tags like
-    "<garb<<age>>>" etc. if you feed it the right garbage in.
-    """
-    def __init__(self, stream, encoding):
-        """
-        Initialize this writer. An encoding is mandatory, even though we
-        produce character output, because the encoding governs which
-        characters we can send on for I/O without entity-escaping them.
-        The supplied stream should be buffered or performance will suffer.
-        """
-        # Stream we're using is available to the caller as .stream
-        self.stream = stream
-        try:
-            # A codec to use is available to the caller as .codec
-            self.codec = codecs.lookup(encoding)
-            # Normalized encoding name is available to the caller as .encoding
-            self.encoding = CODECS_TO_NAME[self.codec]
-        except (KeyError, LookupError) as e:
-            raise ValueError("invalid encoding {0!r}".format(encoding))
-        self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)
-
-    # html.escape drops the ball, badly. It is too optimistic about what
-    # entity names are likely to be understood, and is too stupid to know
-    # that ASCII streams need lots of things escaped.
-    def _escape(self, string, quote=False):
-        for ch in string:
-            entity = None
-            if quote and ch in _QUOTE_ENTITIES:
-                entity = _QUOTE_ENTITIES[ch]
-            elif ch in _OTHER_ENTITIES:
-                entity = _OTHER_ENTITIES[ch]
-            if entity:
-                self.stream.write("&")
-                self.stream.write(entity)
-                self.stream.write(";")
-                continue
-            ordch = ord(ch)
-            if ordch > self._maxchar:
-                self.stream.write("&#")
-                self.stream.write(str(ordch))
-                self.stream.write(";")
-                continue
-            self.stream.write(ch)
-
-    def write_starttag(self, tag, attrs):
-        """
-        Write a start tag.
-        """
-        self.stream.write("<")
-        self.stream.write(tag)
-        self._writeattrs(attrs)
-        self.stream.write(">")
-
-    def _writeattrs(self, attrs):
-        for k, v in attrs:
-            self.stream.write(" ")
-            self.stream.write(k)
-            self.stream.write("=\"")
-            self._escape(v, quote=True)
-            self.stream.write("\"")
-
-    def write_endtag(self, tag):
-        """
-        Write an end tag.
-        """
-        self.stream.write("</")
-        self.stream.write(tag)
-        self.stream.write(">")
-
-    def write_startendtag(self, tag, attrs):
-        """
-        Write a "start-end" (i.e. empty) tag.
-        """
-        self.stream.write("<")
-        self.stream.write(tag)
-        self._writeattrs(attrs)
-        self.stream.write("/>")
-
-    def write_data(self, data):
-        """
-        Write text data.
-        """
-        self._escape(data)
-
-    def write_raw_data(self, data):
-        """
-        Write raw data (e.g. style sheets, scripts, etc.)
-        """
-        self.stream.write(data)
-
-    def write_charref(self, name):
-        """
-        Write character reference (normally not needed).
-        """
-        is_number = False
-        try:
-            junk = int(name)
-            is_number = True
-        except ValueError:
-            pass
-        if name.startswith("x"):
-            try:
-                junk = int(name[1:], 16)
-                is_number = True
-            except ValueError:
-                pass
-        self.stream.write("&")
-        if is_number:
-            self.stream.write("#")
-            self.stream.write(name)
-        else:
-            self.stream.write(name)
-        self.stream.write(";")
-
-    def write_comment(self, data):
-        """
-        Write a comment.
-        """
-        self.stream.write("<!--")
-        self.stream.write(data)
-        self.stream.write("-->")
-
-    def write_decl(self, decl):
-        """
-        Write a declarationm.
-        """
-        self.stream.write("<!")
-        self.stream.write(decl)
-        self.stream.write(">")
-
-    def write_pi(self, data):
-        self.stream.write("<?")
-        self.stream.write(decl)
-        self.stream.write(">")