diff runes.py @ 22:a771878f6cf4

Remove deadwood, update runes.py.
author David Barts <n5jrn@me.com>
date Mon, 30 Dec 2019 08:16:24 -0800
parents 35f29952b51e
children
line wrap: on
line diff
--- a/runes.py	Sat Dec 28 06:32:53 2019 -0800
+++ b/runes.py	Mon Dec 30 08:16:24 2019 -0800
@@ -1,28 +1,43 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+# Something like Java's StringBuilder, but for Python. It needs more
+# thorough testing. This runs about 3x slower than io.StringIO, but
+# unlike that class allows for easy and meaningful random access via
+# subscripts.
+#
+# This module implements two classes: Runes and Workspace. The former is a
+# relatively low-level object that deals mostly in UTF-16 rune values;
+# however, you can create a Runes object from a Python string, and you
+# can create a Python string from a Runes object. Aside from that, Runes
+# deals in numeric values, not Python strings. Workspace methods tend
+# to accept and return Python strings, which makes them more programmer-
+# friendly, at the cost of often having to fire up a codec to convert
+# things back and forth between Python strings and UTF-16 runes.
+
 # I m p o r t s
 
 import array
 import codecs
-import collections
-import struct
 import sys
 
 # C l a s s e s
 
 class Runes(object):
     """
-    A mutable, fixed-length sequence of UTF-16 runes. The attributes
-    encoding and codec contain the name of the encoding and the codec
-    used to generate the UTF-16. The attribute buffer contains the
-    buffer (an array of 16-bit unsigned integers) used to back this
-    object; modifications to that array will be reflected in this
-    object.
+    A mutable sequence of UTF-16 runes. The attributes encoding and
+    codec contain the name of the encoding and the codec used to
+    generate the UTF-16. The attribute buffer contains the buffer (an
+    array of 16-bit unsigned integers) used to back this object;
+    modifications to that array will be reflected in this object.
     """
     # The most efficient 16-bit one on this platform
     encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
     codec = codecs.lookup(encoding)
+    _ERRORS = 'surrogatepass'
+    _MIN_S = 0xd800 # lowest possible surrogate
+    _MID_S = 0xdc00 # high surrogate if <, low if >=
+    _MAX_S = 0xdfff # highest possible surrogate
 
     def __init__(self, based_on=None):
         if isinstance(based_on, array.array):
@@ -31,10 +46,9 @@
             else:
                 self.buffer = array.array('H', based_on)
         elif isinstance(based_on, str):
-            # A string should always be able to encode to runes.
-            self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0])
+            self.buffer = array.array('H', self.codec.encode(based_on, self._ERRORS)[0])
         elif based_on is None:
-            self.buffer = array.array('H', bytes())
+            self.buffer = array.array('H')
         elif isinstance(based_on, Runes):
             self.buffer = array.array('H', based_on.buffer)
         else:
@@ -45,8 +59,7 @@
         Convert this object to a string. We deliberately do not have a
         __repr__ method, to underscore that runes are not strings.
         """
-        # Runes might not always be able to decode to a string.
-        return self.codec.decode(self.buffer, 'replace')[0]
+        return self.codec.decode(self.buffer, self._ERRORS)[0]
 
     def __bytes__(self):
         return bytes(self.buffer)
@@ -54,22 +67,37 @@
     def __len__(self):
         return len(self.buffer)
 
+    def _checkindex(self, index, allow_equal=False):
+        past_end = index > len(self) if allow_equal else index >= len(self)
+        if index < 0 or past_end:
+            raise IndexError("index {0} out of range".format(index))
+
+    def _checktype(self, other):  # guard for comparisons and searches
+        if not isinstance(other, Runes):  # subclasses of Runes pass too
+            raise TypeError("Runes required")
+
     def __lt__(self, other):
+        self._checktype(other)
         return self.buffer < other.buffer
 
     def __le__(self, other):
+        self._checktype(other)
         return self.buffer <= other.buffer
 
     def __gt__(self, other):
+        self._checktype(other)
         return self.buffer > other.buffer
 
     def __ge__(self, other):
+        self._checktype(other)
         return self.buffer >= other.buffer
 
     def __eq__(self, other):
+        if not isinstance(other, Runes): return NotImplemented  # not an error
         return self.buffer == other.buffer
 
     def __ne__(self, other):
+        if not isinstance(other, Runes): return NotImplemented  # not an error
         return self.buffer != other.buffer
 
     def __hash__(self):
@@ -79,11 +107,12 @@
         return bool(self.buffer)
 
     def __getitem__(self, key):
-        ret = self.buffer[key]
-        if isinstance(ret, array.array):
-            return Runes(ret)
+        if isinstance(key, int):
+            return self.buffer[key]
+        elif isinstance(key, slice):
+            return Runes(self.buffer[key])
         else:
-            return ret
+            raise AssertionError("this shouldn't happen")
 
     def __setitem__(self, key, value):
         if isinstance(key, int):
@@ -94,14 +123,25 @@
         elif isinstance(value, Runes):
             self.buffer[key] = value.buffer
         else:
-            raise TypeError("runes required")
+            raise TypeError("Runes required")
 
     def __delitem__(self, key):
         del self.buffer[key]
 
     def clear(self):
+        """
+        Remove all data from our buffer. This merely marks the buffer as
+        empty; it does nothing to destroy its contents by overwriting.
+        """
         del self[:]
 
+    def zero(self):
+        """
+        Set every rune in our buffer to 0 in place; the length is unchanged.
+        """
+        for position, _ in enumerate(self.buffer):
+            self.buffer[position] = 0
+
     def __iter__(self):
         return iter(self.buffer)
 
@@ -109,82 +149,255 @@
         return reversed(self.buffer)
 
     def append(self, value):
+        """
+        Append data to our buffer.
+        """
         if isinstance(value, int):
             self.buffer.append(value)
         elif isinstance(value, Runes):
             self.buffer.extend(value.buffer)
         else:
-            raise TypeError("integer or runes required")
+            raise TypeError("integer or Runes required")
 
     def __contains__(self, value):
         return value in self.buffer
 
-    def index(self, value):
-        return self.buffer.index(value)
+    @classmethod
+    def is_high_surrogate(cls, value):
+        """
+        Is value a high (leading) surrogate, i.e. in 0xd800..0xdbff?
+        """
+        return value >= cls._MIN_S and value < cls._MID_S
+
+    @classmethod
+    def is_low_surrogate(cls, value):
+        """
+        Is value a low (trailing) surrogate, i.e. in 0xdc00..0xdfff?
+        """
+        return cls._MID_S <= value <= cls._MAX_S
+
+    @classmethod
+    def is_surrogate(cls, value):
+        """
+        Is value any UTF-16 surrogate (high or low), i.e. in 0xd800..0xdfff?
+        """
+        return not (value < cls._MIN_S or value > cls._MAX_S)
 
-    def find(self, value):
+    def index(self, value, from_index=0):
+        """
+        Substring index, throws exception if not found. The search starts
+        at from_index; a negative from_index is treated as 0, so the
+        result is never negative. An empty value matches at from_index
+        provided from_index <= len(self).
+        """
+        self._checktype(value)
+        needle = value.buffer
+        width = len(needle)
+        # A match must fit entirely inside the buffer, so the last
+        # candidate start is len(self) - width.
+        for i in range(max(from_index, 0), len(self) - width + 1):
+            # Compare a whole window at once instead of rune by rune.
+            if self.buffer[i:i + width] == needle:
+                return i
+        raise ValueError("substring not found")
+
+    def find(self, value, from_index=0):
+        """
+        Substring index, returns -1 if not found.
+        """
         try:
-            return self.index(value)
+            return self.index(value, from_index)
+        except ValueError:
+            return -1
+
+    def rindex(self, value, from_index=None):
+        """
+        Reverse substring index, throws exception if not found. Only
+        matches whose last rune lies at or before from_index (default and
+        maximum: the last rune of this object) are considered; the start
+        of the rightmost such match is returned.
+        """
+        self._checktype(value)
+        needle = value.buffer
+        width = len(needle)
+        last = len(self) - 1
+        # Clamp instead of indexing past the end of the buffer.
+        if from_index is None or from_index > last:
+            from_index = last
+        for start in range(from_index - width + 1, -1, -1):
+            if self.buffer[start:start + width] == needle:
+                return start
+        raise ValueError("substring not found")
+
+    def rfind(self, value, from_index=None):
+        """
+        Reverse substring index, returns -1 if not found.
+        """
+        try:
+            return self.rindex(value, from_index)
         except ValueError:
             return -1
 
 class Workspace(Runes):
     """
-    A Runes object that acts a bit more string-like, in that __setitem__
-    also accepts a string as an argument and __getitem__ always returns
-    a string. We also return empty strings instead of throwing IndexError
-    when attempting to read out-of-range values, because that makes life
-    easier for us when curling quotes.
+    A Runes object (q.v.) that acts a bit more string-like.
     """
+
     def __setitem__(self, key, value):
         if isinstance(value, str):
             if isinstance(key, int):
-                Runes.__setitem__(self, key, self._ord(value))
+                value = ord(value)
+                if value > 0xffff:
+                    raise ValueError("character not in BMP")
+                super().__setitem__(key, value)
             else:
-                Runes.__setitem__(self, key, Runes(value))
+                super().__setitem__(key, Runes(value))
         else:
-            Runes.__setitem__(self, key, value)
+            super().__setitem__(key, value)
 
     def __getitem__(self, key):
-        view = memoryview(self.buffer)
-        try:
-            result = view[key]
-            if isinstance(result, int):
-                return chr(result)
-            if isinstance(result, memoryview):
-                ret = self.codec.decode(result, 'replace')[0]
+        if isinstance(key, int):
+            return chr(self.buffer[key])
+        elif isinstance(key, slice):
+            view = memoryview(self.buffer)
+            try:
+                result = view[key]
+                if not isinstance(result, memoryview):
+                    assert isinstance(result, int)
+                    return chr(result)
+                ret = self.codec.decode(result, self._ERRORS)[0]
                 result.release()
                 return ret
-            else:
-                raise AssertionError("this shouldn't happen")
-        except IndexError:
-            return ""
-        finally:
-            view.release()
+            finally:
+                view.release()
+        else:
+            raise AssertionError("this shouldn't happen")
+
+    def __contains__(self, value):
+        if not isinstance(value, int):  # str or Runes: substring search
+            return self.find(value) != -1
+        return value in self.buffer  # int: look for a single rune value
+
+    def __iter__(self):
+        # Yields 1-character strings (via __getitem__), not rune values.
+        yield from map(self.__getitem__, range(len(self)))
+
+    def __reversed__(self):
+        # Yields 1-character strings from the last rune to the first.
+        yield from map(self.__getitem__, reversed(range(len(self))))
 
     def append(self, value):
+        """
+        Append string or runes to this item.
+        """
         if isinstance(value, str):
-            Runes.append(self, Runes(value))
-        else:
-            Runes.append(self, value)
+            value = Runes(value)
+        elif not isinstance(value, (int, Runes)):
+            raise TypeError("integer, string, or Runes required")
+        super().append(value)
 
-    def index(self, value):
+    def _runify(self, value):
         if isinstance(value, str):
-            return Runes.index(self, self._ord(value))
+            return Runes(value)
+        elif isinstance(value, Runes):
+            return value
         else:
-            return Runes.index(self, value)
+            raise TypeError("Runes or string required")
 
-    def find(self, value):
+    def index(self, value, from_index=0):
+        """
+        Substring index of a str or Runes value; ValueError if not found.
+        """
+        return super().index(self._runify(value), from_index)
+
+    def find(self, value, from_index=0):
+        """
+        Substring index, returns -1 if not found.
+        """
         try:
-            return self.index(value)
+            return self.index(value, from_index)
         except ValueError:
             return -1
 
-    def _ord(self, string):
-        length = len(string)
-        if length != 1:
-            raise ValueError("expected a character, but string of length {0} found".format(length))
-        raw = Runes(string)
-        if len(raw) != 1:
-            raise ValueError("character not in BMP")
-        return raw[0]
+    def rindex(self, value, from_index=None):
+        """
+        Reverse substring index of a str or Runes value; ValueError if absent.
+        """
+        return super().rindex(self._runify(value), from_index)
+
+    def rfind(self, value, from_index=None):
+        """
+        Reverse substring index of a str or Runes value; -1 if not found.
+        """
+        try:
+            return self.rindex(value, from_index)
+        except ValueError:  # rindex signals "not found" by raising
+            return -1
+
+    def _code_point_at(self, index):
+        # Return the slice covering the code point that starts at index:
+        # two runes for a high+low surrogate pair, otherwise one rune.
+        self._checkindex(index, allow_equal=False)
+        after = index + 1
+        paired = (after < len(self.buffer)
+                  and Runes.is_high_surrogate(self.buffer[index])
+                  and Runes.is_low_surrogate(self.buffer[after]))
+        return slice(index, after + 1 if paired else after)
+
+    def code_point_at(self, index):
+        """
+        Like java.lang.String.codePointAt, but returns the code point that
+        starts at rune offset index as a 1-character string, not an int.
+        """
+        return self[self._code_point_at(index)]
+
+    def _code_point_before(self, index):
+        # Return the slice covering the code point that ends just before
+        # index: two runes for a high+low surrogate pair, otherwise one.
+        last = index - 1
+        # last is dereferenced below, so it must be a real position
+        # (1 <= index <= len), not merely a valid insertion point.
+        self._checkindex(last, allow_equal=False)
+        paired = (last > 0 and Runes.is_low_surrogate(self.buffer[last])
+                  and Runes.is_high_surrogate(self.buffer[last - 1]))
+        return slice(last - 1 if paired else last, index)
+
+    def code_point_before(self, index):
+        """
+        Like java.lang.String.codePointBefore, but returns the code point
+        ending just before rune offset index as a 1-character string.
+        """
+        return self[self._code_point_before(index)]
+
+    def code_point_count(self, begin=None, end=None):
+        """
+        Same behavior as java.lang.String.codePointCount (q.v.): count the
+        code points in runes [begin, end); IndexError on a bad range.
+        """
+        begin = 0 if begin is None else begin
+        end = len(self.buffer) if end is None else end
+        # begin == len is legal (an empty range), e.g. on an empty buffer.
+        self._checkindex(begin, allow_equal=True)
+        self._checkindex(end, allow_equal=True)
+        if begin > end:
+            raise IndexError("invalid range (begin > end)")
+        count = 0
+        while begin < end:
+            begin, count = self._code_point_at(begin).stop, count + 1
+        return count
+
+    def offset_by_code_points(self, index, offset):
+        """
+        Same behavior as java.lang.String.offsetByCodePoints (q.v.): the rune
+        index offset code points away from index (backwards if negative).
+        """
+        self._checkindex(index, allow_equal=True)
+        backwards = offset < 0
+        remaining = abs(offset)
+        while remaining > 0:
+            if backwards:
+                index = self._code_point_before(index).start
+            else:
+                index = self._code_point_at(index).stop
+            remaining -= 1
+        return index