view runes.py @ 24:f4cc6d8cafe8

Fix bug.
author David Barts <n5jrn@me.com>
date Wed, 15 Jan 2020 11:55:15 -0800
parents a771878f6cf4
children
line wrap: on
line source

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Something like Java's StringBuilder, but for Python. It needs more
# thorough testing. This runs about 3x slower than io.StringIO, but
# unlike that class allows for easy and meaningful random access via
# subscripts.
#
# This implements two objects: Runes and Workspace. The former is a
# relatively low-level object that deals mostly in UTF-16 rune values;
# however, you can create a Runes object from a Python string, and you
# can create a Python string from a Runes object. Aside from that, Runes
# deals in numeric values, not Python strings. Workspace methods tend
# to accept and return Python strings, which makes them more programmer-
# friendly, at the cost of often having to fire up a codec to convert
# things back and forth between Python strings and UTF-16 runes.

# I m p o r t s

import array
import codecs
import sys

# C l a s s e s

class Runes(object):
    """
    A mutable sequence of UTF-16 runes (16-bit code units). The
    attributes encoding and codec contain the name of the encoding and
    the codec used to generate the UTF-16. The attribute buffer contains
    the buffer (an array of 16-bit unsigned integers) used to back this
    object; modifications to that array will be reflected in this object.

    Apart from construction from (and conversion to) a Python string,
    this class deals in numeric rune values, not strings.
    """
    # The most efficient 16-bit one on this platform
    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
    codec = codecs.lookup(encoding)
    # 'surrogatepass' lets unpaired surrogates round-trip through the codec.
    _ERRORS = 'surrogatepass'
    _MIN_S = 0xd800 # lowest possible surrogate
    _MID_S = 0xdc00 # high surrogate if <, low if >=
    _MAX_S = 0xdfff # highest possible surrogate

    # Mutable objects with value-based equality must not be hashable.
    # Setting __hash__ to None makes hash() raise TypeError.
    __hash__ = None

    def __init__(self, based_on=None):
        """
        Create a new object. based_on may be None (empty buffer), a
        Python string (encoded to UTF-16), another Runes object (its
        buffer is copied), an array of typecode 'H' (used directly, NOT
        copied), or anything else array.array('H', ...) accepts.
        """
        if based_on is None:
            self.buffer = array.array('H')
        elif isinstance(based_on, array.array) and based_on.typecode == 'H':
            # Shared on purpose: the caller's array becomes our backing store.
            self.buffer = based_on
        elif isinstance(based_on, str):
            encoded = self.codec.encode(based_on, self._ERRORS)[0]
            self.buffer = array.array('H', encoded)
        elif isinstance(based_on, Runes):
            self.buffer = array.array('H', based_on.buffer)
        else:
            self.buffer = array.array('H', based_on)

    def __str__(self):
        """
        Convert this object to a string. We deliberately do not have a
        __repr__ method, to underscore that runes are not strings.
        """
        return self.codec.decode(self.buffer, self._ERRORS)[0]

    def __bytes__(self):
        """
        Return the raw bytes of the backing array (native byte order).
        """
        return bytes(self.buffer)

    def __len__(self):
        """
        Return the number of runes (not code points) in the buffer.
        """
        return len(self.buffer)

    def _checkindex(self, index, allow_equal=False):
        """
        Raise IndexError unless 0 <= index < len(self), or
        0 <= index <= len(self) when allow_equal is true.
        """
        upper = len(self)
        ok = 0 <= index <= upper if allow_equal else 0 <= index < upper
        if not ok:
            raise IndexError("index {0} out of range".format(index))

    def _checktype(self, other):
        """
        Raise TypeError unless other is a Runes object.
        """
        if not isinstance(other, Runes):
            raise TypeError("Runes required")

    # Comparisons are defined only between Runes objects and compare the
    # backing arrays; any other operand type raises TypeError.

    def __lt__(self, other):
        self._checktype(other)
        return self.buffer < other.buffer

    def __le__(self, other):
        self._checktype(other)
        return self.buffer <= other.buffer

    def __gt__(self, other):
        self._checktype(other)
        return self.buffer > other.buffer

    def __ge__(self, other):
        self._checktype(other)
        return self.buffer >= other.buffer

    def __eq__(self, other):
        self._checktype(other)
        return self.buffer == other.buffer

    def __ne__(self, other):
        self._checktype(other)
        return self.buffer != other.buffer

    def __bool__(self):
        """
        True if the buffer contains at least one rune.
        """
        return bool(self.buffer)

    def __getitem__(self, key):
        """
        An integer key returns the rune value (an int) at that position;
        a slice returns a new Runes object holding a copy of that range.
        """
        if isinstance(key, int):
            return self.buffer[key]
        elif isinstance(key, slice):
            return Runes(self.buffer[key])
        else:
            raise AssertionError("this shouldn't happen")

    def __setitem__(self, key, value):
        """
        Store an integer rune at an integer key, or splice a Runes
        object into the range named by a non-integer (slice) key.
        Raises TypeError for any other combination.
        """
        if isinstance(key, int):
            if isinstance(value, int):
                self.buffer[key] = value
            else:
                raise TypeError("integer required")
        elif isinstance(value, Runes):
            self.buffer[key] = value.buffer
        else:
            raise TypeError("Runes required")

    def __delitem__(self, key):
        """
        Delete the rune or range of runes named by key.
        """
        del self.buffer[key]

    def clear(self):
        """
        Remove all data from our buffer. This merely marks the buffer as
        empty; it does nothing to destroy its contents by overwriting.
        """
        del self[:]

    def zero(self):
        """
        Overwrite our buffer with zeroes. The length is unchanged.
        """
        buf = self.buffer
        for i in range(len(buf)):
            buf[i] = 0

    def __iter__(self):
        """
        Iterate over the rune values (ints) in order.
        """
        return iter(self.buffer)

    def __reversed__(self):
        """
        Iterate over the rune values (ints) in reverse order.
        """
        return reversed(self.buffer)

    def append(self, value):
        """
        Append data to our buffer: either a single integer rune or the
        entire contents of another Runes object.
        """
        if isinstance(value, int):
            self.buffer.append(value)
        elif isinstance(value, Runes):
            self.buffer.extend(value.buffer)
        else:
            raise TypeError("integer or Runes required")

    def __contains__(self, value):
        """
        Is the single rune value present in the buffer?
        """
        return value in self.buffer

    @classmethod
    def is_high_surrogate(cls, value):
        """
        Is value in the UTF-16 high surrogate range?
        """
        return cls._MIN_S <= value < cls._MID_S

    @classmethod
    def is_low_surrogate(cls, value):
        """
        Is value in the UTF-16 low surrogate range?
        """
        # The low range starts at _MID_S; starting at _MIN_S would make
        # this indistinguishable from is_surrogate.
        return cls._MID_S <= value <= cls._MAX_S

    @classmethod
    def is_surrogate(cls, value):
        """
        Is value in the UTF-16 surrogate range?
        """
        return cls._MIN_S <= value <= cls._MAX_S

    def _matches_at(self, needle, start):
        """
        Does the array needle occur in our buffer beginning at start?
        The caller guarantees 0 <= start and that
        start + len(needle) <= len(self), so no index can wrap around.
        """
        hay = self.buffer
        return all(hay[start + j] == rune for j, rune in enumerate(needle))

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found. The search
        starts at from_index; a negative from_index is treated as 0.
        An empty value matches at from_index provided that is not past
        the end of the buffer. Raises TypeError if value is not a Runes
        object and ValueError if there is no match.
        """
        self._checktype(value)
        needle = value.buffer
        # Last position at which the needle still fits in the buffer.
        last_start = len(self.buffer) - len(needle)
        for start in range(max(from_index, 0), last_start + 1):
            if self._matches_at(needle, start):
                return start
        raise ValueError("substring not found")

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found. Here
        from_index is the index of the last rune that a match may
        include; it defaults to (and is capped at) the last rune of the
        buffer. A negative from_index never matches. Returns the index
        at which the rightmost such match starts. Raises TypeError if
        value is not a Runes object and ValueError if there is no match.
        """
        self._checktype(value)
        needle = value.buffer
        last = len(self.buffer) - 1
        if from_index is None or from_index > last:
            from_index = last
        elif from_index < 0:
            raise ValueError("substring not found")
        width = len(needle)
        # A match whose final rune is at `end` begins at end - width + 1;
        # stopping at width - 2 keeps that start from going negative.
        for end in range(from_index, width - 2, -1):
            start = end - width + 1
            if self._matches_at(needle, start):
                return start
        raise ValueError("substring not found")

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

class Workspace(Runes):
    """
    A Runes object (q.v.) that acts a bit more string-like: subscripts
    and iteration yield Python strings, and most methods accept Python
    strings as well as Runes objects.
    """

    def __setitem__(self, key, value):
        """
        Like Runes.__setitem__, but also accepts strings. With an
        integer key the string must be a single character in the BMP
        (ValueError otherwise); with any other key the string is
        converted to runes and spliced in.
        """
        if not isinstance(value, str):
            super().__setitem__(key, value)
        elif isinstance(key, int):
            rune = ord(value)
            if rune > 0xffff:
                raise ValueError("character not in BMP")
            super().__setitem__(key, rune)
        else:
            super().__setitem__(key, Runes(value))

    def __getitem__(self, key):
        """
        An integer key returns the rune at that position as a
        1-character string; a slice returns the decoded string for that
        range of runes (any step is supported).
        """
        if isinstance(key, int):
            return chr(self.buffer[key])
        elif isinstance(key, slice):
            # The context managers release both views even if decoding
            # fails, so the backing array never stays locked.
            with memoryview(self.buffer) as view, view[key] as window:
                # The codec needs a contiguous buffer. A stepped or
                # reversed slice is not contiguous, so copy it out first.
                data = window if window.contiguous else window.tobytes()
                return self.codec.decode(data, self._ERRORS)[0]
        else:
            raise AssertionError("this shouldn't happen")

    def __contains__(self, value):
        """
        An integer is looked up as a single rune; anything else is
        treated as a substring search (string or Runes).
        """
        if isinstance(value, int):
            return value in self.buffer
        return self.find(value) != -1

    def __iter__(self):
        """
        Iterate over the runes in order, each as a 1-character string.
        """
        for rune in self.buffer:
            yield chr(rune)

    def __reversed__(self):
        """
        Iterate over the runes in reverse order, each as a 1-character
        string.
        """
        for rune in reversed(self.buffer):
            yield chr(rune)

    def append(self, value):
        """
        Append string or runes to this item. A single integer rune is
        accepted as well; anything else raises TypeError.
        """
        if isinstance(value, str):
            value = Runes(value)
        elif not isinstance(value, (int, Runes)):
            raise TypeError("integer, string, or Runes required")
        super().append(value)

    def _runify(self, value):
        """
        Return value as a Runes object, converting it if it is a
        string. Raises TypeError for any other type.
        """
        if isinstance(value, str):
            return Runes(value)
        elif isinstance(value, Runes):
            return value
        else:
            raise TypeError("Runes or string required")

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found. Accepts a
        string or a Runes object.
        """
        return super().index(self._runify(value), from_index)

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found. Accepts
        a string or a Runes object.
        """
        return super().rindex(self._runify(value), from_index)

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

    def _code_point_at(self, index):
        """
        Return the slice of runes making up the code point that starts
        at index: two runes when Runes.is_high_surrogate accepts the
        rune at index and Runes.is_low_surrogate accepts the one after
        it, otherwise one rune. Raises IndexError unless
        0 <= index < len(self).
        """
        self._checkindex(index, allow_equal=False)
        after = index + 1
        lead = self.buffer[index]
        trail = self.buffer[after] if after < len(self.buffer) else None
        if trail is not None and Runes.is_high_surrogate(lead) \
                and Runes.is_low_surrogate(trail):
            return slice(index, index + 2)
        return slice(index, after)

    def code_point_at(self, index):
        """
        Similar to java.lang.String.codePointAt, but returns a 1-character
        string, not an int.
        """
        return self[self._code_point_at(index)]

    def _code_point_before(self, index):
        """
        Return the slice of runes making up the code point that ends
        just before index: two runes when Runes.is_low_surrogate accepts
        the rune at index - 1 and Runes.is_high_surrogate accepts the
        one before it, otherwise one rune. Raises IndexError unless
        1 <= index <= len(self).
        """
        # index - 1 is read below, so it must be a valid position.
        self._checkindex(index - 1, allow_equal=False)
        prev = index - 1
        prev2 = index - 2
        trail = self.buffer[prev]
        lead = self.buffer[prev2] if prev2 >= 0 else None
        if lead is not None and Runes.is_low_surrogate(trail) \
                and Runes.is_high_surrogate(lead):
            return slice(prev2, index)
        return slice(prev, index)

    def code_point_before(self, index):
        """
        Similar to java.lang.String.codePointBefore but returns a 1-character
        string, not an int.
        """
        return self[self._code_point_before(index)]

    def code_point_count(self, begin=None, end=None):
        """
        Modelled on java.lang.String.codePointCount (q.v.): count the
        code points in the rune range [begin, end). begin defaults to 0
        and end to the length of the buffer. Raises IndexError unless
        0 <= begin <= end <= len(self).
        """
        if begin is None: begin = 0
        if end is None: end = len(self.buffer)
        # begin may equal the length: an empty range (including the
        # whole of an empty workspace) simply contains 0 code points.
        self._checkindex(begin, allow_equal=True)
        self._checkindex(end, allow_equal=True)
        if begin > end:
            raise IndexError("invalid range (begin > end)")
        pos = begin
        count = 0
        while pos < end:
            pos = self._code_point_at(pos).stop
            count += 1
        return count

    def offset_by_code_points(self, index, offset):
        """
        Modelled on java.lang.String.offsetByCodePoints (q.v.): return
        the rune index reached by moving offset code points from index
        (backwards if offset is negative). Raises IndexError if index is
        outside 0..len(self) or the buffer runs out of code points in
        the requested direction.
        """
        self._checkindex(index, allow_equal=True)
        backwards = offset < 0
        for _ in range(abs(offset)):
            if backwards:
                index = self._code_point_before(index).start
            else:
                index = self._code_point_at(index).stop
        return index