Mercurial > cgi-bin > hgweb.cgi > curlyq
diff runes.py @ 22:a771878f6cf4
Remove deadwood, update runes.py.
author | David Barts <n5jrn@me.com> |
---|---|
date | Mon, 30 Dec 2019 08:16:24 -0800 |
parents | 35f29952b51e |
children |
line wrap: on
line diff
--- a/runes.py Sat Dec 28 06:32:53 2019 -0800 +++ b/runes.py Mon Dec 30 08:16:24 2019 -0800 @@ -1,28 +1,43 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# Something like Java's StringBuilder, but for Python. It needs more +# thorough testing. This runs about 3x slower than io.StringIO, but +# unlike that class allows for easy and meaningful random access via +# subscripts. +# +# This implements two objects: Runes and Workspace. The former is a +# relatively low-level object that deals mostly in UTF-16 rune values; +# however, you can create a Runes object from a Python string, and you +# can create a Python string from a Runes object. Aside from that, Runes +# deals in numeric values, not Python strings. Workspace methods tend +# to accept and return Python strings, which makes them more programmer- +# friendly, at the cost of often having to fire up a codec to convert +# things back and forth between Python strings and UTF-16 runes. + # I m p o r t s import array import codecs -import collections -import struct import sys # C l a s s e s class Runes(object): """ - A mutable, fixed-length sequence of UTF-16 runes. The attributes - encoding and codec contain the name of the encoding and the codec - used to generate the UTF-16. The attribute buffer contains the - buffer (an array of 16-bit unsigned integers) used to back this - object; modifications to that array will be reflected in this - object. + A mutable sequence of UTF-16 runes. The attributes encoding and + codec contain the name of the encoding and the codec used to + generate the UTF-16. The attribute buffer contains the buffer (an + array of 16-bit unsigned integers) used to back this object; + modifications to that array will be reflected in this object. """ # The most efficient 16-bit one on this platform encoding = "UTF-16" + sys.byteorder[0].upper() + "E" codec = codecs.lookup(encoding) + _ERRORS = 'surrogatepass' + _MIN_S = 0xd800 # lowest possible surrogate + _MID_S = 0xdc00 # high surrogate if <, low if >= + _MAX_S = 0xdfff # highest possible surrogate def __init__(self, based_on=None): if isinstance(based_on, array.array): @@ -31,10 +46,9 @@ else: self.buffer = array.array('H', based_on) elif isinstance(based_on, str): - # A string should always be able to encode to runes. - self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0]) + self.buffer = array.array('H', self.codec.encode(based_on, self._ERRORS)[0]) elif based_on is None: - self.buffer = array.array('H', bytes()) + self.buffer = array.array('H') elif isinstance(based_on, Runes): self.buffer = array.array('H', based_on.buffer) else: @@ -45,8 +59,7 @@ Convert this object to a string. We deliberately do not have a __repr__ method, to underscore that runes are not strings. """ - # Runes might not always be able to decode to a string. - return self.codec.decode(self.buffer, 'replace')[0] + return self.codec.decode(self.buffer, self._ERRORS)[0] def __bytes__(self): return bytes(self.buffer) @@ -54,22 +67,37 @@ def __len__(self): return len(self.buffer) + def _checkindex(self, index, allow_equal=False): + ok = 0 <= index <= len(self) if allow_equal else 0 <= index < len(self) + if not ok: + raise IndexError("index {0} out of range".format(index)) + + def _checktype(self, other): + if not isinstance(other, Runes): + raise TypeError("Runes required") + def __lt__(self, other): + self._checktype(other) return self.buffer < other.buffer def __le__(self, other): + self._checktype(other) return self.buffer <= other.buffer def __gt__(self, other): + self._checktype(other) return self.buffer > other.buffer def __ge__(self, other): + self._checktype(other) return self.buffer >= other.buffer def __eq__(self, other): + self._checktype(other) return self.buffer == other.buffer def __ne__(self, other): + self._checktype(other) return self.buffer != other.buffer def __hash__(self): @@ -79,11 +107,12 @@ return bool(self.buffer) def __getitem__(self, key): - ret = self.buffer[key] - if isinstance(ret, array.array): - return Runes(ret) + if isinstance(key, int): + return self.buffer[key] + elif isinstance(key, slice): + return Runes(self.buffer[key]) else: - return ret + raise AssertionError("this shouldn't happen") def __setitem__(self, key, value): if isinstance(key, int): @@ -94,14 +123,25 @@ elif isinstance(value, Runes): self.buffer[key] = value.buffer else: - raise TypeError("runes required") + raise TypeError("Runes required") def __delitem__(self, key): del self.buffer[key] def clear(self): + """ + Remove all data from our buffer. This merely marks the buffer as + empty; it does nothing to destroy its contents by overwriting. + """ del self[:] + def zero(self): + """ + Overwrite our buffer with zeroes. + """ + for i in range(len(self.buffer)): + self.buffer[i] = 0 + def __iter__(self): return iter(self.buffer) @@ -109,82 +149,255 @@ return reversed(self.buffer) def append(self, value): + """ + Append data to our buffer. + """ if isinstance(value, int): self.buffer.append(value) elif isinstance(value, Runes): self.buffer.extend(value.buffer) else: - raise TypeError("integer or runes required") + raise TypeError("integer or Runes required") def __contains__(self, value): return value in self.buffer - def index(self, value): - return self.buffer.index(value) + @classmethod + def is_high_surrogate(cls, value): + """ + Is value in the UTF-16 high surrogate range? + """ + return cls._MIN_S <= value < cls._MID_S + + @classmethod + def is_low_surrogate(cls, value): + """ + Is value in the UTF-16 low surrogate range? + """ + return cls._MIN_S <= value <= cls._MAX_S + + @classmethod + def is_surrogate(cls, value): + """ + Is value in the UTF-16 surrogate range? + """ + return cls._MIN_S <= value <= cls._MAX_S - def find(self, value): + def index(self, value, from_index=0): + """ + Substring index, throws exception if not found. + """ + self._checktype(value) + slimit = len(self) + rlimit = len(value) + for i in range(from_index, len(self)): + match = True + for j in range(rlimit): + k = i + j + if k >= slimit or value.buffer[j] != self.buffer[k]: + match = False + break + if match: + return i + raise ValueError("substring not found") + + def find(self, value, from_index=0): + """ + Substring index, returns -1 if not found. + """ try: - return self.index(value) + return self.index(value, from_index) + except ValueError: + return -1 + + def rindex(self, value, from_index=None): + """ + Reverse substring index, throws exception if not found. + """ + self._checktype(value) + if from_index is None: + from_index = len(self) - 1 + rfrom = len(value) - 1 + for i in range(from_index, -1, -1): + match = True + for j in range(rfrom, -1, -1): + k = i - (rfrom - j) + if k < 0 or value.buffer[j] != self.buffer[k]: + match = False + break + if match: + return i - rfrom + raise ValueError("substring not found") + + def rfind(self, value, from_index=None): + """ + Reverse substring index, returns -1 if not found. + """ + try: + return self.rindex(value, from_index) except ValueError: return -1 class Workspace(Runes): """ - A Runes object that acts a bit more string-like, in that __setitem__ - also accepts a string as an argument and __getitem__ always returns - a string. We also return empty strings instead of throwing IndexError - when attempting to read out-of-range values, because that makes life - easier for us when curling quotes. + A Runes object (q.v.) that acts a bit more string-like. """ + def __setitem__(self, key, value): if isinstance(value, str): if isinstance(key, int): - Runes.__setitem__(self, key, self._ord(value)) + value = ord(value) + if value > 0xffff: + raise ValueError("character not in BMP") + super().__setitem__(key, value) else: - Runes.__setitem__(self, key, Runes(value)) + super().__setitem__(key, Runes(value)) else: - Runes.__setitem__(self, key, value) + super().__setitem__(key, value) def __getitem__(self, key): - view = memoryview(self.buffer) - try: - result = view[key] - if isinstance(result, int): - return chr(result) - if isinstance(result, memoryview): - ret = self.codec.decode(result, 'replace')[0] + if isinstance(key, int): + return chr(self.buffer[key]) + elif isinstance(key, slice): + view = memoryview(self.buffer) + try: + result = view[key] + if not isinstance(result, memoryview): + assert isinstance(result, int) + return chr(result) + ret = self.codec.decode(result, self._ERRORS)[0] result.release() return ret - else: - raise AssertionError("this shouldn't happen") - except IndexError: - return "" - finally: - view.release() + finally: + view.release() + else: + raise AssertionError("this shouldn't happen") + + def __contains__(self, value): + if isinstance(value, int): + return value in self.buffer + return self.find(value) != -1 + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __reversed__(self): + for i in range(len(self)-1, -1, -1): + yield self[i] def append(self, value): + """ + Append string or runes to this item. + """ if isinstance(value, str): - Runes.append(self, Runes(value)) - else: - Runes.append(self, value) + value = Runes(value) + elif not isinstance(value, (int, Runes)): + raise TypeError("integer, string, or Runes required") + super().append(value) - def index(self, value): + def _runify(self, value): if isinstance(value, str): - return Runes.index(self, self._ord(value)) + return Runes(value) + elif isinstance(value, Runes): + return value else: - return Runes.index(self, value) + raise TypeError("Runes or string required") - def find(self, value): + def index(self, value, from_index=0): + """ + Substring index, throws exception if not found. + """ + return super().index(self._runify(value), from_index) + + def find(self, value, from_index=0): + """ + Substring index, returns -1 if not found. + """ try: - return self.index(value) + return self.index(value, from_index) except ValueError: return -1 - def _ord(self, string): - length = len(string) - if length != 1: - raise ValueError("expected a character, but string of length {0} found".format(length)) - raw = Runes(string) - if len(raw) != 1: - raise ValueError("character not in BMP") - return raw[0] + def rindex(self, value, from_index=None): + """ + Reverse substring index, throws exception if not found. + """ + return super().rindex(self._runify(value), from_index) + + def rfind(self, value, from_index=None): + """ + Reverse substring index, returns -1 if not found. + """ + try: + return self.rindex(value, from_index) + except ValueError: + return -1 + + def _code_point_at(self, index): + self._checkindex(index, allow_equal=False) + v0 = self.buffer[index] + i1 = index + 1 + v1 = None if i1 >= len(self.buffer) else self.buffer[i1] + if Runes.is_high_surrogate(v0) and v1 is not None and Runes.is_low_surrogate(v1): + return slice(index, index+2) + else: + return slice(index, i1) + + def code_point_at(self, index): + """ + Similar to java.lang.String.codePointAt, but returns a 1-character + string, not an int. + """ + return self[self._code_point_at(index)] + + def _code_point_before(self, index): + self._checkindex(index - 1, allow_equal=True) + i1 = index - 1 + v1 = self.buffer[i1] + i2 = index - 2 + v2 = None if i2 < 0 else self.buffer[i2] + if Runes.is_low_surrogate(v1) and v2 is not None and Runes.is_high_surrogate(v2): + return slice(i2, index) + else: + return slice(i1, index) + + def code_point_before(self, index): + """ + Similar to java.lang.String.codePointBefore but returns a 1-character + string, not an int. + """ + return self[self._code_point_before(index)] + + def code_point_count(self, begin=None, end=None): + """ + Same behavior as java.lang.String.codePointCount (q.v.). + """ + if begin is None: begin = 0 + if end is None: end = len(self.buffer) + self._checkindex(begin, allow_equal=False) + self._checkindex(end, allow_equal=True) + if begin > end: + raise IndexError("invalid range (begin > end)") + i = begin + ret = 0 + while i < end: + i = self._code_point_at(i).stop + ret += 1 + return ret + + def offset_by_code_points(self, index, offset): + """ + Same behavior as java.lang.String.offsetByCodePoints (q.v.). + """ + self._checkindex(index, allow_equal=True) + if offset < 0: + next = lambda i: self._code_point_before(i).start + else: + next = lambda i: self._code_point_at(i).stop + seen = 0 + limit = abs(offset) + while seen < limit: + index = next(index) + seen += 1 + return index