Mercurial > cgi-bin > hgweb.cgi > curlyq
view runes.py @ 29:d5bf9985b5c4 default tip
Add degree symbol, fix bug in HTML curler.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 07 Oct 2021 11:55:46 -0700 |
parents | a771878f6cf4 |
children |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Something like Java's StringBuilder, but for Python. It needs more
# thorough testing. This runs about 3x slower than io.StringIO, but
# unlike that class allows for easy and meaningful random access via
# subscripts.
#
# This implements two objects: Runes and Workspace. The former is a
# relatively low-level object that deals mostly in UTF-16 rune values;
# however, you can create a Runes object from a Python string, and you
# can create a Python string from a Runes object. Aside from that, Runes
# deals in numeric values, not Python strings. Workspace methods tend
# to accept and return Python strings, which makes them more programmer-
# friendly, at the cost of often having to fire up a codec to convert
# things back and forth between Python strings and UTF-16 runes.

# I m p o r t s

import array
import codecs
import sys

# C l a s s e s

class Runes(object):
    """
    A mutable sequence of UTF-16 runes. The attributes encoding and
    codec contain the name of the encoding and the codec used to
    generate the UTF-16. The attribute buffer contains the buffer (an
    array of 16-bit unsigned integers) used to back this object;
    modifications to that array will be reflected in this object.
    """
    # The most efficient 16-bit one on this platform
    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
    codec = codecs.lookup(encoding)
    _ERRORS = 'surrogatepass'
    _MIN_S = 0xd800 # lowest possible surrogate
    _MID_S = 0xdc00 # high surrogate if <, low if >=
    _MAX_S = 0xdfff # highest possible surrogate

    def __init__(self, based_on=None):
        """
        Create a new object. based_on may be None (empty buffer), a
        string (encoded to UTF-16), another Runes object (its buffer is
        copied), an array of typecode 'H' (adopted as-is, NOT copied),
        or anything else array.array('H', ...) accepts (copied).
        """
        if isinstance(based_on, array.array):
            if based_on.typecode == 'H':
                # Shared on purpose: the caller's array backs this object.
                self.buffer = based_on
            else:
                self.buffer = array.array('H', based_on)
        elif isinstance(based_on, str):
            self.buffer = array.array('H', self.codec.encode(based_on, self._ERRORS)[0])
        elif based_on is None:
            self.buffer = array.array('H')
        elif isinstance(based_on, Runes):
            self.buffer = array.array('H', based_on.buffer)
        else:
            self.buffer = array.array('H', based_on)

    def __str__(self):
        """
        Convert this object to a string. We deliberately do not have a
        __repr__ method, to underscore that runes are not strings.
        """
        return self.codec.decode(self.buffer, self._ERRORS)[0]

    def __bytes__(self):
        return bytes(self.buffer)

    def __len__(self):
        return len(self.buffer)

    def _checkindex(self, index, allow_equal=False):
        """
        Raise IndexError unless 0 <= index < len(self) (or <= len(self)
        when allow_equal is true).
        """
        ok = 0 <= index <= len(self) if allow_equal else 0 <= index < len(self)
        if not ok:
            raise IndexError("index {0} out of range".format(index))

    def _checktype(self, other):
        """
        Raise TypeError unless other is a Runes object.
        """
        if not isinstance(other, Runes):
            raise TypeError("Runes required")

    # All comparisons are only defined between Runes objects; anything
    # else raises TypeError. They compare the backing arrays directly.

    def __lt__(self, other):
        self._checktype(other)
        return self.buffer < other.buffer

    def __le__(self, other):
        self._checktype(other)
        return self.buffer <= other.buffer

    def __gt__(self, other):
        self._checktype(other)
        return self.buffer > other.buffer

    def __ge__(self, other):
        self._checktype(other)
        return self.buffer >= other.buffer

    def __eq__(self, other):
        self._checktype(other)
        return self.buffer == other.buffer

    def __ne__(self, other):
        self._checktype(other)
        return self.buffer != other.buffer

    def __hash__(self):
        # Mutable, so never hashable.
        raise TypeError("unhashable type")

    def __bool__(self):
        return bool(self.buffer)

    def __getitem__(self, key):
        """
        An integer key returns the rune value (an int); a slice returns
        a new Runes object holding a copy of the selected runes.
        """
        if isinstance(key, int):
            return self.buffer[key]
        elif isinstance(key, slice):
            return Runes(self.buffer[key])
        else:
            raise AssertionError("this shouldn't happen")

    def __setitem__(self, key, value):
        """
        An integer key requires an integer value; any other key (i.e. a
        slice) requires a Runes value. Raises TypeError otherwise.
        """
        if isinstance(key, int):
            if isinstance(value, int):
                self.buffer[key] = value
            else:
                raise TypeError("integer required")
        elif isinstance(value, Runes):
            self.buffer[key] = value.buffer
        else:
            raise TypeError("Runes required")

    def __delitem__(self, key):
        del self.buffer[key]

    def clear(self):
        """
        Remove all data from our buffer. This merely marks the buffer as
        empty; it does nothing to destroy its contents by overwriting.
        """
        del self[:]

    def zero(self):
        """
        Overwrite our buffer with zeroes.
        """
        for i in range(len(self.buffer)):
            self.buffer[i] = 0

    def __iter__(self):
        return iter(self.buffer)

    def __reversed__(self):
        return reversed(self.buffer)

    def append(self, value):
        """
        Append data to our buffer. An integer appends a single rune; a
        Runes object appends all of its runes. Raises TypeError for
        anything else.
        """
        if isinstance(value, int):
            self.buffer.append(value)
        elif isinstance(value, Runes):
            self.buffer.extend(value.buffer)
        else:
            raise TypeError("integer or Runes required")

    def __contains__(self, value):
        return value in self.buffer

    @classmethod
    def is_high_surrogate(cls, value):
        """
        Is value in the UTF-16 high surrogate range?
        """
        return cls._MIN_S <= value < cls._MID_S

    @classmethod
    def is_low_surrogate(cls, value):
        """
        Is value in the UTF-16 low surrogate range?
        """
        # Low surrogates start at _MID_S; starting at _MIN_S would also
        # (wrongly) accept every high surrogate.
        return cls._MID_S <= value <= cls._MAX_S

    @classmethod
    def is_surrogate(cls, value):
        """
        Is value in the UTF-16 surrogate range?
        """
        return cls._MIN_S <= value <= cls._MAX_S

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found. value must be a
        Runes object (else TypeError); the search starts at from_index
        and the index of the first rune of the match is returned.
        """
        self._checktype(value)
        slimit = len(self)
        rlimit = len(value)
        for i in range(from_index, len(self)):
            match = True
            for j in range(rlimit):
                k = i + j
                # Running off the end of our buffer counts as a mismatch.
                if k >= slimit or value.buffer[j] != self.buffer[k]:
                    match = False
                    break
            if match:
                return i
        raise ValueError("substring not found")

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found. value
        must be a Runes object (else TypeError). from_index is the
        position at which the LAST rune of a match may lie (default: the
        last rune of this object); the index of the first rune of the
        match is returned.
        """
        self._checktype(value)
        if from_index is None:
            from_index = len(self) - 1
        rfrom = len(value) - 1
        for i in range(from_index, -1, -1):
            match = True
            # Compare backwards, with value's last rune aligned at i.
            for j in range(rfrom, -1, -1):
                k = i - (rfrom - j)
                if k < 0 or value.buffer[j] != self.buffer[k]:
                    match = False
                    break
            if match:
                return i - rfrom
        raise ValueError("substring not found")

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

class Workspace(Runes):
    """
    A Runes object (q.v.) that acts a bit more string-like.
    """
    def __setitem__(self, key, value):
        """
        Like Runes.__setitem__, but also accepts strings: a one-character
        BMP string for an integer key (ValueError if the character is
        outside the BMP), or any string for a slice key.
        """
        if isinstance(value, str):
            if isinstance(key, int):
                value = ord(value)
                if value > 0xffff:
                    raise ValueError("character not in BMP")
                super().__setitem__(key, value)
            else:
                super().__setitem__(key, Runes(value))
        else:
            super().__setitem__(key, value)

    def __getitem__(self, key):
        """
        An integer key returns the rune at that position as a
        one-character string; a slice returns the decoded string for the
        selected runes.
        """
        if isinstance(key, int):
            return chr(self.buffer[key])
        elif isinstance(key, slice):
            # Decode straight out of the backing array (no copy) when the
            # selected runes are contiguous in memory.
            with memoryview(self.buffer) as view:
                with view[key] as part:
                    if part.c_contiguous:
                        return self.codec.decode(part, self._ERRORS)[0]
            # Stepped slices are not contiguous and the codec cannot read
            # them through a memoryview, so decode a copy instead.
            return self.codec.decode(self.buffer[key], self._ERRORS)[0]
        else:
            raise AssertionError("this shouldn't happen")

    def __contains__(self, value):
        """
        An integer is looked up as a rune value; anything else is
        treated as a substring search.
        """
        if isinstance(value, int):
            return value in self.buffer
        return self.find(value) != -1

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def __reversed__(self):
        for i in range(len(self)-1, -1, -1):
            yield self[i]

    def append(self, value):
        """
        Append string or runes to this item. Integers (single rune
        values) are accepted as well; anything else raises TypeError.
        """
        if isinstance(value, str):
            value = Runes(value)
        elif not isinstance(value, (int, Runes)):
            raise TypeError("integer, string, or Runes required")
        super().append(value)

    def _runify(self, value):
        """
        Return value as a Runes object, converting it if it is a string.
        Raises TypeError for anything else.
        """
        if isinstance(value, str):
            return Runes(value)
        elif isinstance(value, Runes):
            return value
        else:
            raise TypeError("Runes or string required")

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found.
        """
        return super().index(self._runify(value), from_index)

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found.
        """
        return super().rindex(self._runify(value), from_index)

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

    def _code_point_at(self, index):
        """
        Return the slice covering the code point that starts at index:
        two runes for a high surrogate followed by a low surrogate, one
        rune otherwise. Raises IndexError if index is out of range.
        """
        self._checkindex(index, allow_equal=False)
        v0 = self.buffer[index]
        i1 = index + 1
        v1 = None if i1 >= len(self.buffer) else self.buffer[i1]
        if self.is_high_surrogate(v0) and v1 is not None and self.is_low_surrogate(v1):
            return slice(index, index+2)
        else:
            return slice(index, i1)

    def code_point_at(self, index):
        """
        Similar to java.lang.String.codePointAt, but returns a 1-character
        string, not an int.
        """
        return self[self._code_point_at(index)]

    def _code_point_before(self, index):
        """
        Return the slice covering the code point that ends just before
        index: two runes for a low surrogate preceded by a high
        surrogate, one rune otherwise. Raises IndexError unless
        1 <= index <= len(self).
        """
        i1 = index - 1
        # i1 is read below, so it must be a real position in the buffer.
        self._checkindex(i1, allow_equal=False)
        v1 = self.buffer[i1]
        i2 = index - 2
        v2 = None if i2 < 0 else self.buffer[i2]
        if self.is_low_surrogate(v1) and v2 is not None and self.is_high_surrogate(v2):
            return slice(i2, index)
        else:
            return slice(i1, index)

    def code_point_before(self, index):
        """
        Similar to java.lang.String.codePointBefore but returns a
        1-character string, not an int.
        """
        return self[self._code_point_before(index)]

    def code_point_count(self, begin=None, end=None):
        """
        Same behavior as java.lang.String.codePointCount (q.v.). begin
        defaults to 0 and end to the length of the buffer. Raises
        IndexError if either is out of range or if begin > end.
        """
        if begin is None:
            begin = 0
        if end is None:
            end = len(self.buffer)
        # begin may equal the length (an empty range at the very end);
        # this is also what makes an empty workspace count as 0.
        self._checkindex(begin, allow_equal=True)
        self._checkindex(end, allow_equal=True)
        if begin > end:
            raise IndexError("invalid range (begin > end)")
        i = begin
        ret = 0
        while i < end:
            i = self._code_point_at(i).stop
            ret += 1
        return ret

    def offset_by_code_points(self, index, offset):
        """
        Same behavior as java.lang.String.offsetByCodePoints (q.v.).
        Returns the rune index reached by moving offset code points from
        index (backwards if offset is negative). Raises IndexError if
        index is out of range or the move runs off either end.
        """
        self._checkindex(index, allow_equal=True)
        if offset < 0:
            step = lambda i: self._code_point_before(i).start
        else:
            step = lambda i: self._code_point_at(i).stop
        seen = 0
        limit = abs(offset)
        while seen < limit:
            index = step(index)
            seen += 1
        return index