comparison runes.py @ 22:a771878f6cf4

Remove deadwood, update runes.py.
author David Barts <n5jrn@me.com>
date Mon, 30 Dec 2019 08:16:24 -0800
parents 35f29952b51e
children
comparison
equal deleted inserted replaced
21:35f29952b51e 22:a771878f6cf4
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3
4 # Something like Java's StringBuilder, but for Python. It needs more
5 # thorough testing. This runs about 3x slower than io.StringIO, but
6 # unlike that class allows for easy and meaningful random access via
7 # subscripts.
8 #
9 # This implements two objects: Runes and Workspace. The former is a
10 # relatively low-level object that deals mostly in UTF-16 rune values;
11 # however, you can create a Runes object from a Python string, and you
12 # can create a Python string from a Runes object. Aside from that, Runes
13 # deals in numeric values, not Python strings. Workspace methods tend
14 # to accept and return Python strings, which makes them more programmer-
15 # friendly, at the cost of often having to fire up a codec to convert
16 # things back and forth between Python strings and UTF-16 runes.
17
4 # I m p o r t s 18 # I m p o r t s
5 19
6 import array 20 import array
7 import codecs 21 import codecs
8 import collections
9 import struct
10 import sys 22 import sys
11 23
12 # C l a s s e s 24 # C l a s s e s
13 25
class Runes(object):
    """
    A mutable sequence of UTF-16 runes. The attributes encoding and
    codec contain the name of the encoding and the codec used to
    generate the UTF-16. The attribute buffer contains the buffer (an
    array of 16-bit unsigned integers) used to back this object;
    modifications to that array will be reflected in this object.
    """
    # The most efficient 16-bit one on this platform
    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
    codec = codecs.lookup(encoding)
    # Lets lone surrogates survive a str -> runes -> str round trip.
    _ERRORS = 'surrogatepass'
    _MIN_S = 0xd800  # lowest possible surrogate
    _MID_S = 0xdc00  # high surrogate if <, low if >=
    _MAX_S = 0xdfff  # highest possible surrogate

    def __init__(self, based_on=None):
        """
        Create a new object. based_on may be None (empty), a string
        (encoded to UTF-16), another Runes object (copied), an array
        (shared, not copied, when its typecode is already 'H'), or
        anything else array.array('H', ...) accepts.
        """
        if isinstance(based_on, array.array):
            if based_on.typecode == 'H':
                # Deliberately shared: changes to the caller's array
                # show up here and vice versa.
                self.buffer = based_on
            else:
                self.buffer = array.array('H', based_on)
        elif isinstance(based_on, str):
            encoded = self.codec.encode(based_on, self._ERRORS)[0]
            self.buffer = array.array('H', encoded)
        elif based_on is None:
            self.buffer = array.array('H')
        elif isinstance(based_on, Runes):
            self.buffer = array.array('H', based_on.buffer)
        else:
            self.buffer = array.array('H', based_on)

    def __str__(self):
        """
        Convert this object to a string. We deliberately do not have a
        __repr__ method, to underscore that runes are not strings.
        """
        return self.codec.decode(self.buffer, self._ERRORS)[0]

    def __bytes__(self):
        return bytes(self.buffer)

    def __len__(self):
        return len(self.buffer)

    def _checkindex(self, index, allow_equal=False):
        """
        Raise IndexError unless 0 <= index < len(self), or, when
        allow_equal is true, 0 <= index <= len(self).
        """
        limit = len(self)
        ok = 0 <= index <= limit if allow_equal else 0 <= index < limit
        if not ok:
            raise IndexError("index {0} out of range".format(index))

    def _checktype(self, other):
        """
        Raise TypeError unless other is a Runes object.
        """
        if not isinstance(other, Runes):
            raise TypeError("Runes required")

    def __lt__(self, other):
        self._checktype(other)
        return self.buffer < other.buffer

    def __le__(self, other):
        self._checktype(other)
        return self.buffer <= other.buffer

    def __gt__(self, other):
        self._checktype(other)
        return self.buffer > other.buffer

    def __ge__(self, other):
        self._checktype(other)
        return self.buffer >= other.buffer

    def __eq__(self, other):
        self._checktype(other)
        return self.buffer == other.buffer

    def __ne__(self, other):
        self._checktype(other)
        return self.buffer != other.buffer

    def __hash__(self):
        # Mutable, so it must never be used as a dict key or set member.
        raise TypeError("unhashable type")

    def __bool__(self):
        return bool(self.buffer)

    def __getitem__(self, key):
        """
        An integer key returns a single rune value (an int); a slice
        returns a new Runes object holding a copy of that range.
        """
        if isinstance(key, int):
            return self.buffer[key]
        elif isinstance(key, slice):
            return Runes(self.buffer[key])
        else:
            raise AssertionError("this shouldn't happen")

    def __setitem__(self, key, value):
        """
        An integer key requires an integer value; any other key (i.e. a
        slice) requires a Runes value.
        """
        if isinstance(key, int):
            if isinstance(value, int):
                self.buffer[key] = value
            else:
                raise TypeError("integer required")
        elif isinstance(value, Runes):
            self.buffer[key] = value.buffer
        else:
            raise TypeError("Runes required")

    def __delitem__(self, key):
        del self.buffer[key]

    def clear(self):
        """
        Remove all data from our buffer. This merely marks the buffer as
        empty; it does nothing to destroy its contents by overwriting.
        """
        del self[:]

    def zero(self):
        """
        Overwrite our buffer with zeroes.
        """
        # Element-by-element stores, so the existing storage itself is
        # what gets overwritten.
        for i in range(len(self.buffer)):
            self.buffer[i] = 0

    def __iter__(self):
        return iter(self.buffer)

    def __reversed__(self):
        return reversed(self.buffer)

    def append(self, value):
        """
        Append data to our buffer. An integer is appended as a single
        rune; a Runes object has all of its runes appended.
        """
        if isinstance(value, int):
            self.buffer.append(value)
        elif isinstance(value, Runes):
            self.buffer.extend(value.buffer)
        else:
            raise TypeError("integer or Runes required")

    def __contains__(self, value):
        return value in self.buffer

    @classmethod
    def is_high_surrogate(cls, value):
        """
        Is value in the UTF-16 high surrogate range?
        """
        return cls._MIN_S <= value < cls._MID_S

    @classmethod
    def is_low_surrogate(cls, value):
        """
        Is value in the UTF-16 low surrogate range?
        """
        # The low range starts at _MID_S; starting at _MIN_S would also
        # accept every high surrogate.
        return cls._MID_S <= value <= cls._MAX_S

    @classmethod
    def is_surrogate(cls, value):
        """
        Is value in the UTF-16 surrogate range?
        """
        return cls._MIN_S <= value <= cls._MAX_S

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found. The search
        starts at from_index; a negative from_index counts back from
        the end, as with Python sequences.
        """
        self._checktype(value)
        haystack = self.buffer
        needle = value.buffer
        hay_len = len(haystack)
        needle_len = len(needle)
        if from_index < 0:
            # Normalise up front so that negative subscripts can never
            # wrap around the end of the buffer during comparison.
            from_index = max(0, from_index + hay_len)
        # Only start positions where the whole needle still fits.
        for start in range(from_index, hay_len - needle_len + 1):
            if all(haystack[start + j] == needle[j] for j in range(needle_len)):
                return start
        raise ValueError("substring not found")

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found. Here
        from_index is the highest position the LAST rune of a match may
        occupy; it defaults to (and is capped at) the final rune. The
        value returned is the position of the first rune of the match.
        """
        self._checktype(value)
        haystack = self.buffer
        needle = value.buffer
        hay_len = len(haystack)
        if from_index is None or from_index >= hay_len:
            from_index = hay_len - 1
        last = len(needle) - 1
        # Stopping at end == last keeps the start of the match >= 0.
        for end in range(from_index, last - 1, -1):
            start = end - last
            if all(haystack[start + j] == needle[j] for j in range(last + 1)):
                return start
        raise ValueError("substring not found")

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

class Workspace(Runes):
    """
    A Runes object (q.v.) that acts a bit more string-like: subscripts
    return strings, strings are accepted wherever runes are, and there
    are helpers for working in whole code points rather than runes.
    """

    def __setitem__(self, key, value):
        """
        Like Runes.__setitem__, but value may also be a string. With an
        integer key the string must be a single BMP character.
        """
        if isinstance(value, str):
            if isinstance(key, int):
                value = ord(value)
                if value > 0xffff:
                    # One rune cannot hold a supplementary character.
                    raise ValueError("character not in BMP")
                super().__setitem__(key, value)
            else:
                super().__setitem__(key, Runes(value))
        else:
            super().__setitem__(key, value)

    def __getitem__(self, key):
        """
        Return a string: one character for an integer key, the decoded
        range for a slice.
        """
        if isinstance(key, int):
            return chr(self.buffer[key])
        elif isinstance(key, slice):
            # Decode straight from a view when possible, to avoid
            # copying the range first.
            with memoryview(self.buffer) as view, view[key] as part:
                if part.contiguous:
                    return self.codec.decode(part, self._ERRORS)[0]
            # A stepped slice is not contiguous and the codec rejects
            # such views, so decode a copy instead.
            return self.codec.decode(self.buffer[key], self._ERRORS)[0]
        else:
            raise AssertionError("this shouldn't happen")

    def __contains__(self, value):
        if isinstance(value, int):
            return value in self.buffer
        return self.find(value) != -1

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def __reversed__(self):
        for i in range(len(self)-1, -1, -1):
            yield self[i]

    def append(self, value):
        """
        Append string or runes to this item.
        """
        if isinstance(value, str):
            value = Runes(value)
        elif not isinstance(value, (int, Runes)):
            raise TypeError("integer, string, or Runes required")
        super().append(value)

    def _runify(self, value):
        """
        Return value as a Runes object, converting it if it is a string.
        """
        if isinstance(value, str):
            return Runes(value)
        elif isinstance(value, Runes):
            return value
        else:
            raise TypeError("Runes or string required")

    def index(self, value, from_index=0):
        """
        Substring index, throws exception if not found.
        """
        return super().index(self._runify(value), from_index)

    def find(self, value, from_index=0):
        """
        Substring index, returns -1 if not found.
        """
        try:
            return self.index(value, from_index)
        except ValueError:
            return -1

    def rindex(self, value, from_index=None):
        """
        Reverse substring index, throws exception if not found.
        """
        return super().rindex(self._runify(value), from_index)

    def rfind(self, value, from_index=None):
        """
        Reverse substring index, returns -1 if not found.
        """
        try:
            return self.rindex(value, from_index)
        except ValueError:
            return -1

    def _code_point_at(self, index):
        """
        Return the slice covering the code point that starts at index:
        two runes for a high/low surrogate pair, otherwise one.
        """
        self._checkindex(index, allow_equal=False)
        following = index + 1
        if (following < len(self.buffer)
                and self._MIN_S <= self.buffer[index] < self._MID_S
                and self._MID_S <= self.buffer[following] <= self._MAX_S):
            return slice(index, index + 2)
        return slice(index, following)

    def code_point_at(self, index):
        """
        Similar to java.lang.String.codePointAt, but returns a 1-character
        string, not an int.
        """
        return self[self._code_point_at(index)]

    def _code_point_before(self, index):
        """
        Return the slice covering the code point that ends just before
        index: two runes for a high/low surrogate pair, otherwise one.
        """
        previous = index - 1
        # The rune at index - 1 must exist, so index runs 1..len(self).
        self._checkindex(previous, allow_equal=False)
        before_that = index - 2
        if (before_that >= 0
                and self._MID_S <= self.buffer[previous] <= self._MAX_S
                and self._MIN_S <= self.buffer[before_that] < self._MID_S):
            return slice(before_that, index)
        return slice(previous, index)

    def code_point_before(self, index):
        """
        Similar to java.lang.String.codePointBefore but returns a 1-character
        string, not an int.
        """
        return self[self._code_point_before(index)]

    def code_point_count(self, begin=None, end=None):
        """
        Same behavior as java.lang.String.codePointCount (q.v.).
        """
        if begin is None:
            begin = 0
        if end is None:
            end = len(self.buffer)
        # Both ends may equal the length, which makes an empty range
        # (including the whole of an empty workspace) count as zero.
        self._checkindex(begin, allow_equal=True)
        self._checkindex(end, allow_equal=True)
        if begin > end:
            raise IndexError("invalid range (begin > end)")
        position = begin
        count = 0
        while position < end:
            position = self._code_point_at(position).stop
            count += 1
        return count

    def offset_by_code_points(self, index, offset):
        """
        Same behavior as java.lang.String.offsetByCodePoints (q.v.).
        """
        self._checkindex(index, allow_equal=True)
        if offset < 0:
            def step(i):
                return self._code_point_before(i).start
        else:
            def step(i):
                return self._code_point_at(i).stop
        # Running off either end raises IndexError from the helpers.
        for _ in range(abs(offset)):
            index = step(index)
        return index
385 i = self._code_point_at(i).stop
386 ret += 1
387 return ret
388
389 def offset_by_code_points(self, index, offset):
390 """
391 Same behavior as java.lang.String.offsetByCodePoints (q.v.).
392 """
393 self._checkindex(index, allow_equal=True)
394 if offset < 0:
395 next = lambda i: self._code_point_before(i).start
396 else:
397 next = lambda i: self._code_point_at(i).stop
398 seen = 0
399 limit = abs(offset)
400 while seen < limit:
401 index = next(index)
402 seen += 1
403 return index