Mercurial > cgi-bin > hgweb.cgi > curlyq
comparison runes.py @ 22:a771878f6cf4
Remove deadwood, update runes.py.
author | David Barts <n5jrn@me.com> |
---|---|
date | Mon, 30 Dec 2019 08:16:24 -0800 |
parents | 35f29952b51e |
children |
comparison
legend: equal, deleted, inserted, replaced
21:35f29952b51e | 22:a771878f6cf4 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 | 3 |
4 # Something like Java's StringBuilder, but for Python. It needs more | |
5 # thorough testing. This runs about 3x slower than io.StringIO, but | |
6 # unlike that class allows for easy and meaningful random access via | |
7 # subscripts. | |
8 # | |
9 # This implements two objects: Runes and Workspace. The former is a | |
10 # relatively low-level object that deals mostly in UTF-16 rune values; | |
11 # however, you can create a Runes object from a Python string, and you | |
12 # can create a Python string from a Runes object. Aside from that, Runes | |
13 # deals in numeric values, not Python strings. Workspace methods tend | |
14 # to accept and return Python strings, which makes them more programmer- | |
15 # friendly, at the cost of often having to fire up a codec to convert | |
16 # things back and forth between Python strings and UTF-16 runes. | |
17 | |
4 # I m p o r t s | 18 # I m p o r t s |
5 | 19 |
6 import array | 20 import array |
7 import codecs | 21 import codecs |
8 import collections | |
9 import struct | |
10 import sys | 22 import sys |
11 | 23 |
12 # C l a s s e s | 24 # C l a s s e s |
13 | 25 |
14 class Runes(object): | 26 class Runes(object): |
15 """ | 27 """ |
16 A mutable, fixed-length sequence of UTF-16 runes. The attributes | 28 A mutable sequence of UTF-16 runes. The attributes encoding and |
17 encoding and codec contain the name of the encoding and the codec | 29 codec contain the name of the encoding and the codec used to |
18 used to generate the UTF-16. The attribute buffer contains the | 30 generate the UTF-16. The attribute buffer contains the buffer (an |
19 buffer (an array of 16-bit unsigned integers) used to back this | 31 array of 16-bit unsigned integers) used to back this object; |
20 object; modifications to that array will be reflected in this | 32 modifications to that array will be reflected in this object. |
21 object. | |
22 """ | 33 """ |
23 # The most efficient 16-bit one on this platform | 34 # The most efficient 16-bit one on this platform |
24 encoding = "UTF-16" + sys.byteorder[0].upper() + "E" | 35 encoding = "UTF-16" + sys.byteorder[0].upper() + "E" |
25 codec = codecs.lookup(encoding) | 36 codec = codecs.lookup(encoding) |
37 _ERRORS = 'surrogatepass' | |
38 _MIN_S = 0xd800 # lowest possible surrogate | |
39 _MID_S = 0xdc00 # high surrogate if <, low if >= | |
40 _MAX_S = 0xdfff # highest possible surrogate | |
26 | 41 |
27 def __init__(self, based_on=None): | 42 def __init__(self, based_on=None): |
28 if isinstance(based_on, array.array): | 43 if isinstance(based_on, array.array): |
29 if based_on.typecode == 'H': | 44 if based_on.typecode == 'H': |
30 self.buffer = based_on | 45 self.buffer = based_on |
31 else: | 46 else: |
32 self.buffer = array.array('H', based_on) | 47 self.buffer = array.array('H', based_on) |
33 elif isinstance(based_on, str): | 48 elif isinstance(based_on, str): |
34 # A string should always be able to encode to runes. | 49 self.buffer = array.array('H', self.codec.encode(based_on, self._ERRORS)[0]) |
35 self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0]) | |
36 elif based_on is None: | 50 elif based_on is None: |
37 self.buffer = array.array('H', bytes()) | 51 self.buffer = array.array('H') |
38 elif isinstance(based_on, Runes): | 52 elif isinstance(based_on, Runes): |
39 self.buffer = array.array('H', based_on.buffer) | 53 self.buffer = array.array('H', based_on.buffer) |
40 else: | 54 else: |
41 self.buffer = array.array('H', based_on) | 55 self.buffer = array.array('H', based_on) |
42 | 56 |
43 def __str__(self): | 57 def __str__(self): |
44 """ | 58 """ |
45 Convert this object to a string. We deliberately do not have a | 59 Convert this object to a string. We deliberately do not have a |
46 __repr__ method, to underscore that runes are not strings. | 60 __repr__ method, to underscore that runes are not strings. |
47 """ | 61 """ |
48 # Runes might not always be able to decode to a string. | 62 return self.codec.decode(self.buffer, self._ERRORS)[0] |
49 return self.codec.decode(self.buffer, 'replace')[0] | |
50 | 63 |
51 def __bytes__(self): | 64 def __bytes__(self): |
52 return bytes(self.buffer) | 65 return bytes(self.buffer) |
53 | 66 |
54 def __len__(self): | 67 def __len__(self): |
55 return len(self.buffer) | 68 return len(self.buffer) |
56 | 69 |
70 def _checkindex(self, index, allow_equal=False): | |
71 ok = 0 <= index <= len(self) if allow_equal else 0 <= index < len(self) | |
72 if not ok: | |
73 raise IndexError("index {0} out of range".format(index)) | |
74 | |
75 def _checktype(self, other): | |
76 if not isinstance(other, Runes): | |
77 raise TypeError("Runes required") | |
78 | |
57 def __lt__(self, other): | 79 def __lt__(self, other): |
80 self._checktype(other) | |
58 return self.buffer < other.buffer | 81 return self.buffer < other.buffer |
59 | 82 |
60 def __le__(self, other): | 83 def __le__(self, other): |
84 self._checktype(other) | |
61 return self.buffer <= other.buffer | 85 return self.buffer <= other.buffer |
62 | 86 |
63 def __gt__(self, other): | 87 def __gt__(self, other): |
88 self._checktype(other) | |
64 return self.buffer > other.buffer | 89 return self.buffer > other.buffer |
65 | 90 |
66 def __ge__(self, other): | 91 def __ge__(self, other): |
92 self._checktype(other) | |
67 return self.buffer >= other.buffer | 93 return self.buffer >= other.buffer |
68 | 94 |
69 def __eq__(self, other): | 95 def __eq__(self, other): |
96 self._checktype(other) | |
70 return self.buffer == other.buffer | 97 return self.buffer == other.buffer |
71 | 98 |
72 def __ne__(self, other): | 99 def __ne__(self, other): |
100 self._checktype(other) | |
73 return self.buffer != other.buffer | 101 return self.buffer != other.buffer |
74 | 102 |
75 def __hash__(self): | 103 def __hash__(self): |
76 raise TypeError("unhashable type") | 104 raise TypeError("unhashable type") |
77 | 105 |
78 def __bool__(self): | 106 def __bool__(self): |
79 return bool(self.buffer) | 107 return bool(self.buffer) |
80 | 108 |
81 def __getitem__(self, key): | 109 def __getitem__(self, key): |
82 ret = self.buffer[key] | 110 if isinstance(key, int): |
83 if isinstance(ret, array.array): | 111 return self.buffer[key] |
84 return Runes(ret) | 112 elif isinstance(key, slice): |
85 else: | 113 return Runes(self.buffer[key]) |
86 return ret | 114 else: |
115 raise AssertionError("this shouldn't happen") | |
87 | 116 |
88 def __setitem__(self, key, value): | 117 def __setitem__(self, key, value): |
89 if isinstance(key, int): | 118 if isinstance(key, int): |
90 if isinstance(value, int): | 119 if isinstance(value, int): |
91 self.buffer[key] = value | 120 self.buffer[key] = value |
92 else: | 121 else: |
93 raise TypeError("integer required") | 122 raise TypeError("integer required") |
94 elif isinstance(value, Runes): | 123 elif isinstance(value, Runes): |
95 self.buffer[key] = value.buffer | 124 self.buffer[key] = value.buffer |
96 else: | 125 else: |
97 raise TypeError("runes required") | 126 raise TypeError("Runes required") |
98 | 127 |
99 def __delitem__(self, key): | 128 def __delitem__(self, key): |
100 del self.buffer[key] | 129 del self.buffer[key] |
101 | 130 |
102 def clear(self): | 131 def clear(self): |
132 """ | |
133 Remove all data from our buffer. This merely marks the buffer as | |
134 empty; it does nothing to destroy its contents by overwriting. | |
135 """ | |
103 del self[:] | 136 del self[:] |
137 | |
138 def zero(self): | |
139 """ | |
140 Overwrite our buffer with zeroes. | |
141 """ | |
142 for i in range(len(self.buffer)): | |
143 self.buffer[i] = 0 | |
104 | 144 |
105 def __iter__(self): | 145 def __iter__(self): |
106 return iter(self.buffer) | 146 return iter(self.buffer) |
107 | 147 |
108 def __reversed__(self): | 148 def __reversed__(self): |
109 return reversed(self.buffer) | 149 return reversed(self.buffer) |
110 | 150 |
111 def append(self, value): | 151 def append(self, value): |
152 """ | |
153 Append data to our buffer. | |
154 """ | |
112 if isinstance(value, int): | 155 if isinstance(value, int): |
113 self.buffer.append(value) | 156 self.buffer.append(value) |
114 elif isinstance(value, Runes): | 157 elif isinstance(value, Runes): |
115 self.buffer.extend(value.buffer) | 158 self.buffer.extend(value.buffer) |
116 else: | 159 else: |
117 raise TypeError("integer or runes required") | 160 raise TypeError("integer or Runes required") |
118 | 161 |
119 def __contains__(self, value): | 162 def __contains__(self, value): |
120 return value in self.buffer | 163 return value in self.buffer |
121 | 164 |
122 def index(self, value): | 165 @classmethod |
123 return self.buffer.index(value) | 166 def is_high_surrogate(cls, value): |
124 | 167 """ |
125 def find(self, value): | 168 Is value in the UTF-16 high surrogate range? |
169 """ | |
170 return cls._MIN_S <= value < cls._MID_S | |
171 | |
172 @classmethod | |
173 def is_low_surrogate(cls, value): | |
174 """ | |
175 Is value in the UTF-16 low surrogate range? | |
176 """ | |
177 return cls._MIN_S <= value <= cls._MAX_S | |
178 | |
179 @classmethod | |
180 def is_surrogate(cls, value): | |
181 """ | |
182 Is value in the UTF-16 surrogate range? | |
183 """ | |
184 return cls._MIN_S <= value <= cls._MAX_S | |
185 | |
186 def index(self, value, from_index=0): | |
187 """ | |
188 Substring index, throws exception if not found. | |
189 """ | |
190 self._checktype(value) | |
191 slimit = len(self) | |
192 rlimit = len(value) | |
193 for i in range(from_index, len(self)): | |
194 match = True | |
195 for j in range(rlimit): | |
196 k = i + j | |
197 if k >= slimit or value.buffer[j] != self.buffer[k]: | |
198 match = False | |
199 break | |
200 if match: | |
201 return i | |
202 raise ValueError("substring not found") | |
203 | |
204 def find(self, value, from_index=0): | |
205 """ | |
206 Substring index, returns -1 if not found. | |
207 """ | |
126 try: | 208 try: |
127 return self.index(value) | 209 return self.index(value, from_index) |
128 except ValueError: | 210 except ValueError: |
129 return -1 | 211 return -1 |
130 | 212 |
213 def rindex(self, value, from_index=None): | |
214 """ | |
215 Reverse substring index, throws exception if not found. | |
216 """ | |
217 self._checktype(value) | |
218 if from_index is None: | |
219 from_index = len(self) - 1 | |
220 rfrom = len(value) - 1 | |
221 for i in range(from_index, -1, -1): | |
222 match = True | |
223 for j in range(rfrom, -1, -1): | |
224 k = i - (rfrom - j) | |
225 if k < 0 or value.buffer[j] != self.buffer[k]: | |
226 match = False | |
227 break | |
228 if match: | |
229 return i - rfrom | |
230 raise ValueError("substring not found") | |
231 | |
232 def rfind(self, value, from_index=None): | |
233 """ | |
234 Reverse substring index, returns -1 if not found. | |
235 """ | |
236 try: | |
237 return self.rindex(value, from_index) | |
238 except ValueError: | |
239 return -1 | |
240 | |
131 class Workspace(Runes): | 241 class Workspace(Runes): |
132 """ | 242 """ |
133 A Runes object that acts a bit more string-like, in that __setitem__ | 243 A Runes object (q.v.) that acts a bit more string-like. |
134 also accepts a string as an argument and __getitem__ always returns | |
135 a string. We also return empty strings instead of throwing IndexError | |
136 when attempting to read out-of-range values, because that makes life | |
137 easier for us when curling quotes. | |
138 """ | 244 """ |
245 | |
139 def __setitem__(self, key, value): | 246 def __setitem__(self, key, value): |
140 if isinstance(value, str): | 247 if isinstance(value, str): |
141 if isinstance(key, int): | 248 if isinstance(key, int): |
142 Runes.__setitem__(self, key, self._ord(value)) | 249 value = ord(value) |
250 if value > 0xffff: | |
251 raise ValueError("character not in BMP") | |
252 super().__setitem__(key, value) | |
143 else: | 253 else: |
144 Runes.__setitem__(self, key, Runes(value)) | 254 super().__setitem__(key, Runes(value)) |
145 else: | 255 else: |
146 Runes.__setitem__(self, key, value) | 256 super().__setitem__(key, value) |
147 | 257 |
148 def __getitem__(self, key): | 258 def __getitem__(self, key): |
149 view = memoryview(self.buffer) | 259 if isinstance(key, int): |
150 try: | 260 return chr(self.buffer[key]) |
151 result = view[key] | 261 elif isinstance(key, slice): |
152 if isinstance(result, int): | 262 view = memoryview(self.buffer) |
153 return chr(result) | 263 try: |
154 if isinstance(result, memoryview): | 264 result = view[key] |
155 ret = self.codec.decode(result, 'replace')[0] | 265 if not isinstance(result, memoryview): |
266 assert isinstance(result, int) | |
267 return chr(result) | |
268 ret = self.codec.decode(result, self._ERRORS)[0] | |
156 result.release() | 269 result.release() |
157 return ret | 270 return ret |
158 else: | 271 finally: |
159 raise AssertionError("this shouldn't happen") | 272 view.release() |
160 except IndexError: | 273 else: |
161 return "" | 274 raise AssertionError("this shouldn't happen") |
162 finally: | 275 |
163 view.release() | 276 def __contains__(self, value): |
277 if isinstance(value, int): | |
278 return value in self.buffer | |
279 return self.find(value) != -1 | |
280 | |
281 def __iter__(self): | |
282 for i in range(len(self)): | |
283 yield self[i] | |
284 | |
285 def __reversed__(self): | |
286 for i in range(len(self)-1, -1, -1): | |
287 yield self[i] | |
164 | 288 |
165 def append(self, value): | 289 def append(self, value): |
290 """ | |
291 Append string or runes to this item. | |
292 """ | |
166 if isinstance(value, str): | 293 if isinstance(value, str): |
167 Runes.append(self, Runes(value)) | 294 value = Runes(value) |
168 else: | 295 elif not isinstance(value, (int, Runes)): |
169 Runes.append(self, value) | 296 raise TypeError("integer, string, or Runes required") |
170 | 297 super().append(value) |
171 def index(self, value): | 298 |
299 def _runify(self, value): | |
172 if isinstance(value, str): | 300 if isinstance(value, str): |
173 return Runes.index(self, self._ord(value)) | 301 return Runes(value) |
174 else: | 302 elif isinstance(value, Runes): |
175 return Runes.index(self, value) | 303 return value |
176 | 304 else: |
177 def find(self, value): | 305 raise TypeError("Runes or string required") |
306 | |
307 def index(self, value, from_index=0): | |
308 """ | |
309 Substring index, throws exception if not found. | |
310 """ | |
311 return super().index(self._runify(value), from_index) | |
312 | |
313 def find(self, value, from_index=0): | |
314 """ | |
315 Substring index, returns -1 if not found. | |
316 """ | |
178 try: | 317 try: |
179 return self.index(value) | 318 return self.index(value, from_index) |
180 except ValueError: | 319 except ValueError: |
181 return -1 | 320 return -1 |
182 | 321 |
183 def _ord(self, string): | 322 def rindex(self, value, from_index=None): |
184 length = len(string) | 323 """ |
185 if length != 1: | 324 Reverse substring index, throws exception if not found. |
186 raise ValueError("expected a character, but string of length {0} found".format(length)) | 325 """ |
187 raw = Runes(string) | 326 return super().rindex(self._runify(value), from_index) |
188 if len(raw) != 1: | 327 |
189 raise ValueError("character not in BMP") | 328 def rfind(self, value, from_index=None): |
190 return raw[0] | 329 """ |
330 Reverse substring index, returns -1 if not found. | |
331 """ | |
332 try: | |
333 return self.rindex(value, from_index) | |
334 except ValueError: | |
335 return -1 | |
336 | |
337 def _code_point_at(self, index): | |
338 self._checkindex(index, allow_equal=False) | |
339 v0 = self.buffer[index] | |
340 i1 = index + 1 | |
341 v1 = None if i1 >= len(self.buffer) else self.buffer[i1] | |
342 if Runes.is_high_surrogate(v0) and v1 is not None and Runes.is_low_surrogate(v1): | |
343 return slice(index, index+2) | |
344 else: | |
345 return slice(index, i1) | |
346 | |
347 def code_point_at(self, index): | |
348 """ | |
349 Similar to java.lang.String.codePointAt, but returns a 1-character | |
350 string, not an int. | |
351 """ | |
352 return self[self._code_point_at(index)] | |
353 | |
354 def _code_point_before(self, index): | |
355 self._checkindex(index - 1, allow_equal=True) | |
356 i1 = index - 1 | |
357 v1 = self.buffer[i1] | |
358 i2 = index - 2 | |
359 v2 = None if i2 < 0 else self.buffer[i2] | |
360 if Runes.is_low_surrogate(v1) and v2 is not None and Runes.is_high_surrogate(v2): | |
361 return slice(i2, index) | |
362 else: | |
363 return slice(i1, index) | |
364 | |
365 def code_point_before(self, index): | |
366 """ | |
367 Similar to java.lang.String.codePointBefore but returns a 1-character | |
368 string, not an int. | |
369 """ | |
370 return self[self._code_point_before(index)] | |
371 | |
372 def code_point_count(self, begin=None, end=None): | |
373 """ | |
374 Same behavior as java.lang.String.codePointCount (q.v.). | |
375 """ | |
376 if begin is None: begin = 0 | |
377 if end is None: end = len(self.buffer) | |
378 self._checkindex(begin, allow_equal=False) | |
379 self._checkindex(end, allow_equal=True) | |
380 if begin > end: | |
381 raise IndexError("invalid range (begin > end)") | |
382 i = begin | |
383 ret = 0 | |
384 while i < end: | |
385 i = self._code_point_at(i).stop | |
386 ret += 1 | |
387 return ret | |
388 | |
389 def offset_by_code_points(self, index, offset): | |
390 """ | |
391 Same behavior as java.lang.String.offsetByCodePoints (q.v.). | |
392 """ | |
393 self._checkindex(index, allow_equal=True) | |
394 if offset < 0: | |
395 next = lambda i: self._code_point_before(i).start | |
396 else: | |
397 next = lambda i: self._code_point_at(i).stop | |
398 seen = 0 | |
399 limit = abs(offset) | |
400 while seen < limit: | |
401 index = next(index) | |
402 seen += 1 | |
403 return index |