Mercurial > cgi-bin > hgweb.cgi > curlyq
comparison workspace.py @ 0:984876b6a095
Initial commit of first two classes.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 08:09:11 -0800 |
parents | |
children | 173e86601dbc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:984876b6a095 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # A class that implements a workspace for curly-quoting a text. This is enough | |
5 # like a string that it can be accessed via subscripts and ranges, and enough | |
6 # like a TextIOBase object that it can be written to much like a stream. | |
7 # (However, a Workspace is neither a string nor a TextIOBase object.) | |
8 # | |
9 # The advantage of using UTF-16 (as we do here) is that all quotation marks | |
10 # of interest are represented in a single 16-bit value, so changing straight | |
11 # quotes to curly ones can be accomplished most easily. | |
12 # | |
13 # It was a deliberate design decision to return empty strings when reading | |
14 # out-of-range indices but to throw exceptions when attempting to write | |
15 # them, because both decisions made coding easier in other modules. | |
16 | |
17 # I m p o r t s | |
18 | |
19 import os, sys | |
20 import io | |
21 import codecs | |
22 | |
23 # V a r i a b l e s | |
24 | |
25 # C l a s s e s | |
26 | |
class Workspace(object):
    """
    A seekable, writable text buffer held internally as fixed-width UTF-16
    code units (two bytes each, native byte order) in an io.BytesIO.

    Characters can be read with subscripts and slices and overwritten one
    at a time with subscript assignment; text can also be appended or
    overwritten stream-style with write().  Reading out-of-range indices
    yields an empty string, while writing to them raises IndexError.

    Note that seek() and tell() work in *byte* offsets of the underlying
    buffer (two bytes per 16-bit unit), whereas read(), len() and
    subscripts work in 16-bit units.
    """
    # The most efficient 16-bit one on this platform
    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
    codec = codecs.lookup(encoding)
    # Errors should never happen; UTF-16 can represent all Unicode characters
    errors = 'strict'

    def __init__(self, initial_data=None):
        """
        Constructor.  If initial_data (a string) is supplied, the buffer
        starts out holding it, with the stream position at the beginning.
        """
        # Cached length in 16-bit units; only trusted when _dirty is False.
        self._length = 0
        if initial_data is not None:
            data = initial_data.encode(self.encoding, self.errors)
            self._fp = io.BytesIO(data)
            self._dirty = True
        else:
            self._fp = io.BytesIO()
            self._dirty = False

    def close(self):
        """
        Causes our buffer to be discarded and this workspace to become
        unusable.
        """
        self._fp.close()

    def flush(self):
        """
        Does nothing, but allowed.
        """
        pass

    def seek(self, offset, whence=io.SEEK_SET):
        """
        Seeks within the underlying byte buffer and returns the new
        position.  The offset is in bytes, i.e. twice the character index.
        """
        return self._fp.seek(offset, whence)

    def tell(self):
        """
        Returns current position, as a byte offset into the buffer.
        """
        return self._fp.tell()

    def read(self, nchars=None):
        """
        Read up to nchars 16-bit characters from the current position, or
        everything remaining if nchars is None or negative.
        XXX - might return replacement chars from surrogate fragments.
        """
        if nchars is not None and nchars >= 0:
            # Each character occupies two bytes in the buffer.
            nchars *= 2
        return self._fp.read(nchars).decode(self.encoding, "replace")

    def write(self, string):
        """
        Write characters at the current position, overwriting and/or
        extending the buffer as a stream would.
        """
        self._fp.write(string.encode(self.encoding, self.errors))
        # The buffer may have grown, so the cached length can no longer be
        # trusted; __len__ will recompute it on demand.
        self._dirty = True

    def __len__(self):
        """
        Length as a string (number of 16-bit units).
        """
        if self._dirty:
            # Measure by seeking to the end, then restore the position.
            back = self.tell()
            self._length = self.seek(0, io.SEEK_END) // 2
            self.seek(back)
            self._dirty = False
        return self._length

    def __getitem__(self, key):
        """
        Direct access to a single character or range of characters.  We do
        not support negative indices.  Return value is based on what's most
        useful for curling quotes: anything out of range reads as an empty
        string rather than raising.
        XXX - might return replacement chars from surrogate fragments.
        """
        if isinstance(key, int):
            if key < 0 or key >= len(self):
                return ""
            k2 = 2 * key
            key = slice(k2, k2 + 2)
        elif isinstance(key, slice):
            if key.step is not None:
                raise ValueError("__getitem__ does not support steps in slices")
            length = len(self)
            start = 0 if key.start is None else key.start
            stop = length if key.stop is None else key.stop
            # Clamp both ends into [0, length].  Clamping start to length
            # (not length - 1) makes a slice lying wholly past the end
            # come out empty instead of yielding the last character.
            start = max(0, min(length, start))
            stop = max(0, min(length, stop))
            if stop <= start:
                return ""
            key = slice(start * 2, stop * 2)
        else:
            raise TypeError("__getitem__ only supports integers and slices")
        # key is now a byte-offset slice into the underlying buffer.
        return self.codec.decode(self._fp.getbuffer()[key], "replace")[0]

    def __setitem__(self, key, value):
        """
        Direct access to a single character.  We do not support negative
        indices or replacing more than a single character at a time; only
        the first character of value is used, and an empty value is a
        no-op.  Raises TypeError for non-integer keys, IndexError for
        out-of-range ones, and ValueError for characters outside the BMP.
        XXX - only works on characters in the BMP.
        """
        if not isinstance(key, int):
            raise TypeError("__setitem__ only supports integers")
        if key < 0 or key >= len(self):
            raise IndexError("index {0} out of range".format(key))
        if not value:
            return
        start = key * 2
        end = start + 2
        encoded = value[0].encode(self.encoding, self.errors)
        # A non-BMP character needs a surrogate pair (four bytes), which
        # cannot replace a single two-byte slot in place.
        if len(encoded) != 2:
            raise ValueError("{0!r} not in BMP".format(value[0]))
        self._fp.getbuffer()[start:end] = encoded

    def __del__(self):
        """
        Equivalent to .close().
        """
        # _fp is missing if __init__ raised before creating the buffer;
        # don't turn that into a second, spurious error at finalization.
        fp = getattr(self, "_fp", None)
        if fp is not None:
            fp.close()

    def getvalue(self):
        """
        Gets the string represented by this workspace.
        """
        return self.codec.decode(self._fp.getbuffer(), self.errors)[0]

    def __enter__(self):
        """
        Context manager.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context manager: close on exit.  Exceptions are not suppressed.
        """
        self.close()
        return False