Mercurial > cgi-bin > hgweb.cgi > curlyq
annotate curlers.py @ 14:152f6aa87d62
Plug a memoryview leak.
author | David Barts <n5jrn@me.com> |
---|---|
date | Fri, 27 Dec 2019 13:30:44 -0800 |
parents | 397c178c5b98 |
children | b2dab0667ec2 |
rev | line source |
---|---|
3 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # Classes for curling both HTML and plain text. | |
5 | |
6 # I m p o r t s | |
7 | |
8 import os, sys | |
10 | 9 from runes import Workspace |
3 | 10 |
11 # V a r i a b l e s | |
12 | |
# Quote types, as rune values (Unicode code points)
LSQUO = 0x2018          # left single quotation mark
APOS = RSQUO = 0x2019   # right single quotation mark doubles as apostrophe
LDQUO = 0x201c          # left double quotation mark
RDQUO = 0x201d          # right double quotation mark

# Words that start with an apostrophe. Cribbed from Wordpress.
# Entries are lower case; the text is lower-cased before it is compared.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul",
}
32 | |
33 # F u n c t i o n s | |
34 | |
def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    Each curly double quote in ws is overwritten in place with '"' and
    each curly single quote (or apostrophe) with "'".  Nothing is
    returned; ws must support len(), indexing and item assignment.
    """
    # Build the lookup sets once instead of twice per element.
    # NOTE(review): the element type ws yields is not visible from here.
    # The quote constants are integer code points, while the curler
    # classes compare elements against one-character strings, so both
    # spellings are accepted -- confirm against runes.Workspace.
    doubles = {LDQUO, RDQUO, chr(LDQUO), chr(RDQUO)}
    singles = {LSQUO, RSQUO, chr(LSQUO), chr(RSQUO)}
    for i in range(len(ws)):
        ch = ws[i]
        if ch in doubles:
            ws[i] = '"'
        elif ch in singles:
            ws[i] = "'"
45 | |
def _is_cockney(pos, ws):
    """
    Report whether the text at ws[pos:] starts with one of the _ASTART
    words (words that begin with an apostrophe, such as 'tis).

    The comparison is case-insensitive, and the word must not be followed
    by a letter.  Reaching the end of the workspace counts as "not
    followed by a letter".  Returns True or False.
    """
    limit = len(ws)
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # Only look at the following element when there is one; a match
        # that runs up to the end of the workspace is a complete word.
        if end >= limit or not ws[end].isalpha():
            return True
    return False
51 | |
52 # C l a s s e s | |
53 | |
class BaseCurler:
    """
    Base class for the curlers in this module.  It only defines the
    feed() entry point, which subclasses are expected to override.
    """
    def feed(self):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError()
57 | |
58 class TextCurler(BaseCurler): | |
59 """ | |
60 For processing plain text. Assumes the entire text is a block; it is | |
61 the responsibility of the caller to break the input into paragraphs. | |
62 """ | |
63 def __init__(self, workspace): | |
64 self.workspace = workspace | |
65 self._state = self._norm | |
66 self._pos = 0 | |
67 | |
68 def feed(self): | |
69 self._pos = 0 | |
70 self._state = self._norm | |
71 for self._pos in range(len(self.workspace)): | |
72 self._state() | |
73 | |
74 def _is_cockney(self): | |
75 pos = self._pos | |
76 ws = self.workspace | |
77 for i in _ASTART: | |
78 li = len(i) | |
79 print("comparing {0!r} and {1!r}\n".format(ws[pos:pos+li].lower(), i)) | |
80 if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): | |
81 return True | |
82 | |
83 def _norm(self): | |
84 pos = self._pos | |
85 ws = self.workspace | |
86 char = ws[pos] | |
87 if char == "\"": | |
88 # opening double quote | |
89 ws[pos] = LDQUO | |
90 self._state = self._seen_ld | |
91 elif char == "'": | |
92 # in this state, ' is always an apostrophe | |
93 ws[pos] = APOS | |
94 | |
95 def _seen_ld(self): | |
96 pos = self._pos | |
97 ws = self.workspace | |
98 char = ws[pos] | |
99 if char == "\"": | |
100 # closing double quote | |
101 ws[pos] = RDQUO | |
102 self._state = self._norm | |
103 elif char == "'": | |
104 if ws[pos-1].isalpha(): | |
105 # either an inter-word, or an end of word, apostrophe | |
106 ws[pos] = APOS | |
107 elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): | |
108 # also an apostrophe | |
109 ws[pos] = APOS | |
110 else: | |
111 # opening single quote | |
112 ws[pos] = LSQUO | |
113 self._state = self._seen_ls | |
114 | |
115 def _seen_ls(self): | |
116 pos = self._pos | |
117 ws = self.workspace | |
118 if ws[pos] == "'": | |
119 if ws[pos-1].isalpha() and ws[pos+1].isalpha(): | |
120 # obvious apostrophe | |
121 ws[pos] = APOS | |
122 elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): | |
123 # also an apostrophe | |
124 ws[pos] = APOS | |
125 elif ws[pos-1].isspace(): | |
126 # start of word apostrophe | |
127 ws[pos] = APOS | |
128 else: | |
129 # closing single quote | |
130 ws[pos] = RSQUO | |
131 self._state = self._seen_ld | |
132 | |
133 class HtmlCurler(BaseCurler): | |
134 """ | |
135 For processing HTML. Uses HTML block tags to delimit blocks. | |
136 """ | |
137 def __init__(self, workspace): | |
138 self.workspace = workspace | |
139 self._state = self._norm | |
140 self._pos = 0 | |
141 self._ltpos = 0 | |
142 self._endtag = None | |
143 self._ltstate = None | |
144 | |
145 def feed(self): | |
146 self._pos = 0 | |
147 self._state = self._norm | |
148 for self._pos in range(len(self.workspace)): | |
149 self._state() | |
150 | |
151 def _is_cockney(self): | |
152 pos = self._pos | |
153 ws = self.workspace | |
154 for i in _ASTART: | |
155 li = len(i) | |
156 print("comparing {0!r} and {1!r}\n".format(ws[pos:pos+li].lower(), i)) | |
157 if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): | |
158 return True | |
159 | |
160 def _goto_lt(self): | |
161 self._ltpos = self._pos | |
162 self._ltstate = self._state | |
163 self._state = self._seen_lt | |
164 | |
165 def _norm(self): | |
166 pos = self._pos | |
167 ws = self.workspace | |
168 char = ws[pos] | |
169 if char == "<": | |
170 self._goto_lt() | |
171 elif char == "\"": | |
172 # opening double quote | |
173 ws[pos] = LDQUO | |
174 self._state = self._seen_ld | |
175 elif char == "'": | |
176 # in this state, ' is always an apostrophe | |
177 ws[pos] = APOS | |
178 | |
179 def _gettag(self, start): | |
180 ws = self.workspace | |
181 end = start | |
182 while ws[end].isalnum(): | |
183 end += 1 | |
184 return ws[start:end].lower() | |
185 | |
186 def _seen_lt(self): | |
187 pos = self._pos | |
188 ws = self.workspace | |
189 if ws[pos] == ">": | |
190 start = self._ltpos + 1 | |
191 if ws[start] == '/': | |
192 if self._gettag(start + 1) in _BLOCK: | |
193 self._state = self._norm | |
194 else: | |
195 self._state = self._ltstate | |
196 else: | |
197 tag = self._gettag(start) | |
198 if tag in _BLOCK: | |
199 self._state = self._norm | |
200 elif tag in _RAW: | |
201 self._state = self._raw | |
202 self._endtag = "</" + tag | |
203 else: | |
204 self._state = self._ltstate | |
205 | |
206 def _raw(self): | |
207 pos = self._pos | |
208 ws = self.workspace | |
209 end = pos + len(self._endtag) | |
210 # only a matching end tag gets us out of the raw state | |
211 if ws[pos] == '<' and ws[pos:end].lower() == self._endtag and (not ws[end].isalnum()): | |
212 self._ltpos = pos | |
7
9df9ff8cecde
Undo that; ignoring <pre> is a sticky wicket.
David Barts <n5jrn@me.com>
parents:
6
diff
changeset
|
213 self._state = self._seen_lt |
3 | 214 |
215 def _seen_ld(self): | |
216 pos = self._pos | |
217 ws = self.workspace | |
218 char = ws[pos] | |
219 if char == "<": | |
220 self._goto_lt() | |
221 elif char == "\"": | |
222 # closing double quote | |
223 ws[pos] = RDQUO | |
224 self._state = self._norm | |
225 elif char == "'": | |
226 if ws[pos-1].isalpha(): | |
227 # either an inter-word, or an end of word, apostrophe | |
228 ws[pos] = APOS | |
229 elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): | |
230 # also an apostrophe | |
231 ws[pos] = APOS | |
232 else: | |
233 # opening single quote | |
234 ws[pos] = LSQUO | |
235 self._state = self._seen_ls | |
236 | |
237 def _seen_ls(): | |
238 pos = self._pos | |
239 ws = self.workspace | |
240 char = ws[pos] | |
241 if char == "<": | |
242 self._goto_lt() | |
243 elif char == "'": | |
244 if ws[pos-1].isalpha() and ws[pos+1].isalpha(): | |
245 # obvious apostrophe | |
246 ws[pos] = APOS | |
247 elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): | |
248 # also an apostrophe | |
249 ws[pos] = APOS | |
250 elif ws[pos-1].isspace(): | |
251 # start of word apostrophe | |
252 ws[pos] = APOS | |
253 else: | |
254 # closing single quote | |
255 ws[pos] = RSQUO | |
256 self._state = self._seen_ld |