Mercurial > cgi-bin > hgweb.cgi > curlyq
annotate curlers.py @ 9:84adbbb69a9d
Added tag v1_workspace for changeset 05363e803272
author | David Barts <n5jrn@me.com> |
---|---|
date | Fri, 27 Dec 2019 09:51:26 -0800 |
parents | 9df9ff8cecde |
children | 397c178c5b98 |
rev | line source |
---|---|
3 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # Classes for curling both HTML and plain text. | |
5 | |
6 # I m p o r t s | |
7 | |
8 import os, sys | |
9 from workspace import Workspace | |
10 | |
11 # V a r i a b l e s | |
12 | |
13 # Quote types | |
# Quote types.  The typographic apostrophe is the same code point as the
# right single quotation mark, so APOS is simply an alias for RSQUO.
LSQUO = "\u2018"
RSQUO = "\u2019"
APOS = RSQUO
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe. Cribbed from Wordpress.
# Entries are lower-case; text is lower-cased before being compared to them.
_ASTART = [
    "'tain't",
    "'twere",
    "'twas",
    "'tis",
    "'twill",
    "'til",
    "'bout",
    "'nuff",
    "'round",
    "'cause",
    "'em",
]
22 | |
23 # HTML tags that enclose raw data | |
7
9df9ff8cecde
Undo that; ignoring <pre> is a sticky wicket.
David Barts <n5jrn@me.com>
parents:
6
diff
changeset
|
# Elements whose content is raw data (not prose), keyed by lower-case tag name.
_RAW = {"script", "style"}

# HTML block elements
# Lower-case tag names of the elements treated as block delimiters.
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "noscript", "ol", "p", "pre", "table", "ul",
}
32 | |
33 # F u n c t i o n s | |
34 | |
def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    Every curly double quote (LDQUO, RDQUO) is replaced in place by '"'
    and every curly single quote (LSQUO, RSQUO) by "'".  Nothing is
    returned; ws itself is modified.

    ws must support len(), ws[i] and ws[i] = ch.
    """
    # Build the lookup sets once instead of on every iteration.
    doubles = {LDQUO, RDQUO}
    singles = {LSQUO, RSQUO}
    # Index-based loop on purpose: only len() and item get/set are
    # required of ws, and entries are rewritten while walking.
    for i in range(len(ws)):
        ch = ws[i]
        if ch in doubles:
            ws[i] = '"'
        elif ch in singles:
            ws[i] = "'"
45 | |
def _is_cockney(pos, ws):
    """
    Tell whether the text at ws[pos] starts one of the _ASTART words.

    pos is the index of a "'" character in ws, and ws is the text being
    examined (it must support len(), indexing, and slicing that yields
    something with a .lower() method).  A word matches when the text at
    pos equals it case-insensitively and is not followed by a letter, so
    that a longer word merely beginning with the same letters does not
    count.  Reaching the end of ws counts as "not followed by a letter".

    Returns True on a match, False otherwise.
    """
    # NOTE: the arguments are used as passed.  An earlier version
    # overwrote them from an undefined `self`, which raised NameError.
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # Guard the look-ahead so a word at the very end of ws matches
        # instead of indexing past the end.
        if end >= len(ws) or not ws[end].isalpha():
            return True
    return False
53 | |
54 # C l a s s e s | |
55 | |
class BaseCurler:
    """
    Common base for the curlers.  Subclasses override feed().
    """

    def feed(self):
        """Unimplemented here; always raises NotImplementedError."""
        raise NotImplementedError
59 | |
class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    The curler is a small state machine driven by feed().  The current
    state is a bound method stored in self._state and is called once per
    character position; it rewrites straight quotes in the workspace in
    place.  States:

      _norm     outside any quotation
      _seen_ld  inside a double-quoted passage
      _seen_ls  inside a single-quoted passage nested in a double-quoted one
    """
    def __init__(self, workspace):
        # workspace must support len(), item get/set and slicing.
        # NOTE(review): presumably a workspace.Workspace -- confirm.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curl the quotes in the whole workspace, in place, starting from
        the _norm state.
        """
        self._pos = 0
        self._state = self._norm
        # self._pos is the loop variable so the state methods can read it.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, index):
        """
        Return the workspace character at index, or "" when index is out
        of range.  str predicates such as isalpha() are False for "", so
        callers can look one character ahead without a bounds check.
        """
        ws = self.workspace
        if 0 <= index < len(ws):
            return ws[index]
        return ""

    def _is_cockney(self):
        """
        Return True if the text at the current position starts one of the
        _ASTART words (compared case-insensitively) and that word is not
        followed by a letter; otherwise return False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() == word and not self._char_at(end).isalpha():
                return True
        return False

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            # pos >= 1 here: this state is only entered after an earlier
            # character was processed, so ws[pos-1] never wraps around.
            if ws[pos-1].isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: inside a single-quoted passage within double quotes."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            following = self._char_at(pos + 1)
            if ws[pos-1].isalpha() and following.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif following.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif ws[pos-1].isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
134 | |
class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    The curler is a small state machine driven by feed().  The current
    state is a bound method stored in self._state and is called once per
    character position; it rewrites straight quotes in the workspace in
    place.  States:

      _norm     outside any quotation
      _seen_ld  inside a double-quoted passage
      _seen_ls  inside a single-quoted passage nested in a double-quoted one
      _seen_lt  inside a tag, i.e. after "<" and before the next ">"
      _raw      inside an element listed in _RAW; left only on its end tag

    Quotes are not touched while in _seen_lt or _raw.
    """
    def __init__(self, workspace):
        # workspace must support len(), item get/set and slicing.
        # NOTE(review): presumably a workspace.Workspace -- confirm.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0        # index of the "<" that opened the current tag
        self._endtag = None    # "</name" that terminates the current raw element
        self._ltstate = None   # state to resume after a non-block tag

    def feed(self):
        """
        Curl the quotes in the whole workspace, in place, starting from
        the _norm state.
        """
        self._pos = 0
        self._state = self._norm
        # self._pos is the loop variable so the state methods can read it.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, index):
        """
        Return the workspace character at index, or "" when index is out
        of range.  str predicates such as isalpha() are False for "", so
        callers can look one character ahead without a bounds check.
        """
        ws = self.workspace
        if 0 <= index < len(ws):
            return ws[index]
        return ""

    def _is_cockney(self):
        """
        Return True if the text at the current position starts one of the
        _ASTART words (compared case-insensitively) and that word is not
        followed by a letter; otherwise return False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() == word and not self._char_at(end).isalpha():
                return True
        return False

    def _goto_lt(self):
        """
        Enter the in-tag state, remembering where the tag started and
        which state to come back to.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return the lower-cased run of alphanumeric characters beginning
        at index start (the tag name).
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        # Bounded so a name running up to the end of the text cannot
        # index past it.
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag.  Nothing happens until the closing ">" is
        reached; the tag name then decides the next state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] == ">":
            start = self._ltpos + 1
            if ws[start] == '/':
                # End tag: a block end starts afresh, anything else
                # resumes the state that was active before the tag.
                if self._gettag(start + 1) in _BLOCK:
                    self._state = self._norm
                else:
                    self._state = self._ltstate
            else:
                tag = self._gettag(start)
                if tag in _BLOCK:
                    self._state = self._norm
                elif tag in _RAW:
                    self._state = self._raw
                    self._endtag = "</" + tag
                else:
                    self._state = self._ltstate

    def _raw(self):
        """State: inside a raw-data element; quotes are left alone."""
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if (ws[pos] == '<' and ws[pos:end].lower() == self._endtag
                and not self._char_at(end).isalnum()):
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """State: inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            # pos >= 1 here: this state is only entered after an earlier
            # character was processed, so ws[pos-1] never wraps around.
            if ws[pos-1].isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: inside a single-quoted passage within double quotes."""
        # NOTE: this method must take self; it is stored in self._state
        # and invoked as a bound method by feed().
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            following = self._char_at(pos + 1)
            if ws[pos-1].isalpha() and following.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif following.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif ws[pos-1].isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld