Mercurial > cgi-bin > hgweb.cgi > curlyq
comparison curlers.py @ 3:091c03f1b2e8
Getting it working...
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 19:54:45 -0800 |
parents | |
children | 7a83e82e65a6 |
comparison
equal
deleted
inserted
replaced
2:8884b0bf779d | 3:091c03f1b2e8 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # Classes for curling both HTML and plain text. | |
5 | |
6 # I m p o r t s | |
7 | |
8 import os, sys | |
9 from workspace import Workspace | |
10 | |
11 # V a r i a b l e s | |
12 | |
# Quote types
LSQUO = "\u2018"
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe.  Cribbed from WordPress.
# Entries must be lowercase: candidates are lowercased before being
# compared against this list.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul",
}
32 | |
33 # F u n c t i o n s | |
34 | |
def _is_cockney(pos, ws):
    """
    Report whether the text at pos begins one of the apostrophe-initial
    words listed in _ASTART (such as 'tis or 'em).

    pos is the index of the candidate apostrophe.  ws is the text being
    examined; it must support len(), indexing and slicing, and its slices
    must offer lower().  The comparison ignores case, and a match only
    counts when the word is not immediately followed by another letter.

    Returns True on a match, False otherwise.
    """
    size = len(ws)
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # A word that runs up to the very end of the text still counts;
        # indexing ws[end] there would raise IndexError.
        if end >= size or not ws[end].isalpha():
            return True
    return False
42 | |
43 # C l a s s e s | |
44 | |
class BaseCurler():
    """
    Common base class of the curlers.  Subclasses are expected to
    override feed().
    """
    def feed(self):
        """
        Process the text.  Not implemented at this level: always raises
        NotImplementedError, naming the class that failed to override it.
        """
        raise NotImplementedError(
            "{0}.feed() is not implemented".format(type(self).__name__))
48 | |
class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    The workspace is edited in place, one character at a time: straight
    quotes are overwritten with their curly equivalents.  It must support
    len(), indexing, slicing and item assignment (presumably a
    workspace.Workspace -- confirm against the caller).

    Processing is a small state machine; self._state holds the bound
    method that handles the character at self._pos:
      _norm     outside any quotation
      _seen_ld  inside a double-quoted passage
      _seen_ls  inside a single-quoted passage nested in a double-quoted one
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Run the state machine over the whole workspace, starting from the
        _norm state at position 0.
        """
        self._pos = 0
        self._state = self._norm
        # The state handlers read the position from self._pos, hence the
        # attribute as loop variable.  Every replacement overwrites exactly
        # one item, so the length computed up front stays valid.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, index):
        """
        Return the character at index, or "" when index lies outside the
        workspace.  The str predicates used by the handlers (isalpha,
        isdecimal, isspace) are all False for "", so a quote at either
        edge of the text is treated as having no neighbour instead of
        raising IndexError or wrapping around via a negative index.
        """
        ws = self.workspace
        if 0 <= index < len(ws):
            return ws[index]
        return ""

    def _is_cockney(self):
        """
        Return True if the text at the current position starts one of the
        apostrophe-initial words in _ASTART (compared case-insensitively)
        and that word is not immediately followed by a letter; otherwise
        return False.
        """
        pos = self._pos
        ws = self.workspace
        size = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            # A word ending exactly at the end of the text also matches.
            if ws[pos:end].lower() == word and (
                    end >= size or not ws[end].isalpha()):
                return True
        return False

    def _norm(self):
        """Handle one character while outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """Handle one character while inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(pos - 1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """Handle one character while inside a single-quoted passage."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            before = self._char_at(pos - 1)
            after = self._char_at(pos + 1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
123 | |
class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    The workspace is edited in place, one character at a time: straight
    quotes in the text are overwritten with their curly equivalents, while
    everything between < and > and the contents of raw elements (_RAW) is
    left alone.  The workspace must support len(), indexing, slicing and
    item assignment (presumably a workspace.Workspace -- confirm against
    the caller).

    Processing is a state machine; self._state holds the bound method
    that handles the character at self._pos:
      _norm     outside any quotation
      _seen_ld  inside a double-quoted passage
      _seen_ls  inside a single-quoted passage nested in a double-quoted one
      _seen_lt  inside a tag, waiting for the closing >
      _raw      inside a raw element, waiting for its end tag
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        # position of the < that opened the tag being skipped
        self._ltpos = 0
        # lowercase "</name" prefix that terminates the current raw element
        self._endtag = None
        # state to resume once a non-block tag has been skipped
        self._ltstate = None

    def feed(self):
        """
        Run the state machine over the whole workspace, starting from the
        _norm state at position 0.
        """
        self._pos = 0
        self._state = self._norm
        # The state handlers read the position from self._pos, hence the
        # attribute as loop variable.  Every replacement overwrites exactly
        # one item, so the length computed up front stays valid.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, index):
        """
        Return the character at index, or "" when index lies outside the
        workspace.  The str predicates used by the handlers (isalpha,
        isalnum, isdecimal, isspace) are all False for "", so reads at
        either edge of the text are treated as having no neighbour instead
        of raising IndexError or wrapping around via a negative index.
        """
        ws = self.workspace
        if 0 <= index < len(ws):
            return ws[index]
        return ""

    def _is_cockney(self):
        """
        Return True if the text at the current position starts one of the
        apostrophe-initial words in _ASTART (compared case-insensitively)
        and that word is not immediately followed by a letter; otherwise
        return False.
        """
        pos = self._pos
        ws = self.workspace
        size = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            # A word ending exactly at the end of the text also matches.
            if ws[pos:end].lower() == word and (
                    end >= size or not ws[end].isalpha()):
                return True
        return False

    def _goto_lt(self):
        """
        Enter the tag-skipping state, remembering where the tag started
        and which state to come back to afterwards.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """Handle one character while outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return, in lowercase, the run of alphanumeric characters that
        begins at start (the tag name).  May be the empty string.
        """
        ws = self.workspace
        size = len(ws)
        end = start
        while end < size and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        Handle one character while inside a tag.  Nothing happens until
        the closing > arrives; the tag name then decides the next state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            # end tag: leaving a block element resets the quote state
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
        else:
            tag = self._gettag(start)
            if tag in _BLOCK:
                # entering a block element resets the quote state
                self._state = self._norm
            elif tag in _RAW:
                self._state = self._raw
                self._endtag = "</" + tag
            else:
                self._state = self._ltstate

    def _raw(self):
        """Handle one character while inside a raw element."""
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if (ws[pos] == '<' and ws[pos:end].lower() == self._endtag
                and not self._char_at(end).isalnum()):
            # _ltstate still holds the state saved when the raw element's
            # start tag was met, so _seen_lt resumes from there.
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """Handle one character while inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(pos - 1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """Handle one character while inside a single-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            before = self._char_at(pos - 1)
            after = self._char_at(pos + 1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld