Mercurial > cgi-bin > hgweb.cgi > curlyq
annotate curlers.py @ 29:d5bf9985b5c4 default tip
Add degree symbol, fix bug in HTML curler.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 07 Oct 2021 11:55:46 -0700 |
parents | d3eb798f7e95 |
children |
rev | line source |
---|---|
3 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # Classes for curling both HTML and plain text. | |
5 | |
6 # I m p o r t s | |
7 | |
8 import os, sys | |
10 | 9 from runes import Workspace |
3 | 10 |
11 # V a r i a b l e s | |
12 | |
# Quote types
LSQUO = "\u2018"
# The right single quote doubles as the typographic apostrophe.
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe.  Cribbed from WordPress.
# Entries must stay lowercase: _is_cockney() lowercases the text it
# compares against them.
_ASTART = ["'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
           "'bout", "'nuff", "'round", "'cause", "'em"]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul",
}
32 | |
33 # F u n c t i o n s | |
34 | |
def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    The workspace is modified in place and nothing is returned.  Curly
    double quotes become '"'; curly single quotes become "'".  Because
    APOS is the same character as RSQUO, typographic apostrophes are
    straightened as well.
    """
    # Built once per call instead of once per character.
    straight = {LDQUO: '"', RDQUO: '"', LSQUO: "'", RSQUO: "'"}
    # Index-based on purpose: elements are overwritten while scanning, and
    # len() plus indexing are the only workspace operations relied on here.
    for i in range(len(ws)):
        plain = straight.get(ws[i])
        if plain is not None:
            ws[i] = plain
45 | |
def _is_cockney(pos, ws):
    """
    Tells whether one of the _ASTART words begins at index pos of ws.

    The comparison is case-insensitive, and the word must be followed by
    something that is not a letter (or by the end of ws), so that a longer
    word merely starting with the same letters does not match.

    Returns True on a match, False otherwise.
    """
    size = len(ws)
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # Check the bound before peeking: a word that ends exactly at the
        # end of ws has no following character to inspect.
        if end >= size or not ws[end].isalpha():
            return True
    return False
51 | |
52 # C l a s s e s | |
53 | |
class BaseCurler:
    """
    Common parent of the curlers.  Subclasses must override feed().
    """
    def feed(self):
        """
        Placeholder; always raises NotImplementedError.
        """
        raise NotImplementedError
57 | |
class TextCurler(BaseCurler):
    """
    For processing plain text.  Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    Implemented as a state machine that visits each character once and
    rewrites straight quotes in place.  The states are:

    _norm     outside any quotation
    _seen_ld  inside a double-quoted passage
    _seen_ls  inside a single-quoted passage nested in a double-quoted one
    """
    def __init__(self, workspace):
        # NOTE(review): workspace is presumably a runes.Workspace; all that
        # is used here is len(), indexing, slicing and item assignment.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curls the quotes in the workspace, in place, starting from the
        "outside any quotation" state.
        """
        self._pos = 0
        self._state = self._norm
        # The current index is kept in self._pos so that the state methods
        # can read it without taking arguments.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _next_char(self):
        """
        Returns the character after the current one, or "" if the current
        character is the last one.  All the str predicates used on the
        result are false for "", so the end of the text counts as
        "not a letter, not a digit".
        """
        ws = self.workspace
        nxt = self._pos + 1
        return ws[nxt] if nxt < len(ws) else ""

    def _norm(self):
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            # pos is at least 1 here: this state is only entered after an
            # earlier character has been processed.
            if ws[pos-1].isalnum():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._next_char().isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        pos = self._pos
        ws = self.workspace
        if ws[pos] != "'":
            return
        before = ws[pos-1]
        after = self._next_char()
        if before.isalpha() and after.isalpha():
            # obvious apostrophe
            ws[pos] = APOS
        elif after.isdecimal() or _is_cockney(pos, ws):
            # also an apostrophe
            ws[pos] = APOS
        elif before.isspace():
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote
            ws[pos] = RSQUO
            self._state = self._seen_ld
123 | |
class HtmlCurler(BaseCurler):
    """
    For processing HTML.  Uses HTML block tags to delimit blocks.

    Implemented as a state machine that visits each character once and
    rewrites straight quotes in place.  The states are:

    _norm     outside any quotation
    _seen_ld  inside a double-quoted passage
    _seen_ls  inside a single-quoted passage nested in a double-quoted one
    _seen_lt  between a "<" and its ">"; nothing is rewritten
    _raw      inside an element listed in _RAW; nothing is rewritten

    An opening or closing tag listed in _BLOCK resets the quotation state
    to _norm.  Any other tag resumes the state that was active before it.
    """
    def __init__(self, workspace):
        # NOTE(review): workspace is presumably a runes.Workspace; all that
        # is used here is len(), indexing, slicing and item assignment.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0         # index of the "<" that opened the current tag
        self._endtag = None     # "</name" that terminates the raw element
        self._ltstate = None    # state to resume after a non-block tag

    def feed(self):
        """
        Curls the quotes in the workspace, in place, starting from the
        "outside any quotation" state.
        """
        self._pos = 0
        self._state = self._norm
        # The current index is kept in self._pos so that the state methods
        # can read it without taking arguments.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _next_char(self):
        """
        Returns the character after the current one, or "" if the current
        character is the last one.  All the str predicates used on the
        result are false for "", so the end of the text counts as
        "not a letter, not a digit".
        """
        ws = self.workspace
        nxt = self._pos + 1
        return ws[nxt] if nxt < len(ws) else ""

    def _goto_lt(self):
        """
        Enters the in-tag state, remembering where the tag started and
        which state to come back to.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Returns the lowercased run of alphanumeric characters that begins
        at index start (possibly empty).
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            # still inside the tag
            return
        start = self._ltpos + 1
        closing = ws[start] == '/'
        tag = self._gettag(start + 1 if closing else start)
        if tag in _BLOCK:
            # a block boundary, opening or closing, resets quotation state
            self._state = self._norm
        elif not closing and tag in _RAW:
            self._state = self._raw
            self._endtag = "</" + tag
        else:
            self._state = self._ltstate

    def _raw(self):
        pos = self._pos
        ws = self.workspace
        if ws[pos] != '<':
            return
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos:end].lower() != self._endtag:
            return
        if end < len(ws) and ws[end].isalnum():
            # a longer tag name that merely starts with the one we want
            return
        # _ltstate is deliberately left alone: it still holds the state
        # that was active before the raw element opened, and _seen_lt
        # resumes it once this end tag is closed.
        self._ltpos = pos
        self._state = self._seen_lt

    def _seen_ld(self):
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            # pos is at least 1 here: this state is only entered after an
            # earlier character has been processed.
            if ws[pos-1].isalnum():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._next_char().isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            before = ws[pos-1]
            after = self._next_char()
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld