3
|
1 #!/usr/bin/env python3
|
|
2 # -*- coding: utf-8 -*-
|
|
3
|
|
4 # Classes for curling both HTML and plain text.
|
|
5
|
|
6 # I m p o r t s
|
|
7
|
|
8 import os, sys
|
|
9 from workspace import Workspace
|
|
10
|
|
11 # V a r i a b l e s
|
|
12
|
|
13 # Quote types
|
|
# Curly quote characters.  APOS and RSQUO are deliberately the same
# character: U+2019 serves both as apostrophe and as right single quote.
LSQUO = "\u2018"
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe.  Cribbed from WordPress.
# Entries are lower case; they are compared against lower-cased text.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style", "pre"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "table", "ul",
}
|
|
31
|
|
32 # F u n c t i o n s
|
|
33
|
4
|
def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    Every curly double quote becomes '"' and every curly single quote
    (which includes the apostrophe, since APOS is RSQUO) becomes "'".
    The workspace is modified in place, one index at a time; nothing is
    returned.  Only len(), indexing and index assignment are used on ws.
    """
    # Build the lookup once per call rather than two sets per character.
    straight = {LDQUO: '"', RDQUO: '"', LSQUO: "'", RSQUO: "'"}
    for i in range(len(ws)):
        replacement = straight.get(ws[i])
        if replacement is not None:
            ws[i] = replacement
|
|
44
|
3
|
def _is_cockney(pos, ws):
    """
    Tells whether one of the _ASTART words begins at ws[pos].

    pos is the index of the candidate apostrophe and ws is the workspace
    being examined.  A word only counts when it is not immediately
    followed by a letter, so that e.g. "'em" does not match the start of
    a longer word.  Returns True on a match and False otherwise.

    Assumes slicing ws yields something with a lower() method that
    compares equal to a str -- TODO confirm against Workspace.
    """
    limit = len(ws)
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # A match that runs up to the very end of the workspace has no
        # following character, so it cannot be part of a longer word.
        if end >= limit or not ws[end].isalpha():
            return True
    return False
|
|
52
|
|
53 # C l a s s e s
|
|
54
|
|
class BaseCurler():
    """
    Common base class of the curlers.  It only defines the interface:
    subclasses must override feed().
    """
    def feed(self):
        """
        Process the workspace.  Always raises NotImplementedError here;
        the message names the class that failed to override the method.
        """
        raise NotImplementedError(
            "{0}.feed() must be overridden".format(type(self).__name__))
|
|
58
|
|
class TextCurler(BaseCurler):
    """
    For processing plain text.  Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    The curler is a small state machine.  self._state holds the bound
    method for the current state and feed() calls it once per index:

      _norm     outside any quotation
      _seen_ld  inside a double quotation
      _seen_ls  inside a single quotation nested in a double quotation
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curls the quotes of the workspace in place.  Each state method
        replaces at most the single item at self._pos.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Returns the item at self._pos + offset, or "" when that index
        lies outside the workspace.  The empty string answers False to
        isalpha(), isdecimal() and isspace(), so callers need no
        special-casing at either end of the text.
        """
        index = self._pos + offset
        if 0 <= index < len(self.workspace):
            return self.workspace[index]
        return ""

    def _is_cockney(self):
        """
        Tells whether one of the _ASTART words begins at the current
        position and is not immediately followed by a letter.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # a word ending exactly at the end of the text also matches
            if end >= limit or not ws[end].isalpha():
                return True
        return False

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: inside a double quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single quotation.  Only "'" is examined here; a
        double quote met in this state is left untouched.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != "'":
            return
        before = self._peek(-1)
        after = self._peek(1)
        if before.isalpha() and after.isalpha():
            # obvious apostrophe
            ws[pos] = APOS
        elif after.isdecimal() or self._is_cockney():
            # also an apostrophe
            ws[pos] = APOS
        elif before.isspace():
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote
            ws[pos] = RSQUO
            self._state = self._seen_ld
|
|
133
|
|
class HtmlCurler(BaseCurler):
    """
    For processing HTML.  Uses HTML block tags to delimit blocks.

    The curler is a state machine.  self._state holds the bound method
    for the current state and feed() calls it once per index:

      _norm     outside any quotation
      _seen_ld  inside a double quotation
      _seen_ls  inside a single quotation nested in a double quotation
      _seen_lt  inside a tag, i.e. after "<" and up to the next ">"
      _raw      inside an element listed in _RAW

    Nothing is replaced while in _seen_lt or _raw.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        # index of the most recent "<" that started a tag
        self._ltpos = 0
        # lower-case "</name" that ends the current raw element
        self._endtag = None
        # state that was active when the current tag started
        self._ltstate = None

    def feed(self):
        """
        Curls the quotes of the workspace in place.  Each state method
        replaces at most the single item at self._pos.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Returns the item at self._pos + offset, or "" when that index
        lies outside the workspace.  The empty string answers False to
        isalpha(), isdecimal() and isspace(), so callers need no
        special-casing at either end of the text.
        """
        index = self._pos + offset
        if 0 <= index < len(self.workspace):
            return self.workspace[index]
        return ""

    def _is_cockney(self):
        """
        Tells whether one of the _ASTART words begins at the current
        position and is not immediately followed by a letter.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # a word ending exactly at the end of the text also matches
            if end >= limit or not ws[end].isalpha():
                return True
        return False

    def _goto_lt(self):
        """
        Enters the tag state, remembering where the tag started and
        which state to come back to once it has been read.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Returns, in lower case, the run of alphanumeric items that begins
        at index start.  The result is empty when ws[start] is not
        alphanumeric.
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag.  Everything is skipped until ">", at which
        point the tag name decides the next state: a block tag (opening
        or closing) resets to _norm, an opening raw tag enters _raw, and
        any other tag resumes the state that was active before it.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
            return
        tag = self._gettag(start)
        if tag in _BLOCK:
            self._state = self._norm
        elif tag in _RAW:
            self._state = self._raw
            self._endtag = "</" + tag
        else:
            self._state = self._ltstate

    def _raw(self):
        """
        State: inside a raw element.  Only a matching end tag gets us
        out of the raw state; "</pre" resets to _norm, any other end tag
        resumes the state that was active before the raw element.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != '<':
            return
        end = pos + len(self._endtag)
        if ws[pos:end].lower() != self._endtag:
            return
        # "</prefix" must not match "</pre"; the end of the workspace
        # counts as a terminator too.
        if end < len(ws) and ws[end].isalnum():
            return
        self._ltpos = pos
        self._state = self._norm if self._endtag == "</pre" else self._ltstate

    def _seen_ld(self):
        """State: inside a double quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single quotation.  Only "<" and "'" are examined
        here; a double quote met in this state is left untouched.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            before = self._peek(-1)
            after = self._peek(1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
|