3
|
1 #!/usr/bin/env python3
|
|
2 # -*- coding: utf-8 -*-
|
|
3
|
|
4 # Classes for curling both HTML and plain text.
|
|
5
|
|
6 # I m p o r t s
|
|
7
|
|
8 import os, sys
|
|
9 from workspace import Workspace
|
|
10
|
|
11 # V a r i a b l e s
|
|
12
|
|
# Typographic quote characters. The apostrophe shares its code point with
# the right single quote.
LSQUO = "\u2018"
RSQUO = "\u2019"
APOS = RSQUO
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that begin with an apostrophe, so a leading "'" on them is never
# an opening quote. Cribbed from Wordpress.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags whose content is raw data
_RAW = {"script", "style"}

# HTML block-level elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "noscript", "ol", "p", "pre", "table", "ul",
}
|
|
32
|
|
33 # F u n c t i o n s
|
|
34
|
4
|
def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    Each curly double quote (LDQUO, RDQUO) is replaced by '"' and each
    curly single quote (LSQUO, RSQUO) by "'". The workspace is modified
    in place, one position at a time; nothing is returned.

    ws -- the workspace to rewrite; it is indexed by position over
          range(len(ws)), both for reading and for assignment.
    """
    # Built once per call instead of constructing two sets per character.
    straight = {LDQUO: '"', RDQUO: '"', LSQUO: "'", RSQUO: "'"}
    for i in range(len(ws)):
        plain = straight.get(ws[i])
        if plain is not None:
            ws[i] = plain
|
|
45
|
3
|
def _is_cockney(pos, ws):
    """
    Tests whether the text at a position is one of the words in _ASTART.

    pos -- index of the candidate leading apostrophe (the _ASTART entries
           are spelled with a straight "'", so the check is meant to run
           before that character is rewritten)
    ws  -- the workspace being examined; sliced and indexed like a string

    Returns True if ws, starting at pos, matches an _ASTART word without
    regard to case and the word is not immediately followed by a letter.
    Running into the end of the workspace counts as the end of the word.
    Returns False otherwise.
    """
    limit = len(ws)
    for word in _ASTART:
        stop = pos + len(word)
        if ws[pos:stop].lower() != word:
            continue
        # Guard the look-ahead: a word that ends the workspace has no
        # following character to inspect.
        if stop >= limit or not ws[stop].isalpha():
            return True
    return False
|
|
53
|
|
54 # C l a s s e s
|
|
55
|
|
class BaseCurler:
    """
    Common base for the curlers. It only fixes the entry point name:
    subclasses are expected to provide their own feed().
    """
    def feed(self):
        """Not implemented at this level; always raises NotImplementedError."""
        raise NotImplementedError()
|
|
59
|
|
class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    Implemented as a three-state machine that visits the workspace one
    character at a time and rewrites straight quotes in place:

      _norm    -- outside any quotation
      _seen_ld -- after an opening double quote
      _seen_ls -- after an opening single quote found while in _seen_ld

    The workspace is used here like a mutable string: len(), indexing that
    yields one-character strings, item assignment, and slicing whose result
    supports lower().
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curls the quotes of the whole workspace in place. Processing always
        restarts from position 0 in the _norm state.
        """
        self._pos = 0
        self._state = self._norm
        # self._pos is the loop variable so the state handlers can read it.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, offset):
        """
        Returns the character offset places from the current position, or
        "" when that lands outside the workspace. The empty string fails
        isalpha(), isdecimal() and isspace(), so the edges of the text
        behave as "no character" without special cases in the callers.
        """
        ws = self.workspace
        where = self._pos + offset
        if 0 <= where < len(ws):
            return ws[where]
        return ""

    def _is_cockney(self):
        """
        Returns True if the text at the current position is one of the
        apostrophe-initial words in _ASTART (compared without regard to
        case) and is not directly followed by a letter; False otherwise.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            stop = pos + len(word)
            if ws[pos:stop].lower() != word:
                continue
            # Reaching the end of the workspace also ends the word.
            if stop >= limit or not ws[stop].isalpha():
                return True
        return False

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: an opening double quote has been seen."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: an opening single quote has been seen."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            before = self._char_at(-1)
            after = self._char_at(1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
|
|
134
|
|
class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    Implemented as a state machine that visits the workspace one character
    at a time and rewrites straight quotes in place:

      _norm    -- outside any quotation
      _seen_ld -- after an opening double quote
      _seen_ls -- after an opening single quote found while in _seen_ld
      _seen_lt -- inside a tag, between "<" and ">"; nothing is rewritten
      _raw     -- inside an element listed in _RAW; nothing is rewritten

    A block tag (opening or closing, per _BLOCK) resets the machine to
    _norm; any other tag resumes the state that was active at its "<".

    The workspace is used here like a mutable string: len(), indexing that
    yields one-character strings, item assignment, and slicing whose result
    supports lower().
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0        # position of the "<" of the tag being read
        self._endtag = None    # "</name" that ends the current raw element
        self._ltstate = None   # state to resume after a non-block tag

    def feed(self):
        """
        Curls the quotes of the whole workspace in place. Processing always
        restarts from position 0 in the _norm state.
        """
        self._pos = 0
        self._state = self._norm
        # self._pos is the loop variable so the state handlers can read it.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _char_at(self, offset):
        """
        Returns the character offset places from the current position, or
        "" when that lands outside the workspace. The empty string fails
        isalpha(), isdecimal() and isspace(), so the edges of the text
        behave as "no character" without special cases in the callers.
        """
        ws = self.workspace
        where = self._pos + offset
        if 0 <= where < len(ws):
            return ws[where]
        return ""

    def _is_cockney(self):
        """
        Returns True if the text at the current position is one of the
        apostrophe-initial words in _ASTART (compared without regard to
        case) and is not directly followed by a letter; False otherwise.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            stop = pos + len(word)
            if ws[pos:stop].lower() != word:
                continue
            # Reaching the end of the workspace also ends the word.
            if stop >= limit or not ws[stop].isalpha():
                return True
        return False

    def _goto_lt(self):
        """Enters _seen_lt, remembering where the tag began and the state to resume."""
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Returns, in lower case, the run of alphanumeric characters that
        begins at start (possibly empty).
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """State: inside a tag. Only the closing ">" causes a transition."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            # closing tag: a block boundary resets, anything else resumes
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
        else:
            tag = self._gettag(start)
            if tag in _BLOCK:
                self._state = self._norm
            elif tag in _RAW:
                self._state = self._raw
                self._endtag = "</" + tag
            else:
                self._state = self._ltstate

    def _raw(self):
        """State: inside a raw-data element; waits for its end tag."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] != '<':
            return
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos:end].lower() != self._endtag:
            return
        # The tag name must stop here; running into the end of the
        # workspace counts as stopping.
        if end >= len(ws) or not ws[end].isalnum():
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """State: an opening double quote has been seen."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: an opening single quote has been seen."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            before = self._char_at(-1)
            after = self._char_at(1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
|