3
|
1 #!/usr/bin/env python3
|
|
2 # -*- coding: utf-8 -*-
|
|
3
|
|
4 # Classes for curling both HTML and plain text.
|
|
5
|
|
6 # I m p o r t s
|
|
7
|
|
8 import os, sys
|
|
9 from workspace import Workspace
|
|
10
|
|
11 # V a r i a b l e s
|
|
12
|
|
# Typographic quote characters
LSQUO = "\u2018"
RSQUO = "\u2019"
APOS = RSQUO  # the apostrophe shares the right single quote character
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that begin with an apostrophe (list borrowed from WordPress).
_ASTART = [
    "'tain't",
    "'twere",
    "'twas",
    "'tis",
    "'twill",
    "'til",
    "'bout",
    "'nuff",
    "'round",
    "'cause",
    "'em",
]

# HTML elements whose content is raw data
_RAW = {"script", "style"}

# HTML block-level elements
_BLOCK = {
    "address",
    "blockquote",
    "div",
    "dl",
    "fieldset",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "noscript",
    "ol",
    "p",
    "pre",
    "table",
    "ul",
}
|
|
32
|
|
33 # F u n c t i o n s
|
|
34
|
|
def _is_cockney(pos, ws, words=None):
    """
    Report whether the text at ``pos`` starts one of the apostrophe-led words.

    pos   -- index into ``ws`` of the candidate apostrophe.
    ws    -- the text being scanned.  It must support ``len()``, indexing
             and slicing, and its slices must offer ``lower()``.
    words -- optional iterable of lower-case words to look for; when
             omitted the module-level ``_ASTART`` list is used.

    Returns True when one of the words begins at ``pos`` and is not
    immediately followed by a letter, otherwise False.
    """
    if words is None:
        words = _ASTART
    size = len(ws)
    for word in words:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # The end of the text counts as a word boundary; checking the
        # length first avoids indexing one past the last character.
        if end >= size or not ws[end].isalpha():
            return True
    return False
|
|
42
|
|
43 # C l a s s e s
|
|
44
|
|
class BaseCurler:
    """Common base class for the curlers; subclasses override feed()."""

    def feed(self):
        """Abstract entry point: always raises NotImplementedError here."""
        raise NotImplementedError
|
|
48
|
|
class TextCurler(BaseCurler):
    """
    For processing plain text.  Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    The workspace is walked one position at a time by a small state
    machine, and straight quotes are overwritten in place with their
    curly equivalents.
    """

    def __init__(self, workspace):
        """
        workspace -- the text to process.  The methods below index, slice,
                     measure (len) and assign single positions of it.
                     NOTE(review): presumably a Workspace instance; confirm
                     against callers.
        """
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """Reset the state machine and run it over every position."""
        self._pos = 0
        self._state = self._norm
        for pos in range(len(self.workspace)):
            self._pos = pos
            self._state()

    def _char_at(self, index):
        """
        Return the character at ``index``, or an empty string when the index
        lies outside the workspace.  The str predicates used by the state
        handlers (isalpha, isdecimal, isspace) are all False for an empty
        string, so a quote at the very end of the text needs no special case.
        """
        if 0 <= index < len(self.workspace):
            return self.workspace[index]
        return ""

    def _is_cockney(self):
        """
        Return True when one of the _ASTART words begins at the current
        position and is not directly followed by a letter, else False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() == word and not self._char_at(end).isalpha():
                return True
        return False

    def _norm(self):
        """State: not inside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: an opening double quote has been seen and not yet closed."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(pos - 1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: an opening single quote has been seen and not yet closed.
        Closing it returns to the _seen_ld state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != "'":
            return
        before = self._char_at(pos - 1)
        after = self._char_at(pos + 1)
        if before.isalpha() and after.isalpha():
            # obvious apostrophe
            ws[pos] = APOS
        elif after.isdecimal() or self._is_cockney():
            # also an apostrophe
            ws[pos] = APOS
        elif before.isspace():
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote
            ws[pos] = RSQUO
            self._state = self._seen_ld
|
|
123
|
|
class HtmlCurler(BaseCurler):
    """
    For processing HTML.  Uses HTML block tags to delimit blocks.

    The workspace is walked one position at a time by a small state
    machine.  Text inside tags and inside raw elements (those named in
    _RAW) is left untouched; elsewhere straight quotes are overwritten in
    place with their curly equivalents.
    """

    def __init__(self, workspace):
        """
        workspace -- the markup to process.  The methods below index, slice,
                     measure (len) and assign single positions of it.
                     NOTE(review): presumably a Workspace instance; confirm
                     against callers.
        """
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0  # index of the "<" that opened the current tag
        self._endtag = None  # "</name" prefix that ends the current raw element
        self._ltstate = None  # state to resume once the current tag is closed

    def feed(self):
        """Reset the state machine and run it over every position."""
        self._pos = 0
        self._state = self._norm
        for pos in range(len(self.workspace)):
            self._pos = pos
            self._state()

    def _char_at(self, index):
        """
        Return the character at ``index``, or an empty string when the index
        lies outside the workspace.  The str predicates used by the state
        handlers are all False for an empty string, so positions next to the
        end of the text need no special case.
        """
        if 0 <= index < len(self.workspace):
            return self.workspace[index]
        return ""

    def _is_cockney(self):
        """
        Return True when one of the _ASTART words begins at the current
        position and is not directly followed by a letter, else False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() == word and not self._char_at(end).isalpha():
                return True
        return False

    def _goto_lt(self):
        """Enter the inside-a-tag state, remembering where and from which state."""
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: in text, not inside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return, lower-cased, the run of alphanumeric characters that begins
        at ``start`` (the tag name).  The run may be empty.
        """
        ws = self.workspace
        size = len(ws)
        end = start
        while end < size and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag.  Nothing happens until the closing ">", at
        which point the tag name decides the next state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            # end tag: leaving a block element resets to the normal state,
            # anything else resumes the state that was active before the tag
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
            return
        tag = self._gettag(start)
        if tag in _BLOCK:
            # a new block starts in the normal state
            self._state = self._norm
        elif tag in _RAW:
            self._state = self._raw
            self._endtag = "</" + tag
        else:
            self._state = self._ltstate

    def _raw(self):
        """State: inside a raw element; skip everything up to its end tag."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] != '<':
            return
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos:end].lower() == self._endtag and not self._char_at(end).isalnum():
            # _ltstate is deliberately left alone: it still holds the state
            # that was active before the raw element's start tag
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """State: an opening double quote has been seen and not yet closed."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._char_at(pos - 1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._char_at(pos + 1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: an opening single quote has been seen and not yet closed.
        Closing it returns to the _seen_ld state.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            before = self._char_at(pos - 1)
            after = self._char_at(pos + 1)
            if before.isalpha() and after.isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif after.isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif before.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
|