comparison curlers.py @ 3:091c03f1b2e8

Getting it working...
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 19:54:45 -0800
parents
children 7a83e82e65a6
comparison
equal deleted inserted replaced
2:8884b0bf779d 3:091c03f1b2e8
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3
4 # Classes for curling both HTML and plain text.
5
6 # I m p o r t s
7
8 import os, sys
9 from workspace import Workspace
10
11 # V a r i a b l e s
12
# Curly quote characters that replace the straight ASCII ones.
LSQUO = "\N{LEFT SINGLE QUOTATION MARK}"
RSQUO = "\N{RIGHT SINGLE QUOTATION MARK}"
APOS = RSQUO    # an apostrophe is typeset as a right single quote
LDQUO = "\N{LEFT DOUBLE QUOTATION MARK}"
RDQUO = "\N{RIGHT DOUBLE QUOTATION MARK}"

# Words that start with an apostrophe. Cribbed from WordPress.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "noscript", "ol", "p", "pre", "table", "ul",
}
32
33 # F u n c t i o n s
34
def _is_cockney(pos, ws):
    """
    Report whether the text starting at ws[pos] is one of the words in
    _ASTART (words such as 'tis and 'twas that begin with an apostrophe).

    pos -- index in ws of the "'" character being examined
    ws -- the text being scanned; must support len(), indexing, and
          slicing that yields something with a .lower() method

    Returns True if such a word starts at pos and is not immediately
    followed by another letter, else False.
    """
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # The match must not run on into more letters (as in "'tissue").
        # Running off the end of the text counts as a word boundary.
        if end >= len(ws) or not ws[end].isalpha():
            return True
    return False
42
43 # C l a s s e s
44
class BaseCurler:
    """
    Common ancestor of the curler classes. It does no work of its own;
    a subclass has to supply feed().
    """
    def feed(self):
        # Not overridden by the subclass: there is nothing to run.
        raise NotImplementedError
48
class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    Implemented as a small state machine. Each state is a bound method
    that examines the character at self._pos, overwrites it in the
    workspace if it is a straight quote, and may select the next state:

    _norm     outside any quotation
    _seen_ld  inside a double-quoted passage
    _seen_ls  inside a single-quoted passage nested in a double-quoted one
    """
    def __init__(self, workspace):
        # workspace must support len(), getting and setting single
        # characters by index, and slicing that yields something with a
        # .lower() method.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Run the state machine once over every position in the workspace,
        starting from the _norm state. The workspace is modified in place.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Return the character offset positions away from the current one,
        or "" if that position lies outside the workspace. The str tests
        applied to the result (isalpha, isdecimal, isspace) are all False
        for "", so a missing neighbour never counts as a letter, digit or
        space.
        """
        where = self._pos + offset
        if 0 <= where < len(self.workspace):
            return self.workspace[where]
        return ""

    def _is_cockney(self):
        """
        Report whether one of the _ASTART words (words that begin with an
        apostrophe) starts at the current position. Returns True or False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # The match must not run on into more letters; the end of the
            # text counts as a word boundary.
            if end >= len(ws) or not ws[end].isalpha():
                return True
        return False

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: inside a single-quoted passage (within double quotes)."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            if self._peek(-1).isalpha() and self._peek(1).isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif self._peek(-1).isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote; back to the enclosing double quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
123
class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    Implemented as a small state machine. Each state is a bound method
    that examines the character at self._pos, overwrites it in the
    workspace if it is a straight quote, and may select the next state:

    _norm     in text, outside any quotation
    _seen_ld  in text, inside a double-quoted passage
    _seen_ls  in text, inside a single-quoted passage nested in a
              double-quoted one
    _seen_lt  inside a tag, between "<" and ">"; nothing is altered
    _raw      inside an element listed in _RAW; nothing is altered
    """
    def __init__(self, workspace):
        # workspace must support len(), getting and setting single
        # characters by index, and slicing that yields something with a
        # .lower() method.
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0         # index of the "<" that began the current tag
        self._endtag = None     # "</name" text that ends the raw element
        self._ltstate = None    # state to resume after the current tag

    def feed(self):
        """
        Run the state machine once over every position in the workspace,
        starting from the _norm state. The workspace is modified in place.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Return the character offset positions away from the current one,
        or "" if that position lies outside the workspace. The str tests
        applied to the result (isalpha, isdecimal, isspace) are all False
        for "", so a missing neighbour never counts as a letter, digit or
        space.
        """
        where = self._pos + offset
        if 0 <= where < len(self.workspace):
            return self.workspace[where]
        return ""

    def _is_cockney(self):
        """
        Report whether one of the _ASTART words (words that begin with an
        apostrophe) starts at the current position. Returns True or False.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # The match must not run on into more letters; the end of the
            # text counts as a word boundary.
            if end >= len(ws) or not ws[end].isalpha():
                return True
        return False

    def _goto_lt(self):
        """
        Enter the in-tag state, remembering where the tag began and which
        state was active so that it can be resumed after the tag.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: in text, outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return, in lower case, the run of alphanumeric characters that
        begins at index start (the tag name). Stops at the end of the
        workspace if no other character ends the run first.
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag. Nothing happens until the ">" arrives; then
        the tag name decides what state follows.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            # closing tag: the end of a block element resets the quote
            # state, anything else resumes where we left off
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
        else:
            tag = self._gettag(start)
            if tag in _BLOCK:
                # the start of a block element also resets the quote state
                self._state = self._norm
            elif tag in _RAW:
                self._state = self._raw
                self._endtag = "</" + tag
            else:
                self._state = self._ltstate

    def _raw(self):
        """State: inside a raw element, whose content is left untouched."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] != '<':
            return
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos:end].lower() != self._endtag:
            return
        # The name must stop here ("</scripts" does not end a script);
        # the end of the text counts as a boundary.
        if end >= len(ws) or not ws[end].isalnum():
            # self._ltstate still holds the state saved at the opening tag
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """State: in text, inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: in text, inside a single-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            if self._peek(-1).isalpha() and self._peek(1).isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif self._peek(-1).isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote; back to the enclosing double quote
                ws[pos] = RSQUO
                self._state = self._seen_ld