comparison curlers.py @ 7:9df9ff8cecde

Undo that; ignoring <pre> is a sticky wicket.
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 20:56:38 -0800
parents da3fb2312c88
children 397c178c5b98
comparison
equal deleted inserted replaced
6:da3fb2312c88 7:9df9ff8cecde
19 # Words that start with an apostrophe. Cribbed from Wordpress. 19 # Words that start with an apostrophe. Cribbed from Wordpress.
20 _ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til", 20 _ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
21 "'bout", "'nuff", "'round", "'cause" , "'em" ] 21 "'bout", "'nuff", "'round", "'cause" , "'em" ]
22 22
23 # HTML tags that enclose raw data 23 # HTML tags that enclose raw data
24 _RAW = set(["script", "style", "pre"]) 24 _RAW = set(["script", "style"])
25 25
26 # HTML block elements 26 # HTML block elements
27 _BLOCK = set([ 27 _BLOCK = set([
28 "address", "blockquote", "div", "dl", "fieldset", "form", "h1", "h2", 28 "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
29 "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "table", "ul" 29 "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
30 "table", "ul"
30 ]) 31 ])
31 32
32 # F u n c t i o n s 33 # F u n c t i o n s
33 34
34 def uncurl(ws): 35 def uncurl(ws):
209 ws = self.workspace 210 ws = self.workspace
210 end = pos + len(self._endtag) 211 end = pos + len(self._endtag)
211 # only a matching end tag gets us out of the raw state 212 # only a matching end tag gets us out of the raw state
212 if ws[pos] == '<' and ws[pos:end].lower() == self._endtag and (not ws[end].isalnum()): 213 if ws[pos] == '<' and ws[pos:end].lower() == self._endtag and (not ws[end].isalnum()):
213 self._ltpos = pos 214 self._ltpos = pos
214 self._state = self._norm if self._endtag == "</pre" else self._ltstate 215 self._state = self._seen_lt
215 216
216 def _seen_ld(self): 217 def _seen_ld(self):
217 pos = self._pos 218 pos = self._pos
218 ws = self.workspace 219 ws = self.workspace
219 char = ws[pos] 220 char = ws[pos]