Mercurial > cgi-bin > hgweb.cgi > curlyq
view curlers.py @ 7:9df9ff8cecde
Undo that; ignoring <pre> is a sticky wicket.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 20:56:38 -0800 |
parents | da3fb2312c88 |
children | 397c178c5b98 |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Classes for curling both HTML and plain text.

# I m p o r t s

import os
import sys

from workspace import Workspace

# V a r i a b l e s

# Quote types
LSQUO = "\u2018"
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe. Cribbed from Wordpress.
_ASTART = [
    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause", "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul",
}

# Straight equivalent of each curly quote character.
_STRAIGHT = {LDQUO: '"', RDQUO: '"', LSQUO: "'", RSQUO: "'"}

# F u n c t i o n s

def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    The workspace is modified in place; nothing is returned.
    """
    # Indexed loop on purpose: only len(), item read and item assignment
    # are assumed of the workspace, and we assign while walking it.
    for i in range(len(ws)):
        straight = _STRAIGHT.get(ws[i])
        if straight is not None:
            ws[i] = straight

def _char_at(ws, pos):
    """
    Returns the character at pos, or "" if pos lies outside the
    workspace. The empty string answers False to isalpha(), isalnum(),
    isdecimal() and isspace(), so callers can peek at a neighbouring
    character without special-casing the edges of the text.
    """
    if 0 <= pos < len(ws):
        return ws[pos]
    return ""

def _is_cockney(pos, ws):
    """
    Returns True if the text starting at pos is one of the words in
    _ASTART (compared case-insensitively) and that word is not merely
    the prefix of a longer alphabetic word. Returns False otherwise.
    """
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() == word and not _char_at(ws, end).isalpha():
            return True
    return False

# C l a s s e s

class BaseCurler:
    """
    Common base of the curlers. Subclasses supply feed() and the state
    methods _seen_ld and _seen_ls, and maintain the attributes
    workspace, _pos and _state that the helpers below rely on.
    """
    def feed(self):
        raise NotImplementedError()

    def _is_cockney(self):
        """
        Tells whether an apostrophe-initial word from _ASTART starts at
        the current position.
        """
        return _is_cockney(self._pos, self.workspace)

    def _curl_single_in_double(self):
        """
        Curls the ' at the current position when no single quote is
        open. It becomes an apostrophe, or else an opening single
        quote, in which case the state moves to _seen_ls.
        """
        pos = self._pos
        ws = self.workspace
        # Evaluated in this order, before ws[pos] is overwritten, because
        # the cockney check matches against the still-straight quote.
        if (_char_at(ws, pos - 1).isalpha()
                or _char_at(ws, pos + 1).isdecimal()
                or _is_cockney(pos, ws)):
            # inter-word or end of word apostrophe, a '99 style
            # abbreviation, or a word like 'tis
            ws[pos] = APOS
        else:
            # opening single quote
            ws[pos] = LSQUO
            self._state = self._seen_ls

    def _curl_single_in_single(self):
        """
        Curls the ' at the current position when a single quote is
        open. It becomes an apostrophe, or else the closing single
        quote, in which case the state moves back to _seen_ld.
        """
        pos = self._pos
        ws = self.workspace
        before = _char_at(ws, pos - 1)
        after = _char_at(ws, pos + 1)
        if ((before.isalpha() and after.isalpha())
                or after.isdecimal()
                or _is_cockney(pos, ws)
                or before.isspace()):
            # obvious apostrophe, abbreviation, cockney word, or a
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote
            ws[pos] = RSQUO
            self._state = self._seen_ld

class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curls the quotes of the whole workspace in place, calling the
        current state method once per character position.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: inside a double quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            self._curl_single_in_double()

    def _seen_ls(self):
        """State: inside a single quotation nested in a double one."""
        if self.workspace[self._pos] == "'":
            self._curl_single_in_single()

class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0        # position of the most recent "<"
        self._endtag = None    # "</tag" that ends the current raw section
        self._ltstate = None   # state to resume after a non-block tag

    def feed(self):
        """
        Curls the quotes of the whole workspace in place, calling the
        current state method once per character position.
        """
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _goto_lt(self):
        """
        Records where a tag starts and which state was interrupted, then
        switches to the inside-a-tag state.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: outside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Returns the lower-cased run of alphanumeric characters that
        begins at start (empty if there is none).
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag. Nothing is curled here; only the closing
        ">" is acted on, and it decides the next state from the tag name.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == '/':
            # end tag: a block boundary resets quoting, anything else
            # resumes where we left off
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
        else:
            tag = self._gettag(start)
            if tag in _BLOCK:
                self._state = self._norm
            elif tag in _RAW:
                self._state = self._raw
                self._endtag = "</" + tag
            else:
                self._state = self._ltstate

    def _raw(self):
        """State: inside a raw element such as script or style."""
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if (ws[pos] == '<' and ws[pos:end].lower() == self._endtag
                and not _char_at(ws, end).isalnum()):
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """State: inside a double quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            self._curl_single_in_double()

    def _seen_ls(self):
        """State: inside a single quotation nested in a double one."""
        char = self.workspace[self._pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            self._curl_single_in_single()