Mercurial > cgi-bin > hgweb.cgi > curlyq
diff curlers.py @ 3:091c03f1b2e8
Getting it working...
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 19:54:45 -0800 |
parents | |
children | 7a83e82e65a6 |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# File: curlers.py (new in this changeset; diffed against /dev/null).
# Classes for curling both HTML and plain text.

# I m p o r t s

import os
import sys
from workspace import Workspace

# V a r i a b l e s

# Quote types
LSQUO = "\u2018"
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe. Cribbed from Wordpress.
_ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause" , "'em" ]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul",
}

# F u n c t i o n s

def _char_at(ws, index):
    """
    Return ws[index], or "" when index lies outside the workspace.

    The empty string answers False to isalpha(), isdecimal(), isspace()
    and isalnum(), so callers can look one character ahead or behind
    without raising IndexError at either end of the text.
    """
    if 0 <= index < len(ws):
        return ws[index]
    return ""

def _is_cockney(pos, ws):
    """
    Return True if one of the _ASTART words begins at ws[pos].

    The match is case-insensitive and must not be followed by another
    letter. End of text counts as a valid word boundary.
    """
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() == word and not _char_at(ws, end).isalpha():
            return True
    return False

# C l a s s e s

class BaseCurler():
    """
    Common base for the curlers. Holds the quote-handling states shared
    by the subclasses; each state is a method that examines the character
    at self._pos and may rewrite it in place and/or switch self._state.

    Subclasses must set self.workspace, self._pos and self._state.
    NOTE(review): the workspace is assumed to support len(), indexing and
    slicing that yield str, and item assignment (presumably
    workspace.Workspace) -- confirm against the caller.
    """
    def feed(self):
        raise NotImplementedError()

    def _run(self):
        # Shared driver: restart in the _norm state and call the current
        # state method once per character position.
        self._pos = 0
        self._state = self._norm
        for self._pos in range(len(self.workspace)):
            self._state()

    def _is_cockney(self):
        # Method form of the module-level check, at the current position.
        return _is_cockney(self._pos, self.workspace)

    def _norm(self):
        # State: not inside any quotation.
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == '"':
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        # State: inside a double-quoted span.
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == '"':
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if _char_at(ws, pos - 1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif _char_at(ws, pos + 1).isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        # State: inside a single-quoted span nested in a double-quoted one.
        pos = self._pos
        ws = self.workspace
        if ws[pos] != "'":
            return
        before = _char_at(ws, pos - 1)
        after = _char_at(ws, pos + 1)
        if before.isalpha() and after.isalpha():
            # obvious apostrophe
            ws[pos] = APOS
        elif after.isdecimal() or _is_cockney(pos, ws):
            # also an apostrophe
            ws[pos] = APOS
        elif before.isspace():
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote
            ws[pos] = RSQUO
            self._state = self._seen_ld

class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """Curl the quotes in self.workspace, in place."""
        self._run()

class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0        # position of the most recent "<"
        self._endtag = None    # "</tag" that ends the current raw section
        self._ltstate = None   # state to resume after a non-block tag

    def feed(self):
        """Curl the quotes in self.workspace, in place, skipping tags."""
        self._run()

    def _goto_lt(self):
        # Remember where the tag starts and which state to come back to.
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        if self.workspace[self._pos] == "<":
            self._goto_lt()
        else:
            super()._norm()

    def _gettag(self, start):
        # Return the lower-cased run of alphanumerics starting at start.
        ws = self.workspace
        end = start
        while _char_at(ws, end).isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        # State: inside a tag; nothing happens until its closing ">".
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if _char_at(ws, start) == "/":
            if self._gettag(start + 1) in _BLOCK:
                # end of a block element: quoting starts afresh
                self._state = self._norm
            else:
                self._state = self._ltstate
        else:
            tag = self._gettag(start)
            if tag in _BLOCK:
                self._state = self._norm
            elif tag in _RAW:
                # _ltstate is left alone so that the matching end tag
                # returns to the state in force before the raw element.
                self._state = self._raw
                self._endtag = "</" + tag
            else:
                self._state = self._ltstate

    def _raw(self):
        # State: inside a raw-data element (see _RAW).
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if (ws[pos] == "<" and ws[pos:end].lower() == self._endtag
                and not _char_at(ws, end).isalnum()):
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        if self.workspace[self._pos] == "<":
            self._goto_lt()
        else:
            super()._seen_ld()

    def _seen_ls(self):
        if self.workspace[self._pos] == "<":
            self._goto_lt()
        else:
            super()._seen_ls()