diff curlers.py @ 3:091c03f1b2e8

Getting it working...
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 19:54:45 -0800
parents
children 7a83e82e65a6
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/curlers.py	Thu Dec 26 19:54:45 2019 -0800
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Classes for curling both HTML and plain text.
+
+# I m p o r t s
+
+import os, sys
+from workspace import Workspace
+
+# V a r i a b l e s
+
+# Typographic quote characters. The apostrophe shares its code point with
+# the right single quote.
+LSQUO = "\u2018"
+RSQUO = "\u2019"
+APOS = RSQUO
+LDQUO = "\u201C"
+RDQUO = "\u201D"
+
+# Words that start with an apostrophe. Cribbed from Wordpress.
+_ASTART = [
+    "'tain't",
+    "'twere",
+    "'twas",
+    "'tis",
+    "'twill",
+    "'til",
+    "'bout",
+    "'nuff",
+    "'round",
+    "'cause",
+    "'em",
+]
+
+# HTML tags whose content is raw data
+_RAW = {"script", "style"}
+
+# HTML block-level elements
+_BLOCK = {
+    "address", "blockquote", "div", "dl", "fieldset", "form",
+    "h1", "h2", "h3", "h4", "h5", "h6",
+    "hr", "noscript", "ol", "p", "pre", "table", "ul",
+}
+
+# F u n c t i o n s
+
+def _is_cockney(pos, ws):
+    """
+    Report whether the text starting at ws[pos] is one of the _ASTART
+    words (words that legitimately begin with an apostrophe).
+
+    pos is the index of the candidate apostrophe; ws is the text being
+    examined and must support len(), indexing and slicing (the slice is
+    lower-cased and compared against each word). The word must not be
+    followed by another letter; a word that runs right up to the end of
+    ws counts as a match. Returns True on a match, otherwise False.
+    """
+    for word in _ASTART:
+        end = pos + len(word)
+        if ws[pos:end].lower() != word:
+            continue
+        # Only look at the following character when there is one.
+        if end >= len(ws) or not ws[end].isalpha():
+            return True
+    return False
+
+# C l a s s e s
+
+class BaseCurler:
+    """
+    Common base of the curler classes. Subclasses are expected to
+    override feed().
+    """
+    def feed(self):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+class TextCurler(BaseCurler):
+    """
+    For processing plain text. Assumes the entire text is a block; it is
+    the responsibility of the caller to break the input into paragraphs.
+
+    The workspace is rewritten in place, one character at a time, by a
+    small state machine:
+
+      _norm     outside any quotation
+      _seen_ld  inside a double-quoted passage
+      _seen_ls  inside a single-quoted passage nested in a double-quoted
+                one
+
+    The workspace must support len(), item get/set and slicing.
+    """
+    def __init__(self, workspace):
+        self.workspace = workspace
+        self._state = self._norm
+        self._pos = 0
+
+    def feed(self):
+        """
+        Curl the quotes in the workspace, in place, in a single pass.
+        """
+        self._pos = 0
+        self._state = self._norm
+        for self._pos in range(len(self.workspace)):
+            self._state()
+
+    def _peek(self, offset):
+        """
+        Return the character offset places from the current position, or
+        "" when that falls outside the workspace. Every str.isXXX() test
+        is False for "", so callers need no separate bounds check.
+        """
+        where = self._pos + offset
+        if 0 <= where < len(self.workspace):
+            return self.workspace[where]
+        return ""
+
+    def _is_cockney(self):
+        """
+        Return True if the current position starts one of the _ASTART
+        words (case-insensitively) and that word is not followed by
+        another letter; otherwise return False.
+        """
+        pos = self._pos
+        ws = self.workspace
+        for word in _ASTART:
+            size = len(word)
+            if ws[pos:pos+size].lower() == word and not self._peek(size).isalpha():
+                return True
+        return False
+
+    def _norm(self):
+        """State: outside any quotation."""
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "\"":
+            # opening double quote
+            ws[pos] = LDQUO
+            self._state = self._seen_ld
+        elif char == "'":
+            # in this state, ' is always an apostrophe
+            ws[pos] = APOS
+
+    def _seen_ld(self):
+        """State: inside a double-quoted passage."""
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "\"":
+            # closing double quote
+            ws[pos] = RDQUO
+            self._state = self._norm
+        elif char == "'":
+            if self._peek(-1).isalpha():
+                # either an inter-word, or an end of word, apostrophe
+                ws[pos] = APOS
+            elif self._peek(1).isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            else:
+                # opening single quote
+                ws[pos] = LSQUO
+                self._state = self._seen_ls
+
+    def _seen_ls(self):
+        """State: inside a single-quoted passage."""
+        pos = self._pos
+        ws = self.workspace
+        if ws[pos] == "'":
+            if self._peek(-1).isalpha() and self._peek(1).isalpha():
+                # obvious apostrophe
+                ws[pos] = APOS
+            elif self._peek(1).isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            elif self._peek(-1).isspace():
+                # start of word apostrophe
+                ws[pos] = APOS
+            else:
+                # closing single quote; resume the enclosing double quote
+                ws[pos] = RSQUO
+                self._state = self._seen_ld
+
+class HtmlCurler(BaseCurler):
+    """
+    For processing HTML. Uses HTML block tags to delimit blocks.
+
+    The workspace is rewritten in place, one character at a time, by a
+    state machine. Everything between "<" and ">" is left untouched, as
+    is the content of the raw-data elements named in _RAW. A block-level
+    tag (opening or closing, per _BLOCK) resets the quote state to
+    _norm; any other tag resumes the state that was active before it.
+
+    The workspace must support len(), item get/set and slicing.
+    """
+    def __init__(self, workspace):
+        self.workspace = workspace
+        self._state = self._norm
+        self._pos = 0
+        self._ltpos = 0        # position of the most recent "<"
+        self._endtag = None    # "</name" that ends the current raw element
+        self._ltstate = None   # state to resume after an inline tag
+
+    def feed(self):
+        """
+        Curl the quotes in the workspace, in place, in a single pass.
+        """
+        self._pos = 0
+        self._state = self._norm
+        for self._pos in range(len(self.workspace)):
+            self._state()
+
+    def _peek(self, offset):
+        """
+        Return the character offset places from the current position, or
+        "" when that falls outside the workspace. Every str.isXXX() test
+        is False for "", so callers need no separate bounds check.
+        """
+        where = self._pos + offset
+        if 0 <= where < len(self.workspace):
+            return self.workspace[where]
+        return ""
+
+    def _is_cockney(self):
+        """
+        Return True if the current position starts one of the _ASTART
+        words (case-insensitively) and that word is not followed by
+        another letter; otherwise return False.
+        """
+        pos = self._pos
+        ws = self.workspace
+        for word in _ASTART:
+            size = len(word)
+            if ws[pos:pos+size].lower() == word and not self._peek(size).isalpha():
+                return True
+        return False
+
+    def _goto_lt(self):
+        """
+        Enter the in-tag state, remembering where the "<" was and which
+        state to come back to.
+        """
+        self._ltpos = self._pos
+        self._ltstate = self._state
+        self._state = self._seen_lt
+
+    def _norm(self):
+        """State: outside any quotation."""
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "\"":
+            # opening double quote
+            ws[pos] = LDQUO
+            self._state = self._seen_ld
+        elif char == "'":
+            # in this state, ' is always an apostrophe
+            ws[pos] = APOS
+
+    def _gettag(self, start):
+        """
+        Return the lower-cased run of alphanumeric characters that begins
+        at start (possibly empty).
+        """
+        ws = self.workspace
+        limit = len(ws)
+        end = start
+        while end < limit and ws[end].isalnum():
+            end += 1
+        return ws[start:end].lower()
+
+    def _seen_lt(self):
+        """
+        State: inside a tag. Nothing happens until the closing ">", at
+        which point the tag name decides the next state.
+        """
+        pos = self._pos
+        ws = self.workspace
+        if ws[pos] == ">":
+            start = self._ltpos + 1
+            if ws[start] == '/':
+                if self._gettag(start + 1) in _BLOCK:
+                    self._state = self._norm
+                else:
+                    self._state = self._ltstate
+            else:
+                tag = self._gettag(start)
+                if tag in _BLOCK:
+                    self._state = self._norm
+                elif tag in _RAW:
+                    self._state = self._raw
+                    self._endtag = "</" + tag
+                else:
+                    self._state = self._ltstate
+
+    def _raw(self):
+        """State: inside a raw-data element; its content is not curled."""
+        pos = self._pos
+        ws = self.workspace
+        size = len(self._endtag)
+        # only a matching end tag gets us out of the raw state
+        if (ws[pos] == '<' and ws[pos:pos+size].lower() == self._endtag
+                and not self._peek(size).isalnum()):
+            self._ltpos = pos
+            self._state = self._seen_lt
+
+    def _seen_ld(self):
+        """State: inside a double-quoted passage."""
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "\"":
+            # closing double quote
+            ws[pos] = RDQUO
+            self._state = self._norm
+        elif char == "'":
+            if self._peek(-1).isalpha():
+                # either an inter-word, or an end of word, apostrophe
+                ws[pos] = APOS
+            elif self._peek(1).isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            else:
+                # opening single quote
+                ws[pos] = LSQUO
+                self._state = self._seen_ls
+
+    def _seen_ls(self):
+        """State: inside a single-quoted passage."""
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "'":
+            if self._peek(-1).isalpha() and self._peek(1).isalpha():
+                # obvious apostrophe
+                ws[pos] = APOS
+            elif self._peek(1).isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            elif self._peek(-1).isspace():
+                # start of word apostrophe
+                ws[pos] = APOS
+            else:
+                # closing single quote; resume the enclosing double quote
+                ws[pos] = RSQUO
+                self._state = self._seen_ld