view curlers.py @ 7:9df9ff8cecde

Undo that; ignoring <pre> is a sticky wicket.
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 20:56:38 -0800
parents da3fb2312c88
children 397c178c5b98
line wrap: on
line source

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Classes for curling both HTML and plain text.

# I m p o r t s

import os, sys
from workspace import Workspace

# V a r i a b l e s

# Curly quote characters
LSQUO = "\u2018"
RSQUO = "\u2019"
APOS = RSQUO    # an apostrophe is rendered as a right single quote
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe. Cribbed from Wordpress.
_ASTART = [
    "'tain't",
    "'twere",
    "'twas",
    "'tis",
    "'twill",
    "'til",
    "'bout",
    "'nuff",
    "'round",
    "'cause",
    "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "noscript", "ol", "p", "pre", "table", "ul",
}

# F u n c t i o n s

def uncurl(ws):
    """
    Makes all quotes in the workspace non-curly.

    Curly double quotes become a straight double quote and curly single
    quotes (which includes apostrophes) become a straight single quote.
    The workspace is modified in place; nothing is returned.
    """
    # Built once, rather than constructing two sets for every character.
    straight = {LDQUO: '"', RDQUO: '"', LSQUO: "'", RSQUO: "'"}
    # Walk by index: the replacement is written back through ws[i].
    for i in range(len(ws)):
        plain = straight.get(ws[i])
        if plain is not None:
            ws[i] = plain

def _is_cockney(pos, ws):
    """
    Reports whether the text starting at pos is one of the _ASTART words.

    pos is the index in ws at which the candidate word starts. The
    comparison is case-insensitive, and a match only counts when the
    word is not immediately followed by another letter. Returns True
    on a match, False otherwise.
    """
    # NOTE: the arguments are used as passed; there is no 'self' at
    # module level to read them from.
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # A word that runs to the very end of the text has nothing after
        # it, so there is no following letter to disqualify the match.
        if end >= len(ws) or not ws[end].isalpha():
            return True
    return False

# C l a s s e s

class BaseCurler:
    """
    Common ancestor of the curler classes. It provides no behaviour of
    its own; feed() has to be overridden.
    """
    def feed(self):
        """
        Always raises NotImplementedError.
        """
        raise NotImplementedError

class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    Implemented as a state machine: feed() visits every position of the
    workspace and calls the current state method, which may rewrite the
    character at that position and select the next state.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Curls the straight quotes in the workspace, in place.
        """
        self._pos = 0
        self._state = self._norm
        # The loop variable is the attribute itself, so the state methods
        # always see the current position in self._pos.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Returns the character offset places away from the current
        position, or an empty string when that lies outside the
        workspace. All the str.is*() tests are False for an empty
        string, so callers need no special case for the text edges.
        """
        where = self._pos + offset
        if 0 <= where < len(self.workspace):
            return self.workspace[where]
        return ""

    def _is_cockney(self):
        """
        Reports whether one of the _ASTART words starts at the current
        position (case-insensitively) and is not directly followed by
        another letter.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            li = len(word)
            if ws[pos:pos+li].lower() == word and not self._peek(li).isalpha():
                return True
        return False

    def _norm(self):
        """
        State: not inside any quotation.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """
        State: inside a double-quoted passage.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single-quoted passage nested in a double-quoted
        one.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            if self._peek(-1).isalpha() and self._peek(1).isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif self._peek(-1).isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld

class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    Implemented as a state machine: feed() visits every position of the
    workspace and calls the current state method, which may rewrite the
    character at that position and select the next state. Text inside
    tags, and inside the elements named in _RAW, is left untouched.
    """
    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        # position of the '<' that opened the tag being scanned
        self._ltpos = 0
        # lower-case "</name" that terminates the current raw element
        self._endtag = None
        # state to resume once the tag being scanned has been closed
        self._ltstate = None

    def feed(self):
        """
        Curls the straight quotes in the workspace, in place.
        """
        self._pos = 0
        self._state = self._norm
        # The loop variable is the attribute itself, so the state methods
        # always see the current position in self._pos.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _peek(self, offset):
        """
        Returns the character offset places away from the current
        position, or an empty string when that lies outside the
        workspace. All the str.is*() tests are False for an empty
        string, so callers need no special case for the text edges.
        """
        where = self._pos + offset
        if 0 <= where < len(self.workspace):
            return self.workspace[where]
        return ""

    def _is_cockney(self):
        """
        Reports whether one of the _ASTART words starts at the current
        position (case-insensitively) and is not directly followed by
        another letter.
        """
        pos = self._pos
        ws = self.workspace
        for word in _ASTART:
            li = len(word)
            if ws[pos:pos+li].lower() == word and not self._peek(li).isalpha():
                return True
        return False

    def _goto_lt(self):
        """
        Enters the tag-scanning state, remembering where the tag began
        and which state was active so that it can be resumed later.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """
        State: not inside any quotation.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Returns, in lower case, the run of alphanumeric characters that
        begins at start; this is empty if there is none.
        """
        ws = self.workspace
        end = start
        # never scan beyond the end of the workspace
        while end < len(ws) and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag. Nothing happens until the closing '>' is
        reached; the tag name then decides the next state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] == ">":
            start = self._ltpos + 1
            if ws[start] == '/':
                # end tag: a block ends any quotation in progress,
                # anything else resumes where we left off
                if self._gettag(start + 1) in _BLOCK:
                    self._state = self._norm
                else:
                    self._state = self._ltstate
            else:
                tag = self._gettag(start)
                if tag in _BLOCK:
                    # a new block starts with no quotation in progress
                    self._state = self._norm
                elif tag in _RAW:
                    self._state = self._raw
                    self._endtag = "</" + tag
                else:
                    self._state = self._ltstate

    def _raw(self):
        """
        State: inside a raw element, whose content is not touched.
        """
        pos = self._pos
        ws = self.workspace
        taglen = len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if (ws[pos] == '<'
                and ws[pos:pos+taglen].lower() == self._endtag
                and not self._peek(taglen).isalnum()):
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """
        State: inside a double-quoted passage.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._peek(-1).isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single-quoted passage nested in a double-quoted
        one.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            if self._peek(-1).isalpha() and self._peek(1).isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif self._peek(1).isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif self._peek(-1).isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld