# HG changeset patch # User David Barts # Date 1577418885 28800 # Node ID 091c03f1b2e865fe1c022ac0e314815f5fe7b478 # Parent 8884b0bf779dbdae262aeef7d50f96b785e1b07f Getting it working... diff -r 8884b0bf779d -r 091c03f1b2e8 curlers.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/curlers.py Thu Dec 26 19:54:45 2019 -0800 @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Classes for curling both HTML and plain text. + +# I m p o r t s + +import os, sys +from workspace import Workspace + +# V a r i a b l e s + +# Quote types +LSQUO = "\u2018" +APOS = RSQUO = "\u2019" +LDQUO = "\u201C" +RDQUO = "\u201D" + +# Words that start with an apostrophe. Cribbed from Wordpress. +_ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til", + "'bout", "'nuff", "'round", "'cause" , "'em" ] + +# HTML tags that enclose raw data +_RAW = set(["script", "style"]) + +# HTML block elements +_BLOCK = set([ + "address", "blockquote", "div", "dl", "fieldset", "form", "h1", + "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre", + "table", "ul" +]) + +# F u n c t i o n s + +def _is_cockney(pos, ws): + pos = self._pos + ws = self.workspace + for i in _ASTART: + li = len(i) + if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): + return True + +# C l a s s e s + +class BaseCurler(): + def feed(self): + raise NotImplementedError() + +class TextCurler(BaseCurler): + """ + For processing plain text. Assumes the entire text is a block; it is + the responsibility of the caller to break the input into paragraphs. + """ + def __init__(self, workspace): + self.workspace = workspace + self._state = self._norm + self._pos = 0 + + def feed(self): + self._pos = 0 + self._state = self._norm + for self._pos in range(len(self.workspace)): + self._state() + + def _is_cockney(self): + pos = self._pos + ws = self.workspace + for i in _ASTART: + li = len(i) + print("comparing {0!r} and {1!r}\n".format(ws[pos:pos+li].lower(), i)) + if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): + return True + + def _norm(self): + pos = self._pos + ws = self.workspace + char = ws[pos] + if char == "\"": + # opening double quote + ws[pos] = LDQUO + self._state = self._seen_ld + elif char == "'": + # in this state, ' is always an apostrophe + ws[pos] = APOS + + def _seen_ld(self): + pos = self._pos + ws = self.workspace + char = ws[pos] + if char == "\"": + # closing double quote + ws[pos] = RDQUO + self._state = self._norm + elif char == "'": + if ws[pos-1].isalpha(): + # either an inter-word, or an end of word, apostrophe + ws[pos] = APOS + elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): + # also an apostrophe + ws[pos] = APOS + else: + # opening single quote + ws[pos] = LSQUO + self._state = self._seen_ls + + def _seen_ls(self): + pos = self._pos + ws = self.workspace + if ws[pos] == "'": + if ws[pos-1].isalpha() and ws[pos+1].isalpha(): + # obvious apostrophe + ws[pos] = APOS + elif ws[pos+1].isdecimal() or _is_cockney(pos, ws): + # also an apostrophe + ws[pos] = APOS + elif ws[pos-1].isspace(): + # start of word apostrophe + ws[pos] = APOS + else: + # closing single quote + ws[pos] = RSQUO + self._state = self._seen_ld + +class HtmlCurler(BaseCurler): + """ + For processing HTML. Uses HTML block tags to delimit blocks. + """ + def __init__(self, workspace): + self.workspace = workspace + self._state = self._norm + self._pos = 0 + self._ltpos = 0 + self._endtag = None + self._ltstate = None + + def feed(self): + self._pos = 0 + self._state = self._norm + for self._pos in range(len(self.workspace)): + self._state() + + def _is_cockney(self): + pos = self._pos + ws = self.workspace + for i in _ASTART: + li = len(i) + print("comparing {0!r} and {1!r}\n".format(ws[pos:pos+li].lower(), i)) + if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): + return True + + def _goto_lt(self): + self._ltpos = self._pos + self._ltstate = self._state + self._state = self._seen_lt + + def _norm(self): + pos = self._pos + ws = self.workspace + char = ws[pos] + if char == "<": + self._goto_lt() + elif char == "\"": + # opening double quote + ws[pos] = LDQUO + self._state = self._seen_ld + elif char == "'": + # in this state, ' is always an apostrophe + ws[pos] = APOS + + def _gettag(self, start): + ws = self.workspace + end = start + while ws[end].isalnum(): + end += 1 + return ws[start:end].lower() + + def _seen_lt(self): + pos = self._pos + ws = self.workspace + if ws[pos] == ">": + start = self._ltpos + 1 + if ws[start] == '/': + if self._gettag(start + 1) in _BLOCK: + self._state = self._norm + else: + self._state = self._ltstate + else: + tag = self._gettag(start) + if tag in _BLOCK: + self._state = self._norm + elif tag in _RAW: + self._state = self._raw + self._endtag = " stop or start < 0 or stop < 0: raise ValueError("invalid bounds") @@ -199,6 +224,10 @@ return "{0}({1!r}, {2!r})".format(self.__class__.__name__, self.start, self.stop) class Mapping(object): + """ + Represents a mapping of a single view segment into an indexable + object. + """ def __init__(self, bounds, offset): if not isinstance(bounds, Bounds): raise TypeError("bounds must be a Bounds object") @@ -216,7 +245,8 @@ Implements a view on a subscriptable object. The view is composed of zero or more segments of the source object. Has the same idiosyncratic behavior for out-of-bounds indices that Workspace has (and for the - same reason). + same reason). Mutating this object causes the parent object to also + be mutated. """ def __init__(self, indexable, bounds): self.indexable = indexable @@ -254,12 +284,17 @@ return None def __setitem__(self, key, value): + """ + Direct access to replace a single character. + """ if not isinstance(key, int): raise TypeError("__setitem__ only supports integers") self.indexable[self._mapped(key)] = value - # XXX - this is sorta brute-forced and could be more efficient def __getitem__(self, key): + """ + Direct access to a single character or range of characters. + """ # Trivial cases if isinstance(key, int): return self._get1(key) diff -r 8884b0bf779d -r 091c03f1b2e8 writer.py --- a/writer.py Thu Dec 26 13:18:53 2019 -0800 +++ b/writer.py Thu Dec 26 19:54:45 2019 -0800 @@ -17,9 +17,9 @@ # We only support ASCII, ISO-8859-1, and anything capable of encoding # the full Unicode set. Anything else is too sticky a wicket to want # to mess with. -_CODECS_TO_NAME = {} +CODECS_TO_NAME = {} for i in [ "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]: - _CODECS_TO_NAME[codecs.lookup(i)] = i + CODECS_TO_NAME[codecs.lookup(i)] = i del i _MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 } @@ -57,7 +57,7 @@ # A codec to use is available to the caller as .codec self.codec = codecs.lookup(encoding) # Normalized encoding name is available to the caller as .encoding - self.encoding = _CODECS_TO_NAME[self.codec] + self.encoding = CODECS_TO_NAME[self.codec] except (KeyError, LookupError) as e: raise ValueError("invalid encoding {0!r}".format(encoding)) self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)