Mercurial > cgi-bin > hgweb.cgi > curlyq
changeset 3:091c03f1b2e8
Getting it working...
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 19:54:45 -0800 |
parents | 8884b0bf779d |
children | 7a83e82e65a6 |
files | curlers.py curlyq workspace.py writer.py |
diffstat | 4 files changed, 411 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# curlers.py
# Classes for curling both HTML and plain text.

# I m p o r t s

import os
import sys

from workspace import Workspace

# V a r i a b l e s

# Quote types
LSQUO = "\u2018"
APOS = RSQUO = "\u2019"
LDQUO = "\u201C"
RDQUO = "\u201D"

# Words that start with an apostrophe. Cribbed from Wordpress.
_ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
    "'bout", "'nuff", "'round", "'cause" , "'em" ]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
    "table", "ul"
}

# F u n c t i o n s

def _is_cockney(pos, ws):
    """
    Return True if one of the words in _ASTART (words that begin with an
    apostrophe, e.g. 'tis, 'em) starts at index pos of ws, else False.

    pos -- index of the straight single quote being examined
    ws  -- the workspace; must support slicing to a string and
           single-character reads
    """
    for word in _ASTART:
        end = pos + len(word)
        # The character after the candidate must not be a letter, so that
        # e.g. "'empty" is not mistaken for "'em". NOTE: ws[end] may lie
        # just past the end of the text; this relies on the workspace
        # returning an empty string for such reads (see workspace.py).
        if ws[pos:end].lower() == word and not ws[end].isalpha():
            return True
    return False

# C l a s s e s

class BaseCurler():
    """
    Common base for the curlers. Subclasses hold the text in .workspace,
    the index being examined in ._pos, and implement feed().
    """
    def feed(self):
        """
        Curl the quotes in the workspace, in place. Must be overridden.
        """
        raise NotImplementedError()

    def _is_cockney(self):
        """
        Method form of the module-level _is_cockney(), applied to the
        current position of this curler.
        """
        return _is_cockney(self._pos, self.workspace)

class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.
    """
    def __init__(self, workspace):
        """
        Constructor. The workspace is modified in place by feed().
        """
        self.workspace = workspace
        # The current state is a bound method; it gets called once per
        # character and may replace itself to change states.
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """
        Make one pass over the workspace, replacing straight quotes with
        curly ones. Always starts in the normal (unquoted) state.
        """
        self._pos = 0
        self._state = self._norm
        # The loop variable is the instance attribute, so the state
        # methods see the current index in self._pos.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _norm(self):
        """
        State: not inside any quotation.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """
        State: inside a double-quoted passage.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if ws[pos-1].isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif ws[pos+1].isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single-quoted passage nested in a double-quoted
        one (closing the single quote returns to the double-quoted state).
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] == "'":
            if ws[pos-1].isalpha() and ws[pos+1].isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif ws[pos+1].isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            elif ws[pos-1].isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld

class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.
    """
    def __init__(self, workspace):
        """
        Constructor. The workspace is modified in place by feed().
        """
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        # Index of the "<" that started the tag being scanned
        self._ltpos = 0
        # Lower-case "</tag" prefix that terminates the current raw section
        self._endtag = None
        # State to resume once a non-block tag has been skipped
        self._ltstate = None

    def feed(self):
        """
        Make one pass over the workspace, replacing straight quotes with
        curly ones. Always starts in the normal (unquoted) state.
        """
        self._pos = 0
        self._state = self._norm
        # The loop variable is the instance attribute, so the state
        # methods see the current index in self._pos.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _goto_lt(self):
        """
        Enter the in-a-tag state, remembering where the tag started and
        which state to resume afterwards.
        """
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """
        State: not inside any tag or quotation.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return the lower-cased run of alphanumeric characters that begins
        at index start (the tag name, when start follows "<" or "</").
        """
        ws = self.workspace
        end = start
        while ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag. Nothing is curled here; the only character
        of interest is the ">" that ends the tag.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] == ">":
            start = self._ltpos + 1
            if ws[start] == '/':
                # end tag: a block boundary resets quoting, anything
                # else resumes where we left off
                if self._gettag(start + 1) in _BLOCK:
                    self._state = self._norm
                else:
                    self._state = self._ltstate
            else:
                tag = self._gettag(start)
                if tag in _BLOCK:
                    self._state = self._norm
                elif tag in _RAW:
                    self._state = self._raw
                    self._endtag = "</" + tag
                else:
                    self._state = self._ltstate

    def _raw(self):
        """
        State: inside the raw content of one of the _RAW elements.
        """
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos] == '<' and ws[pos:end].lower() == self._endtag and (not ws[end].isalnum()):
            # _ltstate is deliberately left alone: it still holds the
            # state that was current before the raw element opened.
            self._ltpos = pos
            self._state = self._seen_lt

    def _seen_ld(self):
        """
        State: inside a double-quoted passage.
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "\"":
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if ws[pos-1].isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif ws[pos+1].isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """
        State: inside a single-quoted passage nested in a double-quoted
        one (closing the single quote returns to the double-quoted state).
        """
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            if ws[pos-1].isalpha() and ws[pos+1].isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif ws[pos+1].isdecimal() or _is_cockney(pos, ws):
                # also an apostrophe
                ws[pos] = APOS
            elif ws[pos-1].isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote
                ws[pos] = RSQUO
                self._state = self._seen_ld
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# curlyq
# Command-line front end: reads text or HTML, replaces straight quotes
# with curly ones, and writes the result.

import os, sys
import argparse
import codecs

from curlers import TextCurler, HtmlCurler
from workspace import Workspace, Bounds, Mapping, SegmentedView
from writer import CODECS_TO_NAME

# V a r i a b l e s

# Name invoked by
MYNAME = os.path.basename(sys.argv[0])

# Streams
input_fp = None
output_fp = None

# F u n c t i o n s

def normal():
    """
    Plain-text mode: a paragraph is a run of lines ended by an empty line
    ("\\n") or by end of input. Each paragraph is curled as one block.
    """
    with Workspace() as ws:
        curler = TextCurler(ws)
        while True:
            line = input_fp.readline()
            ws.write(line)
            # An empty line or EOF ends the paragraph gathered so far
            if line == "" or line == "\n":
                curler.feed()
                output_fp.write(ws.getvalue())
                ws.clear()
            if line == "":
                break

def flowed():
    """
    Flowed-text mode: every input line is curled as a block of its own.
    """
    with Workspace() as ws:
        curler = TextCurler(ws)
        while True:
            line = input_fp.readline()
            if line == "":
                break
            ws.write(line)
            curler.feed()
            output_fp.write(ws.getvalue())
            ws.clear()

def html():
    """
    HTML mode: the whole input is read at once and curled in one pass;
    the curler itself uses block tags to delimit blocks.
    """
    with Workspace(input_fp.read()) as ws:
        curler = HtmlCurler(ws)
        curler.feed()
        output_fp.write(ws.getvalue())

# M a i n   P r o g r a m

# Parse arguments
parser = argparse.ArgumentParser(description='Convert straight quotes to curly quotes.', prog=MYNAME)
group = parser.add_mutually_exclusive_group()
group.add_argument("--flowed", action="store_true", help="Input is flowed text.")
group.add_argument("--html", action="store_true", help="Input is HTML.")
parser.add_argument("--force", action="store_true", help="Force all quotes to straight ones first.")
parser.add_argument("--icoding", default="UTF-8", help="Input encoding.")
parser.add_argument("--inplace", action="store_true", help="Edit file in-place.")
parser.add_argument("--ocoding", default="UTF-8", help="Output encoding.")
parser.add_argument("input", nargs="?", help="Input file.")
parser.add_argument("output", nargs="?", help="Output file.")
try:
    args = parser.parse_args()
except SystemExit as e:
    # Usage errors exit with status 2, but a successful --help must
    # still report success.
    sys.exit(2 if e.code else 0)

# Sanity-check codings
try:
    # The input coding only needs to exist; the output coding is also
    # checked below for Unicode support.
    codecs.lookup(args.icoding)
    codec = codecs.lookup(args.ocoding)
except LookupError as e:
    sys.stderr.write("{0}: {1!s}\n".format(MYNAME, e))
    sys.exit(2)
if not CODECS_TO_NAME.get(codec, "").startswith("UTF-"):
    sys.stderr.write("{0}: {1!s} output coding does not support Unicode\n".format(MYNAME, args.ocoding))
    sys.exit(1)
del codec

# Get streams
try:
    if args.input and (not args.output) and args.inplace:
        # In-place edit: move the original aside to "<name>~", read from
        # that backup, and write the result under the original name.
        args.output = args.input
        args.input += "~"
        os.rename(args.output, args.input)
    if args.input:
        input_fp = open(args.input, "r", encoding=args.icoding)
    else:
        # closefd=False: closing the wrapper must not close the
        # process's standard input itself
        input_fp = open(0, "r", encoding=args.icoding, closefd=False)
    if args.output:
        output_fp = open(args.output, "w", encoding=args.ocoding)
    else:
        output_fp = open(1, "w", encoding=args.ocoding, closefd=False)
except (OSError, LookupError) as e:
    sys.stderr.write("{0}: {1!s}\n".format(MYNAME, e))
    sys.exit(1)

# Choose our mode
try:
    if args.flowed:
        flowed()
    elif args.html:
        html()
    else:
        normal()
finally:
    # Make sure buffered output reaches its destination even on error
    output_fp.close()
    input_fp.close()
--- a/workspace.py Thu Dec 26 13:18:53 2019 -0800 +++ b/workspace.py Thu Dec 26 19:54:45 2019 -0800 @@ -1,18 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# A class that implements a workspace for curly-quoting a text. This is enough -# like a string that it can be accessed via subscripts and ranges, and enough -# like a TextIOBase object that it can be written to much like a stream. -# (However, a Workspace is neither a string nor a TextIOBase object.) -# -# The advantage of using UTF-16 (as we do here) is that all quotation marks -# of interest are represented in a single 16-bit value, so changing straight -# quotes to curly ones can be accomplished most easily. -# -# It was a deliberate design decision to return empty strings when reading -# out-of-range indices but to throw exceptions when attempting to write -# them, because both decisions made coding easier in other modules. +# Classes that implement a workspace for curly-quoting a text, and views +# into the same. # I m p o r t s @@ -24,7 +14,23 @@ # C l a s s e s +# Our workspace class. This is enough like a string that it can be +# accessed via subscripts and ranges, and enough like a TextIOBase object +# that it can be written to much like a stream. (However, a Workspace is +# neither a string nor a TextIOBase object.) +# +# The advantage of using UTF-16 (as we do here) is that all quotation +# marks of interest are represented in a single 16-bit value, so changing +# straight quotes to curly ones can be accomplished most easily. +# +# It was a deliberate design decision to return empty strings when reading +# out-of-range indices but to throw exceptions when attempting to write +# them, because both decisions made coding easier in other modules. class Workspace(object): + """ + A workspace for text-processing; a mutable hybrid of a string and an + in-memory file. 
+ """ # The most efficient 16-bit one on this platform encoding = "UTF-16" + sys.byteorder[0].upper() + "E" codec = codecs.lookup(encoding) @@ -35,7 +41,6 @@ """ Constructor. """ - self._length = 0 if initial_data is not None: data = initial_data.encode(self.encoding, self.errors) self._fp = io.BytesIO(data) @@ -82,6 +87,23 @@ """ self._fp.write(string.encode(self.encoding, self.errors)) + def truncate(self, size=None): + """ + Truncate. + XXX - can create a runt surrogate pair + """ + if size is None: + self._fp.truncate(None) + else: + self._fp.truncate(2 * size) + + def clear(self): + """ + Clear this object's contents. + """ + self.truncate(0) + self.seek(0, os.SEEK_SET) + def __len__(self): """ Length in characters. @@ -162,6 +184,9 @@ return False class Bounds(object): + """ + A set of index bounds. + """ def __init__(self, start, stop): if start > stop or start < 0 or stop < 0: raise ValueError("invalid bounds") @@ -199,6 +224,10 @@ return "{0}({1!r}, {2!r})".format(self.__class__.__name__, self.start, self.stop) class Mapping(object): + """ + Represents a mapping of a single view segment into an indexable + object. + """ def __init__(self, bounds, offset): if not isinstance(bounds, Bounds): raise TypeError("bounds must be a Bounds object") @@ -216,7 +245,8 @@ Implements a view on a subscriptable object. The view is composed of zero or more segments of the source object. Has the same idiosyncratic behavior for out-of-bounds indices that Workspace has (and for the - same reason). + same reason). Mutating this object causes the parent object to also + be mutated. """ def __init__(self, indexable, bounds): self.indexable = indexable @@ -254,12 +284,17 @@ return None def __setitem__(self, key, value): + """ + Direct access to replace a single character. 
+ """ if not isinstance(key, int): raise TypeError("__setitem__ only supports integers") self.indexable[self._mapped(key)] = value - # XXX - this is sorta brute-forced and could be more efficient def __getitem__(self, key): + """ + Direct access to a single character or range of characters. + """ # Trivial cases if isinstance(key, int): return self._get1(key)
--- a/writer.py Thu Dec 26 13:18:53 2019 -0800 +++ b/writer.py Thu Dec 26 19:54:45 2019 -0800 @@ -17,9 +17,9 @@ # We only support ASCII, ISO-8859-1, and anything capable of encoding # the full Unicode set. Anything else is too sticky a wicket to want # to mess with. -_CODECS_TO_NAME = {} +CODECS_TO_NAME = {} for i in [ "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]: - _CODECS_TO_NAME[codecs.lookup(i)] = i + CODECS_TO_NAME[codecs.lookup(i)] = i del i _MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 } @@ -57,7 +57,7 @@ # A codec to use is available to the caller as .codec self.codec = codecs.lookup(encoding) # Normalized encoding name is available to the caller as .encoding - self.encoding = _CODECS_TO_NAME[self.codec] + self.encoding = CODECS_TO_NAME[self.codec] except (KeyError, LookupError) as e: raise ValueError("invalid encoding {0!r}".format(encoding)) self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)