changeset 3:091c03f1b2e8

Getting it working...
author David Barts <n5jrn@me.com>
date Thu, 26 Dec 2019 19:54:45 -0800
parents 8884b0bf779d
children 7a83e82e65a6
files curlers.py curlyq workspace.py writer.py
diffstat 4 files changed, 411 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/curlers.py	Thu Dec 26 19:54:45 2019 -0800
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Classes for curling both HTML and plain text.
+
+# I m p o r t s
+
+import os, sys
+from workspace import Workspace
+
+# V a r i a b l e s
+
+# Curly quotation marks. The apostrophe is the same character as the
+# right single quote.
+LSQUO = "\u2018"
+RSQUO = "\u2019"
+APOS = RSQUO
+LDQUO = "\u201C"
+RDQUO = "\u201D"
+
+# Words that start with an apostrophe. Cribbed from WordPress.
+_ASTART = [
+    "'tain't", "'twere", "'twas", "'tis", "'twill", "'til", "'bout",
+    "'nuff", "'round", "'cause", "'em",
+]
+
+# HTML tags that enclose raw data
+_RAW = {"script", "style"}
+
+# HTML block elements
+_BLOCK = {
+    "address", "blockquote", "div", "dl", "fieldset", "form", "h1",
+    "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre",
+    "table", "ul",
+}
+
+# F u n c t i o n s
+
+def _is_cockney(pos, ws):
+    """
+    Return True if the text in ws starting at index pos is one of the
+    apostrophe-initial words in _ASTART (compared case-insensitively)
+    and the character right after that word is not alphabetic;
+    otherwise return False.
+
+    pos is the index of the candidate apostrophe and ws is the
+    indexable, sliceable text being examined.
+    """
+    # Use the arguments as passed; there is no "self" at module level.
+    for word in _ASTART:
+        end = pos + len(word)
+        if ws[pos:end].lower() == word and not ws[end].isalpha():
+            return True
+    return False
+
+# C l a s s e s
+
+class BaseCurler:
+    """
+    Common base for the curlers. Subclasses must supply feed().
+    """
+    def feed(self):
+        """
+        Process the text; not implemented in the base class.
+        """
+        raise NotImplementedError
+
+class TextCurler(BaseCurler):
+    """
+    For processing plain text. Assumes the entire text is a block; it is
+    the responsibility of the caller to break the input into paragraphs.
+
+    Implemented as a small state machine: self._state holds the bound
+    method that handles the character at index self._pos.
+    """
+    def __init__(self, workspace):
+        """
+        workspace is the indexable text buffer whose quotes are replaced
+        in place.
+        """
+        self.workspace = workspace
+        self._state = self._norm
+        self._pos = 0
+
+    def feed(self):
+        """
+        Curl every quote in the workspace, in place, starting from the
+        normal (not inside any quotation) state.
+        """
+        self._pos = 0
+        self._state = self._norm
+        for self._pos in range(len(self.workspace)):
+            self._state()
+
+    def _is_cockney(self):
+        """
+        Return True if the text at the current position is one of the
+        apostrophe-initial words in _ASTART (compared
+        case-insensitively) and is not followed by a letter; otherwise
+        return False.
+        """
+        pos = self._pos
+        ws = self.workspace
+        for word in _ASTART:
+            end = pos + len(word)
+            if ws[pos:end].lower() == word and not ws[end].isalpha():
+                return True
+        return False
+
+    def _norm(self):
+        """
+        State: not inside any quotation.
+        """
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "\"":
+            # opening double quote
+            ws[pos] = LDQUO
+            self._state = self._seen_ld
+        elif char == "'":
+            # in this state, ' is always an apostrophe
+            ws[pos] = APOS
+
+    def _seen_ld(self):
+        """
+        State: inside a double-quoted passage.
+        """
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "\"":
+            # closing double quote
+            ws[pos] = RDQUO
+            self._state = self._norm
+        elif char == "'":
+            if ws[pos-1].isalpha():
+                # either an inter-word, or an end of word, apostrophe
+                ws[pos] = APOS
+            elif ws[pos+1].isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            else:
+                # opening single quote
+                ws[pos] = LSQUO
+                self._state = self._seen_ls
+
+    def _seen_ls(self):
+        """
+        State: inside a single-quoted passage nested in a double-quoted
+        one.
+        """
+        pos = self._pos
+        ws = self.workspace
+        if ws[pos] == "'":
+            if ws[pos-1].isalpha() and ws[pos+1].isalpha():
+                # obvious apostrophe
+                ws[pos] = APOS
+            elif ws[pos+1].isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            elif ws[pos-1].isspace():
+                # start of word apostrophe
+                ws[pos] = APOS
+            else:
+                # closing single quote; back to the enclosing double quote
+                ws[pos] = RSQUO
+                self._state = self._seen_ld
+
+class HtmlCurler(BaseCurler):
+    """
+    For processing HTML. Uses HTML block tags to delimit blocks.
+
+    Implemented as a state machine: self._state holds the bound method
+    that handles the character at index self._pos. Text inside tags and
+    inside raw elements (see _RAW) is left untouched.
+    """
+    def __init__(self, workspace):
+        """
+        workspace is the indexable text buffer whose quotes are replaced
+        in place.
+        """
+        self.workspace = workspace
+        self._state = self._norm
+        self._pos = 0
+        # Index of the "<" that opened the tag currently being scanned
+        self._ltpos = 0
+        # "</name" prefix that terminates the current raw element
+        self._endtag = None
+        # State to resume once a non-block, non-raw tag has been skipped
+        self._ltstate = None
+
+    def feed(self):
+        """
+        Curl every quote in the workspace, in place, starting from the
+        normal (not inside any quotation) state.
+        """
+        self._pos = 0
+        self._state = self._norm
+        for self._pos in range(len(self.workspace)):
+            self._state()
+
+    def _is_cockney(self):
+        """
+        Return True if the text at the current position is one of the
+        apostrophe-initial words in _ASTART (compared
+        case-insensitively) and is not followed by a letter; otherwise
+        return False.
+        """
+        pos = self._pos
+        ws = self.workspace
+        for word in _ASTART:
+            end = pos + len(word)
+            if ws[pos:end].lower() == word and not ws[end].isalpha():
+                return True
+        return False
+
+    def _goto_lt(self):
+        """
+        Record where a tag starts and which state to resume after it,
+        then switch to tag-scanning.
+        """
+        self._ltpos = self._pos
+        self._ltstate = self._state
+        self._state = self._seen_lt
+
+    def _norm(self):
+        """
+        State: not inside any quotation.
+        """
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "\"":
+            # opening double quote
+            ws[pos] = LDQUO
+            self._state = self._seen_ld
+        elif char == "'":
+            # in this state, ' is always an apostrophe
+            ws[pos] = APOS
+
+    def _gettag(self, start):
+        """
+        Return the lower-cased run of alphanumeric characters that
+        begins at index start (the tag name).
+        """
+        ws = self.workspace
+        end = start
+        while ws[end].isalnum():
+            end += 1
+        return ws[start:end].lower()
+
+    def _seen_lt(self):
+        """
+        State: inside a tag. Nothing happens until the closing ">", at
+        which point the tag name decides the next state.
+        """
+        pos = self._pos
+        ws = self.workspace
+        if ws[pos] == ">":
+            start = self._ltpos + 1
+            if ws[start] == '/':
+                # end tag: a block boundary resets quoting, anything
+                # else resumes where we left off
+                if self._gettag(start + 1) in _BLOCK:
+                    self._state = self._norm
+                else:
+                    self._state = self._ltstate
+            else:
+                tag = self._gettag(start)
+                if tag in _BLOCK:
+                    self._state = self._norm
+                elif tag in _RAW:
+                    self._state = self._raw
+                    self._endtag = "</" + tag
+                else:
+                    self._state = self._ltstate
+
+    def _raw(self):
+        """
+        State: inside a raw element; the content is skipped unchanged.
+        """
+        pos = self._pos
+        ws = self.workspace
+        end = pos + len(self._endtag)
+        # only a matching end tag gets us out of the raw state
+        if ws[pos] == '<' and ws[pos:end].lower() == self._endtag and (not ws[end].isalnum()):
+            self._ltpos = pos
+            self._state = self._seen_lt
+
+    def _seen_ld(self):
+        """
+        State: inside a double-quoted passage.
+        """
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "\"":
+            # closing double quote
+            ws[pos] = RDQUO
+            self._state = self._norm
+        elif char == "'":
+            if ws[pos-1].isalpha():
+                # either an inter-word, or an end of word, apostrophe
+                ws[pos] = APOS
+            elif ws[pos+1].isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            else:
+                # opening single quote
+                ws[pos] = LSQUO
+                self._state = self._seen_ls
+
+    def _seen_ls(self):
+        """
+        State: inside a single-quoted passage nested in a double-quoted
+        one.
+        """
+        pos = self._pos
+        ws = self.workspace
+        char = ws[pos]
+        if char == "<":
+            self._goto_lt()
+        elif char == "'":
+            if ws[pos-1].isalpha() and ws[pos+1].isalpha():
+                # obvious apostrophe
+                ws[pos] = APOS
+            elif ws[pos+1].isdecimal() or self._is_cockney():
+                # also an apostrophe
+                ws[pos] = APOS
+            elif ws[pos-1].isspace():
+                # start of word apostrophe
+                ws[pos] = APOS
+            else:
+                # closing single quote; back to the enclosing double quote
+                ws[pos] = RSQUO
+                self._state = self._seen_ld
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/curlyq	Thu Dec 26 19:54:45 2019 -0800
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os, sys
+import argparse
+import codecs
+
+from curlers import TextCurler, HtmlCurler
+from workspace import Workspace, Bounds, Mapping, SegmentedView
+from writer import CODECS_TO_NAME
+
+# V a r i a b l e s
+
+# Name this program was invoked by; used as the prefix of error messages
+# and as the program name in the usage text
+MYNAME = os.path.basename(sys.argv[0])
+
+# Input and output text streams. These start out unset; the main program
+# opens them before one of the mode functions below is called.
+input_fp = None
+output_fp = None
+
+# F u n c t i o n s
+
+def normal():
+    """
+    Process plain text made of paragraphs. Lines are collected in a
+    workspace until a line consisting only of a newline, or end of
+    input, is read; the collected text is then curled as one block,
+    written out, and the workspace is cleared.
+    """
+    with Workspace() as ws:
+        curler = TextCurler(ws)
+        at_eof = False
+        while not at_eof:
+            line = input_fp.readline()
+            # readline() returns an empty string only at end of input
+            at_eof = (line == "")
+            ws.write(line)
+            if at_eof or line == "\n":
+                curler.feed()
+                output_fp.write(ws.getvalue())
+                ws.clear()
+
+def flowed():
+    """
+    Process flowed text: every input line is curled and written out as a
+    block of its own.
+    """
+    with Workspace() as ws:
+        curler = TextCurler(ws)
+        # readline() returns an empty string only at end of input
+        for line in iter(input_fp.readline, ""):
+            ws.write(line)
+            curler.feed()
+            output_fp.write(ws.getvalue())
+            ws.clear()
+
+def html():
+    """
+    Process HTML: read the whole input into a workspace, curl it in one
+    pass, and write out the result.
+    """
+    document = input_fp.read()
+    with Workspace(document) as ws:
+        HtmlCurler(ws).feed()
+        output_fp.write(ws.getvalue())
+
+# M a i n   P r o g r a m
+
+# Parse arguments
+parser = argparse.ArgumentParser(description='Convert straight quotes to curly quotes.', prog=MYNAME)
+group = parser.add_mutually_exclusive_group()
+group.add_argument("--flowed", action="store_true", help="Input is flowed text.")
+group.add_argument("--html", action="store_true", help="Input is HTML.")
+parser.add_argument("--force", action="store_true", help="Force all quotes to straight ones first.")
+parser.add_argument("--icoding", default="UTF-8", help="Input encoding.")
+parser.add_argument("--inplace", action="store_true", help="Edit file in-place.")
+parser.add_argument("--ocoding", default="UTF-8", help="Output encoding.")
+parser.add_argument("input", nargs="?", help="Input file.")
+parser.add_argument("output", nargs="?", help="Output file.")
+try:
+    args = parser.parse_args()
+except SystemExit as e:
+    # argparse exits both for usage errors and for --help. Usage errors
+    # get status 2; a successful --help keeps its status of 0.
+    sys.exit(2 if e.code else 0)
+
+# Sanity-check codings
+try:
+    # The input coding only has to exist
+    codecs.lookup(args.icoding)
+    # The output coding must also be able to represent curly quotes
+    codec = codecs.lookup(args.ocoding)
+except LookupError as e:
+    sys.stderr.write("{0}: {1!s}\n".format(MYNAME, e))
+    sys.exit(2)
+if not CODECS_TO_NAME.get(codec, "").startswith("UTF-"):
+    sys.stderr.write("{0}: {1!s} output coding does not support Unicode\n".format(MYNAME, args.ocoding))
+    sys.exit(1)
+del codec
+
+# Get streams
+try:
+    if args.input and (not args.output) and args.inplace:
+        # In-place edit: move the original aside to a "~" backup, read
+        # from the backup, and write the result under the original name.
+        args.output = args.input
+        args.input += "~"
+        os.rename(args.output, args.input)
+    if args.input:
+        input_fp = open(args.input, "r", encoding=args.icoding)
+    else:
+        # No input file named: read standard input (file descriptor 0)
+        input_fp = open(0, "r", encoding=args.icoding)
+    if args.output:
+        output_fp = open(args.output, "w", encoding=args.ocoding)
+    else:
+        # No output file named: write standard output (file descriptor 1)
+        output_fp = open(1, "w", encoding=args.ocoding)
+except (OSError, LookupError) as e:
+    sys.stderr.write("{0}: {1!s}\n".format(MYNAME, e))
+    sys.exit(1)
+
+# Choose our mode
+if args.flowed:
+    flowed()
+elif args.html:
+    html()
+else:
+    normal()
--- a/workspace.py	Thu Dec 26 13:18:53 2019 -0800
+++ b/workspace.py	Thu Dec 26 19:54:45 2019 -0800
@@ -1,18 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# A class that implements a workspace for curly-quoting a text. This is enough
-# like a string that it can be accessed via subscripts and ranges, and enough
-# like a TextIOBase object that it can be written to much like a stream.
-# (However, a Workspace is neither a string nor a TextIOBase object.)
-#
-# The advantage of using UTF-16 (as we do here) is that all quotation marks
-# of interest are represented in a single 16-bit value, so changing straight
-# quotes to curly ones can be accomplished most easily.
-#
-# It was a deliberate design decision to return empty strings when reading
-# out-of-range indices but to throw exceptions when attempting to write
-# them, because both decisions made coding easier in other modules.
+# Classes that implement a workspace for curly-quoting a text, and views
+# into the same.
 
 # I m p o r t s
 
@@ -24,7 +14,23 @@
 
 # C l a s s e s
 
+# Our workspace class. This is enough like a string that it can be
+# accessed via subscripts and ranges, and enough like a TextIOBase object
+# that it can be written to much like a stream. (However, a Workspace is
+# neither a string nor a TextIOBase object.)
+#
+# The advantage of using UTF-16 (as we do here) is that all quotation
+# marks of interest are represented in a single 16-bit value, so changing
+# straight quotes to curly ones can be accomplished most easily.
+#
+# It was a deliberate design decision to return empty strings when reading
+# out-of-range indices but to throw exceptions when attempting to write
+# them, because both decisions made coding easier in other modules.
 class Workspace(object):
+    """
+    A workspace for text-processing; a mutable hybrid of a string and an
+    in-memory file.
+    """
     # The most efficient 16-bit one on this platform
     encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
     codec = codecs.lookup(encoding)
@@ -35,7 +41,6 @@
         """
         Constructor.
         """
-        self._length = 0
         if initial_data is not None:
             data = initial_data.encode(self.encoding, self.errors)
             self._fp = io.BytesIO(data)
@@ -82,6 +87,23 @@
         """
         self._fp.write(string.encode(self.encoding, self.errors))
 
+    def truncate(self, size=None):
+        """
+        Truncate to size characters. The underlying buffer holds two
+        bytes per character, so the size handed to it is doubled. A size
+        of None is passed through to the underlying buffer unchanged.
+        XXX - can create a runt surrogate pair
+        """
+        byte_size = None if size is None else 2 * size
+        self._fp.truncate(byte_size)
+
+    def clear(self):
+        """
+        Clear this object's contents: truncate to zero characters, then
+        seek back to offset 0 from the start.
+        """
+        self.truncate(0)
+        self.seek(0, os.SEEK_SET)
+
     def __len__(self):
         """
         Length in characters.
@@ -162,6 +184,9 @@
         return False
 
 class Bounds(object):
+    """
+    A set of index bounds.
+    """
     def __init__(self, start, stop):
         if start > stop or start < 0 or stop < 0:
             raise ValueError("invalid bounds")
@@ -199,6 +224,10 @@
         return "{0}({1!r}, {2!r})".format(self.__class__.__name__, self.start, self.stop)
 
 class Mapping(object):
+    """
+    Represents a mapping of a single view segment into an indexable
+    object.
+    """
     def __init__(self, bounds, offset):
         if not isinstance(bounds, Bounds):
             raise TypeError("bounds must be a Bounds object")
@@ -216,7 +245,8 @@
     Implements a view on a subscriptable object. The view is composed of
     zero or more segments of the source object. Has the same idiosyncratic
     behavior for out-of-bounds indices that Workspace has (and for the
-    same reason).
+    same reason). Mutating this object causes the parent object to also
+    be mutated.
     """
     def __init__(self, indexable, bounds):
         self.indexable = indexable
@@ -254,12 +284,17 @@
         return None
 
     def __setitem__(self, key, value):
+        """
+        Direct access to replace a single character.
+        """
         if not isinstance(key, int):
             raise TypeError("__setitem__ only supports integers")
         self.indexable[self._mapped(key)] = value
 
-    # XXX - this is sorta brute-forced and could be more efficient
     def __getitem__(self, key):
+        """
+        Direct access to a single character or range of characters.
+        """
         # Trivial cases
         if isinstance(key, int):
             return self._get1(key)
--- a/writer.py	Thu Dec 26 13:18:53 2019 -0800
+++ b/writer.py	Thu Dec 26 19:54:45 2019 -0800
@@ -17,9 +17,9 @@
 # We only support ASCII, ISO-8859-1, and anything capable of encoding
 # the full Unicode set. Anything else is too sticky a wicket to want
 # to mess with.
-_CODECS_TO_NAME = {}
+CODECS_TO_NAME = {}
 for i in [ "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16LE", "UTF-16BE", "UTF-32", "UTF-32LE", "UTF-32BE" ]:
-    _CODECS_TO_NAME[codecs.lookup(i)] = i
+    CODECS_TO_NAME[codecs.lookup(i)] = i
 del i
 _MAXCHAR = { "US-ASCII": 127, "ISO-8859-1": 255 }
 
@@ -57,7 +57,7 @@
             # A codec to use is available to the caller as .codec
             self.codec = codecs.lookup(encoding)
             # Normalized encoding name is available to the caller as .encoding
-            self.encoding = _CODECS_TO_NAME[self.codec]
+            self.encoding = CODECS_TO_NAME[self.codec]
         except (KeyError, LookupError) as e:
             raise ValueError("invalid encoding {0!r}".format(encoding))
         self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)