changeset 10:397c178c5b98

Make it array-based.
author David Barts <n5jrn@me.com>
date Fri, 27 Dec 2019 11:26:00 -0800
parents 84adbbb69a9d
children 1f5e471101b0
files curlers.py curlyq runes.py
diffstat 3 files changed, 221 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/curlers.py	Fri Dec 27 09:51:26 2019 -0800
+++ b/curlers.py	Fri Dec 27 11:26:00 2019 -0800
@@ -6,15 +6,15 @@
 # I m p o r t s
 
 import os, sys
-from workspace import Workspace
+from runes import Workspace
 
 # V a r i a b l e s
 
-# Quote types
-LSQUO = "\u2018"
-APOS = RSQUO = "\u2019"
-LDQUO = "\u201C"
-RDQUO = "\u201D"
+# Quote types, as rune values
+LSQUO = 0x2018
+APOS = RSQUO = 0x2019
+LDQUO = 0x201c
+RDQUO = 0x201d
 
 # Words that start with an apostrophe. Cribbed from Wordpress.
 _ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til",
@@ -44,8 +44,6 @@
             ws[i] = "'"
 
 def _is_cockney(pos, ws):
-    pos = self._pos
-    ws = self.workspace
     for i in _ASTART:
         li = len(i)
         if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha():
--- a/curlyq	Fri Dec 27 09:51:26 2019 -0800
+++ b/curlyq	Fri Dec 27 11:26:00 2019 -0800
@@ -6,7 +6,7 @@
 import codecs
 
 from curlers import TextCurler, HtmlCurler, uncurl
-from workspace import Workspace
+from runes import Workspace
 from writer import CODECS_TO_NAME
 
 # V a r i a b l e s
@@ -22,46 +22,46 @@
 
 def normal():
     global input_fp, output_fp, args
-    with Workspace() as ws:
-        curler = TextCurler(ws)
-        while True:
-            line = input_fp.readline()
-            ws.write(line)
-            if line == "" or line == "\n":
-                if args.force: uncurl(ws)
-                curler.feed()
-                output_fp.write(ws.getvalue())
-                ws.clear()
-            if line == "":
-                break
+    ws = Workspace()
+    curler = TextCurler(ws)
+    while True:
+        line = input_fp.readline()
+        ws.append(line)
+        if line == "" or line == "\n":
+            if args.force: uncurl(ws)
+            curler.feed()
+            output_fp.write(str(ws))
+            ws.clear()
+        if line == "":
+            break
 
 def flowed():
     global input_fp, output_fp, args
-    with Workspace() as ws:
-        curler = TextCurler(ws)
-        while True:
-            line = input_fp.readline()
-            if line == "":
-                break
-            ws.write(line)
-            if args.force: uncurl(ws)
-            curler.feed()
-            output_fp.write(ws.getvalue())
-            ws.clear()
+    ws = Workspace()
+    curler = TextCurler(ws)
+    while True:
+        line = input_fp.readline()
+        if line == "":
+            break
+        ws.append(line)
+        if args.force: uncurl(ws)
+        curler.feed()
+        output_fp.write(str(ws))
+        ws.clear()
 
 def html():
     global input_fp, output_fp
-    with Workspace(input_fp.read()) as ws:
-        curler = HtmlCurler(ws)
-        if args.force: uncurl(ws)
-        curler.feed()
-        output_fp.write(ws.getvalue())
+    ws = Workspace(input_fp.read())
+    curler = HtmlCurler(ws)
+    if args.force: uncurl(ws)
+    curler.feed()
+    output_fp.write(str(ws))
 
 def do_uncurl():
     global input_fp, output_fp
-    with Workspace(input_fp.read()) as ws:
-        uncurl(ws)
-        output_fp.write(ws.getvalue())
+    ws = Workspace(input_fp.read())
+    uncurl(ws)
+    output_fp.write(str(ws))
 
 # M a i n   P r o g r a m
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/runes.py	Fri Dec 27 11:26:00 2019 -0800
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# I m p o r t s
+
+import array
+import codecs
+import collections
+import struct
+import sys
+
+# C l a s s e s
+
+class Runes(object):
+    """
+    A mutable sequence of UTF-16 runes (16-bit code units). The attributes
+    encoding and codec contain the name of the encoding and the codec
+    used to generate the UTF-16. The attribute buffer contains the
+    buffer (an array of 16-bit unsigned integers) used to back this
+    object; modifications to that array will be reflected in this
+    object.
+    """
+    # The most efficient 16-bit one on this platform
+    encoding = "UTF-16" + sys.byteorder[0].upper() + "E"
+    codec = codecs.lookup(encoding)
+
+    def __init__(self, based_on=None):
+        if isinstance(based_on, array.array):
+            if based_on.typecode == 'H':
+                self.buffer = based_on
+            else:
+                self.buffer = array.array('H', based_on)
+        elif isinstance(based_on, str):
+            # A string should always be able to encode to runes.
+            self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0])
+        elif based_on is None:
+            self.buffer = array.array('H', bytes())
+        else:
+            self.buffer = array.array('H', based_on)
+
+    def __str__(self):
+        """
+        Convert this object to a string. We deliberately do not have a
+        __repr__ method, to underscore that runes are not strings.
+        """
+        # Runes might not always be able to decode to a string.
+        return self.codec.decode(self.buffer, 'replace')[0]
+
+    def __bytes__(self):
+        return bytes(self.buffer)
+
+    def __len__(self):
+        return len(self.buffer)
+
+    def __lt__(self, other):  # comparisons: non-Runes operands yield NotImplemented
+        return self.buffer < other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __le__(self, other):
+        return self.buffer <= other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __gt__(self, other):
+        return self.buffer > other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __ge__(self, other):
+        return self.buffer >= other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __eq__(self, other):  # so Runes == None/str is False, not AttributeError
+        return self.buffer == other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __ne__(self, other):
+        return self.buffer != other.buffer if isinstance(other, Runes) else NotImplemented
+
+    def __hash__(self):  # NOTE: only stable while the buffer is not mutated
+        return hash(bytes(self.buffer))  # array.array itself is unhashable
+
+    def __bool__(self):
+        return bool(self.buffer)
+
+    def __getitem__(self, key):
+        ret = self.buffer[key]
+        if isinstance(ret, array.array):
+            return Runes(ret)
+        else:
+            return ret
+
+    def __setitem__(self, key, value):
+        if isinstance(key, int):
+            if isinstance(value, int):
+                self.buffer[key] = value
+            else:
+                raise TypeError("integer required")
+        elif isinstance(value, Runes):
+            self.buffer[key] = value.buffer
+        else:
+            raise TypeError("runes required")
+
+    def __delitem__(self, key):
+        del self.buffer[key]
+
+    def clear(self):
+        del self[:]
+
+    def __iter__(self):
+        return iter(self.buffer)
+
+    def __reversed__(self):
+        return reversed(self.buffer)
+
+    def append(self, value):
+        if isinstance(value, int):
+            self.buffer.append(value)
+        elif isinstance(value, Runes):
+            self.buffer.extend(value.buffer)
+        else:
+            raise TypeError("integer or runes required")
+
+    def __contains__(self, value):
+        return value in self.buffer
+
+    def index(self, value):
+        return self.buffer.index(value)
+
+    def find(self, value):
+        try:
+            return self.index(value)
+        except ValueError:
+            return -1
+
+class Workspace(Runes):
+    """
+    A Runes object that acts a bit more string-like, in that __setitem__
+    also accepts a string as an argument and __getitem__ always returns
+    a string. We also return empty strings instead of throwing IndexError
+    when attempting to read out-of-range values, because that makes life
+    easier for us when curling quotes.
+    """
+    def __setitem__(self, key, value):
+        if isinstance(value, str):
+            if isinstance(key, int):
+                Runes.__setitem__(self, key, self._ord(value))
+            else:
+                Runes.__setitem__(self, key, Runes(value))
+        else:
+            Runes.__setitem__(self, key, value)
+
+    def __getitem__(self, key):
+        try:
+            ret = Runes.__getitem__(self, key)
+            if isinstance (ret, int):
+                return chr(ret)
+            elif isinstance(ret, Runes):
+                return str(ret)
+            else:
+                raise AssertionError("this shouldn't happen")
+        except IndexError:
+            return ""
+
+    def append(self, value):
+        if isinstance(value, str):
+            Runes.append(self, Runes(value))
+        else:
+            Runes.append(self, value)
+
+    def index(self, value):
+        if isinstance(value, str):
+            return Runes.index(self, self._ord(value))
+        else:
+            return Runes.index(self, value)
+
+    def find(self, value):
+        try:
+            return self.index(value)
+        except ValueError:
+            return -1
+
+    def _ord(self, string):
+        length = len(string)
+        if length != 1:
+            raise ValueError("expected a character, but string of length {0} found".format(length))
+        raw = Runes(string)
+        if len(raw) != 1:
+            raise ValueError("character not in BMP")
+        return raw[0]