# HG changeset patch # User David Barts # Date 1577474760 28800 # Node ID 397c178c5b98ecf3e8a68ef32bcd84ad1088fd9a # Parent 84adbbb69a9d76348708ef6ba86b138a0b1c6596 Make it array-based. diff -r 84adbbb69a9d -r 397c178c5b98 curlers.py --- a/curlers.py Fri Dec 27 09:51:26 2019 -0800 +++ b/curlers.py Fri Dec 27 11:26:00 2019 -0800 @@ -6,15 +6,15 @@ # I m p o r t s import os, sys -from workspace import Workspace +from runes import Workspace # V a r i a b l e s -# Quote types -LSQUO = "\u2018" -APOS = RSQUO = "\u2019" -LDQUO = "\u201C" -RDQUO = "\u201D" +# Quote types, as rune values +LSQUO = 0x2018 +APOS = RSQUO = 0x2019 +LDQUO = 0x201c +RDQUO = 0x201d # Words that start with an apostrophe. Cribbed from Wordpress. _ASTART = [ "'tain't", "'twere", "'twas", "'tis", "'twill", "'til", @@ -44,8 +44,6 @@ ws[i] = "'" def _is_cockney(pos, ws): - pos = self._pos - ws = self.workspace for i in _ASTART: li = len(i) if ws[pos:pos+li].lower() == i and not ws[pos+li].isalpha(): diff -r 84adbbb69a9d -r 397c178c5b98 curlyq --- a/curlyq Fri Dec 27 09:51:26 2019 -0800 +++ b/curlyq Fri Dec 27 11:26:00 2019 -0800 @@ -6,7 +6,7 @@ import codecs from curlers import TextCurler, HtmlCurler, uncurl -from workspace import Workspace +from runes import Workspace from writer import CODECS_TO_NAME # V a r i a b l e s @@ -22,46 +22,46 @@ def normal(): global input_fp, output_fp, args - with Workspace() as ws: - curler = TextCurler(ws) - while True: - line = input_fp.readline() - ws.write(line) - if line == "" or line == "\n": - if args.force: uncurl(ws) - curler.feed() - output_fp.write(ws.getvalue()) - ws.clear() - if line == "": - break + ws = Workspace() + curler = TextCurler(ws) + while True: + line = input_fp.readline() + ws.append(line) + if line == "" or line == "\n": + if args.force: uncurl(ws) + curler.feed() + output_fp.write(str(ws)) + ws.clear() + if line == "": + break def flowed(): global input_fp, output_fp, args - with Workspace() as ws: - curler = TextCurler(ws) - while True: - line = input_fp.readline() - if line == "": - break - ws.write(line) - if args.force: uncurl(ws) - curler.feed() - output_fp.write(ws.getvalue()) - ws.clear() + ws = Workspace() + curler = TextCurler(ws) + while True: + line = input_fp.readline() + if line == "": + break + ws.append(line) + if args.force: uncurl(ws) + curler.feed() + output_fp.write(str(ws)) + ws.clear() def html(): global input_fp, output_fp - with Workspace(input_fp.read()) as ws: - curler = HtmlCurler(ws) - if args.force: uncurl(ws) - curler.feed() - output_fp.write(ws.getvalue()) + ws = Workspace(input_fp.read()) + curler = HtmlCurler(ws) + if args.force: uncurl(ws) + curler.feed() + output_fp.write(str(ws)) def do_uncurl(): global input_fp, output_fp - with Workspace(input_fp.read()) as ws: - uncurl(ws) - output_fp.write(ws.getvalue()) + ws = Workspace(input_fp.read()) + uncurl(ws) + output_fp.write(str(ws)) # M a i n P r o g r a m diff -r 84adbbb69a9d -r 397c178c5b98 runes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runes.py Fri Dec 27 11:26:00 2019 -0800 @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# I m p o r t s + +import array +import codecs +import collections +import struct +import sys + +# C l a s s e s + +class Runes(object): + """ + A mutable, fixed-length sequence of UTF-16 runes. The attributes + encoding and codec contain the name of the encoding and the codec + used to generate the UTF-16. The attribute buffer contains the + buffer (an array of 16-bit unsigned integers) used to back this + object; modifications to that array will be reflected in this + object. + """ + # The most efficient 16-bit one on this platform + encoding = "UTF-16" + sys.byteorder[0].upper() + "E" + codec = codecs.lookup(encoding) + + def __init__(self, based_on=None): + if isinstance(based_on, array.array): + if based_on.typecode == 'H': + self.buffer = based_on + else: + self.buffer = array.array('H', based_on) + elif isinstance(based_on, str): + # A string should always be able to encode to runes. + self.buffer = array.array('H', self.codec.encode(based_on, 'strict')[0]) + elif based_on is None: + self.buffer = array.array('H', bytes()) + else: + self.buffer = array.array('H', based_on) + + def __str__(self): + """ + Convert this object to a string. We deliberately do not have a + __repr__ method, to underscore that runes are not strings. + """ + # Runes might not always be able to decode to a string. + return self.codec.decode(self.buffer, 'replace')[0] + + def __bytes__(self): + return bytes(self.buffer) + + def __len__(self): + return len(self.buffer) + + def __lt__(self, other): + return self.buffer < other.buffer + + def __le__(self, other): + return self.buffer <= other.buffer + + def __gt__(self, other): + return self.buffer > other.buffer + + def __ge__(self, other): + return self.buffer >= other.buffer + + def __eq__(self, other): + return self.buffer == other.buffer + + def __ne__(self, other): + return self.buffer != other.buffer + + def __hash__(self): + return hash(self.buffer) + + def __bool__(self): + return bool(self.buffer) + + def __getitem__(self, key): + ret = self.buffer[key] + if isinstance(ret, array.array): + return Runes(ret) + else: + return ret + + def __setitem__(self, key, value): + if isinstance(key, int): + if isinstance(value, int): + self.buffer[key] = value + else: + raise TypeError("integer required") + elif isinstance(value, Runes): + self.buffer[key] = value.buffer + else: + raise TypeError("runes required") + + def __delitem__(self, key): + del self.buffer[key] + + def clear(self): + del self[:] + + def __iter__(self): + return iter(self.buffer) + + def __reversed__(self): + return reversed(self.buffer) + + def append(self, value): + if isinstance(value, int): + self.buffer.append(value) + elif isinstance(value, Runes): + self.buffer.extend(value.buffer) + else: + raise TypeError("integer or runes required") + + def __contains__(self, value): + return value in self.buffer + + def index(self, value): + return self.buffer.index(value) + + def find(self, value): + try: + return self.index(value) + except ValueError: + return -1 + +class Workspace(Runes): + """ + A Runes object that acts a bit more string-like, in that __setitem__ + also accepts a string as an argument and __getitem__ always returns + a string. We also return empty strings instead of throwing IndexError + when attempting to read out-of-range values, because that makes life + easier for us when curling quotes. + """ + def __setitem__(self, key, value): + if isinstance(value, str): + if isinstance(key, int): + Runes.__setitem__(self, key, self._ord(value)) + else: + Runes.__setitem__(self, key, Runes(value)) + else: + Runes.__setitem__(self, key, value) + + def __getitem__(self, key): + try: + ret = Runes.__getitem__(self, key) + if isinstance (ret, int): + return chr(ret) + elif isinstance(ret, Runes): + return str(ret) + else: + raise AssertionError("this shouldn't happen") + except IndexError: + return "" + + def append(self, value): + if isinstance(value, str): + Runes.append(self, Runes(value)) + else: + Runes.append(self, value) + + def index(self, value): + if isinstance(value, str): + return Runes.index(self, self._ord(value)) + else: + return Runes.index(self, value) + + def find(self, value): + try: + return self.index(value) + except ValueError: + return -1 + + def _ord(self, string): + length = len(string) + if length != 1: + raise ValueError("expected a character, but string of length {0} found".format(length)) + raw = Runes(string) + if len(raw) != 1: + raise ValueError("character not in BMP") + return raw[0]