curlyq: curlers.py comparison

comparison curlers.py @ 3:091c03f1b2e8

Getting it working...

author	David Barts <n5jrn@me.com>
date	Thu, 26 Dec 2019 19:54:45 -0800
parents
children	7a83e82e65a6

comparison

equal deleted inserted replaced

-:8884b0bf779d
+:091c03f1b2e8
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Classes for curling both HTML and plain text.
+# I m p o r t s
+import os, sys
+from workspace import Workspace
+# V a r i a b l e s
# Curly quotation marks, given as Unicode escapes.
LSQUO = "\u2018"   # left single quotation mark
RSQUO = "\u2019"   # right single quotation mark
APOS = RSQUO       # an apostrophe is written with the same character
LDQUO = "\u201C"   # left double quotation mark
RDQUO = "\u201D"   # right double quotation mark

# Words that start with an apostrophe. Cribbed from Wordpress.
# All entries are lower case; candidate text is lower-cased before it is
# compared against them.
_ASTART = [
    "'tain't",
    "'twere",
    "'twas",
    "'tis",
    "'twill",
    "'til",
    "'bout",
    "'nuff",
    "'round",
    "'cause",
    "'em",
]

# HTML tags that enclose raw data
_RAW = {"script", "style"}

# HTML block elements
_BLOCK = {
    "address",
    "blockquote",
    "div",
    "dl",
    "fieldset",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "noscript",
    "ol",
    "p",
    "pre",
    "table",
    "ul",
}
+# F u n c t i o n s
def _is_cockney(pos, ws):
    """
    Report whether the text starting at ws[pos] is one of the words in
    _ASTART, i.e. a word that is spelled with a leading apostrophe.

    pos -- index into ws of the candidate "'" character.
    ws  -- the text being examined. It must support len(), indexing,
           and slicing that yields something with a lower() method.

    Returns True when one of the words matches (ignoring case) and is not
    immediately followed by a letter; returns False otherwise.
    """
    # The arguments are used as passed. The previous version replaced them
    # with attributes of an undefined "self", which raised NameError.
    limit = len(ws)
    for word in _ASTART:
        end = pos + len(word)
        if ws[pos:end].lower() != word:
            continue
        # Require a whole-word match, so that "'em" does not match the
        # start of "'emu". Reaching the end of the text also counts as
        # the end of the word (and avoids indexing past the end).
        if end >= limit or not ws[end].isalpha():
            return True
    return False
+# C l a s s e s
class BaseCurler:
    """
    Common base of the curler classes. It only declares feed(); calling
    the version defined here raises NotImplementedError.
    """

    def feed(self):
        """Placeholder: always raises NotImplementedError."""
        raise NotImplementedError
class TextCurler(BaseCurler):
    """
    For processing plain text. Assumes the entire text is a block; it is
    the responsibility of the caller to break the input into paragraphs.

    The workspace is modified in place: straight quotes are overwritten,
    one character at a time, with their curly equivalents. The workspace
    must support len(), indexing, item assignment, and slicing that
    yields something with a lower() method.

    The curler is a small state machine. self._state holds the bound
    method that handles the character at self._pos:

    _norm    -- outside any quotation
    _seen_ld -- inside a double-quoted passage
    _seen_ls -- inside a single-quoted passage nested in a double-quoted one
    """

    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0

    def feed(self):
        """Run the state machine over every position of the workspace."""
        self._pos = 0
        self._state = self._norm
        # Replacements are one-for-one, so the length never changes and
        # the range computed up front stays valid.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _before(self):
        """
        Return the character preceding the current position, or "" at the
        start of the text. The empty string fails every str.is*() test,
        so callers need no special case (and index -1 never wraps round
        to the last character).
        """
        pos = self._pos
        return self.workspace[pos - 1] if pos > 0 else ""

    def _after(self):
        """
        Return the character following the current position, or "" at
        the end of the text (instead of indexing past the end).
        """
        ws = self.workspace
        nxt = self._pos + 1
        return ws[nxt] if nxt < len(ws) else ""

    def _is_cockney(self):
        """
        Return True if the text at the current position is one of the
        words in _ASTART (compared ignoring case) and is not immediately
        followed by a letter; otherwise return False.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # End of text counts as the end of the word.
            if end >= limit or not ws[end].isalpha():
                return True
        return False

    def _norm(self):
        """State: not inside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == '"':
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _seen_ld(self):
        """State: inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == '"':
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._before().isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._after().isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: inside a single-quoted passage within a double-quoted one."""
        pos = self._pos
        ws = self.workspace
        if ws[pos] != "'":
            return
        prev = self._before()
        if prev.isalpha() and self._after().isalpha():
            # obvious apostrophe
            ws[pos] = APOS
        elif self._after().isdecimal() or self._is_cockney():
            # also an apostrophe
            ws[pos] = APOS
        elif prev.isspace():
            # start of word apostrophe
            ws[pos] = APOS
        else:
            # closing single quote; we are back inside the double quotes
            ws[pos] = RSQUO
            self._state = self._seen_ld
class HtmlCurler(BaseCurler):
    """
    For processing HTML. Uses HTML block tags to delimit blocks.

    The workspace is modified in place: straight quotes in the text are
    overwritten, one character at a time, with their curly equivalents.
    Characters between "<" and ">" are left alone, as is the content of
    the elements named in _RAW. The workspace must support len(),
    indexing, item assignment, and slicing that yields a str.

    The curler is a state machine. self._state holds the bound method
    that handles the character at self._pos:

    _norm    -- outside any quotation
    _seen_ld -- inside a double-quoted passage
    _seen_ls -- inside a single-quoted passage nested in a double-quoted one
    _seen_lt -- inside a tag, waiting for its ">"
    _raw     -- inside a _RAW element, waiting for its end tag
    """

    def __init__(self, workspace):
        self.workspace = workspace
        self._state = self._norm
        self._pos = 0
        self._ltpos = 0        # index of the "<" that opened the current tag
        self._endtag = None    # "</name" text that ends the current raw element
        self._ltstate = None   # state to resume after a non-block tag

    def feed(self):
        """Run the state machine over every position of the workspace."""
        self._pos = 0
        self._state = self._norm
        # Replacements are one-for-one, so the length never changes and
        # the range computed up front stays valid.
        for self._pos in range(len(self.workspace)):
            self._state()

    def _before(self):
        """
        Return the character preceding the current position, or "" at the
        start of the text. The empty string fails every str.is*() test,
        so callers need no special case (and index -1 never wraps round
        to the last character).
        """
        pos = self._pos
        return self.workspace[pos - 1] if pos > 0 else ""

    def _after(self):
        """
        Return the character following the current position, or "" at
        the end of the text (instead of indexing past the end).
        """
        ws = self.workspace
        nxt = self._pos + 1
        return ws[nxt] if nxt < len(ws) else ""

    def _is_cockney(self):
        """
        Return True if the text at the current position is one of the
        words in _ASTART (compared ignoring case) and is not immediately
        followed by a letter; otherwise return False.
        """
        pos = self._pos
        ws = self.workspace
        limit = len(ws)
        for word in _ASTART:
            end = pos + len(word)
            if ws[pos:end].lower() != word:
                continue
            # End of text counts as the end of the word.
            if end >= limit or not ws[end].isalpha():
                return True
        return False

    def _goto_lt(self):
        """Enter the in-tag state, remembering where and in what state."""
        self._ltpos = self._pos
        self._ltstate = self._state
        self._state = self._seen_lt

    def _norm(self):
        """State: not inside any quotation."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == '"':
            # opening double quote
            ws[pos] = LDQUO
            self._state = self._seen_ld
        elif char == "'":
            # in this state, ' is always an apostrophe
            ws[pos] = APOS

    def _gettag(self, start):
        """
        Return, in lower case, the run of alphanumeric characters that
        begins at index start (the tag name when start follows "<" or
        "</").
        """
        ws = self.workspace
        limit = len(ws)
        end = start
        while end < limit and ws[end].isalnum():
            end += 1
        return ws[start:end].lower()

    def _seen_lt(self):
        """
        State: inside a tag. Nothing happens until the closing ">", at
        which point the tag name decides the next state.
        """
        pos = self._pos
        ws = self.workspace
        if ws[pos] != ">":
            return
        start = self._ltpos + 1
        if ws[start] == "/":
            # end tag: a block end starts afresh, anything else resumes
            if self._gettag(start + 1) in _BLOCK:
                self._state = self._norm
            else:
                self._state = self._ltstate
            return
        tag = self._gettag(start)
        if tag in _BLOCK:
            self._state = self._norm
        elif tag in _RAW:
            self._state = self._raw
            self._endtag = "</" + tag
        else:
            self._state = self._ltstate

    def _raw(self):
        """State: inside a raw element; its content is never altered."""
        pos = self._pos
        ws = self.workspace
        end = pos + len(self._endtag)
        # only a matching end tag gets us out of the raw state
        if ws[pos] != "<" or ws[pos:end].lower() != self._endtag:
            return
        # The name must end here: "</styles" is not the end of "style".
        # The bounds test avoids indexing past the end of the text.
        if end < len(ws) and ws[end].isalnum():
            return
        # _ltstate is deliberately left as it was when the start tag was
        # seen, so that state is resumed once the end tag is complete.
        self._ltpos = pos
        self._state = self._seen_lt

    def _seen_ld(self):
        """State: inside a double-quoted passage."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == '"':
            # closing double quote
            ws[pos] = RDQUO
            self._state = self._norm
        elif char == "'":
            if self._before().isalpha():
                # either an inter-word, or an end of word, apostrophe
                ws[pos] = APOS
            elif self._after().isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            else:
                # opening single quote
                ws[pos] = LSQUO
                self._state = self._seen_ls

    def _seen_ls(self):
        """State: inside a single-quoted passage within a double-quoted one."""
        pos = self._pos
        ws = self.workspace
        char = ws[pos]
        if char == "<":
            self._goto_lt()
        elif char == "'":
            prev = self._before()
            if prev.isalpha() and self._after().isalpha():
                # obvious apostrophe
                ws[pos] = APOS
            elif self._after().isdecimal() or self._is_cockney():
                # also an apostrophe
                ws[pos] = APOS
            elif prev.isspace():
                # start of word apostrophe
                ws[pos] = APOS
            else:
                # closing single quote; we are back inside the double quotes
                ws[pos] = RSQUO
                self._state = self._seen_ld

Mercurial > cgi-bin > hgweb.cgi > curlyq

comparison curlers.py @ 3:091c03f1b2e8