diff src/name/blackcap/clipman/Coerce.kt @ 31:0c6c18a733b7

Compiles, new menu still a mess.
author David Barts <n5jrn@me.com>
date Thu, 30 Jan 2020 16:01:51 -0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/name/blackcap/clipman/Coerce.kt	Thu Jan 30 16:01:51 2020 -0800
@@ -0,0 +1,137 @@
+/*
+ * Font coercion. Would be easier if we didn't have to rummage through
+ * a doc tree and could use SAX-style callbacks instead. Would run faster
+ * if we didn't have a doc tree. Would use less memory if we didn't have
+ * a doc tree. But all HTML parsers these days force a doc tree on you.
+ * Sigh.
+ */
+package name.blackcap.clipman
+
+import org.jsoup.Jsoup
+import org.jsoup.nodes.*
+import java.util.Formatter
+import java.util.logging.Level
+import java.util.logging.Logger
+
+/**
+ * Coerce fonts in an HTML document.
+ *
+ * Parses the document, applies the standard scrubbing, drops every
+ * stylesheet and stylesheet link, installs a small stylesheet of our own
+ * naming the requested fonts, and then strips the tags and attributes that
+ * carry presentation so they cannot compete with that stylesheet.
+ *
+ * @param uncoerced String HTML document in
+ * @param pFamily String proportionally-spaced font family
+ * @param pSize Float proportionally-spaced font size
+ * @param mFamily String monospaced font family
+ * @param mSize Float monospaced font size
+ * @return HTML document with coerced fonts
+ */
+fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
+    val doc = Jsoup.parse(uncoerced)
+
+    /* apply standard scrubbing */
+    val head = _scrub(doc)
+
+    /* remove all stylesheets (and their content) and references to same */
+    doc.select("link[rel=stylesheet],style").remove()
+
+    /*
+     * Build a style sheet of our own. The formatter is pinned to
+     * Locale.ROOT: a bare Formatter() uses the default locale, and where
+     * that locale's decimal separator is a comma, %.2f would produce
+     * something like "12,00", which is not a valid CSS number.
+     */
+    val styleSheet = Formatter(java.util.Locale.ROOT).run {
+        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n",
+            pFamily, pSize)
+        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n",
+            mFamily, mSize)
+        toString()
+    }
+
+    /* add it to the head (skipped if scrubbing found no head) */
+    head?.appendElement("style")?.appendChild(DataNode(styleSheet))
+
+    /* remove all styling tags, but keep their content */
+    doc.select("basefont,big,div,font,small,span").forEach { discardTag ->
+        /* iterate over a copy: before() re-parents each child as we go */
+        discardTag.getChildrenAsArray().forEach { child ->
+            discardTag.before(child)
+        }
+        discardTag.remove()
+    }
+
+    /* remove all styling attributes */
+    val hitList = arrayOf("bgcolor", "class", "color", "style")
+    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
+    doc.select(selector).forEach { styled ->
+        styled.removeAll(*hitList)
+    }
+
+    /* remove body styling attributes */
+    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")
+
+    /* clean up horizontal rules */
+    doc.select("hr").forEach { rule ->
+        rule.removeAll("shade", "noshade")
+    }
+
+    /* that's all! */
+    return _output(doc)
+}
+
+/**
+ * "Scrub" an HTML document (make it Java and clipboard-friendly):
+ * parse it, run the shared scrubbing pass over the tree, and serialize
+ * the result.
+ * @param unscrubbed String HTML document in
+ * @return scrubbed HTML document
+ */
+fun scrub(unscrubbed: String): String =
+    Jsoup.parse(unscrubbed).let { parsed ->
+        /* the head element returned by the scrubbing pass is not needed here */
+        _scrub(parsed)
+        _output(parsed)
+    }
+
+/**
+ * Scrubbing pass shared by the public entry points. Works on the document
+ * in place: drops doctype/XML declarations, trims the attributes of
+ * <html>, and replaces any charset declaration with one of our own.
+ * @param doc Document parsed document to scrub (modified in place)
+ * @return the document's head element, or null (after logging) if none
+ */
+private fun _scrub(doc: Document): Element? {
+    /* remove any doctype or XML declarations */
+    doc.getChildrenAsArray()
+        .filter { it is DocumentType || it is XmlDeclaration }
+        .forEach { it.remove() }
+
+    /*
+     * Remove all non-HTML-4.01 attributes from <html>. Those attributes
+     * belong to the root element, not to the Document node that contains
+     * it, so the element is looked up rather than using doc.attributes().
+     * The attribute names are copied out first so that nothing is removed
+     * from the attribute set while it is being iterated.
+     */
+    val allowedOnHtml = setOf("lang", "dir")
+    doc.selectFirst(":root")?.let { html ->
+        html.attributes()
+            .map { it.key }
+            .filter { it !in allowedOnHtml }
+            .forEach { html.removeAttr(it) }
+    }
+
+    /* remove any conflicting charset declarations */
+    doc.select("meta[http-equiv=content-type],meta[charset]").remove()
+
+    /* add a charset declaration of our own */
+    val head = doc.selectFirst(":root>head")
+    if (head == null) {
+        LOGGER.log(Level.SEVERE, "no head found!")
+        return null
+    }
+    /* prepended so the declaration is the first thing in the head */
+    head.prependElement("meta").run {
+        attr("http-equiv", "Content-Type")
+        attr("content", "text/html; charset=${CHARSET_NAME}")
+    }
+    return head
+}
+
+/**
+ * Serialize the document using our charset and XML output syntax.
+ * The configured settings object is not passed anywhere afterwards, so
+ * this relies on the setters changing the document's settings in place.
+ * @param doc Document document to serialize
+ * @return the document's outer HTML
+ */
+private fun _output(doc: Document): String {
+    val settings = doc.outputSettings()
+    settings.charset(CHARSET_NAME).syntax(Document.OutputSettings.Syntax.xml)
+    return doc.outerHtml()
+}
+
+/**
+ * Copies this node's child list into a fresh array, so callers can add,
+ * move or remove children while looping without tripping a
+ * ConcurrentModificationException. Ah, the joys of the doc tree.
+ * @return Array<Node>
+ */
+fun Node.getChildrenAsArray(): Array<Node> = childNodes().toTypedArray()
+
+/**
+ * Strip each of the named attributes from this element; names the
+ * element does not carry are skipped.
+ * @param hitList names of the attributes to remove
+ */
+fun Element.removeAll(vararg hitList: String) {
+    for (attrName in hitList) {
+        if (!hasAttr(attrName)) {
+            continue
+        }
+        removeAttr(attrName)
+    }
+}