Mercurial > cgi-bin > hgweb.cgi > ClipMan
diff src/name/blackcap/clipman/Coerce.kt @ 31:0c6c18a733b7
Compiles, new menu still a mess.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 30 Jan 2020 16:01:51 -0800 |
parents | |
children |
line wrap: on
line diff
/*
 * Font coercion. Would be easier if we didn't have to rummage through
 * a doc tree and could use SAX-style callbacks instead. Would run faster
 * if we didn't have a doc tree. Would use less memory if we didn't have
 * a doc tree. But all HTML parsers these days force a doc tree on you.
 * Sigh.
 */
package name.blackcap.clipman

import org.jsoup.Jsoup
import org.jsoup.nodes.*
import java.util.Formatter
import java.util.Locale
import java.util.logging.Level
import java.util.logging.Logger

/**
 * Coerce fonts in HTML document.
 *
 * The document is parsed, scrubbed (see [scrub]), stripped of existing
 * style sheets, styling tags and styling attributes, and given a single
 * style sheet that sets the proportional and monospaced fonts.
 *
 * @param uncoerced String HTML document in
 * @param pFamily String proportionally-spaced font family
 * @param pSize Float proportionally-spaced font size
 * @param mFamily String monospaced font family
 * @param mSize Float monospaced font size
 * @return HTML document with coerced fonts
 */
fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
    val doc = Jsoup.parse(uncoerced)

    /* apply standard scrubbing */
    val head = _scrub(doc)

    /* remove all stylesheets (and their content) and references to same */
    doc.select("link[rel=stylesheet],style").remove()

    /*
     * add a style sheet of our own. Locale.ROOT keeps the decimal
     * separator a '.', whatever the user's default locale is; a ','
     * would not be valid in a CSS number.
     * NOTE(review): the font sizes are emitted without a unit, exactly
     * as before -- confirm the consumer of this HTML accepts that.
     */
    head?.appendElement("style")?.appendChild(DataNode(Formatter(Locale.ROOT).run {
        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n",
            pFamily, pSize)
        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n",
            mFamily, mSize)
        toString()
    }))

    /* remove all styling tags, but keep their content */
    doc.select("basefont,big,div,font,small,span").forEach { discardTag ->
        /* iterate over a snapshot; moving nodes mutates the child list */
        discardTag.getChildrenAsArray().forEach {
            discardTag.before(it)
        }
        discardTag.remove()
    }

    /* remove all styling attributes */
    val hitList = arrayOf("bgcolor", "class", "color", "style")
    /* builds the selector "[bgcolor],[class],[color],[style]" */
    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
    doc.select(selector).forEach {
        it.removeAll(*hitList)
    }

    /* remove body styling attributes */
    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")

    /* clean up horizontal rules */
    doc.select("hr").forEach {
        it.removeAll("shade", "noshade")
    }

    /* that's all! */
    return _output(doc)
}

/**
 * "Scrub" an HTML document (make it Java and clipboard-friendly)
 * @param unscrubbed String HTML document in
 * @return scrubbed HTML document
 */
fun scrub(unscrubbed: String): String {
    val doc = Jsoup.parse(unscrubbed)
    _scrub(doc)
    return _output(doc)
}

/**
 * Scrub a parsed document in place: drop doctype and XML declarations,
 * strip every attribute other than "lang" and "dir" from the root
 * element, remove existing charset declarations, and prepend a
 * Content-Type meta element naming CHARSET_NAME to the head.
 * @param doc Document to scrub (modified in place)
 * @return the document's head element, or null (after logging) if the
 * document has no head
 */
private fun _scrub(doc: Document): Element? {
    /* remove any doctype or XML declarations */
    doc.getChildrenAsArray().forEach {
        if (it is DocumentType || it is XmlDeclaration) {
            it.remove()
        }
    }

    /*
     * remove all non-HTML-4.01 attributes from <html>. This has to work
     * on the root element; the Document node itself is only the
     * container of <html> and does not carry its attributes.
     */
    val allowed = setOf("lang", "dir")
    doc.selectFirst(":root")?.let { html ->
        /* collect the keys first so we do not remove while iterating */
        html.attributes().map { it.key }.forEach { key ->
            if (key !in allowed) {
                html.removeAttr(key)
            }
        }
    }

    /* remove any conflicting charset declarations */
    doc.select("meta[http-equiv=content-type],meta[charset]").remove()

    /* add a charset declaration of our own */
    val head = doc.selectFirst(":root>head")
    if (head == null) {
        LOGGER.log(Level.SEVERE, "no head found!")
        return null
    }
    head.prependElement("meta").run {
        attr("http-equiv", "Content-Type")
        attr("content", "text/html; charset=" + CHARSET_NAME)
    }
    return head
}

/**
 * Serialize a document, using CHARSET_NAME as the output charset and
 * XML output syntax.
 * @param doc Document to serialize (its output settings are modified)
 * @return the document's outer HTML
 */
private fun _output(doc: Document): String {
    doc.outputSettings()
        .charset(CHARSET_NAME)
        .syntax(Document.OutputSettings.Syntax.xml)
    return doc.outerHtml()
}

/**
 * Turns node list to array. Needed to avoid ConcurrentModificationException.
 * Ah, the joys of the doc tree.
 * @return Array<Node>
 */
fun Node.getChildrenAsArray(): Array<Node> = childNodes().run {
    Array<Node>(size) { get(it) }
}

/**
 * Remove all specified attributes from the element.
 * @param hitList names of the attributes to remove; names the element
 * does not have are skipped
 */
fun Element.removeAll(vararg hitList: String) {
    hitList.forEach {
        if (hasAttr(it)) {
            removeAttr(it)
        }
    }
}