Mercurial > cgi-bin > hgweb.cgi > ClipMan
view src/name/blackcap/clipman/Coerce.kt @ 54:a9d5c94a177c
Add README file.
author | David Barts <n5jrn@me.com> |
---|---|
date | Tue, 13 Apr 2021 10:33:33 -0700 |
parents | 0c6c18a733b7 |
children |
line wrap: on
line source
/*
 * Font coercion. Would be easier if we didn't have to rummage through
 * a doc tree and could use SAX-style callbacks instead. Would run faster
 * if we didn't have a doc tree. Would use less memory if we didn't have
 * a doc tree. But all HTML parsers these days force a doc tree on you.
 * Sigh.
 */
package name.blackcap.clipman

import org.jsoup.Jsoup
import org.jsoup.nodes.*
import java.util.Formatter
import java.util.Locale
import java.util.logging.Level
import java.util.logging.Logger

/**
 * Coerce fonts in HTML document.
 *
 * The document is scrubbed, every existing stylesheet (and reference to
 * one) is dropped, a two-rule stylesheet of our own is appended to the
 * head, and styling tags/attributes are stripped from the body.
 *
 * @param uncoerced String HTML document in
 * @param pFamily String proportionally-spaced font family
 * @param pSize Float proportionally-spaced font size
 * @param mFamily String monospaced font family
 * @param mSize Float monospaced font size
 * @return HTML document with coerced fonts
 */
fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
    val doc = Jsoup.parse(uncoerced)

    /* apply standard scrubbing */
    val head = _scrub(doc)

    /* remove all stylesheets (and their content) and references to same */
    doc.select("link[rel=stylesheet],style").remove()

    /*
     * add a style sheet of our own. Locale.ROOT is deliberate: with the
     * default locale, %.2f may emit a decimal comma ("12,00"), which is
     * not a valid CSS number.
     */
    head?.appendElement("style")?.appendChild(DataNode(Formatter(Locale.ROOT).run {
        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n", pFamily, pSize)
        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n", mFamily, mSize)
        toString()
    }))

    /* remove all styling tags, but keep their content */
    doc.select("basefont,big,div,font,small,span").forEach { discardTag ->
        /* hoist each child to just before the tag, then drop the empty tag */
        discardTag.getChildrenAsArray().forEach { discardTag.before(it) }
        discardTag.remove()
    }

    /* remove all styling attributes */
    val hitList = arrayOf("bgcolor", "class", "color", "style")
    /* builds "[bgcolor],[class],[color],[style]" */
    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
    doc.select(selector).forEach { it.removeAll(*hitList) }

    /* remove body styling attributes */
    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")

    /* clean up horizontal rules */
    doc.select("hr").forEach { it.removeAll("shade", "noshade") }

    /* that's all! */
    return _output(doc)
}

/**
 * "Scrub" an HTML document (make it Java and clipboard-friendly)
 * @param unscrubbed String HTML document in
 * @return scrubbed HTML document
 */
fun scrub(unscrubbed: String): String {
    val doc = Jsoup.parse(unscrubbed)
    _scrub(doc)
    return _output(doc)
}

/**
 * Scrubbing steps shared by [scrub] and [coerceHTML]. Mutates the
 * document in place.
 * @param doc Document to scrub
 * @return the document's head element, or null (after logging) if the
 * document has no head
 */
private fun _scrub(doc: Document): Element? {
    /* remove any doctype or XML declarations */
    doc.getChildrenAsArray()
        .filter { it is DocumentType || it is XmlDeclaration }
        .forEach { it.remove() }

    /*
     * remove all non-HTML-4.01 attributes from <html>. This must look at
     * the <html> element itself; the attributes of the Document node are
     * not the ones written on the <html> tag. Keys are copied to a list
     * first so we do not remove while iterating.
     */
    val allowedOnHtml = setOf("lang", "dir")
    doc.selectFirst("html")?.let { html ->
        html.attributes().map { it.key }
            .filter { it !in allowedOnHtml }
            .forEach { html.removeAttr(it) }
    }

    /* remove any conflicting charset declarations */
    doc.select("meta[http-equiv=content-type],meta[charset]").remove()

    /* add a charset declaration of our own */
    val head = doc.selectFirst(":root>head")
    if (head == null) {
        LOGGER.log(Level.SEVERE, "no head found!")
        return null
    }
    head.prependElement("meta").run {
        attr("http-equiv", "Content-Type")
        attr("content", "text/html; charset=" + CHARSET_NAME)
    }
    return head
}

/**
 * Serialize the document using our charset and XML output syntax.
 * @param doc Document to serialize
 * @return the document's outer HTML
 */
private fun _output(doc: Document): String {
    doc.outputSettings()
        .charset(CHARSET_NAME)
        .syntax(Document.OutputSettings.Syntax.xml)
    return doc.outerHtml()
}

/**
 * Turns node list to array. Needed to avoid ConcurrentModificationException.
 * Ah, the joys of the doc tree.
 * @return Array<Node> snapshot of this node's children
 */
fun Node.getChildrenAsArray(): Array<Node> = childNodes().toTypedArray()

/**
 * Remove all specified attributes from the element.
 * @param hitList names of the attributes to remove; ones the element
 * does not have are skipped
 */
fun Element.removeAll(vararg hitList: String) {
    hitList.forEach {
        if (hasAttr(it)) {
            removeAttr(it)
        }
    }
}