Mercurial > cgi-bin > hgweb.cgi > ClipMan

/*
 * Font coercion. Would be easier if we didn't have to rummage through
 * a doc tree and could use SAX-style callbacks instead. Would run faster
 * if we didn't have a doc tree. Would use less memory if we didn't have
 * a doc tree. But all HTML parsers these days force a doc tree on you.
 * Sigh.
 */
package name.blackcap.clipman

import org.jsoup.Jsoup
import org.jsoup.nodes.*
import java.util.Formatter
import java.util.logging.Level
import java.util.logging.Logger

/**
 * Coerce fonts in HTML document.
 *
 * Parses the document, applies the standard scrubbing, discards existing
 * style sheets, styling tags and styling attributes, then installs a small
 * style sheet of our own that sets the proportional and monospaced fonts.
 *
 * @param uncoerced String HTML document in
 * @param pFamily String proportionally-spaced font family
 * @param pSize Float proportionally-spaced font size
 * @param mFamily String monospaced font family
 * @param mSize Float monospaced font size
 * @return HTML document with coerced fonts
 */
fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
    val doc = Jsoup.parse(uncoerced)

    /* apply standard scrubbing */
    val head = _scrub(doc)

    /* remove all stylesheets (and their content) and references to same */
    doc.select("link[rel=stylesheet],style").remove()

    /*
     * Build a style sheet of our own. The formatter is pinned to
     * Locale.ROOT so that %.2f always produces a '.' decimal separator;
     * with the default locale it may produce e.g. "12,00", which is not
     * a valid CSS number.
     */
    val css = Formatter(java.util.Locale.ROOT).run {
        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n",
            pFamily, pSize)
        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n",
            mFamily, mSize)
        toString()
    }

    /* add it (silently skipped if scrubbing found no head) */
    head?.appendElement("style")?.appendChild(DataNode(css))

    /* remove all styling tags, but keep their content */
    doc.select("basefont,big,div,font,small,span").forEach { it.unwrap() }

    /* remove all styling attributes */
    val hitList = arrayOf("bgcolor", "class", "color", "style")
    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
    doc.select(selector).forEach {
        it.removeAll(*hitList)
    }

    /* remove body styling attributes */
    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")

    /* clean up horizontal rules */
    doc.select("hr").forEach {
        it.removeAll("shade", "noshade")
    }

    /* that's all! */
    return _output(doc)
}

/**
 * "Scrub" an HTML document: parse it, apply the standard scrubbing, and
 * serialize the result.
 * @param unscrubbed String HTML document in
 * @return scrubbed HTML document
 */
fun scrub(unscrubbed: String): String =
    Jsoup.parse(unscrubbed).let { parsed ->
        /* the returned head element is not needed here */
        _scrub(parsed)
        _output(parsed)
    }

/**
 * Apply the standard scrubbing to a parsed document, in place: drop
 * doctype/XML declarations, strip unwanted attributes from the root
 * element, and replace any charset declarations with one of our own.
 * @param doc Document to modify
 * @return the document's head element, or null if it has none (in which
 *         case the problem is logged and no charset declaration is added)
 */
private fun _scrub(doc: Document): Element? {
    /*
     * Remove any doctype or XML declarations.
     * NOTE(review): jsoup's HTML parser may surface "<?xml ...?>" as a
     * Comment node rather than an XmlDeclaration -- confirm this check
     * actually catches it.
     */
    doc.getChildrenAsArray()
        .filter { it is DocumentType || it is XmlDeclaration }
        .forEach { it.remove() }

    /*
     * Remove all non-HTML-4.01 attributes from <html>. These live on the
     * root element, not on the Document node itself, so look the element
     * up first. Keys are collected into a list before any removal so the
     * attribute set is not modified while being iterated.
     */
    val keep = setOf("lang", "dir")
    doc.selectFirst(":root")?.let { html ->
        html.attributes()
            .map { it.key }
            .filter { it !in keep }
            .forEach { html.removeAttr(it) }
    }

    /* remove any conflicting charset declarations */
    doc.select("meta[http-equiv=content-type],meta[charset]").remove()

    /* add a charset declaration of our own */
    val head = doc.selectFirst(":root>head")
    if (head == null) {
        LOGGER.log(Level.SEVERE, "no head found!")
        return null
    }
    head.prependElement("meta")
        .attr("http-equiv", "Content-Type")
        .attr("content", "text/html; charset=$CHARSET_NAME")
    return head
}

/**
 * Serialize a document, using CHARSET_NAME as the output charset and
 * XML syntax.
 * @param doc Document to serialize
 * @return the document's outer HTML
 */
private fun _output(doc: Document): String {
    val settings = doc.outputSettings()
    settings.charset(CHARSET_NAME)
    settings.syntax(Document.OutputSettings.Syntax.xml)
    return doc.outerHtml()
}

/**
 * Copies this node's child list into a fresh array. Iterating over the
 * copy lets callers move or remove children without running into a
 * ConcurrentModificationException.
 * @return Array<Node>
 */
fun Node.getChildrenAsArray(): Array<Node> = childNodes().toTypedArray()

/**
 * Strip each of the named attributes from this element. Names the
 * element does not carry are skipped.
 * @param hitList names of the attributes to remove
 */
fun Element.removeAll(vararg hitList: String) {
    for (name in hitList) {
        if (!hasAttr(name)) {
            continue
        }
        removeAttr(name)
    }
}
author	David Barts <n5jrn@me.com>
date	Sun, 12 Apr 2020 14:31:06 -0700
parents	0c6c18a733b7
children