[Web] Add html2text converter; Show quarantine html elements as text, do not escape html
This commit is contained in:
394
data/web/inc/lib/vendor/soundasleep/html2text/src/Html2Text.php
vendored
Normal file
394
data/web/inc/lib/vendor/soundasleep/html2text/src/Html2Text.php
vendored
Normal file
@@ -0,0 +1,394 @@
|
||||
<?php
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2010 Jevon Wright and others.
|
||||
* All rights reserved. This program and the accompanying materials
|
||||
* are made available under the terms of the Eclipse Public License v1.0
|
||||
* which accompanies this distribution, and is available at
|
||||
* http://www.eclipse.org/legal/epl-v10.html
|
||||
*
|
||||
* or
|
||||
*
|
||||
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
|
||||
*
|
||||
*
|
||||
* Contributors:
|
||||
* Jevon Wright - initial API and implementation
|
||||
****************************************************************************/
|
||||
|
||||
namespace Html2Text;
|
||||
|
||||
class Html2Text {

  /**
   * Tries to convert the given HTML into a plain text format - best suited for
   * e-mail display, etc.
   *
   * <p>In particular, it tries to maintain the following features:
   * <ul>
   *   <li>Links are maintained, with the 'href' copied over
   *   <li>Information in the <head> is lost
   * </ul>
   *
   * @param string $html the input HTML
   * @param boolean $ignore_error Ignore xml parsing errors
   * @return string the HTML converted, as best as possible, to text
   * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
   */
  public static function convert($html, $ignore_error = false) {
    // Replace non-breaking spaces with regular spaces: both the HTML entity
    // and the raw UTF-8 byte sequence (U+00A0).
    $html = str_replace(array("&nbsp;", "\xc2\xa0"), " ", $html);

    $is_office_document = static::isOfficeDocument($html);

    if ($is_office_document) {
      // remove office namespace
      $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
    }

    $html = static::fixNewlines($html);

    if (mb_detect_encoding($html, "UTF-8", true)) {
      // Turn every non-ASCII character into a numeric character reference so
      // the HTML parser does not depend on guessing the input encoding.
      // (The "HTML-ENTITIES" pseudo-encoding of mb_convert_encoding() is
      // deprecated as of PHP 8.2; numeric references parse to the same DOM.)
      $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
    }

    $doc = static::getDocument($html, $ignore_error);

    $output = static::iterateOverNode($doc, null, false, $is_office_document);

    // remove leading and trailing spaces on each line
    $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
    $output = preg_replace("/ *\t */im", "\t", $output);

    // unarmor pre blocks: newlines inside <pre> were protected as \r by
    // iterateOverNode() so the whitespace cleanup above could not touch them
    $output = str_replace("\r", "\n", $output);

    // remove unnecessary empty lines
    $output = preg_replace("/\n\n\n*/im", "\n\n", $output);

    // remove leading and trailing whitespace
    return trim($output);
  }

  /**
   * Unify newlines; in particular, \r\n becomes \n, and
   * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
   * all become \ns.
   *
   * @param string $text text with any number of \r, \r\n and \n combinations
   * @return string the fixed text
   */
  public static function fixNewlines($text) {
    // replace \r\n to \n
    $text = str_replace("\r\n", "\n", $text);
    // replace any remaining lone \r with \n
    $text = str_replace("\r", "\n", $text);

    return $text;
  }

  /**
   * Parse HTML into a DOMDocument
   *
   * @param string $html the input HTML
   * @param boolean $ignore_error Ignore xml parsing errors
   * @return \DOMDocument the parsed document tree
   * @throws Html2TextException if loadHTML() reports failure
   */
  public static function getDocument($html, $ignore_error = false) {
    $doc = new \DOMDocument();

    $html = trim($html);

    // Compare against the empty string explicitly: a plain truthiness test
    // would also discard the perfectly valid input "0".
    if ($html === '') {
      // DOMDocument doesn't support empty value and throws an error
      // Return empty document instead
      return $doc;
    }

    if ($html[0] !== '<') {
      // If HTML does not begin with a tag, we put a body tag around it.
      // If we do not do this, PHP will insert a paragraph tag around
      // the first block of text for some reason which can mess up
      // the newlines. See pre.html test for an example.
      $html = '<body>' . $html . '</body>';
    }

    if ($ignore_error) {
      $doc->strictErrorChecking = false;
      $doc->recover = true;
      $doc->xmlStandalone = true;
      // Collect libxml errors internally while parsing, then restore the
      // caller's previous setting.
      $old_internal_errors = libxml_use_internal_errors(true);
      $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
      libxml_use_internal_errors($old_internal_errors);
    }
    else {
      $load_result = $doc->loadHTML($html);
    }

    if (!$load_result) {
      throw new Html2TextException("Could not load HTML - badly formed?", $html);
    }

    return $doc;
  }

  /**
   * Can we guess that this HTML is generated by Microsoft Office?
   *
   * @param string $html the input HTML
   * @return boolean true if the Office schema URN occurs anywhere in the input
   */
  public static function isOfficeDocument($html) {
    return strpos($html, "urn:schemas-microsoft-com:office") !== false;
  }

  /**
   * Is the given text made up only of newlines, carriage returns, tabs and
   * spaces (or empty)?
   *
   * @param string $text the text to check
   * @return boolean
   */
  public static function isWhitespace($text) {
    return strlen(trim($text, "\n\r\t ")) === 0;
  }

  /**
   * Find the lower-cased node name of the next meaningful sibling of a node:
   * the first following sibling that is either an element or a text node
   * that is not just whitespace.
   *
   * @param \DOMNode $node the node whose following siblings are inspected
   * @return string|null the sibling's node name, or null if there is none
   */
  public static function nextChildName($node) {
    $nextNode = $node->nextSibling;

    while ($nextNode !== null) {
      if ($nextNode instanceof \DOMElement) {
        break;
      }
      if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
        break;
      }
      // whitespace-only text, comments, etc. are skipped
      $nextNode = $nextNode->nextSibling;
    }

    if ($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) {
      return strtolower($nextNode->nodeName);
    }

    return null;
  }

  /**
   * Recursively render a DOM node (and its children) as text.
   *
   * Note that this is destructive: child nodes are removed from $node while
   * they are processed.
   *
   * @param \DOMNode $node the node to render
   * @param string|null $prevName lower-cased name of the previous visible sibling, if any
   * @param boolean $in_pre whether the node is inside a <pre> element
   * @param boolean $is_office_document whether the input looked like Microsoft Office HTML
   * @return string the text for this node; inside <pre>, newlines are armored as \r
   */
  public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

    if ($node instanceof \DOMText) {
      if ($in_pre) {
        $text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
        // Remove trailing whitespace only
        $text = preg_replace("/[ \t]*\n/im", "\n", $text);
        // armor newlines with \r.
        return str_replace("\n", "\r", $text);
      }

      // Replace whitespace characters with a space (equivalent to \s)
      $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
      if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
        // text directly following a block element starts on its own line
        return "\n" . $text;
      }
      return $text;
    }

    if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
      // ignore
      return "";
    }

    $name = strtolower($node->nodeName);
    $nextName = static::nextChildName($node);

    // start whitespace
    switch ($name) {
      case "hr":
        $prefix = '';
        if ($prevName != null) {
          $prefix = "\n";
        }
        return $prefix . "---------------------------------------------------------------\n";

      case "style":
      case "head":
      case "title":
      case "meta":
      case "script":
        // ignore these tags
        return "";

      case "h1":
      case "h2":
      case "h3":
      case "h4":
      case "h5":
      case "h6":
      case "ol":
      case "ul":
        // add two newlines, second line is added below
        $output = "\n";
        break;

      case "td":
      case "th":
        // add tab char to separate table fields
        $output = "\t";
        break;

      case "p":
        // Microsoft exchange emails often include HTML which, when passed through
        // html2text, results in lots of double line returns everywhere.
        //
        // To fix this, for any p element with a className of `MsoNormal` (the standard
        // classname in any Microsoft export or outlook for a paragraph that behaves
        // like a line return) we skip the first line returns and set the name to br.
        if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
          $output = "";
          $name = 'br';
          break;
        }
        // add two lines
        $output = "\n\n";
        break;

      case "pre":
      case "tr":
      case "div":
        // add one line
        $output = "\n";
        break;

      case "li":
        $output = "- ";
        break;

      default:
        // print out contents of unknown tags
        $output = "";
        break;
    }

    if (isset($node->childNodes)) {

      $n = $node->childNodes->item(0);
      $previousSiblingName = null;

      while ($n != null) {

        $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);

        // Pass current node name to next child, as previousSibling does not appear to get populated.
        // Doctypes, processing instructions and whitespace-only text are
        // invisible, so they keep the current previousSiblingName.
        $invisible = $n instanceof \DOMDocumentType
          || $n instanceof \DOMProcessingInstruction
          || ($n instanceof \DOMText && static::isWhitespace($text));
        if (!$invisible) {
          $previousSiblingName = strtolower($n->nodeName);
        }

        // consume the child; the next one then becomes item(0)
        $node->removeChild($n);
        $n = $node->childNodes->item(0);

        // suppress last br tag inside a node list
        if ($n != null || $previousSiblingName != 'br') {
          $output .= $text;
        }
      }
    }

    // end whitespace
    switch ($name) {
      case "h1":
      case "h2":
      case "h3":
      case "h4":
      case "h5":
      case "h6":
        $output .= "\n";
        break;

      case "p":
        // add two lines
        $output .= "\n\n";
        break;

      case "pre":
      case "br":
        // add one line
        $output .= "\n";
        break;

      case "a":
        // links are returned in [text](link) format
        $href = $node->getAttribute("href");
        $title = $node->getAttribute("title");

        $output = trim($output);

        // remove double [[ ]] s from linking images
        if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
          $output = substr($output, 1, strlen($output) - 2);

          // for linking images, the title of the <a> overrides the title of the <img>
          if ($title !== '') {
            $output = $title;
          }
        }

        // if there is no link text, but a title attr
        // (explicit '' checks so that link text "0" is not mistaken for empty)
        if ($output === '' && $title !== '') {
          $output = $title;
        }

        if ($href === '') {
          // it doesn't link anywhere
          if ($node->getAttribute("name") !== '') {
            $output = "[$output]";
          }
        } elseif ($href === $output || $href === "mailto:$output" || $href === "http://$output" || $href === "https://$output") {
          // link to the same address: the text already is the link, keep it as is
        } elseif ($output !== '') {
          $output = "[$output]($href)";
        } else {
          // no text at all: fall back to the bare link
          $output = $href;
        }

        // does the next node require additional whitespace?
        switch ($nextName) {
          case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
            $output .= "\n";
            break;
        }
        break;

      case "img":
        // images are represented by their title, else their alt text, in brackets
        if ($node->getAttribute("title") !== '') {
          $output = "[" . $node->getAttribute("title") . "]";
        } elseif ($node->getAttribute("alt") !== '') {
          $output = "[" . $node->getAttribute("alt") . "]";
        } else {
          $output = "";
        }
        break;

      case "li":
        $output .= "\n";
        break;

      default:
        // do nothing (this includes "div", which gets no trailing whitespace)
    }

    return $output;
  }
}
|
28
data/web/inc/lib/vendor/soundasleep/html2text/src/Html2TextException.php
vendored
Normal file
28
data/web/inc/lib/vendor/soundasleep/html2text/src/Html2TextException.php
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
<?php
|
||||
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2010 Jevon Wright and others.
|
||||
* All rights reserved. This program and the accompanying materials
|
||||
* are made available under the terms of the Eclipse Public License v1.0
|
||||
* which accompanies this distribution, and is available at
|
||||
* http://www.eclipse.org/legal/epl-v10.html
|
||||
*
|
||||
* or
|
||||
*
|
||||
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
|
||||
*
|
||||
*
|
||||
* Contributors:
|
||||
* Jevon Wright - initial API and implementation
|
||||
****************************************************************************/
|
||||
|
||||
namespace Html2Text;
|
||||
|
||||
/**
 * Exception thrown when HTML cannot be converted to text.
 *
 * In addition to the regular exception message it carries a free-form
 * "more info" value supplied by the thrower.
 */
class Html2TextException extends \Exception {

  /**
   * Additional context passed to the constructor.
   *
   * Declared with explicit `public` visibility (equivalent to the legacy
   * `var` keyword, so existing readers of `$e->more_info` keep working).
   *
   * @var string
   */
  public $more_info;

  /**
   * @param string $message the exception message
   * @param string $more_info additional context, stored in {@link $more_info}
   */
  public function __construct($message = "", $more_info = "") {
    parent::__construct($message);
    $this->more_info = $more_info;
  }

}
|
Reference in New Issue
Block a user