Я использовал встроенный класс DOMDocument, но с некоторыми улучшениями по безопасности.
Обратите внимание, что другие ответы, использующие DOMDocument, не учитывают фрагменты HTML, которые начинаются с текстового узла, например:
This is a <em>HTML</em> strand
Для такого фрагмента они вернут результат, обёрнутый в лишний тег <p>:
<p>This is a <em>HTML</em> strand
Моё решение приведено ниже:
/**
 * Closes unterminated tags in an HTML fragment by round-tripping it through DOMDocument.
 *
 * Strings that contain neither '<' nor '>' are returned untouched. Fragments that begin
 * with text rather than a tag are first wrapped in an element with a known id (via
 * cHE::getDivHtml, defined elsewhere) so that the parser does not wrap the leading text
 * in an element of its own; the wrapper is stripped again before returning.
 *
 * Entities that stand for markup-significant ASCII characters (&lt;, &gt;, &amp;, ...)
 * are deliberately left encoded in the result, so escaped text such as
 * "&lt;script&gt;" can never be turned back into live markup.
 *
 * @param string $html UTF-8 encoded HTML fragment.
 *
 * @return string The fragment with dangling tags closed, or the input unchanged when it
 *                contains no angle brackets or the temporary wrapper cannot be found.
 */
function closeDanglingTags($html) {
    // strpos() returns 0 (falsy) for a match at the very start of the string, so the
    // result has to be compared against false explicitly.
    if (strpos($html, '<') === false && strpos($html, '>') === false) {
        return $html;
    }

    $original = $html;
    $wrapped = false;
    if (strpos(trim($html), '<') !== 0) {
        // The fragment starts with a text node. Wrap it in an element with a known id so
        // it can be retrieved safely after parsing.
        // NOTE(review): assumes cHE::getDivHtml($content, $class, $id) emits the id
        // attribute looked up below - confirm against its definition.
        $html = cHE::getDivHtml($html, null, 'closedanglingtagswrapper');
        $wrapped = true;
    }

    $doc = new DOMDocument();
    $doc->encoding = 'utf-8'; // Kept from the original implementation.

    // Feed the parser pure ASCII: every non-ASCII character becomes a numeric entity.
    // This replaces the 'HTML-ENTITIES' pseudo-encoding, which is deprecated since PHP 8.2.
    $asciiHtml = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    // Collect parser complaints about malformed markup internally instead of silencing
    // everything with "@", then restore the caller's libxml error mode.
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc->loadHTML($asciiHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // Remove the doctype that loadHTML() adds.
    if ($doc->firstChild instanceof DOMDocumentType) {
        $doc->removeChild($doc->firstChild);
    }

    if ($wrapped) {
        $wrapper = $doc->getElementById('closedanglingtagswrapper');
        if ($wrapper === null) {
            // The wrapper could not be located; hand back the input rather than
            // dereferencing null.
            return $original;
        }

        // Copy the wrapper's children into a fresh document so that neither the wrapper
        // nor the generated <html>/<body> elements end up in the output.
        $fragment = new DOMDocument();
        foreach ($wrapper->childNodes as $child) {
            $fragment->appendChild($fragment->importNode($child, true));
        }
        $doc = $fragment;
    }

    $serialized = $doc->saveHTML();

    // Turn entities back into UTF-8 characters, but only those that decode to something
    // outside ASCII. Decoding everything (as html_entity_decode() on the whole string
    // would) un-escapes &lt; / &gt; / &amp; and corrupts or weaponises escaped text.
    $decoded = preg_replace_callback(
        '/&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/',
        function ($match) {
            $char = html_entity_decode($match[0], ENT_QUOTES | ENT_HTML5, 'UTF-8');

            // A single-byte result is an ASCII character: keep it escaped.
            return strlen($char) === 1 ? $match[0] : $char;
        },
        $serialized
    );
    if ($decoded === null) {
        // PCRE failure: fall back to the still-encoded (and therefore safe) output.
        $decoded = $serialized;
    }

    // Remove the added html/body tags.
    return trim(str_replace(array('<html><body>', '</body></html>'), '', $decoded));
}