<?php
function plaintext_from_HTML($HTML_string)
{
$document_data = new DOMDocument();
$document_data->loadHTML($HTML_string);
return go_through_recursively($document_data->getElementsByTagName('body')[0]);
}
function go_through_recursively(DomNode $node)
{
global $buffer;
if (!isset($buffer))
$buffer = '';
$node_name = $node->nodeName;
$parent_node_name = $node->parentNode->nodeName;
$text_contents = $node->textContent;
// This is some text content, meaning we are inside an element such as a <p> or <h1>.
if ($node_name == '#text')
{
if ($parent_node_name == 'h1')
$buffer .= 'H1: ' . $text_contents . "\n\n";
if ($parent_node_name == 'h2')
$buffer .= 'H2: ' . $text_contents . "\n\n";
if ($parent_node_name == 'h3')
$buffer .= 'H3: ' . $text_contents . "\n\n";
if ($parent_node_name == 'p')
$buffer .= $text_contents . "\n\n";
if ($parent_node_name == 'strong')
$buffer .= '**' . $text_contents . '**';
if ($parent_node_name == 'em')
$buffer .= '*' . $text_contents . '*';
if ($parent_node_name == 'a')
$buffer .= $text_contents . ' ( ' . 'this is supposed to be the URL, but I can\'t figure out how to grab the "href"...' . ' )';
}
else // It's an actual element.
{
if ($node_name == 'br')
$buffer .= "\n";
if ($node_name == 'hr')
$buffer .= '---------------' . "\n" . "\n";
}
if ($node->childNodes)
{
foreach ($node->childNodes as $node)
go_through_recursively($node);
}
return $buffer;
}
$HTML_string = '
<h1>Test of h1</h1>
<p>This is a p test.</p>
<h2>Test of h2</h2>
<p>This is a p test with a <strong>strong emphasis</strong> followed by this.</p>
<h3>Test of h3</h3>
<p>This here is a link: <a href="http://www.example.com/1">Example.com</a>.<br>
And this is a linebreak.</p>
<p>Another paragraph, followed by a horizontal line:</p>
<hr>
<p>A final paragraph with <a href="http://www.example.com/2"><em>some emphasis</em> inside a link</a>.</p>
';
echo plaintext_from_HTML($HTML_string);
/*
DESIRED/EXPECTED OUTPUT:
H1: Test of h1
This is a p test.
H2: Test of h2
This is a p test with a **strong emphasis** followed by this.
H3: Test of h3
This here is a link: Example.com ( http://www.example.com/1 ).
And this is a linebreak.
Another paragraph, followed by a horizontal line:
---------------
A final paragraph with *some emphasis* inside a link ( http://www.example.com/2 ).
ACTUAL OUTPUT:
H1: Test of h1
This is a p test.
H2: Test of h2
This is a p test with a
**strong emphasis** followed by this.
H3: Test of h3
This here is a link:
Example.com ( this is supposed to be the URL, but I can't figure out how to grab the "href"... ).
And this is a linebreak.
Another paragraph, followed by a horizontal line:
---------------
A final paragraph with
*some emphasis* inside a link ( this is supposed to be the URL, but I can't figure out how to grab the "href"... ).
*/
H1: Test of h1
This is a p test.
H2: Test of h2
This is a p test with a
**strong emphasis** followed by this.
H3: Test of h3
This here is a link:
Example.com ( this is supposed to be the URL, but I can't figure out how to grab the "href"... ).
And this is a linebreak.
Another paragraph, followed by a horizontal line:
---------------
A final paragraph with
*some emphasis* inside a link ( this is supposed to be the URL, but I can't figure out how to grab the "href"... ).