<?php
/**
* Strip the HTML of Word documents
*
* @author rene.veldink
* @author christoph.roensch
*/
class Hgs_Filter_StripWordHtml #implements Zend_Filter_Interface
{
/**
* @var string
*/
const ALLOW_DEFAULT = '<b><i><sup><sub><em><strong><u><br><ol><li><ul><span><div><h1><h2><h3><p>';
/**
* @var string
*/
protected $_allow = '<b><i><sup><sub><em><strong><u><br><ol><li><ul><span><div><h1><h2><h3><p>';
/**
* Constructor
*
* @param array|string $options
* @return void
*/
public function __construct($options = array())
{
if (is_string($options)) {
$this->_allow = $options;
}
if (isset($options['allow'])) {
$this->_allow = $options['allow'];
}
}
/**
* @param string $value
* @return string
*/
public function filter($value)
{
return $this->strip_word_html($value, $this->_allow);
}
/**
* - moved $allow default value to class property
* - fixed regex for 'simplify style tags', <br> were replaced with <b>
* - fixed bad escape sequence near $num_matches
* @link http://php.net/manual/de/function.strip-tags.php#99643
*/
private function strip_word_html($text, $allowed_tags /*= self::ALLOW_DEFAULT*/)
{
echo "Debug: $this->_allow\n";
mb_regex_encoding('UTF-8');
//replace MS special characters first
$search = array('/‘/u', '/’/u', '/“/u', '/”/u', '/—/u');
$replace = array('\'', '\'', '"', '"', '-');
$text = preg_replace($search, $replace, $text);
//make sure _all_ html entities are converted to the plain ascii equivalents - it appears
//in some MS headers, some html entities are encoded and some aren't
$text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
//try to strip out any C style comments first, since these, embedded in html comments, seem to
//prevent strip_tags from removing html comments (MS Word introduced combination)
if(mb_stripos($text, '/*') !== FALSE){
$text = mb_eregi_replace('#/\*.*?\*/#s', '', $text, 'm');
}
//introduce a space into any arithmetic expressions that could be caught by strip_tags so that they won't be
//'<1' becomes '< 1'(note: somewhat application specific)
$text = preg_replace(array('/<([0-9]+)/'), array('< $1'), $text);
$text = strip_tags($text, $allowed_tags);
//eliminate extraneous whitespace from start and end of line, or anywhere there are two or more spaces, convert it to one
$text = preg_replace(array('/^\s\s+/', '/\s\s+$/', '/\s\s+/u'), array('', '', ' '), $text);
//strip out inline css and simplify style tags
$search = array('#<(strong|b)\s+[^>]*>(.*?)</(strong|b)>#isu', '#<(em|i)[^>]*>(.*?)</(em|i)>#isu', '#<u[^>]*>(.*?)</u>#isu');
$replace = array('<b>$2</b>', '<i>$2</i>', '<u>$1</u>');
$text = preg_replace($search, $replace, $text);
//on some of the ?newer MS Word exports, where you get conditionals of the form 'if gte mso 9', etc., it appears
//that whatever is in one of the html comments prevents strip_tags from eradicating the html comment that contains
//some MS Style Definitions - this last bit gets rid of any leftover comments
$num_matches = preg_match_all('/\<!--/u', $text, $matches);
if ($num_matches) {
$text = preg_replace('/\<!--(.)*--\>/isu', '', $text);
}
return $text;
}
}
$a = "<p>Hey<br /><b>Ho</b>,<i>Lets Go!</i></p><ol><li>1</li></ol><ul><li>2</li></ul>";
$filter = new Hgs_Filter_StripWordHtml(Hgs_Filter_StripWordHtml::ALLOW_DEFAULT);
$b = $filter->filter($a);
var_dump($a == $b);
echo "\n$b";
Debug: <
Fatal error: Call to undefined function mb_stripos() in /in/GkIjs on line 64
Process exited with code 255.
Output for 5.0.0 - 5.0.2
Debug: <b><i><sup><sub><em><strong><u><br><ol><li><ul><span><div><h1><h2><h3><p>
Fatal error: Call to undefined function mb_stripos() in /in/GkIjs on line 64
Process exited with code 255.
Output for 4.4.2 - 4.4.9
Parse error: syntax error, unexpected T_CONST, expecting T_OLD_FUNCTION or T_FUNCTION or T_VAR or '}' in /in/GkIjs on line 13
Process exited with code 255.
Parse error: parse error, unexpected T_CONST, expecting T_OLD_FUNCTION or T_FUNCTION or T_VAR or '}' in /in/GkIjs on line 13
Process exited with code 255.
Output for 4.3.2 - 4.3.4
Parse error: parse error, expecting `T_OLD_FUNCTION' or `T_FUNCTION' or `T_VAR' or `'}'' in /in/GkIjs on line 13
Process exited with code 255.