3v4l.org

run code in 300+ PHP versions simultaneously
<?php /** * Strip the HTML of Word documents * * @author rene.veldink * @author christoph.roensch */ class Hgs_Filter_StripWordHtml #implements Zend_Filter_Interface { /** * @var string */ const ALLOW_DEFAULT = " <b><i><sup><sub><em><strong><u><br><ol><li><ul><span><div><h1><h2><h3><p>"; /** * @var string */ protected $_allow = self::ALLOW_DEFAULT; /** * Constructor * * @param array|string $options * @return void */ public function __construct($options = array()) { if (is_string($options)) { $this->_allow = $options; } if (isset($options['allow'])) { $this->_allow = $options['allow']; } } /** * @param string $value * @return string */ public function filter($value) { return $this->strip_word_html($value, $this->_allow); } /** * - moved $allow default value to class property * - fixed regex for 'simplify style tags', <br> were replaced with <b> * - fixed bad escape sequence near $num_matches * @link http://php.net/manual/de/function.strip-tags.php#99643 */ private function strip_word_html($text, $allowed_tags = self::ALLOW_DEFAULT) { echo "Debug: $this->_allow\n"; mb_regex_encoding('UTF-8'); //replace MS special characters first $search = array('/&lsquo;/u', '/&rsquo;/u', '/&ldquo;/u', '/&rdquo;/u', '/&mdash;/u'); $replace = array('\'', '\'', '"', '"', '-'); $text = preg_replace($search, $replace, $text); //make sure _all_ html entities are converted to the plain ascii equivalents - it appears //in some MS headers, some html entities are encoded and some aren't $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8'); //try to strip out any C style comments first, since these, embedded in html comments, seem to //prevent strip_tags from removing html comments (MS Word introduced combination) if(mb_stripos($text, '/*') !== FALSE){ $text = mb_eregi_replace('#/\*.*?\*/#s', '', $text, 'm'); } //introduce a space into any arithmetic expressions that could be caught by strip_tags so that they won't be //'<1' becomes '< 1'(note: somewhat application specific) $text = preg_replace(array('/<([0-9]+)/'), array('< $1'), $text); $text = strip_tags($text, $allowed_tags); //eliminate extraneous whitespace from start and end of line, or anywhere there are two or more spaces, convert it to one $text = preg_replace(array('/^\s\s+/', '/\s\s+$/', '/\s\s+/u'), array('', '', ' '), $text); //strip out inline css and simplify style tags $search = array('#<(strong|b)\s+[^>]*>(.*?)</(strong|b)>#isu', '#<(em|i)[^>]*>(.*?)</(em|i)>#isu', '#<u[^>]*>(.*?)</u>#isu'); $replace = array('<b>$2</b>', '<i>$2</i>', '<u>$1</u>'); $text = preg_replace($search, $replace, $text); //on some of the ?newer MS Word exports, where you get conditionals of the form 'if gte mso 9', etc., it appears //that whatever is in one of the html comments prevents strip_tags from eradicating the html comment that contains //some MS Style Definitions - this last bit gets rid of any leftover comments $num_matches = preg_match_all('/\<!--/u', $text, $matches); if ($num_matches) { $text = preg_replace('/\<!--(.)*--\>/isu', '', $text); } return $text; } } $a = "<p>Hey<br /><b>Ho</b>,<i>Lets Go!</i></p><ol><li>1</li></ol><ul><li>2</li></ul>"; $filter = new Hgs_Filter_StripWordHtml(Hgs_Filter_StripWordHtml::ALLOW_DEFAULT); $b = $filter->filter($a); var_dump($a == $b); echo "\n$b";
Finding entry points
Branch analysis from position: 0
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/jITsR
function name:  (null)
number of ops:  17
compiled vars:  !0 = $a, !1 = $filter, !2 = $b
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   88     0  E >   ASSIGN                                                   !0, '%3Cp%3EHey%3Cbr+%2F%3E%3Cb%3EHo%3C%2Fb%3E%2C%3Ci%3ELets+Go%21%3C%2Fi%3E%3C%2Fp%3E%3Col%3E%3Cli%3E1%3C%2Fli%3E%3C%2Fol%3E%3Cul%3E%3Cli%3E2%3C%2Fli%3E%3C%2Ful%3E'
   90     1        NEW                                              $4      'Hgs_Filter_StripWordHtml'
          2        SEND_VAL_EX                                              '+%3Cb%3E%3Ci%3E%3Csup%3E%3Csub%3E%3Cem%3E%3Cstrong%3E%3Cu%3E%3Cbr%3E%3Col%3E%3Cli%3E%3Cul%3E%3Cspan%3E%3Cdiv%3E%3Ch1%3E%3Ch2%3E%3Ch3%3E%3Cp%3E'
          3        DO_FCALL                                      0          
          4        ASSIGN                                                   !1, $4
   91     5        INIT_METHOD_CALL                                         !1, 'filter'
          6        SEND_VAR_EX                                              !0
          7        DO_FCALL                                      0  $7      
          8        ASSIGN                                                   !2, $7
   93     9        INIT_FCALL                                               'var_dump'
         10        IS_EQUAL                                         ~9      !0, !2
         11        SEND_VAL                                                 ~9
         12        DO_ICALL                                                 
   95    13        NOP                                                      
         14        FAST_CONCAT                                      ~11     '%0A', !2
         15        ECHO                                                     ~11
         16      > RETURN                                                   1

Class Hgs_Filter_StripWordHtml:
Function __construct:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 43) Position 1 = 3, Position 2 = 5
Branch analysis from position: 3
2 jumps found. (Code = 43) Position 1 = 7, Position 2 = 10
Branch analysis from position: 7
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 10
Branch analysis from position: 5
filename:       /in/jITsR
function name:  __construct
number of ops:  11
compiled vars:  !0 = $options
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   26     0  E >   RECV_INIT                                        !0      <array>
   28     1        TYPE_CHECK                                   64          !0
          2      > JMPZ                                                     ~1, ->5
   29     3    >   ASSIGN_OBJ                                               '_allow'
          4        OP_DATA                                                  !0
   31     5    >   ISSET_ISEMPTY_DIM_OBJ                         0          !0, 'allow'
          6      > JMPZ                                                     ~3, ->10
   32     7    >   FETCH_DIM_R                                      ~5      !0, 'allow'
          8        ASSIGN_OBJ                                               '_allow'
          9        OP_DATA                                                  ~5
   34    10    > > RETURN                                                   null

End of function __construct

Function filter:
Finding entry points
Branch analysis from position: 0
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/jITsR
function name:  filter
number of ops:  9
compiled vars:  !0 = $value
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   40     0  E >   RECV                                             !0      
   42     1        INIT_METHOD_CALL                                         'strip_word_html'
          2        SEND_VAR_EX                                              !0
          3        CHECK_FUNC_ARG                                           
          4        FETCH_OBJ_FUNC_ARG                               $1      '_allow'
          5        SEND_FUNC_ARG                                            $1
          6        DO_FCALL                                      0  $2      
          7      > RETURN                                                   $2
   43     8*     > RETURN                                                   null

End of function filter

Function strip_word_html:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 43) Position 1 = 30, Position 2 = 37
Branch analysis from position: 30
2 jumps found. (Code = 43) Position 1 = 69, Position 2 = 75
Branch analysis from position: 69
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 75
Branch analysis from position: 37
filename:       /in/jITsR
function name:  strip_word_html
number of ops:  77
compiled vars:  !0 = $text, !1 = $allowed_tags, !2 = $search, !3 = $replace, !4 = $num_matches, !5 = $matches
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   51     0  E >   RECV                                             !0      
          1        RECV_INIT                                        !1      <const ast>
   53     2        ROPE_INIT                                     3  ~8      'Debug%3A+'
          3        FETCH_OBJ_R                                      ~6      '_allow'
          4        ROPE_ADD                                      1  ~8      ~8, ~6
          5        ROPE_END                                      2  ~7      ~8, '%0A'
          6        ECHO                                                     ~7
   54     7        INIT_FCALL                                               'mb_regex_encoding'
          8        SEND_VAL                                                 'UTF-8'
          9        DO_ICALL                                                 
   56    10        ASSIGN                                                   !2, <array>
   57    11        ASSIGN                                                   !3, <array>
   58    12        INIT_FCALL                                               'preg_replace'
         13        SEND_VAR                                                 !2
         14        SEND_VAR                                                 !3
         15        SEND_VAR                                                 !0
         16        DO_ICALL                                         $13     
         17        ASSIGN                                                   !0, $13
   61    18        INIT_FCALL                                               'html_entity_decode'
         19        SEND_VAR                                                 !0
         20        SEND_VAL                                                 3
         21        SEND_VAL                                                 'UTF-8'
         22        DO_ICALL                                         $15     
         23        ASSIGN                                                   !0, $15
   64    24        INIT_FCALL                                               'mb_stripos'
         25        SEND_VAR                                                 !0
         26        SEND_VAL                                                 '%2F%2A'
         27        DO_ICALL                                         $17     
         28        TYPE_CHECK                                  1018          $17
         29      > JMPZ                                                     ~18, ->37
   65    30    >   INIT_FCALL                                               'mb_eregi_replace'
         31        SEND_VAL                                                 '%23%2F%5C%2A.%2A%3F%5C%2A%2F%23s'
         32        SEND_VAL                                                 ''
         33        SEND_VAR                                                 !0
         34        SEND_VAL                                                 'm'
         35        DO_ICALL                                         $19     
         36        ASSIGN                                                   !0, $19
   69    37    >   INIT_FCALL                                               'preg_replace'
         38        SEND_VAL                                                 <array>
         39        SEND_VAL                                                 <array>
         40        SEND_VAR                                                 !0
         41        DO_ICALL                                         $21     
         42        ASSIGN                                                   !0, $21
   70    43        INIT_FCALL                                               'strip_tags'
         44        SEND_VAR                                                 !0
         45        SEND_VAR                                                 !1
         46        DO_ICALL                                         $23     
         47        ASSIGN                                                   !0, $23
   72    48        INIT_FCALL                                               'preg_replace'
         49        SEND_VAL                                                 <array>
         50        SEND_VAL                                                 <array>
         51        SEND_VAR                                                 !0
         52        DO_ICALL                                         $25     
         53        ASSIGN                                                   !0, $25
   74    54        ASSIGN                                                   !2, <array>
   75    55        ASSIGN                                                   !3, <array>
   76    56        INIT_FCALL                                               'preg_replace'
         57        SEND_VAR                                                 !2
         58        SEND_VAR                                                 !3
         59        SEND_VAR                                                 !0
         60        DO_ICALL                                         $29     
         61        ASSIGN                                                   !0, $29
   80    62        INIT_FCALL                                               'preg_match_all'
         63        SEND_VAL                                                 '%2F%5C%3C%21--%2Fu'
         64        SEND_VAR                                                 !0
         65        SEND_REF                                                 !5
         66        DO_ICALL                                         $31     
         67        ASSIGN                                                   !4, $31
   81    68      > JMPZ                                                     !4, ->75
   82    69    >   INIT_FCALL                                               'preg_replace'
         70        SEND_VAL                                                 '%2F%5C%3C%21--%28.%29%2A--%5C%3E%2Fisu'
         71        SEND_VAL                                                 ''
         72        SEND_VAR                                                 !0
         73        DO_ICALL                                         $33     
         74        ASSIGN                                                   !0, $33
   84    75    > > RETURN                                                   !0
   85    76*     > RETURN                                                   null

End of function strip_word_html

End of class Hgs_Filter_StripWordHtml.

Generated using Vulcan Logic Dumper, using php 8.0.0


preferences:
177.74 ms | 1404 KiB | 29 Q