3v4l.org

run code in 300+ PHP versions simultaneously
<?php
$scraper = new DOMScraper();

// Example only: couldn't think of a site with an example table.
$scraper->setSite('http://www.farsnews.com/economy/agriculture')->setSource();

// Get only elements whose attribute matches, e.g. id="some_table_id" or class="something".
echo '<div>'.$scraper->getInnerHTML('div','class=ctgnewsmainpane').'</div>';

/**
 * Generic DOM scraper using DOMDocument and cURL.
 *
 * Usage: setSite($url)->setSource() downloads the page, then getInnerHTML()
 * extracts the markup (or text) of every matching element.
 */
class DOMScraper extends DOMDocument{

    /** URL of the page to scrape; set through setSite(). */
    public $site;

    /** Raw response body as returned by get_data(); false when the download failed. */
    private $source;

    /** Not referenced by any method of this class. */
    private $dom;

    /**
     * Initialises the underlying document and relaxes libxml error handling,
     * so that parse problems are collected internally instead of being emitted.
     */
    function __construct(){
        // Initialise the parent DOMDocument before assigning its properties.
        parent::__construct();
        libxml_use_internal_errors(true);
        $this->preserveWhiteSpace = false;
        $this->strictErrorChecking = false;
    }

    /**
     * Stores the URL that setSource() will download.
     *
     * @param string $site URL of the page to scrape
     * @return $this
     */
    function setSite($site){
        $this->site = $site;
        return $this;
    }

    /**
     * Downloads the page at $this->site and keeps its body for later parsing.
     *
     * @return $this|string $this on success, or an error message string when
     *                      no site has been set yet
     */
    function setSource(){
        if(empty($this->site)){
            return 'Error: Missing $this->site, use setSite() first';
        }
        $this->source = $this->get_data($this->site);
        return $this;
    }

    /**
     * Concatenates the inner HTML (or the trimmed text) of every $tag element,
     * optionally restricted to elements whose attribute equals a given value.
     *
     * @param string      $tag       tag name to look for, e.g. 'div'
     * @param string|null $class     attribute filter written as 'name=value'
     *                               (e.g. 'class=ctgnewsmainpane'); null matches
     *                               every $tag element. Only the first '=' splits
     *                               name from value; a filter without '=' is
     *                               compared against the empty string.
     * @param bool        $nodeValue true to collect trimmed node text instead of markup
     * @return string|null the collected output, null when nothing matched, or an
     *                     error message string when no source has been loaded
     */
    function getInnerHTML($tag, $class=null, $nodeValue = false){
        // Guard on the source itself: it is what loadHTML() consumes, and it is
        // false/empty when the download failed or setSource() was never called.
        if(!is_string($this->source) || $this->source === ''){
            return 'Error: Missing $this->source, use setSource() first';
        }

        $this->loadHTML($this->source);

        // Parse the filter once, outside the loop.
        $attrName = null;
        $attrValue = '';
        if($class !== null){
            $attr = explode('=', $class, 2);
            $attrName = $attr[0];
            $attrValue = isset($attr[1]) ? $attr[1] : '';
        }

        $ret = null;
        foreach($this->getElementsByTagName($tag) as $v){
            // Strict comparison so that numeric-looking strings are not treated as equal.
            if($attrName !== null && $v->getAttribute($attrName) !== $attrValue){
                continue;
            }
            $ret .= ($nodeValue == true) ? trim($v->nodeValue) : $this->innerHTML($v);
        }
        return $ret;
    }

    /**
     * Serialises the children of a node to an HTML string.
     *
     * Each child is imported into its own temporary document, serialised with
     * saveHTML() and trimmed before being appended to the result.
     *
     * @param DOMNode $dom node whose children are serialised
     * @return string
     */
    function innerHTML($dom){
        $ret = "";
        foreach($dom->childNodes as $v){
            $tmp = new DOMDocument();
            $tmp->appendChild($tmp->importNode($v, true));
            $ret .= trim($tmp->saveHTML());
        }
        return $ret;
    }

    /**
     * Fetches a URL, through cURL when the extension is available and through
     * file_get_contents() otherwise.
     *
     * @param string $url
     * @return string|false the response body, or false when the request failed
     */
    function get_data($url){
        if(!function_exists('curl_init')){
            return file_get_contents($url);
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $data = curl_exec($ch);
        curl_close($ch);
        return $data;
    }
}
?>
Finding entry points
Branch analysis from position: 0
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/ekWpF
function name:  (null)
number of ops:  16
compiled vars:  !0 = $scraper
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
    3     0  E >   NEW                                              $1      'DOMScraper'
          1        DO_FCALL                                      0          
          2        ASSIGN                                                   !0, $1
    5     3        INIT_METHOD_CALL                                         !0, 'setSite'
          4        SEND_VAL_EX                                              'http%3A%2F%2Fwww.farsnews.com%2Feconomy%2Fagriculture'
          5        DO_FCALL                                      0  $4      
          6        INIT_METHOD_CALL                                         $4, 'setSource'
          7        DO_FCALL                                      0          
    8     8        INIT_METHOD_CALL                                         !0, 'getInnerHTML'
          9        SEND_VAL_EX                                              'div'
         10        SEND_VAL_EX                                              'class%3Dctgnewsmainpane'
         11        DO_FCALL                                      0  $6      
         12        CONCAT                                           ~7      '%3Cdiv%3E', $6
         13        CONCAT                                           ~8      ~7, '%3C%2Fdiv%3E'
         14        ECHO                                                     ~8
   88    15      > RETURN                                                   1

Class DOMScraper:
Function __construct:
Finding entry points
Branch analysis from position: 0
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/ekWpF
function name:  __construct
number of ops:  8
compiled vars:  none
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   20     0  E >   INIT_FCALL                                               'libxml_use_internal_errors'
          1        SEND_VAL                                                 <true>
          2        DO_ICALL                                                 
   21     3        ASSIGN_OBJ                                               'preserveWhiteSpace'
          4        OP_DATA                                                  <false>
   22     5        ASSIGN_OBJ                                               'strictErrorChecking'
          6        OP_DATA                                                  <false>
   23     7      > RETURN                                                   null

End of function __construct

Function setsite:
Finding entry points
Branch analysis from position: 0
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/ekWpF
function name:  setSite
number of ops:  6
compiled vars:  !0 = $site
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   25     0  E >   RECV                                             !0      
   26     1        ASSIGN_OBJ                                               'site'
          2        OP_DATA                                                  !0
   27     3        FETCH_THIS                                       ~2      
          4      > RETURN                                                   ~2
   28     5*     > RETURN                                                   null

End of function setsite

Function setsource:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 43) Position 1 = 2, Position 2 = 3
Branch analysis from position: 2
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 3
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/ekWpF
function name:  setSource
number of ops:  13
compiled vars:  none
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   31     0  E >   ISSET_ISEMPTY_PROP_OBJ                                   'site'
          1      > JMPZ                                                     ~0, ->3
          2    > > RETURN                                                   'Error%3A+Missing+%24this-%3Esite%2C+use+setSite%28%29+first'
   32     3    >   INIT_METHOD_CALL                                         'get_data'
          4        CHECK_FUNC_ARG                                           
          5        FETCH_OBJ_FUNC_ARG                               $2      'site'
          6        SEND_FUNC_ARG                                            $2
          7        DO_FCALL                                      0  $3      
          8        ASSIGN_OBJ                                               'source'
          9        OP_DATA                                                  $3
   33    10        FETCH_THIS                                       ~4      
         11      > RETURN                                                   ~4
   34    12*     > RETURN                                                   null

End of function setsource

Function getinnerhtml:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 43) Position 1 = 5, Position 2 = 6
Branch analysis from position: 5
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 6
2 jumps found. (Code = 77) Position 1 = 17, Position 2 = 59
Branch analysis from position: 17
2 jumps found. (Code = 78) Position 1 = 18, Position 2 = 59
Branch analysis from position: 18
2 jumps found. (Code = 43) Position 1 = 20, Position 2 = 46
Branch analysis from position: 20
2 jumps found. (Code = 43) Position 1 = 33, Position 2 = 45
Branch analysis from position: 33
2 jumps found. (Code = 43) Position 1 = 35, Position 2 = 41
Branch analysis from position: 35
1 jumps found. (Code = 42) Position 1 = 45
Branch analysis from position: 45
1 jumps found. (Code = 42) Position 1 = 58
Branch analysis from position: 58
1 jumps found. (Code = 42) Position 1 = 17
Branch analysis from position: 17
Branch analysis from position: 41
1 jumps found. (Code = 42) Position 1 = 58
Branch analysis from position: 58
Branch analysis from position: 45
Branch analysis from position: 46
2 jumps found. (Code = 43) Position 1 = 48, Position 2 = 54
Branch analysis from position: 48
1 jumps found. (Code = 42) Position 1 = 58
Branch analysis from position: 58
Branch analysis from position: 54
1 jumps found. (Code = 42) Position 1 = 17
Branch analysis from position: 17
Branch analysis from position: 59
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 59
filename:       /in/ekWpF
function name:  getInnerHTML
number of ops:  62
compiled vars:  !0 = $tag, !1 = $class, !2 = $nodeValue, !3 = $tmp, !4 = $ret, !5 = $v, !6 = $attr
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   36     0  E >   RECV                                             !0      
          1        RECV_INIT                                        !1      null
          2        RECV_INIT                                        !2      <false>
   37     3        ISSET_ISEMPTY_PROP_OBJ                                   'site'
          4      > JMPZ                                                     ~7, ->6
          5    > > RETURN                                                   'Error%3A+Missing+%24this-%3Esource%2C+use+setSource%28%29+first'
   38     6    >   INIT_METHOD_CALL                                         'loadHTML'
          7        CHECK_FUNC_ARG                                           
          8        FETCH_OBJ_FUNC_ARG                               $8      'source'
          9        SEND_FUNC_ARG                                            $8
         10        DO_FCALL                                      0          
   39    11        INIT_METHOD_CALL                                         'getElementsByTagName'
         12        SEND_VAR_EX                                              !0
         13        DO_FCALL                                      0  $10     
         14        ASSIGN                                                   !3, $10
   40    15        ASSIGN                                                   !4, null
   41    16      > FE_RESET_R                                       $13     !3, ->59
         17    > > FE_FETCH_R                                               $13, !5, ->59
   42    18    >   TYPE_CHECK                                  1020          !1
         19      > JMPZ                                                     ~14, ->46
   43    20    >   INIT_FCALL                                               'explode'
         21        SEND_VAL                                                 '%3D'
         22        SEND_VAR                                                 !1
         23        DO_ICALL                                         $15     
         24        ASSIGN                                                   !6, $15
   44    25        INIT_METHOD_CALL                                         !5, 'getAttribute'
         26        CHECK_FUNC_ARG                                           
         27        FETCH_DIM_FUNC_ARG                               $17     !6, 0
         28        SEND_FUNC_ARG                                            $17
         29        DO_FCALL                                      0  $18     
         30        FETCH_DIM_R                                      ~19     !6, 1
         31        IS_EQUAL                                                 $18, ~19
         32      > JMPZ                                                     ~20, ->45
   45    33    >   BOOL                                             ~21     !2
         34      > JMPZ                                                     ~21, ->41
   46    35    >   INIT_FCALL                                               'trim'
         36        FETCH_OBJ_R                                      ~22     !5, 'nodeValue'
         37        SEND_VAL                                                 ~22
         38        DO_ICALL                                         $23     
         39        ASSIGN_OP                                     8          !4, $23
         40      > JMP                                                      ->45
   48    41    >   INIT_METHOD_CALL                                         'innerHTML'
         42        SEND_VAR_EX                                              !5
         43        DO_FCALL                                      0  $25     
         44        ASSIGN_OP                                     8          !4, $25
         45    > > JMP                                                      ->58
   52    46    >   BOOL                                             ~27     !2
         47      > JMPZ                                                     ~27, ->54
   53    48    >   INIT_FCALL                                               'trim'
         49        FETCH_OBJ_R                                      ~28     !5, 'nodeValue'
         50        SEND_VAL                                                 ~28
         51        DO_ICALL                                         $29     
         52        ASSIGN_OP                                     8          !4, $29
         53      > JMP                                                      ->58
   55    54    >   INIT_METHOD_CALL                                         'innerHTML'
         55        SEND_VAR_EX                                              !5
         56        DO_FCALL                                      0  $31     
         57        ASSIGN_OP                                     8          !4, $31
   41    58    > > JMP                                                      ->17
         59    >   FE_FREE                                                  $13
   59    60      > RETURN                                                   !4
   60    61*     > RETURN                                                   null

End of function getinnerhtml

Function innerhtml:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 77) Position 1 = 5, Position 2 = 23
Branch analysis from position: 5
2 jumps found. (Code = 78) Position 1 = 6, Position 2 = 23
Branch analysis from position: 6
1 jumps found. (Code = 42) Position 1 = 5
Branch analysis from position: 5
Branch analysis from position: 23
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 23
filename:       /in/ekWpF
function name:  innerHTML
number of ops:  26
compiled vars:  !0 = $dom, !1 = $ret, !2 = $nodes, !3 = $v, !4 = $tmp
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   62     0  E >   RECV                                             !0      
   63     1        ASSIGN                                                   !1, ''
   64     2        FETCH_OBJ_R                                      ~6      !0, 'childNodes'
          3        ASSIGN                                                   !2, ~6
   65     4      > FE_RESET_R                                       $8      !2, ->23
          5    > > FE_FETCH_R                                               $8, !3, ->23
   66     6    >   NEW                                              $9      'DOMDocument'
          7        DO_FCALL                                      0          
          8        ASSIGN                                                   !4, $9
   67     9        INIT_METHOD_CALL                                         !4, 'appendChild'
         10        INIT_METHOD_CALL                                         !4, 'importNode'
         11        SEND_VAR_EX                                              !3
         12        SEND_VAL_EX                                              <true>
         13        DO_FCALL                                      0  $12     
         14        SEND_VAR_NO_REF_EX                                       $12
         15        DO_FCALL                                      0          
   68    16        INIT_FCALL                                               'trim'
         17        INIT_METHOD_CALL                                         !4, 'saveHTML'
         18        DO_FCALL                                      0  $14     
         19        SEND_VAR                                                 $14
         20        DO_ICALL                                         $15     
         21        ASSIGN_OP                                     8          !1, $15
   65    22      > JMP                                                      ->5
         23    >   FE_FREE                                                  $8
   70    24      > RETURN                                                   !1
   71    25*     > RETURN                                                   null

End of function innerhtml

Function get_data:
Finding entry points
Branch analysis from position: 0
2 jumps found. (Code = 43) Position 1 = 5, Position 2 = 41
Branch analysis from position: 5
1 jumps found. (Code = 62) Position 1 = -2
Branch analysis from position: 41
1 jumps found. (Code = 62) Position 1 = -2
filename:       /in/ekWpF
function name:  get_data
number of ops:  46
compiled vars:  !0 = $url, !1 = $ch, !2 = $data
line      #* E I O op                           fetch          ext  return  operands
-------------------------------------------------------------------------------------
   73     0  E >   RECV                                             !0      
   74     1        INIT_FCALL                                               'function_exists'
          2        SEND_VAL                                                 'curl_init'
          3        DO_ICALL                                         $3      
          4      > JMPZ                                                     $3, ->41
   75     5    >   INIT_FCALL_BY_NAME                                       'curl_init'
          6        DO_FCALL                                      0  $4      
          7        ASSIGN                                                   !1, $4
   76     8        INIT_FCALL_BY_NAME                                       'curl_setopt'
          9        SEND_VAR_EX                                              !1
         10        FETCH_CONSTANT                                   ~6      'CURLOPT_FOLLOWLOCATION'
         11        SEND_VAL_EX                                              ~6
         12        SEND_VAL_EX                                              <true>
         13        DO_FCALL                                      0          
   77    14        INIT_FCALL_BY_NAME                                       'curl_setopt'
         15        SEND_VAR_EX                                              !1
         16        FETCH_CONSTANT                                   ~8      'CURLOPT_TIMEOUT'
         17        SEND_VAL_EX                                              ~8
         18        SEND_VAL_EX                                              5
         19        DO_FCALL                                      0          
   78    20        INIT_FCALL_BY_NAME                                       'curl_setopt'
         21        SEND_VAR_EX                                              !1
         22        FETCH_CONSTANT                                   ~10     'CURLOPT_URL'
         23        SEND_VAL_EX                                              ~10
         24        SEND_VAR_EX                                              !0
         25        DO_FCALL                                      0          
   79    26        INIT_FCALL_BY_NAME                                       'curl_setopt'
         27        SEND_VAR_EX                                              !1
         28        FETCH_CONSTANT                                   ~12     'CURLOPT_RETURNTRANSFER'
         29        SEND_VAL_EX                                              ~12
         30        SEND_VAL_EX                                              <true>
         31        DO_FCALL                                      0          
   80    32        INIT_FCALL_BY_NAME                                       'curl_exec'
         33        SEND_VAR_EX                                              !1
         34        DO_FCALL                                      0  $14     
         35        ASSIGN                                                   !2, $14
   81    36        INIT_FCALL_BY_NAME                                       'curl_close'
         37        SEND_VAR_EX                                              !1
         38        DO_FCALL                                      0          
   82    39      > RETURN                                                   !2
         40*       JMP                                                      ->45
   84    41    >   INIT_FCALL                                               'file_get_contents'
         42        SEND_VAR                                                 !0
         43        DO_ICALL                                         $17     
         44      > RETURN                                                   $17
   86    45*     > RETURN                                                   null

End of function get_data

End of class DOMScraper.

Generated using Vulcan Logic Dumper, using php 8.0.0


preferences:
267.8 ms | 1417 KiB | 24 Q