@ 2014-06-24T11:54:53Z <?php
// Demo script: extract every <a ... href=...>...</a> element from an HTML
// fragment and dump the captured pieces together with their byte offsets.
//
// Capture groups of $hrefPattern:
//   1      - attributes that appear before href (may be empty)
//   2      - the optional quote character opening the href value
//   link   - the href value itself (also available as group 3)
//   4      - attributes that appear after the href value (may be empty)
//   text   - the content between the opening tag and </a> (also group 5)
//
// Modifiers: "s" lets "." match newlines so link text may span lines, "i"
// makes the match case-insensitive, and "U" inverts the default greediness
// of every quantifier (so ".*" stops at the first "</a>").
$hrefPattern = '/<a\\s+([^>]*)href=(["\']??)(?P<link>[^"\'>]*?)\\2([^>]*)>(?P<text>.*)<\\/a>/siU';
$html = <<<HTML
<p>If you find any cases where this code falls down, let us know using
the Feedback link below.</p>
<p>Before using this or similar scripts to fetch pages from other
websites, we suggest you read through the related article on <a href="/php/parse-robots/" title="foobar" target="_parent">setting a user agent and parsing robots.txt</a>.</p>
<h2>First checking robots.txt</h2>
<p>As mentioned above, before using a script to download files you
should always <a target="_blank" href="/php/parse-robots/">check the robots.txt
file</a>. Here we're making use of the <tt>robots_allowed</tt> function
from the article linked above to determine whether we're allowed to
access files:</p>
<p>As mentioned above, before using a script to download files you
should always <a href="/php/parse-robots/">check the robots.txt
file</a>. Here we're making use of the <tt>robots_allowed</tt> function
from the article linked above to determine whether we're allowed to
access files:</p>
HTML;
// The flags are bit masks and are combined with bitwise OR. XOR would only
// yield the same value for as long as the two constants share no bits.
// PREG_SET_ORDER groups the result per match; PREG_OFFSET_CAPTURE turns each
// capture into a [string, offset] pair.
$matchCount = preg_match_all($hrefPattern, $html, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
if ($matchCount === false) {
    // preg_match_all() returns false when the PCRE engine fails; report it
    // instead of silently dumping an empty result.
    trigger_error('preg_match_all() failed while extracting links', E_USER_WARNING);
}
var_dump($matches);
Enable JavaScript to submit. You have JavaScript disabled, so you will not be able to edit any code.
Output for 4.3.3 - 4.3.11 , 4.4.0 - 4.4.9 , 5.0.0 - 5.0.5 , 5.1.0 - 5.1.6 , 5.2.0 - 5.2.17 , 5.3.0 - 5.3.29 , 5.4.0 - 5.4.45 , 5.5.24 - 5.5.35 , 5.6.8 - 5.6.28 , 7.0.0 - 7.0.20 , 7.1.0 - 7.1.10 , 7.2.0 - 7.2.33 , 7.3.12 - 7.3.33 , 7.4.0 - 7.4.33 , 8.0.0 - 8.0.30 , 8.1.0 - 8.1.28 , 8.2.0 - 8.2.18 , 8.3.0 - 8.3.6 array(3) {
[0]=>
array(8) {
[0]=>
array(2) {
[0]=>
string(108) "<a href="/php/parse-robots/" title="foobar" target="_parent">setting a user agent and parsing robots.txt</a>"
[1]=>
int(228)
}
[1]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(231)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(236)
}
["link"]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(237)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(237)
}
[4]=>
array(2) {
[0]=>
string(32) " title="foobar" target="_parent""
[1]=>
int(256)
}
["text"]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
int(289)
}
[5]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
int(289)
}
}
[1]=>
array(8) {
[0]=>
array(2) {
[0]=>
string(75) "<a target="_blank" href="/php/parse-robots/">check the robots.txt
file</a>"
[1]=>
int(462)
}
[1]=>
array(2) {
[0]=>
string(16) "target="_blank" "
[1]=>
int(465)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(486)
}
["link"]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(487)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(487)
}
[4]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(506)
}
["text"]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(507)
}
[5]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(507)
}
}
[2]=>
array(8) {
[0]=>
array(2) {
[0]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
[1]=>
int(773)
}
[1]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(776)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(781)
}
["link"]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(782)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(782)
}
[4]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(801)
}
["text"]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(802)
}
[5]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(802)
}
}
}
Output for 4.3.0 - 4.3.2 array(3) {
[0]=>
array(6) {
[0]=>
array(2) {
[0]=>
string(108) "<a href="/php/parse-robots/" title="foobar" target="_parent">setting a user agent and parsing robots.txt</a>"
[1]=>
int(228)
}
[1]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(231)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(236)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(237)
}
[4]=>
array(2) {
[0]=>
string(32) " title="foobar" target="_parent""
[1]=>
int(256)
}
[5]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
int(289)
}
}
[1]=>
array(6) {
[0]=>
array(2) {
[0]=>
string(75) "<a target="_blank" href="/php/parse-robots/">check the robots.txt
file</a>"
[1]=>
int(462)
}
[1]=>
array(2) {
[0]=>
string(16) "target="_blank" "
[1]=>
int(465)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(486)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(487)
}
[4]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(506)
}
[5]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(507)
}
}
[2]=>
array(6) {
[0]=>
array(2) {
[0]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
[1]=>
int(773)
}
[1]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(776)
}
[2]=>
array(2) {
[0]=>
string(1) """
[1]=>
int(781)
}
[3]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
int(782)
}
[4]=>
array(2) {
[0]=>
string(0) ""
[1]=>
int(801)
}
[5]=>
array(2) {
[0]=>
string(26) "check the robots.txt
file"
[1]=>
int(802)
}
}
}
preferences:dark mode live preview
244.13 ms | 409 KiB | 312 Q