<?php
// Extracts every <a ... href=...>...</a> element from a sample HTML fragment
// and dumps the captured groups.
//
// Capture groups of $hrefPattern:
//   1 - the quote character around the href value (empty if unquoted)
//   2 - the href value
//   3 - any attributes that follow the href
//   4 - the link text
//
// Modifiers: "s" lets "." span newlines (link text may wrap lines), "i" makes
// the match case-insensitive, and "U" inverts greediness, so "(.*)" stops at
// the first closing </a> instead of the last one.
$hrefPattern = '/<a\\s+[^>]*href=(["\']??)([^"\'>]*?)\\1([^>]*)>(.*)<\/a>/siU';

// The fragment itself contains PHP sample code ($url, $input, $match[2], ...).
// It must be a nowdoc (quoted identifier): a plain heredoc would interpolate
// those names as variables, raising "Undefined variable" diagnostics and
// silently removing them from the text being searched.
$html = <<<'HTML'
<p>If you find any cases where this code falls down, let us know using
the Feedback link below.</p>
<p>Before using this or similar scripts to fetch pages from other
websites, we suggest you read through the related article on <a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>.</p>
<h2>First checking robots.txt</h2>
<p>As mentioned above, before using a script to download files you
should always <a href="/php/parse-robots/">check the robots.txt
file</a>. Here we're making use of the <tt>robots_allowed</tt> function
from the article linked above to determine whether we're allowed to
access files:</p>
<code class="final"><?PHP
<i>// Original PHP code by Chirp Internet: www.chirp.com.au
// Please acknowledge use of this code by including this header.</i>
<span> ini_set('user_agent', '<i>NameOfAgent (http://www.example.net)</i>');</span>
$url = "http://www.example.net/somepage.html";
<span> if(robots_allowed($url, "<i>NameOfAgent</i>")) {</span>
$input = @file_get_contents($url) or die("Could not access file: $url");
$regexp = "<tt><a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a></tt>";
if(preg_match_all("/$regexp/siU", $input, $matches, PREG_SET_ORDER)) {
foreach($matches as $match) {
<i>// $match[2] = link address
// $match[3] = link text</i>
}
}
<span> } else {
die('Access denied by robots.txt');
}</span>
?></code>
HTML;

// preg_match_all() returns false when the PCRE engine itself fails (for
// example on a backtrack-limit error); report that instead of dumping a
// misleading result.
if (preg_match_all($hrefPattern, $html, $matches) === false) {
    throw new RuntimeException('Link extraction failed, PCRE error code: ' . preg_last_error());
}
var_dump($matches);
Warning: Undefined variable $url in /in/c3qaN on line 27
Warning: Undefined variable $url in /in/c3qaN on line 28
Warning: Undefined variable $input in /in/c3qaN on line 29
Warning: Undefined variable $url in /in/c3qaN on line 29
Warning: Undefined variable $url in /in/c3qaN on line 29
Warning: Undefined variable $regexp in /in/c3qaN on line 30
Warning: Undefined variable $regexp in /in/c3qaN on line 31
Warning: Undefined variable $input in /in/c3qaN on line 31
Warning: Undefined variable $matches in /in/c3qaN on line 31
Warning: Undefined variable $matches in /in/c3qaN on line 32
Warning: Undefined variable $match in /in/c3qaN on line 32
Warning: Undefined variable $match in /in/c3qaN on line 33
Warning: Trying to access array offset on null in /in/c3qaN on line 33
Warning: Undefined variable $match in /in/c3qaN on line 34
Warning: Trying to access array offset on null in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Warning: Undefined variable $url in /in/c3qaN on line 27
Warning: Undefined variable $url in /in/c3qaN on line 28
Warning: Undefined variable $input in /in/c3qaN on line 29
Warning: Undefined variable $url in /in/c3qaN on line 29
Warning: Undefined variable $url in /in/c3qaN on line 29
Warning: Undefined variable $regexp in /in/c3qaN on line 30
Warning: Undefined variable $regexp in /in/c3qaN on line 31
Warning: Undefined variable $input in /in/c3qaN on line 31
Warning: Undefined variable $matches in /in/c3qaN on line 31
Warning: Undefined variable $matches in /in/c3qaN on line 32
Warning: Undefined variable $match in /in/c3qaN on line 32
Warning: Undefined variable $match in /in/c3qaN on line 33
Warning: Trying to access array offset on value of type null in /in/c3qaN on line 33
Warning: Undefined variable $match in /in/c3qaN on line 34
Warning: Trying to access array offset on value of type null in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Output for 7.4.0 - 7.4.33
Notice: Undefined variable: url in /in/c3qaN on line 27
Notice: Undefined variable: url in /in/c3qaN on line 28
Notice: Undefined variable: input in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: regexp in /in/c3qaN on line 30
Notice: Undefined variable: regexp in /in/c3qaN on line 31
Notice: Undefined variable: input in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 33
Notice: Trying to access array offset on value of type null in /in/c3qaN on line 33
Notice: Undefined variable: match in /in/c3qaN on line 34
Notice: Trying to access array offset on value of type null in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Output for 7.3.32 - 7.3.33
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Notice: Undefined variable: url in /in/c3qaN on line 27
Notice: Undefined variable: url in /in/c3qaN on line 28
Notice: Undefined variable: input in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: regexp in /in/c3qaN on line 30
Notice: Undefined variable: regexp in /in/c3qaN on line 31
Notice: Undefined variable: input in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 33
Notice: Undefined variable: match in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Notice: Undefined variable: url in /in/c3qaN on line 28
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: input in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 30
Notice: Undefined variable: regexp in /in/c3qaN on line 31
Notice: Undefined variable: regexp in /in/c3qaN on line 31
Notice: Undefined variable: input in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 32
Notice: Undefined variable: matches in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 33
Notice: Undefined variable: match in /in/c3qaN on line 33
Notice: Undefined variable: match in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}
Notice: Undefined variable: url in /in/c3qaN on line 27
Notice: Undefined variable: url in /in/c3qaN on line 28
Notice: Undefined variable: input in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: url in /in/c3qaN on line 29
Notice: Undefined variable: regexp in /in/c3qaN on line 30
Notice: Undefined variable: regexp in /in/c3qaN on line 31
Notice: Undefined variable: input in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 31
Notice: Undefined variable: matches in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 32
Notice: Undefined variable: match in /in/c3qaN on line 33
Notice: Undefined variable: match in /in/c3qaN on line 34
array(5) {
[0]=>
array(2) {
[0]=>
string(76) "<a href="/php/parse-robots/">setting a user agent and parsing robots.txt</a>"
[1]=>
string(59) "<a href="/php/parse-robots/">check the robots.txt
file</a>"
}
[1]=>
array(2) {
[0]=>
string(1) """
[1]=>
string(1) """
}
[2]=>
array(2) {
[0]=>
string(18) "/php/parse-robots/"
[1]=>
string(18) "/php/parse-robots/"
}
[3]=>
array(2) {
[0]=>
string(0) ""
[1]=>
string(0) ""
}
[4]=>
array(2) {
[0]=>
string(43) "setting a user agent and parsing robots.txt"
[1]=>
string(26) "check the robots.txt
file"
}
}