<?php
/**
 * Extracts hyperlinks (<a ... href="...">...</a>) from an HTML fragment.
 *
 * Only double-quoted href attributes are recognised. When a base URL is
 * supplied, relative links are resolved against it and links that point
 * back to the base page itself are dropped.
 */
final class UrlScraper {

    /**
     * array_filter() callback: true for path segments that carry meaning,
     * i.e. everything except '' (produced by duplicate or edge slashes)
     * and '.' (current directory).
     *
     * @param string $segment
     * @return bool
     */
    private static function isSegment($segment) {
        return $segment !== '.' && $segment !== '';
    }

    /**
     * Appends $segments to $start, letting every '..' remove the segment
     * before it. Popping an already empty list is a no-op, so a path can
     * never climb above the root.
     *
     * @param array $start    Segments to start from.
     * @param array $segments Segments to apply ('' and '.' already removed).
     * @return array Sequentially indexed list of resulting segments.
     */
    private static function collapse(array $start, array $segments) {
        foreach ($segments as $segment) {
            if ($segment === '..') {
                array_pop($start);
            } else {
                $start[] = $segment;
            }
        }
        return $start;
    }

    /**
     * Splits a URL into the pieces needed for resolution and comparison.
     *
     * Returned keys:
     *  - head      scheme://host[:port], or '' when the URL has no host
     *              (a missing scheme defaults to http://)
     *  - query     '?...' or ''
     *  - fragment  '#...' or ''
     *  - dirs      path segments without '' and '.'
     *  - has_path  whether any path component is present
     *  - is_abs    whether the path starts with '/'
     *  - is_dir    whether the path denotes a directory (ends in '/', '.'
     *              or '..', or is empty)
     *  - is_opaque whether the URL has a scheme but no host (mailto:, tel:, ...)
     *
     * @param string $url
     * @param bool   $is_base When true the URL must contain a host.
     * @return array
     * @throws InvalidArgumentException For javascript: URLs, URLs parse_url()
     *                                  rejects, and host-less base URLs.
     */
    private static function parseUrl($url, $is_base = false) {
        // Scheme names are case-insensitive and may follow leading whitespace.
        $p = stripos(ltrim($url), 'javascript:') === 0 ? false : parse_url($url);
        if ($p === false || ($is_base && !isset($p['host']))) {
            throw new InvalidArgumentException('Invalid URL');
        }
        $scheme = isset($p['scheme']) ? $p['scheme'] . '://' : 'http://';
        $port = isset($p['port']) ? ':' . $p['port'] : '';
        $path = isset($p['path']) ? $p['path'] : '';
        $raw = explode('/', $path);
        $last = end($raw);
        return array(
            'head' => isset($p['host']) ? $scheme . $p['host'] . $port : '',
            'query' => isset($p['query']) ? '?' . $p['query'] : '',
            'fragment' => isset($p['fragment']) ? '#' . $p['fragment'] : '',
            'dirs' => array_values(array_filter($raw, array(__CLASS__, 'isSegment'))),
            'has_path' => $path !== '',
            'is_abs' => $path !== '' && $raw[0] === '',
            'is_dir' => $last === '' || $last === '.' || $last === '..',
            'is_opaque' => isset($p['scheme']) && !isset($p['host']),
        );
    }

    /**
     * Resolves a host-less reference against a prepared base URL.
     *
     * @param array $base parseUrl() result for the base, with 'dirs' already
     *                    collapsed and 'parent' holding the base directory.
     * @param array $ref  parseUrl() result for the reference (head === '').
     * @return array Keys: head, dirs, query, href (the absolute URL).
     */
    private static function resolve(array $base, array $ref) {
        if (!$ref['has_path']) {
            // "?query" / "#fragment" / "" refer to the base document itself;
            // the base query is inherited unless the reference brings its own.
            $dirs = $base['dirs'];
            $is_dir = $base['is_dir'];
            $query = $ref['query'] !== '' ? $ref['query'] : $base['query'];
        } else {
            $dirs = self::collapse($ref['is_abs'] ? array() : $base['parent'], $ref['dirs']);
            $is_dir = $ref['is_dir'];
            $query = $ref['query'];
        }
        $path = '/' . implode('/', $dirs);
        if ($is_dir && $dirs) {
            $path .= '/';
        }
        return array(
            'head' => $base['head'],
            'dirs' => $dirs,
            'query' => $query,
            'href' => $base['head'] . $path . $query . $ref['fragment'],
        );
    }

    /**
     * Collects the distinct links found in an HTML fragment.
     *
     * Without a base URL, hrefs are returned as written and links that carry
     * no host, path or query (empty or fragment-only hrefs) are skipped.
     * With a base URL, host-less hrefs are made absolute and links whose
     * host, path and query equal those of the base page are skipped;
     * scheme-only URLs such as mailto: are passed through untouched.
     *
     * The label is the anchor's text content; when that is empty, the alt
     * texts found inside the anchor are joined with ' - ' instead.
     *
     * @param string      $html
     * @param string|null $base_url Absolute URL of the page, or null.
     * @param string      $encoding Character set used to decode HTML entities.
     * @return array List of array('href' => string, 'label' => string),
     *               always in that key order, without duplicates.
     * @throws InvalidArgumentException When links were found and $base_url is
     *                                  given but is not an absolute URL.
     */
    public static function parseLinks($html, $base_url = null, $encoding = 'UTF-8') {
        $regex = '@<a[^>]*?(?<!\.)href="([^"]*+)"[^>]*+>(.*?)</a>@si';
        if (!preg_match_all($regex, $html, $matches, PREG_SET_ORDER)) {
            return array();
        }

        $base = null;
        if ($base_url !== null) {
            $base = self::parseUrl($base_url, true);
            $base['dirs'] = self::collapse(array(), $base['dirs']);
            // Directory that relative references start from: the base path
            // minus its file name, if it has one.
            $base['parent'] = $base['dirs'];
            if (!$base['is_dir']) {
                array_pop($base['parent']);
            }
        }

        $links = array();
        foreach ($matches as $match) {
            // Attribute values are entity-encoded in HTML; decode before
            // interpreting the URL so that checks see the real characters.
            $href = html_entity_decode($match[1], ENT_QUOTES, $encoding);
            try {
                $ref = self::parseUrl($href);
            } catch (InvalidArgumentException $e) {
                // Unusable link (javascript: or malformed): skip on purpose.
                continue;
            }

            if ($base === null) {
                if ($ref['head'] === '' && $ref['query'] === '' && !$ref['dirs'] && !$ref['is_abs']) {
                    continue;
                }
            } elseif (!$ref['is_opaque']) {
                if ($ref['head'] === '') {
                    $target = self::resolve($base, $ref);
                    $href = $target['href'];
                } else {
                    $target = array(
                        'head' => $ref['head'],
                        'dirs' => self::collapse(array(), $ref['dirs']),
                        'query' => $ref['query'],
                    );
                }
                if (
                    $target['head'] === $base['head'] &&
                    $target['dirs'] === $base['dirs'] &&
                    $target['query'] === $base['query']
                ) {
                    // Link back to the base page itself.
                    continue;
                }
            }

            // Tags are stripped before entities are decoded so that escaped
            // markup such as "&lt;b&gt;" survives as literal text.
            $label = trim(strip_tags($match[2]));
            if ($label === '' && preg_match_all('@(?<!\.)alt="([^"]++)"@', $match[2], $alts)) {
                $label = implode(' - ', $alts[1]);
            }

            // Built explicitly: list() with array targets assigns in a
            // different order on PHP 5 and PHP 7+, which changed key order.
            $set = array(
                'href' => $href,
                'label' => html_entity_decode($label, ENT_QUOTES, $encoding),
            );
            $links[serialize($set)] = $set;
        }
        return array_values($links);
    }
}
// Demo: two anchors with the same target; the first has visible text,
// the second only an image whose alt text becomes the label.
$html = <<<EOD
<a href="jump.php">画像A[TEXT]<img src="image_a.png" alt="画像A[ALT]" /></a>
<a href="jump.php"><img src="image_a.png" alt="画像B[ALT]" /></a>
EOD;

// No base URL: hrefs are reported exactly as written in the markup.
// Use e.g. 'http://example.com/foo/bar/' here to get absolute URLs instead.
$base_url = null;

$links = UrlScraper::parseLinks($html, $base_url);
var_dump($base_url, $links);
- Output for 7.0.0 - 7.0.20, 7.1.0 - 7.1.33, 7.2.0 - 7.2.33, 7.3.0 - 7.3.33, 7.4.0 - 7.4.33, 8.0.0 - 8.0.30, 8.1.0 - 8.1.28, 8.2.0 - 8.2.18, 8.3.0 - 8.3.6
- NULL
array(2) {
[0]=>
array(2) {
["href"]=>
string(8) "jump.php"
["label"]=>
string(13) "画像A[TEXT]"
}
[1]=>
array(2) {
["href"]=>
string(8) "jump.php"
["label"]=>
string(12) "画像B[ALT]"
}
}
- Output for 5.3.0 - 5.3.29, 5.4.0 - 5.4.45, 5.5.0 - 5.5.38, 5.6.0 - 5.6.28
- NULL
array(2) {
[0]=>
array(2) {
["label"]=>
string(13) "画像A[TEXT]"
["href"]=>
string(8) "jump.php"
}
[1]=>
array(2) {
["label"]=>
string(12) "画像B[ALT]"
["href"]=>
string(8) "jump.php"
}
}
preferences:
272.3 ms | 402 KiB | 332 Q