3v4l.org

run code in 300+ PHP versions simultaneously
<?php

/**
 * Extracts hyperlinks (href + label) from an HTML fragment.
 *
 * When a base URL is supplied, relative hrefs are resolved against it and
 * links that point back to the base page itself are dropped. Without a base
 * URL the hrefs are returned as written, and references that only carry a
 * fragment (or nothing at all) are dropped.
 */
final class UrlScraper
{
    /**
     * Splits a URL or relative reference into the pieces used for resolution.
     *
     * @param string $url     URL or relative reference, already entity-decoded.
     * @param bool   $is_base When true the URL must contain a host.
     *
     * @return array Keys:
     *               - head       "scheme://host[:port]" or '' when there is no host
     *                            (scheme defaults to http when only a host is given)
     *               - has_scheme whether the reference named a scheme
     *               - has_path   whether the path component is non-empty
     *               - query      '?...' or ''
     *               - fragment   '#...' or ''
     *               - dirs       path segments without '' and '.' ('..' is kept)
     *               - is_abs     whether the path starts with '/'
     *               - is_dir     whether the path designates a directory
     *
     * @throws InvalidArgumentException For javascript: URLs, URLs parse_url()
     *                                  rejects, and base URLs without a host.
     */
    private static function parseUrl($url, $is_base = false)
    {
        // Schemes are case-insensitive and HTML tolerates leading whitespace.
        if (stripos(ltrim($url), 'javascript:') === 0) {
            throw new InvalidArgumentException('Invalid URL');
        }

        $parts = parse_url($url);
        if ($parts === false || ($is_base && !isset($parts['host']))) {
            throw new InvalidArgumentException('Invalid URL');
        }

        $head = '';
        if (isset($parts['host'])) {
            $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
            $port   = isset($parts['port']) ? ':' . $parts['port'] : '';
            $head   = $scheme . '://' . $parts['host'] . $port;
        }

        $path     = isset($parts['path']) ? $parts['path'] : '';
        $segments = explode('/', $path);
        $last     = end($segments);

        $dirs = array();
        foreach ($segments as $segment) {
            if ($segment !== '' && $segment !== '.') {
                $dirs[] = $segment;
            }
        }

        return array(
            'head'       => $head,
            'has_scheme' => isset($parts['scheme']),
            'has_path'   => $path !== '',
            'query'      => isset($parts['query']) ? '?' . $parts['query'] : '',
            'fragment'   => isset($parts['fragment']) ? '#' . $parts['fragment'] : '',
            'dirs'       => $dirs,
            'is_abs'     => $path !== '' && $segments[0] === '',
            // A path ending in '/', '.' or '..' names a directory, not a file.
            'is_dir'     => $last === '' || $last === '.' || $last === '..',
        );
    }

    /**
     * Applies path segments to a directory stack: '..' pops, '' and '.' are
     * ignored, anything else is pushed.
     *
     * @param array $stack    Starting segments.
     * @param array $segments Segments to apply in order.
     *
     * @return array The resulting segment list.
     */
    private static function applySegments(array $stack, array $segments)
    {
        foreach ($segments as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($stack);
                continue;
            }
            $stack[] = $segment;
        }

        return $stack;
    }

    /**
     * Resolves a host-less reference against a parsed base URL.
     *
     * @param array $base Parsed base URL whose 'dirs' are already normalised.
     * @param array $ref  Parsed reference without a host.
     *
     * @return array Resolved parts: head, dirs, is_dir, query, fragment.
     */
    private static function resolve(array $base, array $ref)
    {
        if (!$ref['has_path']) {
            // "?x" and "#x" address the base document itself; the base query
            // survives unless the reference brings its own.
            $dirs   = $base['dirs'];
            $is_dir = $base['is_dir'];
            $query  = $ref['query'] !== '' ? $ref['query'] : $base['query'];
        } else {
            $start = array();
            if (!$ref['is_abs']) {
                $start = $base['dirs'];
                if (!$base['is_dir']) {
                    // Relative paths start from the base's directory, so the
                    // base's file name is discarded first.
                    array_pop($start);
                }
            }
            $dirs   = self::applySegments($start, $ref['dirs']);
            $is_dir = $ref['is_dir'];
            $query  = $ref['query'];
        }

        return array(
            'head'     => $base['head'],
            'dirs'     => $dirs,
            'is_dir'   => $is_dir,
            'query'    => $query,
            'fragment' => $ref['fragment'],
        );
    }

    /**
     * Reassembles resolved parts into an absolute URL string.
     *
     * @param array $parts Result of resolve().
     *
     * @return string
     */
    private static function buildUrl(array $parts)
    {
        $path = '/' . implode('/', $parts['dirs']);
        if ($parts['is_dir'] && $parts['dirs']) {
            $path .= '/';
        }

        return $parts['head'] . $path . $parts['query'] . $parts['fragment'];
    }

    /**
     * Tells whether two parsed URLs address the same page: same scheme/host/port
     * (compared case-insensitively), same path segments and same query.
     * Fragments and a trailing slash are ignored.
     *
     * @param array $a Parsed or resolved URL.
     * @param array $b Parsed or resolved URL.
     *
     * @return bool
     */
    private static function isSamePage(array $a, array $b)
    {
        return strcasecmp($a['head'], $b['head']) === 0
            && $a['dirs'] === $b['dirs']
            && $a['query'] === $b['query'];
    }

    /**
     * Derives a label from the inner HTML of an anchor: its trimmed text, or,
     * when there is no text, the alt attributes found inside joined by " - ".
     *
     * @param string $inner_html Markup between <a ...> and </a>.
     *
     * @return string Label, still entity-encoded; '' when nothing usable exists.
     */
    private static function extractLabel($inner_html)
    {
        $text = trim(strip_tags($inner_html));
        if ($text === '' && preg_match_all('@(?<!\.)alt="([^"]++)"@', $inner_html, $alts)) {
            return implode(' - ', $alts[1]);
        }

        return $text;
    }

    /**
     * Collects the distinct links of an HTML fragment.
     *
     * Only anchors whose href is double-quoted are recognised. Anchors with a
     * javascript: or unparsable href are skipped.
     *
     * @param string      $html     HTML to scan.
     * @param string|null $base_url Absolute URL of the page the HTML came from,
     *                              or null to leave hrefs unresolved.
     * @param string      $encoding Character set used to decode HTML entities.
     *
     * @return array List of array('href' => string, 'label' => string),
     *               without duplicates, in document order.
     *
     * @throws InvalidArgumentException When $base_url is given but is not a
     *                                  parsable URL with a host.
     */
    public static function parseLinks($html, $base_url = null, $encoding = 'UTF-8')
    {
        // (?<!\.) keeps script fragments such as location.href="..." from matching.
        $regex = '@<a[^>]*?(?<!\.)href="([^"]*+)"[^>]*+>(.*?)</a>@si';
        if (!preg_match_all($regex, $html, $anchors, PREG_SET_ORDER)) {
            return array();
        }

        $base = null;
        if ($base_url !== null) {
            $base = self::parseUrl($base_url, true);
            // Collapse any ".." inside the base path once, up front.
            $base['dirs'] = self::applySegments(array(), $base['dirs']);
        }

        $links = array();
        foreach ($anchors as $anchor) {
            // Attribute values are entity-encoded in markup; decode before
            // treating the value as a URL.
            $href = html_entity_decode($anchor[1], ENT_QUOTES, $encoding);

            try {
                $target = self::parseUrl($href);
            } catch (InvalidArgumentException $e) {
                continue; // Best effort: an unusable href just drops that link.
            }

            if ($base === null) {
                // Nothing but an optional fragment: the link stays on the page.
                if (
                    $target['head'] === ''
                    && $target['query'] === ''
                    && !$target['dirs']
                    && !$target['is_abs']
                ) {
                    continue;
                }
            } elseif ($target['head'] !== '' || !$target['has_scheme']) {
                // References with a scheme but no host (mailto: etc.) are not
                // hierarchical, so they bypass this branch and stay untouched.
                if ($target['head'] === '') {
                    $target = self::resolve($base, $target);
                    $href   = self::buildUrl($target);
                } else {
                    $target['dirs'] = self::applySegments(array(), $target['dirs']);
                }
                if (self::isSamePage($base, $target)) {
                    continue;
                }
            }

            $link = array(
                'href'  => $href,
                'label' => html_entity_decode(self::extractLabel($anchor[2]), ENT_QUOTES, $encoding),
            );
            // Keyed by content so identical href/label pairs collapse.
            $links[serialize($link)] = $link;
        }

        return array_values($links);
    }
}

// Demo input: query-only, fragment-only, relative, absolute and image links.
$html = <<<'EOD'
<a href="?q=php">?q=php</a>
<a href="./?q=php">./?q=php</a>
<a href="../?q=php">../?q=php</a>
<a href="/?q=php">/?q=php</a>
<a href="http://example.com/foo/bar/?q=php">http://example.com/foo/bar/?q=php</a>
<a href="#php">#php</a>
<a href="./#php">./#php</a>
<a href="../#php">../#php</a>
<a href="/#php">/#php</a>
<a href="http://example.com/foo/bar/#php">http://example.com/foo/bar/#php</a>
<a href="jump.php">画像A[TEXT]<img src="image_a.png" alt="画像A[ALT]" /></a>
<a href="jump.php"><img src="image_a.png" alt="画像B[ALT]" /></a>
EOD;

// The second assignment wins; remove it to see the links resolved against the base.
$base_url = 'http://example.com/foo/bar/';
$base_url = null;
var_dump($base_url, UrlScraper::parseLinks($html, $base_url));

preferences:
42.21 ms | 402 KiB | 5 Q