3v4l.org

run code in 300+ PHP versions simultaneously
<?php

error_reporting(E_ALL);
ini_set('display_errors', 1);
ini_set("memory_limit", "768M");

// Get the URL and depth from POST request
$url = isset($_POST['url']) ? $_POST['url'] : '';
$depth = isset($_POST['depth']) ? $_POST['depth'] : 0;
$maxPages = isset($_POST['max_pages']) ? $_POST['max_pages'] : 0;

// Generate a random 8-digit number for cache folder naming
$randomNumber = rand(10000000, 99999999);
$cacheFolder = '/var/www/html/' . $randomNumber . '/';

// Set date and time for crawl start
$crawlStartTime = date('d-m-y h:i:s');

/*
// Flush some information to the browser
header('Content-type: text/html; charset=utf-8');

function flushOutput($message) {
    echo $message;
    ob_flush();
    flush();
    usleep(400000);
}

echo str_pad('', 4096);
flushOutput("Crawl $randomNumber initiated at ");
flushOutput("$crawlStartTime<br>");
flushOutput("Master URL: $url <br>Depth: $depth");
flushOutput("Maximum Pages: $maxPages<hr><br>");
flushOutput("<br>");
flushOutput("Crawling in progress. Do not close this window...<br><br>");
*/

// Create the cache folder
if (!is_dir($cacheFolder) && !mkdir($cacheFolder, 0777, true)) {
    echo "Check directory permissions and try again.";
    die('Failed to create cache folder: ' . $cacheFolder);
}

function scrapeWebPage($url, &$data, &$visitedUrls, $depth, &$pagesCrawled, $cacheFolder, $maxPages) {
    if ($depth <= 0 || in_array($url, $visitedUrls) || $pagesCrawled >= $maxPages) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip already visited URLs, reach max depth, or exceed page limit
    }

    $visitedUrls[] = $url; // Add the current URL to the visited list

    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        usleep(2000); // Slow down requests 0.002s
        return; // Skip invalid URLs
    }

    // Check the size of the file before retrieving its content
    $headers = get_headers($url, 1);
    if (isset($headers['Content-Length']) && $headers['Content-Length'] > 5 * 1024 * 1024) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip files larger than 5MB
    }

    $cacheFile = $cacheFolder . md5($url) . '.html';
    if (file_exists($cacheFile)) {
        $data .= file_get_contents($cacheFile); // Retrieve data from cache
        usleep(1000); // Slow down requests 0.001s
        return; // Skip scraping if data is cached
    }

    $html = file_get_contents($url); // add @ to suppress errors here
    if ($html === false) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip if failed to retrieve HTML content
    }

    $doc = new DOMDocument();
    libxml_use_internal_errors(true); // Suppress HTML parsing errors
    $doc->loadHTML($html);
    libxml_clear_errors();

    $pagesCrawled++; // Increment the count of crawled pages

    // Extract and store the scraped data
    $title = html_entity_decode($doc->getElementsByTagName('title')->item(0)->textContent);
    $size = strlen($html);
    $scriptTag = $doc->getElementsByTagName('script')->length > 0 ? "Yes" : "No";

    // Extract meta tags
    $metaDescription = 'none';
    $metaKeywords = 'none';
    foreach ($doc->getElementsByTagName('meta') as $tag) {
        if ($tag->getAttribute('name') == 'description') {
            $metaDescription = $tag->getAttribute('content');
        }
        if ($tag->getAttribute('name') == 'keywords') {
            $metaKeywords = $tag->getAttribute('content');
        }
    }

    // Store the scraped data in the cache
    $fileContent = sprintf(
        '<div name="%s"><h3 name="title">%s</h3><a name="URL" href="%s">%s</a><p name="scriptTag">%s</p><p name="size">%d</p><p name="meta description">%s</p><p name="meta keywords">%s</p><p name="dateStamp">%s</p></div><hr>',
        $url,
        htmlentities($title),
        $url,
        $url,
        $scriptTag,
        $size,
        htmlentities($metaDescription),
        htmlentities($metaKeywords),
        date('ymd')
    );
    file_put_contents($cacheFile, $fileContent, FILE_APPEND); // Append the data to the file

    $data .= $fileContent; // Append the scraped data to the output
    $depth--; // Decrement the depth parameter

    // Crawl the links on the current page
    foreach ($doc->getElementsByTagName('a') as $linkTag) {
        $linkUrl = $linkTag->getAttribute('href');
        if (!empty($linkUrl)) {
            $linkUrl = (strpos($linkUrl, 'http') === 0) ? $linkUrl : resolveUrl($url, $linkUrl);
            scrapeWebPage($linkUrl, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages); // Recursive call
        }
    }
}

function resolveUrl($baseUrl, $relativeUrl) {
    // If the relative URL is already absolute, return it
    if (filter_var($relativeUrl, FILTER_VALIDATE_URL)) {
        return $relativeUrl;
    }

    // Parse the base URL to get its components
    $baseParts = parse_url($baseUrl);

    // If the relative URL starts with a '/', it is relative to the domain
    if (strpos($relativeUrl, '/') === 0) {
        return $baseParts['scheme'] . '://' . $baseParts['host'] . $relativeUrl;
    }

    // Parse the base path and ensure it ends with a '/'
    $basePath = isset($baseParts['path']) ? rtrim(dirname($baseParts['path']), '/') . '/' : '/';

    // Return the resolved URL
    return $baseParts['scheme'] . '://' . $baseParts['host'] . $basePath . ltrim($relativeUrl, '/');
}

$data = '';
$visitedUrls = [];
$pagesCrawled = 0; // Initialize the count of crawled pages

scrapeWebPage($url, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages);

$fileName = "crawl" . $randomNumber;
$mergedData = '';

// Remove the cache folder
if (is_dir($cacheFolder)) {
    $files = glob($cacheFolder . '*');
    foreach ($files as $file) {
        if (is_file($file)) {
            $mergedData .= file_get_contents($file);
            unlink($file); // Delete the cached file
        }
    }
    rmdir($cacheFolder); // Delete the cache folder
}

// Split the merged data into chunks
$maxSize = 1048500; // <1MB
$chunks = [];
$currentChunk = '';
$currentSize = 0;

// Populate the chunks array
foreach (explode("\n", $mergedData) as $line) {
    $lineSize = strlen($line) + 1; // +1 for the newline character
    if ($currentSize + $lineSize > $maxSize) {
        $chunks[] = $currentChunk; // Save the current chunk
        $currentChunk = $line; // Start a new chunk
        $currentSize = $lineSize; // Reset the current size
    } else {
        $currentChunk .= $line . "\n"; // Append to the current chunk
        $currentSize += $lineSize; // Update the current size
    }
}

// Add the last chunk if it exists
if ($currentChunk !== '') {
    $chunks[] = $currentChunk;
}

// Save each chunk in separate output files
if (count($chunks) === 1) {
    // Only one chunk, use the base filename
    $chunkFileName = 'raw/' . $fileName . '.html'; // Specify the 'raw' directory
    file_put_contents($chunkFileName, $chunks[0]);
    echo "<hr>Crawl complete<br>Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List the created file
} else {
    // Multiple chunks, use the part naming convention
    foreach ($chunks as $index => $chunk) {
        $chunkFileName = 'raw/' . $fileName . '_part' . ($index + 1) . '.html'; // Specify the 'raw' directory
        file_put_contents($chunkFileName, $chunk);
        echo "<hr>Crawl complete<br>Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List the created files
    }
}

echo "Pages Crawled: <b> $pagesCrawled </b><br>It is now safe to close this window.";
?>

<?php

error_reporting(E_ALL);
ini_set('display_errors', 1);
ini_set("memory_limit", "768M");

// Get the URL and depth from POST request
$url = $_POST['url'] ?? '';
$depth = $_POST['depth'] ?? 0;
$maxPages = $_POST['max_pages'] ?? 0;

// Generate a random 8-digit number for cache folder naming
$randomNumber = rand(10000000, 99999999);
$cacheFolder = '/var/www/html/' . $randomNumber . '/';

// Set date and time for crawl start
$crawlStartTime = date('d-m-y h:i:s');

// Flush some information to the browser
header('Content-type: text/html; charset=utf-8');

function flushOutput($message) {
    echo $message;
    ob_flush();
    flush();
    usleep(400000);
}

echo str_pad('', 4096);
flushOutput("Crawl $randomNumber initiated at ");
flushOutput("$crawlStartTime<br>");
flushOutput("Master URL: $url <br>Depth: $depth");
flushOutput("Maximum Pages: $maxPages<hr><br>");
flushOutput("<br>");
flushOutput("Crawling in progress. Do not close this window...<br><br>");

// Create the cache folder
if (!is_dir($cacheFolder) && !mkdir($cacheFolder, 0777, true)) {
    echo "Check directory permissions and try again.";
    die('Failed to create cache folder: ' . $cacheFolder);
}

function scrapeWebPage($url, &$data, &$visitedUrls, $depth, &$pagesCrawled, $cacheFolder, $maxPages) {
    if ($depth <= 0 || in_array($url, $visitedUrls) || $pagesCrawled >= $maxPages) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip already visited URLs, reach max depth, or exceed page limit
    }

    $visitedUrls[] = $url; // Add the current URL to the visited list

    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        usleep(2000); // Slow down requests 0.002s
        return; // Skip invalid URLs
    }

    // Check the size of the file before retrieving its content
    $headers = get_headers($url, 1);
    if (isset($headers['Content-Length']) && $headers['Content-Length'] > 5 * 1024 * 1024) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip files larger than 5MB
    }

    $cacheFile = $cacheFolder . md5($url) . '.html';
    if (file_exists($cacheFile)) {
        $data .= file_get_contents($cacheFile); // Retrieve data from cache
        usleep(1000); // Slow down requests 0.001s
        return; // Skip scraping if data is cached
    }

    $html = file_get_contents($url); // add @ to suppress errors here
    if ($html === false) {
        usleep(1000); // Slow down requests 0.001s
        return; // Skip if failed to retrieve HTML content
    }

    $doc = new DOMDocument();
    libxml_use_internal_errors(true); // Suppress HTML parsing errors
    $doc->loadHTML($html);
    libxml_clear_errors();

    $pagesCrawled++; // Increment the count of crawled pages

    // Extract and store the scraped data
    $title = html_entity_decode($doc->getElementsByTagName('title')->item(0)->textContent);
    $size = strlen($html);
    $scriptTag = $doc->getElementsByTagName('script')->length > 0 ? "Yes" : "No";

    // Extract meta tags
    $metaDescription = 'none';
    $metaKeywords = 'none';
    foreach ($doc->getElementsByTagName('meta') as $tag) {
        if ($tag->getAttribute('name') == 'description') {
            $metaDescription = $tag->getAttribute('content');
        }
        if ($tag->getAttribute('name') == 'keywords') {
            $metaKeywords = $tag->getAttribute('content');
        }
    }

    // Store the scraped data in the cache
    $fileContent = sprintf(
        '<div name="%s"><h3 name="title">%s</h3><a name="URL" href="%s">%s</a><p name="scriptTag">%s</p><p name="size">%d</p><p name="meta description">%s</p><p name="meta keywords">%s</p><p name="dateStamp">%s</p></div><hr>',
        $url,
        htmlentities($title),
        $url,
        $url,
        $scriptTag,
        $size,
        htmlentities($metaDescription),
        htmlentities($metaKeywords),
        date('ymd')
    );
    file_put_contents($cacheFile, $fileContent, FILE_APPEND); // Append the data to the file

    $data .= $fileContent; // Append the scraped data to the output
    $depth--; // Decrement the depth parameter

    // Crawl the links on the current page
    foreach ($doc->getElementsByTagName('a') as $linkTag) {
        $linkUrl = $linkTag->getAttribute('href');
        if (!empty($linkUrl)) {
            $linkUrl = (strpos($linkUrl, 'http') === 0) ? $linkUrl : resolveUrl($url, $linkUrl);
            scrapeWebPage($linkUrl, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages); // Recursive call
        }
    }
}

function resolveUrl($baseUrl, $relativeUrl) {
    // If the relative URL is already absolute, return it
    if (filter_var($relativeUrl, FILTER_VALIDATE_URL)) {
        return $relativeUrl;
    }

    // Parse the base URL to get its components
    $baseParts = parse_url($baseUrl);

    // If the relative URL starts with a '/', it is relative to the domain
    if (strpos($relativeUrl, '/') === 0) {
        return $baseParts['scheme'] . '://' . $baseParts['host'] . $relativeUrl;
    }

    // Parse the base path and ensure it ends with a '/'
    $basePath = isset($baseParts['path']) ? rtrim(dirname($baseParts['path']), '/') . '/' : '/';

    // Return the resolved URL
    return $baseParts['scheme'] . '://' . $baseParts['host'] . $basePath . ltrim($relativeUrl, '/');
}

$data = '';
$visitedUrls = [];
$pagesCrawled = 0; // Initialize the count of crawled pages

scrapeWebPage($url, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages);

$fileName = "crawl" . $randomNumber;
$mergedData = '';

// Remove the cache folder
if (is_dir($cacheFolder)) {
    $files = glob($cacheFolder . '*');
    foreach ($files as $file) {
        if (is_file($file)) {
            $mergedData .= file_get_contents($file);
            unlink($file); // Delete the cached file
        }
    }
    rmdir($cacheFolder); // Delete the cache folder
}

// Split the merged data into chunks
$maxSize = 1048500; // <1MB
$chunks = [];
$currentChunk = '';
$currentSize = 0;

// Populate the chunks array
foreach (explode("\n", $mergedData) as $line) {
    $lineSize = strlen($line) + 1; // +1 for the newline character
    if ($currentSize + $lineSize > $maxSize) {
        $chunks[] = $currentChunk; // Save the current chunk
        $currentChunk = $line; // Start a new chunk
        $currentSize = $lineSize; // Reset the current size
    } else {
        $currentChunk .= $line . "\n"; // Append to the current chunk
        $currentSize += $lineSize; // Update the current size
    }
}

// Add the last chunk if it exists
if ($currentChunk !== '') {
    $chunks[] = $currentChunk;
}

// Save each chunk in separate output files
if (count($chunks) === 1) {
    // Only one chunk, use the base filename
    $chunkFileName = 'raw/' . $fileName . '.html'; // Specify the 'raw' directory
    file_put_contents($chunkFileName, $chunks[0]);
    echo "<hr>Crawl complete<br>Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List the created file
} else {
    // Multiple chunks, use the part naming convention
    foreach ($chunks as $index => $chunk) {
        $chunkFileName = 'raw/' . $fileName . '_part' . ($index + 1) . '.html'; // Specify the 'raw' directory
        file_put_contents($chunkFileName, $chunk);
        echo "<hr>Crawl complete<br>Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List the created files
    }
}

echo "Pages Crawled: <b> $pagesCrawled </b><br>It is now safe to close this window.";
?>
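For reference, the crawler expects three POST fields (url, depth, and max_pages) and writes its report files into a raw/ directory relative to the script. Below is a minimal usage sketch for driving it from another PHP script; the endpoint path and parameter values are assumptions, not part of the paste above.

<?php

// Hypothetical endpoint: adjust to wherever the crawler script is deployed.
$endpoint = 'http://localhost/crawler.php';

// The three POST fields the crawler reads.
$postData = http_build_query([
    'url'       => 'https://example.com/', // master URL to crawl
    'depth'     => 2,                       // link-following depth
    'max_pages' => 25,                      // hard page limit
]);

$context = stream_context_create([
    'http' => [
        'method'  => 'POST',
        'header'  => "Content-Type: application/x-www-form-urlencoded\r\n",
        'content' => $postData,
    ],
]);

// Prints the crawler's progress messages and the links to the generated
// raw/crawlNNNNNNNN*.html report files.
echo file_get_contents($endpoint, false, $context);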
Output for git.master_jit, git.master
Fatal error: Cannot redeclare function scrapeWebPage() (previously declared in /in/LkLaY:45) in /in/LkLaY on line 253
Process exited with code 255.
Output for rfc.property-hooks
Fatal error: Cannot redeclare scrapeWebPage() (previously declared in /in/LkLaY:45) in /in/LkLaY on line 330
Process exited with code 255.
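Both branches fail for the same reason: the paste contains the entire script twice, so the second <?php block declares scrapeWebPage() again after the first block already defined it, and PHP aborts before any crawling starts. The straightforward fix is to submit only one copy of the script. If the definitions might legitimately be loaded more than once, they can also be wrapped in a function_exists() guard; a minimal sketch, with the function bodies elided:

<?php

// Sketch only: guarding the declarations avoids the "Cannot redeclare"
// fatal error when the same code is included or pasted twice.
// Deleting the duplicate block is the cleaner fix.
if (!function_exists('scrapeWebPage')) {
    function scrapeWebPage($url, &$data, &$visitedUrls, $depth, &$pagesCrawled, $cacheFolder, $maxPages) {
        // ... crawler body as defined above ...
    }
}

if (!function_exists('resolveUrl')) {
    function resolveUrl($baseUrl, $relativeUrl) {
        // ... URL resolution as defined above ...
    }
}

Note that the 3v4l sandbox is unlikely to allow the outbound HTTP requests and writes to /var/www/html/ that the crawler needs, so this paste is best read as a syntax check rather than a working run.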

This tab shows results from various feature branches currently under review by the PHP developers. Contact me to have additional branches featured.

Once feature branches are merged or declined, they are no longer available here. Their functionality (when merged) can be viewed from the main output page.