<?php
error_reporting(E_ALL);
ini_set('display_errors', 1);
ini_set("memory_limit", "768M");
// Get the URL, crawl depth, and page limit from the POST request
$url = $_POST['url'] ?? '';
$depth = (int)($_POST['depth'] ?? 0);
$maxPages = (int)($_POST['max_pages'] ?? 0);
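// Note: with the guards below, a depth or max_pages of 0 stops the crawl before any
// page is fetched, so the submitting form is expected to supply positive integers.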
// Generate a random 8-digit number for cache folder naming
$randomNumber = rand(10000000, 99999999);
$cacheFolder = '/var/www/html/' . $randomNumber . '/';
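// NOTE: the per-crawl cache folder is created directly under /var/www/html (a common
// web root); adjust this absolute path to match your deployment.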
// Set date and time for crawl start
$crawlStartTime = date('d-m-Y H:i:s');
// Flush some information to the browser
header('Content-Type: text/html; charset=utf-8');
function flushOutput($message) {
echo $message;
if (ob_get_level() > 0) {
ob_flush(); // Flush PHP's output buffer only if one is active (avoids a notice)
}
flush();
usleep(400000); // Pause 0.4s so the progress messages appear gradually
}
echo str_pad('', 4096); // Pad the response so the web server's own buffer releases the early progress output
flushOutput("Crawl $randomNumber initiated at ");
flushOutput("$crawlStartTime<br>");
flushOutput("Master URL: $url <br>Depth: $depth");
flushOutput("Maximum Pages: $maxPages<hr><br>");
flushOutput("<br>");
flushOutput("Crawling in progress. Do not close this window...<br><br>");
// Create the cache folder
if (!is_dir($cacheFolder) && !mkdir($cacheFolder, 0777, true)) {
echo "Check directory permissions and try again.";
die('Failed to create cache folder: ' . $cacheFolder);
}
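/**
 * Recursively scrape $url and the pages it links to.
 * $data, $visitedUrls and $pagesCrawled are passed by reference and accumulate across
 * the whole crawl; $depth and $maxPages bound the recursion.
 */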
function scrapeWebPage($url, &$data, &$visitedUrls, $depth, &$pagesCrawled, $cacheFolder, $maxPages) {
if ($depth <= 0 || in_array($url, $visitedUrls) || $pagesCrawled >= $maxPages) {
usleep(1000); // Slow down requests 0.001s
return; // Stop if the URL was already visited, the depth limit is reached, or the page limit is hit
}
$visitedUrls[] = $url; // Add the current URL to the visited list
if (!filter_var($url, FILTER_VALIDATE_URL)) {
usleep(2000); // Slow down requests 0.002s
return; // Skip invalid URLs
}
// Check the reported size before downloading the content
$headers = @get_headers($url, 1);
if ($headers !== false && isset($headers['Content-Length'])) {
// When redirects are followed, Content-Length can be an array of values; use the last one
$length = is_array($headers['Content-Length']) ? (int)end($headers['Content-Length']) : (int)$headers['Content-Length'];
if ($length > 5 * 1024 * 1024) {
usleep(1000); // Slow down requests 0.001s
return; // Skip files larger than 5MB
}
}
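// Note: get_headers() issues its own HTTP request just to read the headers, so every
// page effectively costs two requests; acceptable for small crawls.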
$cacheFile = $cacheFolder . md5($url) . '.html';
if (file_exists($cacheFile)) {
$data .= file_get_contents($cacheFile); // Retrieve data from cache
usleep(1000); // Slow down requests 0.001s
return; // Skip scraping if data is cached
}
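// Optional hardening (sketch, not part of the original flow): pass a stream context with
// a timeout and a User-Agent so slow hosts cannot stall the crawl, for example:
//   $context = stream_context_create(['http' => ['timeout' => 10, 'user_agent' => 'MiniCrawler/1.0']]);
//   $html = @file_get_contents($url, false, $context);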
$html = @file_get_contents($url); // Suppress warnings on failed requests; the false check below handles them
if ($html === false) {
usleep(1000); // Slow down requests 0.001s
return; // Skip if failed to retrieve HTML content
}
$doc = new DOMDocument();
libxml_use_internal_errors(true); // Suppress HTML parsing errors
$doc->loadHTML($html);
libxml_clear_errors();
$pagesCrawled++; // Increment the count of crawled pages
// Extract and store the scraped data
$titleNode = $doc->getElementsByTagName('title')->item(0);
$title = $titleNode !== null ? html_entity_decode($titleNode->textContent) : 'Untitled'; // Guard against pages with no <title>
$size = strlen($html);
$scriptTag = $doc->getElementsByTagName('script')->length > 0 ? "Yes" : "No";
// Extract meta tags
$metaDescription = 'none';
$metaKeywords = 'none';
foreach ($doc->getElementsByTagName('meta') as $tag) {
if ($tag->getAttribute('name') == 'description') {
$metaDescription = $tag->getAttribute('content');
}
if ($tag->getAttribute('name') == 'keywords') {
$metaKeywords = $tag->getAttribute('content');
}
}
// Store the scraped data in the cache; the trailing newline lets the merged output be split line by line later
$fileContent = sprintf(
'<div name="%s"><h3 name="title">%s</h3><a name="URL" href="%s">%s</a><p name="scriptTag">%s</p><p name="size">%d</p><p name="meta description">%s</p><p name="meta keywords">%s</p><p name="dateStamp">%s</p></div><hr>' . "\n",
$url,
htmlentities($title),
$url,
$url,
$scriptTag,
$size,
htmlentities($metaDescription),
htmlentities($metaKeywords),
date('ymd')
);
file_put_contents($cacheFile, $fileContent, FILE_APPEND); // Write this page's record to its cache file
$data .= $fileContent; // Append the scraped data to the output
$depth--; // Links found on this page sit one level deeper, so reduce the remaining depth
// Crawl the links on the current page
foreach ($doc->getElementsByTagName('a') as $linkTag) {
$linkUrl = trim($linkTag->getAttribute('href'));
// Skip empty links, in-page anchors, and non-HTTP schemes such as mailto: or javascript:
if ($linkUrl === '' || strpos($linkUrl, '#') === 0 || preg_match('/^(mailto|javascript|tel):/i', $linkUrl)) {
continue;
}
$linkUrl = (strpos($linkUrl, 'http') === 0) ? $linkUrl : resolveUrl($url, $linkUrl);
scrapeWebPage($linkUrl, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages); // Recursive call
}
}
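/**
 * Resolve a relative URL against the page it was found on. Handles protocol-relative
 * (//host/...), root-relative (/path) and path-relative (page.html) links; dot
 * segments such as '../' are not collapsed.
 */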
function resolveUrl($baseUrl, $relativeUrl) {
// If the relative URL is already absolute, return it
if (filter_var($relativeUrl, FILTER_VALIDATE_URL)) {
return $relativeUrl;
}
// Parse the base URL to get its components
$baseParts = parse_url($baseUrl);
// Protocol-relative URLs (//host/path) inherit the base URL's scheme
if (strpos($relativeUrl, '//') === 0) {
return $baseParts['scheme'] . ':' . $relativeUrl;
}
// If the relative URL starts with a single '/', it is relative to the domain root
if (strpos($relativeUrl, '/') === 0) {
return $baseParts['scheme'] . '://' . $baseParts['host'] . $relativeUrl;
}
// Take the directory part of the base path and ensure it ends with a '/'
$basePath = isset($baseParts['path']) ? rtrim(dirname($baseParts['path']), '/') . '/' : '/';
// Return the resolved URL
return $baseParts['scheme'] . '://' . $baseParts['host'] . $basePath . ltrim($relativeUrl, '/');
}
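// Example: resolveUrl('https://example.com/docs/index.html', 'page2.html') returns
// 'https://example.com/docs/page2.html', and a root-relative link such as '/about'
// resolves to 'https://example.com/about'.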
$data = '';
$visitedUrls = [];
$pagesCrawled = 0; // Initialize the count of crawled pages
scrapeWebPage($url, $data, $visitedUrls, $depth, $pagesCrawled, $cacheFolder, $maxPages);
$fileName = "crawl" . $randomNumber;
$mergedData = '';
// Merge the cached page records into one string, then delete the cache folder
if (is_dir($cacheFolder)) {
$files = glob($cacheFolder . '*');
foreach ($files as $file) {
if (is_file($file)) {
$mergedData .= file_get_contents($file);
unlink($file); // Delete the cached file
}
}
rmdir($cacheFolder); // Delete the cache folder
}
// Split the merged data into chunks
$maxSize = 1048500; // Keep each output file just under 1 MiB
$chunks = [];
$currentChunk = '';
$currentSize = 0;
// Populate the chunks array
foreach (explode("\n", $mergedData) as $line) {
$lineSize = strlen($line) + 1; // +1 for the newline character
if ($currentSize + $lineSize > $maxSize) {
$chunks[] = $currentChunk; // Save the current chunk
$currentChunk = $line . "\n"; // Start a new chunk with this line, keeping its newline
$currentSize = $lineSize; // Reset the running size
} else {
$currentChunk .= $line . "\n"; // Append to the current chunk
$currentSize += $lineSize; // Update the running size
}
}
// Add the last chunk if it exists
if ($currentChunk !== '') {
$chunks[] = $currentChunk;
}
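// Added safeguard (assumes output is written to a 'raw/' folder relative to the script's
// working directory, as the paths below imply): create the folder if it does not exist.
if (!is_dir('raw')) {
mkdir('raw', 0777, true);
}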
// Save each chunk in separate output files
if (count($chunks) === 1) {
// Only one chunk, use the base filename
$chunkFileName = 'raw/' . $fileName . '.html'; // Specify the 'raw' directory
file_put_contents($chunkFileName, $chunks[0]);
echo "<hr>Crawl complete<br>Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List the created file
} else {
// Multiple chunks, use the part naming convention
echo "<hr>Crawl complete<br>";
foreach ($chunks as $index => $chunk) {
$chunkFileName = 'raw/' . $fileName . '_part' . ($index + 1) . '.html'; // Specify the 'raw' directory
file_put_contents($chunkFileName, $chunk);
echo "Created file: <a href='/$chunkFileName'>$chunkFileName</a><br>"; // List each created file
}
}
echo "Pages Crawled: <b> $pagesCrawled </b><br>It is now safe to close this window.";
?>