<?php
/**
 * Static helper for fetching web pages and extracting data from their HTML.
 *
 * Typical flow: call data_cache($url) to populate self::$response (from the
 * cache registered under 'cache' in Zend_Registry, or by downloading), then
 * use the query_* / parse_* methods, which all operate on self::$response.
 *
 * External dependencies referenced by this class: the cURL extension,
 * Zend_Registry, Zend_Dom_Query, phpQuery (including its global pq()
 * function), Zend_Loader_Autoloader and RedBean's R facade.
 */
class scraper
{
    /** @var string|false|null Page body most recently loaded by data_cache(). */
    public static $response;

    /** @var mixed Document object created from self::$response by load_pq_dom(). */
    public static $dom;

    /**
     * Pick one user-agent string at random from a fixed list.
     *
     * @return string
     */
    public static function get_useragent()
    {
        $ua = array(
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_1) AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
        );

        // array_rand() already selects a random key; shuffling first adds nothing.
        return $ua[array_rand($ua)];
    }

    /**
     * Download a URL with cURL.
     *
     * Terminates the script via die() when the cURL extension is missing
     * (unchanged legacy behaviour).
     *
     * @param string $_url URL to fetch.
     * @return string|false Response body, or false when the transfer failed
     *                      (an E_USER_WARNING describing the failure is raised).
     */
    public static function data_download($_url)
    {
        if(!function_exists('curl_init'))
        {
            die('Sorry cURL is not installed!');
        }

        $ch = curl_init();
        if($ch === false)
        {
            trigger_error('Unable to initialise a cURL handle', E_USER_WARNING);
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $_url);
        curl_setopt($ch, CURLOPT_REFERER, "http://www.google.com/");
        curl_setopt($ch, CURLOPT_USERAGENT, self::get_useragent());
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $output = curl_exec($ch);
        if($output === false)
        {
            // cURL reports failures through its return value, not exceptions,
            // so surface the reason before the handle is closed.
            trigger_error('cURL request for ' . $_url . ' failed: ' . curl_error($ch), E_USER_WARNING);
        }
        curl_close($ch);

        return $output;
    }

    /**
     * Load the page body for $url into self::$response, using the cache
     * object stored under 'cache' in Zend_Registry.
     *
     * On a cache miss the page is downloaded, trimmed and, if non-empty,
     * saved under md5($url). The downloaded body is kept in self::$response
     * regardless of whether saving to the cache succeeded. Failed or empty
     * downloads leave self::$response as '' and are not cached.
     *
     * Zend_Exception raised by the cache calls is caught and echoed.
     *
     * @param string $url URL to load.
     * @return void
     */
    public static function data_cache($url)
    {
        $cache = Zend_Registry::get('cache');
        $cache_id = md5($url);
        try
        {
            $response = $cache->load($cache_id);
            if($response === false || $response === '')
            {
                // delay execution for a few seconds at random
                //sleep(rand(5, 15));
                $downloaded = self::data_download($url);
                $response = is_string($downloaded) ? trim($downloaded) : '';
                if($response !== '')
                {
                    // The save result is deliberately not assigned to
                    // $response: it is a status flag, not the page body.
                    $cache->save($response, $cache_id);
                }
            }
            self::$response = $response;
        }
        catch(Zend_Exception $e)
        {
            echo 'Caught Exception: ' . get_class($e) . "\n";
            echo 'Message: ' . $e->getMessage() . "\n";
        }
    }

    /**
     * Run a CSS selector against self::$response via Zend_Dom_Query::query().
     *
     * @param string $query CSS selector.
     * @return mixed Whatever Zend_Dom_Query::query() returns.
     */
    public static function query_css($query)
    {
        $dom = new Zend_Dom_Query(self::$response);
        return $dom->query($query);
    }

    /**
     * Run an XPath expression against self::$response via
     * Zend_Dom_Query::queryXpath().
     *
     * @param string $query XPath expression.
     * @return mixed Whatever Zend_Dom_Query::queryXpath() returns.
     */
    public static function query_xpath($query)
    {
        $dom = new Zend_Dom_Query(self::$response);
        return $dom->queryXpath($query);
    }

    /**
     * Build a phpQuery document from self::$response and store it in
     * self::$dom. Zend_Exception is caught and echoed.
     *
     * @return void
     */
    public static function load_pq_dom()
    {
        try
        {
            self::$dom = phpQuery::newDocumentHTML(self::$response);
        }
        catch(Zend_Exception $e)
        {
            echo 'Caught Exception: ' . get_class($e) . "\n";
            echo 'Message: ' . $e->getMessage() . "\n";
        }
    }

    /**
     * Return the html() of the elements matching $query in a freshly built
     * phpQuery document.
     *
     * @param string $query Selector passed to find().
     * @return mixed Result of html() on the matched set.
     */
    public static function query_pq($query)
    {
        self::load_pq_dom();
        return self::$dom->find($query)->html();
    }

    /**
     * Build a key => value map from repeated blocks (for example table rows).
     *
     * For every element matched by $query, the trimmed text of $_key within
     * that element becomes the array key and the trimmed text of $_value the
     * value. Later blocks overwrite earlier ones that share a key.
     *
     * NOTE(review): pq() is called without an explicit document; presumably it
     * targets the document just created by load_pq_dom() - confirm.
     *
     * @param string $query  Selector for each block.
     * @param string $_key   Selector, relative to a block, for the key text.
     * @param string $_value Selector, relative to a block, for the value text.
     * @return array Map of key text to value text; empty when nothing matches.
     */
    public static function parse_table($query, $_key, $_value)
    {
        self::load_pq_dom();
        $array = array();
        foreach(pq($query) as $block)
        {
            $key = trim(pq($_key, $block)->text());
            $value = trim(pq($_value, $block)->text());
            $array[$key] = $value;
        }
        return $array;
    }

    /**
     * For every element matched by $query, collect the trimmed text of the
     * 'ul li' elements found inside it.
     *
     * @param string $query Selector for each block.
     * @return array One string per matched block; empty when nothing matches.
     */
    public static function parse_list($query)
    {
        self::load_pq_dom();
        $array = array();
        foreach(pq($query) as $block)
        {
            $array[] = trim(pq('ul li', $block)->text());
        }
        return $array;
    }

    /**
     * For every element matched by $query, collect the trimmed text of the
     * 'option' elements found inside it.
     *
     * @param string $query Selector for each block.
     * @return array One string per matched block; empty when nothing matches.
     */
    public static function parse_options($query)
    {
        self::load_pq_dom();
        $array = array();
        foreach(pq($query) as $block)
        {
            $array[] = trim(pq('option', $block)->text());
        }
        return $array;
    }

    /**
     * Collect the href attribute of every element matched by a CSS selector
     * (evaluated through query_css()).
     *
     * @param string $query CSS selector.
     * @return array href values in match order; empty when nothing matches.
     */
    public static function parse_links($query)
    {
        $links = array();
        foreach(self::query_css($query) as $href)
        {
            $links[] = $href->getAttribute('href');
        }
        return $links;
    }

    /**
     * Register the "RedBean_" namespace with the Zend autoloader and call
     * R::setup() with a MySQL DSN built from the given settings.
     *
     * @param string $host     Database host.
     * @param string $db_name  Database name.
     * @param string $user     Database user.
     * @param string $password Database password.
     * @return void
     */
    public static function setup_database($host, $db_name, $user, $password)
    {
        Zend_Loader_Autoloader::getInstance()->registerNamespace("RedBean_");
        R::setup("mysql:host={$host}; dbname={$db_name}", $user, $password);
    }
}
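// NOTE(review): the two lines below are not PHP; they appear to be stray
// debug/profiler output captured along with the file - confirm and remove.
// preferences:
// 24.59 ms | 402 KiB | 5 Q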