I am using a PHP crawler that has a memory leak. It is good for the first ~3125 links, then it runs out of memory.I tried getting rid of the MySQL insert, but that did not change anything. Can someone help me diagnose this problem? Thank you so much.
<?php
include $_SERVER['DOCUMENT_ROOT'] . '/config.php';
ini_set('max_execution_time', 0);
// USAGE
$startURL = $your_url;
$depth = 9999;
$crawler = new crawler($startURL, $depth);
// Exclude path with the following structure to be processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
class crawler
{
protected $_url;
protected $_depth;
protected $_host;
protected $_seen = array();
protected $_filter = array();
public function __construct($url, $depth = 5)
{
$this->_url = $url;
$this->_depth = $depth;
$parse = parse_url($url);
$this->_host = $parse['host'];
}
protected function _processAnchors($content, $url, $depth)
{
$dom = new DOMDocument('1.0');
@$dom->loadHTML($content);
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $element) {
$href = $element->getAttribute('href');
if (0 !== strpos($href, 'http')) {
$path = '/' . ltrim($href, '/');
if (extension_loaded('http')) {
$href = http_build_url($url, array('path' => $path));
} else {
$parts = parse_url($url);
$href = $parts['scheme'] . '://';
if (isset($parts['user']) && isset($parts['pass'])) {
$href .= $parts['user'] . ':' . $parts['pass'] . '@';
}
$href .= $parts['host'];
if (isset($parts['port'])) {
$href .= ':' . $parts['port'];
}
$href .= $path;
}
}
// Crawl only link that belongs to the start domain
$this->crawl_page($href, $depth - 1);
}
}
protected function _getContent($url)
{
$handle = curl_init($url);
curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
$response = curl_exec($handle);
curl_close($handle);
return array($response);
}
protected function _printResult($url, $depth)
{
ob_end_flush();
$currentDepth = $this->_depth - $depth;
$count = count($this->_seen);
echo "$url <br>";
include $_SERVER['DOCUMENT_ROOT'] . '/config.php';
$databaseconnect = new PDO("mysql:dbname=DB_NAME;host=$mysqlhost;charset=utf8", $mysqlusername, $mysqlpassword);
$statement = $databaseconnect->prepare("INSERT INTO data(url,name) VALUES(:url,:name)");
$statement->execute(array(':url' => $url,
':name' => $url));
ob_start();
flush();
}
protected function isValid($url, $depth)
{
if (strpos($url, $this->_host) === false
|| $depth === 0
|| isset($this->_seen[$url])
) {
return false;
}
foreach ($this->_filter as $excludePath) {
if (strpos($url, $excludePath) !== false) {
return false;
}
}
return true;
}
public function crawl_page($url, $depth)
{
if (!$this->isValid($url, $depth)) {
return;
}
// add to the seen URL
$this->_seen[$url] = true;
// get Content and Return Code
list($content) = $this->_getContent($url);
// print Result for current Page
$this->_printResult($url, $depth);
// process subPages
$this->_processAnchors($content, $url, $depth);
}
public function addFilterPath($path)
{
$this->_filter[] = $path;
}
public function run()
{
$this->crawl_page($this->_url
, $this->_depth);
}
}
?>