I am trying to loop over a URL parameter, passing each URL into a function which fetches the page with cURL, gets all the HTML and runs XPath queries on it. But the results vary. Is there something special I need to consider when using cURL or XPath? Sometimes it collects an empty string. The code works, apart from this flaw, which is really hard to debug.
Here is the code I use.
/**
 * Download a company page with cURL and scrape its details with XPath.
 *
 * The returned array always contains the same keys, in the same order:
 * companyname, streetadress, zip, phone1, phone2, link, ceo, orgnr,
 * turnover2013, turnover2012, turnover2011, description, logo, capturelink.
 * A field is null when its XPath expression matches nothing on the page, or
 * when the page could not be downloaded at all; capturelink is always "".
 *
 * @param string $url Address of the page to scrape.
 * @return array Scraped fields keyed by name (see above).
 */
private function getArticles($url){
    // XPath expression for each scraped field, listed in the order the keys
    // are returned to the caller.
    $queries = array(
        'companyname' => '//*[@id="item-details"]/div/div[1]/h1',
        'streetadress' => '//*[@id="item-details"]/div[2]/div[3]/div[1]/text()[1]',
        'zip' => '//*[@id="item-details"]/div[2]/div[3]/div[1]/text()[2]',
        'phone1' => '//*[@id="item-details"]/div/h2/span[2]',
        'phone2' => '//*[@id="item-details"]/div/h2[2]/span[2]',
        'link' => '//*[@id="item-details"]/div/ul/li[1]/a/@href',
        'ceo' => '//*[@id="company-financials"]/div/div[2]/span',
        'orgnr' => '//*[@id="company-financials"]/div/div[1]/span',
        // Match the class as a whitespace-separated token, so an element
        // carrying additional classes is still found (@class="x" only
        // matches when the attribute is exactly "x").
        'turnover2013' => '//*[contains(concat(" ", normalize-space(@class), " "), " geb-turnover1 ")]',
        'turnover2012' => '//*[contains(concat(" ", normalize-space(@class), " "), " geb-turnover2 ")]',
        'turnover2011' => '//*[contains(concat(" ", normalize-space(@class), " "), " geb-turnover3 ")]',
        'description' => '//*[@id="item-info"]/div[1]/div',
        'logo' => '//*[@id="item-info"]/p/img/@src',
    );

    // Start with every field empty so the caller gets a complete array even
    // when the download fails or an expression matches nothing.
    $result = array_fill_keys(array_keys($queries), null);
    $result['capturelink'] = "";
    // $result['capturelink'] = $this->getWebCapture(<value of 'link'>);

    // Instantiate cURL to grab the HTML page.
    $c = curl_init($url);
    curl_setopt($c, CURLOPT_HEADER, false);
    curl_setopt($c, CURLOPT_USERAGENT, $this->getUserAgent());
    curl_setopt($c, CURLOPT_FAILONERROR, true);
    curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($c, CURLOPT_AUTOREFERER, true);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($c, CURLOPT_TIMEOUT, 10);

    // Grab the data.
    $html = curl_exec($c);

    // A failed transfer (false) or an empty body cannot be parsed: report
    // it and return the empty result instead of feeding it to the parser.
    if ($html === false || $html === '') {
        echo "<p>cURL error number: " . curl_errno($c) . " on URL: "
            . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "</p>"
            . "<p>cURL error: "
            . htmlspecialchars(curl_error($c), ENT_QUOTES, 'UTF-8') . "</p>";
        curl_close($c);
        return $result;
    }

    // Close connection.
    curl_close($c);

    // Real-world HTML is rarely valid; collect libxml's parse diagnostics
    // internally rather than silencing everything with "@", then restore
    // the previous error-handling mode.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($dom);

    // Take the first match of each expression. item(0) is null when nothing
    // matched, so guard before reading nodeValue.
    foreach ($queries as $key => $expression) {
        $nodes = $xpath->query($expression);
        $node = ($nodes instanceof DOMNodeList) ? $nodes->item(0) : null;
        if ($node !== null) {
            $result[$key] = $node->nodeValue;
        }
    }

    return $result;
}
// End Get Articles
Edit:
I really tried everything on this one, but I ended up using phpQuery and now it works. I do think PHP's DOM and XPath combined are not always a good mix. At least not for me in this case.
This is how I use it instead of XPath:
....
// Excerpt from a function body: parse the fetched page with phpQuery and pick
// each field with a CSS selector instead of an XPath expression.
// $html is not defined in this excerpt — presumably the cURL response body.
require('phpQuery.php');
phpQuery::newDocumentHTML($html);
$capture = "";
// $capture = $this->getWebCapture(pq('.website')->attr('href'));
return array(
'companyname' => pq('.header')->find('h1')->text(),
'streetadress' => pq('.address-container:first-child')->text(),
// NOTE(review): no :first-child or other narrowing here, so this selects
// every .address-container — confirm this yields only the zip.
'zip' => pq('.address-container')->text(),
// NOTE(review): phone1 and phone2 use the same selector, so both entries
// receive the same value.
'phone1' => pq('.phone-number')->text(),
'phone2' => pq('.phone-number')->text(),
'link' => pq('.website')->attr('href'),
'ceo' => pq('.geb-ceo')->text(),
'orgnr' => pq('.geb-org-number')->text(),
'turnover2013' => pq('.geb-turnover1')->text(),
'turnover2012' => pq('.geb-turnover2')->text(),
'turnover2011' => pq('.geb-turnover3')->text(),
'description' => pq('#item-info div div')->text(),
// NOTE(review): "logo" is written as an element name here (no "." or "#");
// possibly ".logo" was intended — verify against the page markup.
'logo' => pq('#item-info logo img')->attr('src'),
// NOTE(review): this key is 'capture', whereas the XPath version of the
// function returns 'capturelink' — confirm which one callers expect.
'capture' => $capture);