douniao7308 2016-07-04 14:36
浏览 65

libxml无法正确解析

For some reason, libxml fails to get the text correctly from a page. What I want is to get the text "In last few days ...", which is the main text of the article, but libxml stops, saying that it couldn't find the end of some start tag. This is the code:

[niko@dev1 tmp]$ cat domtest2.php 
<?php

    $url="http://www.journaldev.com/253/65-html5-tutorials-examples-and-resources-for-web-developers";
    $ch=curl_init();
    curl_setopt($ch,CURLOPT_URL,$url);
    curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
    $htmltext=curl_exec($ch);

    $dom=new DOMDocument;
    $result=$dom->loadHTML($htmltext);
    $full_text=$dom->textContent;

    echo $full_text;
?>
[niko@dev1 tmp]$ 

Code output:

[niko@dev1 tmp]$ php -f domtest2.php
PHP Warning:  DOMDocument::loadHTML(): htmlParseStartTag: misplaced <head> tag in Entity, line: 57 in /tmp/domtest2.php on line 10
PHP Stack trace:
PHP   1. {main}() /tmp/domtest2.php:0
PHP   2. DOMDocument->loadHTML() /tmp/domtest2.php:10
PHP Warning:  DOMDocument::loadHTML(): Tag header invalid in Entity, line: 69 in /tmp/domtest2.php on line 10
PHP Stack trace:
PHP   1. {main}() /tmp/domtest2.php:0
PHP   2. DOMDocument->loadHTML() /tmp/domtest2.php:10
PHP Warning:  DOMDocument::loadHTML(): AttValue: " expected in Entity, line: 76 in /tmp/domtest2.php on line 10
PHP Stack trace:
PHP   1. {main}() /tmp/domtest2.php:0
PHP   2. DOMDocument->loadHTML() /tmp/domtest2.php:10
PHP Warning:  DOMDocument::loadHTML(): Tag section invalid in Entity, line: 76 in /tmp/domtest2.php on line 10
PHP Stack trace:
PHP   1. {main}() /tmp/domtest2.php:0
PHP   2. DOMDocument->loadHTML() /tmp/domtest2.php:10
PHP Warning:  DOMDocument::loadHTML(): Couldn't find end of Start Tag section in Entity, line: 76 in /tmp/domtest2.php on line 10
PHP Stack trace:
PHP   1. {main}() /tmp/domtest2.php:0
PHP   2. DOMDocument->loadHTML() /tmp/domtest2.php:10
65 HTML5 Tutorials, Examples and Resources for Web Developers - JournalDevwindow._wpemojiSettings={"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/72x72\/","ext":".png","source":{"concatemoji":"http:\/\/www.journaldev.com\/wp-includes\/js\/wp-emoji-release.min.js"}};!function(a,b,c){function d(a){var c,d,e,f=b.createElement("canvas"),g=f.getContext&&f.getContext("2d"),h=String.fromCharCode;if(!g||!g.fillText)return!1;switch(g.textBaseline="top",g.font="600 32px Arial",a){case"flag":return g.fillText(h(55356,56806,55356,56826),0,0),f.toDataURL().length>3e3;case"diversity":return g.fillText(h(55356,57221),0,0),c=g.getImageData(16,16,1,1).data,d=c[0]+","+c[1]+","+c[2]+","+c[3],g.fillText(h(55356,57221,55356,57343),0,0),c=g.getImageData(16,16,1,1).data,e=c[0]+","+c[1]+","+c[2]+","+c[3],d!==e;case"simple":return g.fillText(h(55357,56835),0,0),0!==g.getImageData(16,16,1,1).data[0];case"unicode8":return g.fillText(h(55356,57135),0,0),0!==g.getImageData(16,16,1,1).data[0]}return!1}function e(a){var c=b.createElement("script");c.src=a,c.type="text/javascript",b.getElementsByTagName("head")[0].appendChild(c)}var 
f,g,h,i;for(i=Array("simple","flag","unicode8","diversity"),c.supports={everything:!0,everythingExceptFlag:!0},h=0;h<i.length;h++)c.supports[i[h]]=d(i[h]),c.supports.everything=c.supports.everything&&c.supports[i[h]],"flag"!==i[h]&&(c.supports.everythingExceptFlag=c.supports.everythingExceptFlag&&c.supports[i[h]]);c.supports.everythingExceptFlag=c.supports.everythingExceptFlag&&!c.supports.flag,c.DOMReady=!1,c.readyCallback=function(){c.DOMReady=!0},c.supports.everything||(g=function(){c.readyCallback()},b.addEventListener?(b.addEventListener("DOMContentLoaded",g,!1),a.addEventListener("load",g,!1)):(a.attachEvent("onload",g),b.attachEvent("onreadystatechange",function(){"complete"===b.readyState&&c.readyCallback()})),f=c.source||{},f.concatemoji?e(f.concatemoji):f.wpemoji&&f.twemoji&&(e(f.twemoji),e(f.wpemoji)))}(window,document,window._wpemojiSettings);img.wp-smiley,img.emoji{display:inline !important;border:none !important;box-shadow:none !important;height:1em !important;width:1em !important;margin:0
.07em !important;vertical-align:-0.1em !important;background:none !important;padding:0
!important}jQuery(function(){jQuery('.wpdm-popup').click(function(){tb_show(jQuery(this).html(),this.href+'&modal=1&width=600&height=400');return false;});jQuery('.haspass').click(function(){var url=jQuery(this).attr('href');var id=jQuery(this).attr('rel');var password=jQuery('#pass_'+id).val();jQuery.post('http://www.journaldev.com/',{download:id,password:password},function(res){if(res=='error'){jQuery('#wpdm_file_'+id+' .perror').html('Wrong Password');setTimeout("jQuery('#wpdm_file_"+id+" .perror').html('');",3000);return false;}else{location.href='http://www.journaldev.com/?wpdmact=process&did='+res;}});return false;});}).enews
.screenread{height:1px;left:-1000em;overflow:hidden;position:absolute;top:-1000em;width:1px}(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,'script','//www.google-analytics.com/analytics.js','ga');ga('create','UA-12171637-4','auto');ga('send','pageview');.site-title
a{background:url(http://cdn.journaldev.com/wp-content/uploads/2014/05/cropped-Final-JD-Logo.png) no-repeat !important}.simple-social-icons ul li a, .simple-social-icons ul li a:hover{background-color:#f6f5f2 !important;border-radius:3px;color:#aaa !important;border:0px
#fff solid !important;font-size:18px;padding:9px}.simple-social-icons ul li a:hover{background-color:#000 !important;border-color:#fff !important;color:#fff !important}.mctb-bar,.mctb-response,.mctb-close{background:#f7682c !important}.mctb-bar,.mctb-label,.mctb-close{color:#fff !important}.mctb-button{background:#096abf !important;border-color:#096abf !important}.mctb-email:focus{outline-color:#096abf !important}.mctb-button{color:#fff !important}JournalDevJava, Java EE, Android, Web Development Tutorials
[niko@dev1 tmp]$

How can I find out what is wrong here? You can reproduce this error if you copy the code and run it in your terminal. Also, I want to point out that when I manually copied the HTML source from the URL into a file and ran the test on that file, it worked, so I suspect there might be an issue with encoding. However, the article is written in English, so that doesn't make sense.

  • 写回答

0条回答 默认 最新

    报告相同问题?

    悬赏问题

    • ¥17 pro*C预编译“闪回查询”报错SCN不能识别
    • ¥15 微信会员卡接入微信支付商户号收款
    • ¥15 如何获取烟草零售终端数据
    • ¥15 数学建模招标中位数问题
    • ¥15 Python路径名过长报错 不知道什么问题
    • ¥15 深度学习中模型转换该怎么实现
    • ¥15 HLs设计手写数字识别程序编译通不过
    • ¥15 Stata外部命令安装问题求帮助!
    • ¥15 从键盘随机输入A-H中的一串字符串,用七段数码管方法进行绘制。提交代码及运行截图。
    • ¥15 Type-C母转母,插入认方向