dongyutan1703
dongyutan1703
2013-01-30 19:03

如何使用Simple-HTML-DOM提取完整的子链接?

The following is the basic code I use to extract sublinks from a page:

<?php
    include_once('simple_html_dom.php');
    /**
     * Collects the href attribute of every <a> element in the document loaded
     * from $target_url and prints the resulting list with var_dump().
     *
     * Each href is stored as returned by $link->href with no further
     * processing, so relative references (e.g. '/about', '?tab=hot') are not
     * resolved against $target_url.
     *
     * @param string $target_url Location handed to simple_html_dom::load_file().
     * @return void Nothing is returned; the collected links are only printed.
     */
    function extract_links($target_url)
    {   
        // Load the page into a simple_html_dom instance.
        $html = new simple_html_dom();
        $html->load_file($target_url);  
        // $i is the next index to write into $crawl.
        $i=0;
        $crawl =array();
        foreach($html->find('a') as $link)
        {
            // Keep the href exactly as reported; no URL resolution happens here.
            $crawl[$i] = $link->href;
            $i++;
        }
        // Output only: the list is dumped, not returned to the caller.
        var_dump($crawl);
    }
    // Example run against the Stack Overflow front page.
    extract_links('http://stackoverflow.com');
?>

The output is as follows:

array
  0 => string 'http://stackexchange.com' (length=24)
  1 => string '/users/login' (length=12)
  2 => string 'http://careers.stackoverflow.com' (length=32)
  3 => string 'http://chat.stackoverflow.com' (length=29)
  4 => string 'http://meta.stackoverflow.com' (length=29)
  5 => string '/about' (length=6)
  6 => string '/faq' (length=4)
  7 => string '/' (length=1)
  8 => string '/questions' (length=10)
  9 => string '/tags' (length=5)
  10 => string '/users' (length=6)
  11 => string '/badges' (length=7)
  12 => string '/unanswered' (length=11)
  13 => string '/questions/ask' (length=14)
  14 => string '?tab=interesting' (length=16)
  15 => string '?tab=featured' (length=13)
  16 => string '?tab=hot' (length=8)
  17 => string '?tab=week' (length=9)
  18 => string '?tab=month' (length=10)
  19 => string '/questions/14611052/basic-standalone-jpa-example-with-postgres-using-eclipse' (length=76)
  20 => string '/questions/tagged/eclipse' (length=25)
  21 => string '/questions/tagged/postgresql' (length=28)
  22 => string '/questions/tagged/jpa' (length=21)
  23 => string '/questions/14611052/basic-standalone-jpa-example-with-postgres-using-eclipse' (length=76)
  24 => string '/users/865448/tostao' (length=20)
  25 => string '/questions/14611172/unable-to-fully-print-a-page-containing-iframes-in-chrome' (length=77)
  26 => string '/questions/tagged/javascript' (length=28)
  27 => string '/questions/tagged/jquery' (length=24)
  28 => string '/questions/tagged/html' (length=22)
  29 => string '/questions/tagged/html5' (length=23)
  30 => string '/questions/tagged/google-chrome' (length=31)
  31 => string '/questions/14611172/unable-to-fully-print-a-page-containing-iframes-in-chrome' (length=77)
  32 => string '/users/962868/tejas' (length=19)
  33 => string '/questions/14609779/how-can-i-configure-bash-to-handle-crlf-shell-scripts' (length=73)
  34 => string '/questions/tagged/linux' (length=23)
  35 => string '/questions/tagged/windows' (length=25)
  36 => string '/questions/tagged/bash' (length=22)
  37 => string '/questions/tagged/line-endings' (length=30)
  38 => string '/questions/14609779/how-can-i-configure-bash-to-handle-crlf-shell-scripts/?lastactivity' (length=87)
  39 => string '/users/1899640/that-other-guy' (length=29)
  40 => string '/questions/14611169/using-one-socket-for-peer-to-peer-communication' (length=67)
  41 => string '/questions/tagged/sockets' (length=25)
  42 => string '/questions/tagged/p2p' (length=21)
  43 => string '/questions/14611169/using-one-socket-for-peer-to-peer-communication' (length=67)
  44 => string '/users/911651/xsnrg' (length=19)
  45 => string '/questions/14611166/possible-mistake-in-ios-dev-guide' (length=53)
  46 => string '/questions/tagged/iphone' (length=24)
  47 => string '/questions/tagged/ios' (length=21)
  48 => string '/questions/tagged/objective-c' (length=29)
  49 => string '/questions/14611166/possible-mistake-in-ios-dev-guide' (length=53)
  50 => string '/users/107715/matt-n' (length=20)
  51 => string '/questions/14611163/how-to-use-dispatcher-in-wpf-to-make-a-timer' (length=64)
  52 => string '/questions/tagged/wpf' (length=21)
  53 => string '/questions/tagged/timer' (length=23)
  54 => string '/questions/tagged/dispatcher' (length=28)
  55 => string '/questions/14611163/how-to-use-dispatcher-in-wpf-to-make-a-timer' (length=64)
  56 => string '/users/1741800/nashat' (length=21)
  57 => string '/questions/14610879/how-can-i-handle-an-access-violation-in-visual-studio-c' (length=75)
  58 => string '/questions/tagged/visual-c%2b%2b' (length=32)
  59 => string '/questions/tagged/exception-handling' (length=36)
  60 => string '/questions/tagged/access-violation' (length=34)
  61 => string '/questions/tagged/structured-exception' (length=38)
  62 => string '/questions/14610879/how-can-i-handle-an-access-violation-in-visual-studio-c/?lastactivity' (length=89)
  63 => string '/users/901812/big-endian' (length=24)
  64 => string '/questions/14611162/mvc-condintional-authorization' (length=50)
  65 => string '/questions/tagged/c%23' (length=22)
  66 => string '/questions/tagged/asp.net-mvc' (length=29)
  67 => string '/questions/tagged/asp.net-mvc-4' (length=31)
  68 => string '/questions/tagged/authorization' (length=31)
  69 => string '/questions/14611162/mvc-condintional-authorization' (length=50)
  70 => string '/users/644969/cadrell0' (length=22)
  71 => string '/questions/14611160/get-customer-role-nopcommerce' (length=49)
  72 => string '/questions/tagged/c%23' (length=22)
  73 => string '/questions/tagged/razor' (length=23)
  74 => string '/questions/tagged/nopcommerce' (length=29)
  75 => string '/questions/14611160/get-customer-role-nopcommerce' (length=49)
  76 => string '/users/1378841/mlg74' (length=20)
  77 => string '/questions/14611158/iframe-resizing-nested-in-gridview' (length=54)
  78 => string '/questions/tagged/resize' (length=24)
  79 => string '/questions/14611158/iframe-resizing-nested-in-gridview' (length=54)
  80 => string '/users/2026451/satish-patil' (length=27)
  81 => string '/questions/14611157/php-how-to-check-the-value-got-this-word-from-a-var' (length=71)
  82 => string '/questions/tagged/php' (length=21)
  83 => string '/questions/tagged/preg-match' (length=28)
  84 => string '/questions/tagged/strpos' (length=24)
  85 => string '/questions/14611157/php-how-to-check-the-value-got-this-word-from-a-var' (length=71)
  86 => string '/users/963414/samual99' (length=22)
  87 => string '/questions/14611155/how-to-get-the-coordinates-of-boundries-of-drawable-on-the-mapview' (length=86)
  88 => string '/questions/tagged/android' (length=25)
  89 => string '/questions/tagged/google-maps' (length=29)
  90 => string '/questions/14611155/how-to-get-the-coordinates-of-boundries-of-drawable-on-the-mapview' (length=86)
  91 => string '/users/1520564/blubar' (length=21)
  92 => string '/questions/14611153/why-css-is-empty-when-ssl-is-on-and-appcache-is-enabled-ipad-safari' (length=87)
  93 => string '/questions/tagged/css' (length=21)
  94 => string '/questions/tagged/ipad' (length=22)
  95 => string '/questions/tagged/ssl' (length=21)
  96 => string '/questions/tagged/mobile-safari' (length=31)
  97 => string '/questions/tagged/html5-appcache' (length=32)
  98 => string '/questions/14611153/why-css-is-empty-when-ssl-is-on-and-appcache-is-enabled-ipad-safari' (length=87)
  99 => string '/users/2026375/twoface' (length=22)
  100 => string '/questions/14611149/laravel-how-to-temporarily-store-eloquent-models-in-db-without-a-proper-schem' (length=97)
  101 => string '/questions/tagged/php' (length=21)
  102 => string '/questions/tagged/laravel' (length=25)
  103 => string '/questions/14611149/laravel-how-to-temporarily-store-eloquent-models-in-db-without-a-proper-schem' (length=97)
  104 => string '/users/291557/duality' (length=21)
  105 => string '/questions/13928812/xmlserializer-generateserializer-and-collections' (length=68)
  106 => string '/questions/tagged/c%23' (length=22)
  107 => string '/questions/tagged/xml-serialization' (length=35)
  108 => string '/questions/13928812/xmlserializer-generateserializer-and-collections/?lastactivity' (length=82)
  109 => string '/users/1200614/phil' (length=19)
  110 => string '/questions/14611145/keep-buttons-in-view-when-keyboard-opens-android' (length=68)
  111 => string '/questions/tagged/android' (length=25)
  112 => string '/questions/tagged/keyboard' (length=26)
  113 => string '/questions/tagged/resize' (length=24)
  114 => string '/questions/tagged/window' (length=24)
  115 => string '/questions/tagged/views' (length=23)
  116 => string '/questions/14611145/keep-buttons-in-view-when-keyboard-opens-android' (length=68)
  117 => string '/users/1137413/725623452362' (length=27)
  118 => string '/questions/14611144/ssdp-discovery-from-a-browser' (length=49)
  119 => string '/questions/tagged/silverlight' (length=29)
  120 => string '/questions/tagged/flash' (length=23)
  121 => string '/questions/14611144/ssdp-discovery-from-a-browser' (length=49)
  122 => string '/users/191882/legege' (length=20)
  123 => string '/questions/14611143/how-to-syncrhonize-on-site-in-memory-no-sql-datasources-with-central-database-in' (length=100)
  124 => string '/questions/tagged/architecture' (length=30)
  125 => string '/questions/tagged/nosql' (length=23)
  126 => string '/questions/tagged/java-ee-6' (length=27)
  127 => string '/questions/tagged/in-memory-database' (length=36)
  more elements...

Now consider the '/about' sublink in the array. I want it to be displayed as 'https://stackoverflow.com/about'. Why is only part of the link returned in some cases, while the complete link is returned in others? Also, some links start with a '?' sign. How can I sanitize these links?

EDIT: Consider "http://en.wikipedia.org/wiki/Web_crawler". Now if I perform extract_links on it, I get a sublink like this "http://en.wikipedia.org/wiki/Web_crawler/wiki/Web_search_engine" which is invalid and most of the links are of this format. The correct link is "http://en.wikipedia.org/wiki/Web_search_engine". And I am using this function in another program which will pass an array of links so I cannot keep the if conditions static. The following is the code fragment I am using now:

// Prefix hrefs that begin with '/' or '?' with $target_url; all other hrefs
// are left unchanged.
foreach($html->find('a') as $link)
{   
    $href = $link->href;
    // The first character of the href decides how it is prefixed.
    $fchr = substr($href, 0, 1);
    if ($fchr === '/')
    {
        // NOTE(review): the href is appended to the whole $target_url, including
        // any path it already carries, not just to its scheme and host.
        $href = $target_url.$href;
    }
    else if ($fchr === '?')
    {
        // NOTE(review): a '/' is inserted between $target_url and the query string.
        $href = $target_url.'/'. $href;
    }
    // NOTE(review): within the fragment shown, $href is not stored or used after
    // this point.
}
  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

3条回答

  • dsnd7200 dsnd7200 8年前

    @pguardiario's comment

    As suggested in his comment phpUri is the perfect solution for converting relative URLs to absolute. You can find it here,

    点赞 评论 复制链接分享
  • dpp78272 dpp78272 8年前

    In the HTML source, this a element is defined as <a href="/about">about</a>, so the parser returns the existing href value, /about, unchanged.

    If you want the links to include the scheme and domain, try this (internal links may start with / or ?):

    // Walk every anchor and turn root-relative ('/...') and query-only ('?...')
    // hrefs into full stackoverflow.com URLs; everything else is kept as is.
    foreach ($html->find('a') as $anchor) {
        $href = $anchor->href;
        // Dispatch on the first character of the href.
        switch (substr($href, 0, 1)) {
            case '/':
                // Root-relative path: the href already supplies the leading slash.
                $href = 'http://stackoverflow.com'. $href;
                break;
            case '?':
                // Query-only reference: attach it to the site root.
                $href = 'http://stackoverflow.com/'. $href;
                break;
        }
        $crawl[] = $href;
    }
    

    PS: You don't need $i when appending to a non-associative array; $array[] = ... is enough.

    UPDATE

    Assuming you have trouble with "internal links" (that is, links that may not contain http or the site URL): I actually don't know how else to detect internal links. I think the code below catches all of these: 'foo', '/foo', '?foo', '#foo', '../foo', '/../foo'. However, I suppose these two do not seem valid: '../foo', '/../foo'.

    // Stricter alternative: any href that does not already start with an
    // http://, https://, ftp:// or ftps:// scheme (case-insensitive) is treated
    // as site-relative and prefixed with the site root.
    // The scheme alternatives are spelled out so that malformed prefixes such
    // as 'htt://' or 'ft://' are no longer mistaken for absolute URLs.
    // NOTE(review): protocol-relative ('//host/...') and non-HTTP references
    // ('mailto:', '#fragment') are still prefixed like any other relative href.
    if (!preg_match('~^(?:https?|ftps?)://~i', trim($href))) {
        // Strip leading slashes so the joined URL has exactly one separator.
        $href = 'http://www.site.com/'. ltrim($href, '/');
    }
    
    点赞 评论 复制链接分享
  • drd99007 drd99007 8年前

    Any link starting with "/" is an absolute path from the doc root. To get the complete URL you would need to prepend the hostname in which that link was found. For relative links, such as the "?tab=etc", you will need to prepend the complete URL in which the link was found. If you want to ignore the query string links ("?tab=etc") use a regular expression to do so.

    点赞 评论 复制链接分享