2014-05-16 103 views
2

我有這個網頁(XPath 解析 HTML 的問題):

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
<html xmlns="http://www.w3.org/1999/xhtml"> 
    <!-- Page overview: a header with a menu, a middle section holding a table of mirror links, and a footer. -->
    <head> 
     <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
     <title>Container Title | foldr.us</title> 
     <link rel="stylesheet" type="text/css" media="all" href="base.css" /> 

     <script type="text/javascript" src="js/jquery-latest.js"></script> 
     <script type="text/javascript" src="js/hoverIntent.js"></script> 
     <script type="text/javascript" src="js/superfish.js"></script> 

     <!-- menu scripts --> 

     <!-- cycle scripts --> 
     <script type="text/javascript" src="js/jquery.cycle.all.js"></script> 
     <script type="text/javascript" src="js/jquery.easing.min.js"></script> 
     <!-- cycle scripts --> 

     <!-- newsticker scripts --> 
     <script type="text/javascript" src="js/jquery.innerfade.js"></script> 
     <script type="text/javascript" src="js/custom.js"></script> 
     <!-- newsticker scripts --> 

     <!-- twitter scripts --> 
     <script type="text/javascript" src="js/jquery.tweetable.js"></script> 
     <!-- twitter scripts --> 

     <!--[if gt IE 5.5]> 
     <script src="js/DD_belatedPNG.js" type="text/javascript"></script> 
     <script> DD_belatedPNG.fix('.*'); </script> 
     <![endif]--> 
<!-- NOTE(review): judging by the file names, this loads jQuery a second time (js/jquery-latest.js is already included above); confirm whether both are needed. -->
<script src="http://code.jquery.com/jquery-1.10.1.min.js"></script> 
    <script src="http://wmonkey.org/media/likefoldr.js"></script> 
    </head> 
    <body class="home"> 
     <!-- First div child of body: logo and main menu. -->
     <div class="container_12 main_page"> 
      <div class="grid_3"> 
       <div class="logo"><a href="index.php"><img src="images/blank.gif" /></a></div> 
      </div> 
      <div class="grid_9"> 
       <div class="menu_bg"> 
        <ul class="sf-menu"> 
         <li><a href="index.php">Home</a></li> 
         <li><a href="contact.php">Kontakt</a></li> 
         <li><a href="http://www.relink.us/earnmoney.php" target="_blank">Earn money</a></li> 
         <!-- NOTE(review): the second language-switch li on the line after next is never closed before the closing ul tag. -->
         <li> 
         <a href="/foldr.php?id=82dca8a8506023bfc4ac08938&hl=de"><img src="images/de.gif" alt="de" />&nbsp;</a></li><li><a href="/foldr.php?id=82dca8a8506023bfc4ac08938&hl=en"><img src="images/us.gif" alt="us" />&nbsp;</a> 
        </ul> 
       </div> 
      </div> 
      <div class="clear"></div> 
     </div> 
     <!-- Second div child of body: a float-clearing spacer. -->
     <div class="clear"></div> 
     <!--Middle content Section starts--> 
     <!-- Third div child of body: holds div.subpage (first child div) and div.mid_content (second child div). -->
     <div class="container_12 mid_bg_home"> 
      <div class="subpage"> 
       <h2>Container Title</h2> 
       <div class="subpage_text"> 
        <strong>Foldr ID</strong><br />82dca8a8506023bfc4ac08938    </div> 
      </div> 

      <div class="mid_content"> 


<!-- NOTE(review): this center element has no matching closing tag anywhere in the document. -->
<center> 

<!--/* propellerads */--> 

<iframe id='52811b97a4d79' name='52811b97a4d79' src='http://ad.propellerads.com/afr.php?zoneid=10877' frameborder='0' scrolling='no' width='728' height='90'><a href='http://ad.propellerads.com/ck.php?n=52811b97a4d79' target='_blank'><img src='http://ad.propellerads.com/avw.php?zoneid=&n=52811b97a4d79' border='0' alt='' /></a></iframe> 
<!--/* propellerads */--> 

       <!-- The table below is written without a tbody element: its tr rows are direct children of table in this markup. -->
       <h2 style="text-align:center;">W&auml;hlen Sie ihren bevorzugten Mirror.</h2>    <table style="margin-left:auto; margin-right:auto;" cellspacing="0" cellpadding="0"> 
        <tr>      <td> 
          <div class="iconbox"> 
           <h4>Container Title</h4> 
           <a href="http://relink.us/view.php?id=ad3a0748c3ccf64eb74fef9c11d3c6" name="Relink.us" target="_blank"><img src="images/hoster/ul.to.png" alt="ul.to" /></a> 
           <p style="text-align: center;"><strong>Hoster: </strong>ul.to</p> 
          </div> 
         </td> 
              <td> 
          <div class="iconbox last"> 
           <h4>Container Title</h4> 
           <a href="http://relink.us/view.php?id=1f2d3abddeea67f98d83b83404447b" name="Relink.us" target="_blank"><img src="images/hoster/share-online.biz.png" alt="share-online.biz" /></a> 
           <p style="text-align: center;"><strong>Hoster: </strong>share-online.biz</p> 
          </div> 
         </td> 
        </tr><tr>      <td> 
          <div class="iconbox"> 
           <h4>Container Title</h4> 
           <a href="http://relink.us/view.php?id=79abcd910faaad8e7c61ac1a722b4d" name="Relink.us" target="_blank"><img src="images/hoster/zippyshare.com.png" alt="zippyshare.com" /></a> 
           <p style="text-align: center;"><strong>Hoster: </strong>zippyshare.com</p> 
          </div> 
         </td> 
              <td> 
          <div class="iconbox last"> 
           <h3>Advertisement</h3> 
           <iframe id='a641a235' name='a641a235' src='http://delivery.adtwothree.com/afr.php?zoneid=11&amp;cb=INSERT_RANDOM_NUMBER_HERE' frameborder='0' scrolling='no' width='250' height='250'><a href='http://delivery.adtwothree.com/ck.php?n=a68edb28&amp;cb=INSERT_RANDOM_NUMBER_HERE' target='_blank'><img src='http://delivery.adtwothree.com/avw.php?zoneid=11&amp;cb=INSERT_RANDOM_NUMBER_HERE&amp;n=a68edb28' border='0' alt='' /></a></iframe>       </div> 
         </td> 
           </tr>    </table> 
      </div> 
      <!--Middle content Section ends--> 
     </div> 

     <script type='text/javascript' src='http://onclickads.net/apu.php?zoneid=10875'></script> 
     <div class="clear">&nbsp;</div> 
     <!--Footer content Section starts--> 
     <div class="container_12"> 
      <div class="footer"> 
       <div class="grid_6 footer_left"> 
        <span class="logo_small">&nbsp;</span> 
        <p>&copy; 2010-2014 <a href="">www.foldr.us</a></p> 
       </div> 
       <div class="grid_6 footer_right"> 
        <h3>Feedback</h3> 
        <p>Anregung, Kritik, Vorschl&auml;ge?!</p> 
        <!-- NOTE(review): the closing p tag at the end of the next line has no matching opening p tag. -->
        <strong>Email :</strong> <a href="mailto:yxc">[email protected]</a></p> 
       </div> 
      </div> 
     </div> 
    <!--Footer content Section ends--> 
    <!-- Google Analytics (ga.js) loader: queues the account id and a page view, then injects the tracking script asynchronously. -->
    <script type="text/javascript"> 

     var _gaq = _gaq || []; 
     _gaq.push(['_setAccount', 'UA-8662050-13']); 
     _gaq.push(['_trackPageview']); 

     (function() { 
     var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; 
     ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; 
     var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); 
     })(); 

    </script> 
<div id="fb-root"></div> 
    </body> 
</html> 

我的 XPath 查詢是這樣的:/html/body/div[3]/div[2]/center/table/tbody/tr[1]/td[1]/div/a/@href 。我用下面這段 PHP 代碼來解析它,但它打印出的是 NULL,而不是我想要的鏈接。我的錯誤在哪裏?

// Parse the HTML held in $fold and print the value of the first node matched by an absolute XPath.
$DOM = new DOMDocument; 

// Keep libxml parse errors in an internal buffer instead of emitting them as PHP warnings.
libxml_use_internal_errors(true); 
$DOM->loadHTML($fold); //$fold is the html document. 

$xpath = new DOMXpath($DOM); 
// NOTE(review): this path contains a tbody step; it only matches if the parsed document really has a tbody element under the table. Confirm against the markup.
$items = $xpath->query("/html/body/div[3]/div[2]/center/table/tbody/tr[1]/td[1]/div/a/@href"); 

// NOTE(review): item(0) returns null when the query matched nothing, so there is no node to read nodeValue from in that case.
echo $items->item(0)->nodeValue; 

回答

0

您的源代碼中沒有 tbody 元素(瀏覽器在構建 DOM 時會自動補上 tbody,但 PHP 的 DOMDocument 不會,所以從瀏覽器開發者工具複製的路徑在這裏匹配不到任何節點)。

您可以從 XPath 表達式中去掉 tbody 這一步,或者用後代(//)定位步驟替換它(這樣一來,無論源代碼中 table 和 tr 之間是否定義了 tbody 或其他任何標籤,都能匹配):

/html/body/div[3]/div[2]/center/table//tr[1]/td[1]/div/a/@href 
0

如果你根據目標值本身的特定線索來查找,而不是依賴從文檔頂部開始的確切路徑,你的代碼在 HTML 源代碼發生變化後更有可能繼續正常工作。

就你的情況而言,我建議使用這個 XPath:

//a[img/@alt = 'ul.to']/@href 

它的意思是:尋找所有含有 alt 屬性值爲「ul.to」的 <img> 子元素的 <a> 標籤,然後取得這些 <a> 標籤的 href 屬性。