2016-10-21 64 views
3

我想爲網頁內容寫一個摘要。爲此,我需要去除網頁中所有不相關的文本和數據。如何從網頁中提取主要內容?

我已經試過 boilerpipe,但是文本提取效果不好。提取結果在這裏(here),你可以看到其中有很多不相關的文字。

我也嘗試過用 JSoup 去除不相關的數據,刪除了頁眉、頁腳、外部鏈接等,但結果同樣不符合要求。

// Download the page and strip everything that is unlikely to be main content.
// Jsoup.connect() requires an absolute http/https URL, so the scheme is mandatory.
Document doc = Jsoup.connect("http://www.anyurl.com").get();

// Drop structural / non-textual elements by tag name.
doc.head().remove();
doc.getElementsByTag("header").remove();
doc.getElementsByTag("footer").remove();
doc.getElementsByTag("form").remove();
doc.getElementsByTag("table").remove();
doc.getElementsByTag("meta").remove();
doc.getElementsByTag("img").remove();
doc.getElementsByTag("a").remove();
doc.getElementsByTag("br").remove();

// Drop elements whose class is exactly one of these.
doc.getElementsByClass("tags").remove();
doc.getElementsByClass("copyright").remove();
doc.getElementsByClass("widget").remove();

// Drop elements whose class attribute merely contains one of these fragments.
// Each attribute selector must be closed with ']'.
doc.select("div[class*=foot]").remove();
doc.select("div[class*=tag]").remove();
doc.select("div[class*=Loading]").remove();
doc.select("div[class*=Widget]").remove();
doc.select("div[class*=Head]").remove();
doc.select("div[class*=menu]").remove();
doc.select("p[class*=link]").remove();

// Collect the text of whatever paragraphs and divs are left.
Elements paragraphs = doc.select("p");
Elements divs = doc.select("div");

// NOTE(review): formattedOutput is presumably declared elsewhere - confirm.
// NOTE(review): a <p> nested inside a <div> contributes its text to both
// operands, so the concatenation can contain the same passage more than once.
formattedOutput = paragraphs.text() + divs.text();

有人能建議我該如何做到這一點嗎?除了 boilerpipe 以外,還有其他 Java 庫可以用嗎?

+0

我看了鏈接中提取出的主要內容,並沒有看到「很多無關的文字」。我認爲僅僅詢問如何刪除更多不相關的文本不會有什麼進展,因爲相關與否可能是個見仁見智的問題。請給我們一個具體的說明,講清楚你想做什麼。或者,解決辦法也許只是你自己需要先有一個更具體的想法。 – ajb

+1

**1)** 也許在繼續之前,您應該先看看他們的[使用條款](http://www.medicalnewstoday.com/terms):「除非您事先與我們聯繫並獲得我們的書面許可,否則不得放置或使用任何軟件將《Medical News Today》(醫學新聞)的整篇文章在線發佈到您的網站上——這些完整的使用條款將適用。」 **2)** 也許可以使用他們提供的某個 [newsfeeds](http://www.medicalnewstoday.com/newsfeeds-rss),它可能已經提供了您正在尋找的信息。 – SubOptimal

回答

0

我不瞭解 Java,但你可以使用下面的 PHP 類從網頁中提取主要內容:

<?php 

/**
 * Heuristic main-content extractor for HTML pages.
 *
 * Works in two passes over a DOM tree:
 *  1. HeuristicRemove() drops tags that never carry content (scripts, forms,
 *     images, comments, ...) and, optionally, gathers page-wide statistics.
 *  2. ContainerRemove() drops layout containers whose link-to-word ratio is
 *     too high or whose text is too short.
 */
class ContentExtractor {

    /** Tag names treated as layout containers; only these can be deleted by ContainerRemove(). */
    public $container_tags = array(
        'div', 'table', 'td', 'th', 'tr', 'tbody', 'thead', 'tfoot', 'col',
        'colgroup', 'ul', 'ol', 'html', 'center', 'span'
    );

    /** Node names that HeuristicRemove() always deletes. */
    public $removed_tags = array(
        'script', 'noscript', 'style', 'form', 'meta', 'input', 'iframe', 'embed', 'hr', 'img',
        '#comment', 'link', 'label'
    );

    /** Container tags that are exempt from the minimum length / minimum words check. */
    public $ignore_len_tags = array(
        'span'
    );

    /** A container is deleted when its links/words ratio exceeds this value. */
    public $link_text_ratio = 0.04;
    /** A container is deleted when its text is shorter than this many bytes. */
    public $min_text_len = 20;
    /** A container is deleted when it has fewer words than this. */
    public $min_words = 0;

    /** Statistics gathered by HeuristicRemove() for the document being processed. */
    public $total_links = 0;
    public $total_unlinked_words = 0;
    public $total_unlinked_text = '';
    public $text_blocks = 0;

    /** DOMDocument of the most recently processed page. */
    public $tree = null;
    public $unremoved = array();

    /**
     * Normalises text for counting: turns non-breaking and other Unicode
     * spaces into plain spaces, decodes HTML entities and trims the result.
     *
     * @param string $text UTF-8 text.
     * @return string
     */
    public function sanitize_text($text){
        $text = str_ireplace('&nbsp;', ' ', $text);
        // DOM text is always UTF-8, so decode entities to UTF-8 explicitly
        // instead of relying on the configured default charset.
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Only complete UTF-8 sequences are listed. Replacing a lone "\xA0"
        // byte would destroy every character whose encoding contains it
        // (for example "à" = "\xC3\xA0").
        $utf_spaces = array("\xC2\xA0", "\xE1\x9A\x80", "\xE2\x80\x83",
            "\xE2\x80\x82", "\xE2\x80\x84", "\xE2\x80\xAF");
        $text = str_replace($utf_spaces, ' ', $text);

        return trim($text);
    }

    /**
     * Extracts the main content of an HTML page.
     *
     * @param string     $text    HTML source.
     * @param float|null $ratio   Link/word ratio threshold; null derives it from
     *                            the page (page-wide links/words ratio times 1.3).
     * @param int|null   $min_len Minimum text length of a kept container; null
     *                            derives it as the average length of a text block.
     *                            An explicit 0 is honoured for both thresholds.
     * @return string|false The reduced HTML, or false when the input is empty
     *                      or cannot be parsed.
     */
    public function extract($text, $ratio = null, $min_len = null){
        $text = (string) $text;
        // loadHTML() rejects an empty string (ValueError on PHP 8).
        if (trim($text) === '') return false;

        // Start from clean statistics so that an instance can be reused.
        $this->total_links = 0;
        $this->total_unlinked_words = 0;
        $this->total_unlinked_text = '';
        $this->text_blocks = 0;

        $this->tree = new DOMDocument();

        // Real-world HTML is rarely valid; "@" silences the parser warnings.
        if (!@$this->tree->loadHTML($text)) return false;

        $root = $this->tree->documentElement;
        if ($root === null) return false;

        $do_stats = ($ratio === null) || ($min_len === null);
        $this->HeuristicRemove($root, $do_stats);

        if ($ratio === null) {
            $this->total_unlinked_text = $this->sanitize_text($this->total_unlinked_text);
            $this->total_unlinked_words = $this->count_words(
                $this->total_unlinked_text,
                '/[\s\r\n\t\|?!.,]+/'
            );
            // Without any words the current threshold is kept.
            if ($this->total_unlinked_words > 0) {
                $this->link_text_ratio = $this->total_links / $this->total_unlinked_words;
                $this->link_text_ratio *= 1.3;
            }
        } else {
            $this->link_text_ratio = $ratio;
        }

        if ($min_len === null) {
            // A page without text blocks keeps the current threshold instead
            // of dividing by zero.
            if ($this->text_blocks > 0) {
                $this->min_text_len = strlen($this->total_unlinked_text) / $this->text_blocks;
            }
        } else {
            $this->min_text_len = $min_len;
        }

        $this->ContainerRemove($root);

        return $this->tree->saveHTML();
    }

    /**
     * First pass: recursively deletes children whose node name is listed in
     * $removed_tags and optionally collects statistics.
     *
     * With $do_stats enabled it counts <a> elements, concatenates the text of
     * text nodes that are not direct children of an <a>, and counts one text
     * block per node that has at least one such text child.
     *
     * @param DOMNode $node
     * @param bool    $do_stats
     * @return bool True when $node itself must be removed by its parent.
     */
    public function HeuristicRemove($node, $do_stats = false){
        if (in_array($node->nodeName, $this->removed_tags, true)){
            return true;
        }

        if ($do_stats && $node->nodeName === 'a') {
            $this->total_links++;
        }
        $found_text = false;

        // Removal is deferred so the live child list is not modified while iterating.
        $nodes_to_remove = array();

        if ($node->hasChildNodes()){
            foreach ($node->childNodes as $child){
                if ($this->HeuristicRemove($child, $do_stats)) {
                    $nodes_to_remove[] = $child;
                } else if ($do_stats && ($node->nodeName !== 'a') && ($child->nodeName === '#text')) {
                    // nodeValue is this node's own text; wholeText would also
                    // include adjacent text nodes and count them repeatedly.
                    $this->total_unlinked_text .= $child->nodeValue;
                    if (!$found_text){
                        $this->text_blocks++;
                        $found_text = true;
                    }
                }
            }
            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }
        }

        return false;
    }

    /**
     * Second pass: recursively deletes container children that look like
     * navigation or boilerplate and reports statistics for $node.
     *
     * A node listed in $container_tags is flagged for deletion when its
     * links/words ratio exceeds $link_text_ratio, or (unless it is listed in
     * $ignore_len_tags) when its text is shorter than $min_text_len or it has
     * fewer than $min_words words.
     *
     * @param DOMNode|null $node
     * @return array|int array(link count, word count, text length,
     *                   'delete' => bool); the integer 0 for a null node.
     */
    public function ContainerRemove($node){
        if (is_null($node)) return 0;
        $link_cnt = 0;
        $word_cnt = 0;
        $text_len = 0;
        $delete = false;
        $my_text = '';

        // Containers without any words count as "all links".
        $ratio = 1;

        $nodes_to_remove = array();
        if ($node->hasChildNodes()){
            foreach ($node->childNodes as $child){
                $data = $this->ContainerRemove($child);

                if ($data['delete']) {
                    $nodes_to_remove[] = $child;
                } else {
                    // Only surviving children contribute to the text length.
                    $text_len += $data[2];
                }

                $link_cnt += $data[0];

                if ($child->nodeName === 'a') {
                    $link_cnt++;
                } else {
                    // After the first pass removed elements, text nodes may be
                    // adjacent; nodeValue avoids counting their text twice.
                    if ($child->nodeName === '#text') $my_text .= $child->nodeValue;
                    $word_cnt += $data[1];
                }
            }

            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }

            $my_text = $this->sanitize_text($my_text);

            $word_cnt += $this->count_words($my_text, '/[\s\r\n\t\|?!.,\[\]]+/');
            $text_len += strlen($my_text);
        }

        if (in_array($node->nodeName, $this->container_tags, true)){
            if ($word_cnt > 0) $ratio = $link_cnt / $word_cnt;

            if ($ratio > $this->link_text_ratio){
                $delete = true;
            }

            if (!in_array($node->nodeName, $this->ignore_len_tags, true)) {
                if (($text_len < $this->min_text_len) || ($word_cnt < $this->min_words)) {
                    $delete = true;
                }
            }
        }

        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }

    /**
     * Counts the non-empty pieces of $text after splitting on $pattern.
     *
     * @param string $text
     * @param string $pattern PCRE pattern matching the word separators.
     * @return int
     */
    private function count_words($text, $pattern){
        // PREG_SPLIT_NO_EMPTY drops empty pieces but keeps a word such as "0".
        $words = preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);

        return ($words === false) ? 0 : count($words);
    }

}

/****************************
    Simple usage example
*****************************/

// Download the page. file_get_contents() returns false when the request fails,
// and that value must not be handed to the extractor.
$html = file_get_contents('http://en.wikipedia.org/wiki/Shannon_index');
if ($html === false) {
    die('Could not download the page.');
}

$extractor = new ContentExtractor();

// extract() returns false when the HTML cannot be parsed.
$content = $extractor->extract($html);
if ($content === false) {
    die('Could not parse the downloaded HTML.');
}

echo $content;

?>