
I want to estimate the effect of news on Dow Jones quotes. For that I wrote a Python HTML parser using the BeautifulSoup library: it extracts each article and stores it in an XML file for further analysis with the NLTK library. The code below does the required task, but it is very slow. How can I speed up the parsing?

Here is the code of the HTML parser:

import urllib2 
import re 
import xml.etree.cElementTree as ET 
import nltk 
from bs4 import BeautifulSoup 
from datetime import date 
from dateutil.rrule import rrule, DAILY 
from nltk.corpus import stopwords 
from collections import defaultdict 

def main_parser(): 
    #starting date 
    a = date(2014, 3, 27) 
    #ending date 
    b = date(2014, 3, 27) 
    articles = ET.Element("articles") 
    f = open('~/Documents/test.xml', 'w') 
    #loop through the links and per each link extract the text of the article, store the latter at xml file 
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        #use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")
        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        #get text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii','ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles) 
    tree.write("~/Documents/test.xml","utf-8") 

#getting the article text from the specific url
def extract_article(url): 
    plain_text = "" 
    html = urllib2.urlopen(url).read() 
    soup = BeautifulSoup(html, "html5lib") 
    tag = soup.find_all("p") 
    #replace all html tags 
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag)) 
    plain_text = plain_text.replace(", ,", "") 
    return str(plain_text) 

def remove_stop_words(text): 
    text=nltk.word_tokenize(text) 
    filtered_words = [w for w in text if not w in stopwords.words('english')] 
    return ' '.join(filtered_words) 

Answer


A few fixes can be applied (without changing the modules you are currently using):

  • Use the lxml parser instead of html5lib; it is much faster.
  • Parse only the relevant part of the document with SoupStrainer (note that html5lib does not support SoupStrainer, so it always parses the whole document, slowly).

Here is what the code looks like after the changes. A quick performance test shows at least a 3x improvement:

import urllib2 
import xml.etree.cElementTree as ET 
from datetime import date 

from bs4 import SoupStrainer, BeautifulSoup 
import nltk 
from dateutil.rrule import rrule, DAILY 
from nltk.corpus import stopwords 


def main_parser(): 
    a = b = date(2014, 3, 27) 
    articles = ET.Element("articles") 
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime(
            "%d") + ".html"

        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if not 'video' in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles) 
    tree.write("~/Documents/test.xml", "utf-8") 


def extract_article(url): 
    paragraphs = SoupStrainer('p') 
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs) 
    return soup.text 


def remove_stop_words(text): 
    text = nltk.word_tokenize(text) 
    filtered_words = [w for w in text if not w in stopwords.words('english')] 
    return ' '.join(filtered_words) 

Note that I removed the regex processing from extract_article(); it looks like you can get the full text straight from the p tags.

I may have introduced some problems, so please check that everything is correct.


Another solution would be to use lxml for everything, from parsing (replacing BeautifulSoup) to creating the XML (replacing xml.etree.ElementTree).
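Roughly, the idea looks like the sketch below: lxml.html parses the archive page and lxml.etree builds the output XML. This is only an illustration, not tested against the live site; the URL and the element names follow the code above, and the exact class match for the headline divs is an assumption.

# Minimal sketch of the "lxml everywhere" idea (untested; class match is an assumption)
import urllib2
from lxml import html, etree

def parse_day(url, articles):
    # lxml.html parses straight from the file-like object returned by urlopen
    page = html.parse(urllib2.urlopen(url))
    day = etree.SubElement(articles, "article_date")
    for link in page.xpath('//div[@class="headlineMed"]/a'):
        href = link.get("href")
        if href is None or "video" in href:
            continue
        item = etree.SubElement(day, "article_time")
        etree.SubElement(item, "article_name").text = link.text_content()
        etree.SubElement(item, "article_link").text = href

articles = etree.Element("articles")
parse_day("http://www.reuters.com/resources/archive/us/20140327.html", articles)
etree.ElementTree(articles).write("test.xml", encoding="utf-8")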


Yet another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework. It is simple and very fast, and it comes with every battery you can imagine: link extractors, XML exporters, database pipelines and so on. Worth looking into.
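For a rough idea, a spider for the same archive pages could look like the sketch below. It is written against a recent Scrapy release, and the spider name and the yielded fields are illustrative rather than taken from the code above; the main speed-up comes from Scrapy downloading the article pages concurrently.

# Illustrative spider; run e.g. with: scrapy runspider reuters_spider.py -o articles.xml
import scrapy

class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        # one request per headline; Scrapy schedules these downloads concurrently
        for link in response.css("div.headlineMed a"):
            href = link.xpath("@href").extract_first()
            title = link.xpath("text()").extract_first()
            if href and "video" not in href:
                yield scrapy.Request(href, callback=self.parse_article,
                                     meta={"headline": title})

    def parse_article(self, response):
        # plain dicts are enough; the feed exporter (-o articles.xml) serializes them
        yield {
            "headline": response.meta["headline"],
            "link": response.url,
            "text": " ".join(response.css("p::text").extract()),
        }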

Hope that helps.