
Python multi-domain crawler: InvalidSchema exception

This is my code. My goal is to crawl multiple domains; I set the domains in a url array, but the crawl does not work.

The code finds the links, but does not parse or crawl them.

This is the output when I run the code: ('Toplam link sayisi : ', 387) ('Tekil linkler: ', 146)

# -*- coding: utf-8 -*- 
import requests 
from bs4 import BeautifulSoup 
import codecs 

headers = { 
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)", 
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
    "accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3", 
    "accept-encoding": "gzip,deflate,sdch", 
    "accept-language": "tr,tr-TR,en-US,en;q=0.8", 
} 

def haber_oku(haber_url):
    r = requests.get(haber_url, headers=headers)
    if r.status_code != 200:
        return
    soup = BeautifulSoup(r.content)
    result = soup.find("div", {'itemprop': 'articleBody'})
    if result:
        return result.get_text()
    else:
        result = soup.find("div", {'itemprop': 'description'})
        if result:
            return result.get_text()
    return

def scrape_hurriyet(keywords, detay_goster, url):

    if len(keywords) > 0:
        keywords = keywords.split(',')
    s = 0

    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print("request reddedildi")
        return

    soup = BeautifulSoup(r.content)
    results = soup.findAll("a")
    print ("Toplam link sayisi : ", len(results))
    liste_link = []
    liste_text = []
    haberler = []
    for result in results:
        h = result.get('href')
        t = result.get_text()
        if h is not None:
            if str(h).find('http://www.hurriyet.com.tr/') or str(h).find('http://www.milliyet.com.tr/spor') >= 0:
                if h not in liste_link:
                    if h.find('.asp') or h.find('.htm') > 0:
                        liste_link.append(h)
                        liste_text.append(t)

    print ("Tekil linkler: ", len(liste_link))
    i = 0
    while i < len(liste_link):
        h = liste_link[i]
        t = liste_text[i]
        haber = haber_oku(h)

        if haber is not None:
            haber = BeautifulSoup(haber).get_text()
            ok = 0
            found = ""

            if len(keywords) == 0:
                haberler.append(haber)

            else:
                for keyword in keywords:
                    print ('----------------------')
                    if haber.find(keyword) >= 0:
                        found = found + " " + keyword
                        ok += 1
                    if ok > 0:
                        print ("3", h, t, found)
                    if detay_goster is True:
                        haberler.append(haber)
        i += 1
    k = 0
    while k < len(haberler):
        f = codecs.open("abc" + str(k+1) + ".txt", encoding='utf-8', mode='w+')
        f.write(haberler[k])
        k += 1
    f.close()

keywords = '' 
url = ['http://www.hurriyet.com.tr/', 'http://www.milliyet.com.tr/'] 
s = 0 
while s < len(url): 
    scrape_hurriyet(keywords, True, url[s]) 
    s += 1 

It raises this exception:

Traceback (most recent call last): 
    File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 94, in <module> 
    scrape_hurriyet(keywords, True, url[s]) 
    File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 62, in scrape_hurriyet 
    haber = haber_oku(h) 
    File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 17, in haber_oku 
    r = requests.get(haber_url, headers=headers) 
    File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 69, in get 
    return request('get', url, params=params, **kwargs) 
    File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 50, in request 
    response = session.request(method=method, url=url, **kwargs) 
    File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 465, in request 
    resp = self.send(prep, **send_kwargs) 
    File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 567, in send 
    adapter = self.get_adapter(url=request.url) 
    File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 641, in get_adapter 
    raise InvalidSchema("No connection adapters were found for '%s'" % url) 
requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;' 

Answer


The error you are getting, requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;', indicates that you are trying to crawl a piece of javascript. You are currently crawling every URL found in the anchor tags, but you need to filter out the javascript URLs. You should replace the following line:

if h is not None: 

with something like this:

if h is not None and not(h.startswith("javascript")):
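If you want something a bit more general than a startswith check, a sketch of a scheme-based filter could look like the code below. This is my own suggestion rather than part of the answer above, and is_crawlable is a hypothetical helper name; it simply keeps only absolute http/https links before they reach requests.get().

# A sketch, assuming you only want absolute http/https links;
# this also drops 'javascript:;', 'mailto:', 'tel:' and fragment-only hrefs.
try:
    from urlparse import urlparse          # Python 2, as used in the question
except ImportError:
    from urllib.parse import urlparse      # Python 3

def is_crawlable(href):
    # Reject missing hrefs and anything whose scheme is not http/https.
    if href is None:
        return False
    return urlparse(href).scheme in ("http", "https")

# Usage inside the link-collection loop of scrape_hurriyet:
# if is_crawlable(h) and h not in liste_link:
#     liste_link.append(h)
#     liste_text.append(t)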