2016-04-28 23 views
2

我有以下代碼,用來抓取某個頁面。它運行良好,並能打印頁面內容。但是當我把 r 改爲谷歌新聞頁面(即下面被註釋掉的那一行)時,出現錯誤(IOError Traceback (most recent call last))。爲什麼?我該如何在谷歌新聞頁面上使用 BeautifulSoup?(標題:在 Enthought Canopy 中嘗試從谷歌新聞頁面獲取信息時出現 BeautifulSoup 錯誤)

代碼:

from bs4 import BeautifulSoup 
import urllib 
# Download the raw HTML of the plain-HTTP page; this is the variant that works.
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
# Alternative target (Google News search over HTTPS). Enabling this line
# instead is what produces the IOError shown in the traceback further down.
#r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
# NOTE(review): no parser argument is given here; the answer below recommends
# naming one explicitly (it uses "html.parser").
soup = BeautifulSoup(r)
print type(soup)

# Dump the parsed document as indented HTML.
print soup.prettify()

產生錯誤的代碼:

from bs4 import BeautifulSoup 
import urllib 
# This is the statement that fails: the traceback below ends inside
# ssl.do_handshake() with "IOError: [Errno socket error] EOF occurred in
# violation of protocol", so the lines after it are never reached.
# NOTE(review): everything after '#' in this URL is a fragment; the answer's
# author later switches to the "/search?q=..." form because the "#q=" page
# needs JavaScript to load its results.
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()


--------------------------------------------------------------------------- 
IOError         Traceback (most recent call last) 
c:\users\abc\appdata\local\temp\tmpvxie2e.py in <module>() 
     2 import urllib 
     3 r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read() 
----> 4 r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read() 
     5 soup = BeautifulSoup(r) 
     6 print type(soup) 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in urlopen(url, data, proxies, context) 
    85   opener = _urlopener 
    86  if data is None: 
---> 87   return opener.open(url) 
    88  else: 
    89   return opener.open(url, data) 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open(self, fullurl, data) 
    211   try: 
    212    if data is None: 
--> 213     return getattr(self, name)(url) 
    214    else: 
    215     return getattr(self, name)(url, data) 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open_https(self, url, data) 
    441    if realhost: h.putheader('Host', realhost) 
    442    for args in self.addheaders: h.putheader(*args) 
--> 443    h.endheaders(data) 
    444    errcode, errmsg, headers = h.getreply() 
    445    fp = h.getfile() 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in endheaders(self, message_body) 
    1047   else: 
    1048    raise CannotSendHeader() 
-> 1049   self._send_output(message_body) 
    1050 
    1051  def request(self, method, url, body=None, headers={}): 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in _send_output(self, message_body) 
    891    msg += message_body 
    892    message_body = None 
--> 893   self.send(msg) 
    894   if message_body is not None: 
    895    #message_body was not a string (i.e. it is a file) and 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in send(self, data) 
    853   if self.sock is None: 
    854    if self.auto_open: 
--> 855     self.connect() 
    856    else: 
    857     raise NotConnected() 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in connect(self) 
    1272 
    1273    self.sock = self._context.wrap_socket(self.sock, 
-> 1274             server_hostname=server_hostname) 
    1275 
    1276  __all__.append("HTTPSConnection") 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname) 
    350       suppress_ragged_eofs=suppress_ragged_eofs, 
    351       server_hostname=server_hostname, 
--> 352       _context=self) 
    353 
    354  def set_npn_protocols(self, npn_protocols): 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context) 
    577       # non-blocking 
    578       raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets") 
--> 579      self.do_handshake() 
    580 
    581    except (OSError, ValueError): 

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in do_handshake(self, block) 
    806    if timeout == 0.0 and block: 
    807     self.settimeout(None) 
--> 808    self._sslobj.do_handshake() 
    809   finally: 
    810    self.settimeout(timeout) 

IOError: [Errno socket error] EOF occurred in violation of protocol (_ssl.c:590) 

UPDATE1。

按照評論中的建議,我嘗試了下面的代碼,但仍然遇到同樣的問題 :(

from requests.adapters import HTTPAdapter 
from requests.packages.urllib3.poolmanager import PoolManager 
import ssl 

class MyAdapter(HTTPAdapter):
    """Transport adapter whose connection pool is pinned to TLSv1."""

    def init_poolmanager(self, connections, maxsize, block=False):
        """Create ``self.poolmanager`` with ``ssl.PROTOCOL_TLSv1`` forced.

        connections -- handed to PoolManager as ``num_pools``
        maxsize     -- handed to PoolManager as ``maxsize``
        block       -- handed to PoolManager as ``block``
        """
        pool_options = dict(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            ssl_version=ssl.PROTOCOL_TLSv1,
        )
        self.poolmanager = PoolManager(**pool_options)


import requests 
# Create a requests session and register MyAdapter for the 'https://' prefix.
# NOTE(review): `s` is never used after this point -- the fetch at the end of
# this snippet still goes through urllib.urlopen, so the adapter mounted here
# does not take part in that request.
s = requests.Session()
s.mount('https://', MyAdapter())

from requests.adapters import HTTPAdapter 
from requests.packages.urllib3.poolmanager import PoolManager 

class SSLAdapter(HTTPAdapter):
    """HTTPS transport adapter that passes a caller-chosen SSL version to its pool manager."""

    def __init__(self, ssl_version=None, **kwargs):
        """Remember ``ssl_version``, then run the base-class initialiser.

        ssl_version -- value later given to PoolManager as ``ssl_version``
                       (defaults to ``None``)
        kwargs      -- forwarded unchanged to ``HTTPAdapter.__init__``
        """
        # Stored before the super() call on purpose.
        # NOTE(review): presumably HTTPAdapter.__init__ invokes
        # init_poolmanager(), which reads this attribute -- confirm before
        # reordering these two statements.
        self.ssl_version = ssl_version
        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        """Create ``self.poolmanager`` using the stored ``ssl_version``.

        connections -- handed to PoolManager as ``num_pools``
        maxsize     -- handed to PoolManager as ``maxsize``
        block       -- handed to PoolManager as ``block``
        """
        pool_options = dict(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            ssl_version=self.ssl_version,
        )
        self.poolmanager = PoolManager(**pool_options)

from bs4 import BeautifulSoup 
import urllib 
# NOTE(review): this request is still made with urllib.urlopen rather than
# with the requests session `s` configured earlier in this snippet, so neither
# MyAdapter nor SSLAdapter is exercised here. That is consistent with the
# author reporting the same error as before.
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()
+1

代碼按原樣執行——沒有錯誤。請發佈完整的回溯(traceback)。 – alecxe

+0

我已經把兩段代碼分開了 – user2543622

+0

看起來你的問題與 BeautifulSoup 無關;你的代碼能夠獲取 HTTP 頁面,但無法獲取 HTTPS 頁面。我隱約記得過去遇到過類似的問題,當時的原因(我認爲)是 `urllib` 不知道去哪裏找 SSL 證書,或者類似的情況。很抱歉說得這麼含糊。 –

回答

1

所以,這個問題是由於使用了 Enthought Canopy 版本的 Python 而出現的。在大多數 Python 版本中,urllib 不會檢查或驗證 SSL 證書;而在 Canopy 版本中,它似乎會去檢查 SSL 證書。我沒有找到說明如何實現這一點的文檔。

另外,你會看到在下面的代碼中,我在 BeautifulSoup 調用中添加了 "html.parser" 參數。按你原來的寫法也能工作,但 BeautifulSoup4 的行爲已有變化,最佳做法是明確傳入你想使用的解析器(parser)。

下面是你的代碼的可用版本,能夠獲取你想要的谷歌新聞頁面的 SSL(HTTPS)版本:

from bs4 import BeautifulSoup 
import requests 

# Send a browser-like User-Agent with the request.
# NOTE(review): presumably Google answers the default client User-Agent
# differently -- confirm whether this header is actually required.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

# Plain-HTTP page from the question, kept as an alternative target:
#r = requests.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts', headers=headers)

# Uses the "/search?q=..." form of the query instead of "/#q=...", because the
# "#q=" page needs JavaScript to load its results.
# A timeout is set because requests otherwise waits indefinitely for a server
# that never responds.
r = requests.get(
    'https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d',
    headers=headers,
    timeout=30,
)
# Fail loudly on a 4xx/5xx response instead of pretty-printing an error page.
r.raise_for_status()

# Name the parser explicitly rather than relying on BeautifulSoup's default.
soup = BeautifulSoup(r.text, "html.parser")
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(type(soup))

print(soup.prettify())
+0

您的代碼可以運行,但問題是它沒有返回實際的頁面內容。以我原始問題中的頁面 'http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts' 爲例——當我對該頁面運行代碼時,它會以 HTML 格式打印頁面的實際內容。而對於頁面 'https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d',排在最前面的新聞是「Archie Goodwin: Player Profile」,但我在返回的結果中看不到它 :( – user2543622

+0

@user2543622 確實如此,我之前沒想到要測試這一點。這是因爲所使用的網址需要執行 JavaScript 才能正確加載結果。我已修改了上面代碼中的查詢網址,這樣應該就能解決你的問題了 – bmcculley

+0

我用修改後的代碼試了一下,並把結果複製到記事本中。然後我搜索了「Archie」,但沒有找到任何匹配 :( – user2543622