2011-12-12 48 views
2

我使用 urllib(注意不是 urllib2)從用戶提供的 URL 獲取頁面的標題。不幸的是,有時 URL 指向的不是 HTML,而是遠程站點上的大文件,或是某個運行時間很長的進程。問題:使用 Python urllib 時,如何避免下載非 HTML 內容?

我查看了 Python 文檔,但 urllib 的功能有限。從源碼來看,我似乎可以修改它,但我無法在服務器上這樣做。文檔中提到了 info(),但沒有給出如何使用它的例子。

我使用的是 FancyURLopener,我猜它在 urllib2 中並不存在,而且我不確定 urllib2 能否解決這個問題。

  1. 有沒有方法來定義一個套接字超時?
  2. 更重要的是,如何把請求限制爲只接受 HTML/XHTML 內容類型,並完全忽略其他任何內容?也就是說,我要確保不會下載整個內容。

我仍在閱讀 urllib 的源碼並查看 urllib2,但我不是這些工具的專家。

回答

7

這裏(Here)的文檔指出,info() 方法返回與 URL 相關的元信息。你可以用它來獲取響應頭(headers),查看 Content-Type 是否爲 text/html;如果不是你想要的,就放棄請求。

>>> import urllib 
>>> d = urllib.urlopen('http://www.google.com/') 
>>> try: 
...  if d.info()['content-type'].startswith('text/html'): 
...    print 'its html' 
...  else: 
...    print 'its not html' 
... except KeyError: 
...  print 'its not html' 
... 
its html 

我很快拼湊了一些代碼,讓你可以通過 urllib 指定發送 HEAD 請求。 :)

import urllib 
import socket 
from urllib import unwrap, toBytes, quote, splittype, splithost, splituser, unquote, addinfourl 

class MyURLOpener(urllib.FancyURLopener): 
    def open_http(self, url, data=None, method=None): 
     """Use HTTP protocol.""" 
     import httplib 
     user_passwd = None 
     proxy_passwd= None 
     if isinstance(url, str): 
      host, selector = splithost(url) 
      if host: 
       user_passwd, host = splituser(host) 
       host = unquote(host) 
      realhost = host 
     else: 
      host, selector = url 
      # check whether the proxy contains authorization information 
      proxy_passwd, host = splituser(host) 
      # now we proceed with the url we want to obtain 
      urltype, rest = splittype(selector) 
      url = rest 
      user_passwd = None 
      if urltype.lower() != 'http': 
       realhost = None 
      else: 
       realhost, rest = splithost(rest) 
       if realhost: 
        user_passwd, realhost = splituser(realhost) 
       if user_passwd: 
        selector = "%s://%s%s" % (urltype, realhost, rest) 
       if proxy_bypass(realhost): 
        host = realhost 

      #print "proxy via http:", host, selector 
     if not host: raise IOError, ('http error', 'no host given') 

     if proxy_passwd: 
      import base64 
      proxy_auth = base64.b64encode(proxy_passwd).strip() 
     else: 
      proxy_auth = None 

     if user_passwd: 
      import base64 
      auth = base64.b64encode(user_passwd).strip() 
     else: 
      auth = None 
     h = httplib.HTTP(host) 

     if method is not None: 
      h.putrequest(method, selector) 
     else: 
      h.putrequest('GET', selector) 

     if data is not None: 
      #h.putrequest('POST', selector) 
      h.putheader('Content-Type', 'application/x-www-form-urlencoded') 
      h.putheader('Content-Length', '%d' % len(data)) 

     if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) 
     if auth: h.putheader('Authorization', 'Basic %s' % auth) 
     if realhost: h.putheader('Host', realhost) 
     for args in self.addheaders: h.putheader(*args) 
     h.endheaders(data) 
     errcode, errmsg, headers = h.getreply() 
     fp = h.getfile() 
     if errcode == -1: 
      if fp: fp.close() 
      # something went wrong with the HTTP status line 
      raise IOError, ('http protocol error', 0, 
          'got a bad status line', None) 
     # According to RFC 2616, "2xx" code indicates that the client's 
     # request was successfully received, understood, and accepted. 
     if (200 <= errcode < 300): 
      return addinfourl(fp, headers, "http:" + url, errcode) 
     else: 
      if data is None: 
       return self.http_error(url, fp, errcode, errmsg, headers) 
      else: 
       return self.http_error(url, fp, errcode, errmsg, headers, data) 

    def open(self, fullurl, data=None, method=None): 
     """Use URLopener().open(file) instead of open(file, 'r').""" 
     fullurl = unwrap(toBytes(fullurl)) 
     # percent encode url, fixing lame server errors for e.g, like space 
     # within url paths. 
     fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 
     if self.tempcache and fullurl in self.tempcache: 
      filename, headers = self.tempcache[fullurl] 
      fp = open(filename, 'rb') 
      return addinfourl(fp, headers, fullurl) 
     urltype, url = splittype(fullurl) 
     if not urltype: 
      urltype = 'file' 
     if urltype in self.proxies: 
      proxy = self.proxies[urltype] 
      urltype, proxyhost = splittype(proxy) 
      host, selector = splithost(proxyhost) 
      url = (host, fullurl) # Signal special case to open_*() 
     else: 
      proxy = None 
     name = 'open_' + urltype 
     self.type = urltype 
     name = name.replace('-', '_') 
     if not hasattr(self, name): 
      if proxy: 
       return self.open_unknown_proxy(proxy, fullurl, data) 
      else: 
       return self.open_unknown(fullurl, data) 
     try: 
      return getattr(self, name)(url, data, method) 
     except socket.error, msg: 
      raise IOError, ('socket error', msg), sys.exc_info()[2] 


# Example usage: build the opener once and reuse it for every request.
opener = MyURLOpener() 

# NOTE: passing data no longer implicitly turns the request into a POST;
#  if you include data you must also pass method='POST' explicitly.
# NOTE: only open_http is overridden here, not open_https; the same
#  technique can be applied to open_https if HTTPS support is needed.

# Request the headers only: `method` is forwarded to open_http, which hands
# it to putrequest() as the HTTP verb.
d = opener.open('http://www.google.com/', method='HEAD') 
+0

你有任何示例代碼? – Vangel

+0

其實我有。我正在爲你整理,給你。 :) –

+0

太對了,這非常好。非常感謝。 – Vangel