2017-09-13

Python HTTPConnectionPool failed to establish a new connection: [Errno 11004] getaddrinfo failed

I want to know whether my requests are being blocked by the website, in which case I would need to set a proxy. I first tried closing the HTTP connection after each request, and that failed. I also tried testing my code again, but now there seems to be no output at all. If I use a proxy, will everything be fine? Here is the code.

import requests
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
from multiprocessing import Pool
from requests.exceptions import RequestException
import time


def get_page_index(offset, keyword):
    # Fetch one page of Toutiao search results as JSON text.
    # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)
        return None

def parse_page_index(html):
    # Yield the article URLs found in the search-result JSON.
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            url = item.get('article_url')
            if url and len(url) < 100:
                yield url

def get_page_detail(url):
    # Download a single article page; return None on failure or non-200 status.
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)
        return None

def parse_page_detail(html):
    # Extract the title, article metadata, and first image from the page source.
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    pattern = re.compile(r'articleInfo: (.*?)},', re.S)
    pattern_abstract = re.compile(r'abstract: (.*?)\.', re.S)
    res = re.search(pattern, html)
    res_abstract = re.search(pattern_abstract, html)
    if res and res_abstract:
        data = res.group(1).replace(r".replace(/<br \/>|\n|\r/ig, '')", "") + '}'
        abstract = res_abstract.group(1).replace(r"'", "")
        content = re.search(r'content: (.*?),', data).group(1)
        source = re.search(r'source: (.*?),', data).group(1)
        time_pattern = re.compile(r'time: (.*?)}', re.S)
        date = re.search(time_pattern, data).group(1)
        date_today = time.strftime('%Y-%m-%d')
        img = re.findall(r'src=&quot;(.*?)&quot', content)
        # date[1:11] strips the leading quote from the matched time string.
        if date[1:11] == date_today and len(content) > 50 and img:
            return {
                'title': title,
                'content': content,
                'source': source,
                'date': date,
                'abstract': abstract,
                'img': img[0]
            }

def main(offset):
    flag = 1
    html = get_page_index(offset, '光伏')
    if not html:  # the index request failed, so there is nothing to parse
        return
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            data = parse_page_detail(html)
            if data:
                # html.unescape() is the modern equivalent of HTMLParser().unescape().
                html_parser = HTMLParser()
                cwl = html_parser.unescape(data.get('content'))
                data['content'] = cwl
                print(data)
                print(data.get('img'))
                flag += 1
                if flag == 5:  # stop after four matching articles per offset
                    break


if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 20 for i in range(10)])

And here is the error:

HTTPConnectionPool(host='tech.jinghua.cn', port=80): Max retries exceeded with url: /zixun/20160720/f191549.shtml (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000048523C8>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',)) 

By the way, when I first tested my code, everything worked fine! Thanks in advance!
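For reference, if the site really were blocking the requests, routing them through a proxy with requests would look roughly like this. A minimal sketch; the proxy address below is only a placeholder for a proxy you actually have:

import requests

# Placeholder proxy address -- substitute a working HTTP proxy of your own.
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}

response = requests.get('http://www.toutiao.com/search_content/',
                        proxies=proxies, timeout=5)
print(response.status_code)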


I tested my code again. I do get output, but it stops when I run into the HTTPConnectionPool error. Is there a way to get past this interruption? Thanks – cwl
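One way to get past the interruption (a sketch, not from the original thread; fetch is an illustrative stand-in for get_page_detail) is to make sure every request failure is caught per URL, so the loop simply skips the dead host and continues:

import requests
from requests.exceptions import RequestException

def fetch(url):
    # Return the page text, or None if the request fails for any reason,
    # so one dead host cannot stop the whole run.
    try:
        response = requests.get(url, headers={'Connection': 'close'}, timeout=5)
        return response.text if response.status_code == 200 else None
    except RequestException as e:
        print('skipping %s: %s' % (url, e))
        return None

# The URL from the error message above; any iterable of article URLs works here.
urls = ['http://tech.jinghua.cn/zixun/20160720/f191549.shtml']
for url in urls:
    html = fetch(url)
    if html is None:
        continue  # skip this article and move on to the next URL
    print(html[:100])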

Answer


It looks to me like you are hitting HTTPConnectionPool's connection limit, since you start 10 workers at the same time.

Try one of the following:

  1. Increase the request timeout (in seconds): requests.get('url', timeout=5)
  2. Close the response: Response.close(). Instead of returning response.text, assign the response to a variable, close the response, and then return the variable (see the sketch after this list).
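Applied to the question's get_page_detail, that might look like this; a minimal sketch combining both suggestions, with the rest of the code unchanged:

import requests
from requests.exceptions import RequestException

def get_page_detail(url):
    try:
        response = requests.get(url, headers={'Connection': 'close'}, timeout=5)
        response.encoding = 'utf-8'
        text = response.text if response.status_code == 200 else None
        response.close()  # hand the connection back instead of leaving it open
        return text
    except RequestException as e:
        print(e)
        return None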

Thanks, I closed the response and it works fine! – cwl
