0
我想知道我的請求是否被網站阻止了,是否需要設置一個 proxy。我首先嘗試關閉 HTTP 連接,但失敗了。我也嘗試測試我的代碼,但現在看起來沒有任何輸出。如果我使用代理,是不是一切就會正常?這是代碼。Python HTTPConnectionPool 無法建立新的連接:[Errno 11004] getaddrinfo 失敗
import requests
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
from multiprocessing import Pool
from requests.exceptions import RequestException
import time
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body (decoded as UTF-8) when the server answers with
        status 200; ``None`` for any other status or when the request fails.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would block this worker process forever.
        response = requests.get(
            url, headers={'Connection': 'close'}, timeout=10
        )
    except RequestException as e:
        # Best-effort crawler: report the failure and let the caller skip it.
        print(e)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
def parse_page_index(html):
    """Yield article URLs found in a search-result JSON document.

    Args:
        html: JSON text of a search-result page, or ``None``/empty when the
            page could not be fetched.

    Yields:
        Each non-empty ``article_url`` shorter than 100 characters from the
        entries of the top-level ``data`` list.
    """
    # The fetch step returns None on failure; json.loads(None) would raise.
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(e)
        return
    if not isinstance(data, dict):
        return
    # "data" may be present but null, so fall back to an empty list.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        url = item.get('article_url')
        if url and len(url) < 100:
            yield url
def get_page_detail(url):
    """Download an article page and return its text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The response body (decoded as UTF-8) when the server answers with
        status 200; ``None`` for any other status or when the request fails
        (for example on a DNS or connection error).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would block this worker process forever.
        response = requests.get(
            url, headers={'Connection': 'close'}, timeout=10
        )
    except RequestException as e:
        # Best-effort crawler: report the failure and let the caller skip it.
        print(e)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
def parse_page_detail(html):
    """Extract article fields from the text of an article page.

    The page is searched for an ``articleInfo: ...},`` section and an
    ``abstract: ...`` value; ``content``, ``source`` and ``time`` are then
    pulled out of the ``articleInfo`` section with regular expressions.

    Args:
        html: Text of the article page.

    Returns:
        A dict with the keys ``title``, ``content``, ``source``, ``date``,
        ``abstract`` and ``img`` (the first ``src="..."`` value found in the
        content) when characters 1-10 of the extracted date equal today's
        date, the content is longer than 50 characters and it contains at
        least one image. ``None`` when any expected part is missing or a
        condition is not met.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> used to raise IndexError and abort the worker.
    if not title_tags:
        return None
    title = title_tags[0].get_text()

    res = re.search(r'articleInfo: (.*?)},', html, re.S)
    res_abstract = re.search(r'abstract: (.*?)\.', html, re.S)
    if not (res and res_abstract):
        return None

    # Drop the inline JavaScript ".replace(...)" call and restore the closing
    # brace consumed by the pattern so the time field can be matched below.
    data = res.group(1).replace(r".replace(/<br \/>|\n|\r/ig, '')", "") + '}'
    abstract = res_abstract.group(1).replace(r"'", "")

    content_match = re.search(r'content: (.*?),', data)
    source_match = re.search(r'source: (.*?),', data)
    time_match = re.search(r'time: (.*?)}', data, re.S)
    # Any missing field used to raise AttributeError on ``None.group``.
    if not (content_match and source_match and time_match):
        return None

    content = content_match.group(1)
    source = source_match.group(1)
    date = time_match.group(1)
    date_today = time.strftime('%Y-%m-%d')
    img = re.findall(r'src="(.*?)"', content)

    # date[1:11] skips the first character of the captured value
    # (presumably an opening quote) and keeps the next ten characters.
    if date[1:11] == date_today and len(content) > 50 and img:
        return {
            'title': title,
            'content': content,
            'source': source,
            'date': date,
            'abstract': abstract,
            'img': img[0],
        }
    return None
def main(offset):
    """Crawl one search-result page and print the articles that qualify.

    Fetches the result page at ``offset`` for the keyword '光伏', downloads
    each listed article, and prints every article dict returned by
    ``parse_page_detail`` followed by its image URL. Stops after four
    articles have been printed.

    Args:
        offset: Search-result offset passed to ``get_page_index``.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Imported locally because the name ``html`` is
    # commonly used for page text in this file.
    from html import unescape

    index_html = get_page_index(offset, '光伏')
    # A failed index request yields None; there is nothing to parse then.
    if not index_html:
        return

    flag = 1
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        data = parse_page_detail(detail_html)
        if not data:
            continue
        data['content'] = unescape(data.get('content'))
        print(data)
        print(data.get('img'))
        flag += 1
        # flag starts at 1, so this triggers after the fourth printed article.
        if flag == 5:
            break
if __name__ == '__main__':
    # One task per result page: offsets 0, 20, ..., 180.
    # map() blocks until every task has finished; the context manager then
    # shuts the worker processes down instead of leaving the pool open.
    with Pool() as pool:
        pool.map(main, [i * 20 for i in range(10)])
錯誤信息如下:
HTTPConnectionPool(host='tech.jinghua.cn', port=80): Max retries exceeded with url: /zixun/20160720/f191549.shtml (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000048523C8>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
順便說一句,我第一次測試代碼時,一切都顯示正常!在此先感謝!
我再次測試了我的代碼。我能獲得輸出,但遇到 HTTPConnectionPool 錯誤時程序就會停止。有沒有辦法解決這個中斷?謝謝 – cwl