2017-05-17 217 views

I'm trying to run a script that scrapes news text from a website, but it keeps failing with: Python urllib.error.HTTPError: HTTP Error 404: Not Found

It looks like scraping has been restricted since the site's main page was reorganized.

I keep getting this error, but when I test a single URL it works.

Any suggestions?

Traceback (most recent call last): 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 157, in <module> 
    result = fetch_news_detail(news['href']) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 107, in fetch_news_detail 
    res = urlopen(url).read().decode('utf-8', errors='ignore') 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen 
    return opener.open(url, data, timeout) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open 
    response = meth(req, response) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response 
    'http', request, response, code, msg, hdrs) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error 
    return self._call_chain(*args) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain 
    result = func(*args) 
    File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default 
    raise HTTPError(req.full_url, code, msg, hdrs, fp) 
urllib.error.HTTPError: HTTP Error 404: Not Found 

Here is my code:

import urllib.request
from bs4 import BeautifulSoup
import time
import urllib.parse
import json
from urllib.request import urlopen
from urllib.error import URLError  # needed for the except clause in fetch_news_list
import random

def fetch_news_list(page, keyword, start, end):

    result = [] 

    url = "https://www.bigkinds.or.kr/news/newsResult.do" 

    headers = {
     'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13F69 Safari/601.1',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Accept-Encoding': 'none',
     'Accept-Language': 'en-US,en;q=0.8',
     'Connection': 'keep-alive'
     }

    param = { 
     'pageInfo':'newsResult', 
     'login_chk':'null', 
     'LOGIN_SN':'null', 
     'LOGIN_NAME':'null', 
     'indexName':'news', 
     'keyword':keyword, 
     'byLine':'', 
     'searchScope':'1', 
     'searchFtr':'1', 
     'startDate':start, 
     'endDate':end, 
     'sortMethod':'date', 
     'contentLength':'100', 
     'providerCode':'', 
     'categoryCode':'', 
     'incidentCode':'', 
     'dateCode':'', 
     'highlighting':'', 
     'sessionUSID':'', 
     'sessionUUID':'test', 
     'listMode':'', 
     'categoryTab':'', 
     'newsId':'', 
     'filterProviderCode':'', 
     'filterCategoryCode':'', 
     'filterIncidentCode':'', 
     'filterDateCode':'', 
     'startNo':page, 
     'resultNumber':'100', 
     'topmenuoff':'', 
     'resultState':'', 
     'keywordJson':'{"searchDetailTxt1":keyword,"agreeDetailTxt1":"","needDetailTxt1":"","exceptDetailTxt1":"","o_id":"option1","startDate":start,"endDate":end,"providerNm":"","categoryNm":"","incidentCategoryNm":"","providerCode":"","categoryCode":"","incidentCategoryCode":"","searchFtr":"1","searchScope":"1","searchKeyword":"keyword"}', 
     'keywordFilterJson':'', 
     'totalCount':'', 
     'interval':'', 
     'quotationKeyword1':'', 
     'quotationKeyword2':'', 
     'quotationKeyword3':'', 
     'searchFromUseYN':'N', 
     'mainTodayPersonYn':'', 
     'period':'1year' 
     } 

    param = urllib.parse.urlencode(param).encode() 

    req = urllib.request.Request(url, param, headers) 
    sleepTime = random.randint(4,10) 
    time.sleep(sleepTime) 
    print(str(sleepTime) + ' seconds wait.') 

    try:
     res = urllib.request.urlopen(req)
    except URLError as e:
     if hasattr(e, 'code'):
      # an HTTPError (a URLError subclass) carries the status code the server returned
      print('The server couldn\'t fulfill the request.')
      print('Error code: ', e.code)
     else:
      print('We failed to reach a server.')
      print('Reason: ', e.reason)
     # res is undefined when the request fails, so return the empty list
     return result

    html = res.read()

    soup = BeautifulSoup(html, "html.parser") 

    div_tags = soup.find_all('div', class_='resTxt') 

    for cts in div_tags: 

     ids = cts.find('h3')['id'][5:31] 
     title = cts.find('h3',class_='list_newsId').get_text(strip=True) 
     href = 'https://www.bigkinds.or.kr/news/detailView.do?docId=' + ids + '&returnCnt=1' 

     sets = { 
      'title' : title, 
      'href' : href 
      } 

     result.append(sets)

    return result 

def fetch_news_detail(url): 
    result = {} 
    res = urlopen(url).read().decode('utf-8', errors='ignore') 

    responseJson = json.loads(res)  

    category = responseJson.get("detail").get("CATEGORY_MAIN") 
    date = responseJson.get("detail").get("DATE") 
    provider = responseJson.get("detail").get("PROVIDER") 
    content = responseJson.get("detail").get("CONTENT") 

    result = { 
     'category': category, 
     'date': date, 
     'provider' : provider, 
     'content': content 
    } 

    return result 


keyword = input('(eg., 외국인 NOT(증시 OR 순매수 OR 증권 OR 코스피 OR 코스닥 OR 주식 OR 주가 OR 투타 OR KBO OR 야구 OR KBL OR 농구 OR 축구 OR 올림픽 OR K리그))\n input word: ') 
start = input('(eg., 2017-01-01)\n input startday: ') 
end = input('(eg., 2017-02-01)\n input endday: ') 

page = 1 
count = 1 
flag = True 

f = open('bigkinds.txt', 'w', encoding='utf-8') 

while True: 
    if not flag: 
     break 

    news_list = fetch_news_list(page, keyword, start, end)
    sleepTime = random.randint(3,8) 
    time.sleep(sleepTime) 
    print(str(sleepTime) + ' seconds wait.') 

    for news in news_list: 
     result = fetch_news_detail(news['href']) 

     result['title'] = news['title'] 

     f.write('==' * 40 + '\n') 
     f.write('category: ' + result['category'] + '\n') 
     f.write('title: ' + result['title'] + '\n') 
     f.write('date: ' + result['date'] + '\n') 
     f.write('provider: ' + result['provider'] + '\n') 
     f.write('content: ' + result['content'] + '\n') 
     f.write('==' * 40 + '\n') 

     count += 1
     if count >= 5002:
      flag = False
      break
     # pause between article fetches
     sleepTime = random.randint(2,10)
     time.sleep(sleepTime)
     print(str(sleepTime) + ' seconds wait.')

    page += 1 

f.close() 

I suggest using the popular [requests](http://docs.python-requests.org/en/master/) package instead of `urllib`. –
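
For reference, a minimal sketch of the same POST using requests, reusing the url, headers, and param dicts from fetch_news_list as they are before urlencode() is applied (untested against this site):

import requests

res = requests.post(url, data=param, headers=headers)  # requests urlencodes the form dict itself
res.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx responses
html = res.text         # decoded body, ready for BeautifulSoup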

Answer


I ran into the same kind of problem when scraping data from Yelp. It's hard to get around these restrictions. I suggest you try the following.

  1. Change your User-Agent. The current one appears to be an iPhone user agent; use a valid desktop (PC) one. See the sketch after this list.
  2. If the above works, but you get blocked after making a certain number of page requests, look into StarCluster. You will find a draft of the code under '2017/4/13 (UTEP CoBA 310 - CALC LAB#2)' at https://yangwangteaching.wordpress.com/data-science-meetup/
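
For point 1, a minimal sketch of swapping in a desktop User-Agent before building the request; the Chrome-on-Windows string below is only an illustrative example, not a required value:

# Illustrative desktop UA string; any current PC browser User-Agent would do.
headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/58.0.3029.110 Safari/537.36')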

Thanks. I'll try it again. – wooah


If you still get the 404 error after changing the User-Agent, try running from a new IP address. – manojps
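
One more candidate cause worth checking: fetch_news_detail calls urlopen(url) with no headers, so the detail requests go out with urllib's default 'Python-urllib' User-Agent even though fetch_news_list sends browser headers. Below is a sketch of a variant that reuses those headers and skips articles that still return 404 instead of crashing; it assumes the headers dict is made available to this function (for example by passing it in) and that the caller checks for None:

import json
from urllib.error import HTTPError
from urllib.request import Request, urlopen

def fetch_news_detail(url, headers):
    # Send the same browser headers as fetch_news_list; a bare urlopen(url)
    # identifies itself as 'Python-urllib', which some servers reject.
    req = Request(url, headers=headers)
    try:
        res = urlopen(req).read().decode('utf-8', errors='ignore')
    except HTTPError as e:
        print('Skipping {} ({} {})'.format(url, e.code, e.reason))
        return None  # caller should skip None results instead of writing them
    detail = json.loads(res).get('detail', {})
    return {
     'category': detail.get('CATEGORY_MAIN'),
     'date': detail.get('DATE'),
     'provider': detail.get('PROVIDER'),
     'content': detail.get('CONTENT'),
    }

The call in the main loop would then become result = fetch_news_detail(news['href'], headers), with the write step skipped whenever the result is None.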