2016-10-23 36 views
0

requests + BS4 無法從頁面取得結果

我可以從 https://www.gabar.org/membersearchresults.cfm 取得信息,但不能從 https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1 取得信息。

from bs4 import BeautifulSoup 
import requests 
import traceback 


# Scheme and host of the site being scraped.
base_url = 'https://www.gabar.org'

# Anchor tags of member detail pages collected so far.
links_to_visit = []
# Anchor tags of the 'Next' pagination links seen so far
# (kept to test the next button).
navigation_links = []


def make_soup(link, timeout=30):
    """Download ``link`` and parse the response body with BeautifulSoup.

    Args:
        link: Absolute URL to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        ``'html.parser'`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    r = requests.get(link, timeout=timeout)
    return BeautifulSoup(r.content, 'html.parser')


def all_results(url):
    """Fetch one search-results page and collect its member and 'Next' links.

    Downloads ``url`` through ``make_soup``, looks up the
    ``<div class="cs_control">`` element and inspects every ``<a>`` inside it:

    * anchors whose text is exactly ``'Next'`` are appended to the
      module-level ``navigation_links`` list;
    * anchors whose ``href`` contains ``/MemberSearchDetail.cfm?ID=`` are
      appended to the module-level ``links_to_visit`` list;
    * every other anchor is ignored.

    If the page has no ``cs_control`` div, a notice is printed and nothing is
    collected.

    Args:
        url: Absolute URL of the results page to scrape.

    Returns:
        None. Results accumulate in the two module-level lists.
    """
    # Both lists are only mutated in place (append), never rebound, so no
    # ``global`` declaration is needed.
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # find() returns None when nothing matches; calling find_all on it
        # would raise AttributeError.
        print('no cs_control div found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        # An anchor without an href yields None; fall back to '' so the
        # substring test below cannot raise TypeError.
        href = link.get('href') or ''
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif '/MemberSearchDetail.cfm?ID=' in href:
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)


def start(total=60716, page_size=25):
    """Walk the paginated results by following the most recent 'Next' link.

    On every iteration the last entry of the module-level
    ``navigation_links`` list is taken, its ``href`` is prefixed with
    ``base_url`` and the resulting URL is handed to ``all_results``.

    The loop stops when the running counter reaches ``total``, or earlier
    when there is nothing usable to follow: the list is empty, its last entry
    is not a 'Next' link or has no ``href``, or the page just fetched did not
    add a new entry (which would otherwise make the loop request the same
    URL again and again).

    Args:
        total: Upper bound for the counter; defaults to the previously
            hard-coded 60716.
        page_size: Amount added to the counter per fetched page; defaults to
            the previously hard-coded 25.

    Returns:
        None.
    """
    page = 1
    while page < total:
        if not navigation_links:
            # Indexing [-1] on an empty list would raise IndexError.
            break
        next_link = navigation_links[-1]
        href = next_link.get('href')
        if next_link.text != 'Next' or href is None:
            break
        known = len(navigation_links)
        page += page_size
        print(base_url + href)
        all_results(base_url + href)
        print('page is:', page)
        if len(navigation_links) == known:
            # No new 'Next' link was collected, so another iteration would
            # only re-request the URL that was just fetched.
            break

if __name__ == '__main__':
    # Scrape the first results page, then let start() follow the 'Next'
    # links that all_results() collected.
    first_page_url = 'https://www.gabar.org/membersearchresults.cfm'
    all_results(first_page_url)
    start()

以上是我的代碼。如果我想得到完整的結果,我需要了解或做些什麼?

回答

2

您需要了解的是,一個HTTP請求不僅僅包含URL。在這種情況下,搜索結果僅適用於執行搜索的會話,因此只有當您是該會話的「所有者」時才能翻頁。大多數網站使用會話cookie來識別會話,您需要在發送HTTP請求時一併帶上該會話cookie。

這可能非常麻煩,但幸運的是,Python的requests庫可以通過requests.session爲您處理這一切。不要使用requests.get(url),而是先用session = requests.session()初始化一個會話,然後在隨後的請求中使用session.get(url)。這會自動爲您保留Cookie,並且在許多方面表現得像真實的瀏覽器一樣。

您可以在requests的官方文檔中閱讀更多關於requests.session如何工作的內容。

最後但同樣重要的是,這是修正後的代碼 =)

from bs4 import BeautifulSoup 
import requests 
import traceback 


links_to_visit = []
navigation_links = []  # for testing next button
# One shared session for every request, so cookies set by an earlier response
# are sent back on later requests. requests.Session() is the documented
# constructor; the lowercase requests.session() helper is a deprecated alias
# kept only for backwards compatibility.
session = requests.Session()

base_url = 'https://www.gabar.org'


def make_soup(link, timeout=30):
    """Download ``link`` via the shared session and parse it with BeautifulSoup.

    Args:
        link: Absolute URL to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        ``'html.parser'`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Use the module-level session rather than requests.get so that cookies
    # are preserved across requests.
    r = session.get(link, timeout=timeout)
    return BeautifulSoup(r.content, 'html.parser')


def all_results(url):
    """Scrape one results page into the module-level link lists.

    The page at ``url`` is fetched with ``make_soup``. Every ``<a>`` inside
    the ``<div class="cs_control">`` element is then classified:

    * text exactly ``'Next'`` -> appended to ``navigation_links``;
    * ``href`` containing ``/MemberSearchDetail.cfm?ID=`` -> appended to
      ``links_to_visit``;
    * anything else -> skipped.

    When the page has no ``cs_control`` div, a notice is printed and the
    function returns without collecting anything.

    Args:
        url: Absolute URL of the results page to scrape.

    Returns:
        None. Results accumulate in the two module-level lists.
    """
    # No ``global`` statement is needed: the lists are appended to, never
    # reassigned.
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # find() returns None when nothing matches; calling find_all on it
        # would raise AttributeError.
        print('no cs_control div found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        # An anchor without an href yields None; fall back to '' so the
        # substring test below cannot raise TypeError.
        href = link.get('href') or ''
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif '/MemberSearchDetail.cfm?ID=' in href:
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)


def start(total=60716, page_size=25):
    """Follow the latest 'Next' link page after page until results run out.

    Each iteration reads the last entry of the module-level
    ``navigation_links`` list, joins its ``href`` onto ``base_url`` and
    passes that URL to ``all_results``.

    Iteration ends when the running counter reaches ``total``, or earlier if
    there is nothing usable to follow: the list is empty, its last entry is
    not a 'Next' link or lacks an ``href``, or the page just fetched added no
    new entry (continuing would only re-request the same URL).

    Args:
        total: Upper bound for the counter; defaults to the previously
            hard-coded 60716.
        page_size: Amount added to the counter per fetched page; defaults to
            the previously hard-coded 25.

    Returns:
        None.
    """
    page = 1
    while page < total:
        if not navigation_links:
            # Indexing [-1] on an empty list would raise IndexError.
            break
        next_link = navigation_links[-1]
        href = next_link.get('href')
        if next_link.text != 'Next' or href is None:
            break
        known = len(navigation_links)
        page += page_size
        print(base_url + href)
        all_results(base_url + href)
        print('page is:', page)
        if len(navigation_links) == known:
            # No new 'Next' link was collected, so another iteration would
            # only re-request the URL that was just fetched.
            break

if __name__ == '__main__':
    # Scrape the first results page, then let start() follow the 'Next'
    # links that all_results() collected.
    first_page_url = 'https://www.gabar.org/membersearchresults.cfm'
    all_results(first_page_url)
    start()