2017-12-27 458 views
2

我有以下代碼(用 pandas 導出爲 CSV):

from xlsxwriter import Workbook 
import os,shutil 
import requests 
import pandas 
from bs4 import BeautifulSoup 
# Retry count passed to the HTTPAdapter that is mounted on every session below.
MAX_RETRIES = 20 
# Listing URL prefix; the loop appends the page number followed by ".html".
base_url='https://pagellapolitica.it/politici/sfoggio/9/matteo-renzi?page=' 

# Download and parse listing pages 1 through 31.
# NOTE(review): `all` (which also shadows the builtin) is rebound on every
# iteration and is only consumed after this loop has finished, so just the
# matches from the last page fetched are still available at that point.
for page in range(1,32,1): 
    # `l` is not read anywhere inside this loop.
    l=[] 
    # A fresh session is created per page; its adapter retries failed
    # requests for both URL schemes.
    session = requests.Session() 
    adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES) 
    session.mount('https://', adapter) 
    session.mount('http://', adapter) 

    site=(base_url+str(page)+".html") 
    # print(site) 
    c=session.get(site) 
    r=c.content 
    soup=BeautifulSoup(r,'html.parser') 
    # Every <div class="clearfix"> found on this page.
    all=soup.find_all("div",{"class":"clearfix"}) 


# For each element left in `all`, follow a slice of its links, scrape one
# record per linked page and write the records to outing.csv.
for d in all: 
    links=d.find_all("a") 
    # Bare expression: the length is computed and discarded (no effect).
    len(links) 
    # The record list is reset for every outer element, so it never holds
    # rows from more than one `d`.
    l=[] 
    # workbook = Workbook('bbb.xlsx') 
    # worksheet = workbook.add_worksheet() 
    # row +=0 
    # worksheet.write(row,0,'Link') 
    # worksheet.write(row,1,'Name ') 
    # row+=1 
    # Only anchors at positions 5..16 (at most 12) are followed.
    # NOTE(review): presumably this skips navigation links - confirm against
    # the page markup.
    for a in links[5:17]: 

     # Rebinds the outer loop variable `d` to the new record dict; the outer
     # iteration itself is unaffected because `for` keeps its own iterator.
     d={} 
     href=(a["href"]) 
     basic_url=('https://pagellapolitica.it/') 
     site =basic_url + href 
     #print(site) 

     # Plain requests.get: the retry-configured session built earlier in the
     # script is not used for these detail pages.
     c=requests.get(site) 
     r=c.content 
     soup=BeautifulSoup(r,'html.parser') 
     # NOTE(review): soup.find returns None when nothing matches, in which
     # case the `.text` accesses below raise AttributeError.
     Name=soup.find("h3",{"class":"pull-left"}).text 
     Fact_checking=soup.find("label",{"class":"verdict-analisi"}).text 
     quote=soup.find("div",{"class":"col-xs-12 col-sm-6 col-smm-6 col-md-5 col-lg-5"}).text 
     # Rebinds `all` to this page's <span class="item"> elements; fields are
     # then read by fixed position (0 = topic, 2 = date, 3 = source link).
     # NOTE(review): the positional layout is assumed - verify on the site.
     all=soup.find_all("span",{"class":"item"}) 
     Topic=all[0].text 
     Date=all[2].text 
     # "" "" is two adjacent empty literals, i.e. the empty string.
     a=all[3].find("a",{"class":"" ""}) 
     Link=a["href"] 
     Text=soup.find("div",{"class":"col-xs-12 col-md-12 col-lg-12"}).text 
     d["Name"]=Name 
     d["Fact_checking"]=Fact_checking 
     d["Quote"]=quote 
     d["Economic_topic"]=Topic 
     d["Date"]=Date 
     d["Link"]=Link 
     d["Text"]=Text 
     l.append(d) 

     # Still inside the inner loop: the frame is rebuilt from `l` and
     # outing.csv is rewritten after every scraped link. Because `l` is reset
     # per outer element, the file ends up holding only the rows collected
     # for the last outer element that was processed.
     df=pandas.DataFrame(l) 
     df.to_csv("outing.csv") 

問題是:當我把數據導出到 CSV 時,我只得到 6 行結果。當我執行 print(df) 和 print(l) 時,它會打印列表中的所有數據,但是當我檢查 len(l) 時,得到的長度只有 6。有人知道爲什麼會發生這種情況嗎?提前謝謝!

+0

您可以在問題的底部添加'df.info()'輸出。請務必在添加之前將其格式化爲可讀。 – Nitred

+5

你在內循環的每次迭代中覆蓋你的CSV文件... – MaxU

+0

把 'df = pandas.DataFrame(l)' 和 'df.to_csv("outing.csv")' 移到循環外部 –

回答

2

考慮用 pandas.DataFrame() 把每個字典列表構建成單獨的數據框,並把這些數據框收集到一個列表中,最後用 pandas.concat() 把該列表中的數據框連接成最終的數據框。

# Build one DataFrame per outer element, then concatenate them all and write
# the CSV exactly once, after every loop has finished.
# Assumes `all` holds the parsed listing elements produced earlier in the
# script - TODO confirm against the surrounding code.
df_list = []

for d in all:
    links = d.find_all("a")
    l = []

    for a in links[5:17]:
        # Placeholder: the per-link fetch and parsing that defines Name,
        # Fact_checking, quote, Topic, Date, Link and Text is elided here.
        ...
        # A separate name is used for the record so the outer loop
        # variable `d` is not rebound.
        row = {}
        row["Name"] = Name
        row["Fact_checking"] = Fact_checking
        row["Quote"] = quote
        row["Economic_topic"] = Topic
        row["Date"] = Date
        row["Link"] = Link
        row["Text"] = Text
        l.append(row)

    df_list.append(pandas.DataFrame(l))

# ignore_index=True renumbers the rows 0..n-1; without it every per-element
# frame keeps its own 0-based index and the CSV gets repeated index labels.
final_df = pandas.concat(df_list, ignore_index=True)
final_df.to_csv("outing.csv")