2016-12-20 204 views
0

我有多個.csv文件,我想將它們連接成一個文件。本質上,我想選擇某些列並將它們並排追加。連接多個csv文件到一個

我這裏的代碼不起作用:根本沒有任何錯誤信息,它什麼都不做。

有人知道如何解決它嗎?

import pandas as pd 
import datetime 
import numpy as np 
import glob 
import csv 
import os 



def concatenate(indir='/My Documents/Python/Test/in',
                outfile='/My Documents/Python/Test/out/Forecast.csv'):
    """Stack every CSV file found in ``indir`` and write the result to ``outfile``.

    The files are read in sorted name order and concatenated row-wise
    (``axis=0``). The combined frame's columns are then renamed to the fixed
    list of eleven forecast column names, and the frame is written without
    its row index.

    Args:
        indir: Directory searched (non-recursively) for ``*.csv`` files.
        outfile: Path of the CSV file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        ValueError: If ``indir`` contains no CSV files, or if the combined
            frame does not have exactly eleven columns.
    """
    colnames = [
        "DateTime", "WindSpeed", "Capacity",
        "p0.025", "p0.05", "p0.1", "p0.5",
        "p0.9", "p0.95", "p0.975", "suffix",
    ]

    # Build full paths instead of calling os.chdir(): the working directory is
    # process-wide state, and changing it made a relative ``outfile`` resolve
    # inside ``indir`` and leaked the new cwd to the caller.
    pattern = os.path.join(glob.escape(indir), '*.csv')
    paths = sorted(glob.glob(pattern))  # sorted -> reproducible row order
    print([os.path.basename(path) for path in paths])
    if not paths:
        raise ValueError('no CSV files found in {!r}'.format(indir))

    frames = []
    for path in paths:
        print(os.path.basename(path))
        frames.append(
            pd.read_csv(path, delimiter=',', engine='python',
                        encoding='latin-1', index_col=False)
        )

    combined = pd.concat(frames, axis=0)
    combined.columns = colnames

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    combined.to_csv(outfile, index=None)
+0

你將不得不循環遍歷並連接它們。如果一個CSV文件有缺失的行,您將不得不中止或發明缺失數據的表示。 –

+0

只是扔在那裏。你在調用這個函數嗎?如果沒有,代碼將不會做任何事情。 – Parfait

回答

0

我跑這個代碼來設置我的文件系統

設置

import pandas as pd 
import numpy as np 

def setup_test_files(indir='in'):
    """Create three small random CSV fixtures inside ``indir``.

    Each file is named ``fn_<i>.csv`` (``i`` = 0, 1, 2) and holds three
    month-end rows starting at 2016-03-31, indexed by a ``DateTime`` column,
    with ten columns of uniform random values rounded to two decimals.
    The path of every file written is printed.

    Args:
        indir: Directory that receives the files; created if it is missing.
    """
    import os  # local import: this snippet's header only imports pandas/numpy

    colnames = [
        "WindSpeed", "Capacity",
        "p0.025", "p0.05", "p0.1", "p0.5",
        "p0.9", "p0.95", "p0.975", "suffix",
    ]
    # An explicit MonthEnd offset is used instead of the 'M' string alias,
    # which newer pandas releases deprecate in favour of 'ME'.
    tidx = pd.date_range(
        '2016-03-31', periods=3, freq=pd.offsets.MonthEnd(), name='DateTime'
    )

    # The original ignored ``indir`` and always wrote to a hard-coded 'in/'.
    os.makedirs(indir, exist_ok=True)

    for i in range(3):
        filename = os.path.join(indir, 'fn_{}.csv'.format(i))
        frame = pd.DataFrame(
            np.random.rand(len(tidx), len(colnames)),
            index=tidx, columns=colnames,
        )
        frame.round(2).to_csv(filename)
        print(filename)

setup_test_files() 

這會創建 3 個文件,文件名爲 ['fn_0.csv', 'fn_1.csv', 'fn_2.csv']。
這些文件的內容如下所示:

with open('in/fn_0.csv', 'r') as fo: 
    print(''.join(fo.readlines())) 

DateTime,WindSpeed,Capacity,p0.025,p0.05,p0.1,p0.5,p0.9,p0.95,p0.975,suffix 
2016-03-31,0.03,0.76,0.62,0.21,0.76,0.36,0.44,0.61,0.23,0.04 
2016-04-30,0.39,0.12,0.31,0.99,0.86,0.35,0.15,0.61,0.55,0.03 
2016-05-31,0.72,1.0,0.71,0.86,0.41,0.79,0.22,0.76,0.92,0.79 

我將定義一個解析函數,再另外定義一個函數來做連接。爲什麼?因爲我認爲這樣更容易。

import pandas as pd 
import glob 
import os 


def read_csv(fn):
    """Load one forecast CSV and relabel its columns.

    The file is parsed as latin-1 with its first row treated as the header;
    whatever labels it carried are then replaced by the fixed list of eleven
    forecast column names (``DateTime`` first).

    Args:
        fn: Path of the CSV file to read.

    Returns:
        The parsed ``DataFrame`` with the standard column labels.

    Raises:
        ValueError: If the file does not have exactly eleven columns.
    """
    frame = pd.read_csv(fn, encoding='latin-1')
    frame.columns = (
        ["DateTime", "WindSpeed", "Capacity"]
        + ["p0.025", "p0.05", "p0.1", "p0.5"]
        + ["p0.9", "p0.95", "p0.975", "suffix"]
    )
    return frame


def concatenate(indir='in', outfile='out/Forecast.csv'):
    """Join every CSV in ``indir`` side by side and write it to ``outfile``.

    Each file is parsed with ``read_csv`` and the resulting frames are
    concatenated column-wise (``axis=1``). The file name without its
    extension is used as the outer level of the column labels, so the
    output has a two-row header. The row index is not written.

    Args:
        indir: Directory searched (non-recursively) for ``*.csv`` files.
        outfile: Path of the CSV file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        ValueError: If ``indir`` contains no CSV files.
    """
    # Full paths are used instead of os.chdir(), so there is no working
    # directory to restore. The former bare ``except:`` existed only to undo
    # the chdir, but it also discarded every error, making failures silent.
    pattern = os.path.join(glob.escape(indir), '*.csv')
    file_list = sorted(glob.glob(pattern))  # sorted -> stable column order
    if not file_list:
        raise ValueError('no CSV files found in {!r}'.format(indir))

    df_names = [
        os.path.splitext(os.path.basename(fn))[0] for fn in file_list
    ]
    concat_df = pd.concat(
        [read_csv(fn) for fn in file_list],
        axis=1, keys=df_names,
    )

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    concat_df.to_csv(outfile, index=None)

然後運行級聯

concatenate() 

您可以這樣讀取結果:

print(pd.read_csv('out/Forecast.csv', header=[0, 1])) 

     fn_0                 \ 
    DateTime WindSpeed Capacity p0.025 p0.05 p0.1 p0.5 p0.9 p0.95 p0.975 
0 2016-03-31  0.03  0.76 0.62 0.21 0.76 0.36 0.44 0.61 0.23 
1 2016-04-30  0.39  0.12 0.31 0.99 0.86 0.35 0.15 0.61 0.55 
2 2016-05-31  0.72  1.00 0.71 0.86 0.41 0.79 0.22 0.76 0.92 

    ...  fn_2                
    ... WindSpeed Capacity p0.025 p0.05 p0.1 p0.5 p0.9 p0.95 p0.975 suffix 
0 ...  0.80  0.79 0.38 0.94 0.91 0.18 0.27 0.14 0.39 0.91 
1 ...  0.60  0.97 0.04 0.69 0.04 0.65 0.94 0.81 0.37 0.22 
2 ...  0.78  0.53 0.83 0.93 0.92 0.12 0.15 0.65 0.06 0.11 

[3 rows x 33 columns] 

注:

您沒有將 DateTime 設爲索引,而我認爲這可能正是您想要的。如果是這樣,請將 read_csv 和 concatenate 函數改成如下所示:

import pandas as pd 
import glob 
import os 


def read_csv(fn):
    """Load one forecast CSV with its first column as a date index.

    The first column is parsed as dates and becomes the index, which is
    named ``DateTime``. The remaining columns are relabelled with the fixed
    list of ten forecast column names.

    Args:
        fn: Path of the CSV file to read.

    Returns:
        The parsed ``DataFrame`` indexed by ``DateTime``.

    Raises:
        ValueError: If the file does not have exactly ten data columns
            besides the index column.
    """
    # index_col/parse_dates turn the leading column into a parsed date index
    frame = pd.read_csv(
        fn,
        encoding='latin-1',
        index_col=0,
        parse_dates=[0],
    )
    frame.index.name = "DateTime"
    frame.columns = (
        ["WindSpeed", "Capacity"]
        + ["p0.025", "p0.05", "p0.1", "p0.5"]
        + ["p0.9", "p0.95", "p0.975", "suffix"]
    )
    return frame


def concatenate(indir='in', outfile='out/Forecast.csv'):
    """Join every CSV in ``indir`` side by side and write it to ``outfile``.

    Each file is parsed with ``read_csv`` and the resulting frames are
    concatenated column-wise (``axis=1``), which aligns their rows on the
    frames' index. The file name without its extension is used as the outer
    level of the column labels. The index is written to the output file.

    Args:
        indir: Directory searched (non-recursively) for ``*.csv`` files.
        outfile: Path of the CSV file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        ValueError: If ``indir`` contains no CSV files.
    """
    # Full paths are used instead of os.chdir(), so there is no working
    # directory to restore. The former bare ``except:`` existed only to undo
    # the chdir, but it also discarded every error, making failures silent.
    pattern = os.path.join(glob.escape(indir), '*.csv')
    file_list = sorted(glob.glob(pattern))  # sorted -> stable column order
    if not file_list:
        raise ValueError('no CSV files found in {!r}'.format(indir))

    df_names = [
        os.path.splitext(os.path.basename(fn))[0] for fn in file_list
    ]
    concat_df = pd.concat(
        [read_csv(fn) for fn in file_list],
        axis=1, keys=df_names,
    )

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    concat_df.to_csv(outfile)

做了這項修改之後,最終結果如下所示。請注意,各文件的數據會按日期對齊:

   fn_0              \ 
      WindSpeed Capacity p0.025 p0.05 p0.1 p0.5 p0.9 p0.95 p0.975 
DateTime                  
2016-03-31  0.03  0.76 0.62 0.21 0.76 0.36 0.44 0.61 0.23 
2016-04-30  0.39  0.12 0.31 0.99 0.86 0.35 0.15 0.61 0.55 
2016-05-31  0.72  1.00 0.71 0.86 0.41 0.79 0.22 0.76 0.92 

        ...  fn_2           \ 
      suffix ... WindSpeed Capacity p0.025 p0.05 p0.1 p0.5 p0.9 
DateTime   ...              
2016-03-31 0.04 ...  0.80  0.79 0.38 0.94 0.91 0.18 0.27 
2016-04-30 0.03 ...  0.60  0.97 0.04 0.69 0.04 0.65 0.94 
2016-05-31 0.79 ...  0.78  0.53 0.83 0.93 0.92 0.12 0.15 


      p0.95 p0.975 suffix 
DateTime       
2016-03-31 0.14 0.39 0.91 
2016-04-30 0.81 0.37 0.22 
2016-05-31 0.65 0.06 0.11 

[3 rows x 30 columns] 
相關問題