這裏假設你的所有數據都可以一次放入內存;如果放不下,你就必須每次只加載一部分文件,或者每次只加載兩個文件。
它進行比較並將輸出寫入summary.csv文件,每對文件一行。
import csv
import glob
import os
import itertools
def get_data(fname):
    """
    Load a .csv file of exchange/price records.

    Each non-blank row must supply an exchange name followed by its price.
    A row may be comma-separated (``exchange,price``) or, when the whole
    record arrives as a single CSV field, whitespace-separated
    (``exchange price``).

    Returns a dict of {'exchange': float(price)}. If an exchange appears
    more than once, the last row wins.

    Raises ValueError if a row has fewer than two fields or its price is
    not a valid float.
    """
    prices = {}
    # The csv module on Python 3 needs a text-mode file opened with
    # newline='' so that it can handle line endings itself.
    with open(fname, newline='') as inf:
        reader = csv.reader(inf)
        for row in reader:
            # csv.reader already yields a list of fields (calling .split()
            # on the row itself would fail); only a lone field is split
            # further, to accept whitespace-delimited records.
            fields = row if len(row) > 1 else ''.join(row).split()
            if not fields:
                continue  # blank or whitespace-only line
            if len(fields) < 2:
                raise ValueError(
                    "%s, line %d: expected 'exchange' and 'price', got %r"
                    % (fname, reader.line_num, row)
                )
            prices[fields[0].strip()] = float(fields[1])
    return prices
def do_compare(a_name, a_data, b_name, b_data):
    """
    Compare two data files of {'key': float(value)}

    a_name, b_name -- labels for the two files; copied into the result
    a_data, b_data -- dicts mapping key -> numeric value

    Returns a list of
    - the name of the first file
    - the name of the second file
    - the number of keys in A which are not in B
    - the number of keys in B which are not in A
    - the number of values in A less than the corresponding value in B
    - the number of values in A equal to the corresponding value in B
    - the number of values in A greater than the corresponding value in B
    """
    # set(dict) collects the keys on both Python 2 and Python 3;
    # dict.iterkeys() exists only on Python 2.
    a_keys = set(a_data)
    b_keys = set(b_data)

    unique_to_a = len(a_keys - b_keys)
    unique_to_b = len(b_keys - a_keys)

    lt, eq, gt = 0, 0, 0
    # Only keys present in both files can be compared value-to-value.
    for key in a_keys & b_keys:
        ai, bi = a_data[key], b_data[key]
        if ai < bi:
            lt += 1
        elif ai == bi:
            eq += 1
        else:
            # Anything neither less than nor equal is counted as greater.
            gt += 1

    return [a_name, b_name, unique_to_a, unique_to_b, lt, eq, gt]
def main(directory='d:/tariff_compare', summary_name='summary.csv'):
    """
    Compare every pair of .csv files in a directory and write a summary.

    directory    -- folder holding the input .csv files; the summary is
                    written there too
    summary_name -- file name of the summary report, one row per pair of
                    input files

    Note: this changes the process's current working directory to
    `directory`.
    """
    os.chdir(directory)

    # load data from csv files
    # A summary left over from an earlier run also matches *.csv. It is a
    # report, not input data, so it must not be loaded and compared.
    skip = os.path.normcase(summary_name)
    data = {}
    for fname in glob.glob("*.csv"):
        if os.path.normcase(fname) != skip:
            data[fname] = get_data(fname)

    # do comparison
    # sorted() works on any iterable of keys; on Python 3 dict.keys() is a
    # view with no .sort() method.
    files = sorted(data)

    # Text mode with newline='' is what the csv module expects on Python 3;
    # it also prevents blank lines between rows on Windows.
    with open(summary_name, 'w', newline='') as outf:
        outcsv = csv.writer(outf)
        outcsv.writerow(["File A", "File B", "Unique to A", "Unique to B", "A<B", "A==B", "A>B"])
        for a, b in itertools.combinations(files, 2):
            outcsv.writerow(do_compare(a, data[a], b, data[b]))


if __name__ == "__main__":
    main()
編輯: user1277476 提出了一個很好的觀點;如果您按交易所(exchange)對文件進行預先排序(或者它們本來就已經排好序),則可以同時遍歷所有文件,內存中只需保留每個文件的當前行。
這樣你就可以針對每個交易所條目做更深入的比較——例如包含某個值的情況,或者最高或最低的 N 個值等。
我會盡快實施,但看起來正是我所需要的。謝謝! – user1480902