我試圖將大量numpy結構化數組存儲爲hdf5文件中的數據集。例如,
f ['tree1'] = structured_array1
。
。
f['tree60000'] = structured_array60000(共約 60000 棵樹)。問題:使用 h5py 創建大量數據集時出錯——無法註冊數據類型原子(無法插入重複鍵)。
在讀取文件約 70% 進度時,出現錯誤 RuntimeError: Unable to register datatype atom (Can't insert duplicate key)。
此問題僅出現在非常大的 ascii 文件(約 10e7 行,5GB)中;文件在 10e6 行、500MB 左右時不會發生。如果我去掉複合數據類型,改為將數據存儲為 numpy 字符串數組,也不會發生。
我可以解決這個問題,如果我停止閱讀文件的一半,關閉我的終端,再次打開它,並繼續讀取文件從中途開始到最後(我保存行結束的行號)。我嘗試打開和關閉python函數本身的hdf5文件,但這不起作用。
# Record layout for one halo row of the ascii tree file: ordered
# (field-name, numpy-typecode) pairs, usable directly as the dtype of a
# numpy structured array.  Declared as one spec string so the column
# order reads top-to-bottom exactly as the columns appear in the file.
_HALO_FIELD_SPECS = """\
scale f4
haloid i8
scale_desc f4
haloid_desc i8
num_prog i4
pid i8
upid i8
pid_desc i8
phantom i4
mvir_sam f4
mvir f4
rvir f4
rs f4
vrms f4
mmp i4
scale_lastmm f4
vmax f4
x f4
y f4
z f4
vx f4
vy f4
vz f4
jx f4
jy f4
jz f4
spin f4
haloid_breadth_first i8
haloid_depth_first i8
haloid_tree_root i8
haloid_orig i8
snap_num i4
haloid_next_coprog_depthfirst i8
haloid_last_prog_depthfirst i8
haloid_last_mainleaf_depthfirst i8
rs_klypin f4
mvir_all f4
m200b f4
m200c f4
m500c f4
m2500c f4
xoff f4
voff f4
spin_bullock f4
b_to_a f4
c_to_a f4
axisA_x f4
axisA_y f4
axisA_z f4
b_to_a_500c f4
c_to_a_500c f4
axisA_x_500c f4
axisA_y_500c f4
axisA_z_500c f4
t_by_u f4
mass_pe_behroozi f4
mass_pe_diemer f4
"""

dt = [tuple(spec.split()) for spec in _HALO_FIELD_SPECS.splitlines()]
def read_in_trees(self):
    """Store each tree from the ascii file as its own hdf5 dataset.

    The ascii file is a sequence of trees: a header line starting with
    ``#`` carries the tree id (the text from column 6 onward), followed
    by one whitespace-separated data row per halo.  Each completed tree
    is written to the pre-existing hdf5 file (``self.hdf5_name``, opened
    "r+") as a structured array under its tree id.

    Returns:
        None.  Side effect: one dataset per tree added to the hdf5 file.
    """
    with open(self.fname) as ascii_file:
        with h5py.File(self.hdf5_name, "r+") as f:
            tree_id = ""
            current_tree = []

            def _flush():
                # Write the accumulated rows as one structured dataset.
                # The emptiness guard fixes a bug in the original code,
                # which wrote a bogus empty dataset under the name ""
                # before the first header line was ever seen.
                if current_tree:
                    f.create_dataset(
                        tree_id, data=np.array(current_tree, dtype=dt))

            for line in ascii_file:
                # startswith is safe on blank lines, where line[0]
                # would raise IndexError.
                if line.startswith('#'):  # header: starts a new tree
                    _flush()
                    current_tree = []
                    tree_id = line[6:].strip('\n')
                else:  # data row belonging to the current tree
                    current_tree.append(tuple(line.split()))
            # BUG FIX: the original loop only flushed on seeing the
            # *next* header, so the last tree in the file was silently
            # dropped.  Flush whatever is pending at EOF.
            _flush()
    # NOTE(review): the reported "Unable to register datatype atom
    # (Can't insert duplicate key)" appears to come from h5py
    # re-registering the compound dtype on every write; if it persists,
    # consider committing the dtype to the file once and reusing it —
    # verify against the installed h5py version.
    return
錯誤:
/Volumes/My Passport for Mac/raw_trees/bolshoi/rockstar/asciiReaderOne.py in read_in_trees(self)
129 arr = np.array(current_tree, dtype = dt)
130 # depth_sort = arr['haloid_depth_first'].argsort()
--> 131 f[tree_id] = arr
132 current_tree = []
133 first_line = False
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2458)()
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2415)()
/Library/Python/2.7/site-packages/h5py/_hl/group.pyc in __setitem__(self, name, obj)
281
282 else:
--> 283 ds = self.create_dataset(None, data=obj, dtype=base.guess_dtype(obj))
284 h5o.link(ds.id, self.id, name, lcpl=lcpl)
285
/Library/Python/2.7/site-packages/h5py/_hl/group.pyc in create_dataset(self, name, shape, dtype, data, **kwds)
101 """
102 with phil:
--> 103 dsid = dataset.make_new_dset(self, shape, dtype, data, **kwds)
104 dset = dataset.Dataset(dsid)
105 if name is not None:
/Library/Python/2.7/site-packages/h5py/_hl/dataset.pyc in make_new_dset(parent, shape, dtype, data, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times)
124
125 if data is not None:
--> 126 dset_id.write(h5s.ALL, h5s.ALL, data)
127
128 return dset_id
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2458)()
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2415)()
/Library/Python/2.7/site-packages/h5py/h5d.so in h5py.h5d.DatasetID.write (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5d.c:3260)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:15314)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14903)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t._c_compound (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14192)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:15314)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14749)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t._c_float (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:12379)()
RuntimeError: Unable to register datatype atom (Can't insert duplicate key)
聽起來像是 'ascii_file' 中存在重複的 'tree_id'。如果沒有真正的重複,可能發生了某種截斷,或者接口中的某些 C 代碼使用小整數進行索引。h5py 是否有某種 'flush' 方法?即告訴它先保存已有的數據集,然後重新開始? – hpaulj
您的 10e6 到 10e7 的出錯範圍接近 2**32,這在 32 位計算機上可能會產生問題。 – hpaulj
沒有重複的樹。當我關閉終端並在中途重新啓動時,hdf5數據會被保存,但我正在尋找一種不太好用的解決方案。 – kevinttan