0
我只想從兩個tsv文件中自動提取小樣本。採樣行不一定必須精確,每個採樣只需要均勻分佈。當切割發生時,bash shell會輸出'tail:stdout:Broken pipe',儘管程序起初似乎仍然運行正常。我不是特別喜歡我的程序輸出「Broken」這個詞,但我並不在乎。問題是每個後續的'砍'需要更長的時間,我不明白爲什麼。我有內存泄漏嗎?有什麼我應該關閉嗎?我也不喜歡有除了聲明之外的嘗試,但我不確定解決這個問題的好方法。在python中使用bash命令切割文件運行緩慢'tail:stdout:Broken pipe'
import os
import sys
import subprocess
import commands
import csv as tsv
def main(scorebreaks, positives, negatives):
#just to isolate the attributeId
newpositives = os.path.basename(positives)
attributeid = newpositives.rstrip('-positive.tsv')
#create output folder if it doesn't exist
path_to_script_dir = os.path.dirname(os.path.abspath(positives))
newpath = path_to_script_dir + '/ezcut_output'
if not os.path.exists(newpath): os.makedirs(newpath)
with open(scorebreaks, 'rb') as tsvfile:
tsvreader = tsv.reader(tsvfile, delimiter='\t')
scorebreakslist = zip(*(line.strip().split('\t') for line in tsvfile))
#print scorebreakslist[0][1] #would give line number at .99
#print scorebreakslist[1][1] #would give .99
whatiteration = input('What iteration? ')
chunksize = input('Chunk size? ')
numberofchunks = int(input('Number of chunks? '))-1
scorejumpamt = 1.0/numberofchunks #number of chunks is 20? score jump amt == .05
#print scorejumpamt
scorei = 1.0
choparray = [100]
while True: #cause i needed a do-while loop
scorei = float(scorei) - float(scorejumpamt)
scorei = '%.2f'%(scorei)
#print scorei
if float(scorei) < 0.00: break
try:
arraynum = scorebreakslist[1].index(str(scorei))
except ValueError:
break
#print scorebreakslist[1]
#add the linenumber to an array for use in cutting
choparray.append(scorebreakslist[0][arraynum])
#print len(choparray)
#the actual file manipulation section of code
index=0
for number in choparray:
indexkinda = 1-float(scorejumpamt)*float(index)
indexkinda = '%.2f'%(indexkinda)
#print indexkinda
if indexkinda < 0: break
if float(indexkinda) > 0.50:
#print indexkinda
cmd = 'tail -n+%s %s | head -n%s > %s/%s-%s-%s.tsv' % (number, positives, chunksize, newpath, indexkinda, attributeid, whatiteration)
subprocess.call(cmd, shell=True)
#subprocess.call(cmd, shell=True)
index+=1
else: #maybe make this not get anything below 0.1 for speed
#print indexkinda
cmd = 'tail -n+%s %s | head -n%s > %s/%s-%s-%s.tsv' % (number, negatives, chunksize, newpath, indexkinda, attributeid, whatiteration)
subprocess.call(cmd, shell=True)
index+=1
main(sys.argv[1], sys.argv[2], sys.argv[3])
雖然我不重讀任何文件的相同部分? 'tail -n +(somenumber)佔據文件的底部。所以文件的相同部分永遠不會被讀取。 – 2014-09-28 16:21:14
@ user2005645,否,'tail'必須重新讀取所有這些行才能跳過它們。沒有辦法直接說「跳到文件中的第n行」。 – o11c 2014-10-01 22:59:51