2014-05-12 42 views
1

我想提取一個大小約500MB、包含大約250K個文件的zip文件的內容。

這裏就是我想要做的事 -

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;

import de.schlichtherle.truezip.file.TFile;
import de.schlichtherle.truezip.file.TFileInputStream;

/**
 * Extracts the entries of a zip archive by listing every regular file inside
 * it, splitting that list into fixed-size batches and handing each batch to a
 * worker of a fixed thread pool.
 */
public class ArchiveReaderExecutor {

    /** Maximum number of archive entries handed to a single worker task. */
    private static final int BATCH_SIZE = 100;

    /** Number of worker threads in the extraction pool. */
    private static final int THREAD_COUNT = 8;

    private final ExecutorService pool;

    /** Creates an executor backed by a fixed pool of worker threads. */
    public ArchiveReaderExecutor() {
        pool = Executors.newFixedThreadPool(THREAD_COUNT);
    }

    /**
     * Lists every regular file contained in the given archive and splits the
     * result into batches of at most {@code BATCH_SIZE} entries.
     *
     * @param archive
     *            the archive (or directory, or plain file) to enumerate
     *
     * @return the batches in listing order; empty when nothing was found
     */
    public List<List<TFile>> splitArchiveFile(final File archive) {
        final TFile root = new TFile(archive.getAbsolutePath());
        final ArrayList<TFile> individualFiles = new ArrayList<TFile>();
        recursivelyReadLeafnodes(root, individualFiles);

        final List<List<TFile>> batches = new ArrayList<List<TFile>>();
        final int total = individualFiles.size();
        for (int start = 0; start < total; start += BATCH_SIZE) {
            final int end = Math.min(start + BATCH_SIZE, total);
            // Copy the slice so every batch is independent of the backing list.
            batches.add(new ArrayList<TFile>(individualFiles.subList(start, end)));
        }
        return batches;
    }

    /**
     * Collects the regular files reachable from {@code inputTFile} into
     * {@code individualFiles}, descending into directories. Files whose name
     * starts with a dot are skipped.
     *
     * @param inputTFile
     *            the node to start from; an archive, a directory or a single file
     * @param individualFiles
     *            accumulator the discovered files are appended to
     *
     * @return the same {@code individualFiles} instance, for convenience
     */
    public List<TFile> recursivelyReadLeafnodes(TFile inputTFile,
            ArrayList<TFile> individualFiles) {
        final TFile[] children;
        if (inputTFile.isArchive() || inputTFile.isDirectory()) {
            children = inputTFile.listFiles();
        } else {
            // A plain file is its own single leaf.
            children = new TFile[] { inputTFile };
        }

        if (children == null) {
            // listFiles() yields null when the node cannot be listed
            // (e.g. it does not exist); there is nothing to collect then.
            return individualFiles;
        }

        for (final TFile child : children) {
            if (child.isFile() && !child.getName().startsWith(".")) {
                individualFiles.add(child);
            } else if (child.isDirectory()) {
                recursivelyReadLeafnodes(child, individualFiles);
            }
        }

        return individualFiles;
    }

    /**
     * Splits {@code Really_Big_File.zip} into batches, submits one extraction
     * task per batch and then shuts the pool down. Already submitted tasks
     * keep running after this method returns; no further tasks can be
     * submitted to this instance afterwards.
     */
    public void runExtraction() {
        final File src = new File("Really_Big_File.zip");
        for (final List<TFile> batch : splitArchiveFile(src)) {
            pool.execute(new FileExtractorSavor(batch));
        }
        pool.shutdown();
    }

    /**
     * Worker task that copies every archive entry of its batch to the target
     * location. A failure on one entry is reported and does not stop the
     * remaining entries of the batch.
     */
    class FileExtractorSavor implements Runnable {
        List<TFile> files;

        /**
         * @param files
         *            the archive entries this task extracts
         */
        public FileExtractorSavor(List<TFile> files) {
            this.files = files;
        }

        @Override
        public void run() {
            for (final TFile tFile : files) {
                // NOTE(review): the target name is built by plain concatenation
                // and uses only the entry's simple name, so entries with the
                // same name in different directories overwrite each other --
                // confirm this is intended.
                final File target = new File("Target_Location" + tFile.getName());

                // Streams are scoped per entry so that a failed open never
                // leaves a stale reference to the previous entry's stream.
                TFileInputStream in = null;
                OutputStream out = null;
                try {
                    in = new TFileInputStream(tFile);
                    out = FileUtils.openOutputStream(target);
                    // Copy raw bytes: going through a String would decode and
                    // re-encode the data and corrupt binary entries.
                    IOUtils.copy(in, out);
                    // Close explicitly so a failing close is reported below
                    // instead of being swallowed by closeQuietly.
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    IOUtils.closeQuietly(out);
                    IOUtils.closeQuietly(in);
                }
            }
        }
    }

    /**
     * Runs the extraction with the built-in source and target locations.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        new ArchiveReaderExecutor().runExtraction();
    }
}

當我同時運行這段代碼,有很多在等待/阻塞狀態的線程,這裏的線程轉儲:

"pool-1-thread-7" prio=5 tid=7fd8093dd000 nid=0x11d3f3000 waiting for monitor entry [11d3f2000] 
    java.lang.Thread.State: BLOCKED (on object monitor) 
    at de.schlichtherle.truezip.socket.ConcurrentInputShop$SynchronizedConcurrentInputStream.close(ConcurrentInputShop.java:223) 
    - waiting to lock <785460200> (a de.schlichtherle.truezip.fs.archive.FsDefaultArchiveController$Input) 
    at de.schlichtherle.truezip.io.DecoratingInputStream.close(DecoratingInputStream.java:79) 
    at org.apache.commons.io.IOUtils.closeQuietly(IOUtils.java:178) 
    at ArchiveReaderExecutor$FileExtractorSavor.run(ArchiveReaderExecutor.java:136) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918) 
    at java.lang.Thread.run(Thread.java:695) 

    Locked ownable synchronizers: 
    - <79ed370e0> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) 
"pool-1-thread-5" prio=5 tid=7fd8093db800 nid=0x11d1ed000 waiting for monitor entry [11d1ec000] 
    java.lang.Thread.State: BLOCKED (on object monitor) 
    at de.schlichtherle.truezip.socket.ConcurrentInputShop$SynchronizedConcurrentInputStream.close(ConcurrentInputShop.java:223) 
    - waiting to lock <785460200> (a de.schlichtherle.truezip.fs.archive.FsDefaultArchiveController$Input) 
    at de.schlichtherle.truezip.io.DecoratingInputStream.close(DecoratingInputStream.java:79) 
    at org.apache.commons.io.IOUtils.closeQuietly(IOUtils.java:178) 
    at ArchiveReaderExecutor$FileExtractorSavor.run(ArchiveReaderExecutor.java:136) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918) 
    at java.lang.Thread.run(Thread.java:695) 

    Locked ownable synchronizers: 
    - <79ed46468> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) 

TFile.cp_r(src, dst, TArchiveDetector.NULL, TArchiveDetector.NULL); 

這花了更長的時間,因爲它是在單個線程中運行的。

上面的 TFile.cp_r 調用我也試過。

我的問題是:使用TrueZip在java中提取zip文件內容,最快、最好的方式是什麼?

+0

你爲什麼認爲封鎖線程是一件壞事? – chrylis

+0

幾乎所有的線程至少有40%的時間處於阻塞狀態,這難道正常嗎?此外,解壓過程大約需要1小時才能完成,而對於包含25K個文件、大小爲50MB的zip文件,該過程在3分鐘內就完成了。按此推算,文件數量是其10倍的更大文件不是應該在30分鐘內完成嗎? – Shyam

+1

如果沒有更多的信息,我不能說具體的性能問題,但是I/O密集型應用程序中的線程會被阻塞很多,這並不奇怪。 – chrylis

回答

1

這裏沒有什麼錯。TrueZIP/TrueVFS爲每個已掛載的歸檔文件維護一個文件描述符。當多個線程同時讀取歸檔文件的內容時,TrueZIP/TrueVFS內核會序列化所有訪問,以便在任何時刻都只有一個線程在使用該文件描述符並更新其位置。所有其他線程將被阻塞。

+0

謝謝你指出關鍵點。我最終把讀取改成單線程、寫入改成多線程,這樣就不會有線程在讀取操作上互相競爭而被阻塞。現在對於大小約500MB的文件,整個解壓過程在2-3分鐘內完成。 – Shyam

+0

當你說把讀取改成單線程時,你指的是recursivelyReadLeafnodes函數嗎?不再使用Runnable? – vivek85

+0

不,我修改了實現。與在FileExtractorSavor中處理提取操作不同的是,來自單個存檔文件的數據首先被讀入一個字符串,然後這些字符串被並行處理。 – Shyam