2012-11-02 66 views
0

我有300萬行數據,每行有30個特徵——很難把所有內容都載入我電腦的內存,而且用學習算法處理起來也很慢。我想寫一小段代碼來做隨機採樣,但用JAVA實現時,在我的PC配置下要麼無法運行,要麼需要很長時間才能執行完。我知道用C或C++編寫會是更好的解決方案,但我也好奇Python在這種情況下是否可用。既然Java由於速度慢和內存限制而無法有效工作,那麼在這種情況下使用Python是否合理?(請不要建議增加堆大小之類的做法。)那些因執行速度慢而無法在JAVA中處理的任務,用Python能解決嗎?

+0

所以有什麼問題嗎? – Darek

+0

你是否看到這個解釋中的問題 – erogol

+0

恐怕我不太明白你問題的確切性質。你遇到了什麼問題,又想做什麼? – Frankline

回答

1

如果性能很關鍵,這就是我使用的那種解決方案。

/**
 * A minimal column-oriented table of {@code float} values backed by memory-mapped files.
 *
 * <p>A table lives in one directory. A text file named {@code rows} stores the row count, and
 * every column is stored in its own {@code NNNN.float} file that is mapped read/write in the
 * platform's native byte order. Each column holds exactly {@link #rows()} floats.
 *
 * <p>Call {@link #close()} when done to release the underlying file handles.
 */
public class SimpleTable {
    /** Size of one {@code float} in bytes; used to size each column mapping. */
    private static final int BYTES_PER_FLOAT = 4;

    private final List<RandomAccessFile> files = new ArrayList<RandomAccessFile>();
    private final List<FloatBuffer> buffers = new ArrayList<FloatBuffer>();
    private final File baseDir;
    private final int rows;

    private SimpleTable(File baseDir, int rows) {
        this.baseDir = baseDir;
        this.rows = rows;
    }

    /**
     * Creates a new, empty table in a directory that must not exist yet.
     *
     * @param baseName path of the directory to create for the table
     * @param rows number of rows every column will hold
     * @return the new table, with no columns
     * @throws IOException if the directory cannot be created (including when it already
     *     exists) or the row count cannot be written
     * @throws IllegalArgumentException if {@code rows} is negative
     */
    public static SimpleTable create(String baseName, int rows) throws IOException {
        if (rows < 0) {
            throw new IllegalArgumentException("rows must be non-negative: " + rows);
        }
        File baseDir = new File(baseName);
        if (!baseDir.mkdirs()) {
            throw new IOException("Failed to create " + baseName);
        }
        PrintWriter pw = new PrintWriter(baseName + "/rows");
        try {
            pw.println(rows);
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (pw.checkError()) {
                throw new IOException("Failed to write " + baseName + "/rows");
            }
        } finally {
            pw.close();
        }
        return new SimpleTable(baseDir, rows);
    }

    /**
     * Opens an existing table, mapping every {@code *.float} file in the directory as a column.
     * Columns are ordered by sorted file path.
     *
     * @param baseName path of the table directory
     * @return the loaded table
     * @throws IOException if the {@code rows} file or the directory cannot be read, or a
     *     column file cannot be mapped
     * @throws NumberFormatException if the {@code rows} file does not start with an integer
     */
    public static SimpleTable load(String baseName) throws IOException {
        int rows;
        BufferedReader br = new BufferedReader(new FileReader(baseName + "/rows"));
        try {
            rows = Integer.parseInt(br.readLine());
        } finally {
            br.close();
        }

        File baseDir = new File(baseName);
        File[] candidates = baseDir.listFiles();
        if (candidates == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Failed to list " + baseName);
        }
        Arrays.sort(candidates);

        SimpleTable table = new SimpleTable(baseDir, rows);
        boolean loaded = false;
        try {
            for (File file : candidates) {
                if (!file.getName().endsWith(".float")) {
                    continue;
                }
                table.addColumnForFile(file);
            }
            loaded = true;
            return table;
        } finally {
            if (!loaded) {
                // Release the columns opened so far; the original failure is what matters.
                try {
                    table.close();
                } catch (IOException ignored) {
                    // Deliberately ignored so the exception already in flight is not masked.
                }
            }
        }
    }

    /**
     * Maps {@code file} as a column of {@link #rows()} floats and registers it with the table.
     * The file handle is closed again if the mapping fails.
     */
    private FloatBuffer addColumnForFile(File file) throws IOException {
        RandomAccessFile rw = new RandomAccessFile(file, "rw");
        boolean mapped = false;
        try {
            // Widen to long before multiplying so large row counts cannot overflow int.
            long sizeInBytes = (long) rows * BYTES_PER_FLOAT;
            MappedByteBuffer mbb =
                    rw.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, sizeInBytes);
            mbb.order(ByteOrder.nativeOrder());
            FloatBuffer column = mbb.asFloatBuffer();
            files.add(rw);
            buffers.add(column);
            mapped = true;
            return column;
        } finally {
            if (!mapped) {
                try {
                    rw.close();
                } catch (IOException ignored) {
                    // Deliberately ignored so the mapping failure is the one reported.
                }
            }
        }
    }

    /** Returns the number of rows in every column. */
    public int rows() {
        return rows;
    }

    /** Returns the number of columns currently open in this table. */
    public int columns() {
        return buffers.size();
    }

    /**
     * Appends a new column, stored in a file named after the current column count
     * (for example {@code 0000.float} for the first column).
     *
     * @return the buffer for the new column, with capacity {@link #rows()}
     * @throws IOException if the column file cannot be opened or mapped
     */
    public FloatBuffer addColumn() throws IOException {
        return addColumnForFile(new File(baseDir, String.format("%04d.float", buffers.size())));
    }

    /**
     * Returns the buffer of the {@code n}-th column.
     *
     * @param n zero-based column index
     * @throws IndexOutOfBoundsException if {@code n} is not a valid column index
     */
    public FloatBuffer getColumn(int n) {
        return buffers.get(n);
    }

    /**
     * Closes every column file and forgets all columns. All files are attempted even if one
     * fails to close.
     *
     * @throws IOException the first failure encountered while closing, if any
     */
    public void close() throws IOException {
        IOException firstFailure = null;
        for (RandomAccessFile file : files) {
            try {
                file.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        files.clear();
        buffers.clear();
        if (firstFailure != null) {
            throw firstFailure;
        }
    }
}

/**
 * Small timing harness for {@code SimpleTable}: writes 50 columns into a fresh table named
 * {@code test}, reloads it, sums every column, and prints how long each phase took.
 */
public class SimpleTableTestMain {
    public static void main(String... args) throws IOException {
        final long writeBegin = System.nanoTime();
        final SimpleTable written = SimpleTable.create("test", 3 * 1000 * 1000);
        fillColumns(written);
        written.close();

        final long readBegin = System.nanoTime();

        final SimpleTable reloaded = SimpleTable.load("test");
        sumColumns(reloaded);

        final long readEnd = System.nanoTime();
        final double writeSeconds = (readBegin - writeBegin) / 1e9;
        final double readSeconds = (readEnd - readBegin) / 1e9;
        System.out.printf("Took %.3f seconds to write and %.3f seconds to read %,d rows and %,d columns%n",
                writeSeconds, readSeconds, reloaded.rows(), reloaded.columns());
        reloaded.close();
    }

    /** Adds 50 columns and fills every slot with (column index + slot index). */
    private static void fillColumns(SimpleTable table) throws IOException {
        int column = 0;
        while (column < 50) {
            final FloatBuffer values = table.addColumn();
            final int slots = values.capacity();
            for (int slot = 0; slot < slots; slot++) {
                values.put(slot, column + slot);
            }
            column++;
        }
    }

    /** Reads back the first 50 columns and checks (under -ea) that each sum is positive. */
    private static void sumColumns(SimpleTable table) {
        int column = 0;
        while (column < 50) {
            final FloatBuffer values = table.getColumn(column);
            final int slots = values.capacity();
            double total = 0;
            for (int slot = 0; slot < slots; slot++) {
                total += values.get(slot);
            }
            assert total > 0;
            column++;
        }
    }
}

打印

Took 2.070 seconds to write and 2.206 seconds to read 3,000,000 rows and 50 columns 
相關問題