
Writing a Hadoop sequence file

I have a text file containing (key, value) data in the following format:

1,34 
5,67 
8,88 

The file is located on the local file system.

I want to convert it into a Hadoop sequence file, again on the local file system, so that I can use it in Mahout. The sequence file should contain all the records. For example, for the first record, 1 is the key and 34 is the value, and the same applies to the other records.

I am new to Java, so any help would be appreciated.

Thanks.


You should check Sanjay Subramanian's answer to a similar question: http://stackoverflow.com/questions/5377118/how-to-convert-txt-file-to-hadoops-sequence-file-format

Answer


Well, I did find a way. Here is the code:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class CreateSequenceFile {
    public static void main(String[] args) throws IOException {
        String myfile = "/home/ashokharnal/keyvalue.txt";
        String outputseqfile = "/home/ashokharnal/part-0000";
        Path path = new Path(outputseqfile);

        // Open the input text file
        BufferedReader br = new BufferedReader(new FileReader(myfile));

        // Create a sequence file writer with LongWritable keys and Text values
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
                LongWritable.class, Text.class);

        String fieldDelimiter = ",";
        String line;
        while ((line = br.readLine()) != null) {
            try {
                // Split each line into key and value and append them to the sequence file
                String[] temp = line.split(fieldDelimiter);
                LongWritable key = new LongWritable(Long.parseLong(temp[0]));
                Text value = new Text(temp[1]);
                writer.append(key, value);
                System.out.println("Appended to sequence file key " + key + " and value " + value);
            } catch (Exception ex) {
                // A malformed line is reported but does not stop the conversion
                ex.printStackTrace();
            }
        }
        writer.close();
        br.close();
    }
}
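
To sanity-check the output before feeding it to Mahout, you can dump the records back out with SequenceFile.Reader. The sketch below is only a verification helper, assuming the same part-0000 path and the classic Reader(fs, path, conf) constructor:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadSequenceFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Assumed path: the output file written by CreateSequenceFile above
        Path path = new Path("/home/ashokharnal/part-0000");

        // Iterate over all (key, value) records in the sequence file and print them
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        LongWritable key = new LongWritable();
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key.get() + " -> " + value.toString());
        }
        reader.close();
    }
}

For the sample input this should print 1 -> 34, 5 -> 67 and 8 -> 88, one record per line.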