2012-12-27 176 views
2

請指教我如何將文本文件轉換爲.arff格式(weka) ,因爲我想對1000個文本文件進行數據聚類。如何將文本文件轉換爲.arff格式(weka)

問候

+1

另請參閱:http://stackoverflow.com/questions/13915683/how-to-convert-a-log-system-file-into-arff-format 請注意,Weka 只在分類方面表現良好,它的聚類功能幾乎不存在。 –

回答

0

下面是您可以使用的代碼:

package text.Classification; 
import java.io.*; 
import weka.core.*; 
/**
 * Converts a directory of {@code .txt} files into a Weka dataset that can be
 * written out in ARFF format.
 *
 * <p>The resulting relation has two attributes: a string attribute
 * {@code contents} holding the full text of each file, and a nominal attribute
 * {@code class} whose declared values are {@code class1}, {@code class2} and
 * {@code class3}.
 */
public class TextDirectoryToArff {

    /** Values declared for the nominal {@code class} attribute. */
    private static final String[] CLASS_LABELS = { "class1", "class2", "class3" };

    /** Charset used to decode the input files; matches the charset used for the ARFF output. */
    private static final String CHARSET = "UTF-8";

    /** Input directory used by {@link #main} when no argument is supplied. */
    private static final String DEFAULT_INPUT_DIR =
            "/home/asadul/Desktop/Downloads/text_example/class5";

    /** Output file used by {@link #main} when no second argument is supplied. */
    private static final String DEFAULT_OUTPUT_FILE =
            "/home/asadul/Desktop/Downloads/text_example/abc.arff";

    /**
     * Builds a dataset with one instance per {@code .txt} file found directly
     * inside the given directory (sub-directories are not descended into).
     *
     * @param directoryPath path of the directory containing the text files
     * @return the populated dataset; it has no instances when the directory
     *         contains no {@code .txt} files
     * @throws Exception if the path cannot be listed as a directory or a file
     *         cannot be read
     */
    public Instances createDataset(String directoryPath) throws Exception {
        FastVector classValues = new FastVector();
        for (String label : CLASS_LABELS) {
            classValues.addElement(label);
        }

        FastVector atts = new FastVector();
        // A null value vector makes this a string attribute rather than a nominal one.
        atts.addElement(new Attribute("contents", (FastVector) null));
        atts.addElement(new Attribute("class", classValues));

        Instances data = new Instances("MyRelation", atts, 0);
        // Echo the (still empty) header so the attribute layout is visible on the console.
        System.out.println(data);

        File dir = new File(directoryPath);
        String[] files = dir.list();
        if (files == null) {
            // File.list() returns null instead of throwing when the path is
            // missing, is not a directory, or cannot be read.
            throw new IOException("Cannot list directory: " + directoryPath);
        }

        for (int i = 0; i < files.length; i++) {
            if (!files[i].endsWith(".txt")) {
                continue;
            }
            double[] values = new double[2];
            String text = readFile(new File(directoryPath + File.separator + files[i]));
            values[0] = data.attribute(0).addStringValue(text);

            // NOTE(review): labels are assigned round-robin by directory index, and
            // the modulus is (length - 1), so the last label ("class3") is never
            // used. This looks like a placeholder; replace it with the real labelling
            // rule (or `i % CLASS_LABELS.length`) once the intent is confirmed.
            int labelIndex = i % (CLASS_LABELS.length - 1);
            values[1] = classValues.indexOf(CLASS_LABELS[labelIndex]);

            data.add(new Instance(1.0, values));
        }
        return data;
    }

    /**
     * Reads a whole file into a string, always closing the underlying stream.
     *
     * @param file the file to read
     * @return the decoded file contents
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(File file) throws IOException {
        Reader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), CHARSET));
        try {
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                text.append(buffer, 0, read);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Converts a directory of text files to an ARFF file.
     *
     * @param args optional: {@code args[0]} is the input directory and
     *        {@code args[1]} the output ARFF file; the built-in default paths
     *        are used for any argument that is not supplied
     */
    public static void main(String[] args) {
        String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT_DIR;
        String outputFile = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;

        TextDirectoryToArff tdta = new TextDirectoryToArff();
        try {
            Instances dataset = tdta.createDataset(inputDir);
            PrintWriter fileWriter = new PrintWriter(outputFile, CHARSET);
            try {
                fileWriter.println(dataset);
                // PrintWriter swallows I/O errors; surface them explicitly.
                if (fileWriter.checkError()) {
                    throw new IOException("Failed to write " + outputFile);
                }
            } finally {
                fileWriter.close();
            }
        } catch (Exception e) {
            System.err.println(e.getMessage());
            e.printStackTrace();
        }
    }

}