How to run the Mahout in Action examples

I am trying to run the hello world example from chapter 7 of Mahout in Action. I created the following class in Eclipse and then packaged it into a jar:

package com.mycode.mahout;
import java.io.File; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.List; 

import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.fs.FileSystem; 
import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.io.IntWritable; 
import org.apache.hadoop.io.LongWritable; 
import org.apache.hadoop.io.SequenceFile; 
import org.apache.hadoop.io.Text; 
import org.apache.mahout.clustering.WeightedVectorWritable; 
import org.apache.mahout.clustering.kmeans.Cluster; 
import org.apache.mahout.clustering.kmeans.KMeansDriver; 
import org.apache.mahout.common.distance.EuclideanDistanceMeasure; 
import org.apache.mahout.math.RandomAccessSparseVector; 
import org.apache.mahout.math.Vector; 
import org.apache.mahout.math.VectorWritable; 

public class SimpleKMeansClustering { 
    public static final double[][] points = { {1, 1}, {2, 1}, {1, 2}, 
              {2, 2}, {3, 3}, {8, 8}, 
              {9, 8}, {8, 9}, {9, 9}}; 

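    // Write the input vectors to a SequenceFile<LongWritable, VectorWritable> that the k-means job can read.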
    public static void writePointsToFile(List<Vector> points, 
             String fileName, 
             FileSystem fs, 
             Configuration conf) throws IOException { 
    Path path = new Path(fileName); 
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, 
     path, LongWritable.class, VectorWritable.class); 
    long recNum = 0; 
    VectorWritable vec = new VectorWritable(); 
    for (Vector point : points) { 
     vec.set(point); 
     writer.append(new LongWritable(recNum++), vec); 
    } 
    writer.close(); 
    } 

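    // Wrap the raw double[][] coordinates in Mahout Vector instances.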
    public static List<Vector> getPoints(double[][] raw) { 
    List<Vector> points = new ArrayList<Vector>(); 
    for (int i = 0; i < raw.length; i++) { 
     double[] fr = raw[i]; 
     Vector vec = new RandomAccessSparseVector(fr.length); 
     vec.assign(fr); 
     points.add(vec); 
    } 
    return points; 
    } 

    public static void main(String args[]) throws Exception { 

    int k = 2; 

    List<Vector> vectors = getPoints(points); 

    File testData = new File("testdata"); 
    if (!testData.exists()) { 
     testData.mkdir(); 
    } 
    testData = new File("testdata/points"); 
    if (!testData.exists()) { 
     testData.mkdir(); 
    } 

    Configuration conf = new Configuration(); 
    FileSystem fs = FileSystem.get(conf); 
    writePointsToFile(vectors, "testdata/points/file1", fs, conf); 

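    // Seed the algorithm: write k initial clusters, centered on the first k points.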
    Path path = new Path("testdata/clusters/part-00000"); 
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, 
     path, Text.class, Cluster.class); 

    for (int i = 0; i < k; i++) { 
     Vector vec = vectors.get(i); 
     Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure()); 
     writer.append(new Text(cluster.getIdentifier()), cluster); 
    } 
    writer.close(); 

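    // Run k-means: Euclidean distance, convergence delta 0.001, at most 10 iterations, then cluster the points.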
    KMeansDriver.run(conf, new Path("testdata/points"), new Path("testdata/clusters"), 
     new Path("output"), new EuclideanDistanceMeasure(), 0.001, 10, 
     true, false); 

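    // Read the clustered points back and print each vector with the id of the cluster it belongs to.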
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, 
     new Path("output/" + Cluster.CLUSTERED_POINTS_DIR 
       + "/part-m-00000"), conf); 

    IntWritable key = new IntWritable(); 
    WeightedVectorWritable value = new WeightedVectorWritable(); 
    while (reader.next(key, value)) { 
     System.out.println(value.toString() + " belongs to cluster " 
         + key.toString()); 
    } 
    reader.close(); 
    } 

} 

I packaged it as myjob.jar.

Now how do I execute this on my cluster?

I tried the following:

hadoop jar myjob.jar com.mycode.mahout.SimpleKMeansClustering 
java -jar myjob.jar 
java -cp myjob.jar 

I get the following error:

[[email protected] tmp]# hadoop jar mahoutfirst.jar com.mahout.emc.SimpleKMeansClustering 
    Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/mahout/math/Vector 
     at java.lang.Class.forName0(Native Method) 
     at java.lang.Class.forName(Class.java:270) 
     at org.apache.hadoop.util.RunJar.main(RunJar.java:201) 
    Caused by: java.lang.ClassNotFoundException: org.apache.mahout.math.Vector 
     at java.net.URLClassLoader$1.run(URLClassLoader.java:366) 
     at java.net.URLClassLoader$1.run(URLClassLoader.java:355) 
     at java.security.AccessController.doPrivileged(Native Method) 
     at java.net.URLClassLoader.findClass(URLClassLoader.java:354) 
     at java.lang.ClassLoader.loadClass(ClassLoader.java:424) 
     at java.lang.ClassLoader.loadClass(ClassLoader.java:357) 
     ... 3 more 

Please advise on the correct way to run code written using Mahout.

Answers

0

Looking at the NoClassDefFoundError above, it seems you need to include the Mahout-related jars (mahout-core.jar, I guess) with your Hadoop job.

To make the jars available to the mappers across the cluster, you probably need to use the DistributedCache or Hadoop's -libjars option. The idea behind the latter is explained here.
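One thing worth adding: -libjars is only honored when the driver parses Hadoop's generic options through ToolRunner, which the question's main() does not do. Below is a minimal sketch of such a wrapper; the class name SimpleKMeansDriver and the jar paths in the launch command are illustrative assumptions, not names from the question:

    package com.mycode.mahout;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    // Hypothetical wrapper: ToolRunner strips generic options such as -libjars
    // before run() sees the remaining program arguments.
    public class SimpleKMeansDriver extends Configured implements Tool {
        @Override
        public int run(String[] args) throws Exception {
            Configuration conf = getConf(); // already carries the -libjars setting
            // ... move the body of SimpleKMeansClustering.main() here,
            // using this conf instead of new Configuration() ...
            return 0;
        }

        public static void main(String[] args) throws Exception {
            System.exit(ToolRunner.run(new Configuration(), new SimpleKMeansDriver(), args));
        }
    }

which would be launched along the lines of:

    hadoop jar myjob.jar com.mycode.mahout.SimpleKMeansDriver \
        -libjars /usr/lib/mahout/mahout-core-0.5-cdh3u3.jar,/usr/lib/mahout/mahout-math-0.5-cdh3u3.jar

Note that -libjars ships the jars to the task JVMs; the client-side JVM may still need them on HADOOP_CLASSPATH, otherwise the driver itself fails with the same NoClassDefFoundError.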

3

Even though this is pretty late, I faced a similar problem, and the following approach did the job for me, since I did not want to use Maven:

1) Go to your Mahout installation directory and look for the *job.jar files:

ls /usr/lib/mahout/ 
conf lib  mahout-core-0.5-cdh3u3-job.jar mahout-examples-0.5-cdh3u3-job.jar mahout-taste-webapp-0.5-cdh3u3.war 

2) Copy mahout-core-0.5-cdh3u3-job.jar to the directory where your code resides.

3) Use the "job" jar file provided by Mahout; it packages all of the dependencies. You also need to add your class to it. Since you compiled your class against the hadoop and mahout libraries, your .class file is ready to go.
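As a quick sanity check, you can confirm that the job jar really bundles the Mahout math classes from the error before going further, for example:

    jar tf mahout-core-0.5-cdh3u3-job.jar | grep org/apache/mahout/math/Vector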

4) Add your class file to the job jar mahout-core-0.5-cdh3u3-job.jar in your directory:

jar uf mahout-core-0.5-cdh3u3-job.jar SimpleKMeansClustering.class 
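Note that jar uf stores the file under exactly the path you pass it. If your class declares a package, as com.mycode.mahout in the question does, add the .class file with its package directories intact and launch it by its fully qualified name (a sketch, assuming the compiled class sits under com/mycode/mahout/):

    jar uf mahout-core-0.5-cdh3u3-job.jar com/mycode/mahout/SimpleKMeansClustering.class
    hadoop jar mahout-core-0.5-cdh3u3-job.jar com.mycode.mahout.SimpleKMeansClustering

The bare class name in the commands here works only if the class was compiled without a package declaration.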

5) Run your code using hadoop jar:

hadoop jar mahout-core-0.5-cdh3u3-job.jar SimpleKMeansClustering 

6) At the end of your map-reduce job, you should see:

1.0: [1.000, 1.000] belongs to cluster 0 
1.0: [2.000, 1.000] belongs to cluster 0 
1.0: [1.000, 2.000] belongs to cluster 0 
1.0: [2.000, 2.000] belongs to cluster 0 
1.0: [3.000, 3.000] belongs to cluster 0 
1.0: [8.000, 8.000] belongs to cluster 1 
1.0: [9.000, 8.000] belongs to cluster 1 
1.0: [8.000, 9.000] belongs to cluster 1 
1.0: [9.000, 9.000] belongs to cluster 1