2013-11-28 39 views
0

我正在做一項任務並學習如何使用Hadoop。我試圖使用維基百科頁面,但在嘗試設置輸入格式時仍然出現錯誤。這裏是我的代碼:Hadoop WikipediaPageInputFormat

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;

import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat;
import edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormatOld;



public class InvertedIndex { 

    public static class InvertedIndexMapper extends MapReduceBase 
    implements Mapper<LongWritable, WikipediaPage, Text, Text> { 

    private Text title = new Text(); 
    private Text word = new Text(); 

     public void map(LongWritable key, WikipediaPage value, 
       OutputCollector<Text, Text> output, Reporter report) 
       throws IOException { 

      String articleTitle = value.getTitle(); 
      title.set(articleTitle); 

      String content = value.getContent(); 
      String[] words = content.split(" "); 

      for (String s : words) { 
       word.set(s); 
       output.collect(word, title); 
      } 
     } 
    } 

    public static class InvertedIndexReducer extends MapReduceBase 
     implements Reducer<Text, Text, Text, Text> { 

     public void reduce (Text key, Iterator<Text> values, 
       OutputCollector<Text, Text> output, Reporter report) 
       throws IOException { 

      Set<String> articlesSet = new HashSet<String>(); 
      Text articleNames = new Text(); 

      while (values.hasNext()) { 
       articlesSet.add(values.toString()); 
      } 

      String names = ""; 

      for (String s : articlesSet) { 
       names += s + ", "; 
      } 

      articleNames.set(names); 

      output.collect(key, articleNames); 

     } 
    } 

    public static void main (String[] args) throws Exception { 
     JobConf conf = new JobConf(InvertedIndex.class); 
     conf.setJobName("InvertedIndex"); 

     conf.setInputFormat(WikipediaPageInputFormat.class); 
     conf.setOutputFormat(TextOutputFormat.class); 

     conf.setMapperClass(InvertedIndexMapper.class); 
     conf.setReducerClass(InvertedIndexReducer.class); 
     conf.setMapOutputKeyClass(Text.class); 
     conf.setMapOutputValueClass(Text.class); 
     conf.setOutputKeyClass(Text.class); 
     conf.setOutputValueClass(Text.class); 

     FileInputFormat.setInputPaths(conf, new Path(args[0])); 
     FileOutputFormat.setOutputPath(conf, new Path(args[1])); 

     long startTime = System.currentTimeMillis(); 
     JobClient.runJob(conf); 
     System.out.println("Job finished in :" + (System.currentTimeMillis() - startTime)/1000 + " seconds"); 
    } 
} 

我使用Eclipse和獲得在該行conf.setInputFormat(WikipediaPageInputFormat.class)以下的錯誤;

The method setInputFormat(Class&lt;? extends InputFormat&gt;) in the type JobConf is not applicable for the arguments (Class&lt;WikipediaPageInputFormat&gt;)

當我嘗試編譯我收到以下錯誤:

InvertedIndex.java:81: cannot find symbol

symbol  : method setInputFormatClass(java.lang.Class&lt;edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat&gt;)

location: class org.apache.hadoop.mapred.JobConf

conf.setInputFormatClass(WikipediaPageInputFormat.class);

我將不勝感激如何糾正這一點,如果某人以前經歷過的任何幫助或建議。

謝謝!

回答

1

WikipediaPageInputFormat是新的Hadoop API(org.apache.hadoop.mapreduce。*)的一個類,所以您應該使用Job類而不是JobConf來配置您的作業。 爲了方便,Cloud9還爲舊API提供了WikipediaPageInputFormatOld。