我正在做一項作業,同時學習如何使用 Hadoop。我想以維基百科頁面作為輸入,但在設置輸入格式(Hadoop WikipediaPageInputFormat)時一直出現錯誤。以下是我的代碼:
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat;
public class InvertedIndex {
public static class InvertedIndexMapper extends MapReduceBase
implements Mapper<LongWritable, WikipediaPage, Text, Text> {
private Text title = new Text();
private Text word = new Text();
public void map(LongWritable key, WikipediaPage value,
OutputCollector<Text, Text> output, Reporter report)
throws IOException {
String articleTitle = value.getTitle();
title.set(articleTitle);
String content = value.getContent();
String[] words = content.split(" ");
for (String s : words) {
word.set(s);
output.collect(word, title);
}
}
}
public static class InvertedIndexReducer extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce (Text key, Iterator<Text> values,
OutputCollector<Text, Text> output, Reporter report)
throws IOException {
Set<String> articlesSet = new HashSet<String>();
Text articleNames = new Text();
while (values.hasNext()) {
articlesSet.add(values.toString());
}
String names = "";
for (String s : articlesSet) {
names += s + ", ";
}
articleNames.set(names);
output.collect(key, articleNames);
}
}
public static void main (String[] args) throws Exception {
JobConf conf = new JobConf(InvertedIndex.class);
conf.setJobName("InvertedIndex");
conf.setInputFormat(WikipediaPageInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setMapperClass(InvertedIndexMapper.class);
conf.setReducerClass(InvertedIndexReducer.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
long startTime = System.currentTimeMillis();
JobClient.runJob(conf);
System.out.println("Job finished in :" + (System.currentTimeMillis() - startTime)/1000 + " seconds");
}
}
我使用 Eclipse,在 conf.setInputFormat(WikipediaPageInputFormat.class) 這一行得到以下錯誤:
The method setInputFormat(Class<? extends InputFormat>) in the type JobConf is not applicable for the arguments (Class<WikipediaPageInputFormat>)
當我嘗試編譯時,收到以下錯誤:
InvertedIndex.java:81: cannot find symbol
symbol  : method setInputFormatClass(java.lang.Class<edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat>)
location: class org.apache.hadoop.mapred.JobConf
conf.setInputFormatClass(WikipediaPageInputFormat.class);
如果有人以前遇到過這個問題,對於如何修正,任何幫助或建議我都將不勝感激。
謝謝!