2017-10-09

I am trying to solve the following problem with Hadoop, but I am getting a java.lang.ArrayIndexOutOfBoundsException: 2 error in my MapReduce job.

Find the top ten businesses by average rating; the business with the highest average rating should come first. Recall that the fourth column in the review.csv file represents the rating.

My Java code:

package bd; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.Collections; 
import java.util.List; 
import java.util.Map.Entry; 
import java.util.TreeMap; 
import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.io.*; 
import org.apache.hadoop.mapreduce.Job; 
import org.apache.hadoop.mapreduce.Mapper; 
import org.apache.hadoop.mapreduce.Reducer; 
import org.apache.hadoop.mapreduce.Mapper.Context; 
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
import org.apache.hadoop.util.GenericOptionsParser; 


    public class TopTenRatedBusiness { 

     /* 
     * Mapper Class : BusinessRatingMapper 
     * Class BusinessRatingMapper parses review.csv file and emits business id and respective rating 
     */ 
     public static class BusinessRatingMapper extends Mapper<LongWritable, Text, Text, FloatWritable> { 
      /* 
      * Map function that emits a business ID as a key and rating as a value 
      */ 
      @Override 
      protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 

       String reviews[] = value.toString().split("::"); 
       /* 
       * reviews[2] gives business id and reviews[3] gives business rating 
       */ 
       context.write(new Text(reviews[2]), new FloatWritable(Float.parseFloat(reviews[3]))); 

      } 
     } 

     /* 
     * Reducer class: TopRatedBusinessReducer 
     * Class TopRatedBusinessReducer emits top 10 business id with their average rating 
     */ 
     static TreeMap<Float, List<Text>> reviewID = new TreeMap<Float, List<Text>>(Collections.reverseOrder()); 

     public static class BusinessRatingReducer extends Reducer<Text, FloatWritable, Text, FloatWritable> { 

      /* 
      * Reduce function 
      */ 
      public void reduce(Text key, Iterable<FloatWritable> values, Context context)throws IOException, InterruptedException { 
       float sumOfRatings = 0; 
       int countOfRatings = 0; 
       for (FloatWritable value : values) { 
        sumOfRatings += value.get(); 
        countOfRatings++; 
       } 

       Float averageRating = sumOfRatings/countOfRatings; 

       if (reviewID.containsKey(averageRating)) { 
        reviewID.get(averageRating).add(new Text(key.toString())); 
       } else { 
        List<Text> businessIDList = new ArrayList<Text>(); 
        businessIDList.add(new Text(key.toString())); 

        /* 
        * Putting average rating and corresponding business ID 
        */ 
        reviewID.put(averageRating, businessIDList); 
       } 
      } 


      @Override 
      protected void cleanup(Reducer<Text, FloatWritable, Text, FloatWritable>.Context context)throws IOException, InterruptedException { 

       int count=0; 
       for(Entry<Float, List<Text>> entry : reviewID.entrySet()) { 
        if(count > 10){ 
         break; 
        } 

       FloatWritable result=new FloatWritable(); 
       result.set(entry.getKey()); 

       for (int i = 0; i <entry.getValue().size(); i++) { 
         if (count >= 10) { 
          break; 
         } 
         context.write(new Text(entry.getValue().get(i).toString()), result); 
         count++; 
        } 

       } 

      } 
     } 

      /* 
      * Driver Program 
      */ 

      public static void main(String[] args) throws IOException,ClassNotFoundException, InterruptedException, NoSuchMethodException { 

       Configuration conf = new Configuration(); 
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 
       if (otherArgs.length != 2) { 
        System.err.println("Usage: TopTenRatedBusiness <in> <out>"); 
        System.exit(2); 

       } 
       /* 
       * Create a job with name "TopTenRatedBusiness" 
       */ 

       Job job = new Job(conf, "TopTenRatedBusiness"); 
       job.setJarByClass(TopTenRatedBusiness.class); 

       job.setMapperClass(BusinessRatingMapper.class); 
       job.setMapOutputKeyClass(Text.class); 
       job.setMapOutputValueClass(FloatWritable.class); 

       job.setReducerClass(BusinessRatingReducer.class); 
       job.setOutputKeyClass(Text.class); 
       job.setOutputValueClass(FloatWritable.class); 

       FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 
       System.exit(job.waitForCompletion(true) ? 0 : 1); 

     } 

} 

My dataset:

The review.csv file contains the star ratings users have given to businesses. Use user_id to associate a review with other reviews written by the same user, and use business_id to associate it with other reviews of the same business.

review.csv contains the following columns: "review_id"::"user_id"::"business_id"::"stars" 
'review_id': a unique identifier for the review 
'user_id': the identifier of the authoring user 
'business_id': the identifier of the reviewed business 
'stars': the star rating (integer 1-5) given by the user to the business 

I get the following error when I run this:

17/10/09 21:18:33 INFO input.FileInputFormat: Total input paths to process : 1 
17/10/09 21:18:33 INFO util.NativeCodeLoader: Loaded the native-hadoop library 
17/10/09 21:18:33 WARN snappy.LoadSnappy: Snappy native library not loaded 
17/10/09 21:18:34 INFO mapred.JobClient: Running job: job_201710090351_0033 
17/10/09 21:18:35 INFO mapred.JobClient: map 0% reduce 0% 
17/10/09 21:18:41 INFO mapred.JobClient: Task Id : attempt_201710090351_0033_m_000000_0, Status : FAILED 
java.lang.ArrayIndexOutOfBoundsException: 2 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:37) 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:26) 
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145) 
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764) 
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364) 
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255) 
    at java.security.AccessController.doPrivileged(Native Method) 
    at javax.security.auth.Subject.doAs(Subject.java:422) 
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190) 
    at org.apache.hadoop.mapred.Child.main(Child.java:249) 

17/10/09 21:18:47 INFO mapred.JobClient: Task Id : attempt_201710090351_0033_m_000000_1, Status : FAILED 
java.lang.ArrayIndexOutOfBoundsException: 2 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:37) 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:26) 
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145) 
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764) 
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364) 
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255) 
    at java.security.AccessController.doPrivileged(Native Method) 
    at javax.security.auth.Subject.doAs(Subject.java:422) 
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190) 
    at org.apache.hadoop.mapred.Child.main(Child.java:249) 

17/10/09 21:18:52 INFO mapred.JobClient: Task Id : attempt_201710090351_0033_m_000000_2, Status : FAILED 
java.lang.ArrayIndexOutOfBoundsException: 2 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:37) 
    at bd.TopTenRatedBusiness$BusinessRatingMapper.map(TopTenRatedBusiness.java:26) 
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145) 
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764) 
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364) 
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255) 
    at java.security.AccessController.doPrivileged(Native Method) 
    at javax.security.auth.Subject.doAs(Subject.java:422) 
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190) 
    at org.apache.hadoop.mapred.Child.main(Child.java:249) 

17/10/09 21:18:58 INFO mapred.JobClient: Job complete: job_201710090351_0033 
17/10/09 21:18:58 INFO mapred.JobClient: Counters: 7 
17/10/09 21:18:58 INFO mapred.JobClient: Job Counters 
17/10/09 21:18:58 INFO mapred.JobClient:  Launched map tasks=4 
17/10/09 21:18:58 INFO mapred.JobClient:  SLOTS_MILLIS_REDUCES=0 
17/10/09 21:18:58 INFO mapred.JobClient:  Total time spent by all reduces waiting after reserving slots (ms)=0 
17/10/09 21:18:58 INFO mapred.JobClient:  Failed map tasks=1 
17/10/09 21:18:58 INFO mapred.JobClient:  SLOTS_MILLIS_MAPS=23391 
17/10/09 21:18:58 INFO mapred.JobClient:  Total time spent by all maps waiting after reserving slots (ms)=0 
17/10/09 21:18:58 INFO mapred.JobClient:  Data-local map tasks=4 

A few sample input lines:

0xuZfa0t4MNWd3eIFF02ug::kT43SxDgMGzbeXpO51f0hQ::wbpbaWBfU54JbjLIDwERQA::5.0 
bBqVqhOvNgFs8I1Wk68QUQ::T9hGHsbJW9Hw1cJAlIAWmw::4iTRjN_uAdAb7_YZDVHJdg::5.0 
fu7TcxnAOdnbdLcyFhMmZg::Z_WAxc4RUpKp3y12BH1bEg::qw5gR8vW7mSOK4VROSwdMA::4.0 
LMy8UOKOeh0b9qrz-s1fQA::OlMjqqzWZUv2-62CSqKq_A::81IjU5L-t-QQwsE38C63hQ::4.0 
JjyRj9EiBXQTFDQAxRtt4g::fs5bpfk-2pvq2v8S1De5pQ::Hnz1_h_D1eHSRtQqHSCZkw::2.0 
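Each of these well-formed lines splits into exactly four fields on "::", with the business id at index 2 and the star rating at index 3, which is what my mapper reads. Just to illustrate the format (this is not part of the job, and SplitCheck is only a throwaway name), a standalone check would look like:

public class SplitCheck { 
    public static void main(String[] args) { 
        // First sample line from review.csv 
        String line = "0xuZfa0t4MNWd3eIFF02ug::kT43SxDgMGzbeXpO51f0hQ::wbpbaWBfU54JbjLIDwERQA::5.0"; 
        String[] fields = line.split("::"); 
        System.out.println("field count = " + fields.length); // 4 for a well-formed line 
        System.out.println("business_id = " + fields[2]);     // wbpbaWBfU54JbjLIDwERQA 
        System.out.println("stars       = " + fields[3]);     // 5.0 
    } 
} 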

Answers


Your code works fine with the sample input.

So it looks like the problem is with your data: somewhere there is a malformed line that the mapper cannot handle. Check whether the file has a header row, or you may need to look through the complete file.

Another thing you can check is that the input directory path you supply contains only the review.csv file and nothing else.
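If you want the job to simply tolerate such lines (header rows, blank lines, or anything without all four columns), one option is to guard the split in map(). This is only a sketch that assumes malformed records can safely be dropped; you may prefer to fail loudly instead:

@Override 
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 
    String[] reviews = value.toString().split("::"); 
    // Skip any record that does not have all four columns. 
    if (reviews.length < 4) { 
        return; 
    } 
    try { 
        // reviews[2] is the business id, reviews[3] is the star rating. 
        context.write(new Text(reviews[2]), new FloatWritable(Float.parseFloat(reviews[3]))); 
    } catch (NumberFormatException e) { 
        // The rating column was not numeric (for example a header line); ignore this record. 
    } 
} 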


There is no header. Can you guide me further? – user8747066


I googled it and found https://github.com/patilankita79/Analysis-of-Yelp-Dataset-Using-Pig-Latin. Is this the input file (review.csv) you are running? –


I ran review.csv with the code you wrote and it completed successfully. What is the structure of your input path? The input path directory should contain only the review.csv file. –


This line is what gives you the error:

context.write(new Text(reviews[2]), new FloatWritable(Float.parseFloat(reviews[3]))); 

Try using a debugger to track down the problem.
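Attaching a debugger to a distributed map task is awkward, so an alternative (only a sketch, using the same split as your code) is to count malformed lines with a Hadoop counter and print them to the task log, which will show you exactly which input lines are breaking:

@Override 
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 
    String[] reviews = value.toString().split("::"); 
    if (reviews.length < 4) { 
        // Shows up under the job counters and in this task attempt's stderr log. 
        context.getCounter("DataQuality", "MALFORMED_LINES").increment(1); 
        System.err.println("Bad line at byte offset " + key.get() + ": " + value); 
        return; 
    } 
    context.write(new Text(reviews[2]), new FloatWritable(Float.parseFloat(reviews[3]))); 
} 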


Could you please suggest the changes? – user8747066


I am new to Hadoop. – user8747066


Did you ever get this resolved? I seem to be stuck with the same issue. – Manas
