2015-10-12 45 views
0

我使用 Pig 做數據準備,遇到一個看似簡單、但我無法解決的問題:如何在 Pig 中產生重複的分組?例如,我有如下名字作爲輸入:

id | name 
------------- 
1 | Alicia 
2 | Ana 
3 | Benita 
4 | Berta 
5 | Bertha 

我期望得到如下輸出(能否用類似 for 循環的功能來實現這一目標?):

id  | name 
-------------------------- 
1_XX_1 | Alicia_id_1 
2_XX_1 | Ana_id_1 
3_XX_1 | Benita_id_1 
4_XX_1 | Berta_id_1 
5_XX_1 | Bertha_id_1 

1_XX_2 | Alicia_id_2 
2_XX_2 | Ana_id_2 
3_XX_2 | Benita_id_2 
4_XX_2 | Berta_id_2 
5_XX_2 | Bertha_id_2 

1_XX_3 | Alicia_id_3 
2_XX_3 | Ana_id_3 
3_XX_3 | Benita_id_3 
4_XX_3 | Berta_id_3 
5_XX_3 | Bertha_id_3 

回答

2

你可以用 UDF 做到這一點:它可以按照你指定的次數複製每一條輸入記錄。下面的 UDF 就能實現。

package pigexerciseudf; 


import java.io.IOException; 
import java.util.ArrayList; 
import java.util.List; 

import org.apache.pig.EvalFunc; 
import org.apache.pig.data.BagFactory; 
import org.apache.pig.data.DataBag; 
import org.apache.pig.data.DataType; 
import org.apache.pig.data.Tuple; 
import org.apache.pig.data.TupleFactory; 
import org.apache.pig.impl.logicalLayer.FrontendException; 
import org.apache.pig.impl.logicalLayer.schema.Schema; 

public class replicateinput extends EvalFunc<DataBag> 
{ 
    public replicateinput() 
    { 

    } 
    int rep_factor=0; 
    public replicateinput(String a) 
    { 
     rep_factor=Integer.parseInt(a); 
    } 

    public DataBag exec(Tuple input) throws IOException 
    { 
     BagFactory bf=BagFactory.getInstance(); 
     DataBag output=bf.newDefaultBag(); 
      try 
      { 
      for(int i=1;i<=rep_factor;i++) 
      { 
       TupleFactory tp=TupleFactory.getInstance(); 
       Tuple t1=tp.newTuple(2); 
       String key=(String)input.get(0); 
       System.out.println("key="+key); 
       String value=(String)input.get(1); 
       String key_out=key+"_XX_"+i; 
       String value_out=value+"_id_"+i; 
       t1.set(0,key_out); 
       t1.set(1,value_out); 
       output.add(t1); 
      } 
      return output; 
      } 
      catch(Exception e) 
      { 
       throw new IOException(e); 
      } 
    } 


    public Schema outputschema(Schema input) 
    { 
     try 
     { 
     List<Schema.FieldSchema> mylist=new ArrayList<Schema.FieldSchema>(); 
     mylist.add(new Schema.FieldSchema("key_out",DataType.CHARARRAY)); 
     mylist.add(new Schema.FieldSchema("value_out",DataType.CHARARRAY)); 
     Schema tupleschema=new Schema(mylist); 
     Schema bagschema=new Schema(new Schema.FieldSchema("pair",tupleschema,DataType.TUPLE)); 
     Schema returnbagsc=new Schema(new Schema.FieldSchema("pairs",bagschema,DataType.BAG)); 
     return returnbagsc; 
     } 
     catch(FrontendException e) 
     { 
      throw new RuntimeException("not able to defime the schema"); 
     } 
    } 
} 

輸入文件:

1,Alicia

2,Ana

3,Benita

4,Berta

5,Bertha

-- Register the jar that contains the pigexerciseudf.replicateinput UDF.
REGISTER '/path/to/pigexerciseudf.jar';
-- Bind the UDF to an alias; '3' is the constructor argument (replication factor).
define replicat pigexerciseudf.replicateinput('3');
-- Each input line is "id,name"; both columns are loaded as chararray.
A = LOAD '/home/hduser/exer.dat' using PigStorage(',') as (a:chararray,b:chararray);
-- FLATTEN turns every tuple of the returned bag into its own row. Each tuple has
-- two fields, so the AS clause names both of them.
B = FOREACH A GENERATE FLATTEN(replicat(a,b)) as (key_out:chararray, value_out:chararray);
dump B;

輸出:

(1_XX_1,Alicia_id_1)

(1_XX_2,Alicia_id_2)

(1_XX_3,Alicia_id_3)

(2_XX_1,Ana_id_1)

(2_XX_2,Ana_id_2)

(2_XX_3,Ana_id_3)

(3_XX_1,Benita_id_1)

(3_XX_2,Benita_id_2)

(3_XX_3,Benita_id_3)

(4_XX_1,Berta_id_1)

(4_XX_2,Berta_id_2)

(4_XX_3,Berta_id_3)

(5_XX_1,Bertha_id_1)

(5_XX_2,Bertha_id_2)

(5_XX_3,Bertha_id_3)

+0

維涅什 -> 感謝您的回覆。我們能用 Java UDF 實現上述解決方案嗎?我的意思是在 Pig 腳本中使用? – swadyada

+0

@swadyada 以上就是用 UDF 實現的。要在你的 Pig 腳本中使用它,必須先註冊包含該 UDF 的 jar,用需要複製的次數來定義函數,然後在 FOREACH 語句中調用它來生成輸出。 –