你可以用 UDF 來做到這一點,它讓你可以靈活地指定輸入需要被複製的次數。下面的 UDF 就能做到。
package pigexerciseudf;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
public class replicateinput extends EvalFunc<DataBag>
{
public replicateinput()
{
}
int rep_factor=0;
public replicateinput(String a)
{
rep_factor=Integer.parseInt(a);
}
public DataBag exec(Tuple input) throws IOException
{
BagFactory bf=BagFactory.getInstance();
DataBag output=bf.newDefaultBag();
try
{
for(int i=1;i<=rep_factor;i++)
{
TupleFactory tp=TupleFactory.getInstance();
Tuple t1=tp.newTuple(2);
String key=(String)input.get(0);
System.out.println("key="+key);
String value=(String)input.get(1);
String key_out=key+"_XX_"+i;
String value_out=value+"_id_"+i;
t1.set(0,key_out);
t1.set(1,value_out);
output.add(t1);
}
return output;
}
catch(Exception e)
{
throw new IOException(e);
}
}
public Schema outputschema(Schema input)
{
try
{
List<Schema.FieldSchema> mylist=new ArrayList<Schema.FieldSchema>();
mylist.add(new Schema.FieldSchema("key_out",DataType.CHARARRAY));
mylist.add(new Schema.FieldSchema("value_out",DataType.CHARARRAY));
Schema tupleschema=new Schema(mylist);
Schema bagschema=new Schema(new Schema.FieldSchema("pair",tupleschema,DataType.TUPLE));
Schema returnbagsc=new Schema(new Schema.FieldSchema("pairs",bagschema,DataType.BAG));
return returnbagsc;
}
catch(FrontendException e)
{
throw new RuntimeException("not able to defime the schema");
}
}
}
輸入文件:
1,Alicia
2,Ana
3,Benita
4,Berta
5,Bertha
-- Make the jar that contains pigexerciseudf.replicateinput available to this script.
REGISTER '/path/to/pigexerciseudf.jar';
-- Bind the UDF to the alias replicat; the constructor argument '3' is the replication factor.
define replicat pigexerciseudf.replicateinput('3');
-- Load comma-separated records as two chararray fields.
A = LOAD '/home/hduser/exer.dat' using PigStorage(',') as (a:chararray,b:chararray);
-- Each tuple in the returned bag has two fields, so name both of them after flattening.
B = FOREACH A GENERATE FLATTEN(replicat(a,b)) as (key_out:chararray, value_out:chararray);
dump B;
輸出:
(1_XX_1,Alicia_id_1)
(1_XX_2,Alicia_id_2)
(1_XX_3,Alicia_id_3)
(2_XX_1,Ana_id_1)
(2_XX_2,Ana_id_2)
(2_XX_3,Ana_id_3)
(3_XX_1,Benita_id_1)
(3_XX_2,Benita_id_2)
(3_XX_3,Benita_id_3)
(4_XX_1,Berta_id_1)
(4_XX_2,Berta_id_2)
(4_XX_3,Berta_id_3)
(5_XX_1,Bertha_id_1)
(5_XX_2,Bertha_id_2)
(5_XX_3,Bertha_id_3)
Vignesh -> 感謝您的回覆。我們可以用 Java UDF 實現上述解決方案嗎?我的意思是在 Pig 腳本中? – swadyada
@swadyada 以上是用 UDF 實現的。爲了在你的 Pig 腳本中使用它,你必須先註冊包含該 UDF 的 jar,再用輸入需要被複製的次數來定義函數,然後在 FOREACH 語句中調用它來生成輸出。 –