package driver;
import java.io.IOException;
import mapper.NormalMapper;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import data.Conf;
import data.Record;
import reducer.NormalReducer;
/**
 * Driver of NormalJoin (also called BasicJoin).
 * Implements the JoinDriver interface.
 */
public class NormalJoin implements JoinDriver{
public static void main(String[] args) throws ClassNotFoundException,
IOException, InterruptedException {
if (args.length != 3) {
.println("Usage: Join <left-side table path> <right-side table path> <output path>");
String userDir = System.getProperty("user.dir");
Conf conf = new Conf();
if (!conf.loadConf(userDir + "/conf.properties")) { // TODO
System.err.println("Failed in loading configuration file, exit");
new NormalJoin().join(args, conf);
/**
 * Sets up the "Equal Join" job: registers both table paths as job inputs,
 * sets the output path and copies join settings from {@code conf} into the
 * job configuration.
 *
 * @param args {@code args[0]} = left-side table path, {@code args[1]} =
 *             right-side table path, {@code args[2]} = output path
 * @param conf source of the output separator and the eliminate-duplicate flag
 * @throws IOException declared by the signature
 */
public void join(String[] args, Conf conf) throws IOException {
JobConf job = new JobConf(NormalJoin.class);
job.setJobName("Equal Join");
Path inLeft = new Path(args[0]);
Path inRight = new Path(args[1]);
Path out = new Path(args[2]);
// Both tables are inputs of the same job.
FileInputFormat.addInputPath(job, inLeft);
FileInputFormat.addInputPath(job, inRight);
FileOutputFormat.setOutputPath(job, out);
// Job-level configuration values.
// Publishes the left input path under "inputNameLeft"; presumably read by
// the mapper to tell the two sides apart — verify against NormalMapper.
job.set("inputNameLeft", inLeft.toString());
// Hadoop's Configuration.deprecation logger reports this key as deprecated.
// NOTE(review): consider switching to the replacement key it names.
job.set("mapred.textoutputformat.separator", conf.separator);
job.setBoolean("mapred.conf.eliminateDuplicate", conf.eliminateDuplicate);
// NOTE(review): no mapper/reducer registration, job submission or closing
// brace is visible after this point — confirm the remainder of the method.
/*
Sample console output from one run of this job:

15/08/03 04:29:47 INFO Configuration.deprecation:
mapred.textoutputformat.separator is deprecated. Instead, use
mapreduce.output.textoutputformat.separator
15/08/03 04:29:47 INFO client.RMProxy: Connecting to ResourceManager at /
15/08/03 04:29:47 INFO client.RMProxy: Connecting to ResourceManager at /
15/08/03 04:29:48 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/08/03 04:29:50 INFO mapred.FileInputFormat: Total input paths to process : 2
15/08/03 04:29:50 INFO mapreduce.JobSubmitter: number of splits:3
15/08/03 04:29:50 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1437465092759_0002
15/08/03 04:29:51 INFO impl.YarnClientImpl: Submitted application application_1437465092759_0002
15/08/03 04:29:51 INFO mapreduce.Job: The url to track the job: http://quickstart.cloudera:8088/proxy/application_1437465092759_0002/
15/08/03 04:29:51 INFO mapreduce.Job: Running job: job_1437465092759_0002
15/08/03 04:30:04 INFO mapreduce.Job: Job job_1437465092759_0002 running in uber mode : false
15/08/03 04:30:04 INFO mapreduce.Job: map 0% reduce 0%
15/08/03 04:30:20 INFO mapreduce.Job: map 33% reduce 0%
15/08/03 04:30:22 INFO mapreduce.Job: map 67% reduce 0%
15/08/03 04:30:23 INFO mapreduce.Job: map 100% reduce 0%
15/08/03 04:30:28 INFO mapreduce.Job: map 100% reduce 100%
15/08/03 04:30:28 INFO mapreduce.Job: Job job_1437465092759_0002 completed successfully
15/08/03 04:30:28 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=5768091
FILE: Number of bytes written=11979199
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=5283057
HDFS: Number of bytes written=0
HDFS: Number of read operations=12
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=3
Launched reduce tasks=1
Data-local map tasks=3
Total time spent by all maps in occupied slots (ms)=44449
Total time spent by all reduces in occupied slots (ms)=5532
Total time spent by all map tasks (ms)=44449
Total time spent by all reduce tasks (ms)=5532
Total vcore-seconds taken by all map tasks=44449
Total vcore-seconds taken by all reduce tasks=5532
Total megabyte-seconds taken by all map tasks=45515776
Total megabyte-seconds taken by all reduce tasks=5664768
Map-Reduce Framework
Map input records=69495
Map output records=69495
Map output bytes=5629095
Map output materialized bytes=5768103
Input split bytes=327
Combine input records=0
Combine output records=0
Reduce input groups=55273
Reduce shuffle bytes=5768103
Reduce input records=69495
Reduce output records=0
Spilled Records=138990
Shuffled Maps =3
Failed Shuffles=0
Merged Map outputs=3
GC time elapsed (ms)=672
CPU time spent (ms)=4400
Physical memory (bytes) snapshot=805781504
Virtual memory (bytes) snapshot=6027804672
Total committed heap usage (bytes)=557592576
Shuffle Errors
File Input Format Counters
Bytes Read=5282730
File Output Format Counters
Bytes Written=0
*/
