4

hadoop mapreduce: java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy()Z

I want to write a snappy block-compressed sequence file from a map-reduce job. I am using Hadoop 2.0.0-cdh4.5.0 and snappy-java 1.0.4.1.

Here is my code:

package jinvestor.jhouse.mr; 

import java.io.ByteArrayOutputStream; 
import java.io.IOException; 
import java.io.OutputStream; 
import java.util.Arrays; 
import java.util.List; 

import jinvestor.jhouse.core.House; 
import jinvestor.jhouse.core.util.HouseAvroUtil; 
import jinvestor.jhouse.download.HBaseHouseDAO; 

import org.apache.commons.io.IOUtils; 
import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.fs.FileSystem; 
import org.apache.hadoop.fs.LocatedFileStatus; 
import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.fs.RemoteIterator; 
import org.apache.hadoop.hbase.client.Result; 
import org.apache.hadoop.hbase.client.Scan; 
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; 
import org.apache.hadoop.hbase.mapreduce.TableMapper; 
import org.apache.hadoop.hbase.util.Bytes; 
import org.apache.hadoop.io.LongWritable; 
import org.apache.hadoop.io.SequenceFile; 
import org.apache.hadoop.io.compress.CompressionCodec; 
import org.apache.hadoop.io.compress.SnappyCodec; 
import org.apache.hadoop.mapred.FileOutputFormat; 
import org.apache.hadoop.mapred.JobConf; 
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 
import org.apache.hadoop.mapreduce.Job; 
import org.apache.mahout.math.DenseVector; 
import org.apache.mahout.math.NamedVector; 
import org.apache.mahout.math.VectorWritable; 

/** 
* Produces mahout vectors from House entries in HBase. 
* 
* @author Michael Scott Knapp 
* 
*/ 
public class HouseVectorizer { 

    private final Configuration configuration; 
    private final House minimumHouse; 
    private final House maximumHouse; 

    public HouseVectorizer(final Configuration configuration, 
      final House minimumHouse, final House maximumHouse) { 
     this.configuration = configuration; 
     this.minimumHouse = minimumHouse; 
     this.maximumHouse = maximumHouse; 
    } 

    public void vectorize() throws IOException, ClassNotFoundException, InterruptedException { 
     JobConf jobConf = new JobConf(); 
     jobConf.setMapOutputKeyClass(LongWritable.class); 
     jobConf.setMapOutputValueClass(VectorWritable.class); 

     // we want the vectors written straight to HDFS, 
     // the order does not matter. 
     jobConf.setNumReduceTasks(0); 

     Path outputDir = new Path("/home/cloudera/house_vectors"); 
     FileSystem fs = FileSystem.get(configuration); 
     if (fs.exists(outputDir)) { 
      fs.delete(outputDir, true); 
     } 

     FileOutputFormat.setOutputPath(jobConf, outputDir); 

     // I want the mappers to know the max and min value 
     // so they can normalize the data. 
     // I will add them as properties in the configuration, 
     // by serializing them with avro. 
     String minmax = HouseAvroUtil.toBase64String(Arrays.asList(minimumHouse, 
       maximumHouse)); 
     jobConf.set("minmax", minmax); 

     Job job = Job.getInstance(jobConf); 
     Scan scan = new Scan(); 
     scan.addFamily(Bytes.toBytes("data")); 
     TableMapReduceUtil.initTableMapperJob("homes", scan, 
       HouseVectorizingMapper.class, LongWritable.class, 
       VectorWritable.class, job); 
     job.setOutputFormatClass(SequenceFileOutputFormat.class); 
     job.setOutputKeyClass(LongWritable.class); 
     job.setOutputValueClass(VectorWritable.class); 
     job.setMapOutputKeyClass(LongWritable.class); 
     job.setMapOutputValueClass(VectorWritable.class); 

     SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 
     SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); 
     SequenceFileOutputFormat.setOutputPath(job, outputDir); 
     job.getConfiguration().setClass("mapreduce.map.output.compress.codec", 
       SnappyCodec.class, 
       CompressionCodec.class); 

     job.waitForCompletion(true); 
    }
}

When I run it, I get this:

java.lang.Exception: java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy()Z 
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:401) 
Caused by: java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy()Z 
    at org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy(Native Method) 
    at org.apache.hadoop.io.compress.SnappyCodec.checkNativeCodeLoaded(SnappyCodec.java:62) 
    at org.apache.hadoop.io.compress.SnappyCodec.getCompressorType(SnappyCodec.java:127) 
    at org.apache.hadoop.io.compress.CodecPool.getCompressor(CodecPool.java:104) 
    at org.apache.hadoop.io.compress.CodecPool.getCompressor(CodecPool.java:118) 
    at org.apache.hadoop.io.SequenceFile$Writer.init(SequenceFile.java:1169) 
    at org.apache.hadoop.io.SequenceFile$Writer.<init>(SequenceFile.java:1080) 
    at org.apache.hadoop.io.SequenceFile$BlockCompressWriter.<init>(SequenceFile.java:1400) 
    at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:274) 
    at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:527) 
    at org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.getSequenceWriter(SequenceFileOutputFormat.java:64) 
    at org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.getRecordWriter(SequenceFileOutputFormat.java:75) 
    at org.apache.hadoop.mapred.MapTask$NewDirectOutputCollector.<init>(MapTask.java:617) 
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:737) 
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:338) 
    at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:233) 
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) 
    at java.util.concurrent.FutureTask.run(FutureTask.java:262) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) 
    at java.lang.Thread.run(Thread.java:744) 

If I comment out these lines, then my test passes:

SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 
SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); 
job.getConfiguration().setClass("mapreduce.map.output.compress.codec", 
  SnappyCodec.class, 
  CompressionCodec.class); 

However, I really do want to use snappy compression in my sequence files. Can someone explain to me what I am doing wrong?

+0

How did you install LZO, and how are you running the job? – Chiron

+0

I am not using LZO compression afaik, just snappy. I am running this job from a unit test. – msknapp

+0

Yes, my mistake. However, you need to set the java.library.path property, for example: -Djava.library.path=/lib/hadoop/native – Chiron
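A quick way to confirm that the flag actually reaches the unit-test JVM is a minimal sketch along these lines (the class name is hypothetical; it only assumes Hadoop 2.x on the classpath). It prints the effective library path and whether libhadoop was loaded:

import org.apache.hadoop.util.NativeCodeLoader;

// Illustrative helper, not part of the original post: run it (or paste the body into
// the unit test) to see what the JVM was actually started with.
public class LibraryPathCheck {
    public static void main(String[] args) {
        // The directories the JVM searches for native libraries such as libhadoop.so and libsnappy.so.
        System.out.println("java.library.path = " + System.getProperty("java.library.path"));
        // True only if libhadoop.so was found on that path and loaded successfully.
        System.out.println("libhadoop loaded  = " + NativeCodeLoader.isNativeCodeLoaded());
    }
}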

Answers

1

My problem was that my JRE did not contain the appropriate native libraries. This may or may not be because I switched from the JDK in Cloudera's pre-built VM to JDK 1.7. The snappy .so files are in the hadoop/lib/native directory, and the JRE needs them. Adding them to the classpath did not seem to solve my problem. I solved it like this:

$ cd /usr/lib/hadoop/lib/native 
$ sudo cp *.so /usr/java/latest/jre/lib/amd64/ 

Then I was able to use the SnappyCodec class. Your paths may be different, though.

That seems to have gotten me to my next problem:

Caused by: java.lang.RuntimeException: native snappy library not available: SnappyCompressor has not been loaded.

Still trying to solve that.
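For anyone hitting the same chain of errors, a small check like the following (an illustrative sketch, assuming Hadoop 2.x; the class name is made up) reproduces both failures in a plain JVM, which makes it much faster to test java.library.path / LD_LIBRARY_PATH changes than rerunning the whole job:

import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.NativeCodeLoader;

// Sketch only: reproduces the two failures above (UnsatisfiedLinkError when libhadoop
// is missing, RuntimeException when libsnappy is missing) outside of MapReduce.
public class SnappyNativeCheck {
    public static void main(String[] args) {
        // False means libhadoop.so itself was not found; the next call would then throw
        // the UnsatisfiedLinkError from the question.
        System.out.println("libhadoop loaded  = " + NativeCodeLoader.isNativeCodeLoaded());
        System.out.println("built with snappy = " + NativeCodeLoader.buildSupportsSnappy());
        // Throws "native snappy library not available: SnappyCompressor has not been loaded."
        // if libsnappy.so cannot be loaded.
        SnappyCodec.checkNativeCodeLoaded();
        System.out.println("snappy native codec OK");
    }
}

Running it with -Djava.library.path=/usr/lib/hadoop/lib/native (or your equivalent path) should confirm a fix before touching the MapReduce job.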

+0

Did you ever succeed? How did you solve it? –

+2

Copying those files will cause problems after you upgrade your CDH version. You need to copy them again with every CDH upgrade, and trust me, you will forget that you copied those files. The correct way is to use LD_LIBRARY_PATH! You need to make sure it has the correct value on your gateway instances. On CDH it is possible that you have overridden it; the default value there is usually fine. When executing this remotely you can use java -cp ... and then set -Djava.library.path. – Niko

0

I think you need all the files, not just the *.so ones. Ideally you would include the folder in the path instead of copying the libraries from there. The MapReduce service needs to be restarted afterwards so that the new libraries are picked up and can be used.

Niko

1

Check your core-site.xml and mapred-site.xml; they should contain the correct properties with the path to the native library folder.

core-site.xml

<property>
  <name>io.compression.codecs</name>
  <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>

mapred-site.xml

<property>
  <name>mapreduce.map.output.compress</name>
  <value>true</value>
</property>

<property>
  <name>mapred.map.output.compress.codec</name>
  <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>

<property>
  <name>mapreduce.admin.user.env</name>
  <value>LD_LIBRARY_PATH=/usr/hdp/2.2.0.0-1084/hadoop/lib/native</value>
</property>

LD_LIBRARY_PATH - must contain the path to libsnappy.so.
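If editing the cluster XML is not an option, roughly the same settings can be applied per job from the driver. Below is a minimal, untested sketch for a Hadoop 2.x driver (it uses the newer mapreduce.map.output.compress.codec name; the mapred.map.output.compress.codec property above is the older alias). The native libsnappy.so still has to be visible to the task JVMs, or the same UnsatisfiedLinkError comes back:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Sketch only: per-job equivalent of the XML properties above.
public class SnappyJobConfigSketch {
    public static Job configure(Configuration conf) throws Exception {
        // Compress intermediate map output with Snappy.
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec",
                SnappyCodec.class, CompressionCodec.class);
        Job job = Job.getInstance(conf, "snappy-sequence-file-job");
        // Write the final output as a block-compressed, Snappy-coded sequence file.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
        return job;
    }
}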

5

Found the following information from the Cloudera Communities:

  1. Make sure LD_LIBRARY_PATH and JAVA_LIBRARY_PATH contain the native directory path that has the libsnappy.so* files.
  2. Make sure LD_LIBRARY_PATH and JAVA_LIBRARY_PATH are exported in the SPARK environment (spark-env.sh).

For example, I use Hortonworks HDP and I have the following configured in my spark-env.sh:

export JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:/usr/hdp/2.2.0.0-2041/hadoop/lib/native 
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/hdp/2.2.0.0-2041/hadoop/lib/native 
export SPARK_YARN_USER_ENV="JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH,LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 

0

After removing hadoop.dll (which I had copied there manually) from Windows\System32 and setting HADOOP_HOME=\hadoop-2.6.4, IT WORKS!!!
