
Apache Spark DataFrames join fails using Scala

I have the following DataFrames and a join operation between them, but the join fails without reporting any actual error.

//HospitalFacility class to fill in.
case class HospitalFacility(Name: String, Rating: Int, Cost: Int)

//I pass the pid as an input parameter.
//hc: HiveContext, successfully created.
//Provider_Facility & Facility_Master are my two Hive tables.
def fetchHospitalFacilityData(pid: String): String = {
  val filteredProviderSpecilaityDF = hc.sql(
    "select FacilityId, Rating, Cost from Provider_Facility where ProviderId='" + pid + "'")
  println(filteredProviderSpecilaityDF)
  filteredProviderSpecilaityDF.foreach(println) //Prints perfectly.

  val allFacilityDF = hc.sql("select id, Name from Facility_Master")
  println(allFacilityDF)
  allFacilityDF.foreach(println) //Prints perfectly.

  //The line below throws the error.
  val resultDF = filteredProviderSpecilaityDF.join(
    allFacilityDF,
    filteredProviderSpecilaityDF("FacilityId") === allFacilityDF("id"),
    "right_outer")
  println(resultDF)

  //Joined schema order: FacilityId (0), Rating (1), Cost (2), id (3), Name (4).
  val filteredFacilityList = resultDF.rdd.map { spec =>
    HospitalFacility(spec.getString(4), spec.getInt(1), spec.getInt(2))
  }.collect()
  filteredFacilityList.foreach(println) //Does not reach this point.
  filteredFacilityList.mkString(",")
}

The join throws the error listed below:

 
Exception in thread "broadcast-hash-join-0" java.lang.NoSuchMethodError: org.apache.spark.util.Utils$.tryOrIOException(Lscala/Function0;)V 
    at org.apache.spark.sql.execution.joins.UnsafeHashedRelation.writeExternal(HashedRelation.scala:264) 
    at java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1458) 
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1429) 
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) 
    at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347) 
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:44) 
    at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:203) 
    at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:102) 
    at org.apache.spark.broadcast.TorrentBroadcast.&lt;init&gt;(TorrentBroadcast.scala:85) 
    at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34) 
    at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:63) 
    at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1326) 
    at org.apache.spark.sql.execution.joins.BroadcastHashOuterJoin$$anonfun$broadcastFuture$1$$anonfun$apply$1.apply(BroadcastHashOuterJoin.scala:94) 
    at org.apache.spark.sql.execution.joins.BroadcastHashOuterJoin$$anonfun$broadcastFuture$1$$anonfun$apply$1.apply(BroadcastHashOuterJoin.scala:82) 
    at org.apache.spark.sql.execution.SQLExecution$.withExecutionId(SQLExecution.scala:100) 
    at org.apache.spark.sql.execution.joins.BroadcastHashOuterJoin$$anonfun$broadcastFuture$1.apply(BroadcastHashOuterJoin.scala:82) 
    at org.apache.spark.sql.execution.joins.BroadcastHashOuterJoin$$anonfun$broadcastFuture$1.apply(BroadcastHashOuterJoin.scala:82) 
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24) 
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) 
    at java.lang.Thread.run(Thread.java:745) 

Can anyone please help me?


Check your Scala/Spark library and cluster versions! You seem to have a mismatch – eliasah
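
For context: a NoSuchMethodError on Utils.tryOrIOException is the classic symptom of compiling against one Spark/Scala version and running against another, because that method's signature changed between Spark releases. A minimal build.sbt sketch of matched versions; the version numbers below are assumptions and must be replaced with whatever the cluster actually runs:

name := "hospital-facility-join"

//Must match the Scala major version your Spark distribution was built with.
scalaVersion := "2.10.5"

libraryDependencies ++= Seq(
  //"provided": compile against these APIs, but defer to the cluster's own
  //jars at runtime, so compile-time and runtime versions cannot diverge.
  "org.apache.spark" %% "spark-core" % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-sql"  % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-hive" % "1.5.2" % "provided"
)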


It works from the spark-shell, but not from the Scala program –


Can you create a minimal, reproducible and verifiable example (build files, code, Spark version, how you run it, etc.)? We won't be able to help you otherwise – eliasah

Answer


Maybe allFacilityDF("id") === filteredProviderSpecilaityDF("FacilityId") returns a Boolean Column rather than a Seq[String]. The usingColumns parameter is defined as: the names of the columns to join on. These columns must exist on both sides.
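
If you go through the usingColumns overload the answer quotes, the join key must carry the same name on both sides, so you would rename one column first. A minimal sketch of that variant, assuming your Spark version has the Seq[String]-plus-join-type form of join (it only exists in later releases):

//Rename Facility_Master's "id" so both sides share the join column name,
//then join on the column name instead of on a Column expression.
val facilityRenamedDF = allFacilityDF.withColumnRenamed("id", "FacilityId")
val resultDF = filteredProviderSpecilaityDF
  .join(facilityRenamedDF, Seq("FacilityId"), "right_outer")

Note that with usingColumns the join key appears only once in the output, so any downstream column indices shift accordingly.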


I just ran it in my spark-shell and it works fine, but when I run it through a servlet it throws the error mentioned above. Is there anything that could go wrong when it is called from a Scala program? –
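
One quick way to confirm a mismatch is to log the versions each environment actually loads; this sketch uses only standard identifiers and can be dropped into both the spark-shell and the servlet:

//Print the Spark and Scala versions visible at runtime, plus where the
//Spark classes were loaded from. If the two environments print different
//values, the NoSuchMethodError is a classpath/version mismatch.
println("Spark version: " + org.apache.spark.SPARK_VERSION)
println("Scala version: " + scala.util.Properties.versionString)
println("Spark jar: " + classOf[org.apache.spark.SparkContext]
  .getProtectionDomain.getCodeSource.getLocation)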