0
我想使用 Spark 和 Scala 來讀取 HBase 中的數據,但是遇到了一個我無法解決的錯誤:找不到 HBase 的 TimeoutIOException 類(NoClassDefFoundError)。
代碼
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.HConstants
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
/**
 * Small Spark driver that reads the HBase table `posts` and prints the first
 * five row keys, one per line.
 */
object HBase {

  /**
   * Entry point.
   *
   * Connects to ZooKeeper on `localhost`, checks that the table is available,
   * and if so scans it through `TableInputFormat` and prints up to five row
   * keys. Prints "Table doesn't exist" when the table is not available.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val tableName = "posts"
    val sc = new SparkContext(
      new SparkConf().setAppName("HBaseReadWrite").setMaster("local[4]"))

    try {
      val conf = HBaseConfiguration.create()
      conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
      conf.set(TableInputFormat.INPUT_TABLE, tableName)

      // Pass the table name itself. The previous code passed
      // conf.get(tableName), i.e. the value stored under the key "posts",
      // which this program never sets.
      val admin = new HBaseAdmin(conf)
      val tableAvailable =
        try admin.isTableAvailable(tableName)
        finally admin.close() // release the admin connection in every case

      if (!tableAvailable) {
        println("Table doesn't exist")
      } else {
        val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
          classOf[ImmutableBytesWritable], classOf[Result])

        // take(5) on the RDD fetches only the rows that are needed instead of
        // collecting the whole table to the driver first.
        val rowKeys = hBaseRDD
          .map { case (_, result) => Bytes.toString(result.getRow) }
          .take(5)

        println(rowKeys.mkString("\n"))
      }
    } finally {
      // Always shut the context down, including on the "table missing" path.
      sc.stop()
    }
  }
}
build.sbt
// sbt build definition for the NLPAnnotationController project.

name := "NLPAnnotationController"

version := "1.0"

organization := "com.scryAnalytics"

scalaVersion := "2.10.5"

resolvers ++= Seq(
  "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
  "sonatype snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/"
)

// Version shared by the hbase-client, hbase-common and hbase-server artifacts.
val hbaseVersion = "0.98.19-hadoop2"

// Servlet / Jetty artifacts that are excluded from each of those HBase modules.
val servletExclusions = Seq(
  ExclusionRule(organization = "javax.servlet", name = "javax.servlet-api"),
  ExclusionRule(organization = "org.mortbay.jetty", name = "jetty"),
  ExclusionRule(organization = "org.mortbay.jetty", name = "servlet-api-2.5")
)

// Builds the coordinates of one org.apache.hbase module with the exclusions applied.
def hbaseModule(artifact: String) =
  ("org.apache.hbase" % artifact % hbaseVersion).excludeAll(servletExclusions: _*)

// NOTE(review): hbase-spark is pinned to 1.2.0-cdh5.7.2 while the other HBase
// modules use 0.98.19-hadoop2. Mixing HBase release lines on one classpath may
// be what produces NoClassDefFoundError at runtime - confirm and align versions.
libraryDependencies ++= Seq(
  "org.apache.spark" % "spark-core_2.10" % "1.2.0",
  "org.apache.hbase" % "hbase-spark" % "1.2.0-cdh5.7.2",
  hbaseModule("hbase-client"),
  hbaseModule("hbase-common"),
  hbaseModule("hbase-server"),
  "org.scala-lang" % "scala-library" % "2.10.5",
  "it.nerdammer.bigdata" % "spark-hbase-connector_2.10" % "1.0.3"
)
錯誤
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/exceptions/TimeoutIOException
at HBase$.main(HBase.scala:20)
at HBase.main(HBase.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.exceptions.TimeoutIOException
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
我嘗試過更改各個依賴的版本,但仍然沒有任何進展。如能提供任何幫助,將不勝感激。提前致謝。
我猜你正在使用分佈式系統,並且你的本地程序正在與遠程系統(數據庫或 Hadoop)進行通信。Hadoop 拋出了一個異常,並將其序列化後通過網絡協議返回給你。在你這一端,應用程序想要把這條錯誤消息反序列化成一個 'org/apache/hadoop/hbase/exceptions/TimeoutIOException' 類型的異常對象。爲此,它需要在類路徑中找到該類,但是沒有找到。你的本地類路徑中有包含該類的庫嗎?如果沒有,請添加它。 – radumanolescu
你在談論哪些庫?我在構建文件中有hbase依賴關係。 – wadhwasahil
@wadhwasahil:你可以使用 `conf.setInt("timeout", 120000)` 來設置超時時間。你也需要檢查是否擁有所有的依賴;運行作業時,可以在 spark-submit 命令中使用 --jars 傳入 HBase 依賴。 – Shankar