我在YARN上的Spark中運行以下使用scipy餘弦相似度的udf。我首先用30條樣本觀測數據進行了測試,它運行良好,並在5秒內生成了餘弦相似度矩陣。(問題標題:Spark java.lang.NoSuchMethodError)
這裏是代碼:
def cosineSimilarity(df):
    """Compute the cosine similarity of each document with every other one.

    Args:
        df: Spark DataFrame expected to hold an ``id`` column and a
            ``w2v_vector`` column (these are the columns renamed below).

    Returns:
        The self-join of ``df`` with the columns suffixed ``_1`` / ``_2``
        for the two sides, plus a ``cosinesim`` double column. The value
        is null when either vector is null.
    """
    from pyspark.sql.functions import udf
    from pyspark.sql.types import DoubleType
    from scipy.spatial import distance

    def _similarity(v1, v2):
        # A missing vector on either side yields a null similarity.
        if v1 is None or v2 is None:
            return None
        # scipy returns the cosine *distance*; similarity is 1 - distance.
        return float(1 - distance.cosine(v1, v2))

    cosine = udf(_similarity, DoubleType())

    # Rename the columns on each side so they stay distinguishable
    # after the self-join.
    left = (df.withColumnRenamed('id', 'id_1')
              .withColumnRenamed('w2v_vector', 'w2v_vector_1'))
    right = (df.withColumnRenamed('id', 'id_2')
               .withColumnRenamed('w2v_vector', 'w2v_vector_2'))

    # Join with no condition: the cross product of the table with itself,
    # so the output has n * n rows for an n-row input.
    # NOTE(review): some Spark 2.x setups reject an implicit cartesian
    # product unless spark.sql.crossJoin.enabled is set -- confirm against
    # the cluster's Spark version.
    crosstabDF = left.join(right)

    similardocs_df = crosstabDF.withColumn(
        'cosinesim', cosine("w2v_vector_1", "w2v_vector_2"))
    return similardocs_df
# Score the sample frame; to score w2vdf instead (presumably the full data
# set), pass w2vdf.select('id', 'w2v_vector') here.
similardocs_df = cosineSimilarity(w2vdf_sample.select('id', 'w2v_vector'))
然後我嘗試對完整的數據(58K條記錄)運行,它運行了一段時間,然後給出了下面的錯誤:
我想值得一提的是,有一次它在5分鐘內就對整個數據運行完成了。但是現在,樣本數據運行時沒有任何問題,而對整個數據運行時卻給了我這個錯誤。
WARN org.spark_project.jetty.servlet.ServletHandler (ServletHandler.java:doHandle(667)) - Error for /jobs/
java.lang.NoSuchMethodError: javax.servlet.http.HttpServletRequest.getDispatcherType()Ljavax/servlet/DispatcherType;
at org.spark_project.jetty.servlets.gzip.AbstractCompressedStream.doCompress(AbstractCompressedStream.java:248)
at org.spark_project.jetty.servlets.gzip.AbstractCompressedStream.checkOut(AbstractCompressedStream.java:354)
at org.spark_project.jetty.servlets.gzip.AbstractCompressedStream.write(AbstractCompressedStream.java:229)
at sun.nio.cs.StreamEncoder.writeBytes(StreamEncoder.java:221)
at sun.nio.cs.StreamEncoder.implWrite(StreamEncoder.java:282)
at sun.nio.cs.StreamEncoder.write(StreamEncoder.java:125)
at sun.nio.cs.StreamEncoder.write(StreamEncoder.java:135)
at java.io.OutputStreamWriter.write(OutputStreamWriter.java:220)
at java.io.PrintWriter.write(PrintWriter.java:456)
at java.io.PrintWriter.write(PrintWriter.java:473)
at java.io.PrintWriter.print(PrintWriter.java:603)
at org.apache.spark.ui.JettyUtils$$anon$2.doGet(JettyUtils.scala:86)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:707)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:820)
at org.spark_project.jetty.servlet.ServletHolder.handle(ServletHolder.java:812)
at org.spark_project.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1669)
at org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.doFilter(AmIpFilter.java:164)
at org.spark_project.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
at org.spark_project.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
at org.spark_project.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
at org.spark_project.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
at org.spark_project.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
at org.spark_project.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at org.spark_project.jetty.servlets.gzip.GzipHandler.handle(GzipHandler.java:479)
at org.spark_project.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
at org.spark_project.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
at org.spark_project.jetty.server.Server.handle(Server.java:499)
at org.spark_project.jetty.server.HttpChannel.handle(HttpChannel.java:311)
at org.spark_project.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
at org.spark_project.jetty.io.AbstractConnection$2.run(AbstractConnection.java:544)
at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
at java.lang.Thread.run(Thread.java:744)
2017-02-23 21:01:48,024 WARN org.spark_project.jetty.server.HttpChannel (HttpChannel.java:handle(384)) - /jobs/
請嘗試清除緩存? – Hng
嗯。我可以嘗試。但是,要如何清除YARN上緩存的所有數據幀?我知道我可以通過df.unpersist()清除特定的數據幀。但是如果我想知道有哪些緩存的對象,並且一次從YARN集羣中全部清除,有什麼辦法嗎? – Baktaawar
@Hng,我這樣做了: SqlContext.clearCache()。它應該會清除所有緩存的變量,但似乎仍然不起作用。 我在谷歌上搜索這個錯誤,發現當兩個版本的庫不一致時(Spark自帶一個版本等)就會出現這個問題,所以shading(重定位依賴包)是一種解決方法。但我的問題是,如果這是一個庫版本問題,爲什麼它在樣本數據上能正常運行? – Baktaawar