I'm trying to run K-means with Spark on an EC2 instance against a sample document that is only 22MB, and I'm getting a Java heap space error. Any ideas? It fails on the clusters line.
Both the sample data and the code are in my GitHub.
# run in the IPython Spark shell: IPYTHON=1 pyspark
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
import json
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext(sc)

# load the sample data
sample = sqlContext.read.json("/home/ubuntu/yelp_project/sample.json")
sample.registerTempTable("sample")

# build one Row per record: a name plus all of its review text joined into a single string
reviews = sample.map(lambda x: Row(name=x[1], reviews=' '.join((a[3] for a in x[0]))))

# hash each concatenated review string into a term-frequency vector
hashingTF = HashingTF()
tf = hashingTF.transform(reviews.map(lambda x: x.reviews))

# this is the line that fails with the Java heap space error
clusters = KMeans.train(tf, 2, maxIterations=10, runs=10, initializationMode="random")
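
For reference, here is a rough sketch of the usual MLlib within-set sum of squared errors (WSSSE) check that would follow the training call once it succeeds (this is what the sqrt import is for). The error helper below is illustrative and not part of the failing script; the vectors produced by HashingTF are sparse, so they are densified before subtraction.

# hypothetical follow-up step: evaluate the clustering by its within-set sum of squared errors
def error(point):
    # distance from a TF vector to the center of its assigned cluster
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum((point.toArray() - center) ** 2))

WSSSE = tf.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))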