Poor weak scaling of an Apache Spark join operation

I am running a join operation on Apache Spark and seeing no weak scaling. I would be grateful if someone could explain this. I create two data frames ("a", "b") and ("a", "c") and join them on the first column. I generate the values so that the join is one-to-one. I also repartition both data frames with the same partitioner to avoid a shuffle during the join.
The number of rows in each data frame is 1024 * 1024 * 16 * cores_total (cores_total is the total number of cores the application is launched on). Column "a" consists of random Int values, all values in column "b" equal 1, and all values in column "c" equal 2.
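To make the sizes concrete, here is a small sketch that computes the row count per data frame at a few sample scale points (the specific core counts 1, 4, 16 and 64 are my illustration, not taken from the measurements):

// Row counts per DataFrame in a weak-scaling run: the data volume
// grows linearly with the number of cores (16M rows per core).
object WeakScalingSizes {
  def main(args: Array[String]): Unit = {
    val rowsPerCore = 1024L * 1024 * 16
    for (coresTotal <- Seq(1, 4, 16, 64)) {
      println(s"cores = $coresTotal -> rows per DataFrame = ${rowsPerCore * coresTotal}")
    }
  }
}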
In theory, since the data size and the number of cores grow together by up to 64x, the execution time should stay constant, but the execution time increases slightly. I get the following execution times:
Apache Spark version: 2.1.0. We use 8 cluster nodes connected by 1 Gbit Ethernet; each node has 2 Intel Xeon E5-2630 CPUs and 64 GB of RAM.
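For reference, the "should stay constant" expectation can be quantified as weak-scaling efficiency: with the per-core workload fixed, E(N) = T(1) / T(N) should stay close to 1. A minimal helper (the function and the sample numbers are hypothetical):

// Weak-scaling efficiency: T(1) is the time for 1 core's worth of data on
// 1 core, T(N) for N cores' worth of data on N cores. Ideal is 1.0.
def weakScalingEfficiency(t1Sec: Double, tNSec: Double): Double = t1Sec / tNSec

// Example: if 1 core takes 100 s and 64 cores take 125 s on 64x the data,
// the efficiency is 0.8, i.e. a 25% slowdown at scale.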
/* join perf */
import scala.util.Random
import org.apache.spark.{SparkConf, SparkContext}

object joinPerf {
  // Generate n random Ints; runs on the executors inside flatMap below.
  def get_array(n: Int): Array[Int] = {
    Array.fill(n)(Random.nextInt)
  }

  def main(args: Array[String]): Unit = {
    val start_time = System.nanoTime
    val conf = new SparkConf().setAppName("joinPerf")
    val sc = new SparkContext(conf)
    val cores_total = sc.getConf.get("spark.cores.max").toInt
    val partitions_total = sc.getConf.get("spark.default.parallelism").toInt
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    println("start")
    // Weak scaling: the data volume grows linearly with the core count.
    val elems_total = 1024 * 1024 * 16 * cores_total
    val start_cnt = 1024 * 1024
    Random.setSeed(785354)
    // Seed values are generated on the driver, then expanded in parallel.
    val vals = Vector.fill(start_cnt)(Random.nextInt)
    var test_rdd = sc.parallelize(vals)
    println(test_rdd.count)
    // Expand to elems_total values on the executors; distinct makes the keys
    // unique so the join is one-to-one, but it also triggers its own shuffle.
    test_rdd = test_rdd.flatMap(x => get_array(elems_total / start_cnt)).distinct
    println("test_rdd count = " + test_rdd.count)
    println("partitions count = " + test_rdd.getNumPartitions)
    // Both DataFrames are hash-partitioned on the join key "a" with the same
    // number of partitions, so the join itself should not need a shuffle.
    val test_rdd1 = test_rdd.map(x => (x, 1)).toDF("a", "b").repartition(partitions_total, $"a").cache
    val test_rdd2 = test_rdd.map(x => (x, 2)).toDF("a", "c").repartition(partitions_total, $"a").cache
    println("test_rdd1 count = " + test_rdd1.count)
    println("test_rdd2 count = " + test_rdd2.count)
    // Time only the join; count forces the lazy join to execute.
    val start_test_time = System.nanoTime
    val test_res = test_rdd1.join(test_rdd2, test_rdd1("a") === test_rdd2("a"))
    println(test_res.count)
    print("join time = ")
    println((System.nanoTime - start_test_time) / 1e9d + " sec. ")
    print("all time = ")
    println((System.nanoTime - start_time) / 1e9d + " sec. ")
    sc.stop()
  }
}
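To check the "same partitioner, no shuffle" assumption, one can inspect the physical plan before triggering the join; df.explain and rdd.getNumPartitions are standard Spark APIs, and the lines below are a sketch meant to sit inside main just before test_res.count:

// If the repartition-by-"a" on both sides is reused, the physical plan
// should show no Exchange (shuffle) directly above the join inputs.
test_res.explain(true)
// Both cached inputs should still carry the requested partition count.
println("test_rdd1 partitions = " + test_rdd1.rdd.getNumPartitions)
println("test_rdd2 partitions = " + test_rdd2.rdd.getNumPartitions)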
Configuration parameters:
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.kryoserializer.buffer.max 1024
spark.kryo.unsafe true
spark.kryo.referenceTracking false
spark.driver.memory 22g
spark.executor.memory 22g
spark.driver.maxResultSize 22g
spark.rpc.message.maxSize 2047
spark.memory.fraction 0.8
spark.memory.storageFraction 0.5
spark.executor.extraJavaOptions "-XX:+UseParallelGC"
Partitions per core: 4.
Example launch command:
./bin/spark-submit --class "joinPerf" --conf spark.executor.cores=8 --conf spark.cores.max=64 --conf spark.default.parallelism=256 ./joinPerf.jar
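For the smaller scale points, the same command can presumably be scaled down while keeping the 4-partitions-per-core ratio; these variants are my illustration, not taken from the original runs:

./bin/spark-submit --class "joinPerf" --conf spark.executor.cores=8 --conf spark.cores.max=8 --conf spark.default.parallelism=32 ./joinPerf.jar
./bin/spark-submit --class "joinPerf" --conf spark.executor.cores=8 --conf spark.cores.max=16 --conf spark.default.parallelism=64 ./joinPerf.jar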