4

我查看了文檔,它說支持以下連接類型:Spark中的各種連接類型是什麼?

要執行的連接類型。默認為 inner。必須是以下之一:inner、cross、outer、full、full_outer、left、left_outer、right、right_outer、left_semi、left_anti。

我查看了StackOverflow上關於SQL連接的回答(StackOverflow answer),頂部的一些答案沒有提到上面的某些連接類型,例如 left_semi 和 left_anti。它們在Spark中是什麼意思?

回答

7

下面是一個簡單的實驗說明:

import org.apache.spark._ 
import org.apache.spark.sql._ 
import org.apache.spark.sql.expressions._ 
import org.apache.spark.sql.functions._ 

/**
 * Runnable experiment: joins two small datasets on the `id` column once per
 * join-type name and prints every result, so the join types can be compared
 * side by side.
 */
object SparkSandbox {

  // NOTE: inside this object the name `Row` refers to this case class and
  // shadows the `Row` brought in by `import org.apache.spark.sql._`.
  case class Row(id: Int, value: String)

  /**
   * Entry point. An explicit `main` replaces `extends App`: Spark's
   * documentation advises defining `main()` because subclasses of `scala.App`
   * may not work correctly.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._
    spark.sparkContext.setLogLevel("ERROR")

    try {
      val r1 = Seq(Row(1, "A1"), Row(2, "A2"), Row(3, "A3"), Row(4, "A4")).toDS()
      val r2 = Seq(Row(3, "A3"), Row(4, "A4"), Row(4, "A4_1"), Row(5, "A5"), Row(6, "A6")).toDS()

      val joinTypes = Seq(
        "inner",
        "outer", "full", "full_outer",
        "left", "left_outer",
        "right", "right_outer",
        "left_semi", "left_anti")

      joinTypes.foreach { joinType =>
        // Locale.ROOT keeps the heading identical regardless of the JVM's default locale.
        println(s"${joinType.toUpperCase(java.util.Locale.ROOT)} JOIN")
        // Sorted by `id` only, so rows sharing an id may appear in either order.
        r1.join(right = r2, usingColumns = Seq("id"), joinType = joinType).orderBy("id").show()
      }
    } finally {
      // Release the local Spark session even if one of the joins fails.
      spark.stop()
    }
  }
}

輸出

INNER JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
+---+-----+-----+ 

OUTER JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 1| A1| null| 
| 2| A2| null| 
| 3| A3| A3| 
| 4| A4| A4| 
| 4| A4| A4_1| 
| 5| null| A5| 
| 6| null| A6| 
+---+-----+-----+ 

FULL JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 1| A1| null| 
| 2| A2| null| 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
| 5| null| A5| 
| 6| null| A6| 
+---+-----+-----+ 

FULL_OUTER JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 1| A1| null| 
| 2| A2| null| 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
| 5| null| A5| 
| 6| null| A6| 
+---+-----+-----+ 

LEFT JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 1| A1| null| 
| 2| A2| null| 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
+---+-----+-----+ 

LEFT_OUTER JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 1| A1| null| 
| 2| A2| null| 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
+---+-----+-----+ 

RIGHT JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 3| A3| A3| 
| 4| A4| A4| 
| 4| A4| A4_1| 
| 5| null| A5| 
| 6| null| A6| 
+---+-----+-----+ 

RIGHT_OUTER JOIN 
+---+-----+-----+ 
| id|value|value| 
+---+-----+-----+ 
| 3| A3| A3| 
| 4| A4| A4_1| 
| 4| A4| A4| 
| 5| null| A5| 
| 6| null| A6| 
+---+-----+-----+ 

LEFT_SEMI JOIN 
+---+-----+ 
| id|value| 
+---+-----+ 
| 3| A3| 
| 4| A4| 
+---+-----+ 

LEFT_ANTI JOIN 
+---+-----+ 
| id|value| 
+---+-----+ 
| 1| A1| 
| 2| A2| 
+---+-----+ 
0

Spark SQL 支持多種類型的連接,如下所列。更多細節可參考 GitHub 上的這個 link,編碼示例見 Reference。

加入

1) JOIN 
2) {LEFT|RIGHT|FULL} OUTER JOIN 
3) LEFT SEMI JOIN 
4) CROSS JOIN 

例如:

package org.apache.spark.sql.catalyst.plans 

import java.util.Locale 

import org.apache.spark.sql.catalyst.expressions.Attribute 

object JoinType {

  /**
   * Resolves a join-type name to its [[JoinType]] value.
   *
   * The name is lower-cased with `Locale.ROOT` and has every underscore
   * removed before matching, so `"LEFT_OUTER"` and `"leftouter"` resolve to
   * the same value.
   *
   * @param typ the join-type name supplied by the caller
   * @return the join type selected by the normalized name
   * @throws IllegalArgumentException if the normalized name matches no case
   */
  def apply(typ: String): JoinType = {
    val normalized = typ.toLowerCase(Locale.ROOT).replace("_", "")
    normalized match {
      case "inner" => Inner
      case "outer" | "full" | "fullouter" => FullOuter
      case "left" | "leftouter" => LeftOuter
      case "right" | "rightouter" => RightOuter
      case "leftsemi" => LeftSemi
      case "leftanti" => LeftAnti
      case "cross" => Cross
      case _ => throw new IllegalArgumentException(unsupportedMessage(typ))
    }
  }

  /** Builds the error text for an unrecognized `typ`, listing every accepted spelling. */
  private def unsupportedMessage(typ: String): String = {
    val supported = Seq(
      "inner",
      "outer", "full", "fullouter", "full_outer",
      "leftouter", "left", "left_outer",
      "rightouter", "right", "right_outer",
      "leftsemi", "left_semi",
      "leftanti", "left_anti",
      "cross")
    val listing = supported.mkString("'", "', '", "'")
    s"Unsupported join type '$typ'. Supported join types include: $listing."
  }
}

/** Base of all join types; each one supplies its own `sql` text. */
sealed abstract class JoinType {
  /** The SQL text for this join type. */
  def sql: String
}

/**
 * Common parent of the inner-style join types.
 *
 * `explicitCartesian` tells whether the inner join was built from a CROSS
 * join, i.e. whether a cartesian product was explicitly requested.
 */
sealed abstract class InnerLike extends JoinType {
  /** True when a cartesian product was explicitly requested. */
  def explicitCartesian: Boolean
}

/** Inner join: not an explicit cartesian product; SQL text is `INNER`. */
case object Inner extends InnerLike {
  override def sql: String = "INNER"
  override def explicitCartesian: Boolean = false
}

/** Cross join: an explicitly requested cartesian product; SQL text is `CROSS`. */
case object Cross extends InnerLike {
  override def sql: String = "CROSS"
  override def explicitCartesian: Boolean = true
}

/** Left outer join; SQL text is `LEFT OUTER`. */
case object LeftOuter extends JoinType {
  override def sql: String = "LEFT OUTER"
}

/** Right outer join; SQL text is `RIGHT OUTER`. */
case object RightOuter extends JoinType {
  override def sql: String = "RIGHT OUTER"
}

/** Full outer join; SQL text is `FULL OUTER`. */
case object FullOuter extends JoinType {
  override def sql: String = "FULL OUTER"
}

/** Left semi join; SQL text is `LEFT SEMI`. */
case object LeftSemi extends JoinType {
  override def sql: String = "LEFT SEMI"
}

/** Left anti join; SQL text is `LEFT ANTI`. */
case object LeftAnti extends JoinType {
  override def sql: String = "LEFT ANTI"
}

/**
 * Join type that carries an `exists` attribute and has no SQL form.
 *
 * @param exists the attribute associated with this join
 */
case class ExistenceJoin(exists: Attribute) extends JoinType {
  // Only used at the end of the optimizer and in physical plans, so SQL is
  // never generated for it; asking for it is always an error.
  override def sql: String = throw new UnsupportedOperationException
}

/**
 * NATURAL variant of another join type.
 *
 * Construction fails (via `require`) unless `tpe` is one of `Inner`,
 * `LeftOuter`, `RightOuter` or `FullOuter`.
 *
 * @param tpe the underlying join type
 */
case class NaturalJoin(tpe: JoinType) extends JoinType {
  require(
    Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe),
    s"Unsupported natural join type $tpe")

  /** The wrapped type's SQL text prefixed with `NATURAL `. */
  override def sql: String = s"NATURAL ${tpe.sql}"
}

/**
 * USING variant of another join type, carrying the column names to join on.
 *
 * Construction fails (via `require`) unless `tpe` is one of `Inner`,
 * `LeftOuter`, `LeftSemi`, `RightOuter`, `FullOuter` or `LeftAnti`.
 *
 * @param tpe          the underlying join type
 * @param usingColumns the column names carried by this join
 */
case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType {
  require(
    Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe),
    s"Unsupported using join type $tpe")

  /** The wrapped type's SQL text prefixed with `USING `. */
  override def sql: String = s"USING ${tpe.sql}"
}

/** Extractor that matches `LeftSemi`, `LeftAnti` and any `ExistenceJoin`. */
object LeftExistence {

  /**
   * @param joinType the join type to test
   * @return `Some(joinType)` for `LeftSemi`, `LeftAnti` or an `ExistenceJoin`;
   *         `None` for every other join type
   */
  def unapply(joinType: JoinType): Option[JoinType] = joinType match {
    case LeftSemi | LeftAnti | _: ExistenceJoin => Some(joinType)
    case _ => None
  }
}

一些計算器的例子使用這種link

+0

這並沒有告訴我例如 left semi(左半連接)是什麼意思 – pathikrit