
Spark 2.0 - Converting a DataFrame to a Dataset

I want to load my data and run some basic linear regression on it. First I need VectorAssembler to build my features column. However, when I call assembler.transform(df), df is a DataFrame, and it expects a Dataset. I tried df.toDS, but it gives value toDS is not a member of org.apache.spark.sql.DataFrame. Indeed, toDS is a member of org.apache.spark.sql.DatasetHolder.

What am I doing wrong here?

package main.scala

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DatasetHolder
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

object Analyzer {
  def main(args: Array[String]) {

    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Load the tab-separated input files
    val df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "false")
      .option("delimiter", "\t")
      .option("parserLib", "UNIVOCITY")
      .option("inferSchema", "true")
      .load("data/snap/*")

    // Combine the numeric columns into a single "features" vector column
    val assembler = new VectorAssembler()
      .setInputCols(Array("own", "want", "wish", "trade", "comment"))
      .setOutputCol("features")

    val df1 = assembler.transform(df)

    val formula = new RFormula()
      .setFormula("rank ~ own + want + wish + trade + comment")
      .setFeaturesCol("features")
      .setLabelCol("rank")
  }
}

Answer


Apparently the problem was that I was still using the Spark 1.6-style SQLContext. I switched to SparkSession, and transform() then accepted the DataFrame without any explicit conversion.
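This works because in Spark 2.0 a DataFrame is just a type alias for Dataset[Row], so VectorAssembler.transform(dataset: Dataset[_]) takes a DataFrame as-is. The toDS the question was looking for lives on DatasetHolder, which the implicits provide for local Seqs and RDDs rather than for DataFrames; if a typed Dataset is really wanted, .as[T] with a case class is the way. A minimal sketch (the SnapRow case class and the sample row are purely illustrative; column names are taken from the question):

import org.apache.spark.sql.{Dataset, SparkSession}

case class SnapRow(own: Double, want: Double, wish: Double,
                   trade: Double, comment: Double, rank: Double)

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

// A DataFrame in Spark 2.0 is Dataset[Row], built here from a single sample row ...
val df = Seq((1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
  .toDF("own", "want", "wish", "trade", "comment", "rank")

// ... so ml Transformers accept it directly, and a typed Dataset,
// if one is needed, comes from .as[T] rather than toDS:
val typed: Dataset[SnapRow] = df.as[SnapRow]

With that in place, the full program becomes: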

package main.scala

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

object Analyzer {
  def main(args: Array[String]) {

    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val df = spark.read
      .format("com.databricks.spark.csv")
      .option("header", "false")
      .option("delimiter", "\t")
      .option("parserLib", "UNIVOCITY")
      .option("inferSchema", "true")
      .load("data/snap/*")

    df.show()

    val assembler = new VectorAssembler()
      .setInputCols(Array("own", "want", "wish", "trade", "comment"))
      .setOutputCol("features")

    val df1 = assembler.transform(df)
  }
}
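From here the original goal, a basic linear regression on the assembled features, is a small extra step. A hedged sketch, assuming rank is a numeric label column and df1 is the assembled DataFrame from above:

import org.apache.spark.ml.regression.LinearRegression

// Ordinary least squares on the assembled feature vector,
// with the "rank" column as the label (column names as in the question).
val lr = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("rank")

val model = lr.fit(df1)
println(s"Coefficients: ${model.coefficients}  Intercept: ${model.intercept}")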