I've always firmly believed Spark is the best tool for this job. I tested the code below and it worked well.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import com.github.nscala_time.time.Imports._
object Migration {
  def main(args: Array[String]): Unit = {
    // Format a java.util.Date as a "yyyy-M-d" string (no zero padding, e.g. "2015-3-7").
    def changeDate(created: java.util.Date): String = {
      val sDate = new DateTime(created)
      s"${sDate.getYear}-${sDate.getMonthOfYear}-${sDate.getDayOfMonth}"
    }
    // Spark configuration
    val conf = new SparkConf().setAppName("migration")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // CassandraConnector is serializable, so it can be used inside RDD closures.
    val connector = CassandraConnector(conf)
    val rdd = sc.cassandraTable("keyspace", "table_a")

    println("Starting migration...")
    rdd.map { row =>
      // Rebuild each column from the source row; "x" and "y" are stored as epoch millis.
      val x = new java.util.Date(row.getLong("x"))
      val y = new java.util.Date(row.getLong("y"))
      val z = row.getString("z")
      val t = row.getString("t")
      val k = changeDate(x)

      // Insert the transformed row into the target table.
      connector.withSessionDo { session =>
        val statement = session.prepare(
          "INSERT INTO keyspace.table_b (k, y, z, x, t) VALUES (?, ?, ?, ?, ?)")
        session.executeAsync(statement.bind(k, y, z, x, t))
      }
    }.foreach(future => future.getUninterruptibly()) // block until each async insert completes
println("Done.")
} }
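One caveat with the snippet above: session.prepare runs once per row inside map. A common refinement, sketched here against the same assumed schema, is to prepare the statement once per partition with foreachPartition:

rdd.foreachPartition { rows =>
  connector.withSessionDo { session =>
    // Prepare once per partition instead of once per row.
    val statement = session.prepare(
      "INSERT INTO keyspace.table_b (k, y, z, x, t) VALUES (?, ?, ?, ?, ?)")
    rows.foreach { row =>
      val x = new java.util.Date(row.getLong("x"))
      val y = new java.util.Date(row.getLong("y"))
      // Synchronous execute keeps the sketch simple; async with collected futures works too.
      session.execute(statement.bind(changeDate(x), y, row.getString("z"), x, row.getString("t")))
    }
  }
}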
DataFrames aren't available until 1.4, so you'd have to upgrade. I'll share an example job you can start from and adapt to your use case. – phact
See https://github.com/rssvihla/spark_commons/blob/master/examples/spark_bulk_operations/src/main/scala/pro/foundev/scala/SchemaMigration.scala – phact
https://github.com/rssvihla/spark_commons/blob/master/examples/spark_bulk_operations/src/main/scala/pro/foundev/scala/CassandraCapable.scala#L69 – phact
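For reference, a minimal sketch of the same table copy using the DataFrame API phact mentions, assuming Spark 1.4+ with a matching spark-cassandra-connector and the sc/keyspace/table names from the snippet above:

import org.apache.spark.sql.{SQLContext, SaveMode}

val sqlContext = new SQLContext(sc)

// Read table_a through the connector's DataFrame data source.
val df = sqlContext.read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("keyspace" -> "keyspace", "table" -> "table_a"))
  .load()

// Apply any column transformations here, then append into table_b.
df.write
  .format("org.apache.spark.sql.cassandra")
  .options(Map("keyspace" -> "keyspace", "table" -> "table_b"))
  .mode(SaveMode.Append)
  .save()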