SparkException during DataFrame write to HDFS

I am running the code below on a Hadoop cluster with YARN. It parses a set of emails, annotates them with sentiment, and finally writes the resulting DataFrames as Parquet tables to HDFS. Unfortunately, it keeps failing on the last DataFrame write to HDFS (line #66), with the error shown at the bottom. What I cannot explain is that the job finishes successfully whenever I run it on a small sample of the dataset.
import java.util.Properties

import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, SaveMode}

object ETLDriver {
  val appName = "ENRON-etl"
  val conf = new SparkConf().setAppName(appName)
  val sc = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val allExtracted = sc.objectFile[(String, Seq[String])](Commons.ENRON_EXTRACTED_TXT)
    // Testing on a sub-sample
    // val allExtracted = sc.objectFile[(String, Seq[String])](Commons.ENRON_EXTRACTED_TXT).sample(false, 0.01, 42)

    // get custodians from csv file stored in HDFS
    val csv = sc.textFile(Commons.ENRON_CUSTODIANS_CSV_HDFS).map { line => line.split(",") }
    val custodians = sc.broadcast(csv.map { record =>
      Custodian(record(0), record(1), Option(record(2)))
    }.collect().toSeq)

    // parse emails, skipping the ones that fail to parse
    val allParsed: RDD[MailBox] = allExtracted.map { case (mailbox, emails) =>
      val parsedEmails = emails flatMap { email =>
        try Some(EmailParser.parse(email, custodians.value))
        catch {
          case e: EmailParsingException => None
        }
      }
      MailBox(mailbox, parsedEmails)
    }

    // classify sentiment and save w/o body
    val mailboxesSentiment = allParsed.map { mailbox =>
      // load sentiment annotator pipeline
      val nlpProps = new Properties
      nlpProps.setProperty("annotators", "tokenize, ssplit, pos, parse, lemma, sentiment")
      val pipeline = new StanfordCoreNLP(nlpProps)
      // annotation
      val emailsWithSentiment = mailbox.emails.map { email =>
        val document = new Annotation(email.body)
        pipeline.annotate(document)
        val sentiment = document.get[String](classOf[SentimentCoreAnnotations.ClassName])
        EmailWithSentiment(email.date, email.from, email.to ++ email.cC ++ email.bcc, email.subject, sentiment)
      }
      MailBoxWithSentiment(mailbox.name, emailsWithSentiment)
    }

    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val dfFull = allParsed.toDF()
    dfFull.write.mode(SaveMode.Overwrite).parquet(Commons.ENRON_FULL_DATAFRAME)
    val dfSentiment = mailboxesSentiment.toDF()
    // line #66: this is the write that fails on the full dataset
    dfSentiment.write.mode(SaveMode.Overwrite).parquet(Commons.ENRON_SENTIMENT_DATAFRAME)
  }
}
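For what it's worth, the sentiment step above constructs a new StanfordCoreNLP pipeline inside map for every MailBox record, which is expensive at full-dataset scale. Below is a minimal sketch of building the pipeline once per partition with mapPartitions instead, reusing the types from the listing above; it is only an illustration, not the code that produced the error.

// Sketch (illustration only): one CoreNLP pipeline per partition instead of one per record.
val mailboxesSentiment = allParsed.mapPartitions { mailboxes =>
  val nlpProps = new Properties
  nlpProps.setProperty("annotators", "tokenize, ssplit, pos, parse, lemma, sentiment")
  val pipeline = new StanfordCoreNLP(nlpProps) // created once for the whole partition
  mailboxes.map { mailbox =>
    val emailsWithSentiment = mailbox.emails.map { email =>
      val document = new Annotation(email.body)
      pipeline.annotate(document)
      val sentiment = document.get[String](classOf[SentimentCoreAnnotations.ClassName])
      EmailWithSentiment(email.date, email.from, email.to ++ email.cC ++ email.bcc, email.subject, sentiment)
    }
    MailBoxWithSentiment(mailbox.name, emailsWithSentiment)
  }
}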
Error:
AM Container for appattempt_1456482988572_5307_000001 exited with exitCode: 15
For more detailed output, check application tracking page:http://head05.custer_url:8088/cluster/app/application_1456482988572_5307Then, click on links to logs of each attempt.
Diagnostics: Exception from container-launch.
Container id: container_1456482988572_5307_01_000001
Exit code: 15
Stack trace: ExitCodeException exitCode=15:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:576)
at org.apache.hadoop.util.Shell.run(Shell.java:487)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:753)
at org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.launchContainer(LinuxContainerExecutor.java:371)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:302)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Shell output: main : command provided 1
main : run as user is lsde03
main : requested yarn user is lsde03
Container exited with a non-zero exit code 15
Failing this attempt
More logs from Spark here.
Could you point out which line is line #66, since there are no line numbers on StackOverflow? Is it this one? 'dfSentiment.write.mode(SaveMode.Overwrite).parquet(Commons.ENRON_SENTIMENT_DATAFRAME)' – morganw09dev
Yes, that's the line –
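For completeness: Spark evaluates lazily, so the parquet(...) call on line #66 is simply the first action that forces the whole upstream sentiment computation, and the root cause may therefore lie in the annotation step rather than in the write itself. A minimal diagnostic sketch (an assumption added here, not part of the original post) that forces the sentiment RDD before writing:

import org.apache.spark.storage.StorageLevel

// Diagnostic sketch: force the sentiment computation first, so a failure here
// points at the annotation step rather than at the Parquet write.
mailboxesSentiment.persist(StorageLevel.MEMORY_AND_DISK)
println(s"mailboxes with sentiment: ${mailboxesSentiment.count()}")

// If the count succeeds, the write only has to serialize the cached result.
mailboxesSentiment.toDF().write.mode(SaveMode.Overwrite).parquet(Commons.ENRON_SENTIMENT_DATAFRAME)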