I converted my existing Python code (pasted below) to PySpark. Compared with the pure-Python version, the PySpark code does not perform any better.
Python code:
import json
import csv

def main():
    # create a simple JSON array
    with open('paytm_tweets_data_1495614657.json') as str:
        tweetsList = []
        # change the JSON string into a JSON object
        jsonObject = json.load(str)
        #print(jsonObject)

        # print the keys and values
        for i in range(len(jsonObject)):
            tweetsList.insert(i, jsonObject[i]["text"])
        #print(tweetsList)
        displaySentiment(tweetsList)

def displaySentiment(tweetsList):
    aDict = {}
    from sentiment import sentiment_score
    for i in range(len(tweetsList)):
        aDict[tweetsList[i]] = sentiment_score(tweetsList[i])
    print(aDict)

    with open('PaytmtweetSentiment.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Tweets", "Sentiment Value"])
        writer.writeheader()
        writer = csv.writer(csv_file)
        for key, value in aDict.items():
            writer.writerow([key, value])

if __name__ == '__main__':
    main()
Converted PySpark code:
import json
import csv
import os

from pyspark import SparkContext, SparkConf
from pyspark.python.pyspark.shell import spark

os.environ['PYSPARK_PYTHON'] = "/usr/local/bin/python3"

def main():
    path = "/Users/i322865/DeepInsights/bitbucket-code/ai-engine/twitter-sentiment-analysis/flipkart_tweets_data_1495601666.json"
    peopleDF = spark.read.json(path).rdd
    df = peopleDF.map(lambda row: row['text'])
    displaySentiment(df.collect())

def displaySentiment(tweetsList):
    from sentiment import sentiment_score
    aDict = sentiment_score(tweetsList)

    with open('paytmtweetSentiment.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Tweets", "Sentiment Value"])
        writer.writeheader()
        writer = csv.writer(csv_file)
        for i in range(len(tweetsList)):
            writer.writerow([tweetsList[i], aDict[i]])
            print([tweetsList[i], aDict[i]])

if __name__ == '__main__':
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext.getOrCreate(conf=conf)
    main()
I ran both programs but did not see any significant performance improvement. What am I missing? Could you suggest some ideas?
Also, should I be using 'reduce' as well? Currently I am only using 'map'.
This type of question is not really a good fit for this site, and frankly this is still bad code, no offense of course! PySpark is not a programming language. Then again, neither is Python. – eliasah
@eliasah Sorry, I have revised the question. Thanks for the quick feedback. – coders
Calling 'df.collect()' twice is of course less performant. Calling it at all renders Spark mostly useless –
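For illustration, here is a minimal sketch of what that comment points at: keeping the scoring inside the map transformation so it runs on the executors rather than collecting raw tweets to the driver first. It assumes sentiment_score() accepts a single tweet string, as in the pure-Python version, and that the sentiment module is importable on every worker; the function and file names are only placeholders.

from pyspark.sql import SparkSession
import csv

def score_tweets(path, out_csv):
    spark = SparkSession.builder.appName("TweetSentiment").getOrCreate()
    # read the JSON file and keep only the tweet text
    tweets = spark.read.json(path).rdd.map(lambda row: row['text'])

    def score(tweet):
        # imported inside the function so each executor loads the module itself
        from sentiment import sentiment_score
        return (tweet, sentiment_score(tweet))

    # scoring happens in parallel on the executors
    scored = tweets.map(score)
    # only the finished (tweet, score) pairs are brought back to the driver
    results = scored.collect()

    with open(out_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Tweets", "Sentiment Value"])
        writer.writerows(results)

As for reduce: it is only relevant when values are aggregated into a single result, for example scored.map(lambda kv: kv[1]).reduce(lambda a, b: a + b) to sum the scores; for per-tweet scoring a map alone is sufficient.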