2015-11-01 44 views
0

我正試圖收集所有(或在有限的Twitter firehose中)特定單詞的推文,並將Twitter流式傳輸一天放入JSON文件中。這裏是我的代碼:Twitter和單行JSON輸出

import re 
import twitter 
import numpy as np 
import pandas as pd 
import os 
import json 
import time 
import datetime 

q = 'JustinBieber' #just for the sake of demonstration 
max_time = 86400 

CONSUMER_KEY = '...' 
CONSUMER_SECRET = '...' 
OAUTH_TOKEN = '...' 
OAUTH_TOKEN_SECRET = '...' 

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET) 

twitter_api = twitter.TwitterStream(auth=auth) 

count = 0 
max_iter = None 

twitter_stream = twitter.TwitterStream(auth=twitter_api.auth) 

stream = twitter_stream.statuses.filter(track=q) 

statuses = [] 

start = datetime.datetime.now() 
startstr = str(start) 
dummy = str(start.month)+str(start.day)+str(start.year) 
jsonfile = str(q)+dummy+'.json' 

with open(jsonfile, 'w') as f: 

    try: 
     for tweet in stream: 

      json.dump(tweet, f) 
      count += 1 

      #if count % 100 == 0: # for Justin Beiber 
      if count % 5 == 0:  # for less popular 

       print('{0} tweets fetched...'.format(count)) 

      now = datetime.datetime.now() 

      if now - start > datetime.timedelta(0, 120, 0): 
       break 

      if max_iter is not None and count >= max_iter: 
       #f.close() 
       break 

    except Exception as e: 
     print(e) 

f.close() 

當我這樣做,我得到一個非常長的,單成蔭的JSON文件看起來像這樣(對於不同的搜索詞):

{"favorited": false, "contributors": null, "truncated": false, "text": "This Saturday is #Trailfest, 10k and 15m trail running races. Good luc 
k to all of the runners participating! http://t.co/pxgPNn432c", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id" 
: null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified" 
: false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_colo 
r": "B00100", "profile_text_color": "999894", "followers_count": 1414, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profil 
e_background_color": "010002", "listed_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/3539 
72397/P1070158_-_Version_2_copy.jpg", "utc_offset": -21600, "statuses_count": 683, "description": "Pajarito Mountain Ski Area is friendly, cha 
llenging and authentic. The perfect place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, " 
location": "Los Alamos, NM", "profile_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL 
94A0_normal.jpeg", "following": null, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972", 
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito M 
ountain", "lang": "en", "profile_background_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null, "url": 
"http://www.skipajarito.com", "created_at": "Tue Mar 24 22:29:08 +0000 2009", "contributors_enabled": false, "time_zone": "Mountain Time (US & 
Canada)", "protected": false, "default_profile": false, "is_translator": false}, "filter_level": "low", "geo": null, "id": 653676227929210880 
, "favorite_count": 0, "lang": "en", "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [17, 27], "text": "Trailfest"}] 
, "urls": [], "media": [{"expanded_url": "http://twitter.com/SkiPajarito/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/p 
xgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "6536762276104560 
64", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large": {"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize": 
"fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [109, 131], "type": "photo", "id": 653676227610456064, "media_ 
url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "in_reply_to_user_id_str": null, "retweeted": false, "coordinates": null, "timestam 
p_ms": "1444683532112", "source": "<a href=\"http://www.hootsuite.com\" rel=\"nofollow\">Hootsuite</a>", "in_reply_to_status_id_str": null, "i 
n_reply_to_screen_name": null, "id_str": "653676227929210880", "extended_entities": {"media": [{"expanded_url": "http://twitter.com/SkiPajarit 
o/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/pxgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https: 
//pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "653676227610456064", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large": 
{"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize": "fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "i 
ndices": [109, 131], "type": "photo", "id": 653676227610456064, "media_url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "place": nul 
l, "retweet_count": 0, "created_at": "Mon Oct 12 20:58:52 +0000 2015", "in_reply_to_user_id": null}{"favorited": false, "contributors": null, 
"truncated": false, "text": "Sleep with a spoon? Pray to Ullr? Which of these rituals do you partake in? http://t.co/KCXMfsR318 @PowderMagazin 
e http://t.co/JElQ95Qr6R", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id": null, "user": {"follow_request_sent 
": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified": false, "profile_image_url_https": " 
https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_color": "B00100", "profile_text_color": " 
999894", "followers_count": 1417, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profile_background_color": "010002", "liste 
d_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", 
"utc_offset": -21600, "statuses_count": 684, "description": "Pajarito Mountain Ski Area is friendly, challenging and authentic. The perfect 
place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, "location": "Los Alamos, NM", "profile 
_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "following": null, 
"geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972", "profile_background_image_url": "htt 
p://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito Mountain", "lang": "en", "profile_back 
ground_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null,... 

我曾嘗試解析它在多種方式,如:

with open(filename, 'rb') as f: 
    data = f.readlines() 
data = map(lambda x: x.rstrip(), data) 
data_json_str = "[" + ','.join(data) + "]" 
data_df = pd.read_json(data_json_str) 

它得到我的錯誤:

JSONDecodeError:分機ra數據:第1行第3792行 - 第1行第69900行(char 3791 - 69899)

有關我可能做錯什麼的想法?

+0

使用'縮進= 2'(也許'sort_keys = TRUE')中那樣json.dump通話。 我不確定我瞭解你的問題。是否需要閱讀由json.dump編寫的json文件? – Berci

+0

爲什麼不用一次調用'json.load(f)'解析返回的文件並查看會發生什麼? – Pynchia

+0

將嘗試。我現在正在飛機上,而我們即將不得不關閉。當我回到wifi時更多... –

回答

0

你可以嘗試做這樣的:

file = './Sample-2-Tweets.json' 
    count = 0 
    text = "" 
    if file != None: 
     with open(file) as data_file: 
      for row in data_file:`enter code here` 
       data = json.loads(row) 
       count = count + 1 
       text = text + "\nTweet Created at: " + data['createdAt']['$date'] + "\nGeo-Location Latitude: " + str(data['geoLocation']['latitude']) + "\nGeo-Location Longitude: " + str(data['geoLocation']['longitude']) + "\nTweet Text " + data['text'] + "\nPlace Name: " + data['place']['name'] + "\nPlace Full Name: " + data['place']['fullName'] + "\n\n...... Next Tweet ........" 
     scr.insert(tk.INSERT, text) 
     print("Counter " + str(count))