我製作了這個愚蠢的小應用程序:它使用 Twitter 的 API 掃描最近的「x」條推文,找出其中的短網址(tiny URL),解析出每個短網址實際指向的地址,統計被推送過的頂級域名的出現頻率,並根據頻率輸出一個 HTML 標籤雲(tag cloud)頁面。(標題:Python、Google App Engine 和 Twitter:我的程序不能正常工作)
但它不能正常工作。其中有一些問題我會自己處理,我認爲那些並不是我最關心的。我真正想解決的是以下兩件事:
有時候應用程序會崩潰(特別是當我選擇掃描大量推文時),錯誤通常是「Downloaderror Applicationerror 2」和「Downloaderror Applicationerror 5」。另一個我無法解決的問題是運行所需的時間……它很慢。我嘗試過設置較短的超時時間,但是如果掃描的推文很多,它仍然需要運行很久。
任何想法?謝謝!
import logging
import wsgiref.handlers
from google.appengine.ext import webapp
import urllib2
from urllib import urlencode
from urllib2 import urlopen
from BeautifulSoup import BeautifulStoneSoup
import socket
import re
from urlparse import urlparse
from google.appengine.api import urlfetch
#from google.appengine.api.urlfetch import DownloadError
#timeout = 3
#socket.setdefaulttimeout(timeout)
class Link(object):
    """A link string paired with a number.

    processFeed creates one instance per URL found in a tweet title,
    always with a number of 0.
    """

    def __init__(self, a, b):
        """Store *a* as the link text and *b* as its number."""
        # link: the URL (or domain) string; number: the value paired with it.
        self.link = a
        self.number = b

    def __str__(self):
        """Return the link and its number as ``"<link> ; <number>"``."""
        return "%s ; %s" % (self.link, self.number)

    def __repr__(self):
        """Return an unambiguous representation, useful in logs."""
        return "Link(%r, %r)" % (self.link, self.number)
def getFeed(i):
    """Open page *i* of the Twitter search Atom feed and return the response.

    The query, start date and page size (100 results) are fixed in the URL;
    only the page number varies. The returned object is whatever ``urlopen``
    gives back and is read by the caller.
    """
    feed_url = 'http://search.twitter.com/search.atom?q=twitter&since=2010-02-28&rpp=100&page=%i' % (i)
    return urlopen(feed_url)
def _firstUrl(text):
    """Return the first http(s) URL embedded in *text*, or None if there is none."""
    match = re.search(r"(?P<url>https?://[^\s]+)", text)
    if match is None:
        return None
    return match.group("url")


def _resolveUrl(url):
    """Open *url*, following redirects, and return the final URL (None on failure).

    Handles good URLs, nonsense URLs (http://asdfaf) and pages that are not
    found. Failures are logged rather than raised so that one bad link cannot
    abort the whole scan.
    """
    try:
        response = urllib2.urlopen(url)
    except (urllib2.URLError, urlfetch.DownloadError, UnicodeError):
        # urllib2.HTTPError is a subclass of URLError, so it is covered here.
        # urlfetch.DownloadError is what the reported "ApplicationError: 2"
        # (connection) and "ApplicationError: 5" (timeout) crashes raise; the
        # previous `except urllib2.URLError` did not stop them.
        logging.warning("Could not resolve %r", url, exc_info=True)
        return None
    try:
        return response.url
    finally:
        response.close()


def _siteDomain(url):
    """Return the last two labels of *url*'s host, or None if it has no host.

    'http://www.example.com:8080/x' -> 'example.com'. Punctuation that the
    URL pattern may have swallowed from the tweet text (a trailing '.', ')',
    ...) is stripped instead of cutting the last label to three characters,
    which used to turn '.info' into '.inf'.
    """
    host = urlparse(url).hostname  # lower-cased, without port or credentials
    if not host:
        return None
    host = host.rstrip(".,;:!?)]}>'\"")
    labels = [label for label in host.split('.') if label]
    if not labels:
        return None
    # A single-label host ('localhost') is returned as is.
    return '.'.join(labels[-2:])


def processFeed(f):
    """Count the domains that the links in a Twitter search feed lead to.

    *f* is a file-like object whose ``read()`` returns the Atom feed. For each
    entry whose title contains an http(s) URL, the URL is opened to follow
    redirects (e.g. URL shorteners) and the domain of the final address is
    counted. When a URL cannot be resolved, the domain of the URL as tweeted
    is counted instead.

    Returns a dict mapping 'domain.tld' strings to the number of occurrences.

    NOTE: the URLs are fetched one after another, so the run time grows with
    the number of links found in the feed.
    """
    soup = BeautifulStoneSoup(f.read(), selfClosingTags=["link"])
    final = {}
    for entry in soup.findAll("entry"):
        title_tag = entry.find('title')
        if title_tag is None or not title_tag.contents:
            continue
        tweeted_url = _firstUrl(title_tag.contents[0])
        if tweeted_url is None:
            continue
        destination = _resolveUrl(tweeted_url) or tweeted_url
        domain = _siteDomain(destination)
        if domain is None:
            continue
        final[domain] = final.get(domain, 0) + 1
    return final
def TagCloudDivHeader(txt):
    """Return the opening markup of the tag cloud, with *txt* as its title.

    The outer 'tagcloud' div is left open; the caller is expected to close it.
    """
    container_open = "<div class = 'tagcloud'>\n"
    title_block = "<div class = 'tagcloudtitle'>%s</div>\n" % txt
    return container_open + title_block
def TagCloudDivFooter():
    """Return the markup that closes the div left open by TagCloudDivHeader."""
    closing_markup = "</div>\n"
    return closing_markup
def size(freq):
    """Return the font size for a term seen *freq* times.

    writeTerm uses the result as an ``em`` value; the frequency is currently
    used unchanged.
    """
    font_size = freq
    return font_size
def _escapeHtml(text):
    """Escape the characters that are special in HTML text and quoted attributes."""
    return (text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))


def writeTerm(term, freq):
    """Return the tag-cloud ``<span>`` for *term*, sized by ``size(freq)``.

    *term* is a domain name taken from tweet text, so it is HTML-escaped
    before being written into the link target and the link text.
    """
    # Keep the lossy Latin-1 conversion of the visible text (characters
    # outside Latin-1 become '?'), but decode it back to text: joining the
    # encoded bytes with the rest of the markup raised UnicodeDecodeError for
    # non-ASCII unicode terms.
    visible = term.encode('ISO-8859-1', 'replace').decode('ISO-8859-1')
    return " <span class='term' style='font-size:%sem'><a href = 'http://%s'>%s</a></span> \n" % (
        size(freq), _escapeHtml(term), _escapeHtml(visible))
def genForm(prompt=""):
    """Return the HTML form used to choose how many tweets to scan.

    The form posts a ``Tweets`` field to ``index.py`` with a value between
    100 and 1500 in steps of 100. When *prompt* is non-empty it is placed
    in a 'formtitle' div before the form; it is inserted as given, without
    escaping.
    """
    parts = []
    if prompt:
        parts.append("<div class= 'formtitle'>%s</div>" % (prompt))
    parts.append('<form action="index.py" method="post">')
    parts.append('<label for="Tweets">Number of Tweets to scan:</label>\n'
                 '<select id="Tweets" name="Tweets">')
    for hundreds in range(1, 16):
        count = hundreds * 100
        parts.append('<option value = "%i">%i</option>' % (count, count))
    parts.append("</select>")
    parts.append('<input type="submit" value="Go" name="gobtn"/> </form>')
    # "</br>" is not a valid tag; emit a proper line break instead.
    parts.append("<br />WARNING!!!! The fewer Tweets you scan, the more stable this program is!!!!")
    return "".join(parts)
def makeTagCloud(cloudtitle, items):
    """Return the tag-cloud markup for *items*, followed by the page footer.

    *items* maps each term to its frequency; terms appear in the mapping's
    own iteration order. The result also contains HTMLFooter(), so it
    completes the page.
    """
    chunks = [TagCloudDivHeader(cloudtitle)]
    chunks.extend([writeTerm(term, items[term]) for term in items])
    chunks.append(TagCloudDivFooter())
    chunks.append(HTMLFooter())
    return ''.join(chunks)
def HTMLHeader(pageheader=""):
    """Return the start of the HTML page, up to and including ``<body>``.

    *pageheader* becomes the document title. The page links the stylesheet
    at /assets/mystyles.css.
    """
    pieces = [
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">",
        "\n<html><head>\n <title>%s</title>\n" % pageheader,
        "<link rel='stylesheet' href='/assets/mystyles.css' type='text/css' />\n",
        "</head>\n",
        "<body>\n",
    ]
    return "".join(pieces)
def HTMLFooter():
    """Return the markup that closes the ``<body>`` and ``<html>`` elements."""
    closing_markup = "</body>\n</html>"
    return closing_markup
def generateLinks():
    """Return a complete page containing a single 'Proceed?' link to results/."""
    page_title = "Who's getting the most traffic from Twitter?"
    proceed_paragraph = "<p>" + "<a href = 'results/'>Proceed?</a>" + "</p>\n"
    return "".join([HTMLHeader(page_title), proceed_paragraph, HTMLFooter()])
class MainHandler(webapp.RequestHandler):
    """Serve the scan form on GET and the domain tag cloud on POST."""

    # Feed pages selectable through the form (100..1500 tweets, 100 per page).
    _MIN_PAGE = 1
    _MAX_PAGE = 15

    def get(self):
        """Write the page containing the 'number of tweets' form."""
        self.response.headers['Content-Type'] = 'text/html'
        logging.info("path is %s", self.request.path)
        self.response.out.write(HTMLHeader("Who's getting the most traffic from Twitter?"))
        self.response.out.write(genForm())
        self.response.out.write(HTMLFooter())

    def post(self):
        """Scan the requested feed page and write the resulting tag cloud."""
        self.response.headers['Content-Type'] = 'text/html'
        try:
            requested = int(self.request.get('Tweets'))
        except ValueError:
            # Missing or non-numeric field: fall back to the smallest option.
            requested = 100
        # Keep the page number inside the range the form offers, so that a
        # hand-crafted request cannot ask for page 0 or an arbitrarily large one.
        page = max(self._MIN_PAGE, min(self._MAX_PAGE, requested // 100))
        # NOTE(review): getFeed() requests a single page of 100 results, so
        # choosing e.g. 500 scans only the fifth page rather than 500 tweets;
        # confirm whether that is intended.
        # The page header goes first so nothing is written before the DOCTYPE.
        self.response.out.write(HTMLHeader("Domain cloud for Twitter Tweets"))
        self.response.out.write("Where are links are Twitter taking you?")
        self.response.out.write(makeTagCloud("Domains most linked to by Tweets", processFeed(getFeed(page))))
def main():
    """Build the WSGI application (every path goes to MainHandler) and run it as CGI."""
    routes = [('/.*', MainHandler)]
    application = webapp.WSGIApplication(routes, debug=True)
    cgi_handler = wsgiref.handlers.CGIHandler()
    cgi_handler.run(application)


if __name__ == '__main__':
    main()
你後來解決這個問題了嗎? – theheadofabroom 2011-07-03 14:43:56