from django.http import HttpResponse
from bs4 import BeautifulSoup as bsoup
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
import zipfile
from django.core.servers.basehttp import FileWrapper
def getdata(request):
    """Download the images of a page and save a rewritten copy of its HTML.

    When the request carries a non-empty ``q`` GET parameter, the page at a
    fixed URL is fetched, every ``<img>`` it references is downloaded into
    the local image directory, each tag's ``src`` is pointed at the
    downloaded copy, and the resulting markup is written to ``output.html``
    (a relative path, so it lands in the process's working directory).

    Args:
        request: Django request object; only ``request.GET`` is read.

    Returns:
        An ``HttpResponse`` whose body is the scraped URL, or the text
        ``'You submitted nothing!'`` when ``q`` is missing or empty.
    """
    # Raw string: the backslashes are path separators, not escape sequences.
    out = r'C:\Users\user\Desktop\images'

    if not request.GET.get('q'):
        return HttpResponse('You submitted nothing!')

    # NOTE: the submitted value of ``q`` is not used yet; the target page is
    # still hard-coded.
    url = "http://google.com"
    soup = bsoup(urlopen(url))

    for image in soup.findAll("img"):
        src = image.get("src")
        if not src:
            # An <img> without a source has nothing to download.
            continue
        print("Old Image Path: %s" % src)

        # Get file name
        filename = src.split("/")[-1]
        if not filename:
            # A source ending in "/" names no file; saving it would target
            # the image directory itself.
            continue
        outpath = os.path.join(out, filename)

        # Resolve the source against the page URL *before* the tag is
        # rewritten: absolute sources are kept as they are, relative ones
        # are completed from the page's scheme, host and path.
        remote = urlparse.urljoin(url, src)
        # Save to the file path, never to the directory.
        urlretrieve(remote, outpath)

        image["src"] = outpath
        print("New Path: %s" % outpath)

    # Write the rewritten markup to a file so the output can be checked.
    html = soup.prettify("utf-8")
    with open("output.html", "wb") as fh:
        fh.write(html)

    return HttpResponse(url)
補充一點可能有幫助的說明:在該文件的屬性中,Windows列出了屬性A,據我的理解,這意味着Windows將其識別爲存檔文件。 – johns4ta
這裏涉及多個因素。首先,我不確定谷歌是否會提供標準頁面。谷歌可能會爲不同的用戶代理提供不同的內容。你還說這些是存檔文件,這一點我不確定,但你應該查看響應的內容標頭,從中也許能找到編碼方式,它應該是gzip之類的。 – dusual
我只是用谷歌來測試,因爲它的頁面上只有一張需要解析的圖片。我嘗試用7-zip打開文件,但收到一條錯誤消息,說無法將該文件作爲存檔打開。 – johns4ta