我正在學習python爬蟲這些天,並且我寫了一個簡單的爬蟲來獲取Pixiv ID在Pixiv上的圖片。如何優化我的python爬蟲的內存使用情況
它工作的很好,但是這裏出現了一個大問題:當它運行時,它佔用了我的計算機上近1.2G的內存。
但是,有時它只佔用10M內存,我真的不知道哪些代碼導致如此大的內存使用率。
我已經上傳腳本到我的VPS(只有768M內存Vulter服務器)並試圖運行。結果,我得到了一個MerroyError。
所以我想知道如何優化內存使用(即使花費更多的時間來運行)。
這裏是我的代碼:
(我已經重寫了所有代碼,使其通過pep8
,如果還不清楚,請告訴我哪些代碼讓你感到困惑)
from lxml import etree
import re
import os
import requests
# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p0")
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Picture_Type = [".png", ".jpg", ".gif"]
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
+ Posttime+Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"\
+ str(Pixiv_ID) + "_p0"\
+ Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
os.mkdir("./pic/" + str(Pixiv_ID))
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p")
Manga_URL = "http://www.pixiv.net/"+Tag_a_href
Manga_HTML = requests.get(Manga_URL, headers=Headers)
Manga_XML = etree.HTML(Manga_HTML.content)
Manga_Pages = Manga_XML.xpath('/html/body'
'/nav[@class="page-menu"]'
'/div[@class="page"]'
'/span[@class="total"]/text()')[0]
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Manga_HTML.close()
Picture_Type = [".png", ".jpg", ".gif"]
for Number in range(int(Manga_Pages)):
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + \
".pixiv.net/img-original/"\
+ Posttime + str(Number) + Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"+str(Pixiv_ID) + "/"\
+ str(Pixiv_ID) + "_p"\
+ str(Number) + Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Main function.
def get_pic(Pixiv_ID):
Index_URL = "http://www.pixiv.net/member_illust.php?"\
"mode=medium&illust_id="+str(Pixiv_ID)
Headers = {'referer': Index_URL}
Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
if Index_HTML.status_code != 200:
return Index_HTML.status_code
Index_XML = etree.HTML(Index_HTML.content)
Tag_a_href_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/@href')
Tag_img_src_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/img/@src')
if Tag_a_href_List == [] or Tag_img_src_List == []:
return 404
else:
Tag_a_href = Tag_a_href_List[0]
Tag_img_src = Tag_img_src_List[0]
Index_HTML.close()
if Tag_a_href.find("manga") != -1:
return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
else:
return get_single(Pixiv_ID, Tag_img_src, Headers)
# Check whether the picture already exists.
def check_exist(Pixiv_ID):
if not os.path.isdir("Pic"):
os.mkdir("Pic")
if os.path.isdir("./Pic/"+str(Pixiv_ID)):
return True
Picture_Type = [".png", ".jpg", ".gif"]
Picture_Exist = False
for i in range(len(Picture_Type)):
Path = "./Pic/" + str(Pixiv_ID)\
+ "_p0" + Picture_Type[i]
if os.path.isfile(Path):
return True
return Picture_Exist
# The script starts here.
for i in range(0, 38849402):
Pixiv_ID = 38849402-i
Picture_Exist = check_exist(Pixiv_ID)
if not Picture_Exist:
Return_Code = get_pic(Pixiv_ID)
if Return_Code == 200:
print str(Pixiv_ID), "finish!"
elif Return_Code == -1:
print str(Pixiv_ID), "got an unknown error."
elif Return_Code == 404:
print str(Pixiv_ID), "not found. Maybe deleted."
else:
print str(Pixiv_ID), "picture exists!"
這是一個太大的混亂通過,你應該嘗試memory_profiler。乍看之下,它看起來像是一次讀取圖像。嘗試編寫一個[MCVE],如果可能的話,很難遵循所有的全局變量,非標準的命名等。 – pvg
@pvg我已經評論了我的腳本的變量和邏輯。現在清楚了嗎? – Kon
它並沒有太大的幫助,試着像'r = requests.get(url,stream = True)'這樣的請求流式傳輸。在'iter_content'中設置chunk_size爲None,因爲5很荒謬。 – pvg