2017-03-08 114 views
-1

我正在學習python爬蟲這些天,並且我寫了一個簡單的爬蟲來獲取Pixiv ID在Pixiv上的圖片。如何優化我的python爬蟲的內存使用情況

它工作的很好,但是這裏出現了一個大問題:當它運行時,它佔用了我的計算機上近1.2G的內存。

但是,有時它只佔用10M內存,我真的不知道哪些代碼導致如此大的內存使用率。

我已經上傳腳本到我的VPS(只有768M內存的Vultr服務器)並試圖運行。結果,我得到了一個MemoryError。

所以我想知道如何優化內存使用(即使花費更多的時間來運行)。

這裏是我的代碼:

(我已經重寫了所有代碼,使其通過pep8,如果還不清楚,請告訴我哪些代碼讓你感到困惑)

from lxml import etree 
import re 
import os 
import requests 


# Get a single Picture. 
# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
    """Download the original single picture for Pixiv_ID into ./pic/.

    Tag_img_src is the thumbnail URL scraped from the index page; the
    server number and the "img/<timestamp>_p0" stem are extracted from
    it to build the original-size URL.  Each known extension is tried
    until one answers 200.

    Returns 200 on success, -1 when no extension matched.
    """
    Filter_Server = re.compile(r"\d+")
    Filter_Posttime = re.compile(r"img/[^_]*_p0")
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture = None
    for Extension in (".png", ".jpg", ".gif"):
        Original_URL = ("http://i" + str(Server) + ".pixiv.net/img-original/"
                        + Posttime + Extension)
        Picture = requests.get(Original_URL, headers=Headers, stream=True)
        if Picture.status_code == 200:
            break
        # A streamed response keeps its connection alive until closed;
        # the original leaked one connection per failed attempt.
        Picture.close()
    if Picture is None or Picture.status_code != 200:
        return -1
    Filename = "./pic/" + str(Pixiv_ID) + "_p0" + Extension
    try:
        with open(Filename, "wb") as Picture_File:
            # Fixed-size chunks keep memory bounded no matter how large
            # the image is.
            for chunk in Picture.iter_content(64 * 1024):
                Picture_File.write(chunk)
    finally:
        Picture.close()
    return 200


# Get manga which is a bundle of pictures. 
# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
    """Download every page of a manga post into ./pic/<Pixiv_ID>/.

    The page count is scraped from the manga viewer page referenced by
    Tag_a_href; each page <Number> is then fetched as
    img-original/<stem><Number>.<ext>, trying the known extensions.

    Returns 200 on success, -1 as soon as one page cannot be fetched
    (pages already written are kept).
    """
    os.mkdir("./pic/" + str(Pixiv_ID))
    Filter_Server = re.compile(r"\d+")
    Filter_Posttime = re.compile(r"img/[^_]*_p")
    Manga_URL = "http://www.pixiv.net/" + Tag_a_href
    Manga_HTML = requests.get(Manga_URL, headers=Headers)
    try:
        Manga_XML = etree.HTML(Manga_HTML.content)
        Manga_Pages = Manga_XML.xpath('/html/body'
                                      '/nav[@class="page-menu"]'
                                      '/div[@class="page"]'
                                      '/span[@class="total"]/text()')[0]
        Posttime = Filter_Posttime.findall(Tag_img_src)[0]
        Server = Filter_Server.findall(Tag_img_src)[0]
    finally:
        # Close even when the XPath/regex scraping raises.
        Manga_HTML.close()
    for Number in range(int(Manga_Pages)):
        Picture = None
        for Extension in (".png", ".jpg", ".gif"):
            Original_URL = ("http://i" + str(Server)
                            + ".pixiv.net/img-original/"
                            + Posttime + str(Number) + Extension)
            Picture = requests.get(Original_URL, headers=Headers,
                                   stream=True)
            if Picture.status_code == 200:
                break
            # Release the failed streamed connection immediately; the
            # original leaked one per miss, per page.
            Picture.close()
        if Picture.status_code != 200:
            return -1
        Filename = ("./pic/" + str(Pixiv_ID) + "/"
                    + str(Pixiv_ID) + "_p" + str(Number) + Extension)
        try:
            with open(Filename, "wb") as Picture_File:
                # Bounded chunks keep memory flat for large images.
                for chunk in Picture.iter_content(64 * 1024):
                    Picture_File.write(chunk)
        finally:
            Picture.close()
    return 200


# Main function. 
# Main function.
def get_pic(Pixiv_ID):
    """Fetch the medium page for Pixiv_ID and dispatch to a downloader.

    Returns the HTTP status code when the index fetch fails, 404 when
    the expected tags are absent (post deleted), otherwise the result
    of get_manga() (href contains "manga") or get_single().
    """
    Index_URL = ("http://www.pixiv.net/member_illust.php?"
                 "mode=medium&illust_id=" + str(Pixiv_ID))
    Headers = {'referer': Index_URL}
    # No stream=True: .content is read in full anyway, and a fully-read
    # response releases its connection; the original leaked the stream
    # on the non-200 and 404 paths.
    Index_HTML = requests.get(Index_URL, headers=Headers)
    try:
        if Index_HTML.status_code != 200:
            return Index_HTML.status_code
        Index_XML = etree.HTML(Index_HTML.content)
        # Shared XPath prefix down to the image container.
        Container = ('/html/body'
                     '/div[@id="wrapper"]'
                     '/div[@class="newindex"]'
                     '/div[@class="newindex-inner"]'
                     '/div[@class="newindex-bg-container"]'
                     '/div[@class="cool-work"]'
                     '/div[@class="cool-work-main"]'
                     '/div[@class="img-container"]')
        Tag_a_href_List = Index_XML.xpath(Container + '/a/@href')
        Tag_img_src_List = Index_XML.xpath(Container + '/a/img/@src')
    finally:
        Index_HTML.close()
    if not Tag_a_href_List or not Tag_img_src_List:
        return 404
    Tag_a_href = Tag_a_href_List[0]
    Tag_img_src = Tag_img_src_List[0]
    if "manga" in Tag_a_href:
        return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
    return get_single(Pixiv_ID, Tag_img_src, Headers)


# Check whether the picture already exists. 
# Check whether the picture already exists.
def check_exist(Pixiv_ID):
    """Return True when Pixiv_ID was already downloaded under ./pic/.

    Also creates the "pic" directory on first use.  The original
    checked/created "Pic" while the downloaders write into "./pic/";
    on a case-sensitive filesystem that meant every picture was
    re-downloaded and the write directory was never created, so both
    now agree on "pic".
    """
    if not os.path.isdir("pic"):
        os.mkdir("pic")
    # A manga download creates a per-ID subdirectory.
    if os.path.isdir("./pic/" + str(Pixiv_ID)):
        return True
    # A single picture is saved as <ID>_p0.<ext>.
    for Extension in (".png", ".jpg", ".gif"):
        if os.path.isfile("./pic/" + str(Pixiv_ID) + "_p0" + Extension):
            return True
    return False


# The script starts here. 
for i in range(0, 38849402): 
    Pixiv_ID = 38849402-i 
    Picture_Exist = check_exist(Pixiv_ID) 
    if not Picture_Exist: 
     Return_Code = get_pic(Pixiv_ID) 
     if Return_Code == 200: 
      print str(Pixiv_ID), "finish!" 
     elif Return_Code == -1: 
      print str(Pixiv_ID), "got an unknown error." 
     elif Return_Code == 404: 
      print str(Pixiv_ID), "not found. Maybe deleted." 
    else: 
     print str(Pixiv_ID), "picture exists!" 
+4

這是一個太大的混亂通過,你應該嘗試memory_profiler。乍看之下,它看起來像是一次讀取圖像。嘗試編寫一個[MCVE],如果可能的話,很難遵循所有的全局變量,非標準的命名等。 – pvg

+0

@pvg我已經評論了我的腳本的變量和邏輯。現在清楚了嗎? – Kon

+0

它並沒有太大的幫助,試着像'r = requests.get(url,stream = True)'這樣的請求流式傳輸。在'iter_content'中設置chunk_size爲None,因爲5很荒謬。 – pvg

回答

1

OMG!

最後,我知道出了什麼問題。

我使用mem_top()來查看佔用內存的東西。

猜猜是什麼?

它是for i in range(0, 38849402):

在內存中,有一個列表[0,1,2,3 ... 38849401],它佔用了我的內存。

我將其更改爲:

Pixiv_ID = 38849402 
while Pixiv_ID > 0: 

    some code here 

    Pixiv_ID = Pixiv_ID-1 

現在的內存使用量只是沒有超過20M以上。

感覺興奮!

+0

啊哈!優秀。這是切換到Python 3的另一個真正的好理由。但現在看看這個代碼,組合函數,實際的html解析器。你的痛苦並不是沒有用的。 – pvg

+0

或者使用'xrange' –

+0

@ColonelThirtyTwo Killjoy。 – pvg