2017-02-12 122 views
1

問題

我正在嘗試抓取圖像並自動翻頁。我利用頁面上的一個 span 來判斷進度,它顯示當前項目範圍與項目總數,例如:1 - 20 of 83,829 results。我想在這個 while 循環中運行 Nightmare,但程序會掛起,並報出 JavaScript heap out of memory 錯誤。有沒有辦法讓每次迭代都真正執行,而不是把任務不斷壓入棧中(我覺得現在就是這種情況)?如何在 while 循環中執行 NightmareJS?

待解決的代碼

/**
 * Walks the paginated result list: for every remaining page it clicks the
 * "next" control, reads the page markup, hands it to scrapeImages() and then
 * advances the pagination tracker.
 *
 * NOTE(review): `paginate.next` is only changed by `paginate.update()`, which
 * is called from the `.then()` callback. Assuming nightmare's `.then()` is
 * asynchronous (promise-like), that callback cannot run while this synchronous
 * `while` loop is still executing, so when `paginate.next` is true on entry
 * the loop keeps queueing new chains and never reaches `nightmare.end()`.
 *
 * @param {Paginate} paginate - tracker exposing `next`, `update()` and `state()`.
 * @returns {*} whatever `nightmare.end()` returns.
 */
function scrapeEach(paginate) { 
// keep going for as long as the tracker reports another page
while (paginate.next) { 
    nightmare 
    .wait(4000) 
    // move to the following results page
    .realClick('#pageNext .results-next') 
    .wait(4000) 
    // hand back the markup of the whole document body
    .evaluate(() => { 
     return document.body.innerHTML 
    }) 
    .then(result => { 
     scrapeImages(result); 
     // advance the tracker and print its state for debugging
     paginate.update(); 
     paginate.state(); 
    }) 
    .catch(error => { 
     // failures are logged only; the tracker is not advanced in this case
     console.error('Search failed:', error); 
    }); 
    } 
return nightmare.end() 
} 

以下是與 scrapeEach() 配合使用的其他代碼。我創建了這個分頁對象來跟蹤頁面狀態,如下所示:

/**
 * Tracks pagination state parsed from a results summary such as
 * "1 - 20 of 83,829 results".
 *
 * The summary is split on single spaces: token 0 is used as the current page,
 * token 2 as the page size and token 4 (thousands separators removed) as the
 * total number of items.
 *
 * NOTE(review): tokens 0 and 2 look like the first/last item index of the
 * page, so reading them as "page 1" and "items per page" presumably only
 * holds for a summary taken from the first results page - confirm callers
 * always start there.
 *
 * @param {string} url - URL of the results page the summary was read from.
 * @param {string} pgd - the summary text.
 */
function Paginate(url, pgd) {

    this.url = url;
    this.array = pgd.split(" ");

    this.currentPage = Number(this.array[0]);
    // Drop thousands separators ("83,829" -> "83829") before converting;
    // a missing token yields 0, as before.
    this.totalItems = Number(String(this.array[4] || '').replace(/,/g, ''));
    this.itemsPerPage = Number(this.array[2]);
    // Round up so that a trailing, partially filled page is counted too.
    this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
    // There is only something to advance to when the current page is not
    // already the last one (e.g. a single-page result has no next page).
    this.next = this.currentPage < this.totalPages;

    /** Advances to the following page and clears `next` on the last page. */
    this.update = () => {
        this.currentPage += 1;
        if (this.currentPage >= this.totalPages) {
            this.next = false;
        }
    };

    /** Prints the current pagination state to the console (debug aid). */
    this.state = () => {
        console.log("-------- Pagination ----------");
        console.log("current page: " + this.currentPage);
        console.log("total pages: " + this.totalPages);
        console.log("total items: " + this.totalItems);
        console.log("items per page: " + this.itemsPerPage);
        console.log("has next page: " + this.next);
        console.log("------------------------------\n");
    };
}

這個函數抓取單個頁面上的圖像

/**
 * Extracts one record per '#returns > li' entry from the given page markup
 * and hands the resulting list to Artwork.addArt().
 *
 * NOTE(review): the `[email protected]` fragments in the img/title selectors
 * look like mangled `selector@attribute` expressions - confirm against the
 * original source before relying on them.
 *
 * @param {string} html - markup of a single results page.
 */
function scrapeImages(html) {
    // Field name -> selector, evaluated relative to each list item.
    const artworkFields = {
        img: 'dl.return-art > dd > a > [email protected]',
        title: 'dl.return-art > dt > [email protected]',
        created: 'dl.return-art > .created',
        medium: 'dl.return-art > .medium',
        dimensions: 'dl.return-art > .dimensions',
        credit: 'dl.return-art > .credit',
        accession: 'dl.return-art > .accession'
    };

    const runExtraction = xr(html, '#returns > li', [artworkFields]);

    runExtraction((failure, records) => {
        // Extraction errors are rethrown from inside the callback.
        if (failure) {
            throw failure;
        }
        Artwork.addArt(records);
    });
}

此函數啓動整個過程

/**
 * The "on view" endpoint: opens the configured listing page, reads the
 * current URL and the results summary from it, and starts scraping.
 *
 * Navigation and scraping are queued on one chain, and a rejection anywhere
 * in it is logged instead of being left as an unhandled promise rejection.
 *
 * @returns {Promise<*>} settles once the chain has finished; resolves with
 *   the value produced by scrapeEach(), or with undefined after a logged failure.
 */
function onView() {
    return nightmare
        .goto(config.NGA.online)
        .wait(3000)
        .evaluate(() => {
            return [document.location.href, document.querySelector('span.results-span').innerHTML]
        })
        .then(([url, pgd]) => scrapeEach(new Paginate(url, pgd)))
        .catch(error => {
            // Same reporting style as the page loop uses for its own failures.
            console.error('Search failed:', error);
        });
}

錯誤消息

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory 
1: node::Abort() [/usr/local/bin/node] 
2: node::FatalException(v8::Isolate*, v8::Local<v8::Value>, v8::Local<v8::Message>) [/usr/local/bin/node] 
3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [/usr/local/bin/node] 
4: v8::internal::Factory::NewRawOneByteString(int, v8::internal::PretenureFlag) [/usr/local/bin/node] 
5: v8::internal::Factory::NumberToString(v8::internal::Handle<v8::internal::Object>, bool) [/usr/local/bin/node] 
6: v8::internal::Runtime_NumberToStringSkipCache(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node] 
7: 0x18caa0a079a7 
8: 0x18caa0f37cbc 
Abort trap: 6 

回答

0

我終於弄清楚了整個過程。問題出在異步上:while 循環會同步地一直運行下去,Nightmare 實例排隊的異步任務因此永遠得不到執行。我把原先「自動點擊進入下一頁」的方案改成了根據網址中的 &pageNumber= 參數計算下一頁的地址,並使用針對該網站的自定義分頁對象來傳遞頁碼、網址以及頁面上項目的相關數據。我還添加了一些調試信息用於輸出。

/**
 * Tracks pagination state parsed from a results summary such as
 * "1 - 20 of 83,829 results" and keeps `url` pointing at the current page.
 *
 * The summary is split on single spaces: token 0 is used as the current page,
 * token 2 as the page size and token 4 (thousands separators removed) as the
 * total number of items.
 *
 * NOTE(review): tokens 0 and 2 look like the first/last item index of the
 * page, so these readings presumably only hold for a summary taken from the
 * first results page - confirm callers always start there.
 *
 * @param {string} url - URL of the results page the summary was read from.
 * @param {string} pgd - the summary text.
 */
function Paginate(url, pgd) {

    this.url = url;
    this.array = pgd.split(" ");

    this.currentPage = Number(this.array[0]);
    // Drop thousands separators ("83,829" -> "83829") before converting;
    // a missing token yields 0, as before.
    this.totalItems = Number(String(this.array[4] || '').replace(/,/g, ''));
    this.itemsPerPage = Number(this.array[2]);
    // Round up so that a trailing, partially filled page is counted too.
    this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
    this.next = true;

    /**
     * Advances to the following page, clears `next` once the last page is
     * reached, and rebuilds `url` from the original URL with the new page number.
     */
    this.update = () => {
        this.currentPage += 1;
        if (this.currentPage >= this.totalPages) {
            this.next = false;
        }

        // Separate the query string from the rest so the base URL can never
        // be filtered away together with a leading page parameter.
        const queryStart = url.indexOf('?');
        const base = queryStart === -1 ? url : url.slice(0, queryStart);
        const query = queryStart === -1 ? '' : url.slice(queryStart + 1);

        // Keep every parameter except the previous page number.
        const kept = query
            .split('&')
            .filter(segment => segment !== '' && !segment.includes('Number='));
        kept.push('pageNumber=' + this.currentPage);

        // Re-join with '&' so neighbouring parameters are not fused together.
        this.url = base + '?' + kept.join('&');
    };

    /** Prints the current pagination state to the console (debug aid). */
    this.state = () => {
        console.log("-------- Pagination ----------");
        console.log("current page: " + this.currentPage);
        console.log("total pages: " + this.totalPages);
        console.log("total items: " + this.totalItems);
        console.log("items per page: " + this.itemsPerPage);
        console.log("has next page: " + this.next);
        console.log("current url: " + this.url);
        console.log("------------------------------\n");
    };

}

我使用 async 庫的 whilst 來代替同步的 while 循環,並在每次迭代中執行 Nightmare。

/**
 * Scrapes result pages one after another: each iteration loads
 * `paginate.url`, passes the page markup to scrapeImages(), advances the
 * tracker, and only then lets the loop continue.
 *
 * A failed page now ends the loop through the completion callback instead of
 * leaving it waiting forever for a `next()` call that never comes.
 *
 * NOTE(review): `hasNext` is a synchronous test function, which is what
 * async v2's `whilst` expects; async v3 passes a callback to the test - confirm
 * which version the project uses.
 *
 * @param {Paginate} paginate - tracker exposing `next`, `currentPage`, `url`,
 *   `update()` and `state()`.
 * @param {number} [maxPages=10] - iteration stops once `paginate.currentPage`
 *   reaches this value (previously a hard-coded 10).
 * @returns {*} the shared `nightmare` instance.
 */
function scrapeEach(paginate, maxPages = 10) {
    const hasNext = () => paginate.next && paginate.currentPage < maxPages;

    async.whilst(hasNext, next => {
        nightmare
            .goto(paginate.url)
            .wait(4000)
            .evaluate(() => {
                return document.body.innerHTML
            })
            .then(result => {
                scrapeImages(result);
                paginate.update();
                paginate.state();
            })
            // Signal the loop exactly once per iteration: continue on
            // success, stop with the error on failure.
            .then(() => next(), error => next(error))
            // Anything thrown while continuing the loop is still reported.
            .catch(error => {
                console.error('Search failed:', error);
            });
    }, err => {
        if (err) {
            console.error('Search failed:', err);
            return;
        }
        console.log("finished!");
    });
    return nightmare;
}
0

限制並行運行

var limit = 10;   // upper bound on page visits in flight; may be raised
    var running = 0;  // page visits queued but not yet finished

    /**
     * Queues page visits while the tracker reports a next page and fewer than
     * `limit` visits are outstanding, then ends the nightmare session.
     *
     * NOTE(review): nothing inside the loop body changes `paginate.next`
     * synchronously, so a single call appears to queue at most
     * `limit - running` visits before `nightmare.end()` is reached.
     *
     * @param {Paginate} paginate - tracker exposing `next`, `update()` and `state()`.
     * @returns {*} whatever `nightmare.end()` returns.
     */
    function scrapeEach(paginate) {
        // Frees one slot of the in-flight budget.
        const release = () => {
            running--;
        };

        // Runs once scrapeImages() reports that a page has been stored.
        const afterScrape = () => {
            paginate.update();
            paginate.state();
            release();
        };

        // Click through to the following page and scrape its markup.
        const queuePage = () => {
            nightmare
                .wait(4000)
                .realClick('#pageNext .results-next')
                .wait(4000)
                .evaluate(() => document.body.innerHTML)
                .then(html => {
                    scrapeImages(html, afterScrape);
                })
                .catch(error => {
                    console.error('Search failed:', error);
                    release();
                });
        };

        for (; paginate.next && running < limit;) {
            running++;
            queuePage();
        }

        return nightmare.end();
    }


    /**
     * Extracts one record per '#returns > li' entry from the given page
     * markup, hands the list to Artwork.addArt() and then invokes `cb`.
     *
     * NOTE(review): the `[email protected]` fragments in the img/title
     * selectors look like mangled `selector@attribute` expressions - confirm
     * against the original source before relying on them.
     *
     * @param {string} html - markup of a single results page.
     * @param {Function} cb - called without arguments after the records were
     *   handed over; not called when extraction fails.
     */
    function scrapeImages(html, cb) {
        // Receives the extraction outcome; failures are rethrown as-is.
        function onExtracted(err, res) {
            if (err) {
                throw err;
            }
            Artwork.addArt(res);
            cb();
        }

        // One schema object, applied to every matched list item.
        const itemSchema = [
            {
                img: 'dl.return-art > dd > a > [email protected]',
                title: 'dl.return-art > dt > [email protected]',
                created: 'dl.return-art > .created',
                medium: 'dl.return-art > .medium',
                dimensions: 'dl.return-art > .dimensions',
                credit: 'dl.return-art > .credit',
                accession: 'dl.return-art > .accession'
            }
        ];

        xr(html, '#returns > li', itemSchema)(onExtracted);
    }
+0

我不確定該如何使用這段代碼:我需要在 while 循環中滿足一個條件——如果還有下一頁,就自動點擊「下一頁」。 –

+0

哪個條件? –

+0

條件是:如果還有下一頁,頁面上就會有「下一頁」按鈕 –