Apply vs. for loop in R

I wrote the code below to scrape tender information every day from a government e-procurement portal.

# Load all required packages in one call 
packages <- c('rvest', 'stringi', 'tidyverse', 'lubridate', 'dplyr') 
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE) 
start_time <- proc.time() 

Scrape the home page and get the total number of records.

data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders') 
# Parse the tender table on the first page into a data frame 
total_tenders_raw <- html_nodes(data, xpath = '//*[(@id = "table")]') 
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
# Collect all hrefs and keep only those pointing to the full tender view 
links <- html_nodes(data, xpath = '//*[(@id = "table")] | //td | //a') 
links_fair <- html_attr(links, 'href') 
links_fair <- links_fair[grep("tendersfullview", links_fair)] 
All_tenders <- cbind(All_tenders, links_fair) 

Read the total record count:

Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div') 
# Strip the "Total Tenders : " prefix to obtain the numeric count 
Count_of_Recs <- as.numeric(gsub("Total Tenders : ", "", html_text(Count_of_Recs_raw[1]))) 

Functions for cleaning and processing data fields such as dates and factors.

process_dates <- function(data){ 
    # Parse all date fields with lubridate's dmy_hm() 
    cols2date <- c('Bid.Submission.Closing.Date','epublished_date','document_download_start_date','bid_submission_start_date','bid_opening_date','document_download_end_date','bid_submission_end_date') 
    date_processed_data <- data 
    date_processed_data[cols2date] <- lapply(data[cols2date], dmy_hm) 
    return(date_processed_data) 
} 

clean_process_data <- function(data){ 
    # Convert the categorical fields to factors 
    cols2factor <- c('State.Name','product_category','pre_qualification','organisation_name','organisation_type','tender_type') 
    clean_processed_data <- data 
    clean_processed_data[cols2factor] <- lapply(data[cols2factor], factor) 
    #clean_processed_data <- process_dates(clean_processed_data) 
    return(clean_processed_data) 
} 

The code below is exactly where my problem lies ...

The table scraping starts here. Page one has already been scraped above to get the structure of the data frame.

for (page_no in 2:round(Count_of_Recs/10)){ 
    closeAllConnections() 
    on.exit(closeAllConnections()) 
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page=' 
    url <- paste(url_bit1, page_no, sep="") 
    cat(page_no,"\t",proc.time() - start_time,"\n") 
    data <- read_html(url) 
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]') 
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a') 
    links_fair <- html_attr(links,'href') 
    links_fair <- links_fair[grep("tendersfullview",links_fair)] 
    Page_tenders <- cbind(Page_tenders,links_fair) 
    All_tenders <- rbind(All_tenders,Page_tenders) 
} 

This for loop usually takes many hours to complete. I am looking to use the apply family to good effect, in order to save time. This program has the further responsibility of fetching and processing all records and then scraping an entirely new page for every single record (code not listed here) ...

I have tried the code below, but it doesn't give me what I want:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page=' 
read_page <- function(datain){ 
    closeAllConnections() 
    on.exit(closeAllConnections()) 
    url <- paste(url_bit1, datain$S.No., sep="") 
    cat(S.No.,"\t",proc.time() - start_time,"\n") 
    data <- read_html(url) 
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]') 
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a') 
    links_fair <- html_attr(links,'href') 
    links_fair <- links_fair[grep("tendersfullview",links_fair)] 
    Page_tenders <- cbind(Page_tenders,links_fair) 
    All_tenders <- rbind(All_tenders,Page_tenders) 
} 

All_tenders <- sapply(All_tenders, FUN=read_page(All_tenders$S.No.)) 

Any suggestions, guidance, recommendations, opinions or help are welcome. I have been using R for only 3-4 months. I am also aware of Python's advantages over R in this area, but I am inclined to stick with R to solve this problem.

Answers

Your sapply call is incorrect. I made some edits to your code and tested it on a sample size of N = 50. We can use system.time() to see how much time is needed to finish the task.

The 'for' approach:

system.time(
    for (page_no in 1:50){ 
    closeAllConnections() 
    on.exit(closeAllConnections()) 
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page=' 
    url <- paste(url_bit1, page_no, sep="") 
    cat(page_no,"\t",proc.time() - start_time,"\n") 
    data <- read_html(url) 
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]') 
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a') 
    links_fair <- html_attr(links,'href') 
    links_fair <- links_fair[grep("tendersfullview",links_fair)] 
    Page_tenders <- cbind(Page_tenders,links_fair) 
    All_tenders <- rbind(All_tenders,Page_tenders) 
    } 
) 

# user system elapsed 
# 50.15 81.26 132.73 

The 'lapply' approach:

All_tenders = NULL 
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page=' 
read_page <- function(datain){ 
    closeAllConnections() 
    on.exit(closeAllConnections()) 
    url <- paste(url_bit1, datain, sep="") 
    cat(datain,"\t",proc.time() - start_time,"\n") 
    data <- read_html(url) 
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]') 
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a') 
    links_fair <- html_attr(links,'href') 
    links_fair <- links_fair[grep("tendersfullview",links_fair)] 
    Page_tenders <- cbind(Page_tenders,links_fair) 
    All_tenders <- rbind(All_tenders,Page_tenders) 
} 

system.time(
    All_tenders <- lapply(1:50, function(x) read_page(x)) 
) 
# user system elapsed 
# 49.84 78.97 131.16 

If we want to put our results in a data frame, we then convert the All_tenders list to a data frame as follows:

All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors = FALSE)) 
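
For readers unfamiliar with this idiom (one of the comments below asks about it): do.call(rbind, x) calls rbind() once, with every element of the list x as its arguments, stacking the per-page data frames into one. A minimal toy sketch, with a hypothetical pages list purely for illustration:

pages <- list(data.frame(id = 1:2), data.frame(id = 3:4)) 
# Equivalent to rbind(pages[[1]], pages[[2]]), but works for a list of any length 
combined <- do.call(rbind, pages) 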

It turns out that lapply is slightly faster.

I think that modifying 'All_tenders' inside your function is what makes it really slow ...

Well, you have to live with it. Web scraping is not fast; otherwise you might get banned by the server administrators. Another (significantly faster) way is to use TOR from Python and send multiple requests from different IPs, but that is a different story.

That was silly of me ... as I mentioned, I am just new to R ... Could you please elaborate on the following parts: system.time(All_tenders <- lapply(1:50, function(x) read_page(x))) and All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors = FALSE))

for loops and sapply work on different principles:

- a for loop does things iteratively: it computes on the first element, then on the second, and so on; 
- sapply works on the elements of a list independently (and in arbitrary order), so the results are constructed independently of one another.

So in your for loop, when you do

All_tenders <- rbind(All_tenders,Page_tenders) 

the All_tenders variable grows iteratively. In your sapply function, however, this does not work (because it does not know the results of the other elements).
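
A toy sketch of the difference (the acc and grow names are hypothetical, unrelated to the scraping code): growing a variable inside an applied function binds a local copy on every call and leaves the outer variable untouched.

acc <- NULL 
grow <- function(x) { 
    acc <- rbind(acc, data.frame(x = x))  # creates a *local* acc; the outer acc is untouched 
    acc 
} 
invisible(sapply(1:3, grow)) 
acc  # still NULL: each call only saw its own local copy 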

So you should do something like this:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page=' 
read_page <- function(datain){ 
    closeAllConnections() 
    on.exit(closeAllConnections()) 
    url <- paste(url_bit1, datain, sep="") 
    cat(S.No.,"\t",proc.time() - start_time,"\n") 
    data <- read_html(url) 
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]') 
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE)) 
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a') 
    links_fair <- html_attr(links,'href') 
    links_fair <- links_fair[grep("tendersfullview",links_fair)] 
    Page_tenders <- cbind(Page_tenders,links_fair) 
    return(Page_tenders) 
} 

to return the result for each page, and then apply it in the following way:

All_tenders_tmp <- sapply(2:round(Count_of_Recs/10), FUN=read_page) 

Your result will then be a list of all the individual results; you can merge it, for example, with data.table::rbindlist.
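
A minimal sketch of that last step, assuming the data.table package is installed and that the cat() call inside read_page is first corrected to print datain (see the comments below); lapply() is used instead of sapply() so that the result is guaranteed to be a plain list of data frames:

library(data.table) 
pages_list <- lapply(2:round(Count_of_Recs/10), read_page) 
All_tenders <- rbindlist(pages_list)  # stacks the per-page tables into one data.table 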

I hope this is clear.

By the way, this function throws an error because the 'page_no' and 'S.No.' variables are not defined.

Both variables are columns from the datain data frame.

This one still doesn't work for me. For that reason, I am inclined to mark the earlier answer as correct, as of now ...