Apply vs for loop in R: I have written the code below to scrape tender information from a portal on a daily basis.
packages <- c('rvest', 'stringi', 'tidyverse', 'lubridate', 'dplyr')
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)
start_time <- proc.time()
Scrape the home page and get the total number of records.
data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders')
total_tenders_raw <- html_nodes(data, xpath = '//*[(@id = "table")]')
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath = '//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links, 'href')
links_fair <- links_fair[grep("tendersfullview", links_fair)]
All_tenders <- cbind(All_tenders, links_fair)
Get the total number of records to be read.
Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div')
Count_of_Recs <- as.numeric(gsub("Total Tenders : ","",html_text(Count_of_Recs_raw[1])))
Functions for cleaning and processing data fields such as dates and factors.
process_dates <- function(data){
  cols2date <- c('Bid.Submission.Closing.Date', 'epublished_date', 'document_download_start_date', 'bid_submission_start_date', 'bid_opening_date', 'document_download_end_date', 'bid_submission_end_date')
  date_processed_data <- data
  date_processed_data[cols2date] <- lapply(data[cols2date], dmy_hm)
  return(date_processed_data)
}
clean_process_data <- function(data){
  cols2factor <- c('State.Name', 'product_category', 'pre_qualification', 'organisation_name', 'organisation_type', 'tender_type')
  clean_processed_data <- data
  clean_processed_data[cols2factor] <- lapply(data[cols2factor], factor)
  #clean_processed_data <- process_dates(clean_processed_data)
  return(clean_processed_data)
}
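For context, these helpers are meant to run on the assembled table once scraping is complete. A minimal usage sketch (assuming All_tenders ends up with the columns named in cols2factor, and in cols2date if the date step is enabled):

# Hypothetical usage after scraping finishes; assumes the named columns exist.
All_tenders <- clean_process_data(All_tenders)
# All_tenders <- process_dates(All_tenders)  # once the date columns are confirmed
str(All_tenders)  # verify the factor (and date) conversions took effect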
The code below is where my problem actually lies...
The table scraping starts here. The first page has already been scraped above to obtain the structure of the data frame.
for (page_no in 2:round(Count_of_Recs/10)){
  closeAllConnections()
  on.exit(closeAllConnections())
  url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
  url <- paste(url_bit1, page_no, sep = "")
  cat(page_no, "\t", proc.time() - start_time, "\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data, xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath = '//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links, 'href')
  links_fair <- links_fair[grep("tendersfullview", links_fair)]
  Page_tenders <- cbind(Page_tenders, links_fair)
  All_tenders <- rbind(All_tenders, Page_tenders)
}
This for loop usually takes hours to complete. I am looking to use the apply family of functions to good effect in order to save time. This program also has the further responsibility of fetching and processing all the records and then, for each individual record, scraping an entirely new page every time (code not listed here)....
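In outline, the pattern I am after is: factor the loop body into a function that takes a page number and returns that page's data frame, then lapply over the page numbers and bind everything once at the end. A rough sketch of that shape only (scrape_page here is a hypothetical wrapper around the loop body above, not code I have written):

# Sketch of the intended shape; scrape_page is a hypothetical function
# wrapping the loop body above and returning one page's data frame.
pages <- lapply(2:round(Count_of_Recs/10), scrape_page)
All_tenders <- do.call(rbind, c(list(All_tenders), pages))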
I have tried the code below, but it doesn't give me what I want:
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections())
  url <- paste(url_bit1, datain$S.No., sep = "")
  cat(S.No., "\t", proc.time() - start_time, "\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data, xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath = '//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links, 'href')
  links_fair <- links_fair[grep("tendersfullview", links_fair)]
  Page_tenders <- cbind(Page_tenders, links_fair)
  All_tenders <- rbind(All_tenders, Page_tenders)
}
All_tenders <- sapply(All_tenders, FUN = read_page(All_tenders$S.No.))
Any suggestions, guidance, recommendations, comments or help are welcome. I have been using R for only 3-4 months. I am also aware of Python's advantages over R in this area, but I am inclined to use R to solve this problem.
I think mutating 'All_tenders' inside your function is what makes it really slow... –
Well, you'll have to live with it. Web scraping is not fast; otherwise you risk being banned by the server admins. An alternative (noticeably faster) approach is to use TOR via Python and issue multiple requests from different IPs, but that's another story. –
Silly me... as I mentioned, I am just new to R... Could you please elaborate on the following part: system.time(All_tenders <- lapply(1:50, function(x) read_page(x))); All_tenders <- do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors = FALSE)) –
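To unpack the snippet in that last comment: the idea is for read_page to take the page number directly and return that page's data frame, instead of rbind-ing into All_tenders inside the function (which both reads a global object and repeatedly grows it, the slowness noted above). lapply then yields one data frame per page, and a single do.call(rbind, ...) binds them at the end. A minimal sketch along those lines, assuming the same URL pattern and page structure as in the question (not tested against the live site):

# Takes the page number directly and returns that page's rows.
read_page <- function(page_no){
  url <- paste('https://eprocure.gov.in/mmp/latestactivetenders/page=', page_no, sep = "")
  cat(page_no, "\t", proc.time() - start_time, "\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data, xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath = '//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links, 'href')
  links_fair <- links_fair[grep("tendersfullview", links_fair)]
  # Return the page's data frame; no assignment to a global object.
  cbind(Page_tenders, links_fair)
}

# One data frame per page, bound together exactly once at the end.
pages <- lapply(2:round(Count_of_Recs/10), read_page)
All_tenders <- do.call(rbind, c(list(All_tenders), pages))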