我有兩個數據集: competitor_data - 包含給定產品的競爭對手,以及價格和日期收集競爭對手的價格時。替代嵌套For循環中的R
PRODUCT_PRICE - 每次價格變動的日期。
competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
"2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)
competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
"2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)
product_price$date = as.Date(product_price$date)
目的
- 對於PRODUCT_PRICE給定的產品,每個記錄(日期),找到 從competitor_data相關crawl_date價格。
- 比較PRODUCT_PRICE $價格到最低competitor_data $ competitor_price。
- 如果PRODUCT_PRICE $價格< = competitor_data $ competitor_price,然後創建一個新列的標誌1(price_leader)else標誌0(price_leader)
我下面使用嵌套的for循環腳本,但它需要超過24小時process 5000 unique product_id:
unique_skus <- unique(product_price$productId)
all_competitive_data <- data.frame()
mid_step_data <- data.frame()
start_time <-Sys.time()
for (i in 1:length(unique_skus)){
step1 <- subset(product_price, productId == unique_skus[i])
transact_dates = unique(step1$date)
for (a in 1:length(transact_dates)){
step2 <- subset(step1, date ==transact_dates[a])
step3 <- inner_join(step2,competitor_data, by='productId')
if (nrow(subset(step3, date > crawl_date)) == 0){
step3 <- step3[ order(step3$crawl_date , decreasing = FALSE),]
competitor_price <- head(step3,1)$competitor_price
step2$competitor_price = competitor_price
}
else {
step4 <- subset(step3, date > crawl_date)
step4 <- step4[ order(step4$crawl_date , decreasing = TRUE),]
competitor_price <- head(step4,1)$competitor_price
step2$competitor_price = competitor_price
}
step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0)
mid_step_data = rbind(mid_step_data,step2)
}
all_competitive_data <- rbind(all_competitive_data,mid_step_data)
}
Sys.time()-start_time
all_competitive_data = unique(all_competitive_data)
有沒有一種方法可以使用dplyr快速完成此操作?
爲什麼不通過產品編號和日期合併這兩個數據集,然後比較這兩個價格列 – rawr
因爲crawl_date不一定映射到最新。請查看我的代碼中是否有聲明。 – BlackHat
等你選擇下一個最接近的日期的價格,所以合併後使用最後一個觀察結轉函數來填寫新手 – rawr