2017-06-16 162 views
0

webscraping附加在數據幀的行我有數據幀7行1列,其中包含一個網站鏈接,我想從這些不同的鏈接中提取數據,並將其存儲在數據框架,但不能附加that.Also我檢查,如果爲一個鏈接,如果沒有記錄(這是我通過該鏈接的html屬性檢查)跳過該鏈接,並繼續下一個鏈接。 我也試圖爲多個頁面的鏈接提取數據。R:麻煩從r中

這是重複性的數據

text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=" 
text3="&proptype=" 
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment" 
text5="&cityName=Thane&BudgetMin=" 
text6="&BudgetMax=" 

bhk=c("1","2","3","4","5",">5") 
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores") 
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores") 
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max) 
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),] 
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3]) 
uuu_df1=data.frame(x=uuu[1:7,]) 
dput(uuu_df1) 

我對這個3解決方案,但沒有一個似乎是工作的罰款。

解決方案#1

urlList <- llply(uuu_df1[,1], function(url){  

    this_pg <- read_html(url) 

    results_count <- this_pg %>% 
    xml_find_first(".//span[@id='resultCount']") %>% 
    xml_text() %>% 
    as.integer() 

    if(results_count > 0){ 

    cards <- this_pg %>% 
     xml_find_all('//div[@class="SRCard"]') 

    df <- ldply(cards, .fun=function(x){ 
     y <- data.frame(wine = x %>% xml_find_first('.//span[@class="agentNameh"]') %>% xml_text(), 
         excerpt = x %>% xml_find_first('.//div[@class="postedOn"]') %>% xml_text(), 
         locality = x %>% xml_find_first('.//span[@class="localityFirst"]') %>% xml_text(), 
         society = x %>% xml_find_first('.//div[@class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .)) 
     return(y) 
    }) 

    } else { 
    df <- NULL 
    } 

    return(df) 
}, .progress = 'text') 
names(urlList) <- uuu_df1[,1] 

a=bind_rows(urlList) 

上面的代碼給我錯誤Error in if (results_count > 0) { : missing value where TRUE/FALSE needed

解決方案#2

urlList <- lapply(uuu_df1[,1], function(url){  

    UrlPage <- html(as.character(url)) 
    ImgNode <- UrlPage %>% html_node("div.noResultHead") 
    u <- paste("No", word(string = as(ImgNode, "character"), start=4, end=5), sep=" ") 

    cat(".")   
    pg <- read_html(url) 

    if(u!="No Results Found!") { 
    df <- data.frame(wine=html_text(html_nodes(pg, ".agentNameh")), 
        excerpt=html_text(html_nodes(pg, ".postedOn")), 
        locality=html_text(html_nodes(pg,".localityFirst")), 
        society=html_text(html_nodes(pg,'.labValu .stop-propagation:nth-child(1)')), 
        stringsAsFactors=FALSE) 
    } else { 
    # ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE) 
    df <- data.frame(wine=character(), excerpt=character(), locality=character(), society=character()) 
    } 
    # RETURN NAMED LIST 
    return(list(UrlPage=UrlPage, ImgNode=ImgNode, u=u, df=df))  
}) 

# ROW BIND ONLY DATAFRAME ELEMENT FROM LIST 
wines <- map_df(urlList, function(u) u$df) 

以上代碼給出了空數據框中

解決方案#3

uuu_df1=data.frame(x=uuu_df[1:7,]) 
wines=data.frame() 
url_test=c() 
UrlPage_test=c() 
u=c() 
ImgNode=c() 
pg=c() 

for(i in 1:dim(uuu_df1)[1]) { 

    url_test[i]=as.character(uuu_df1[i,]) 
    UrlPage_test[i] <- html(url_test[i]) 
    ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead") 
    u[i]=ImgNode[i] 
    u[i]=as(u[i],"character") 
    u[i]=paste("No",word(string = u, start = 4, end = 5),sep = " ") 

    if(u[i]=="No Results Found!") next 
    { 
    map_df(1:5, function(i) # here 1:5 is number of webpages of a website 
    { 

     # simple but effective progress indicator 
     cat(".") 

     pg[i] <- read_html(sprintf(url_test[i], i)) 

     data.frame(wine=html_text(html_nodes(pg[i], ".agentNameh")), 
       excerpt=html_text(html_nodes(pg[i], ".postedOn")), 
       locality=html_text(html_nodes(pg[i],".localityFirst")), 
       society=html_text(html_nodes(pg[i],'.labValu .stop-propagation:nth-child(1)')), 
       stringsAsFactors=FALSE) 

    }) -> wines 

    }} 

上面代碼中還給出了一個錯誤

Error in UseMethod("xml_find_first") : 
    no applicable method for 'xml_find_first' applied to an object of class "list" 
In addition: Warning messages: 
1: 'html' is deprecated. 
Use 'read_html' instead. 
See help("Deprecated") 
2: In UrlPage_test[i] <- html(url_test[i]) : 
    number of items to replace is not a multiple of replacement length 

上,這樣我的要求得到滿足的代碼可以糾正任何建議。在此先感謝

回答

1

解決方案#1

,當你做這樣的事情missing value where TRUE/FALSE needed印:

if (NA > 0) { 
    do something 
} 

所以更換你的,如果條件

if(results_count > 0) 

(!is.na(results_count) & (results_count > 0)) 
+0

優秀的@herbaman,一行救了我一天。它運作良好..謝謝噸..感謝努力! – deepesh

+0

如果您選中「僅用於第7條記錄」鏈接顯示它有94條記錄,但如果您只爲第7條記錄運行該代碼,則創建的數據框只包含30條記錄,而不是94條。爲什麼這樣呢? – deepesh