R中

2016-08-11 38 views
0

在 R 中高效、選擇性地合併列。我有以下數據:

# Keep only the three country-code columns of alljson.
countrycols <- alljson[
  ,
  c("country_gc_str", "country_ipapi_str", "country_tm_str")
]

# Preview the first rows.
head(countrycols)
country_gc_str country_ipapi_str country_tm_str 
1   <NA>    RU    RU 
2   <NA>    CN    CN 
3    US    US    US 
4   <NA>    CD    CG 
5   <NA>    DE    DE 
6   <NA>    <NA>    NG 

我想創建一個新列 country_final_str,按以下優先順序取第一個非缺失的值來填充:

country_gc_str 
country_ipapi_str 
country_tm_str 

我還使用以下代碼取得了各國的收入水平:

# Parse the World Bank country listing (XML) into a data frame.
wbURL <- "http://api.worldbank.org/countries?per_page=304"
xmlAPI <- xmlParse(wbURL)
xmlDF <- xmlToDataFrame(xmlAPI)

# Add character copies of the ISO2 code and income level columns.
xmlDF[["iso2CodeChar"]] <- as.character(xmlDF[["iso2Code"]])
xmlDF[["incomeLevelChar"]] <- as.character(xmlDF[["incomeLevel"]])

# Two-column lookup (country code, income level) as a data.table.
incomexml <- xmlDF[, c("iso2CodeChar", "incomeLevelChar")]
incomexmltable <- as.data.table(incomexml)

我寫了以下循環,但因爲我有一百萬條記錄,它要運行非常久:

# Build country_final_str as the first non-missing value, in priority order:
# country_gc_str, then country_ipapi_str, then country_tm_str.
# Vectorized: each fallback column fills all still-missing rows in one step,
# instead of testing and assigning one row at a time.
alljson$country_final_str <- alljson$country_gc_str
for (fallback_col in c("country_ipapi_str", "country_tm_str")) {
  missing_rows <- is.na(alljson$country_final_str)
  alljson$country_final_str[missing_rows] <-
    alljson[[fallback_col]][missing_rows]
}

# Look up the income level for every row with a single match() call.
# - Codes absent from the lookup table yield NA (same as the old loop).
# - incomparables = NA keeps a missing country code from ever matching.
# - match() returns the first hit, so duplicate codes in the lookup table
#   cannot produce a multi-element replacement.
alljson$income_level <- incomexmltable$incomeLevelChar[
  match(
    alljson$country_final_str,
    incomexmltable$iso2CodeChar,
    incomparables = NA
  )
]

有沒有提高效率或擺脫 for 循環的想法?我想不出用 apply/lapply/tapply 實現的方法;而且我在 Windows 上,嘗試用 doParallel 和 doSNOW 並行化我的代碼也都失敗了。

關於合併列的問題,請參閱下方 @thelatemail 的正確答案。對於國家收入水平,我執行了以下代碼:

# Distinct country codes present in the data.
allcountries <- unique(alljson$country_final_str)

# Sanity check: how many of the distinct codes countrycode() can translate
# from "iso2c" to "country.name" (non-NA results).
sum(!is.na(countrycode(allcountries, "iso2c", "country.name")))

# Attach the income level to every row with one vectorized lookup instead of
# rescanning the whole column once per country.
# - Codes absent from the lookup table yield NA.
# - incomparables = NA keeps a missing country code from ever matching.
# - match() returns the first hit, so duplicate codes in the lookup table
#   cannot produce a multi-element replacement.
alljson$country_income_str <- incomexmltable$incomeLevelChar[
  match(
    alljson$country_final_str,
    incomexmltable$iso2CodeChar,
    incomparables = NA
  )
]
+1

您有沒有看過 'WDI' 包?它很好用。 – shayaa

回答

3

下面是一個利用矩陣索引的嘗試:先在三個變量中找出第一個非缺失值所在的列,再據此取值:

# Score each cell by its negated column number; missing cells get -Inf so
# they can never be the row maximum.
colscore <- replace(-col(countrycols), is.na(countrycols), -Inf)

# For every row, the column holding the highest score.
colindex <- max.col(colscore)

# Pull one (row, column) cell per row via a two-column index matrix.
countrycols[cbind(seq_len(nrow(countrycols)), colindex)]
#[1] "RU" "CN" "US" "CD" "DE" "NG" 

爲了解釋其中的邏輯,下面逐行拆解:

# Step 1: negated column numbers, so earlier columns get larger values.
-col(countrycols) 
#  [,1] [,2] [,3] 
#[1,] -1 -2 -3 
#[2,] -1 -2 -3 
#[3,] -1 -2 -3 
#[4,] -1 -2 -3 
#[5,] -1 -2 -3 
#[6,] -1 -2 -3 

# Step 2: cells that are NA in countrycols are replaced by -Inf.
replace(-col(countrycols), is.na(countrycols), -Inf) 
#  [,1] [,2] [,3] 
#[1,] -Inf -2 -3 
#[2,] -Inf -2 -3 
#[3,] -1 -2 -3 
#[4,] -Inf -2 -3 
#[5,] -Inf -2 -3 
#[6,] -Inf -Inf -3 

# Step 3: per row, the column position of the maximum, i.e. the first
# non-missing column. The outer parentheses print the assigned value.
(colindex <- max.col(replace(-col(countrycols), is.na(countrycols), -Inf))) 
#[1] 2 2 1 2 2 3 

# Step 4: pair every row number with its selected column number.
cbind(rowindex=seq_len(nrow(countrycols)), colindex) 
#  rowindex colindex 
#[1,]  1  2 
#[2,]  2  2 
#[3,]  3  1 
#[4,]  4  2 
#[5,]  5  2 
#[6,]  6  3 

該最終矩陣用於從原始數據中提取每個行/列組合對應的值。

其中countrycols是:

# Example data: three character columns of country codes, six rows,
# with character row names "1" to "6".
data.frame(
  country_gc_str = c(NA, NA, "US", NA, NA, NA),
  country_ipapi_str = c("RU", "CN", "US", "CD", "DE", NA),
  country_tm_str = c("RU", "CN", "US", "CG", "DE", "NG"),
  row.names = c("1", "2", "3", "4", "5", "6"),
  stringsAsFactors = FALSE
)