2016-06-07 180 views
0

我需要比較2個名稱以查看它們中的一個是否是另一個的暱稱。數據框中有兩列名稱。在R中避免使用for循環(使用臨時變量)

Names <- data.frame(In_Name = c("Gary",'John','James','William','Bill','Paul','Tom','Annie','Bella','Sue'), 
       Match_Name = c('Garry','Jon','Jimmy','Paul','William','Pablo','Thomas','Anne','Belle','Susan'),stringsAsFactors = F) 

Names[] <- lapply(Names, toupper) 
Names$Match <- 0 

我也有一個暱稱表,其中包含一對暱稱。在全套的名稱可能會出現在對多行(如下面的「貝拉」行的情況下)

NickName_Table <- data.frame(Names = c('Garrett,Garret,Gary,Garry' 
              ,'Ian,John,Johnie,Johnnie,Johnny,Jon' 
              ,'Jae,James,Jamey,Jay,Jaymes,Jem,Jemmy,Jim,Jimi,Jimmie,Jimmy' 
              ,'Bill,Billie,Billy,Wil,Will,William,Willie,Willy' 
              ,'Paul,Pauly,Paulie' 
              ,'Maas,Thom,Thomas,Tom,Tomas,Tommie,Tommy' 
              ,'Ann,Anna,Anne,Annette,Annie,Nan,Nancy,Nanette,Nannie,Nanny' 
              ,'Bella,Belle,Ibbie,Issy,Izzy,Sabella' 
              ,'Isabella,Isabelle,Bella,Belle' 
              ,'Sue,Sukie,Susan,Susann,Susanna,Suzie')) 
    NickName_Table[] <- lapply(NickName_Table, toupper) 

我想避免使用for循環但是我無法工作,如何做一個函數調用,因爲我需要將找到的行存儲在一個臨時變量中,以便在同一行中搜索第二個名稱。我需要爲超過一百萬對名稱執行此操作,for循環太慢。我現在的循環是:

library(sqldf) 
i=1 
for (i in 1:nrow(Names)) 
{ 

    first_name <- Names[i,1] 
    match_name <- Names[i,2] 

    if(!is.na(first_name) & !is.na(match_name) & first_name != match_name) 
    { 
    if (nrow(subset(NickName_Table,grepl(first_name,NickName_Table$Names)))>= 1) 
    { 
     possibleMatch <- subset(NickName_Table,grepl(first_name,NickName_Table$Names)) 
     temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",Reduce(paste,unlist(possibleMatch))),","), stringsAsFactors = F)) 
     colnames(temp1) <- "Names" 
     temp2 <- data.frame(match_name, stringsAsFactors = F) 
     colnames(temp2) <- "Names_1" 

     if(nrow(sqldf("Select a.* from temp1 a left join temp2 b on a.Names=b.Names_1 where b.Names_1 is not NULL"))>= 1) 
     { 
     Names[i,3] <- 1 
     } 
     else 
     Names[i,3] <- 0 
    } 
    else 
     Names[i,3] <- 0 
    } 
    else 
    Names[i,3] <- 0 
} 

編輯: 我試圖創建一個功能然而問題是,暱稱表和字符串的長度進行比較是不平等的,所以矢量化比較似乎不起作用。

functiona <- function (inNames,MatchNames,NickName_Table1){ 
    if(!is.na(inNames) & !is.na(MatchNames) & inNames != MatchNames) 
    { 
    if (length(subset(NickName_Table1,grepl(inNames,NickName_Table1)))>= 1) 
    { 
     possibleMatch <- subset(NickName_Table1,grepl(inNames,NickName_Table1)) 
     temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",Reduce(paste,unlist(possibleMatch))),","), stringsAsFactors = F)) 
     colnames(temp1) <- "Names" 
     temp2 <- data.frame(MatchNames, stringsAsFactors = F) 
     colnames(temp2) <- "Names_1" 

     if(nrow(sqldf("Select a.* from temp1 a left join temp2 b on a.Names=b.Names_1 where b.Names_1 is not NULL"))>= 1) 
     { 
     return <- 1 
     } 
     else 
     return <- 0 
    } 
    else 
     return <- 0 
    } 
    else 
    return <- 0 
} 

c <- mapply(functiona,Names$In_Name,Names$Match_Name,NickName_Table$Names) 
+1

具體談談你的問題。請解釋一下,不僅是問題,還有你所嘗試的以及你所堅持的。閱讀此:http://stackoverflow.com/help/how-to-ask – crabbly

回答

0

這可以全部放入單個SQL語句中。我們在NamesIn_NameMatch_Name之前加上一個逗號並附加一個逗號,以確保我們不會得到部分匹配,然後使用條件爲true的情況下使用條件(如果匹配的話)左連接(以確保Names的所有行保留)與NickName_TableIn_NameMatch_Name都設置爲Names的同一行。 SQLite函數instr檢查其第一個參數是否將其第二個參數作爲子字符串。

sqldf("select distinct In_Name, Match_Name, Names is not null as 'Match' 
     from Names 
     left join (select ',' || Names || ',' as Names from NickName_Table) 
     on instr(Names, ',' || In_Name || ',') and instr(Names, ',' || Match_Name || ',')") 

,並提供:

In_Name Match_Name Match 
1  GARY  GARRY  1 
2  JOHN  JON  1 
3 JAMES  JIMMY  1 
4 WILLIAM  PAUL  0 
5  BILL WILLIAM  1 
6  PAUL  PABLO  0 
7  TOM  THOMAS  1 
8 ANNIE  ANNE  1 
9 BELLA  BELLE  1 
10  SUE  SUSAN  1 
0

沒有循環!

sapply比循環指數地快。 merge也更快,特別是data.table

require(data.table) 
Names <- data.frame(In_Name = c("Gary",'John','James','William','Bill','Paul','Tom','Annie','Bella','Sue'), 
        Match_Name = c('Garry','Jon','Jimmy','Paul','William','Pablo','Thomas','Anne','Belle','Susan'),stringsAsFactors = F) 

Names[] <- lapply(Names, toupper) 
Names$Match <- 0 

NickName_Table <- data.table(Names = c('Garrett,Garret,Gary,Garry' 
             ,'Ian,John,Johnie,Johnnie,Johnny,Jon' 
             ,'Jae,James,Jamey,Jay,Jaymes,Jem,Jemmy,Jim,Jimi,Jimmie,Jimmy' 
             ,'Bill,Billie,Billy,Wil,Will,William,Willie,Willy' 
             ,'Paul,Pauly,Paulie' 
             ,'Maas,Thom,Thomas,Tom,Tomas,Tommie,Tommy' 
             ,'Ann,Anna,Anne,Annette,Annie,Nan,Nancy,Nanette,Nannie,Nanny' 
             ,'Bella,Belle,Ibbie,Issy,Izzy,Sabella' 
             ,'Isabella,Isabelle,Bella,Belle' 
             ,'Sue,Sukie,Susan,Susann,Susanna,Suzie')) 
NickName_Table[] <- lapply(NickName_Table, toupper) 

n    <- which(like(NickName_Table$Names,"BELLA")) 
tmp   <- as.data.frame(paste(NickName_Table$Names[n[1]], NickName_Table$Names[n[2]])) # either tweak if you have > 2 in other cases or just count columnwise TRUE values in final sapply step below 
colnames(tmp) <- NULL 
NickName_Table <- NickName_Table[!which(like(NickName_Table$Names,"BELLA")),] 
NickName_Table <- rbind(NickName_Table,tmp) 
NickName_Table$no <- 1:nrow(NickName_Table) 

Names$nick_row <- sapply(Names$In_Name,FUN = function(x) which(grepl(x, NickName_Table$Names))) 
Names   <- merge(x = Names, NickName_Table, by.x = "nick_row", by.y = "no") 

Names$Match <- diag(sapply(Names$Match_Name, FUN = function(x) grepl(x, Names$Names))) 
Names$Names <- NULL 
Names$nick_row <- NULL 
Names 

Names 
    In_Name Match_Name Match 
1  GARY  GARRY TRUE 
2  JOHN  JON TRUE 
3 JAMES  JIMMY TRUE 
4 WILLIAM  PAUL FALSE 
5  BILL WILLIAM TRUE 
6  PAUL  PABLO FALSE 
7  TOM  THOMAS TRUE 
8 ANNIE  ANNE TRUE 
9  SUE  SUSAN TRUE 
10 BELLA  BELLE TRUE 
+0

謝謝。然而,這一個不在相同的字符串中檢查,它只是檢查數據框中是否存在兩個名字。我需要評估第二個名字是否可以在任何第一個名字的字符串中找到。 – EwenM

+0

Hadley在Wombat2016上的演講; https://youtu.be/hRNUgwAFZtQ?t=47m33s「*很多人告訴你不要使用for循環,因爲它們很慢。這是完全的,完全的垃圾。*」指數級更快可能是誇大其詞。 –

+1

你能證明「sapply比循環指數地快嗎?」? –

0

假設一個),你只需要知道,如果一對In_NameMatch_Name同一行的Nickname_Table中存在,和b)你不需要知道行他們的,那麼我認爲這樣做的伎倆:

## separate the nicknames into individual strings 
splitlist <- sapply(NickName_Table, strsplit, ",") 

## create a truth table where In_Name and Match_Name both exist on a row of Nickname_Table 
truthMatrix <- sapply(1:nrow(Names), function(x) { 
    sapply(1:length(splitlist), function(y) { 
    match(Names$In_Name[x], splitlist[[y]])>0 & match(Names$Match_Name[x], splitlist[[y]])>0 
    }) 
}) 

## assign the value as a match if there is at least one anywhere 
Names$Match <- ifelse(is.na(apply(truthMatrix, 2, any)), 0, 1) 

Names  
#> In_Name Match_Name Match 
#> 1  GARY  GARRY  1 
#> 2  JOHN  JON  1 
#> 3 JAMES  JIMMY  1 
#> 4 WILLIAM  PAUL  0 
#> 5  BILL WILLIAM  1 
#> 6  PAUL  PABLO  0 
#> 7  TOM  THOMAS  1 
#> 8 ANNIE  ANNE  1 
#> 9 BELLA  BELLE  1 
#> 10  SUE  SUSAN  1 

這仍然需要通過nrow(Names)*nrow(Nickname_Table)值循環,但可能有一些矢量化乘虛而入。

爲了清楚起見,這裏的truthMatrix的價值:

truthMatrix 
#>  [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] 
#> [1,] TRUE NA NA NA NA NA NA NA NA NA 
#> [2,] NA TRUE NA NA NA NA NA NA NA NA 
#> [3,] NA NA TRUE NA NA NA NA NA NA NA 
#> [4,] NA NA NA NA TRUE NA NA NA NA NA 
#> [5,] NA NA NA NA NA NA NA NA NA NA 
#> [6,] NA NA NA NA NA NA TRUE NA NA NA 
#> [7,] NA NA NA NA NA NA NA TRUE NA NA 
#> [8,] NA NA NA NA NA NA NA NA TRUE NA 
#> [9,] NA NA NA NA NA NA NA NA TRUE NA 
#> [10,] NA NA NA NA NA NA NA NA NA TRUE 

在這裏你可以看到「貝拉」 /「百麗」兩次匹配。

+0

您可能想要返回'FALSE'或'0'而不是'NA' –

+0

更新爲返回0或1。 –