2016-12-11 139 views
3

我的數據如下。我想根據ID把數據拆分成幾個部分,其中一部分只保留ID相同的行。

# Example data: two integer ID columns, each paired with a factor column
# holding comma-separated strings. The last two rows have NA in Ids2 and an
# empty string in string2.
df1 <- data.frame(
  Ids1 = 1:7,
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", rep("gdyijq,udyhfs,gqdtr", 4L)),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  Ids2 = c(1L, 3L, 4L, 9L, 10L, NA, NA),
  string2 = factor(
    c("hishsgd,gugddf", "ydis", "gdyijq,udyhfs", "gqdtr", "nlrshf", "", ""),
    levels = c("", "gdyijq,udyhfs", "gqdtr", "hishsgd,gugddf", "nlrshf", "ydis")
  )
)

首先,我想建立df.1,並計算string1與string2中相同元素的個數(元素以逗號分隔)。

Ids1 string1   ids2 string2   Similar 
1 hishsgd,gugddf  1  hishsgd,gugddf  2 
3 ydis     3  ydis    1 
4 gdyijq,udyhfs,gqdtr 4  gdyijq,udyhfs  2 

我這樣做

# Asker's attempt. `==` compares Ids1 and Ids2 element-wise (row i against
# row i), so this keeps only rows where both columns hold the same value in
# the same row; which() drops the comparisons that evaluate to NA.
df.1 <- df1[which(df1$Ids1 == df1$Ids2), ] 

但這只給了我第一行,沒有其他結果。

然後我想得到那些只出現在Ids1中、而不存在於Ids2中的ID:
Ids1 string1 
2  hdydg 
5  gdyijq,udyhfs,gqdtr 
6  gdyijq,udyhfs,gqdtr 
7  gdyijq,udyhfs,gqdtr 

我這樣做,但也不起作用

# Asker's attempt. `!=` is element-wise: it keeps rows where Ids1 and Ids2
# differ within the same row, which is not the same as "Ids1 values that do
# not occur anywhere in Ids2". Rows where the comparison is NA are dropped
# by which().
df.2<- df1[which(df1$Ids1 != df1$Ids2), ] 

最後,我想保留那些只存在於Ids2而不在Ids1中的ID:

Ids1 string1 
9  gqdtr 
10  nlrshf 

我這樣做,但也不起作用

# Asker's attempt. This is the row-wise `!=` test with the operands swapped;
# `!=` is symmetric, so it selects exactly the rows where Ids1 and Ids2
# differ in the same row and keeps all four columns.
df.3<- df1[which(df1$Ids2 != df1$Ids1), ] 

回答

1

這是我想到的一個解決方案,使用了dplyr套件的連接(join)函數:

library(dplyr) 

# Pair every Ids1 value that also occurs in Ids2 with the string2 of the
# matching Ids2 row.
df.1 <- df1 %>%
  select(Ids1, string1) %>%
  inner_join(select(df1, Ids2, string2), by = c("Ids1" = "Ids2"))
# For each row, count the comma-separated items of string1 that also appear
# among the comma-separated items of string2.
df.1$Similar <- apply(df.1[, c("string1", "string2")], 1, function(pair) {
  left_items <- unlist(strsplit(pair[1], ","))
  right_items <- unlist(strsplit(pair[2], ","))
  sum(left_items %in% right_items)
})

# Ids1 rows whose ID has no match in Ids2.
df.2 <- df1 %>%
  select(Ids1, string1) %>%
  anti_join(select(df1, Ids2, string2), by = c("Ids1" = "Ids2"))
# Ids2 rows whose ID has no match in Ids1 ...
df.3 <- df1 %>%
  select(Ids2, string2) %>%
  anti_join(select(df1, Ids1, string1), by = c("Ids2" = "Ids1"))
# ... then discard rows that contain an NA.
df.3 <- df.3[complete.cases(df.3), ]

你也可以用另一種方式得到df.2和df.3,如下:

# Base R alternative: keep the rows whose Ids1 value does not occur in Ids2.
df.2 <- subset(df1, !(Ids1 %in% Ids2), select = c(Ids1, string1))
# Keep the rows whose Ids2 value does not occur in Ids1. NA is not found in
# Ids1, so NA rows pass this filter and are removed in the next step.
df.3 <- subset(df1, !(Ids2 %in% Ids1), select = c(Ids2, string2))
df.3 <- df.3[complete.cases(df.3), ]
+0

謝謝,我接受並且很喜歡你的答案 – nik