2017-07-14 21 views
0

我想找出這兩個表之間重疊的配對:從兩個 data frame 的兩列中提取重疊(overlapping)的行對。

# Question data, first table: console echo of dput(data1).
# A data.frame with 35 rows (row.names = c(NA, -35L)) and two character
# columns, Name.x and Name.y; each row holds one pair of names.
> dput(data1) 
structure(list(Name.x = c("MDH1", "MDH1", "IDH2", "IDH2", "IDH2", 
"IDH2", "IDH2", "IDH2", "IDH2", "SCOALB", "SCOALB", "CSY4", "CSY4", 
"CSY4", "CSY4", "CSY4", "FUM1", "FUM1", "IDH6", "IDH6", "IDH6", 
"ODC1-1", "ODC1-1", "ODC1-1", "ODC1-1", "ODC1-1", "ODC2-1", "ODC2-1", 
"ODC2-1", "ACO2", "IDH1", "IDH1", "IDH1", "IDH1", "ODC2-2"), 
    Name.y = c("SCOALB", "SCOALA-1", "CSY4", "IDH6", "ODC1-1", 
    "ODC2-1", "IDH1", "ODC2-2", "ODC1-2", "SCOALA-1", "SCOALA-2", 
    "IDH6", "SDH2-1", "IDH1", "IDH5", "ICDH", "ODC1-1", "ODC1-2", 
    "ACO2", "IDH1", "IDH5", "ODC2-1", "IDH1", "IDH5", "ODC2-2", 
    "ODC1-2", "IDH1", "ODC2-2", "ODC1-2", "IDH1", "IDH5", "SCOALA-2", 
    "ODC2-2", "ODC1-2", "ODC1-2")), .Names = c("Name.x", "Name.y" 
), class = "data.frame", row.names = c(NA, -35L)) 



# Question data, second table: console echo of dput(data2).
# A data.frame with 105 rows and two factor columns, Protein1 and Protein2,
# which share the same 38 labels. Row names are non-consecutive integers
# (1 ... 740) -- presumably a subset of a larger table; TODO confirm.
# NOTE(review): the labels here use mixed case (e.g. "SCoALb", "SCoALa-1")
# while data1 spells similar names in upper case ("SCOALB", "SCOALA-1");
# exact string comparison treats those as different values.
> dput(data2) 
    structure(list(Protein1 = structure(c(3L, 7L, 18L, 19L, 7L, 19L, 
    6L, 18L, 6L, 18L, 18L, 19L, 9L, 8L, 19L, 18L, 9L, 7L, 18L, 12L, 
    8L, 19L, 5L, 29L, 12L, 29L, 12L, 18L, 7L, 17L, 6L, 5L, 9L, 19L, 
    12L, 3L, 19L, 16L, 18L, 17L, 16L, 17L, 9L, 29L, 12L, 7L, 29L, 
    18L, 16L, 18L, 29L, 8L, 17L, 16L, 17L, 12L, 6L, 8L, 17L, 29L, 
    9L, 17L, 29L, 19L, 8L, 17L, 29L, 9L, 9L, 16L, 29L, 29L, 19L, 
    19L, 19L, 29L, 12L, 19L, 17L, 29L, 17L, 16L, 16L, 19L, 16L, 4L, 
    1L, 5L, 17L, 9L, 18L, 18L, 6L, 4L, 8L, 16L, 16L, 29L, 7L, 12L, 
    8L, 4L, 29L, 12L, 5L), .Label = c("ACO2", "ACO3", "CSY4", "FUM1", 
    "ICDH", "IDH1", "IDH2", "IDH5", "IDH6", "LPD1", "LPD2", "MDH1", 
    "MDH2", "ME1", "ME2", "ODC1-1", "ODC1-2", "ODC2-1", "ODC2-2", 
    "PDC1a-1", "PDC1a-2", "PDC1b", "PDC2-1", "PDC2-2", "SCoALa-1", 
    "SCoALa-2", "SCoALb", "SDH1-1", "SDH2-1", "SDH2-2", "SDH2-3", 
    "SDH3-1", "SDH4", "SDH5", "SDH6", "SDH7a", "SDH7b", "SDH8"), class = "factor"), 
     Protein2 = structure(c(1L, 6L, 7L, 17L, 1L, 16L, 3L, 9L, 
     1L, 5L, 17L, 9L, 8L, 7L, 18L, 18L, 5L, 3L, 16L, 3L, 5L, 8L, 
     4L, 7L, 5L, 3L, 6L, 6L, 5L, 3L, 5L, 3L, 3L, 6L, 7L, 3L, 7L, 
     9L, 1L, 8L, 5L, 16L, 7L, 6L, 4L, 7L, 4L, 3L, 3L, 12L, 1L, 
     1L, 9L, 7L, 7L, 9L, 6L, 6L, 5L, 8L, 1L, 17L, 29L, 3L, 8L, 
     6L, 9L, 9L, 6L, 12L, 5L, 19L, 12L, 5L, 1L, 16L, 1L, 19L, 
     4L, 18L, 12L, 1L, 4L, 4L, 6L, 3L, 1L, 1L, 1L, 4L, 4L, 8L, 
     4L, 1L, 3L, 8L, 16L, 12L, 4L, 12L, 4L, 4L, 17L, 8L, 5L), .Label = c("ACO2", 
     "ACO3", "CSY4", "FUM1", "ICDH", "IDH1", "IDH2", "IDH5", "IDH6", 
     "LPD1", "LPD2", "MDH1", "MDH2", "ME1", "ME2", "ODC1-1", "ODC1-2", 
     "ODC2-1", "ODC2-2", "PDC1a-1", "PDC1a-2", "PDC1b", "PDC2-1", 
     "PDC2-2", "SCoALa-1", "SCoALa-2", "SCoALb", "SDH1-1", "SDH2-1", 
     "SDH2-2", "SDH2-3", "SDH3-1", "SDH4", "SDH5", "SDH6", "SDH7a", 
     "SDH7b", "SDH8"), class = "factor")), .Names = c("Protein1", 
    "Protein2"), class = "data.frame", row.names = c(1L, 4L, 6L, 
    12L, 22L, 25L, 28L, 33L, 44L, 48L, 51L, 52L, 53L, 60L, 68L, 70L, 
    72L, 76L, 86L, 109L, 110L, 119L, 133L, 144L, 146L, 158L, 170L, 
    197L, 202L, 206L, 211L, 213L, 226L, 227L, 237L, 271L, 272L, 286L, 
    290L, 297L, 304L, 305L, 306L, 319L, 323L, 327L, 347L, 348L, 351L, 
    357L, 370L, 372L, 373L, 378L, 379L, 392L, 406L, 410L, 414L, 417L, 
    419L, 437L, 442L, 445L, 448L, 455L, 457L, 462L, 471L, 479L, 482L, 
    483L, 488L, 503L, 509L, 522L, 536L, 563L, 618L, 620L, 623L, 628L, 
    630L, 644L, 647L, 666L, 668L, 673L, 676L, 678L, 679L, 690L, 691L, 
    694L, 698L, 703L, 709L, 714L, 715L, 722L, 723L, 724L, 727L, 739L, 
    740L)) 

每個 data frame 都有兩列,存儲的都是字符串。兩個表中的字符串有重疊,但每一對內部的順序可能不同:同一對中的某個字符串可能出現在 data1 的第一列,而在 data2 中卻出現在第二列。如何找出兩個數據集之間有哪些配對重疊,以及重疊的配對共有多少?

+1

分享所需要的輸出 – RUser

+0

你的意思是把 'a,b' 和 'b,a' 也視為匹配嗎? – zx8754

回答

5
# Find the rows of data1 whose (first, second) pair also appears, in the same
# order, as a (Protein1, Protein2) pair in data2.

# One string key per row, "<first>.<second>". interaction() joins the two
# columns with "." and as.character() turns the resulting factor into text.
data1$combine <- as.character(interaction(data1$Name.x, data1$Name.y))
data2$combine <- as.character(interaction(data2$Protein1, data2$Protein2))

# The membership test has to be evaluated on data1's keys so that the logical
# index has one element per row of data1. Matching the other way round
# (data2's keys against data1's) yields a vector as long as data2, and using
# it to index data1 selects unrelated rows.
dat.overlap <- data1[data1$combine %in% data2$combine, ]
dat.overlap
#    Name.x Name.y       combine
# 3    IDH2   CSY4     IDH2.CSY4
# 7    IDH2   IDH1     IDH2.IDH1
# 19   IDH6   ACO2     IDH6.ACO2
# 20   IDH6   IDH1     IDH6.IDH1
# 21   IDH6   IDH5     IDH6.IDH5
# 23 ODC1-1   IDH1   ODC1-1.IDH1
# 24 ODC1-1   IDH5   ODC1-1.IDH5
# 27 ODC2-1   IDH1   ODC2-1.IDH1
# 29 ODC2-1 ODC1-2 ODC2-1.ODC1-2
# 35 ODC2-2 ODC1-2 ODC2-2.ODC1-2

# Number of overlapping pairs.
nrow(dat.overlap)
# [1] 10

# Note: the key is order-sensitive, so a pair stored as (a, b) in data1 does
# not match the same pair stored as (b, a) in data2.
+0

當data1是'a,b',data2是'b,a'時,這是否匹配? – zx8754

+0

這是我想問的問題。 –

+0

@ShaxiLiver 所以這個問題要由你來回答:這種情況你希望算作匹配,還是不算匹配? – zx8754

2

先按行對每一對名稱排序,再把排序後的名稱粘貼成一個鍵(key),然後按這個鍵合併兩個表:

# Build an order-insensitive key for a pair of names, so that (a, b) and
# (b, a) get the same key.
#   a, b: vectors of equal length (character or factor) holding the two
#         members of each pair.
# Returns a character vector "<smaller>_<larger>", one element per pair, where
# smaller/larger follow the string ordering used by pmin()/pmax().
pair_key <- function(a, b) {
  # as.character() so factor columns are compared by label, not by level code.
  a <- as.character(a)
  b <- as.character(b)
  paste(pmin(a, b), pmax(a, b), sep = "_")
}

# The key is derived from the two name columns only, so any other column that
# may already be present in the data frames cannot leak into it.
data1$key <- pair_key(data1$Name.x, data1$Name.y)
data2$key <- pair_key(data2$Protein1, data2$Protein2)

# Inner join on the key: one output row per matching combination of a data1
# row and a data2 row (a pair that occurs several times is repeated).
res <- merge(data1, data2, by = "key")

# Output below is for data frames that hold only the two original columns.
head(res)
#         key Name.x Name.y Protein1 Protein2
# 1 ACO2_IDH1   ACO2   IDH1     IDH1     ACO2
# 2 ACO2_IDH6   IDH6   ACO2     IDH6     ACO2
# 3 CSY4_ICDH   CSY4   ICDH     ICDH     CSY4
# 4 CSY4_IDH1   CSY4   IDH1     IDH1     CSY4
# 5 CSY4_IDH2   IDH2   CSY4     IDH2     CSY4
# 6 CSY4_IDH5   CSY4   IDH5     IDH5     CSY4