2015-09-28 55 views
4

合併數據幀的內置功能包含一個選項,更改後綴申請時合併這兩個數據幀有一個共同的列名:合併數據幀 - 列前綴

## S3 method for class 'data.frame' 
merge(x, y, by = intersect(names(x), names(y)), 
     by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, 
     sort = TRUE, suffixes = c(".x",".y"), 
     incomparables = NULL, ...) 

是否有一個選項,而不是有前綴(例如= c("x.","y."))是否適用?

回答

3

進行自定義合併功能。

#dummy data 
df1 <- head(mtcars[,1:4]) 
df2 <- tail(mtcars[,1:6]) 

#base merge 
merge(df1,df2, by="cyl") 
# cyl mpg.x disp.x hp.x mpg.y disp.y hp.y drat wt 
# 1 4 22.8 108 93 26.0 120.3 91 4.43 2.140 
# 2 4 22.8 108 93 30.4 95.1 113 3.77 1.513 
# 3 4 22.8 108 93 21.4 121.0 109 4.11 2.780 
# 4 6 21.0 160 110 19.7 145.0 175 3.62 2.770 
# 5 6 21.0 160 110 19.7 145.0 175 3.62 2.770 
# 6 6 21.4 258 110 19.7 145.0 175 3.62 2.770 
# 7 6 18.1 225 105 19.7 145.0 175 3.62 2.770 
# 8 8 18.7 360 175 15.8 351.0 264 4.22 3.170 
# 9 8 18.7 360 175 15.0 301.0 335 3.54 3.570 

#custom merge 
myMerge(df1,df2, by="cyl") 
# cyl x.mpg x.disp x.hp y.mpg y.disp y.hp drat wt 
# 1 4 22.8 108 93 26.0 120.3 91 4.43 2.140 
# 2 4 22.8 108 93 30.4 95.1 113 3.77 1.513 
# 3 4 22.8 108 93 21.4 121.0 109 4.11 2.780 
# 4 6 21.0 160 110 19.7 145.0 175 3.62 2.770 
# 5 6 21.0 160 110 19.7 145.0 175 3.62 2.770 
# 6 6 21.4 258 110 19.7 145.0 175 3.62 2.770 
# 7 6 18.1 225 105 19.7 145.0 175 3.62 2.770 
# 8 8 18.7 360 175 15.8 351.0 264 4.22 3.170 
# 9 8 18.7 360 175 15.0 301.0 335 3.54 3.570 

定製myMerge功能:

#custom myMerge function - modified from "base::merge.data.frame" 
myMerge <- function (x, y, by = intersect(names(x), names(y)), by.x = by, 
        by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, 
        prefix = c("x.", "y."), incomparables = NULL, ...) 
{ 
    fix.by <- function(by, df) { 
    if (is.null(by)) 
     by <- numeric() 
    by <- as.vector(by) 
    nc <- ncol(df) 
    if (is.character(by)) { 
     poss <- c("row.names", names(df)) 
     if (any(bad <- !charmatch(by, poss, 0L))) 
     stop(ngettext(sum(bad), "'by' must specify a uniquely valid column", 
         "'by' must specify uniquely valid columns"), 
      domain = NA) 
     by <- match(by, poss) - 1L 
    } 
    else if (is.numeric(by)) { 
     if (any(by < 0L) || any(by > nc)) 
     stop("'by' must match numbers of columns") 
    } 
    else if (is.logical(by)) { 
     if (length(by) != nc) 
     stop("'by' must match number of columns") 
     by <- seq_along(by)[by] 
    } 
    else stop("'by' must specify one or more columns as numbers, names or logical") 
    if (any(bad <- is.na(by))) 
     stop(ngettext(sum(bad), "'by' must specify a uniquely valid column", 
        "'by' must specify uniquely valid columns"), 
      domain = NA) 
    unique(by) 
    } 
    nx <- nrow(x <- as.data.frame(x)) 
    ny <- nrow(y <- as.data.frame(y)) 
    if (nx >= 2^31 || ny >= 2^31) 
    stop("long vectors are not supported") 
    by.x <- fix.by(by.x, x) 
    by.y <- fix.by(by.y, y) 
    if ((l.b <- length(by.x)) != length(by.y)) 
    stop("'by.x' and 'by.y' specify different numbers of columns") 
    if (l.b == 0L) { 
    nm <- nm.x <- names(x) 
    nm.y <- names(y) 
    has.common.nms <- any(cnm <- nm.x %in% nm.y) 
    if (has.common.nms) { 
     names(x)[cnm] <- paste0(prefix[1L], nm.x[cnm]) 
     cnm <- nm.y %in% nm 
     names(y)[cnm] <- paste0(prefix[1L], nm.y[cnm]) 
    } 
    if (nx == 0L || ny == 0L) { 
     res <- cbind(x[FALSE, ], y[FALSE, ]) 
    } 
    else { 
     ij <- expand.grid(seq_len(nx), seq_len(ny)) 
     res <- cbind(x[ij[, 1L], , drop = FALSE], y[ij[, 
                2L], , drop = FALSE]) 
    } 
    } 
    else { 
    if (any(by.x == 0L)) { 
     x <- cbind(Row.names = I(row.names(x)), x) 
     by.x <- by.x + 1L 
    } 
    if (any(by.y == 0L)) { 
     y <- cbind(Row.names = I(row.names(y)), y) 
     by.y <- by.y + 1L 
    } 
    row.names(x) <- NULL 
    row.names(y) <- NULL 
    if (l.b == 1L) { 
     bx <- x[, by.x] 
     if (is.factor(bx)) 
     bx <- as.character(bx) 
     by <- y[, by.y] 
     if (is.factor(by)) 
     by <- as.character(by) 
    } 
    else { 
     if (!is.null(incomparables)) 
     stop("'incomparables' is supported only for merging on a single column") 
     bx <- x[, by.x, drop = FALSE] 
     by <- y[, by.y, drop = FALSE] 
     names(bx) <- names(by) <- paste0("V", seq_len(ncol(bx))) 
     bz <- do.call("paste", c(rbind(bx, by), sep = "\r")) 
     bx <- bz[seq_len(nx)] 
     by <- bz[nx + seq_len(ny)] 
    } 
    comm <- match(bx, by, 0L) 
    bxy <- bx[comm > 0L] 
    xinds <- match(bx, bxy, 0L, incomparables) 
    yinds <- match(by, bxy, 0L, incomparables) 
    if (nx > 0L && ny > 0L) 
     m <- .Internal(merge(xinds, yinds, all.x, all.y)) 
    else m <- list(xi = integer(), yi = integer(), x.alone = seq_len(nx), 
        y.alone = seq_len(ny)) 
    nm <- nm.x <- names(x)[-by.x] 
    nm.by <- names(x)[by.x] 
    nm.y <- names(y)[-by.y] 
    ncx <- ncol(x) 
    if (all.x) 
     all.x <- (nxx <- length(m$x.alone)) > 0L 
    if (all.y) 
     all.y <- (nyy <- length(m$y.alone)) > 0L 
    lxy <- length(m$xi) 
    has.common.nms <- any(cnm <- nm.x %in% nm.y) 
    if (has.common.nms && nzchar(prefix[1L])) 
     nm.x[cnm] <- paste0(prefix[1L], nm.x[cnm]) 
    x <- x[c(m$xi, if (all.x) m$x.alone), c(by.x, seq_len(ncx)[-by.x]), 
      drop = FALSE] 
    names(x) <- c(nm.by, nm.x) 
    if (all.y) { 
     ya <- y[m$y.alone, by.y, drop = FALSE] 
     names(ya) <- nm.by 
     xa <- x[rep.int(NA_integer_, nyy), nm.x, drop = FALSE] 
     names(xa) <- nm.x 
     x <- rbind(x, cbind(ya, xa)) 
    } 
    if (has.common.nms && nzchar(prefix[2L])) { 
     cnm <- nm.y %in% nm 
     nm.y[cnm] <- paste0(prefix[2L], nm.y[cnm]) 
    } 
    y <- y[c(m$yi, if (all.x) rep.int(1L, nxx), if (all.y) m$y.alone), 
      -by.y, drop = FALSE] 
    if (all.x) { 
     zap <- (lxy + 1L):(lxy + nxx) 
     for (i in seq_along(y)) { 
     if (is.matrix(y[[1]])) 
      y[[1]][zap, ] <- NA 
     else is.na(y[[i]]) <- zap 
     } 
    } 
    if (has.common.nms) 
     names(y) <- nm.y 
    nm <- c(names(x), names(y)) 
    if (any(d <- duplicated(nm))) 
     if (sum(d) > 1L) 
     warning("column names ", paste(sQuote(nm[d]), 
             collapse = ", "), " are duplicated in the result", 
       domain = NA) 
    else warning("column name ", sQuote(nm[d]), " is duplicated in the result", 
       domain = NA) 
    res <- cbind(x, y) 
    if (sort) 
     res <- res[if (all.x || all.y) 
     do.call("order", x[, seq_len(l.b), drop = FALSE]) 
     else sort.list(bx[m$xi]), , drop = FALSE] 
    } 
    attr(res, "row.names") <- .set_row_names(nrow(res)) 
    res 
} 
3

由於merge沒有這個選項(如2月22日2016),我們就可以讓merge做它的東西,然後清理事後通過運行:

names(df_new) <- gsub("(.*).([xy])$", "\\2.\\1", names(df_new)) 

我總是討厭在腳本中寫這些令人分心的代碼行,但這就是生活。

+0

同意,但很容易將它包裝在一個小函數中,以保持腳本清潔。 –