2017-04-03 61 views
1

我有一個相當大的數據集(15.000行),並且由於數據結構,我需要對每行進行計算。我的數據集中有一列需要進一步拆分。下面是一個例子:對所有行執行操作並將結果添加回主數據框

# Sample data: three rows, each with a ";"-separated Route string whose
# segments each hold eight ":"-separated fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")

# Combine the four vectors into the main data frame (one column per vector)
Plan <- data.frame(date, Number, Route, Dep)

對我來說,重要的信息在「Route」列中。我需要從這一列生成聚合特徵。該列每個單元格中的信息都需要以「;」爲分隔符進行拆分。

我試過到目前爲止:

  1. 選擇一行

  2. 創建一個只包含這一行的新數據框。

  3. 在「Route」列上使用 mutate 和 unnest,以「;」爲分隔符進行拆分,併爲每個片段創建一個新行

    test <- Plan[1,]
    test <- test %>% mutate(Route=strsplit(as.character(Route), ";")) %>% unnest(Route)

  4. 使用 cSplit 以「:」爲分隔符拆分「Route」列中的信息

    # Split the Route column on ":" into separate Route_<n> columns
    test <- cSplit(test, splitCols = "Route", sep = ":")
    
  5. 我則對數據的這個子集進行我的計算。

  6. 創建變量 x、y、z 來保存我的計算結果

    # Per-row summaries taken from the split Route columns
    x1 <- mean(test[["Route_2"]])
    y1 <- max(test[["Route_5"]])
    z1 <- min(test[["Route_8"]])
    

兩個問題:

我怎麼能在我的原始數據集自動執行此操作的所有行? 如何將保存的變量(x,y,z)中的數據合併回原始數據框?

期望的輸出(x2 到 z3 的數值並非由數據實際計算得出,僅作爲示例)

# Illustrative (x, y, z) values, one triple per row of Plan
x1 <- 12
y1 <- 86363
z1 <- 7383

x2 <- 45
y2 <- 6754
z2 <- 3553

x3 <- 5648
y3 <- 64
z3 <- 6363

# Attach the per-row values to the main data frame as new columns
Plan[["x"]] <- c(x1, x2, x3)
Plan[["y"]] <- c(y1, y2, y3)
Plan[["z"]] <- c(z1, z2, z3)

head(Plan)

完整示例代碼(一次全部列出)

library(splitstackshape) 
library(plyr) 
library(tidyr) 

# Sample data: three rows, each with a ";"-separated Route string whose
# segments each hold eight ":"-separated fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")

Plan <- data.frame(date, Number, Route, Dep)

# Manual procedure, carried out for the first row only:
# take the row, split Route on ";" into one row per segment, then split each
# segment on ":" into Route_<n> columns.
test <- Plan[1, ]
test <- mutate(test, Route = strsplit(as.character(Route), ";"))
test <- unnest(test, Route)
test <- cSplit(test, "Route", ":")

# Summaries computed from the first row
x1 <- mean(test[["Route_2"]])
y1 <- max(test[["Route_5"]])
z1 <- min(test[["Route_8"]])

# Placeholder values standing in for rows 2 and 3
x2 <- 45
y2 <- 6754
z2 <- 3553

x3 <- 5648
y3 <- 64
z3 <- 6363

# Attach the per-row values to the main data frame as new columns
Plan[["x"]] <- c(x1, x2, x3)
Plan[["y"]] <- c(y1, y2, y3)
Plan[["z"]] <- c(z1, z2, z3)

head(Plan)
+0

請包括您提供的示例data.frame所需的輸出。我懷疑你想要'strsplit',但是我不完全確定最終的data.frame。 – lmo

+0

感謝您的信息! – Anna2803

+0

'tidyr'包中的'separate'函數在這裏可能會有所幫助 – bouncyball

回答

2

創建第二個臨時路由列,名爲Route_tmp,並從中爲其每個分量生成一個單獨的行,以分號分隔,然後用冒號將結果變量Route_tmp分隔成單獨的列。現在按原始變量進行分組,我們採用所需列的平均值。 (請注意,如果我們在輸出中不需要Route,那麼我們可以忽略頂部的mutate並使用Route代替Route_tmp。)

library(dplyr) 
library(tidyr) 

# For every row of Plan, compute the mean of fields 2, 5 and 8 taken over all
# ";"-separated segments of that row's Route string.
out <- Plan %>%
    # Work on a copy so the original Route string survives as a grouping key
    mutate(Route_tmp = Route) %>%
    # One row per ";"-separated segment
    separate_rows(Route_tmp, sep = ";") %>%
    # Split on ":" only. The default `sep` of separate() is the regex
    # "[^[:alnum:]]+", which also consumes "-" and therefore silently strips
    # the sign of negative fields (e.g. "-4" would be read as 4). With the
    # sample data this makes y = -17 instead of the 17 the default produces.
    separate(Route_tmp, as.character(1:8), sep = ":", convert = TRUE) %>%
    # Group by all original columns so there is one summary row per input row
    group_by(date, Number, Route, Dep) %>%
    summarize(x = mean(`2`), y = mean(`5`), z = mean(`8`)) %>%
    ungroup()

給予以下(不顯示路徑欄,使其更易於閱讀):

> out[-3] 
# A tibble: 3 × 6 
     date Number Dep  x  y  z 
     <fctr> <dbl> <fctr> <dbl> <dbl> <dbl> 
1 2013-05-06 231 HAM 8224.333 17 33.66667 
2 2015-07-10 345 FGC 8224.333 17 33.66667 
3 2017-08-10  10 ICAO 8224.333 17 33.66667 

注:由於問題中的 Plan 被覆蓋過,不清楚究竟應以哪個版本的 Plan 作爲輸入,這裏我假設輸入如下:

# Input assumed by the pipeline above: the original four columns only,
# without the x, y and z columns added later in the question.
Plan <- data.frame(
  date = c("2015-07-10", "2013-05-06", "2017-08-10"),
  Number = c(345, 231, 10),
  Route = c(
    "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
    "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
    "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
  ),
  Dep = c("FGC", "HAM", "ICAO")
)
2

以下是我使用 tidyverse 系列套件的做法:

library(dplyr) 
library(tidyr) 
library(stringr) 
library(purrr) 
# Summarise a single route. `route` is a character vector of segments (the
# pieces of one Plan$Route entry after splitting on ";"). Every segment is
# split on ":" and fields 2, 5 and 8 are averaged across the segments.
# Returns a one-row tibble with columns x, y and z holding those means.
route_extract <- function(route) {
    fields <- str_split(route, fixed(":"), simplify = TRUE)
    picked <- fields[, c(2, 5, 8), drop = FALSE]
    # Switch the character matrix to numeric in place so it stays a matrix
    storage.mode(picked) <- "numeric"
    avg <- colMeans(picked)
    tibble(x = avg[1], y = avg[2], z = avg[3])
}
# Apply route_extract() to every entry of `routes` (e.g. Plan$Route): each
# entry is split on ";" into its segments, and the per-entry results are
# row-bound into one data frame with a row per entry.
route_calc <- function(routes) {
    segments <- str_split(routes, fixed(";"))
    map_df(segments, route_extract)
}

# Compute the x, y, z summaries for every route and append them to Plan
Plan <- route_calc(Plan[["Route"]]) %>%
    bind_cols(Plan, .)
相關問題