針對您的具體問題,您可以這樣做:
# Load the h2o package and initialise H2O with the strict version check disabled.
library(h2o)
h2o.init(strict_version_check = FALSE)

# Import the iris data set from the public H2O test-data bucket.
iris_wheader <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
iris.hex <- h2o.importFile(iris_wheader)

# Count the rows in each "class" group; na.methods = "rm" is supplied
# through gb.control.
iris_count <- h2o.group_by(
  data = iris.hex,
  by = "class",
  nrow("class"),
  gb.control = list(na.methods = "rm")
)
您可以看到原始的數據框和分組後的結果:
head(iris.hex)
sepal_len sepal_wid petal_len petal_wid class
1 5.1 3.5 1.4 0.2 Iris-setosa
2 4.9 3.0 1.4 0.2 Iris-setosa
3 4.7 3.2 1.3 0.2 Iris-setosa
4 4.6 3.1 1.5 0.2 Iris-setosa
5 5.0 3.6 1.4 0.2 Iris-setosa
6 5.4 3.9 1.7 0.4 Iris-setosa
iris_count
class nrow
1 Iris-setosa 50
2 Iris-versicolor 50
3 Iris-virginica 50
目前還沒有相關文檔(將在未來的版本中添加),但這裏有一些示例:
> library(h2o)
> h2o.init()
# Import the airlines data set and display a summary.
> airlinesURL <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
> airlines.hex <- h2o.importFile(path = airlinesURL, destination_frame = "airlines.hex")
> summary(airlines.hex)
# Find the number of flights by airport: group by "Origin" and count the rows
# in each group (na.methods = "rm" is passed through gb.control).
> originFlights <- h2o.group_by(data = airlines.hex, by = "Origin", nrow("Origin"), gb.control=list(na.methods="rm"))
# Convert the grouped result to an R data.frame and print it.
> originFlights.R <- as.data.frame(originFlights)
> originFlights.R
Origin nrow_Origin
1 ABE 59
2 ABQ 876
3 ACY 31
...
# Find the number of flights per month: group by "Month" and count the rows
# in each group (na.methods = "rm" is passed through gb.control).
> flightsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", nrow("Month"), gb.control=list(na.methods="rm"))
# Convert the grouped result to an R data.frame and print it.
> flightsByMonth.R <- as.data.frame(flightsByMonth)
> flightsByMonth.R
Month nrow_Month
1 1 41979
2 10 1999
# Find the number of flights in a given month based on the origin.
# Grouping uses both columns named in `cols`.
> cols <- c("Origin","Month")
# The aggregate nrow("NumberOfFlights") produces the nrow_NumberOfFlights
# column shown in the printed result.
> flightsByOriginMonth <- h2o.group_by(data=airlines.hex, by=cols,nrow("NumberOfFlights"), gb.control=list(na.methods="rm"))
# Convert the grouped result to an R data.frame and print it.
> flightsByOriginMonth.R <- as.data.frame(flightsByOriginMonth)
> flightsByOriginMonth.R
Origin Month nrow_NumberOfFlights
1 ABE 1 59
2 ABQ 1 846
3 ABQ 10 30
4 ACY 1 31
5 ALB 1 75
...
# Find months with the highest cancellation ratio
# Look up the column index of "Cancelled" (informational only; the index is
# not used by the statements that follow).
> which(colnames(airlines.hex)=="Cancelled")
[1] 22
# Sum the "Cancelled" column within each month.
> cancellationsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", sum("Cancelled"), gb.control=list(na.methods="rm"))
# Divide the summed cancellations by the per-month flight count.
# NOTE(review): relies on flightsByMonth from the flights-per-month example
# and assumes both results list the months in the same row order - confirm.
> cancellation_rate <- cancellationsByMonth$sum_Cancelled/flightsByMonth$nrow_Month
# Pair each month with its rate, then convert to an R data.frame and print it.
> rates_table <- h2o.cbind(flightsByMonth$Month,cancellation_rate)
> rates_table.R <- as.data.frame(rates_table)
> rates_table.R
Month sum_Cancelled
1 1 0.025417471
2 10 0.009504752
# Use group_by with multiple columns. Summarize the destination, arrival delays, and departure delays for an origin
> cols <- c("Dest", "IsArrDelayed", "IsDepDelayed")
# First attempt: pass the whole vector of column names to a single sum() aggregate.
> originFlights <- h2o.group_by(data = airlines.hex[c("Origin",cols)], by = "Origin", sum(cols),gb.control = list(na.methods = "ignore", col.names = NULL))
# Note: the call above emits a warning because col.names is NULL.
# Alternative: run one group_by per column, column-bind the per-column results,
# and keep columns 1, 2, 4 and 6 (Origin plus the three sum_* columns; columns
# 3 and 5 are presumably the repeated Origin key - confirm).
> res <- h2o.cbind(lapply(cols, function(x){h2o.group_by(airlines.hex,by="Origin",sum(x))}))[,c(1,2,4,6)]
> res
Origin sum_Dest sum_IsArrDelayed sum_IsDepDelayed
1 ABE 5884 40 30
2 ABQ 84505 545 370
3 ACY 3131 9 7
4 ALB 3646 49 50
5 AMA 317 4 6
6 ANC 100 0 1
聚合函數是如何工作的?我嘗試用 `h2o.nrow("class")` 替換 `nrow("class")`,但出現了一個錯誤(名稱查找 'h2o.nrow' 失敗)。這是否意味着所使用的聚合函數是基於 R 的函數? – mauna
聚合函數並不使用 base R 的函數。H2O 的 group_by 使用不同的命名約定。我創建了一個 JIRA 工單(https://0xdata.atlassian.net/browse/PUBDEV-4549),以便我們的文檔列出您可以使用的聚合函數。感謝您的反饋! – Lauren