2012-11-29 23 views
2

使用多列的 model.matrix:我想在 model.matrix 中使用 data.frame 的多個列。

數據幀是這樣的:

# Build a small example data set: 10 rows with an id, two ZIP-code columns,
# one numeric predictor (var1) and the response (Sales). Values are random,
# so every run produces different data.
df1 <- data.frame(
  id = seq(1, 10, 1),
  zip1 = round(runif(10) * 100000, 0),
  zip2 = round(runif(10) * 100000, 0),
  var1 = round(runif(10) * 100, 1),
  Sales = round(runif(10) * 10000, 2)
)

# Left-pad every ZIP code with zeros to a width of five characters.
# Pasting a single "0" in front is not enough: a value such as 451 would
# become "0451" (four characters) instead of "00451".
# The ZIP codes are categorical, so store them as factors.
df1$zip1 <- factor(sprintf("%05.0f", df1$zip1))
df1$zip2 <- factor(sprintf("%05.0f", df1$zip2))
dput(df1)


> dput(df1) 
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), zip1 = structure(c(5L, 
1L, 8L, 3L, 7L, 9L, 2L, 6L, 10L, 4L), .Label = c("16667", "21922", 
"29100", "54398", "55447", "72607", "84667", "96562", "97012", 
"99125"), class = "factor"), zip2 = structure(c(7L, 4L, 2L, 6L, 
3L, 8L, 5L, 9L, 1L, 10L), .Label = c("0451", "0644", "14040", 
"17184", "18838", "42308", "48507", "50496", "64851", "89748" 
), class = "factor"), var1 = c(94.4, 39.6, 47.1, 74, 67.3, 53.4, 
28.7, 91.6, 47.1, 44.8), Sales = c(6394.03, 5575.65, 773.58, 
3181.43, 4992.56, 6627.01, 3313.13, 1585.1, 5080.98, 499.86)), .Names = c("id", 
"zip1", "zip2", "var1", "Sales"), row.names = c(NA, -10L), class = "data.frame") 

它看起來像這樣:

> df1 
    id zip1 zip2 var1 Sales 
1 1 55447 48507 94.4 6394.03 
2 2 16667 17184 39.6 5575.65 
3 3 96562 0644 47.1 773.58 
4 4 29100 42308 74.0 3181.43 
5 5 84667 14040 67.3 4992.56 
6 6 97012 50496 53.4 6627.01 
7 7 21922 18838 28.7 3313.13 
8 8 72607 64851 91.6 1585.10 
9 9 99125 0451 47.1 5080.98 
10 10 54398 89748 44.8 499.86 

我想這樣做:

# Intended workflow: build a design matrix from every column of df1 except
# column 1 (id) and column 5 (Sales), then fit a cross-validated glmnet model
# with Sales as the response.
# NOTE(review): the formula passes a whole data.frame as a single term;
# presumably this fails like the similar formula whose error is reported
# further down -- confirm.
df2 <- model.matrix(~df1[,-c(1,5)]) 
glmnet1 <- cv.glmnet(df2, df1[,"Sales"] 
    , family="gaussian", alpha=.95, nfolds=10, standardize = FALSE, type.measure="deviance") 

不過,我無法讓 model.matrix 接受多個變量:

> f1 <- formula(df1$Sales ~ df1[,c("zip1","zip2")]) 
> df2 <- model.matrix(f1) 
    Error in model.frame.default(object, data, xlev = xlev) : 
    invalid type (list) for variable 'df1[, c("zip1", "zip2")]' 

有什麼建議嗎?

EDITS:

> f1 <- formula(Sales ~., data= df1[,-c(1,5)]) 
> df2 <- model.matrix(f1) 
Error in terms.formula(object) : '.' in formula and no 'data' argument 
+0

這樣:`model.matrix(Sales ~ zip1 + zip2, data = df1)` 是你想要的嗎? – thelatemail

+0

@thelatemail:這將起作用,我編輯了問題以更好地描述問題。我在〜100個變量上使用它,並且不想用var1 + var2 + ... + varx將它們寫出來。 – screechOwl

+0

如果 `model.matrix` 接受「.」簡寫,那麼可以把上面的建議改爲 `model.matrix(Sales ~ ., data = df1)`(我認爲這是公式的正確語法,但不完全確定)。 –

回答

8

可以指定多個變量的公式,如:

# Design matrix for Sales with both ZIP factors listed explicitly as terms.
model.matrix(object = Sales ~ zip1 + zip2, data = df1)

如果你想省去逐個鍵入變量名的麻煩,可以嘗試:

# Build the formula from column names instead of typing every term by hand.
# reformulate() joins the term labels with "+" and attaches the response,
# replacing the manual paste() + as.formula() round trip.
# Columns 2:3 of df1 are the two ZIP factors.
formdf1 <- reformulate(names(df1)[2:3], response = "Sales")
formdf1
# Sales ~ zip1 + zip2

然後運行:

# Expand the programmatically built formula into a design matrix.
model.matrix(object = formdf1, data = df1)
3
# library() stops with an error if glmnet is not installed; require() would
# only return FALSE and let the script fail later at cv.glmnet().
library(glmnet)

# Keep "Sales" for the formula (model.matrix leaves the response out of the
# design matrix) and drop the id column, which is not a predictor.
df2 <- df1[, names(df1) != "id", drop = FALSE]
# Could use df2 <- df1[ !names(df1) %in% exclude_vec ]

# "Sales ~ ." expands to every remaining column of df2, so the predictors
# never have to be written out one by one. [, -1] removes the intercept
# column that model.matrix() adds as the first column.
glmnet1 <- cv.glmnet(
  model.matrix(Sales ~ ., data = df2)[, -1],
  df1[, "Sales"],
  family = "gaussian",
  alpha = .95,
  nfolds = 10,
  standardize = FALSE,
  type.measure = "deviance"
)