1
我有一個數據框,其中某一列的每一行都包含多個因子,以逗號分隔。每行中的因子數量以及不同因子的總數都是未知的。我需要對這一列進行獨熱編碼(one-hot encoding),使每一個唯一的因子都佔據自己的一列。我在下面給出了一個解決方案,但我相信一定有更好、更優雅的做法。這裏是一個例子:對每行含有多個因子的數據框進行獨熱編碼
#one hot encode multiple factors in each row
library(stringr)
library(caret)
library(splitstackshape)
# Build a reproducible toy data frame: every row of `fact` holds a
# comma-separated, randomly sized subset of the labels in `factors`.
set.seed(123)
n_rows <- 100
# Number of distinct labels; also the largest number of labels a row may hold.
factor.num <- sample(3:6, 1)
# Draw exactly `factor.num` labels. Drawing a fixed count here would let a row
# request more labels than exist (sampling is without replacement) and would
# leave the result with a column count that disagrees with the label count.
factors <- letters[sample(1:26, factor.num)]
df1 <- data.frame(
  fact = replicate(
    n_rows,
    paste(sample(factors, sample(1:factor.num, 1)), collapse = ", ")
  )
)
df1

# Split every row into its individual labels and strip the padding that
# surrounds each one, so " b" and "b" are treated as the same label.
row_labels <- lapply(
  strsplit(as.character(df1$fact), ",", fixed = TRUE),
  trimws
)

# One-hot encode: one row per observation, one 0/1 column per label, in the
# order given by `factors`. Membership is tested with `%in%`, which compares
# whole labels (no regex built from label text) and never returns NA.
# When the label set is not known up front, it can be derived from the data
# with unique(unlist(row_labels)).
label_hits <- unlist(
  lapply(row_labels, function(labels) factors %in% labels)
)
# The hits are laid out row by row, hence byrow = TRUE. Building the matrix
# with explicit dimensions keeps the shape right even for a single row or a
# single label.
final_df <- as.data.frame(
  matrix(
    as.numeric(label_hits),
    nrow = nrow(df1),
    ncol = length(factors),
    byrow = TRUE
  )
)
colnames(final_df) <- paste0("all_", factors)
final_df