好的,我按照 @DirkEddelbuettel 的建議寫了一些包裝函數。歡迎提出意見。
#' Write a table via RSQLite with factor levels stored in companion tables
#'
#' Every factor column of \code{value} is converted to character before the
#' main table is written. For each such column a companion table named
#' \code{paste0(name, factorName, <column>)} is written (with
#' \code{overwrite = TRUE}) holding two columns: \code{levels} (the level
#' labels, in order) and \code{levelKey} (their 1-based position).
#'
#' Note: when \code{value} is a data.table the factor columns are converted
#' with \code{set()}, i.e. by reference, so the caller's object is modified
#' too. Pass \code{copy(value)} if that is not wanted.
#'
#' Note: the companion tables are always overwritten with the levels of the
#' current \code{value}, even when \code{append = TRUE} is passed via
#' \code{...} for the main table.
#'
#' @param conn The connection object (created with e.g. dbConnect); must
#'   inherit from "SQLiteConnection".
#' @param name The name of the table to write (a single string).
#' @param value The data.frame (or data.table) to write to the database.
#' @param factorName The infix used to build the names of the companion
#'   tables (e.g. with factorName "_factor_", a factor column "color" and
#'   name "mytable", the levels go to the table mytable_factor_color).
#'   Must not contain a period.
#' @param \dots Options passed along to the dbWriteTable call for the main
#'   table only (e.g. append=TRUE).
#' @return Whatever the final dbWriteTable call returns (originally
#'   documented as a boolean indicating whether the write was successful).
dbWriteFactorTable <- function(conn, name, value, factorName="_factor_", ...) {
  # Validate inputs
  stopifnot(inherits(conn, "SQLiteConnection"))
  stopifnot(is.character(name), length(name) == 1L)
  stopifnot(is.data.frame(value))
  stopifnot(is.character(factorName), length(factorName) == 1L)
  if (grepl("[.]", factorName)) {
    stop("factorName must use valid characters for SQLite")
  }

  # Always defined, so the plain data.frame path no longer trips over an
  # unset flag.
  isDataTable <- inherits(value, "data.table")

  # is.factor() also catches ordered factors and is safe for columns whose
  # class vector has more than one element (e.g. POSIXct).
  factorCols <- names(value)[vapply(value, is.factor, logical(1))]

  if (length(factorCols) > 0) {
    for (cn in factorCols) {
      lv <- levels(value[[cn]])
      # seq_along() keeps this correct for a factor with zero levels.
      factorTable <- data.frame(
        levels = lv,
        levelKey = seq_along(lv),
        stringsAsFactors = FALSE
      )
      dbWriteTable(
        conn = conn,
        name = paste0(name, factorName, cn),
        value = factorTable,
        row.names = FALSE,
        overwrite = TRUE
      )
      if (isDataTable) {
        # In-place conversion avoids copying a large data.table.
        set(x = value, j = cn, value = as.character(value[[cn]]))
      }
    }
    if (!isDataTable) {
      value[factorCols] <- lapply(value[factorCols], as.character)
    }
  } else {
    warning("No factor columns detected.")
  }

  dbWriteTable(conn = conn, name = name, value = value, ...)
}
#' Read a table via RSQLite and restore factors from companion tables
#'
#' Runs \code{SELECT * FROM <name> <query>} and then, for every table in the
#' database whose name starts with \code{paste0(name, factorName)}, treats
#' the remainder of that table name as a column name. If the result contains
#' a column of that name, it is converted to a factor whose levels are read
#' from the companion table's \code{levels} column, ordered by
#' \code{levelKey}.
#'
#' \code{name} and \code{query} are pasted into the SQL statement verbatim,
#' so they must come from a trusted source.
#'
#' @param conn The connection object (created with e.g. dbConnect); must
#'   inherit from "SQLiteConnection".
#' @param name The name of the table to read (a single string).
#' @param query A character string containing SQL to be appended onto the
#'   query (e.g. "WHERE x==3").
#' @param dt Whether to return a data.table vs. a plain-old data.frame.
#' @param factorName The infix used to build the names of the companion
#'   tables (e.g. with factorName "_factor_", a factor column "color" and
#'   name "mytable", the levels are expected in mytable_factor_color).
#'   Must not contain a period.
#' @param \dots Options to pass along to the dbGetQuery call for the main
#'   table.
#' @return A data.table or data.frame
dbReadFactorTable <- function(conn, name, query="", dt=TRUE, factorName="_factor_", ...) {
  # Validate inputs
  stopifnot(inherits(conn, "SQLiteConnection"))
  stopifnot(is.character(name), length(name) == 1L)
  stopifnot(is.character(factorName), length(factorName) == 1L)
  if (grepl("[.]", factorName)) {
    stop("factorName must use valid characters for SQLite")
  }

  # Read main table
  value <- dbGetQuery(conn, paste("SELECT * FROM", name, query), ...)
  if (dt) {
    value <- as.data.table(value)
  }

  # Find companion tables by literal prefix. A regex built from `name` would
  # also match tables of other base names (e.g. "mytest_factor_g3" for name
  # "test") and would misread regex metacharacters in `name`.
  prefix <- paste0(name, factorName)
  tables <- dbListTables(conn)
  isCompanion <- startsWith(tables, prefix) & nchar(tables) > nchar(prefix)
  factorCols <- substring(tables[isCompanion], nchar(prefix) + 1L)
  # Ignore stale companion tables whose column is not in the result.
  factorCols <- intersect(factorCols, colnames(value))

  if (length(factorCols) > 0) {
    for (cn in factorCols) {
      # Quote the identifier so column names with spaces or dots still work.
      fctNm <- paste0('"', gsub('"', '""', paste0(prefix, cn)), '"')
      # ORDER BY makes the level order explicit rather than relying on the
      # row order SQLite happens to return.
      factorTable <- dbGetQuery(
        conn,
        paste0("SELECT * FROM ", fctNm, " ORDER BY levelKey")
      )
      restored <- factor(value[[cn]], levels = factorTable$levels)
      if (dt) {
        set(x = value, j = cn, value = restored)
      } else {
        value[[cn]] <- restored
      }
    }
  } else {
    warning("No factor columns detected.")
  }

  value
}
以下是一個簡單的例子:
# Example: round-trip a table with a factor column (g3) through SQLite.
library(RSQLite)     # dbConnect(), SQLite(), dbExistsTable(), dbRemoveTable()
library(data.table)  # as.data.table()

# NOTE: assumes the directory ~/temp already exists.
db <- dbConnect(SQLite(), dbname="~/temp/test.sqlite")
set.seed(1)
n <- 1000
testDat <- data.frame(
  key = seq(n),
  x = runif(n),
  y = runif(n),
  g1 = sample(letters[1:10], n, replace=TRUE),
  g2 = rep(letters[1:10], each=n/10),
  g3 = factor(sample(letters[1:10], n, replace=TRUE))
)

# Start from a clean slate so the write below does not hit an existing table.
if (dbExistsTable(db, "test")) dbRemoveTable(db, "test")

dbWriteFactorTable(conn = db, name = "test", value = as.data.table(testDat), row.names=FALSE)

# Read everything back, then only the rows where g3 is 'a'.
dbReadFactorTable(conn = db, name = "test")
dbReadFactorTable(conn = db, name = "test", query="WHERE g3=='a'")

# Release the database connection.
dbDisconnect(db)
附註:我想你應該知道,[在 data.table 中作爲鍵時,字符型往往比因子更受青睞](http://stackoverflow.com/questions/18304760)。與因子對應的 dbType 是什麼?整數嗎?我的意思是,應該存儲列類型信息,以便在讀取表格時保留因子。 – agstudy
據我所知,目前沒有辦法在 RSQLite 或任何基於 DBI 的軟件包中做到這一點。難點在於如何存儲有關因子的元數據。另外,將它們存儲爲字符的缺點並不大。 – hadley
@hadley 我擔心的是,如果我在數據框的字符串列中增加了一個額外的(且按字母順序排在前面的)水平,那麼參照類別可能會發生變化。至少有可能返回錯誤的因子。 –