SQL: concatenating many tsv files into a single database table while keeping track of file source (MonetDBLite)
I am creating a MonetDB database with the MonetDBLite R package. I can create a database table using the instructions, with code like this:
library(DBI)
library(MonetDBLite)
# Write tsv file of mtcars
write.table(mtcars, "mtcars.tsv", row.names=FALSE, sep= "\t")
# Initialize MonetDB
dbdir <- "/Users/admin/my_directory"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# Write table
dbWriteTable(con, "test4", "mtcars.tsv", delim="\t")
So far so good. But suppose I have another file, mtcars2, with different mpg values:
mtcars2 <- mtcars
mtcars2$mpg <- mtcars2$mpg + 5
write.table(mtcars2, "mtcars2.tsv", row.names= FALSE, sep = "\t")
Also fine. But here is my problem: later I want to look up the mpg of all cars with 6 cylinders, and know whether each value came from the mtcars or the mtcars2 dataset. From what I understand of SQL indexing (which isn't much), and basically everything I've read, I should put all the data in a single table for the most efficient searching. I tried loading the first tsv file and then adding another column with the ALTER TABLE test4 ADD dataset TEXT and UPDATE SQL commands -
dbSendQuery(con, "UPDATE test4 SET dataset = dataset1")  # fails: unquoted dataset1 is parsed as a column name
dbSendQuery(con, "UPDATE test4 SET dataset = 1")         # works
> dbGetQuery(con, "SELECT * FROM test4 LIMIT 3")
mpg cyl disp hp drat wt qsec vs am gear carb dataset
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 1
3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1
But when I tried to append mtcars2 to the table, it had a different number of columns than the table - duh. What is the best way to concatenate data from multiple tsv files with identical columns into a single table, while keeping track of the data source?
EDIT - As you may have guessed, the real data isn't mtcars - it's flat tsv files millions of rows long, which means I want to avoid reading the whole file into memory and manipulating it in R.

You should be able to create a new variable in the data.frame after reading the file, and then do dbWriteTable. Something like:
library(DBI)
library(MonetDBLite)
library(data.table)
# Write tsv file of mtcars
tmp <- tempfile()
write.table(mtcars, tmp, row.names=FALSE, sep= "\t")
# Initialize MonetDB
dbdir <- "~/Desktop/temp"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
test4df <- fread(tmp)
test4df$dataset <- 1
dbWriteTable(con, "test4", test4df)
dbReadTable(con, "test4")
test5df <- fread(tmp)
test5df$mpg <- test5df$mpg + 5
test5df$dataset <- 2
dbWriteTable(con, "test4", test5df, append = TRUE)
dbReadTable(con, "test4")
Following his suggestion, and given that my data is 10 files with millions of rows each, I solved the problem using only SQL commands - and it's faster than the bash commands:
library(DBI)
library(MonetDBLite)
# Write tsv file of mtcars
write.table(mtcars, "mtcars.tsv", row.names=FALSE, sep= "\t")
# Write tsv of second mtcars
mtcars2 <- mtcars
mtcars2$mpg <- mtcars2$mpg + 5
write.table(mtcars2, "mtcars2.tsv", row.names= FALSE, sep = "\t")
# Initialize MonetDB
dbdir <- "/Users/admin/"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# Write table
dbWriteTable(con, "test4", "mtcars.tsv", delim="\t")
# Add data source information
dbSendQuery(con, "ALTER TABLE test4 ADD source TEXT")
dbSendQuery(con, "UPDATE test4 SET source = 'dataset1'")
# Write second dataset to a temporary table
dbWriteTable(con, "temptable", "mtcars2.tsv", delim="\t")
# Add data source information
dbSendQuery(con, "ALTER TABLE temptable ADD source TEXT")
dbSendQuery(con, "UPDATE temptable SET source = 'dataset2'")
# Insert temp table into main table
dbSendQuery(con, "INSERT INTO test4 SELECT * FROM temptable")
# Drop temp table
dbSendQuery(con, "DROP TABLE temptable")
# Checking the data, truncated for clarity
> dbGetQuery(con, "SELECT * FROM test4")
mpg cyl disp hp drat wt qsec vs am gear carb source
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 dataset1
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 dataset1
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 dataset1
...
33 26.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 dataset2
34 26.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 dataset2
35 27.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 dataset2
...
64 26.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 dataset2
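With the source column in place, the lookup from the original question (the mpg of every 6-cylinder car, together with which dataset it came from) is a plain filtered SELECT; a minimal sketch, assuming the con and test4 objects from above:

```r
# mpg and data source for all 6-cylinder cars
six_cyl <- dbGetQuery(con, "SELECT mpg, source FROM test4 WHERE cyl = 6")
head(six_cyl)
```

Since everything lives in one table, the same pattern extends to any filter or aggregate without joining across per-file tables.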
Sorry if I didn't make it clear enough in my question that my data is much larger than mtcars - if you have medium-sized data, the data.table package may well be a better solution than a database. If you have a set of files with the same structure, and you want one final table that combines the data from all of the files along with their file names, I would do it like this:
# say we have those files
write.table(mtcars, "mtcars1.tsv", row.names=FALSE, sep= "\t")
write.table(mtcars, "mtcars2.tsv", row.names=FALSE, sep= "\t")
# write them individually, and add a column that contains the file name
dbWriteTable(con, "mtcars1", "mtcars1.tsv", delim="\t")
dbSendQuery(con, "ALTER TABLE mtcars1 ADD COLUMN file STRING DEFAULT 'mtcars1.tsv';")
dbWriteTable(con, "mtcars2", "mtcars2.tsv", delim="\t")
dbSendQuery(con, "ALTER TABLE mtcars2 ADD COLUMN file STRING DEFAULT 'mtcars2.tsv';")
# now combine into a new table
dbSendQuery(con, "CREATE TABLE mtcars_mat AS SELECT * FROM mtcars1 UNION ALL SELECT * FROM mtcars2")
# or a view if you don't need to modify the data in the mtcars table (faster)
dbSendQuery(con, "CREATE view mtcars AS SELECT * FROM mtcars1 UNION ALL SELECT * FROM mtcars2")
# and here is the same as a loop with a filename glob and some added robustness (handy if you have 1000 files)
files <- Sys.glob("/some/path/mtcars*.tsv")
tables <- dbQuoteIdentifier(con, tools::file_path_sans_ext(basename(files)))
dbBegin(con)
for (i in 1:length(files)) {
dbWriteTable(con, tables[i], files[i], delim="\t", transaction=FALSE)
dbSendQuery(con, paste0("ALTER TABLE ", tables[i], " ADD COLUMN file STRING DEFAULT ",dbQuoteString(con, files[i]),";"))
}
dbSendQuery(con, paste0("CREATE TABLE somefinalresult AS ", paste0("SELECT * FROM ",tables, collapse=" UNION ALL ")))
# remove the parts again, optional
dbSendQuery(con, paste0("DROP TABLE ", tables, ";", collapse=" "))
dbCommit(con)
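Once the parts are combined, the file column can be used to filter or aggregate by origin; for example (assuming the somefinalresult table created by the loop above):

```r
# rows contributed by each source file
dbGetQuery(con, "SELECT file, COUNT(*) AS n FROM somefinalresult GROUP BY file")
```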
The logical process: after importing the two datasets, alter each table to define a source field. Then update that field for all records in both tables to the desired value. Then select from one table and insert all of its records into the other. Finally, drop the second, no-longer-needed table. Alternatively, write a view that combines the two tables and adds a column - but it won't be as efficient, as you already pointed out...

Great! That makes sense - if I can get it working this way, I'll post the code.

Yes, for mtcars-sized data this is the best solution, if for some reason it needs to be in a database. But my real data isn't mtcars-sized - it's text files millions of rows long, which means we want to avoid reading whole files into memory as much as possible. First I'll try fread from data.table, which should be more memory-efficient. If that doesn't work, I'll try the second possible solution.

Unfortunately, we went the MonetDBLite route because fread was too slow for the number and size of the files we're analyzing. Thanks for the suggestion, though - the data.table package really is great for data of a certain size.

Please read my comment/answer carefully. As I said in an earlier comment, the suggestion in my edited answer does not rely on data.table, and writes a table into the database without opening the file.

Yes, I also saw that you used bash commands in your edit. For us, SQL commands are better than bash because we want to write code that works cross-platform. As far as I know bash is also slower than the SQL commands, but I may be wrong.

Yes! Processing 32 files of 4.9 million rows by 11 columns took about 15 minutes. That's probably as good as I'll get within an R framework. Thanks!

Excellent! Be sure to accept your own answer :P I would have named the source column after the file name, but that's just me... or maybe you want to obfuscate.

Aha, it's unpublished data - I may be too paranoid, but people don't want to get scooped in research. Stack Overflow says I have to wait until tomorrow to accept my own answer, but I'll definitely do it then.

For reference, here is the line-by-line preprocessing approach that appends a dataset column to the file before loading it, so the whole file is never held in memory at once:
infile <- tmp
outfile <- tempfile()
# open connections
incon <- file(description = infile, open = "r")
outcon <- file(description = outfile, open = "w")
# count the number of lines (wc will work only on Mac/Linux)
com <- paste("wc -l ", infile, " | awk '{ print $1 }'", sep="")
n <- as.numeric(system(command=com, intern=TRUE))
# work with the first line
txt <- scan(file = incon, what = character(), nlines=1, quiet=TRUE)
txt <- c(txt, "dataset")
cat(paste(txt, collapse = "\t"), "\n", file = outcon, sep = "")
# work with the rest of the file
for(i in 2:n) {
txt <- scan(file = incon, what = character(), nlines=1, quiet=TRUE)
txt <- c(txt, "1")
cat(paste(txt, collapse = "\t"), "\n", file = outcon, sep = "")
}
close(incon);close(outcon)
dbWriteTable(con, "test4", outfile, delim = "\t")
# do the similar for other files