在package data.table中使用fread一次读取数据块_R_Data.table_Fread

使用 data.table 包中的 fread 分块读取数据

在package data.table中使用fread一次读取数据块,r,data.table,fread,R,Data.table,Fread,我试图使用packagedata.table中的fread函数输入一个以制表符分隔的大文件（大约2GB）。然而，因为它太大了，所以不能完全放在内存中。我尝试使用skip和nrow参数将其分块输入，例如： chunk.size = 1e6 done = FALSE chunk = 1 while(!done) { temp = fread("myfile.txt",skip=(chunk-1)*chunk.size,nrow=chunk.size-1) #do something

我试图使用 data.table 包中的 fread 函数读入一个以制表符分隔的大文件（大约 2GB）。然而，由于文件太大，无法完全放入内存。我尝试使用 skip 和 nrow 参数将其分块读入，例如：

# Number of lines the skip offset advances on each pass through the loop.
chunk.size = 1e6
# Loop flag; set to TRUE below to end the while loop.
done = FALSE
# 1-based index of the chunk currently being read.
chunk = 1
while(!done)
{
    # Skip the (chunk-1)*chunk.size lines covered by earlier passes, then ask
    # fread for chunk.size-1 rows.
    # NOTE(review): skip advances by chunk.size but only chunk.size-1 rows are
    # requested -- presumably leaving one line per chunk for a header; confirm.
    temp = fread("myfile.txt",skip=(chunk-1)*chunk.size,nrow=chunk.size-1)
    # Placeholder: process the current chunk held in temp.
    chunk = chunk + 1
    # Stop once a read returns fewer than 2 rows.
    if(nrow(temp)<2) done = TRUE
}

chunk.size = 1e6
done = FALSE
chunk = 1
while(!done)
{
    temp = fread("myfile.txt",skip=(chunk-1)*chunk.size,nrow=chunk.size-1)
    #do something to temp
    chunk = chunk + 1
    if(nrow(temp)<2) done = TRUE
}

您应该使用 LaF 包。它在您的数据上引入了一种指针，从而避免了（对于非常庞大的数据而言）一次读入整个文件的麻烦。据我所知，data.table 包中的 fread() 需要知道总行数，这对于 GB 级的数据来说很耗时。
使用 LaF 中的指针，您可以跳转到想要的任意行；读入一个数据块，对其应用您的函数，然后继续处理下一个数据块。在我的小型电脑上，我以每次 10e6 行的步长处理了一个 25 GB 的 csv 文件，并提取出所需的全部约 5e6 条观测值——每个 10e6 行的数据块耗时 30 秒。
更新：
library(LaF)

# Location of the large delimited text file to work with.
huge_file <- "C:/datasets/protein.links.v9.1.txt"

# Detect a data model for the file, treating it as space-separated with a
# header line.
model <- detect_dm_csv(
  huge_file,
  sep = " ",
  header = TRUE
)

现在 res 中包含了过滤后的人类（human）数据。更棒的是，对于更复杂的操作（例如在读取过程中动态计算数据），函数 process_blocks() 可以接受一个函数作为参数。因此，您可以在该函数中对每个数据块执行任何想要的操作。请阅读相关文档。
您可以使用 readr 的 read_*_chunked 函数读取数据，并（例如）按块过滤数据。示例如下：
# Chunk callback that keeps only the rows whose `gear` column equals 3
# (cars with 3 gears). `pos` is accepted as the second argument but unused.
f <- function(x, pos) {
  subset(x, gear == 3)
}
# Read the mtcars.csv example file five rows at a time, handing each chunk
# to `f` through a DataFrameCallback.
read_csv_chunked(
  file = readr_example("mtcars.csv"),
  callback = DataFrameCallback$new(f),
  chunk_size = 5
)

#三档汽车
另一个相关的选择是 chunked 包。下面是一个处理 3.5 GB 文本文件的示例：
library(chunked)
library(tidyverse)

# Goal: inspect the daily page views of Wikipedia articles from before 2015.
# The zipped log files are available from
# https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# The download is a bz file; unzipping it gives the file below.
my_file <- "pagecounts-2012-12-14/pagecounts-2012-12-14"

# Report the file size in gigabytes (bytes / 2^30), rounded to 3 decimals.
print(
  paste(
    round(file.info(my_file)$size / 2^30, 3),
    "gigabytes"
  )
)
# [1] "3.493 gigabytes" -- too big to open in Notepad++,
# but it can be read with 010 Editor.

# Peek at the first 100 lines of the file.
readLines(my_file, n = 100)

# Vary the `skip` value to find the line where the content starts.
read.table(my_file, nrows = 10, skip = 25)

library(chunked)
library(tidyverse)
# 我想查看维基百科文章的每日页面浏览量
# （2015 年之前）……可以从以下地址获得压缩的日志文件：
# https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# 下载得到 bz 文件，解压后得到：
my_file <- 'pagecounts-2012-12-14/pagecounts-2012-12-14'

fread() 绝对可以帮助您按块读取数据。
您在代码中犯的错误是：在循环中改变函数的 skip 参数时，应当保持 nrow 不变。
下面是我为自己的数据写的代码：
# Read the first 21 blocks of 10000 rows from "my_data.csv", keeping columns
# 1 to 100, and store block i (0-based) as data[[i + 1]].
# The list is preallocated so it is not regrown on every iteration.
data <- vector("list", 21L)

for (i in 0:20) {
  # `nrows` stays constant; only `skip` moves forward by one block per pass.
  # The argument is spelled out in full (`nrows`) instead of relying on
  # partial matching of `nrow`.
  # NOTE(review): `skip` counts raw lines, so if the file starts with a header
  # line the blocks after the first begin on a data line -- confirm how the
  # header and block boundaries should be handled for your file.
  data[[i + 1]] <- fread(
    "my_data.csv",
    nrows = 10000,
    select = 1:100,
    skip = 10000 * i
  )
}




您可以在循环中插入以下代码，以查看运行速度：

# Timestamp taken right before the code being measured.
start_time <- Sys.time()

# ... put the code to be timed here ...

# Timestamp taken right after; the difference is the elapsed time.
end_time <- Sys.time()
difftime(end_time, start_time)



然后将列表中的各个数据块合并成一个大数据集。
希望我的回答能对您的问题有所帮助。
我用这种方法加载了一个 18 GB、包含 2k+ 列和 20 万行的数据，大约用了 8 分钟。
有一个类似的功能请求（FR），我也会把这篇帖子链接过去。
感谢你指出这一点并附上链接！看起来这是一个最高优先级的 FR。
我想做同样的事情，我认为这需要一个新的请求。
今天遇到了同样的问题。
@Arun 新的 github 页面上有这个 FR 吗？我似乎找不到。
是的，请更新。
谢谢。我有一个 872493862 行、61GB 的文件，它运行得相当快。我曾用 fread() 配合 “nrows” 和 “skip” 尝试同样的循环方法，但每次循环都越来越慢，因为它必须跳过更多的行。
看起来 chunked 是 LaF 的一个包装器。
是的，不过它对用户更友好。
我尝试了这种方法，但对于我的 61GB 文件来说太慢了。
你是我的英雄，我之前用的参数是 rnows，但应该是 nrow。
library(chunked)
library(tidyverse)

# Goal: inspect the daily page views of Wikipedia articles from before 2015.
# The zipped log files are available from
# https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# The download is a bz file; unzipping it gives the file below.
my_file <- "pagecounts-2012-12-14/pagecounts-2012-12-14"

# Report the file size in gigabytes (bytes / 2^30), rounded to 3 decimals.
print(
  paste(
    round(file.info(my_file)$size / 2^30, 3),
    "gigabytes"
  )
)
# [1] "3.493 gigabytes" -- too big to open in Notepad++,
# but it can be read with 010 Editor.

# Peek at the first 100 lines of the file.
readLines(my_file, n = 100)

# Vary the `skip` value to find the line where the content starts.
read.table(my_file, nrows = 10, skip = 25)

# Set up a chunk-wise read (5000 rows per chunk, skipping the first 30 lines)
# and keep only the lines containing "Gun_control". The main challenge was
# identifying the column header used in the filter.
df <- read_chunkwise(
  my_file,
  chunk_size = 5000,
  skip = 30,
  format = "table",
  header = TRUE
) %>%
  filter(stringr::str_detect(De.mw.De.5.J3M1O1, "Gun_control"))

# This call performs the evaluation and takes a few moments.
system.time(out <- collect(df))

# Tidy the result: split the single column into V1..V4 on spaces and convert
# the page-view count (V3) to numeric.
out_df <- out %>%
  separate(
    De.mw.De.5.J3M1O1,
    into = str_glue("V{1:4}"),
    sep = " "
  ) %>%
  mutate(V3 = as.numeric(V3))

head(out_df)
    V1                                                        V2   V3
1 en.z                                               Gun_control 7961
2 en.z Category:Gun_control_advocacy_groups_in_the_United_States 1396
3 en.z          Gun_control_policy_of_the_Clinton_Administration  223
4 en.z                            Category:Gun_control_advocates   80
5 en.z                         Gun_control_in_the_United_Kingdom   68
6 en.z                                    Gun_control_in_america   59
                                                                                 V4
1 A34B55C32D38E32F32G32H20I22J9K12L10M9N15O34P38Q37R83S197T1207U1643V1523W1528X1319
2                                     B1C5D2E1F3H3J1O1P3Q9R9S23T197U327V245W271X295
3                                     A3B2C4D2E3F3G1J3K1L1O3P2Q2R4S2T24U39V41W43X40
4                                                            D2H1M1S4T8U22V10W18X14
5                                                             B1C1S1T11U12V13W16X13
6                                                         B1H1M1N2P1S1T6U5V17W12X12

#--------------------

# Read the first 21 blocks of 10000 rows from "my_data.csv", keeping columns
# 1 to 100, and store block i (0-based) as data[[i + 1]].
# The list is preallocated so it is not regrown on every iteration.
data <- vector("list", 21L)

for (i in 0:20) {
  # `nrows` stays constant; only `skip` moves forward by one block per pass.
  # The argument is spelled out in full (`nrows`) instead of relying on
  # partial matching of `nrow`.
  # NOTE(review): `skip` counts raw lines, so if the file starts with a header
  # line the blocks after the first begin on a data line -- confirm how the
  # header and block boundaries should be handled for your file.
  data[[i + 1]] <- fread(
    "my_data.csv",
    nrows = 10000,
    select = 1:100,
    skip = 10000 * i
  )
}

# Timing template: wrap the code to be measured between the two timestamps.
start_time <- Sys.time()
# ... put the code to be timed here ...
end_time <- Sys.time()

end_time - start_time

# Stack all blocks into one table in a single call. Columns are matched by
# position (use.names = FALSE), as in the original pairwise rbind() loop, but
# without re-copying the accumulated table on every iteration.
new_data <- data.table::rbindlist(data, use.names = FALSE)