根据标题的起始位置将多个CSV读入R并编译_R_Tidyverse

根据标题的起始位置将多个CSV读入R并编译

根据标题的起始位置将多个CSV读入R并编译,r,tidyverse,R,Tidyverse,我试图读入几个CSV，它们的标题从不同的行开始，然后将它们映射到一个数据帧中。我尝试了这里提供的代码，但无法使函数正常工作以下是两个DFs示例： file1 <- structure(list(X..Text = c("# Text", "#", "agency_cd", "5s", "USGS", "USGS"), X = c("", "", "site_no", "15s", "4294000", "4294000" ), X.1 = c("", "", "datetime",

我试图读入几个CSV，它们的标题从不同的行开始，然后将它们映射到一个数据帧中。我尝试了这里提供的代码，但无法使函数正常工作

以下是两个DFs示例：

# Example input 1: mimics a CSV whose real header is not on the first row.
# Two "#" rows come first, then the header row ("agency_cd", ...), then a row
# of codes ("5s", "15s", ...), then two data rows. Everything is character.
file1 <- data.frame(
  X..Text = c("# Text", "#", "agency_cd", "5s", "USGS", "USGS"),
  X = c("", "", "site_no", "15s", "4294000", "4294000"),
  X.1 = c("", "", "datetime", "20d", "6/24/13 0:00", "6/24/13 0:15"),
  X.2 = c("", "", "tz_cd", "6s", "EDT", "EDT"),
  X.3 = c("", "", "Gage height", "14n", "1.63", "1.59"),
  # The leading space in " Discharge" is part of the sample data
  X.4 = c("", "", " Discharge", "14n", "1310", "1250"),
  stringsAsFactors = FALSE
)

# Example input 2: same layout as the first example, but with three leading
# "#" rows, so the header row ("agency_cd", ...) sits one line lower.
file2 <- data.frame(
  X..Text = c("# Text", "# Text", "#", "agency_cd", "5s", "USGS", "USGS"),
  X = c("", "", "", "site_no", "15s", "4294002", "4294002"),
  X.1 = c("", "", "", "datetime", "20d", "6/24/13 0:00", "6/24/13 0:15"),
  X.2 = c("", "", "", "tz_cd", "6s", "EDT", "EDT"),
  X.3 = c("", "", "", "Gage height", "14n", "1.63", "1.59"),
  # The leading space in " Discharge" is part of the sample data
  X.4 = c("", "", "", " Discharge", "14n", "1310", "1250"),
  stringsAsFactors = FALSE
)

使用问题中给出的 file1，先将其转换为文本行 Lines1，然后如下所示用 read.table 读取；file2 的处理方式类似。

# Serialise file1 to text lines. A tab separator is used because the default
# single-space separator splits values that contain spaces themselves
# ("6/24/13 0:00", "Gage height"), which shifts every following column.
Lines1 <- capture.output(
  write.table(file1, stdout(), sep = "\t", row.names = FALSE, quote = FALSE)
)

# Line number of the real header row; only the first match is used so that
# the index arithmetic below always works on a single number.
ix <- grep("agency", Lines1)[1L]
if (is.na(ix)) {
  stop("no header line containing 'agency' found in Lines1", call. = FALSE)
}

# Drop every line above the header and the single line directly below it,
# then parse the rest. strip.white = TRUE removes the stray blanks around
# tab-separated fields (e.g. the leading space in " Discharge").
DF1 <- read.table(
  text = Lines1[-c(seq_len(ix - 1L), ix + 1L)],
  header = TRUE,
  sep = "\t",
  strip.white = TRUE
)

更新
修复。
下面是一个基础 R 的解决方案：先找出每个文件中标题行所在的行号，然后在循环中逐个读取目录下的文件。
    # Column names expected in the files (kept for reference only)
    # columnnames <- c("agency_cd", "site_no", "datetime", "tz_cd", "Gage height", "Discharge")

    # Files in the working directory whose names match file<digit>.csv
    fname <- dir(pattern = "file[0-9]\\.csv")

    # Read every file, starting each one at its own header row
    dfs <- lapply(fname, function(f) {
      # Line number(s) containing the header marker
      headerline <- grep("agency_cd", readLines(f), fixed = TRUE)
      if (length(headerline) == 0L) {
        stop("no 'agency_cd' header line found in ", f, call. = FALSE)
      }
      # `skip` must be a single number, so only the first match is used.
      # Skipping the lines above the header lets read.csv() take the column
      # names from it; the row right below the header is read as a data row.
      read.csv(f, skip = headerline[1L] - 1L, stringsAsFactors = FALSE)
    })

    # Stack the per-file data frames; rbind() aligns columns by name
    finalanswer <- do.call(rbind, dfs)

> finalanswer
#  agency_cd site_no     datetime tz_cd Gage.height Discharge
#        5s     15s          20d    6s         14n       14n
#      USGS 4294000 6/24/13 0:00   EDT        1.63      1310
#      USGS 4294000 6/24/13 0:15   EDT        1.59      1250
#        5s     15s          20d    6s         14n       14n
#      USGS 4294002 6/24/13 0:00   EDT        1.63      1310
#      USGS 4294002 6/24/13 0:15   EDT        1.59      1250

#定义列名
#columnnames
我找到了两种解决方案。第一种沿用了 @Dave2e 解决方案的大部分内容，但我没有使用 do.call(rbind, dfs) 将所有数据框合并为一个，而是使用了 dplyr::bind_rows()。do.call(rbind, dfs) 不起作用，因为我的标题列有时名称略有不同，这会导致如下错误：Error in match.names(clab, names(xi)) : names do not match previous names。dplyr::bind_rows() 对不同的列名更加灵活。出于个人偏好，我还使用 readr::read_csv 代替了 read.csv。
# First solution, based largely on @Dave2e's solution
library(tidyverse)

# Path to the data
data_path <- "Data/RACC_2012-2016/discharge"
# All CSV file names below data_path. `pattern` is a regular expression, not
# a glob, so the extension is matched with an escaped dot and an end anchor.
file_names <- dir(data_path, pattern = "\\.csv$", recursive = TRUE)

# Read every file and combine the results into one data frame
dfs <- lapply(file.path(data_path, file_names), function(f) {
  # Line number(s) containing the header marker
  headerline <- grep("agency_cd", readLines(f), fixed = TRUE)
  if (length(headerline) == 0L) {
    stop("no 'agency_cd' header line found in ", f, call. = FALSE)
  }
  # Start reading at the header row (first match only, `skip` is a single
  # number). Every column is read as character so the files bind cleanly.
  read_csv(f, col_types = cols(.default = "c"), skip = headerline[1L] - 1)
}) %>%
  # Bind all into one data frame; columns are matched by name
  bind_rows() %>%
  # Drop the row below each header row, which does not contain data
  dplyr::filter(agency_cd != "5s") %>%
  # "Gage Height" and "Gage height" both occur as column names; rename them
  # to syntactic names first so they are easier to refer to
  rename(Gage_height = "Gage Height", Gage_height2 = "Gage height") %>%
  # Take the first non-missing value of the two variants, then drop the spare
  mutate(Gage_height = coalesce(Gage_height, Gage_height2)) %>%
  select(-Gage_height2)

谢谢大家的帮助！我还得到了以下方面的帮助：
和
谢谢！这可能行得通。问题是有很多文件，我不能确定列的顺序是否总是相同的。例如，我注意到在一些文件中，“Gage height”和“Discharge”两列是交换的，即“Discharge”在前。有什么方法可以确保合并 dfs 时各列正确匹配吗？使用 read.csv(f, skip = headerline-1, stringsAsFactors = FALSE) 确实消除了 do.call(rbind, dfs) 过程中抛出的一些错误。但是，正如所怀疑的，由于“Gage height”和“Discharge”列的顺序并不总是相同，Gage height 的值有时会落到 Discharge 列中，反之亦然。@dkingaid，抱歉，需要删除重命名列的那一行。上面的代码已经过测试，应该可以解决这个问题。你知道如何用文件名列表来循环吗？lapply（文件名，函数（文件名）{行当我运行你稍微修改过的代码时，会出现这个错误：lapply（文件名，文件名），函数（文件名）{+行
> DF1
  agency_cd site_no datetime tz_cd Gage height Discharge
1      USGS 4294000  6/24/13  0:00  EDT   1.63      1310
2      USGS 4294000  6/24/13  0:15  EDT   1.59      1250

    # Column names expected in the files (kept for reference only)
    # columnnames <- c("agency_cd", "site_no", "datetime", "tz_cd", "Gage height", "Discharge")

    # Files in the working directory whose names match file<digit>.csv
    fname <- dir(pattern = "file[0-9]\\.csv")

    # Read every file, starting each one at its own header row
    dfs <- lapply(fname, function(f) {
      # Line number(s) containing the header marker
      headerline <- grep("agency_cd", readLines(f), fixed = TRUE)
      if (length(headerline) == 0L) {
        stop("no 'agency_cd' header line found in ", f, call. = FALSE)
      }
      # `skip` must be a single number, so only the first match is used.
      # Skipping the lines above the header lets read.csv() take the column
      # names from it; the row right below the header is read as a data row.
      read.csv(f, skip = headerline[1L] - 1L, stringsAsFactors = FALSE)
    })

    # Stack the per-file data frames; rbind() aligns columns by name
    finalanswer <- do.call(rbind, dfs)

> finalanswer
#  agency_cd site_no     datetime tz_cd Gage.height Discharge
#        5s     15s          20d    6s         14n       14n
#      USGS 4294000 6/24/13 0:00   EDT        1.63      1310
#      USGS 4294000 6/24/13 0:15   EDT        1.59      1250
#        5s     15s          20d    6s         14n       14n
#      USGS 4294002 6/24/13 0:00   EDT        1.63      1310
#      USGS 4294002 6/24/13 0:15   EDT        1.59      1250

# First solution, based largely on @Dave2e's solution
library(tidyverse)

# Path to the data
data_path <- "Data/RACC_2012-2016/discharge"
# All CSV file names below data_path. `pattern` is a regular expression, not
# a glob, so the extension is matched with an escaped dot and an end anchor.
file_names <- dir(data_path, pattern = "\\.csv$", recursive = TRUE)

# Read every file and combine the results into one data frame
dfs <- lapply(file.path(data_path, file_names), function(f) {
  # Line number(s) containing the header marker
  headerline <- grep("agency_cd", readLines(f), fixed = TRUE)
  if (length(headerline) == 0L) {
    stop("no 'agency_cd' header line found in ", f, call. = FALSE)
  }
  # Start reading at the header row (first match only, `skip` is a single
  # number). Every column is read as character so the files bind cleanly.
  read_csv(f, col_types = cols(.default = "c"), skip = headerline[1L] - 1)
}) %>%
  # Bind all into one data frame; columns are matched by name
  bind_rows() %>%
  # Drop the row below each header row, which does not contain data
  dplyr::filter(agency_cd != "5s") %>%
  # "Gage Height" and "Gage height" both occur as column names; rename them
  # to syntactic names first so they are easier to refer to
  rename(Gage_height = "Gage Height", Gage_height2 = "Gage height") %>%
  # Take the first non-missing value of the two variants, then drop the spare
  mutate(Gage_height = coalesce(Gage_height, Gage_height2)) %>%
  select(-Gage_height2)

# Second solution

library(tidyverse)
library(fs)

# Path to the data
data_path <- "Data/RACC_2012-2016/discharge"

# Bind all files together to form one data frame
discharge <-
  # Find all files ending in .csv in data_path and its subfolders. `regexp`
  # is a regular expression, not a glob, hence the escaped dot and end anchor.
  fs::dir_ls(data_path, regexp = "\\.csv$", recurse = TRUE) %>%
  # Create a data frame holding the file paths
  tibble(filename = .) %>%
  # Read each CSV into a list-column next to its file path
  mutate(
    file_contents = map(filename, function(f) {
      # Line number(s) containing the header marker
      headerline <- grep("agency_cd", readLines(f), fixed = TRUE)
      if (length(headerline) == 0L) {
        stop("no 'agency_cd' header line found in ", f, call. = FALSE)
      }
      # Start at the header row (first match only). All columns are forced
      # to character because the typecasting was causing problems.
      read_csv(f, col_types = cols(.default = "c"), skip = headerline[1L] - 1)
    })
  ) %>%
  # Unpack the list-column to make a useful data frame
  unnest(file_contents) %>%
  # Drop the row below each header row, which does not contain data
  dplyr::filter(agency_cd != "5s") %>%
  # "Gage Height" and "Gage height" both occur as column names; rename them
  # to syntactic names first so they are easier to refer to
  rename(Gage_height = "Gage Height", Gage_height2 = "Gage height") %>%
  # Take the first non-missing value of the two variants, then drop the spare
  mutate(Gage_height = coalesce(Gage_height, Gage_height2)) %>%
  select(-Gage_height2)