批处理和导出R中的CSV文件列表
标签:r, loops, batch-processing
我有300个CSV文件,它们存放在同一个文件夹中,以不同的村庄名称命名,结构完全相同。我需要逐个读取并处理这些文件,然后把输出文件以相应的村庄名称导出到另一个文件夹中(例如,“村庄名称”分数)。下面是一个示例村庄文件的数据:
structure(list(ID_GC = structure(1:51, .Label = c("492K", "494K",
"497K", "498K", "499K", "500K", "501K", "502K", "503K", "504K",
"506K", "507K", "508K", "509K", "510K", "511K", "512K", "513K",
"514K", "516K", "517K", "518K", "519K", "522K", "523K", "524K",
"526K", "527K", "528K", "530K", "531K", "532K", "533K", "534K",
"535K", "536K", "537K", "538K", "539K", "540K", "541K", "542K",
"543K", "544K", "545K", "546K", "547K", "548K", "550K", "551K",
"552K"), class = "factor"), Lat = c(23.78107, 23.78115, 23.78122,
23.78123, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068, 23.78071,
23.78075, 23.78043, 23.78021, 23.77937, 23.77985, 23.77981, 23.77995,
23.77987, 23.7799, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906,
23.77899, 23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855,
23.77852, 23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797,
23.77788, 23.77786, 23.77809, 23.77815, 23.77771, 23.77757, 23.77772,
23.77752, 23.7774, 23.7772, 23.77869, 23.78084, 23.78178), Long = c(90.65016,
90.64968, 90.6497, 90.64969, 90.64972, 90.64996, 90.64987, 90.64989,
90.64924, 90.64921, 90.65, 90.64998, 90.6494, 90.64989, 90.64978,
90.64973, 90.64952, 90.64958, 90.64925, 90.64935, 90.6492, 90.64922,
90.64919, 90.64928, 90.64937, 90.64887, 90.64919, 90.64891, 90.64914,
90.64903, 90.64907, 90.6491, 90.64868, 90.6491, 90.64853, 90.64862,
90.64851, 90.64852, 90.64865, 90.64865, 90.64878, 90.64878, 90.64866,
90.64859, 90.64844, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 210, 18, 31.5, 13.5, 15, 13.5, 21,
13.5, 18, 15, 240, 24, 13.5, 19.5, 33, 156, 14.4, 18, 21,
13.5, 18, 18, 51, 48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66,
54, 31.5, 21, 210, 66, 12, 54, 27, 219, 18, 18, 18, 18, 18,
21), As_ug_L = c(68L, 68L, 68L, 2L, 68L, 306L, 129L, 129L,
20L, 68L, 188L, 129L, 68L, 2L, 68L, 68L, 129L, 188L, 2L,
2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L, 20L, 2L, 2L, 2L,
2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L, 2L, 306L, 2L, 306L,
20L, 306L, 68L, 68L, 306L, 68L, 20L)), class = "data.frame", row.names = c(NA,
-51L))
以及计算所有村庄所需的另一个数据集(“dtw_BG”)
structure(list(ID_GC = structure(c(10L, 11L, 12L, 13L, 14L, 8L,
9L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 1L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("1002F", "1008F", "1016F", "1029F", "1051F",
"1053F", "1058F", "1548D", "1561D", "498K", "509K", "514K", "540K",
"545K", "559K", "560K", "569K", "571K", "597K", "601K", "614K",
"819F", "829F", "933F", "934F", "951F", "957F", "958F", "959F",
"960F", "964F", "973F", "982F", "998F"), class = "factor"), Lat = c(23.78123,
23.77937, 23.7799, 23.77786, 23.77772, 23.77439336, 23.77204886,
23.77484, 23.775, 23.77528, 23.77492, 23.77521, 23.77593, 23.7757,
23.78494, 23.78473, 23.78385611, 23.78395451, 23.78426992, 23.78374538,
23.78377154, 23.78360725, 23.78340944, 23.78362259, 23.78272036,
23.78307399, 23.78269739, 23.78252464, 23.78279102, 23.78131262,
23.78149057, 23.77867098, 23.77828323, 23.78592929), Long = c(90.64969,
90.64989, 90.64925, 90.64865, 90.64844, 90.65543457, 90.65292302,
90.65158, 90.65192, 90.65219, 90.65232, 90.65363, 90.65356, 90.65483,
90.65025, 90.65238, 90.64900976, 90.64933908, 90.65082989, 90.64891814,
90.64902199, 90.64910447, 90.64933699, 90.6488857, 90.64921562,
90.64848103, 90.64799873, 90.64826494, 90.64738669, 90.64781684,
90.64612672, 90.64499055, 90.64476985, 90.6499865), Village = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("Abdullapur", "Chauthar Kanda", "Nagra Para Faitadi",
"Nowa Para"), class = "factor"), Depth_m = c(210, 240, 156, 210,
219, 225, 195, 299.7, 299.7, 240, 240, 234, 240, 105, 165, 180,
180, 225, 180, 210, 195, 201, 180, 195, 210, 210, 195, 180, 225,
180, 108, 210, 225, 240), As_ug_L = c(2L, 2L, 2L, 2L, 20L, 2L,
2L, 2L, 20L, 2L, 2L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-34L))
我需要处理所有的村庄,但我不确定如何对它们进行循环。到目前为止,我已经能够使用“readr”软件包读取每个村庄的文件。
如何以各自的村庄名称导出每个村庄的文件?
提前感谢 :)
只需把您的处理过程封装到一个函数中,该函数以村庄文件名作为参数。然后遍历文件名并调用该函数,即可构建数据框列表:
# COMMON VARIABLES
# Directory that receives the per-village output CSVs. The file name is
# appended with paste0(), so the trailing slash is required.
output_path <- "/Users/..../Output/"
# Reference data set used in the calculation for every village.
dtw_BG <- read.csv("/Users/...../dtw_BG.csv", header = TRUE)
# Longitude/latitude of the reference points, in that order. Selected by name
# rather than by position so that a shifted column layout in the CSV cannot
# silently substitute other columns for the coordinates.
BG <- dtw_BG[, c("Long", "Lat")]
# OUTPUT CSV AND RETURN DATA FRAME
# Processes one village file: reads it, runs the per-village calculation,
# writes the result below `output_path` and returns it.
#
# village_file: path to a single village CSV.
# Returns the processed data frame (`abc`).
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header = TRUE)

  #... REST OF CODE
  # NOTE(review): the elided code is expected to create `stw` and `abc`;
  # neither is defined in this template.

  # The global is `output_path`; R names are case-sensitive, so any other
  # capitalisation fails with "object not found".
  out_file <- paste0(output_path, stw$Village[[1]], ".csv")
  write.csv(abc, out_file, row.names = FALSE)
  abc
}
# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in ".csv".
v_files <- list.files(
  path = "/path/to/inputs", pattern = "\\.csv$", full.names = TRUE
)
# One processed data frame per village file, in the order of v_files.
df_list <- lapply(v_files, calc_score)
只需把您的处理过程封装到一个函数中,该函数以村庄文件名作为参数。然后遍历文件名并调用该函数,即可构建数据框列表:
# COMMON VARIABLES
# Directory that receives the per-village output CSVs. The file name is
# appended with paste0(), so the trailing slash is required.
output_path <- "/Users/..../Output/"
# Reference data set used in the calculation for every village.
dtw_BG <- read.csv("/Users/...../dtw_BG.csv", header = TRUE)
# Longitude/latitude of the reference points, in that order. Selected by name
# rather than by position so that a shifted column layout in the CSV cannot
# silently substitute other columns for the coordinates.
BG <- dtw_BG[, c("Long", "Lat")]
# OUTPUT CSV AND RETURN DATA FRAME
# Processes one village file: reads it, runs the per-village calculation,
# writes the result below `output_path` and returns it.
#
# village_file: path to a single village CSV.
# Returns the processed data frame (`abc`).
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header = TRUE)

  #... REST OF CODE
  # NOTE(review): the elided code is expected to create `stw` and `abc`;
  # neither is defined in this template.

  # The global is `output_path`; R names are case-sensitive, so any other
  # capitalisation fails with "object not found".
  out_file <- paste0(output_path, stw$Village[[1]], ".csv")
  write.csv(abc, out_file, row.names = FALSE)
  abc
}
# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in ".csv".
v_files <- list.files(
  path = "/path/to/inputs", pattern = "\\.csv$", full.names = TRUE
)
# One processed data frame per village file, in the order of v_files.
df_list <- lapply(v_files, calc_score)
您可以在目录中列出 .csv 文件,并使用您为单个村庄编写的代码对它们进行循环。
谢谢 @Parfait!我可以根据需要使用公共变量导出村庄文件,并按照您提供的方式传递文件名。不过,只有在我改用列名而不是列号之后,分数才是正确的。我还尝试了交叉连接合并的代码,但它没有给我正确的分数(合并后的数据长度不相等)。我在……mdf 处得到了一个错误。
是的,再次说明,底部的重构代码示例还需要更多的测试和评估,仅作为演示给出。其思想是避免在每个数据框中按行进行迭代,而是将整个数据集合并在一起进行成对距离计算。我修复了一些语法问题。错误应该出现在 `transform` 行中,而不是 `merge` 行中。但如果第一个代码块(配合您现有的代码)对您有效,很高兴我能帮助您!
structure(list(ID_GC = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L
), .Label = c("492K", "494K", "497K", "498K", "499K", "500K",
"501K", "502K", "503K", "504K", "506K", "507K", "508K", "509K",
"510K", "511K", "512K", "513K", "514K", "516K", "517K", "518K",
"519K", "522K", "523K", "524K", "526K", "527K", "528K", "530K",
"531K", "532K", "533K", "534K", "535K", "536K", "537K", "538K",
"539K", "540K", "541K", "542K", "543K", "544K", "545K", "546K",
"547K", "548K", "550K", "551K", "552K"), class = "factor"), Lat = c(23.78107,
23.78115, 23.78122, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068,
23.78071, 23.78075, 23.78043, 23.78021, 23.77985, 23.77981, 23.77995,
23.77987, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906, 23.77899,
23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855, 23.77852,
23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797, 23.77788,
23.77809, 23.77815, 23.77771, 23.77757, 23.77752, 23.7774, 23.7772,
23.77869, 23.78084, 23.78178), Long = c(90.65016, 90.64968, 90.6497,
90.64972, 90.64996, 90.64987, 90.64989, 90.64924, 90.64921, 90.65,
90.64998, 90.6494, 90.64978, 90.64973, 90.64952, 90.64958, 90.64935,
90.6492, 90.64922, 90.64919, 90.64928, 90.64937, 90.64887, 90.64919,
90.64891, 90.64914, 90.64903, 90.64907, 90.6491, 90.64868, 90.6491,
90.64853, 90.64862, 90.64851, 90.64852, 90.64865, 90.64878, 90.64878,
90.64866, 90.64859, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 18, 31.5, 13.5, 15, 13.5, 21, 13.5,
18, 15, 24, 13.5, 19.5, 33, 14.4, 18, 21, 13.5, 18, 18, 51,
48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66, 54, 31.5, 21, 66,
12, 54, 27, 18, 18, 18, 18, 18, 21), As_ug_L = c(68L, 68L,
68L, 68L, 306L, 129L, 129L, 20L, 68L, 188L, 129L, 68L, 68L,
68L, 129L, 188L, 2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L,
20L, 2L, 2L, 2L, 2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L,
306L, 2L, 306L, 306L, 68L, 68L, 306L, 68L, 20L), maxscore = c(10L,
11L, 11L, 11L, 12L, 12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L,
10L, 9L, 10L, 9L, 10L, 10L, 10L, 7L, 7L, 5L, 7L, 6L, 9L,
9L, 9L, 8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 9L, 9L, 8L, 8L, 8L,
8L, 6L, 7L, 12L, 3L), count = c(10L, 11L, 11L, 11L, 12L,
12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L, 10L, 9L, 10L, 9L,
9L, 9L, 9L, 6L, 6L, 4L, 6L, 5L, 8L, 8L, 8L, 7L, 8L, 8L, 8L,
8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 12L, 3L),
score = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L), count_itw = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 6L, 7L, 7L, 8L, 8L,
9L, 10L, 10L, 12L, 12L, 12L, 12L, 13L, 11L, 13L, 10L, 10L,
10L, 10L, 12L, 12L, 6L, 6L, 5L, 5L, 2L, 12L, 0L, 0L), count_itw10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 3L, 4L, 4L, 5L, 5L, 6L, 7L, 7L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 6L, 6L, 5L, 5L, 2L,
9L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L,
48L, 49L, 50L, 51L))
# COMMON VARIABLES
# Directory that receives the per-village output CSVs. The file name is
# appended with paste0(), so the trailing slash is required.
output_path <- "/Users/..../Output/"
# Reference data set used in the calculation for every village.
dtw_BG <- read.csv("/Users/...../dtw_BG.csv", header = TRUE)
# Longitude/latitude of the reference points, in that order. Selected by name
# rather than by position so that a shifted column layout in the CSV cannot
# silently substitute other columns for the coordinates.
BG <- dtw_BG[, c("Long", "Lat")]
# OUTPUT CSV AND RETURN DATA FRAME
# Processes one village file: reads it, runs the per-village calculation,
# writes the result below `output_path` and returns it.
#
# village_file: path to a single village CSV.
# Returns the processed data frame (`abc`).
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header = TRUE)

  #... REST OF CODE
  # NOTE(review): the elided code is expected to create `stw` and `abc`;
  # neither is defined in this template.

  # The global is `output_path`; R names are case-sensitive, so any other
  # capitalisation fails with "object not found".
  out_file <- paste0(output_path, stw$Village[[1]], ".csv")
  write.csv(abc, out_file, row.names = FALSE)
  abc
}
# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in ".csv".
v_files <- list.files(
  path = "/path/to/inputs", pattern = "\\.csv$", full.names = TRUE
)
# One processed data frame per village file, in the order of v_files.
df_list <- lapply(v_files, calc_score)
# Scores one village file and exports the result.
#
# Reads the village CSV, keeps the rows with Depth_m <= 90 and derives several
# per-point counts from pairwise distGeo() distances (threshold 100), using
# cross joins instead of row-wise iteration. The combined data frame is
# written below `output_path` as "<Village>.csv" and returned.
#
# Relies on names defined outside the function: `BG`, `output_path` and
# distGeo() (presumably geosphere::distGeo -- the package must be attached).
#
# village_file: path to one village CSV with the columns Long, Lat, Depth_m,
#               As_ug_L and Village.
# Returns the data frame that was written to disk.
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header = TRUE)

  ### DATA FRAME SUBSETS
  stw <- gw[gw$Depth_m <= 90, ]
  p <- stw[, c("Long", "Lat")]
  R <- gw[gw$Depth_m <= 90 & gw$As_ug_L > 50, c("Long", "Lat")]
  ITW <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90, c("Long", "Lat")]
  ITW_10 <- gw[
    gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10,
    c("Long", "Lat")
  ]

  ### MAX SCORE CALCULATION
  # Cross join of all rows: R keeps Long/Lat, p's columns become Long_/Lat_.
  cj <- merge(R, p, by = NULL, suffixes = c("", "_"))
  dist_R <- transform(
    cj,
    Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")])
  )
  dist_R100 <- subset(dist_R, Distance <= 100)
  # NOTE(review): aggregate() yields one row per (Long_, Lat_) group that still
  # has rows after the distance filter, ordered by group rather than by stw's
  # row order. The vector can therefore be shorter than nrow(stw); confirm the
  # alignment before relying on the cbind at the end. The same applies to
  # count_itw and count_itw10 below.
  maxscore <- aggregate(
    cbind(Score = Distance) ~ Long_ + Lat_, dist_R100, FUN = length
  )$Score

  ### COUNT ITW100 CALCULATION
  cj <- merge(ITW, p, by = NULL, suffixes = c("", "_"))
  dist_ITW <- transform(
    cj,
    Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")])
  )
  dist_ITW100 <- subset(dist_ITW, Distance <= 100)
  count_itw <- aggregate(
    cbind(Count = Distance) ~ Long_ + Lat_, dist_ITW100, FUN = length
  )$Count

  ### COUNT ITW10 CALCULATION
  if (nrow(ITW_10) == 0) {
    # No qualifying rows: every point gets a count of zero.
    count_itw10 <- rep(0, length(maxscore))
  } else {
    # Uses ITW_10, the subset built above (the name `IT_10` does not exist).
    cj <- merge(ITW_10, p, by = NULL, suffixes = c("", "_"))
    dist_ITW10 <- transform(
      cj,
      Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")])
    )
    dist_ITW10_100 <- subset(dist_ITW10, Distance <= 100)
    count_itw10 <- aggregate(
      cbind(Count = Distance) ~ Long_ + Lat_, dist_ITW10_100, FUN = length
    )$Count
  }

  ### MINIMUM DISTANCE
  cj <- merge(BG, p, by = NULL, suffixes = c("", "_"))
  dist_BG <- transform(
    cj,
    Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")])
  )
  # MERGE AT p LEVEL
  # NOTE(review): both inputs carry Long, Lat and Distance columns next to the
  # Long_/Lat_ keys; check the column names merge() produces here and which
  # coordinate pairs the distance below is really computed between.
  mdf <- merge(
    dist_R100, dist_BG,
    by = c("Long_", "Lat_"), suffixes = c("", "_")
  )
  dtw <- transform(
    mdf,
    Distance = distGeo(mdf[c("Long", "Lat")], mdf[c("Long_", "Lat_")])
  )
  dtw <- aggregate(Distance ~ Long + Lat, dtw, FUN = min)$Distance

  ### SCORE CALCULATION
  dtw <- unlist(dtw)
  # NOTE(review): the grouping index has length(dtw) + 1 elements, one more
  # than dtw itself; the last group is discarded on the following line.
  dtw <- split(dtw, (0:length(dtw) %/% length(p[[1]])))
  dtw <- dtw[-length(dtw)]
  # vapply() pins the result to one integer per group.
  count <- vapply(dtw, function(d) length(d[d <= 100]), integer(1))
  score <- maxscore - count

  ### FINAL DATA FRAME
  village_df <- cbind.data.frame(
    stw, maxscore, count, score, count_itw, count_itw10
  )
  # The global is `output_path`; R names are case-sensitive.
  out_file <- paste0(output_path, village_df$Village[[1]], ".csv")
  write.csv(village_df, out_file, row.names = FALSE)
  village_df
}