R 如何整理一个固定宽度的文件,每n(不同)行有一个标题?
我在固定宽度的文件中有时间序列数据,其中观察行(n根据样本大小而变化)出现在包含重要元数据(即样本编号、日期等)的“标题”行下。这两种类型的行都包含字母数字字符。它看起来像这样(字符串缩短以便于阅读:R 如何整理一个固定宽度的文件,每n(不同)行有一个标题?,r,dplyr,data.table,read.fwf,R,Dplyr,Data.table,Read.fwf,我在固定宽度的文件中有时间序列数据,其中观察行(n根据样本大小而变化)出现在包含重要元数据(即样本编号、日期等)的“标题”行下。这两种类型的行都包含字母数字字符。它看起来像这样(字符串缩短以便于阅读: 4 64001416230519844TP blahblah 5416001130 1 F 492273 5416001140 3 F 492274 5416001145 1 F 492275 5416001150 19 F 492276 5416001155 21 F
4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295
4 64002544250619844RA blahblah
etc.
标题行由字符串==4中的第一个字符区分,有89个字符。观察行==5,有24个字符
我想要的是将标题行粘贴到每个后续观察行(数据子集),以便以后可以使用read_fwf解析字符串,并确保我可以根据标题行中包含的信息对每个观察进行排序。我不在乎是否删除原始标题行。如下所示:
5416001130 1 F 492273 4 64001416230519844TP blahblah
5416001140 3 F 492274 4 64001416230519844TP blahblah
5416001145 1 F 492275 4 64001416230519844TP blahblah
5416001150 19 F 492276 4 64001416230519844TP blahblah
5416001155 21 F 492277 4 64001416230519844TP blahblah
5416001160 21 F 492278 4 64001416230519844TP blahblah
5416001165 13 F 492279 4 64001416230519844TP blahblah
5416001170 3 F 492280 4 64001416230519844TP blahblah
5416001180 1 F 492281 4 64001416230519844TP blahblah
5544001125 1 F 492291 4 64001544250619844RA blahblah
5544001130 3 F 492292 4 64001544250619844RA blahblah
5544001135 4 F 492293 4 64001544250619844RA blahblah
5544001140 11 F 492294 4 64001544250619844RA blahblah
5544001145 13 F 492295 4 64001544250619844RA blahblah
etc...
我找到的最接近的解决方案就是这里
提供的解决方案是一个循环,循环滚动行,测试行是字符还是数字,并相应地将它们粘贴在一起
text <- readLines('/path/to/file') # read in the file
split_text <- strsplit(text, "\\s+") # split each line on whitespace
for (line in split_text) { # iterate through lines
numeric_line <- suppressWarnings(as.numeric(line)) # try to convert the current line into a vector of numbers
if (is.na(numeric_line[[1]])) { # if it fails, we know we're on a header line
header <- line
} else {
for (i in seq(1, length(line), 2)) { # otherwise, we're on a data line, so take two numbers at once
print(c(header, line[[i]], line[[i+1]])) # and output the latest header with each pair of values
}
}
}
正如您所看到的,它拆分文件,对变量进行排序,合并它们。然而,这是一年一年地完成的,我希望多年使用一个文件。这里有一个使用
tidyverse
的解决方案
它只使用标题行创建一个新列,然后使用上面的标题填充没有标题的行。最后,如果需要,您可以将列粘贴在一起
x <- read.table(text = "4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295", header = FALSE, sep = "\t")
library("tidyverse")
x %>%
rename(body = V1) %>%
mutate(
body = trimws(body),
head = if_else(grepl("^4", body), body, NA_character_),
body = if_else(is.na(head), body, NA_character_)
) %>%
fill(head, .direction = "down") %>%
filter(!is.na(body))
另一种可能的解决方案(无tidyverse)是每行读取文件,查找标题行并将这些行粘贴到无标题行的末尾。之后,这些行被拆分并放入data.frame中
lines <- readLines("asd.dat")
# last index + 1 for iteration
headers <- c(which(grepl("^4 ", lines)), length(lines) + 1)
pastedLines <- c()
for(i in 1:(length(headers) - 1)) {
pastedLines <- c(pastedLines,
paste(lines[(headers[i] + 1) : (headers[i + 1] - 1)], lines[headers[i]]))
}
DF <- as.data.frame(matrix(unlist(strsplit(pastedLines, "\\s+")), nrow = length(pastedLines), byrow=T))
以R为基数的两个选项。两个选项都使用
readLines
读取原始文本数据(请参见本答案的结尾)
选项1:
i <- grepl(pattern = '^4 ', x)
x1 <- strsplit(x[!i], '\\s+')
x2 <- strsplit(x[i], '\\s+')
d1 <- do.call(rbind.data.frame, x1)
d2 <- do.call(rbind.data.frame, x2)
d <- cbind(d1, d2[cumsum(i)[-which(i)],])
names(d) <- paste0('V',1:ncol(d))
rawlist <- split(x, cumsum(grepl(pattern = '^4 ', x)))
l1 <- lapply(rawlist, function(x) read.table(text = x, skip = 1, header = FALSE))
l2 <- lapply(rawlist, function(x) read.table(text = x, nrows = 1, header = FALSE))
reps <- sapply(l1, nrow)
d1 <- do.call(rbind, l1)
d2 <- do.call(rbind, l2)[rep(1:length(l2), reps),]
d <- cbind(d1, d2)
names(d) <- paste0('V',1:ncol(d))
选项2:
i <- grepl(pattern = '^4 ', x)
x1 <- strsplit(x[!i], '\\s+')
x2 <- strsplit(x[i], '\\s+')
d1 <- do.call(rbind.data.frame, x1)
d2 <- do.call(rbind.data.frame, x2)
d <- cbind(d1, d2[cumsum(i)[-which(i)],])
names(d) <- paste0('V',1:ncol(d))
rawlist <- split(x, cumsum(grepl(pattern = '^4 ', x)))
l1 <- lapply(rawlist, function(x) read.table(text = x, skip = 1, header = FALSE))
l2 <- lapply(rawlist, function(x) read.table(text = x, nrows = 1, header = FALSE))
reps <- sapply(l1, nrow)
d1 <- do.call(rbind, l1)
d2 <- do.call(rbind, l2)[rep(1:length(l2), reps),]
d <- cbind(d1, d2)
names(d) <- paste0('V',1:ncol(d))
使用数据:
x <- readLines(textConnection('4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295'))
x这里有一个可能的base R解决方案,它试图提高一点内存效率:
rawtext <- "4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295"
然后实际重读每一篇文章,但只阅读必要的行数:
do.call(rbind, mapply(
function(skip, nrows, ...) data.frame(
read.table(skip = skip, nrows = nrows, ...),
read.table(skip = skip - 1, nrows = 1, ...)
),
MoreArgs = list(text = rawtext),
header_rows,
lengths,
SIMPLIFY = FALSE
))
# V1 V2 V3 V4 V1.1 V2.1 V3.1
# 1 5416001130 1 FALSE 492273 4 64001416230519844TP blahblah
# 2 5416001140 3 FALSE 492274 4 64001416230519844TP blahblah
# 3 5416001145 1 FALSE 492275 4 64001416230519844TP blahblah
# 4 5416001150 19 FALSE 492276 4 64001416230519844TP blahblah
# 5 5416001155 21 FALSE 492277 4 64001416230519844TP blahblah
# 6 5416001160 21 FALSE 492278 4 64001416230519844TP blahblah
# 7 5416001165 13 FALSE 492279 4 64001416230519844TP blahblah
# 8 5416001170 3 FALSE 492280 4 64001416230519844TP blahblah
# 9 5416001180 1 FALSE 492281 4 64001416230519844TP blahblah
# 10 5544001125 1 FALSE 492291 4 64001544250619844RA blahblah
# 11 5544001130 3 FALSE 492292 4 64001544250619844RA blahblah
# 12 5544001135 4 FALSE 492293 4 64001544250619844RA blahblah
# 13 5544001140 11 FALSE 492294 4 64001544250619844RA blahblah
# 14 5544001145 13 FALSE 492295 4 64001544250619844RA blahblah
感谢您的快速反馈。选项1有效!选项2无效,但我会再次检查。我收到了此错误:l1@AndrewSmith显然第5行缺少一个值。如果是这种情况,应该不太难进行目视检查。为了防止此错误,您可以添加fill=TRUE
:lappy(rawlist,function(x))read.table(text=x,skip=1,header=FALSE,fill=TRUE))
此解决方案同样有效。感谢您的快速回答。
> d
V1 V2 V3 V4 V5 V6 V7
1 5416001130 1 F 492273 4 64001416230519844TP blahblah
1.1 5416001140 3 F 492274 4 64001416230519844TP blahblah
1.2 5416001145 1 F 492275 4 64001416230519844TP blahblah
1.3 5416001150 19 F 492276 4 64001416230519844TP blahblah
1.4 5416001155 21 F 492277 4 64001416230519844TP blahblah
1.5 5416001160 21 F 492278 4 64001416230519844TP blahblah
1.6 5416001165 13 F 492279 4 64001416230519844TP blahblah
1.7 5416001170 3 F 492280 4 64001416230519844TP blahblah
1.8 5416001180 1 F 492281 4 64001416230519844TP blahblah
2 5544001125 1 F 492291 4 64001544250619844RA blahblah
2.1 5544001130 3 F 492292 4 64001544250619844RA blahblah
2.2 5544001135 4 F 492293 4 64001544250619844RA blahblah
2.3 5544001140 11 F 492294 4 64001544250619844RA blahblah
2.4 5544001145 13 F 492295 4 64001544250619844RA blahblah
rawlist <- split(x, cumsum(grepl(pattern = '^4 ', x)))
l1 <- lapply(rawlist, function(x) read.table(text = x, skip = 1, header = FALSE))
l2 <- lapply(rawlist, function(x) read.table(text = x, nrows = 1, header = FALSE))
reps <- sapply(l1, nrow)
d1 <- do.call(rbind, l1)
d2 <- do.call(rbind, l2)[rep(1:length(l2), reps),]
d <- cbind(d1, d2)
names(d) <- paste0('V',1:ncol(d))
> d
V1 V2 V3 V4 V5 V6 V7
1.1 5416001130 1 FALSE 492273 4 64001416230519844TP blahblah
1.2 5416001140 3 FALSE 492274 4 64001416230519844TP blahblah
1.3 5416001145 1 FALSE 492275 4 64001416230519844TP blahblah
1.4 5416001150 19 FALSE 492276 4 64001416230519844TP blahblah
1.5 5416001155 21 FALSE 492277 4 64001416230519844TP blahblah
1.6 5416001160 21 FALSE 492278 4 64001416230519844TP blahblah
1.7 5416001165 13 FALSE 492279 4 64001416230519844TP blahblah
1.8 5416001170 3 FALSE 492280 4 64001416230519844TP blahblah
1.9 5416001180 1 FALSE 492281 4 64001416230519844TP blahblah
2.1 5544001125 1 FALSE 492291 4 64001544250619844RA blahblah
2.2 5544001130 3 FALSE 492292 4 64001544250619844RA blahblah
2.3 5544001135 4 FALSE 492293 4 64001544250619844RA blahblah
2.4 5544001140 11 FALSE 492294 4 64001544250619844RA blahblah
2.5 5544001145 13 FALSE 492295 4 64001544250619844RA blahblah
x <- readLines(textConnection('4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295'))
x <- readLine('name-of-datafile.txt')
rawtext <- "4 64001416230519844TP blahblah
5416001130 1 F 492273
5416001140 3 F 492274
5416001145 1 F 492275
5416001150 19 F 492276
5416001155 21 F 492277
5416001160 21 F 492278
5416001165 13 F 492279
5416001170 3 F 492280
5416001180 1 F 492281
4 64001544250619844RA blahblah
5544001125 1 F 492291
5544001130 3 F 492292
5544001135 4 F 492293
5544001140 11 F 492294
5544001145 13 F 492295"
text <- readLines(textConnection(rawtext))
header_rows <- grep("^4", text)
lengths <- diff(c(header_rows, length(text) + 1)) - 1
rm(text)
do.call(rbind, mapply(
function(skip, nrows, ...) data.frame(
read.table(skip = skip, nrows = nrows, ...),
read.table(skip = skip - 1, nrows = 1, ...)
),
MoreArgs = list(text = rawtext),
header_rows,
lengths,
SIMPLIFY = FALSE
))
# V1 V2 V3 V4 V1.1 V2.1 V3.1
# 1 5416001130 1 FALSE 492273 4 64001416230519844TP blahblah
# 2 5416001140 3 FALSE 492274 4 64001416230519844TP blahblah
# 3 5416001145 1 FALSE 492275 4 64001416230519844TP blahblah
# 4 5416001150 19 FALSE 492276 4 64001416230519844TP blahblah
# 5 5416001155 21 FALSE 492277 4 64001416230519844TP blahblah
# 6 5416001160 21 FALSE 492278 4 64001416230519844TP blahblah
# 7 5416001165 13 FALSE 492279 4 64001416230519844TP blahblah
# 8 5416001170 3 FALSE 492280 4 64001416230519844TP blahblah
# 9 5416001180 1 FALSE 492281 4 64001416230519844TP blahblah
# 10 5544001125 1 FALSE 492291 4 64001544250619844RA blahblah
# 11 5544001130 3 FALSE 492292 4 64001544250619844RA blahblah
# 12 5544001135 4 FALSE 492293 4 64001544250619844RA blahblah
# 13 5544001140 11 FALSE 492294 4 64001544250619844RA blahblah
# 14 5544001145 13 FALSE 492295 4 64001544250619844RA blahblah