Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/81.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 用单列读取txt文件,在几行中记录每行一个单元格,放入一个整洁的数据框_R_Dplyr_Tidyverse - Fatal编程技术网

R 用单列读取txt文件,在几行中记录每行一个单元格,放入一个整洁的数据框

R 用单列读取txt文件,在几行中记录每行一个单元格,放入一个整洁的数据框,r,dplyr,tidyverse,R,Dplyr,Tidyverse,我在一个文本文件中有以下数据 RecordID:Name1 VariableA:14 VariableB: 34 VariableC: 21 VariableD:red VariableE: low RecordID: Name2 VariableA: 31 VariableB: 21 VariableD:yellow VariableE:high 等等,几千条记录 然后,每条记录都以RecordID:和标识符开始 有时会丢失一些变量数据,比如在第二条记录中没有变量D。有时,拆分文本字符串时,

我在一个文本文件中有以下数据

RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high
等等,几千条记录

每条记录都以 RecordID: 和标识符开始。有时会缺少某些变量的数据,比如第二条记录中没有 VariableC。有时文本字符串被拆分,一个变量的单元格可能跨越两行。因此,我不能只依赖变量名所在的行号。

如何将这些数据以整齐的形式在列中读取到R中:

RecordID | VariableA | VariableB | VariableC | VariableD | VariableE
----------------------------------------------------------------------
Name1       14           34           21           red        low
Name2       31           21                        yellow     high

我会这样做的

library(tidyr)

# Read the data file line by line and drop lines that are empty or blank.
xy <- readLines(con = "test.txt")
xy <- xy[nzchar(trimws(xy))]

# A line without a colon cannot be a "key: value" pair. It is treated as the
# wrapped tail of the preceding line's value and glued back onto that line
# (assumption: wrapped text never contains a colon of its own). Lines that
# precede the first "key: value" line have nothing to attach to and are dropped.
has_colon <- grepl(":", xy, fixed = TRUE)
owner <- cumsum(has_colon)
xy <- vapply(
  split(trimws(xy[owner > 0]), owner[owner > 0]),
  paste, character(1), collapse = " ",
  USE.NAMES = FALSE
)

# Split every line at its FIRST colon only, so values that themselves contain
# a colon are kept whole.
colon_at <- regexpr(":", xy, fixed = TRUE)
key <- trimws(substr(xy, 1L, colon_at - 1L))
value <- trimws(substr(xy, colon_at + 1L, nchar(xy)))

find.record <- key == "RecordID" # identify where each record starts

# Only record-start lines carry an identifier; fill() copies it down to the
# variable lines that belong to the same record.
record_id <- value
record_id[!find.record] <- NA

out <- data.frame(
  RecordID = record_id,
  variable = key,
  value = value,
  stringsAsFactors = FALSE
)
out <- fill(out, "RecordID")

# The RecordID lines are now redundant: their content lives in the column.
out <- out[!find.record, ]
out

   RecordID  variable  value
2     Name1 VariableA     14
3     Name1 VariableB     34
4     Name1 VariableC     21
5     Name1 VariableD    red
6     Name1 VariableE    low
8     Name2 VariableA     31
9     Name2 VariableB     21
10    Name2 VariableD yellow
11    Name2 VariableE   high

# Pivot the long key/value table into one row per RecordID, with one column
# for each distinct entry of `variable`.
spread(data = out, key = "variable", value = "value")

  RecordID VariableA VariableB VariableC VariableD VariableE
1    Name1        14        34        21       red       low
2    Name2        31        21      <NA>    yellow      high
library(tidyr)

您需要做几件事。首先是将文本放入数据框中。我使用了 magrittr 管道操作符来让过程更清晰一些,但它基本上是先按行、再按冒号拆分的。

library(magrittr)

# Turn the raw "key:value" text into a two-column character data frame:
# one row per input line, `colname` holding the key and `value` the value.
my_d <- "RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high" %>%
  strsplit("\n", fixed = TRUE) %>%
  unlist() %>%
  # Split at the first colon only, so every line yields exactly two pieces
  # even when the value itself contains a colon.
  regmatches(., regexpr(":", ., fixed = TRUE), invert = TRUE) %>%
  do.call(rbind, .) %>%
  as.data.frame(stringsAsFactors = FALSE)

# Strip the blanks that may surround keys and values ("VariableB: 34").
my_d[] <- lapply(my_d, trimws)

rownames(my_d) <- seq_len(nrow(my_d))
colnames(my_d) <- c("colname", "value")
然后将“长”数据帧重塑为“宽”格式

(此处的重塑代码在抓取时被截断,仅剩变量名 my_d_long。)

使用 tidyverse 的解决方案。假设原始格式的数据帧是 dat,dat2 是最终输出。注意,要使用 read.table 命令创建 dat,我们可以指定 sep = ":" 来读取数据。

library(tidyverse)

# Reshape the two-column key/value frame `dat` (V1 = key, V2 = value) into one
# row per record, with one column per variable.
dat2 <- dat %>%
  # Trim white space in every column. across() replaces the deprecated
  # mutate_all(funs(...)) form.
  mutate(across(everything(), str_trim)) %>%
  # Copy the record name from V2 on the rows where V1 is "RecordID".
  mutate(RecordID = ifelse(V1 %in% "RecordID", V2, NA)) %>%
  # Carry each record name down to the variable rows that follow it.
  fill(RecordID) %>%
  # The "RecordID" rows are now redundant.
  filter(!V1 %in% "RecordID") %>%
  # spread() is kept for its convert = TRUE type conversion of the new columns.
  spread(V1, V2, convert = TRUE)
dat2
#   RecordID VariableA VariableB VariableC VariableD VariableE
# 1    Name1        14        34        21       red       low
# 2    Name2        31        21        NA    yellow      high
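library(tidyverse)

dat2 <- dat %>%
  mutate_all(funs(str_trim(.))) %>%            # Trim the white space for all columns
  mutate(RecordID = ifelse(V1 %in% "RecordID", 
                           V2, NA)) %>%        # Create a new column with the name from V2 when V1 is RecordID
  fill(RecordID) %>%                           # Fill in NA in the RecordID column  
  filter(!V1 %in% "RecordID") %>%              # Remove V1 == "RecordID"
  spread(V1, V2, convert = TRUE)               # Spread the data frame
dat2
#   RecordID VariableA VariableB VariableC VariableD VariableE
# 1    Name1        14        34        21       red       low
# 2    Name2        31        21        NA    yellow      high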
数据

# Sample input in "key:value" form. Each line is split on the colon into two
# columns, V1 (key) and V2 (value). White space is not stripped at this point,
# so the indentation and the blanks after the colon remain in the fields.
dat <- utils::read.table(
  sep = ":",
  stringsAsFactors = FALSE,
  text = "RecordID:Name1
VariableA:14
                  VariableB: 34
                  VariableC: 21
                  VariableD:red
                  VariableE: low
                  RecordID: Name2
                  VariableA: 31
                  VariableB: 21
                  VariableD:yellow
                  VariableE:high"
)

此文件为 DCF 格式,因此另一种选择是使用 read.dcf(R 区分大小写,函数名为小写):

# Read the text file line by line.
txt <- readLines("test.txt")

# read.dcf() starts a new record after a blank line, so an empty line is
# placed in front of every line containing "RecordID". All insertions are
# done in one vectorised step: each original line is shifted down by the
# number of record starts seen so far, and the slots left over stay "".
# This also works when no line matches.
record_num <- grep("RecordID", txt)
shifted <- seq_along(txt) + cumsum(seq_along(txt) %in% record_num)
with_breaks <- character(length(txt) + length(record_num))
with_breaks[shifted] <- txt
txt <- with_breaks

# Parse the DCF text and convert the resulting matrix to a data frame; the
# connection is closed explicitly so it does not leak.
con <- textConnection(txt)
df <- as.data.frame(read.dcf(con))
close(con)
相关的:
# Sample input in "key:value" form. Each line is split on the colon into two
# columns, V1 (key) and V2 (value). White space is not stripped at this point,
# so the indentation and the blanks after the colon remain in the fields.
dat <- utils::read.table(
  sep = ":",
  stringsAsFactors = FALSE,
  text = "RecordID:Name1
VariableA:14
                  VariableB: 34
                  VariableC: 21
                  VariableD:red
                  VariableE: low
                  RecordID: Name2
                  VariableA: 31
                  VariableB: 21
                  VariableD:yellow
                  VariableE:high"
)
# Read the text file line by line.
txt <- readLines("test.txt")

# read.dcf() starts a new record after a blank line, so an empty line is
# placed in front of every line containing "RecordID". All insertions are
# done in one vectorised step: each original line is shifted down by the
# number of record starts seen so far, and the slots left over stay "".
# This also works when no line matches.
record_num <- grep("RecordID", txt)
shifted <- seq_along(txt) + cumsum(seq_along(txt) %in% record_num)
with_breaks <- character(length(txt) + length(record_num))
with_breaks[shifted] <- txt
txt <- with_breaks

# Parse the DCF text and convert the resulting matrix to a data frame; the
# connection is closed explicitly so it does not leak.
con <- textConnection(txt)
df <- as.data.frame(read.dcf(con))
close(con)
> df
  RecordID VariableA VariableB VariableC VariableD VariableE
1    Name1        14        34        21       red       low
2    Name2        31        21      <NA>    yellow      high
RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high