R 使用未对齐的数据和间隙重新格式化数据帧
我有一个数据集,它有一个奇怪的报告格式,我需要把它放到一个可行的数据框架中。我处理的数据如下所示:R 使用未对齐的数据和间隙重新格式化数据帧,r,R,我有一个数据集,它有一个奇怪的报告格式,我需要把它放到一个可行的数据框架中。我处理的数据如下所示: ids<-(c("A101","","","","B101","","","C101","","","")) dx<-c("Lung","","","","Kidney","","","Prostate","","","") alt<-c("","A766","G283","F933","","B293","T432","","U920","D289","S203") val&l
ids<-(c("A101","","","","B101","","","C101","","",""))
dx<-c("Lung","","","","Kidney","","","Prostate","","","")
alt<-c("","A766","G283","F933","","B293","T432","","U920","D289","S203")
val<-c(NA,3.2,4.3,7.2,NA,2.1,3.8,NA,8.1,5.3,7.1)
df.in<-data.frame(ids,dx,alt,val)
ids<-(c("A101","A101","A101","B101","B101","C101","C101","C101"))
dx<-c("Lung","Lung","Lung","Kidney","Kidney","Prostate","Prostate","Prostate")
alt<-c("A766","G283","F933","B293","T432","U920","D289","S203")
val<-c(3.2,4.3,7.2,2.1,3.8,8.1,5.3,7.1)
df.out<-data.frame(ids,dx,alt,val)
ids
细分:
> first<-which(is.na(df.in$val))# The positions for every new group ie 1,5 and 8
> groups=cumsum(is.na(df.in$val))#The groups you have
> groupsize=rle(groups)$length#The size of the groups
> newdf=transform(df.in[rep(first,groupsize),],val=df.in$val)#Create the new df
> newdf=na.omit(newdf)#Remove the NA rows
> row.names(newdf)=NULL# REMOVE THE ROWNAMES GIVEN
> newdf
ids dx alt val
1 A101 Lung 3.2
2 A101 Lung 4.3
3 A101 Lung 7.2
4 B101 Kidney 2.1
5 B101 Kidney 3.8
6 C101 Prostate 8.1
7 C101 Prostate 5.3
8 C101 Prostate 7.1
>first groups=cumsum(is.na(df.in$val))#您拥有的组
>groupsize=rle(组)$length#组的大小
>newdf=transform(df.in[rep(first,groupsize),],val=df.in$val)#创建新的df
>newdf=na.省略(newdf)#删除na行
>row.names(newdf)=NULL#删除给定的行名
>新发展基金
ids dx alt val
1 A101肺3.2
2 A101肺4.3
3 A101肺7.2
4 B101肾脏2.1
5 B101肾脏3.8
6 C101前列腺癌8.1
7 C101前列腺癌5.3
8 C101前列腺癌7.1
单程与tidyr
和dplyr
:
library(dplyr)
library(tidyr)
# Replace blank cells "" with NA
df.in[df.in == ""] <- NA
# Fill NA values with value of row above it
df.in %>%
fill(c(ids, dx), .direction = "down") %>%
drop_na() %>%
mutate_if(is.factor, as.character) # optional
# A tibble: 8 x 4
ids dx alt val
<chr> <chr> <chr> <dbl>
1 A101 Lung A766 3.20
2 A101 Lung G283 4.30
3 A101 Lung F933 7.20
4 B101 Kidney B293 2.10
5 B101 Kidney T432 3.80
6 C101 Prostate U920 8.10
7 C101 Prostate D289 5.30
8 C101 Prostate S203 7.10
库(dplyr)
图书馆(tidyr)
#将空白单元格“”替换为NA
df.in[df.in==“”]%
填充(c(ID,dx),.direction=“down”)%>%
下拉菜单()%>%
如果(is.factor,as.character)发生变异#可选
#一个tibble:8x4
ids dx alt val
1 A101肺A766 3.20
2 A101 Lung G283 4.30
3 A101肺F933 7.20
4 B101肾脏B293 2.10
5 B101肾脏T432 3.80
6 C101前列腺U920 8.10
7 C101前列腺D289 5.30
8 C101前列腺S203 7.10
链中的最后一行,mutate\u if(is.factor,as.character)
是可选的,它将因子转换为字符。我们可以在创建数据集时使用stringsAsFactors=FALSE
来避免此步骤。这非常有效。谢谢我确实收到了一个关于mutate_if函数的错误,因为它似乎在我的安装中不可用(R版本3.4,tidyr_0.8.0)
library(dplyr)
library(tidyr)
# Replace blank cells "" with NA
df.in[df.in == ""] <- NA
# Fill NA values with value of row above it
df.in %>%
fill(c(ids, dx), .direction = "down") %>%
drop_na() %>%
mutate_if(is.factor, as.character) # optional
# A tibble: 8 x 4
ids dx alt val
<chr> <chr> <chr> <dbl>
1 A101 Lung A766 3.20
2 A101 Lung G283 4.30
3 A101 Lung F933 7.20
4 B101 Kidney B293 2.10
5 B101 Kidney T432 3.80
6 C101 Prostate U920 8.10
7 C101 Prostate D289 5.30
8 C101 Prostate S203 7.10