R 将列拆分为多列并保留下一列

R 将列拆分为多列并保留下一列,r,regex,dataframe,multiple-columns,R,Regex,Dataframe,Multiple Columns,我有以下格式的数据框: i j score chr12-100000000 chr12.100000000 0.333000 chr12-100000000 chr12.100050000 0.169200 chr12-100000000 chr12.100100000 0.054980 我想将其转换为将列分隔为: chr_firstside position_firstside chr_secondside position_s

我有以下格式的数据框:

i               j               score
chr12-100000000 chr12.100000000 0.333000
chr12-100000000 chr12.100050000 0.169200
chr12-100000000 chr12.100100000 0.054980
我想将其转换为将列分隔为:

chr_firstside   position_firstside  chr_secondside  position_secondside score
chr12           100000000           chr12           100000000           0.333000
chr12           100000000           chr12           100050000           0.169200
chr12           100000000           chr12           100100000           0.054980
我希望它以制表符分隔并在R中实现。我尝试了这个方法,但没有成功:

library(data.table)
# Asker's attempt: split column `i` on "-" with data.table's tstrsplit().
# NOTE(review): there is no comma between `i` and '[-]', so this line does not
# parse as written; it is kept verbatim because the question reports it failing.
setDT(converted)[ , tstrsplit(i '[-]', type.convert=TRUE)]

使用
sub

# Derive the chromosome and position parts of `i` ("-" separated) and `j`
# ("." separated). The first pattern of each pair keeps the text before the
# first separator; the second keeps the trailing run after the last separator.
df[["chr_firstside"]] <- sub("^([^-]+).*", "\\1", df[["i"]])
df[["position_firstside"]] <- sub(".*?([^-]+)$", "\\1", df[["i"]])
df[["chr_secondside"]] <- sub("^([^.]+).*", "\\1", df[["j"]])
df[["position_secondside"]] <- sub(".*?([^.]+)$", "\\1", df[["j"]])

或者使用 tidyr:

library(tidyr)
# Split `i` and `j` into chromosome/position pairs. separate() splits on any
# run of non-alphanumeric characters by default, so the same call handles both
# the "-" in `i` and the "." in `j`; convert = TRUE type-converts the new
# columns, turning the position strings into integers.
df %>%
  separate(i, c("chr_i", "position_i"), convert = TRUE) %>%
  separate(j, c("chr_j", "position_j"), convert = TRUE)
#>   chr_i position_i chr_j position_j   score
#> 1 chr12  100000000 chr12  100000000 0.33300
#> 2 chr12  100000000 chr12  100050000 0.16920
#> 3 chr12  100000000 chr12  100100000 0.05498
长格式可能更实用,但:

# Long format: stack `i` and `j` into key/value columns (var, val), then split
# each value into its chromosome and position parts.
df_long <- df %>%
  gather(var, val, i:j) %>%
  separate(val, c("chr", "position"), convert = TRUE)

df_long
#>     score var   chr  position
#> 1 0.33300   i chr12 100000000
#> 2 0.16920   i chr12 100000000
#> 3 0.05498   i chr12 100000000
#> 4 0.33300   j chr12 100000000
#> 5 0.16920   j chr12 100050000
#> 6 0.05498   j chr12 100100000
…如果您想恢复宽幅格式,可以:

# Back to wide format: stack chr/position into (var2, val), build the target
# column names ("chr_i", "position_j", ...) by uniting var2 with var, then
# spread them out again; convert = TRUE restores integer positions.
df_wide <- df_long %>%
  gather(var2, val, chr:position) %>%
  unite(var, var2, var) %>%
  spread(var, val, convert = TRUE)

df_wide
#> # A tibble: 3 x 5
#>    score chr_i chr_j position_i position_j
#>    <dbl> <chr> <chr>      <int>      <int>
#> 1 0.0550 chr12 chr12  100000000  100100000
#> 2 0.169  chr12 chr12  100000000  100050000
#> 3 0.333  chr12 chr12  100000000  100000000

一个使用 read.table 的 base R 方案:用 Map 遍历前两列,为 read.table 指定各列对应的 sep 以将其拆分成多列,用 cbind 合并得到的 list 输出,再用所需的列名(“nm1”)重命名这些列,最后与“score”列 cbind 在一起。


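# 注:原始代码在抓取时丢失,以下是根据上文描述重建的示例
nm1 <- c("chr_firstside", "position_firstside",
         "chr_secondside", "position_secondside")
out <- do.call(cbind, Map(function(x, y) read.table(text = as.character(x), sep = y),
                          df[1:2], c("-", ".")))
names(out) <- nm1
cbind(out, df["score"])

另一种做法是使用 base R 的 strsplit: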

# Split the "chr-pos" keys in column i and the "chr.pos" keys in column j.
# Each key yields two pieces, so after unlist() every column of split_temp
# alternates chromosome (odd rows) and position (even rows).
# as.character() keeps this working when the key columns are factors, on
# which strsplit() would otherwise stop with "non-character argument".
split_temp <- sapply(
  lapply(
    converted[, 1:2],
    function(keys) strsplit(as.character(keys), "[\\.-]")
  ),
  unlist
)
# TRUE on even rows (positions), FALSE on odd rows (chromosomes).
row_pos <- seq_len(nrow(split_temp)) %% 2 == 0L
converted2 <- data.frame(chr_firstside       = split_temp[!row_pos, "i"],
                         position_firstside  = split_temp[row_pos, "i"],
                         chr_secondside      = split_temp[!row_pos, "j"],
                         position_secondside = split_temp[row_pos, "j"],
                         score               = converted$score)
print(converted2)
  chr_firstside position_firstside chr_secondside position_secondside   score
1         chr12          100000000          chr12           100000000 0.33300
2         chr12          100000000          chr12           100050000 0.16920
3         chr12          100000000          chr12           100100000 0.05498

我建议使用我的“splitstackshape”包中的 cSplit,它允许您提供一个分隔符向量,每个要拆分的列对应一个分隔符。

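演示:

library(splitstackshape)
cSplit(df, c("i", "j"), c("-", "."))
#      score   i_1       i_2   j_1       j_2
# 1: 0.33300 chr12 100000000 chr12 100000000
# 2: 0.16920 chr12 100000000 chr12 100050000
# 3: 0.05498 chr12 100000000 chr12 100100000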

使用
setcolorder
更改列顺序:

# Split i on "-" and j on ".", then reorder so the four new columns come first
# and the original first column (score) goes last. The trailing [] makes the
# data.table print after the by-reference reorder.
setcolorder(
  cSplit(indt = df, splitCols = c("i", "j"), sep = c("-", ".")),
  neworder = c(2:5, 1)
)[]
#      i_1       i_2   j_1       j_2   score
# 1: chr12 100000000 chr12 100000000 0.33300
# 2: chr12 100000000 chr12 100050000 0.16920
# 3: chr12 100000000 chr12 100100000 0.05498

这个很好用!有什么理由把分数排到第一列吗?
# Split the "chr-pos" keys in column i and the "chr.pos" keys in column j.
# Each key yields two pieces, so after unlist() every column of split_temp
# alternates chromosome (odd rows) and position (even rows).
# as.character() keeps this working when the key columns are factors, on
# which strsplit() would otherwise stop with "non-character argument".
split_temp <- sapply(
  lapply(
    converted[, 1:2],
    function(keys) strsplit(as.character(keys), "[\\.-]")
  ),
  unlist
)
# TRUE on even rows (positions), FALSE on odd rows (chromosomes).
row_pos <- seq_len(nrow(split_temp)) %% 2 == 0L
converted2 <- data.frame(chr_firstside       = split_temp[!row_pos, "i"],
                         position_firstside  = split_temp[row_pos, "i"],
                         chr_secondside      = split_temp[!row_pos, "j"],
                         position_secondside = split_temp[row_pos, "j"],
                         score               = converted$score)
print(converted2)
  chr_firstside position_firstside chr_secondside position_secondside   score
1         chr12          100000000          chr12           100000000 0.33300
2         chr12          100000000          chr12           100050000 0.16920
3         chr12          100000000          chr12           100100000 0.05498
library(splitstackshape)
# One separator per split column: "-" for i, "." for j.
cSplit(indt = df, splitCols = c("i", "j"), sep = c("-", "."))
#      score   i_1       i_2   j_1       j_2
# 1: 0.33300 chr12 100000000 chr12 100000000
# 2: 0.16920 chr12 100000000 chr12 100050000
# 3: 0.05498 chr12 100000000 chr12 100100000
# Split i on "-" and j on ".", then reorder so the four new columns come first
# and the original first column (score) goes last. The trailing [] makes the
# data.table print after the by-reference reorder.
setcolorder(
  cSplit(indt = df, splitCols = c("i", "j"), sep = c("-", ".")),
  neworder = c(2:5, 1)
)[]
#      i_1       i_2   j_1       j_2   score
# 1: chr12 100000000 chr12 100000000 0.33300
# 2: chr12 100000000 chr12 100050000 0.16920
# 3: chr12 100000000 chr12 100100000 0.05498