R 如何将列更改为基于分隔符的值列表
我目前有一个数据框,看起来像这样:R 如何将列更改为基于分隔符的值列表,r,R,我目前有一个数据框,看起来像这样: SampleID Chrom Start End ID HSB275 chr1 243216377 243219494 ENST00000366542|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494;ENST00000366543|ENSG00000143702|protein_coding|protein
SampleID Chrom Start End ID
HSB275 chr1 243216377 243219494 ENST00000366542|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494;ENST00000366543|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494
HSB274 chr10 952208 979839 ENST00000381466|ENSG00000205740|antisense|processed_transcript,chr10,971146,979839
HSB272 chr10 1046378 1047984 ENST00000381344|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984;ENST00000491735|ENSG00000067064|processed_transcript|protein_coding,chr10,1046378,1047984;ENST00000427898|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984
HSB481 chr11 654157 655184 ENST00000527170|ENSG00000177030|nonsense_mediated_decay|protein_coding,chr11,654157,655184
我想做的是将ID
列缩减为“ENSGXXXXXXX”值的列表,这些值由“,”分隔,如果每行有多个值,那么它看起来就像下面的Genes
列:
期望的结果:
SampleID Chrom Start End Genes
HSB275 chr1 243216377 243219494 ENSG00000143702,ENSG00000143702
HSB274 chr10 952208 979839 ENSG00000205740
HSB272 chr10 1046378 1047984 ENSG00000067064,ENSG00000067064,ENSG00000067064
HSB481 chr11 654157 655184 ENSG00000177030
您没有固定的分隔符,但使用 strsplit 我们可以按不同的分隔符(`,`、`;`、`|`)拆分 ID 列,然后对于每个元素,只保留以“ENSG”开头的值,并删除其他值:
# Split each ID string on any of the delimiters `,`, `|` or `;`, keep only the
# fields that start with "ENSG", and join them into one ", "-separated string
# per row.
# as.character() guards against ID being a factor (strsplit() rejects
# non-character input); vapply() pins the result to a character vector even
# for a zero-row data frame, where sapply() would return an empty list.
vapply(
  strsplit(as.character(df$ID), ",|\\||;"),
  function(x) toString(grep("^ENSG", x, value = TRUE)),
  character(1)
)
#[1] "ENSG00000143702, ENSG00000143702"
#[2] "ENSG00000205740"
#[3] "ENSG00000067064, ENSG00000067064, ENSG00000067064"
#[4] "ENSG00000177030"
您没有固定的分隔符,但使用 strsplit 我们可以按不同的分隔符(`,`、`;`、`|`)拆分 ID 列,然后对于每个元素,只保留以“ENSG”开头的值,并删除其他值:
# Split each ID string on any of the delimiters `,`, `|` or `;`, keep only the
# fields that start with "ENSG", and join them into one ", "-separated string
# per row.
# as.character() guards against ID being a factor (strsplit() rejects
# non-character input); vapply() pins the result to a character vector even
# for a zero-row data frame, where sapply() would return an empty list.
vapply(
  strsplit(as.character(df$ID), ",|\\||;"),
  function(x) toString(grep("^ENSG", x, value = TRUE)),
  character(1)
)
#[1] "ENSG00000143702, ENSG00000143702"
#[2] "ENSG00000205740"
#[3] "ENSG00000067064, ENSG00000067064, ENSG00000067064"
#[4] "ENSG00000177030"
这里有一个tidyverse
选项
library(tidyverse)

# For each row: split ID into `;`-separated records, take the second
# `|`-separated field of every record, and join those fields with ", " into
# the new Genes column. The original ID column is then dropped.
df %>%
  mutate(
    Genes = ID %>%
      str_split(";") %>%
      map_chr(function(records) {
        fields <- str_split(records, "\\|")
        toString(map(fields, 2))
      })
  ) %>%
  select(-ID)
# SampleID Chrom Start End
#1 HSB275 chr1 243216377 243219494
#2 HSB274 chr10 952208 979839
#3 HSB272 chr10 1046378 1047984
#4 HSB481 chr11 654157 655184
# Genes
#1 ENSG00000143702, ENSG00000143702
#2 ENSG00000205740
#3 ENSG00000067064, ENSG00000067064, ENSG00000067064
#4 ENSG00000177030
样本数据
df
这里有一个 tidyverse 选项
library(tidyverse)

# For each row: split ID into `;`-separated records, take the second
# `|`-separated field of every record, and join those fields with ", " into
# the new Genes column. The original ID column is then dropped.
df %>%
  mutate(
    Genes = ID %>%
      str_split(";") %>%
      map_chr(function(records) {
        fields <- str_split(records, "\\|")
        toString(map(fields, 2))
      })
  ) %>%
  select(-ID)
# SampleID Chrom Start End
#1 HSB275 chr1 243216377 243219494
#2 HSB274 chr10 952208 979839
#3 HSB272 chr10 1046378 1047984
#4 HSB481 chr11 654157 655184
# Genes
#1 ENSG00000143702, ENSG00000143702
#2 ENSG00000205740
#3 ENSG00000067064, ENSG00000067064, ENSG00000067064
#4 ENSG00000177030
样本数据
df
library(dplyr)
library(stringr) # str_extract_all
df %>% group_by(SampleID) %>% # 如果不喜欢 group_by,请使用 rowwise()
mutate(Genes = paste(str_extract_all(ID, 'ENSG\\d+', simplify = T), collapse = ',')) %>%
select(-ID)
# A tibble: 4 x 5
# Groups:   SampleID [4]
SampleID Chrom Start End Genes
1 HSB275 chr1 243216377 243219494 ENSG00000143702,ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064,ENSG00000067064,ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
library(dplyr)
library(stringr) # str_extract_all
df %>% group_by(SampleID) %>% # 如果不喜欢 group_by,请使用 rowwise()
mutate(Genes = paste(str_extract_all(ID, 'ENSG\\d+', simplify = T), collapse = ',')) %>%
select(-ID)
# A tibble: 4 x 5
# Groups:   SampleID [4]
SampleID Chrom Start End Genes
1 HSB275 chr1 243216377 243219494 ENSG00000143702,ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064,ENSG00000067064,ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
我的尝试:
# Replace ID with a ", "-separated string of every "ENSG<digits>" match found
# in it.
# str_extract_all() returns one character vector per row; toString() joins
# each vector directly, so there is no need to deparse the list and strip
# `c`, quotes and parentheses afterwards. That approach turned a row with no
# match into a mangled remnant of "character(0)"; here such a row yields "".
# mutate() is used instead of mutate_at()/funs(), which dplyr has superseded
# and deprecated respectively.
genes %>%
  mutate(
    ID = vapply(
      str_extract_all(ID, "ENSG[:digit:]*"),
      toString,
      character(1)
    )
  )
# A tibble: 4 x 5
SampleID Chrom Start End ID
<chr> <chr> <dbl> <dbl> <chr>
1 HSB275 chr1 243216377 243219494 ENSG00000143702, ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
genes %>%
mutate_at(vars(ID), funs(str_extract_all(., "ENSG[:digit:]*") %>%
str_replace_all("c|\"|\\(|\\)", "")))
# A tibble: 4 x 5
SampleID Chrom Start End ID
1 HSB275 chr1 243216377 243219494 ENSG00000143702, ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
这将查找与ENSG
匹配的任何模式,然后将列表强制为相关字符串的向量,并整理所有不需要的字符
虽然我个人本着整洁数据的精神,但我会将每个“ID”放在一个单独的列中,复制相关的样本ID/Chrom/Start/End数据。我的尝试:
# Replace ID with a ", "-separated string of every "ENSG<digits>" match found
# in it.
# str_extract_all() returns one character vector per row; toString() joins
# each vector directly, so there is no need to deparse the list and strip
# `c`, quotes and parentheses afterwards. That approach turned a row with no
# match into a mangled remnant of "character(0)"; here such a row yields "".
# mutate() is used instead of mutate_at()/funs(), which dplyr has superseded
# and deprecated respectively.
genes %>%
  mutate(
    ID = vapply(
      str_extract_all(ID, "ENSG[:digit:]*"),
      toString,
      character(1)
    )
  )
# A tibble: 4 x 5
SampleID Chrom Start End ID
<chr> <chr> <dbl> <dbl> <chr>
1 HSB275 chr1 243216377 243219494 ENSG00000143702, ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
genes %>%
mutate_at(vars(ID), funs(str_extract_all(., "ENSG[:digit:]*") %>%
str_replace_all("c|\"|\\(|\\)", "")))
# A tibble: 4 x 5
SampleID Chrom Start End ID
1 HSB275 chr1 243216377 243219494 ENSG00000143702, ENSG00000143702
2 HSB274 chr10 952208 979839 ENSG00000205740
3 HSB272 chr10 1046378 1047984 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 HSB481 chr11 654157 655184 ENSG00000177030
这将查找与ENSG
匹配的任何模式,然后将列表强制为相关字符串的向量,并整理所有不需要的字符
虽然我个人本着整洁数据的精神,但我会将每个“ID”放在一个单独的列中,复制相关的样本ID/Chrom/Start/End数据。数据
# Sample data: one row per sample region. The ID column holds one or more
# `;`-separated annotation records of the form
# transcript|gene|type|type,chrom,start,end.
# stringsAsFactors = FALSE keeps ID as character on R < 4.0 as well, so that
# strsplit() and the stringr functions can be applied to it directly.
df <- read.table(
  text =
"SampleID Chrom Start End ID
HSB275 chr1 243216377 243219494 ENST00000366542|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494;ENST00000366543|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494
HSB274 chr10 952208 979839 ENST00000381466|ENSG00000205740|antisense|processed_transcript,chr10,971146,979839
HSB272 chr10 1046378 1047984 ENST00000381344|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984;ENST00000491735|ENSG00000067064|processed_transcript|protein_coding,chr10,1046378,1047984;ENST00000427898|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984
HSB481 chr11 654157 655184 ENST00000527170|ENSG00000177030|nonsense_mediated_decay|protein_coding,chr11,654157,655184",
  header = TRUE,
  stringsAsFactors = FALSE
)
df
是:
SampleID Chrom Start End
1 HSB275 chr1 243216377 243219494
2 HSB274 chr10 952208 979839
3 HSB272 chr10 1046378 1047984
4 HSB481 chr11 654157 655184
ID
1 ENSG00000143702, ENSG00000143702
2 ENSG00000205740
3 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 ENSG00000177030
与任何使用 strsplit、sapply 和 lapply 的解决方案相比,此解决方案在大型表上的速度更快。
数据
# Sample data: one row per sample region. The ID column holds one or more
# `;`-separated annotation records of the form
# transcript|gene|type|type,chrom,start,end.
# stringsAsFactors = FALSE keeps ID as character on R < 4.0 as well, so that
# strsplit() and the stringr functions can be applied to it directly.
df <- read.table(
  text =
"SampleID Chrom Start End ID
HSB275 chr1 243216377 243219494 ENST00000366542|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494;ENST00000366543|ENSG00000143702|protein_coding|protein_coding,chr1,243216377,243219494
HSB274 chr10 952208 979839 ENST00000381466|ENSG00000205740|antisense|processed_transcript,chr10,971146,979839
HSB272 chr10 1046378 1047984 ENST00000381344|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984;ENST00000491735|ENSG00000067064|processed_transcript|protein_coding,chr10,1046378,1047984;ENST00000427898|ENSG00000067064|protein_coding|protein_coding,chr10,1046378,1047984
HSB481 chr11 654157 655184 ENST00000527170|ENSG00000177030|nonsense_mediated_decay|protein_coding,chr11,654157,655184",
  header = TRUE,
  stringsAsFactors = FALSE
)
df
是:
SampleID Chrom Start End
1 HSB275 chr1 243216377 243219494
2 HSB274 chr10 952208 979839
3 HSB272 chr10 1046378 1047984
4 HSB481 chr11 654157 655184
ID
1 ENSG00000143702, ENSG00000143702
2 ENSG00000205740
3 ENSG00000067064, ENSG00000067064, ENSG00000067064
4 ENSG00000177030
与任何使用 strsplit、sapply 和 lapply 的解决方案相比,此解决方案在大型表上的速度更快。
谢谢!如果我想让 Genes 列中的值只保留唯一值(去重)该怎么办?
@claudiadast 我想你会遇到这种情况。那就用 unique 把它包起来:
sapply(strsplit(df$ID, ",|\\||;"), function(x) toString(unique(grep("^ENSG", x, value = T))))
谢谢!如果我想让 Genes 列中的值只保留唯一值(去重)该怎么办?
@claudiadast 我想你会遇到这种情况。那就用 unique 把它包起来:
sapply(strsplit(df$ID, ",|\\||;"), function(x) toString(unique(grep("^ENSG", x, value = T))))