R 从字符串中提取数字并复制其关联值
我目前有以下代码:
# Read the trailing command-line arguments, then overwrite the first slot with
# a fixed sample formula so the snippet behaves the same with or without them.
args <- commandArgs(trailingOnly = TRUE)
args[1] <- "H2SO4"
# Insert a space before every capital letter and cut on those spaces. The
# spaced text begins with a space, so the first piece is "" and [-1] drops it.
components <- strsplit(gsub("([[:upper:]])", " \\1", args[1]), " ")[[1]][-1]
现在,我如何将数字与字母分开,并按提取出的数字重复相应的字母?
因此,输出将如下所示:
[1] "H" "H" "S" "O" "O" "O" "O"
我们可以使用 gsubfn 按字符串中的数字部分重复相应的字符,然后使用 str_extract_all 提取字符:
# gsubfn provides gsubfn(); stringr provides str_extract_all().
library(gsubfn)
library(stringr)
# str1 .. str4 hold the input formula strings; their definitions are not part
# of this snippet.
#
# Single-letter symbols: the pattern captures one non-digit followed by one or
# more digits, and the replacement formula ~rep(x,y) is applied to each match
# (presumably x = the symbol, y = the count -- confirm against the gsubfn docs).
# str_extract_all() then collects every capital letter from the expanded text;
# [[1]] takes the matches for the first (only) input string.
str_extract_all(gsubfn('(\\D)(\\d+)', ~rep(x,y), str1),'[A-Z]')[[1]]
#[1] "H" "H" "S" "O" "O" "O" "O"
str_extract_all(gsubfn('(\\D)(\\d+)', ~rep(x,y), str2),'[A-Z]')[[1]]
#[1] "C" "C" "C" "C" "H" "H" "H" "H" "H" "H" "H" "H" "H" "H" "O"
# Multi-letter symbols: capture a capital plus any following lowercase letters
# as the symbol, and extract with the same capital+lowercase pattern so that
# "Fe" stays in one piece.
str_extract_all(gsubfn('([A-Z][a-z]*)(\\d+)', ~rep(x,y),
str3), '[A-Z][a-z]*')[[1]]
#[1] "Fe" "Fe" "Fe"
str_extract_all(gsubfn('([A-Z][a-z]*)(\\d+)', ~rep(x,y),
str4), '[A-Z][a-z]*')[[1]]
#[1] "Fe" "Fe" "O" "O" "O"
数据
str1 <- "H2SO4"; str2 <- "C4H10O"; str3 <- "Fe3"; str4 <- "Fe2O3"
这正是 inverse.rle 函数的作用,您只需将输入转换为正确的格式:
# Expand `components` (e.g. "H2", "S", "O4") by describing them as a run-length
# encoding and letting inverse.rle() do the repetition.
#
# Count: keep only the digits of each component. A component with no digits
# becomes "", which as.numeric() turns into NA; those count once.
# (Stripping only the uppercase letters instead would leave "e2" for "Fe2",
# giving a count of 1 and a truncated symbol.)
repetitions <- as.numeric(gsub('[^[:digit:]]', '', components))
repetitions[is.na(repetitions)] <- 1
# Symbol: everything except the digits, so multi-letter symbols such as "Fe"
# are kept whole.
rle <- list(lengths = repetitions, values = gsub('[[:digit:]]', '', components))
inverse.rle(rle)
当然,您也可以重新发明inverse.rle
的功能,正如另一个答案所示,这并不难。但是,组合现有工具而不是重新发明它们通常是一个好主意(要明确的是:我不推荐我的答案胜过akrun的答案,akrun的答案更简洁、直接,并且可能更有效;但是,了解您可以使用的工具是一个好主意)。再来一次尝试
# Sample formula with a two-digit count.
args <- "H2SO10"
# Insert a space before every capital letter and cut on those spaces. The
# spaced text begins with a space, so the first piece is "" and [-1] drops it.
components <- strsplit(gsub("([[:upper:]])", " \\1", args), " ")[[1]][-1]
# Expand one formula component into repeated element symbols.
#
# x: a single string -- an element symbol optionally followed by a count,
#    e.g. "H2", "S", "O10", "Fe3".
# Returns the symbol repeated `count` times, or x unchanged when it does not
# have the form <non-digits><digits>.
#
# The symbol and count are taken from one anchored match. Splitting on a
# zero-width lookahead peels off a single character at a time, so "Fe3" would
# come back as "F", "e", "3" and rep() would fail on a non-numeric count.
f <- function(x) {
  parts <- regmatches(x, regexec("^(\\D+)(\\d+)$", x))[[1]]
  # parts is c(full match, symbol, count) on success, character(0) otherwise.
  if (length(parts) == 3) {
    rep(parts[2], as.numeric(parts[3]))
  } else {
    x
  }
}
# Expand only the first component.
f(components[1])
# [1] "H" "H"
# Expand every component and flatten the pieces into one character vector.
# lapply() always returns a list, so the result stays a plain vector even when
# every component expands to the same length (a simplifying mapply()/Vectorize()
# call would return a matrix in that case, which unlist() leaves untouched).
unlist(lapply(components, f), use.names = FALSE)
# [1] "H" "H" "S" "O" "O" "O" "O" "O" "O" "O" "O" "O" "O"
以下是一种 dplyr 方式:
library(stringi)
library(tidyr)
library(plyr)
library(dplyr)
# Sample input: one formula per row.
chemicals <- data_frame(chemical = c("H2SO4", "C4H10O"))

# One row per (chemical, element) holding the element symbol and its count.
elements <- chemicals %>%
  mutate(
    # Prefix every capital letter with a space, then split on spaces; this
    # yields a list column with one character vector per formula.
    element_number = stri_split_fixed(
      stri_replace_all_regex(chemical, "([A-Z])", " $1"),
      " "
    )
  ) %>%
  unnest(element_number) %>%
  # The split leaves an empty first piece for each formula; drop it.
  filter(element_number != "") %>%
  mutate(
    # Symbol: the piece with its digits removed.
    element = stri_replace_all_regex(element_number, "[0-9]", ""),
    # Count: the piece with everything but digits removed; pieces without
    # digits convert to NA and are mapped to 1.
    number = mapvalues(
      as.numeric(stri_replace_all_regex(element_number, "[^0-9]", "")),
      NA,
      1
    )
  ) %>%
  select(-element_number)

# Repeat each element `number` times, row by row, then flatten to one row per
# repeated symbol.
long_elements <- elements %>%
  rowwise() %>%
  mutate(result = list(rep(element, number))) %>%
  unnest(result)
噢,刚发现一个问题……如果数字是两位数,它只取了第一位。
@LoneCowCoder 请尝试将 \\d 更改为 \\d+。我已更新了代码。
啊……谢谢,我忘了这个表达式。我们在测试时发现,对于 Fe3 这样的输入,gsubfn 会给出 F c(e,e,e),而不是 c(Fe,Fe,Fe)。我已经修改了代码,这样 Fe 后面没有数字时也能处理。
@LoneCowCoder 帖子已更新。我还没有对所有情况进行测试,但它现在可以处理您展示的示例。
@KonradRudolph 这与您的帖子无关 :-)
# Sample formula with a two-digit count.
args <- "H2SO10"
# Insert a space before every capital letter and cut on those spaces. The
# spaced text begins with a space, so the first piece is "" and [-1] drops it.
components <- strsplit(gsub("([[:upper:]])", " \\1", args), " ")[[1]][-1]
# Expand one formula component into repeated element symbols.
#
# x: a single string -- an element symbol optionally followed by a count,
#    e.g. "H2", "S", "O10", "Fe3".
# Returns the symbol repeated `count` times, or x unchanged when it does not
# have the form <non-digits><digits>.
#
# The symbol and count are taken from one anchored match. Splitting on a
# zero-width lookahead peels off a single character at a time, so "Fe3" would
# come back as "F", "e", "3" and rep() would fail on a non-numeric count.
f <- function(x) {
  parts <- regmatches(x, regexec("^(\\D+)(\\d+)$", x))[[1]]
  # parts is c(full match, symbol, count) on success, character(0) otherwise.
  if (length(parts) == 3) {
    rep(parts[2], as.numeric(parts[3]))
  } else {
    x
  }
}
# Expand only the first component.
f(components[1])
# [1] "H" "H"
# Expand every component and flatten the pieces into one character vector.
# lapply() always returns a list, so the result stays a plain vector even when
# every component expands to the same length (a simplifying mapply()/Vectorize()
# call would return a matrix in that case, which unlist() leaves untouched).
unlist(lapply(components, f), use.names = FALSE)
# [1] "H" "H" "S" "O" "O" "O" "O" "O" "O" "O" "O" "O" "O"
library(stringi)
library(tidyr)
library(plyr)
library(dplyr)
# Sample input: one formula per row.
chemicals <- data_frame(chemical = c("H2SO4", "C4H10O"))

# One row per (chemical, element) holding the element symbol and its count.
elements <- chemicals %>%
  mutate(
    # Prefix every capital letter with a space, then split on spaces; this
    # yields a list column with one character vector per formula.
    element_number = stri_split_fixed(
      stri_replace_all_regex(chemical, "([A-Z])", " $1"),
      " "
    )
  ) %>%
  unnest(element_number) %>%
  # The split leaves an empty first piece for each formula; drop it.
  filter(element_number != "") %>%
  mutate(
    # Symbol: the piece with its digits removed.
    element = stri_replace_all_regex(element_number, "[0-9]", ""),
    # Count: the piece with everything but digits removed; pieces without
    # digits convert to NA and are mapped to 1.
    number = mapvalues(
      as.numeric(stri_replace_all_regex(element_number, "[^0-9]", "")),
      NA,
      1
    )
  ) %>%
  select(-element_number)

# Repeat each element `number` times, row by row, then flatten to one row per
# repeated symbol.
long_elements <- elements %>%
  rowwise() %>%
  mutate(result = list(rep(element, number))) %>%
  unnest(result)