R 从字符串中提取数字并复制其关联值

R 从字符串中提取数字并复制其关联值,r,R,我目前有以下代码: args <- commandArgs(TRUE) args[1] <- "H2SO4" components <- gsub('([[:upper:]])', ' \\1', args[1]) components <- c(unlist(strsplit(components, " ")))[-1] 现在,我如何将数字与字母分开,并将该类型的字母数量与提取的数字进行比较。 因此,输出将如下所示: [1] "H" "H" "S" "O" "O"

我目前有以下代码:

# Take the formula from the command line; the first argument is overwritten
# with a fixed example so the snippet can be run as-is.
args <- commandArgs(trailingOnly = TRUE)
args[1] <- "H2SO4"

# Put a space in front of every capital letter, split on the spaces, and drop
# the empty first piece. Each remaining token is an element symbol followed by
# its optional count.
components <- strsplit(gsub("([[:upper:]])", " \\1", args[1]), " ")[[1]][-1]
现在,我如何将数字与字母分开,并按提取出的数字重复相应的字母?也就是说,输出应如下所示:

[1] "H" "H" "S" "O" "O" "O" "O"

我们可以使用
gsubfn
按字符串的数字部分复制字符,然后使用
str_extract_all
提取字符

library(gsubfn)
library(stringr)
# Single-letter symbols: gsubfn() passes each captured symbol (x) and count (y)
# to the formula, which repeats the symbol; str_extract_all() then pulls the
# individual capital letters back out of the expanded string.
expanded1 <- gsubfn("(\\D)(\\d+)", ~ rep(x, y), str1)
str_extract_all(expanded1, "[A-Z]")[[1]]
#[1] "H" "H" "S" "O" "O" "O" "O"

expanded2 <- gsubfn("(\\D)(\\d+)", ~ rep(x, y), str2)
str_extract_all(expanded2, "[A-Z]")[[1]]
#[1] "C" "C" "C" "C" "H" "H" "H" "H" "H" "H" "H" "H" "H" "H" "O"

# Multi-letter symbols: capture a capital letter plus any lowercase letters as
# one symbol, and extract symbols with the same shape afterwards.
expanded3 <- gsubfn("([A-Z][a-z]*)(\\d+)", ~ rep(x, y), str3)
str_extract_all(expanded3, "[A-Z][a-z]*")[[1]]
#[1] "Fe" "Fe" "Fe"

expanded4 <- gsubfn("([A-Z][a-z]*)(\\d+)", ~ rep(x, y), str4)
str_extract_all(expanded4, "[A-Z][a-z]*")[[1]]
#[1] "Fe" "Fe" "O"  "O"  "O" 
数据
(示例字符串 str1–str4 的定义在此处缺失;根据上面的输出推断,它们分别是 "H2SO4"、"C4H10O"、"Fe3" 和 "Fe2O3"。)

这正是 inverse.rle 函数的作用,您只需将输入转换为正确的格式:

# Count for each token: keep only its digits. A token without digits (e.g.
# "S") becomes "", which as.numeric() turns into NA; such elements occur once.
repetitions <- as.numeric(gsub("\\D", "", components))
repetitions[is.na(repetitions)] <- 1

# Symbol for each token: the token with its digits removed. Splitting on
# digits rather than on upper-case letters keeps multi-letter symbols such as
# "Fe" intact (removing only capitals from "Fe3" would leave "e3" as the count
# and "F" as the symbol).
rle <- list(lengths = repetitions, values = gsub("\\d", "", components))

# inverse.rle() repeats each value by its length, i.e. expands the formula.
inverse.rle(rle)
当然,您也可以重新发明
inverse.rle
的功能,正如另一个答案所示,这并不难。但是,组合现有工具而不是重新发明它们通常是一个好主意(要明确的是:我不推荐我的答案胜过akrun的答案,akrun的答案更简洁、直接,并且可能更有效;但是,了解您可以使用的工具是一个好主意)。

再来一次尝试

# Example formula with a two-digit count.
args <- "H2SO10"

# Put a space in front of every capital letter, split on the spaces, and drop
# the empty first piece. Each remaining token is an element symbol followed by
# its optional count.
components <- strsplit(gsub("([[:upper:]])", " \\1", args), " ")[[1]][-1]

# Expand one formula token into repeated element symbols.
#
# x: a single string made of an element symbol optionally followed by a
#    count, e.g. "H2", "O10", "Fe3" or "S".
# Returns the symbol repeated `count` times. A token that is not "non-digits
# followed by digits" (e.g. "S") is returned unchanged.
f <- function(x) {
  # Capture the whole non-digit symbol and the whole trailing number, so that
  # multi-letter symbols ("Fe3") and multi-digit counts ("O10") both work.
  parts <- regmatches(x, regexec("^(\\D+)(\\d+)$", x))[[1]]
  if (length(parts) == 3) {
    rep(parts[2], as.numeric(parts[3]))
  } else {
    x
  }
}

# Expand just the first token as a quick check.
f(components[1])
# [1] "H" "H"

# Expand every token and flatten the pieces into one character vector.
# lapply() always returns a list, so the result stays a plain vector even when
# all tokens expand to the same length; Vectorize() goes through mapply(),
# which would simplify that case to a matrix that unlist() leaves untouched.
unlist(lapply(components, f))
# [1] "H" "H" "S" "O" "O" "O" "O" "O" "O" "O" "O" "O" "O"
以下是一种 dplyr 方式:

library(stringi)
library(tidyr)
library(plyr)
library(dplyr)

# Example input: one chemical formula per row. tibble() is used instead of
# the deprecated data_frame() constructor; dplyr re-exports it.
chemicals <- tibble(chemical = c("H2SO4", "C4H10O"))

# One row per (chemical, element) pair, holding the element symbol and the
# number of times it occurs.
elements <-
  chemicals %>%
  # Insert a space before every capital letter and split on spaces, giving a
  # list column of "symbol + optional count" tokens for each chemical.
  mutate(
    element_number = stri_split_fixed(
      stri_replace_all_regex(chemical, "([A-Z])", " $1"),
      " "
    )
  ) %>%
  # One row per token; the split leaves an empty leading token, so drop it.
  unnest(element_number) %>%
  filter(element_number != "") %>%
  mutate(
    # Symbol: the token with its digits removed.
    element = stri_replace_all_regex(element_number, "[0-9]", ""),
    # Count: the digits only. Tokens without digits convert to NA, which is
    # mapped to 1.
    number = mapvalues(
      as.numeric(stri_replace_all_regex(element_number, "[^0-9]", "")),
      NA,
      1
    )
  ) %>%
  select(-element_number)

# Repeat each element symbol `number` times: build the repeated vector for
# every row as a list column, then unnest it into one row per atom.
long_elements <-
  elements %>%
  rowwise() %>%
  mutate(result = list(rep(element, number))) %>%
  unnest(result)

噢,刚发现一个问题……如果数字是两位数,它只取第一位。

@LoneCowCoder 请尝试将 \\d 更改为 \\d+。我已更新代码。

啊……谢谢,忘记了这个表达式。我们在测试时发现,对于 Fe3 这样的输入,gsubfn 会返回 Fc(e,e,e),而不是 c(Fe,Fe,Fe)。我已经修改了代码,以便处理像 Fe 这样后面没有数字的情况。

@LoneCowCoder 已更新帖子。我还没有对所有情况进行测试,但它现在可以处理您展示的示例。

@KonradRudolph 这与您的帖子无关 :-)
# Example formula with a two-digit count.
args <- "H2SO10"

# Put a space in front of every capital letter, split on the spaces, and drop
# the empty first piece. Each remaining token is an element symbol followed by
# its optional count.
components <- strsplit(gsub("([[:upper:]])", " \\1", args), " ")[[1]][-1]

# Expand one formula token into repeated element symbols.
#
# x: a single string made of an element symbol optionally followed by a
#    count, e.g. "H2", "O10", "Fe3" or "S".
# Returns the symbol repeated `count` times. A token that is not "non-digits
# followed by digits" (e.g. "S") is returned unchanged.
f <- function(x) {
  # Capture the whole non-digit symbol and the whole trailing number, so that
  # multi-letter symbols ("Fe3") and multi-digit counts ("O10") both work.
  parts <- regmatches(x, regexec("^(\\D+)(\\d+)$", x))[[1]]
  if (length(parts) == 3) {
    rep(parts[2], as.numeric(parts[3]))
  } else {
    x
  }
}

# Expand just the first token as a quick check.
f(components[1])
# [1] "H" "H"

# Expand every token and flatten the pieces into one character vector.
# lapply() always returns a list, so the result stays a plain vector even when
# all tokens expand to the same length; Vectorize() goes through mapply(),
# which would simplify that case to a matrix that unlist() leaves untouched.
unlist(lapply(components, f))
# [1] "H" "H" "S" "O" "O" "O" "O" "O" "O" "O" "O" "O" "O"
library(stringi)
library(tidyr)
library(plyr)
library(dplyr)

# Example input: one chemical formula per row. tibble() is used instead of
# the deprecated data_frame() constructor; dplyr re-exports it.
chemicals <- tibble(chemical = c("H2SO4", "C4H10O"))

# One row per (chemical, element) pair, holding the element symbol and the
# number of times it occurs.
elements <-
  chemicals %>%
  # Insert a space before every capital letter and split on spaces, giving a
  # list column of "symbol + optional count" tokens for each chemical.
  mutate(
    element_number = stri_split_fixed(
      stri_replace_all_regex(chemical, "([A-Z])", " $1"),
      " "
    )
  ) %>%
  # One row per token; the split leaves an empty leading token, so drop it.
  unnest(element_number) %>%
  filter(element_number != "") %>%
  mutate(
    # Symbol: the token with its digits removed.
    element = stri_replace_all_regex(element_number, "[0-9]", ""),
    # Count: the digits only. Tokens without digits convert to NA, which is
    # mapped to 1.
    number = mapvalues(
      as.numeric(stri_replace_all_regex(element_number, "[^0-9]", "")),
      NA,
      1
    )
  ) %>%
  select(-element_number)

# Repeat each element symbol `number` times: build the repeated vector for
# every row as a list column, then unnest it into one row per atom.
long_elements <-
  elements %>%
  rowwise() %>%
  mutate(result = list(rep(element, number))) %>%
  unnest(result)