R:我可以每隔一个逗号拆分一次字符串吗?
假设我有dd,我想拆分cc列,最后得到一个targetddR 我可以每隔一个逗号分隔一个字符吗?,r,tidyverse,R,Tidyverse,假设我有dd,我想拆分cc列,最后得到一个targetdd 假设我有dd,我想拆分cc列,最后得到一个targetdd。一个简单的方法是用逗号拆分列,每两行分组,并使用toString汇总值 使用我们可以在一个字母后面拆分 library(dplyr) dd %>% tidyr::separate_rows(cc, sep = ",(?=[a-z]+)") # A tibble: 6 x 2 aa cc <chr> <chr> 1 we
假设我有dd,我想拆分cc列,最后得到一个targetdd。一个简单的方法是用逗号拆分列,然后每两行分为一组,并使用toString汇总各组的值。另外,使用tidyr::separate_rows,我们可以在后面紧跟小写字母的逗号处进行拆分:
library(dplyr)
# Break cc into one row per group by splitting at every comma that is
# immediately followed by lowercase letters (lookahead keeps the letters).
tidyr::separate_rows(dd, cc, sep = ",(?=[a-z]+)")
# A tibble: 6 x 2
aa cc
<chr> <chr>
1 we de,34
2 we ff,55u
3 we gf,55
4 dd de,34
5 qw de,34
6 qw ff,55u
基本R解决方案:
# Split each cc string into its "letters,digits" groups.
cc_uld <- lapply(strsplit(dd$cc, ","), function(tokens) {
  # Append a comma to every token containing a digit so it marks the end of a
  # group, glue all tokens together with spaces, then cut at those commas.
  # Each resulting piece looks like "de 34" (possibly with leading space).
  tagged <- ifelse(grepl("\\d+", tokens), paste0(tokens, ","), tokens)
  unlist(strsplit(paste0(tagged, collapse = " "), ","))
})
# Repeat each aa value once per group extracted from its cc string.
# lengths() always yields an integer vector, unlike sapply(x, length), which
# returns list() for empty input.
rolled_out_df <- data.frame(
  aa = rep(as.character(dd$aa), lengths(cc_uld)),
  cc = unlist(cc_uld),
  stringsAsFactors = FALSE
)
# Mirror the target format: trim the padding, then turn the inner whitespace
# back into a comma.
rolled_out_df$cc <- gsub("\\s+", ",", trimws(rolled_out_df$cc, "both"))
使用strsplit。正则表达式从@A.Suliman处借用
# Split each cc string at every comma followed by lowercase letters (the
# lookahead requires perl = TRUE), pair the pieces with the matching aa value
# via cbind, and stack the per-row matrices into a single data frame.
do.call(
  rbind.data.frame,
  Map(cbind, dd$aa, strsplit(dd$cc, ",(?=[a-z]+)", perl = TRUE))
)
# V1 V2
# we.1 we de,34
# we.2 we ff,55u
# we.3 we gf,55
# dd dd de,34
# qw.1 qw de,34
# qw.2 qw ff,55u
通用解法:字符串中sep每出现k次就拆分一次:
您可以先使用strsplit按sep拆分,然后将得到的元素每k个分为一组,再用sep重新连接。
# Keep the original cc column and add one row per "letters,digits" group:
# build a one-column tibble of groups for every row, then expand that
# list-column. unnest() lives in tidyr, so it is called with its namespace.
dd %>%
  mutate(
    res = purrr::map(
      cc,
      ~ tibble(cc_new = unlist(stringr::str_split(.x, ",(?=[a-z]+)")))
    )
  ) %>%
  tidyr::unnest(res)
# A tibble: 6 x 3
aa cc cc_new
<chr> <chr> <chr>
1 we de,34,ff,55u,gf,55 de,34
2 we de,34,ff,55u,gf,55 ff,55u
3 we de,34,ff,55u,gf,55 gf,55
4 dd de,34 de,34
5 qw de,34,ff,55u de,34
6 qw de,34,ff,55u ff,55u
# Split each cc string into its "letters,digits" groups.
cc_uld <- lapply(strsplit(dd$cc, ","), function(tokens) {
  # Append a comma to every token containing a digit so it marks the end of a
  # group, glue all tokens together with spaces, then cut at those commas.
  # Each resulting piece looks like "de 34" (possibly with leading space).
  tagged <- ifelse(grepl("\\d+", tokens), paste0(tokens, ","), tokens)
  unlist(strsplit(paste0(tagged, collapse = " "), ","))
})
# Repeat each aa value once per group extracted from its cc string.
# lengths() always yields an integer vector, unlike sapply(x, length), which
# returns list() for empty input.
rolled_out_df <- data.frame(
  aa = rep(as.character(dd$aa), lengths(cc_uld)),
  cc = unlist(cc_uld),
  stringsAsFactors = FALSE
)
# Mirror the target format: trim the padding, then turn the inner whitespace
# back into a comma.
rolled_out_df$cc <- gsub("\\s+", ",", trimws(rolled_out_df$cc, "both"))
# Split each cc string at every comma followed by lowercase letters (the
# lookahead requires perl = TRUE), pair the pieces with the matching aa value
# via cbind, and stack the per-row matrices into a single data frame.
do.call(
  rbind.data.frame,
  Map(cbind, dd$aa, strsplit(dd$cc, ",(?=[a-z]+)", perl = TRUE))
)
# V1 V2
# we.1 we de,34
# we.2 we ff,55u
# we.3 we gf,55
# dd dd de,34
# qw.1 qw de,34
# qw.2 qw ff,55u
#' Build a matrix of positions that chunks a vector into groups of `n`.
#'
#' Row `r` holds the positions of the `r`-th run of `n` consecutive elements
#' of `vec`. Slots in the final row that lie beyond `length(vec)` are `NA`.
#'
#' @param vec Vector whose positions are grouped; only its length is used.
#' @param n Group size; a single whole number >= 1.
#' @return A matrix with `ceiling(length(vec) / n)` rows and `n` columns whose
#'   column names are "1", ..., "n".
nGroup_indexes <- function(vec, n) {
  stopifnot(
    is.numeric(n), length(n) == 1L, !is.na(n), n >= 1, n %% 1 == 0
  )
  vec_len <- length(vec)
  n_rows <- ceiling(vec_len / n)
  # Lay out 1..(n_rows * n) row by row and blank the slots past the end of
  # vec. This needs no recycling cbind and also covers lengths that are
  # exact multiples of n, lengths shorter than n, n == 1 and empty input.
  positions <- seq_len(n_rows * n)
  positions[positions > vec_len] <- NA
  index_mat <- matrix(positions, nrow = n_rows, ncol = n, byrow = TRUE)
  colnames(index_mat) <- seq_len(n)
  index_mat
}
#' Chunk a vector into consecutive groups of `n` elements.
#'
#' The final group is shorter when `length(vec)` is not a multiple of `n`.
#' Genuine `NA` values in `vec` are kept, and an empty `vec` gives `list()`.
#'
#' @param vec Vector (or list) to chunk.
#' @param n Group size; a single whole number >= 1.
#' @return An unnamed list of sub-vectors of `vec`, in original order.
nGroups <- function(vec, n) {
  stopifnot(
    is.numeric(n), length(n) == 1L, !is.na(n), n >= 1, n %% 1 == 0
  )
  positions <- seq_along(vec)
  # Zero-based chunk number of every position: 0, 0, ..., 1, 1, ...
  group_id <- (positions - 1L) %/% n
  # Split positions rather than values so that no NA padding is ever created
  # and real NA values in vec cannot be mistaken for padding.
  chunks <- lapply(split(positions, group_id), function(idx) vec[idx])
  unname(chunks)
}
#' Split a string at every k-th occurrence of a separator.
#'
#' The string is cut at each `sep`, the pieces are gathered into consecutive
#' groups of `k` by `nGroups()`, and each group is joined again with `sep`.
#'
#' @param str Character string to split; only the first element is used.
#' @param sep Separator, treated as a literal string (not a regex).
#' @param k Number of pieces per group.
#' @return Character vector with one element per group.
split_every_kth <- function(str, sep, k) {
  # fixed = TRUE: sep is re-inserted verbatim by paste() below, so it has to
  # be matched verbatim too; read as a regex, separators such as "." or "|"
  # would split at the wrong places.
  pieces <- strsplit(str, sep, fixed = TRUE)[[1]]
  grouped <- nGroups(pieces, k)
  # vapply guarantees a character vector even when there are no groups.
  vapply(grouped, paste, character(1), collapse = sep)
}
# > split_every_kth("a,b,c,d,e,f,g", ",", 2)
# [1] "a,b" "c,d" "e,f" "g"
# > split_every_kth("a,b,c,d,e,f,g", ",", 3)
# [1] "a,b,c" "d,e,f" "g"