将规则字符串解析为data.frame的最快方法_R_Dataframe_Data.table

将规则字符串解析为data.frame的最快方法

r dataframe

将规则字符串解析为data.frame的最快方法,r,dataframe,data.table,R,Dataframe,Data.table,我有一个设置了字符串规则的数据集 R> input id rules 1 1 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0 2 2 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0 3 3 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.1

我有一个设置了字符串规则的数据集

R> input
   id                                   rules
1   1 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
2   2 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
3   3 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
4   4 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
5   5 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
6   6 1.11=>0;1.12=>0;1.13=>0;1.14=>1;1.15=>0
7   7 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
8   8 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
9   9 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
10 10 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
11 11 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
12 12 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
13 13 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
14 14 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
15 15 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0
16 16 1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0

将此规则拆分和合并为单独的列的最快方法是什么？预期结果：

R> res
   R1.11 R1.12 R1.13 R1.14 R1.15 id
1      0     0     0     0     0  1
2      0     0     0     0     0  2
3      0     0     0     0     0  3
4      0     0     0     0     0  4
5      0     0     0     0     0  5
6      0     0     0     1     0  6
7      0     0     0     0     0  7
8      0     0     0     0     0  8
9      0     0     0     0     0  9
10     0     0     0     0     0 10
11     0     0     0     0     0 11
12     0     0     0     0     0 12
13     0     0     0     0     0 13
14     0     0     0     0     0 14
15     0     0     0     0     0 15
16     0     0     0     0     0 16

要重现数据集，请参见下面的结构

输入数据结构：

input <- structure(
    list(id = 1:16,
         rules = c("1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>1;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0", 
                   "1.11=>0;1.12=>0;1.13=>0;1.14=>0;1.15=>0")), 
    .Names = c("id", "rules"),
    row.names = c(NA, -16L),
    class = "data.frame")

output <- structure(
    list(R1.11 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
         R1.12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
         R1.13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
         R1.14 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
         R1.15 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
         id = 1:16), 
    .Names = c("R1.11", "R1.12", "R1.13", "R1.14", "R1.15", "id"),
    class = "data.frame",
    row.names = c(NA, -16L))

input我可能会
library(splitstackshape)
res = cSplit(input, "rules", ";", "long")
res[, c("variable", "value") := tstrsplit(rules, "=>", type.convert=TRUE)]

# head(res)
#    id   rules variable value
# 1:  1 1.11=>0     1.11     0
# 2:  1 1.12=>0     1.12     0
# 3:  1 1.13=>0     1.13     0
# 4:  1 1.14=>0     1.14     0
# 5:  1 1.15=>0     1.15     0
# 6:  2 1.11=>0     1.11     0

我将在这里停止，数据格式为长格式，但您可以使用
wideres = dcast(res, id ~ paste0("R", variable), value.var="value")

# test that it's essentially correct:
fsetequal(wideres, setcolorder(data.table(output), names(wideres)))

为您编写了一个快速函数
my.Parser <- function(row){
  r <- unlist(strsplit(gsub(";","x",row[-1]),"x"))
  Values <- data.frame(t(gsub("*.*=>","",r)))
  colnames(Values) <- paste("R",gsub("=>*.*","",r,),sep="")
  Values <- cbind(id=row[1], Values)
  return(Values)
}

要应用于您的输入

Results <- apply(input,1,function(x) my.Parser(x))
Results <- do.call("rbind", Results)
Results

下面是一个基本的R方法：
input[3:7] <- 
 matrix(as.integer(sub(".*=>", "", unlist(strsplit(input$rules, ";", fixed=TRUE)))), 
         ncol=5, byrow = TRUE)

input[3:7]感谢您的回复。我的解决方案仅基于数据表包：
library(data.table)
parse_rules <- function(.data, id_col, rules_col, prefix = "R", sep1 = ";", sep2 = "=>", convert = TRUE) {
    if (is.numeric(id_col)) id_col <- names(.data)[id_col]
    if (is.numeric(rules_col)) rules_col <- names(.data)[rules_col]
    res <- .data[, tstrsplit(get(rules_col), split = sep1, fixed = TRUE)]
    res[, (id_col) := .data[[id_col]]]
    res <- melt(res, id.vars = id_col, measure.vars = setdiff(names(res), id_col))
    res[, c("variable", "value") := tstrsplit(value, split = sep2, fixed = TRUE, type.convert = convert)]
    res <- dcast(res, get(id_col) ~ paste0(prefix, variable), value.var = "value")
    setnames(res, "id_col", id_col)
    return(res)
}
setDT(input)
parse_rules(input, "id", "rules")

另一种基于基本功能的解决方案：
parse_rules <- function(.data, col, prefix = "R", sep1 = ";", sep2 = "=>", convert = TRUE) {
    n <- length(gregexpr(sep1, .data[1L, col])[[1]]) + 1L
    str <- unlist(strsplit(.data[[col]], sep1, fixed = TRUE))
    res <- matrix(sub(paste0(".*", sep2), "", str), ncol = n, byrow = TRUE)
    res <- as.data.frame(res, stringsAsFactors = FALSE)
    nm <- paste0(prefix, sub(paste0(sep2, ".*"), "", str[seq_len(n)]))
    names(res) <- nm
    if (convert && any(chr <- sapply(res, is.character))) {
        for (c in names(res)[chr])
            res[[c]] <- type.convert(res[[c]], as.is = TRUE)
    }
    .data[[col]] <- NULL
    res <- cbind(.data, res, stringsAsFactors = FALSE)
    res
}

parse_rules@DarshanBaral，是的。也许是一些有用的输入。
library(data.table)
parse_rules <- function(.data, id_col, rules_col, prefix = "R", sep1 = ";", sep2 = "=>", convert = TRUE) {
    if (is.numeric(id_col)) id_col <- names(.data)[id_col]
    if (is.numeric(rules_col)) rules_col <- names(.data)[rules_col]
    res <- .data[, tstrsplit(get(rules_col), split = sep1, fixed = TRUE)]
    res[, (id_col) := .data[[id_col]]]
    res <- melt(res, id.vars = id_col, measure.vars = setdiff(names(res), id_col))
    res[, c("variable", "value") := tstrsplit(value, split = sep2, fixed = TRUE, type.convert = convert)]
    res <- dcast(res, get(id_col) ~ paste0(prefix, variable), value.var = "value")
    setnames(res, "id_col", id_col)
    return(res)
}
setDT(input)
parse_rules(input, "id", "rules")

    id R1.11 R1.12 R1.13 R1.14 R1.15
 1:  1     0     0     0     0     0
 2:  2     0     0     0     0     0
 3:  3     0     0     0     0     0
 4:  4     0     0     0     0     0
 5:  5     0     0     0     0     0
 6:  6     0     0     0     1     0
 7:  7     0     0     0     0     0
 8:  8     0     0     0     0     0
 9:  9     0     0     0     0     0
10: 10     0     0     0     0     0
11: 11     0     0     0     0     0
12: 12     0     0     0     0     0
13: 13     0     0     0     0     0
14: 14     0     0     0     0     0
15: 15     0     0     0     0     0
16: 16     0     0     0     0     0

parse_rules <- function(.data, col, prefix = "R", sep1 = ";", sep2 = "=>", convert = TRUE) {
    n <- length(gregexpr(sep1, .data[1L, col])[[1]]) + 1L
    str <- unlist(strsplit(.data[[col]], sep1, fixed = TRUE))
    res <- matrix(sub(paste0(".*", sep2), "", str), ncol = n, byrow = TRUE)
    res <- as.data.frame(res, stringsAsFactors = FALSE)
    nm <- paste0(prefix, sub(paste0(sep2, ".*"), "", str[seq_len(n)]))
    names(res) <- nm
    if (convert && any(chr <- sapply(res, is.character))) {
        for (c in names(res)[chr])
            res[[c]] <- type.convert(res[[c]], as.is = TRUE)
    }
    .data[[col]] <- NULL
    res <- cbind(.data, res, stringsAsFactors = FALSE)
    res
}