R 创建过渡频率表的快速方法

R 创建过渡频率表的快速方法,r,optimization,R,Optimization,我有一个表格显示了模式序列,用序列id表示,以及连续模式在几行上的有序值。图案的长度相同,但序列的长度不同) 我还没有对其进行基准测试,但以下解决方案似乎使用了基本的R函数,这些函数通常都很快。给定问题中的序列\u df: table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id, FUN = function(p) paste0(p[-length(p)], p[-1]))))

我有一个表格显示了模式序列,用序列id表示,以及连续模式在几行上的有序值。图案的长度相同,但序列的长度不同)


我还没有对其进行基准测试,但以下解决方案似乎使用了基本的R函数,这些函数通常都很快。给定问题中的
序列\u df

table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id,  FUN  = 
                    function(p) paste0(p[-length(p)], p[-1]))))
我正在使用
tapply
单独检查每个
序列id
,并使用
paste0
检查转换模式
unlist
将其全部放在一个大向量中,
table
可以计数。我不是舒尔,不管人们是否会称之为完全矢量化,但至少它没有嵌套的
for
循环,也没有正则表达式


它现在显然缺少将表类型转换为矩阵的代码。一旦我们知道它如何与其他解决方案进行运行时比较,并且取决于矩阵是否真的是随后操作的理想格式,就可以编写这个问题了。

好的,所以我查看了@Berhard和@user20650的解决方案,虽然我还没有认真验证结果,但看起来它们都完成了任务。 一个区别是,
tapply
不生成0频率转换,而
数据。表
生成0频率转换。
这两种解决方案都比预期的嵌套for/rbind示例更快。 因此,我已尝试对这两种方法进行基准测试,并且假设我没有错误地修改您的代码,data.table的速度是tapply的两倍多。
谢谢你们两位优雅的回答,谢谢

library(data.table)
library(tidyr)
library(microbenchmark)


patterns_sequences <- lapply(seq(1:8), function(id) {
                          length_sequence <- sample(3:10, 1);
                          do.call(paste0, replicate(3, sample(c("A", "B", "C"), length_sequence, TRUE), FALSE))
                          })

sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_sequences[[1]])),
                                          rep("b", length(patterns_sequences[[2]])),
                                          rep("c", length(patterns_sequences[[3]])),
                                          rep("d", length(patterns_sequences[[4]])),
                                          rep("e", length(patterns_sequences[[5]])),
                                          rep("f", length(patterns_sequences[[6]])),
                                          rep("g", length(patterns_sequences[[7]])),
                                          rep("h", length(patterns_sequences[[8]]))),
                          pattern = unlist(patterns_sequences))

build_frequency_table_base <- function(sequence_df) {

  ft0 <- as.data.frame(table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id,  FUN  =
                                             function(p) paste0(p[-length(p)], ",", p[-1])))), stringsAsFactors = FALSE)

  ft1 <- ft0 %>%
    tidyr::separate(Var1, c("from_pattern", "to_pattern"), ",")

  ft5 <- tidyr::spread(ft1, to_pattern, Freq, fill= 0)
  rownames(ft5) <- ft5$from_pattern
  ft5$from_pattern <- NULL
  ft5
}

build_frequency_table_dt <- function(sequence_df) {
  dt = as.data.table(sequence_df);
  dt[, pattern := factor(pattern)];
  dt[, pl := shift(pattern), by=sequence_id][ ,pl := factor(pl, level=levels(pattern))];
  res_dt <- with(dt, table(pl, pattern))
  res_dt <- as.data.frame.matrix(res_dt)

}

tictoc::tic("base")
res_base <- build_frequency_table_base(sequence_df)
tictoc::toc()


tictoc::tic("DT")
res_dt <- build_frequency_table_dt(sequence_df)
tictoc::toc()

(bench = microbenchmark::microbenchmark(
  build_frequency_table_base(sequence_df),
  res_dt <- build_frequency_table_dt(sequence_df),
  times=1000L
))
ggplot2::autoplot(bench)
库(data.table)
图书馆(tidyr)
图书馆(微基准)

我不太确定你想要什么:是沿着这些路线的吗。为每个ID2获取一个转换表。然后组合,即
sequence_df$pattern=因子(sequence_df$pattern);使用(sequence_df,Reduce(“+”),lappy(split(sequence_df,sequence_id),function(i)table(i$pattern[-nrow(i)],i$pattern[-1]))
如果正确的话,data.table可能会更快<代码>dt=as.data.table(序列_-df);dt[模式:=因子(模式)];dt[,pl:=shift(pattern),by=sequence_id][,pl:=factor(pl,level=levels(pattern));有了(dt,table(pl,pattern))
@user20650,你能把这个评论变成一个答案吗?代码将更具可读性,我将能够投票并接受答案。@Bernhard;我一直在等待op的回复,看看这是否符合预期。我还注意到,它与您的答案非常相似,因此,如果您认为我的任何评论是正确的,请随时将其合并useful@user20650我的答案甚至没有生成预期的矩阵,你的答案是。你好,伯恩哈德,虽然我理解这个概念,但我真的不理解p[-length(p)],p[-1]是如何工作的。好的,您将创建从1个模式到下一个模式的转换,但是p是什么?为什么是-length(p)和-1?我用一个以
=function(p)…
开头的匿名函数调用了
taply
。这就是
p
的起源。它是一个模式向量,
tapply
使用调用函数。希望,这是可以理解的。好吧,我缺少的是-x意味着在x位置放置元素,如果我理解正确的话。谢谢你抽出时间!我不太了解舒尔,当人们花费大量时间用tidyverse材料和不必要的管道等重新排列结果时,是否应该将其称为基础版本。也就是说,
数据。table
版本通常是最快的解决方案。在阅读看到结果的基准测试代码之前,我希望
data.table
版本是最快的。
freq_table <- c()

for (start_pattern in unique_patterns) {
  for (end_pattern in unique_patterns) {
    transition_pattern <- paste0(start_pattern, ',', end_pattern)
    sequence_holding_transition <- concat_sequence_df[grep(transition_pattern, concat_sequence_df$patterns_sequence),]
    if (nrow(sequence_holding_transition) < 1) {
      transition_frequency <- c(transition_pattern, 0)
    } else {
      concat_sequence_holding_transition <-  paste0(sequence_holding_transition$patterns_sequence, collapse = ",", sep="/")
      transition_pattern_positions <- gregexpr(pattern = transition_pattern, text = concat_sequence_holding_transition)[[1]]
      transition_frequency <- c(transition_pattern, length(transition_pattern_positions))
    }
    freq_table <- rbind(freq_table, transition_frequency)
  }
}


frequency_table <- data.frame(pattern_transition = freq_table[, 1], counts = freq_table[, 2])
frequency_table$pattern.from <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 1)
frequency_table$pattern.to <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 2)

frequency_table <- t(matrix(as.numeric(as.character(frequency_table$counts)), ncol=length(unique_patterns), nrow=length(unique_patterns)))
colnames(frequency_table) <- unique_patterns
rownames(frequency_table) <- unique_patterns
structure(
  list(
    ABC = c(1, 1, 0, 1, 0),
    BAC = c(0, 0, 1, 0, 0),
    BCC = c(0, 0, 0, 1, 0),
    BCD = c(2, 0, 0, 0, 0),
    CBA = c(1, 0, 0, 0, 0)
  ),
  row.names = c("ABC", "BAC", "BCC", "BCD", "CBA"),
  class = "data.frame"
)
table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id,  FUN  = 
                    function(p) paste0(p[-length(p)], p[-1]))))
library(data.table)
library(tidyr)
library(microbenchmark)


patterns_sequences <- lapply(seq(1:8), function(id) {
                          length_sequence <- sample(3:10, 1);
                          do.call(paste0, replicate(3, sample(c("A", "B", "C"), length_sequence, TRUE), FALSE))
                          })

sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_sequences[[1]])),
                                          rep("b", length(patterns_sequences[[2]])),
                                          rep("c", length(patterns_sequences[[3]])),
                                          rep("d", length(patterns_sequences[[4]])),
                                          rep("e", length(patterns_sequences[[5]])),
                                          rep("f", length(patterns_sequences[[6]])),
                                          rep("g", length(patterns_sequences[[7]])),
                                          rep("h", length(patterns_sequences[[8]]))),
                          pattern = unlist(patterns_sequences))

build_frequency_table_base <- function(sequence_df) {

  ft0 <- as.data.frame(table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id,  FUN  =
                                             function(p) paste0(p[-length(p)], ",", p[-1])))), stringsAsFactors = FALSE)

  ft1 <- ft0 %>%
    tidyr::separate(Var1, c("from_pattern", "to_pattern"), ",")

  ft5 <- tidyr::spread(ft1, to_pattern, Freq, fill= 0)
  rownames(ft5) <- ft5$from_pattern
  ft5$from_pattern <- NULL
  ft5
}

build_frequency_table_dt <- function(sequence_df) {
  dt = as.data.table(sequence_df);
  dt[, pattern := factor(pattern)];
  dt[, pl := shift(pattern), by=sequence_id][ ,pl := factor(pl, level=levels(pattern))];
  res_dt <- with(dt, table(pl, pattern))
  res_dt <- as.data.frame.matrix(res_dt)

}

tictoc::tic("base")
res_base <- build_frequency_table_base(sequence_df)
tictoc::toc()


tictoc::tic("DT")
res_dt <- build_frequency_table_dt(sequence_df)
tictoc::toc()

(bench = microbenchmark::microbenchmark(
  build_frequency_table_base(sequence_df),
  res_dt <- build_frequency_table_dt(sequence_df),
  times=1000L
))
ggplot2::autoplot(bench)