如何知道一列中每个观察的频率,并按r排序?

如何知道一列中每个观察的频率,并按r排序?,r,sorting,R,Sorting,我有一个列,每一行都是一个字符串。 我想找到 1.每个序列的频率 2.按频率从高到低对结果进行排序 3.如果多个字符串的频率相同,我会按顺序的字母表对它们进行排序 我的数据看起来像 ID seq 1 1 BBBBBBIRBBRBBBB 2 2 BBBBBBIRRRRRBBB 3 3 BBBBBBIRRRRRRRR 4 4 BBBBBBITBBBBBBB 5 5 BBBBBBITBBBRBBX 6 6 BBBBBBITTTTBBCX 7 7

我有一个列,每一行都是一个字符串。 我想找到 1.每个序列的频率 2.按频率从高到低对结果进行排序 3.如果多个字符串的频率相同,我会按顺序的字母表对它们进行排序

我的数据看起来像

   ID             seq
1   1 BBBBBBIRBBRBBBB
2   2 BBBBBBIRRRRRBBB
3   3 BBBBBBIRRRRRRRR
4   4 BBBBBBITBBBBBBB
5   5 BBBBBBITBBBRBBX
6   6 BBBBBBITTTTBBCX
7   7 BBBBBBITTTTTTTT
8   8 BBBBBBOBBBBBBTX
9   9 BBBBBBOBBBBBBXB
10 10 BBBBBBIRBBRBBBB
11 11 BBBBBBIRRRRRBBB
12 12 BBBBBBIRRRRRRRR
13 13 BBBBBBITBBBBBBB
14 14 BBBBBBITBBBRBBX
15 15 BBBBBBIRBBRBBBB
16 16 BBBBBBIRRRRRBBB
17 17 BBBBBBIRRRRRRRR
18 18 BBBBBBIRBBRBBBB
19 19 BBBBBBIRRRRRBBB
20 20 BBBBBBIRRRRRBBB

ID<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20)
seq<-c('BBBBBBIRBBRBBBB','BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB', 'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBITTTTBBCX',  'BBBBBBITTTTTTTT',  'BBBBBBOBBBBBBTX',  'BBBBBBOBBBBBBXB',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB')
data.frame(ID,seq)

提前谢谢

我喜欢
dplyr

install.packages('dplyr')
library(dplyr)

df <- group_by(df, seq)
df <- count(df, seq)

count(df, seq)
Source: local data frame [9 x 2]

          seq     n
       (fctr) (int)
 1 BBBBBBIRBBRBBBB     4
 2 BBBBBBIRRRRRBBB     4
 3 BBBBBBIRRRRRRRR     3
 4 BBBBBBITBBBBBBB     2
 5 BBBBBBITBBBRBBX     2
 6 BBBBBBITTTTBBCX     1
 7 BBBBBBITTTTTTTT     1
 8 BBBBBBOBBBBBBTX     1
 9 BBBBBBOBBBBBBXB     1
install.packages('dplyr'))
图书馆(dplyr)

df如果您想对排序和名称施加更多控制,可以使用以下dplyr函数

library(dplyr)
# assumes df is a data frame with seq and ID columns
df %>% 
  group_by(sequence = seq) %>% 
  summarize(frequency = length(ID)) %>% 
  arrange(-frequency)

可以使用
数据执行此操作。表

library(data.table)

setDT(df)[, .N, by = seq][order(-N)]
值得注意的是,
data.table
在不同样本大小下的速度始终优于
dplyr

顶部的数字是原始样本重复的次数

下面是要复制的代码:

library(data.table)
library(dplyr)
dtWay <- function(ID, seq) {
  dt <- data.table(ID, seq);
  setkey(dt, seq);
  return(dt[, .N, by = seq][order(-N)])
}
dplyrWay <- function(ID, seq) {
  df <- data.frame(ID, seq)
  res <- df %>% 
    dplyr::group_by(seq) %>% 
    dplyr::summarize(frequency = length(ID)) %>% 
    dplyr::arrange(desc(frequency)) %>%
    dplyr::rename(sequence = seq)
  return (res)
}

N <- c(3, 4, 5, 6)
n <- 10^N

library(microbenchmark)
dev.off()
par( mfrow = c( 2, 2 ) )
res <- lapply(n, function(x) {

  ID <-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19)
  ID <- rep(ID, times = x)
  seq<-c('BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',   'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBITTTTBBCX',  'BBBBBBITTTTTTTT',  'BBBBBBOBBBBBBTX',  'BBBBBBOBBBBBBXB',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB')
  seq  <- rep(seq, times = x)

  m <- microbenchmark( "data.table" = dtWay(ID, seq),
                       "dplyr" = dplyrWay(ID, seq),
                       times = 10, unit = "sc")

  a <- boxplot(m, main = x, xlab ="", ylab = "time")
})
库(data.table)
图书馆(dplyr)

dtWay我认为您需要
聚合(ID~seq,df,length)
。它为您下单。
f谢谢您,乔戈。但这仍然使排序按“顺序字母表”和“频率”。但我想先按频率排序,然后再按字母排序。@TerenceTien After
f谢谢。但这使得排序按“顺序字母表”和“频率”进行。但我想先按频率排序,然后再按字母表排序。
library(data.table)
library(dplyr)
dtWay <- function(ID, seq) {
  dt <- data.table(ID, seq);
  setkey(dt, seq);
  return(dt[, .N, by = seq][order(-N)])
}
dplyrWay <- function(ID, seq) {
  df <- data.frame(ID, seq)
  res <- df %>% 
    dplyr::group_by(seq) %>% 
    dplyr::summarize(frequency = length(ID)) %>% 
    dplyr::arrange(desc(frequency)) %>%
    dplyr::rename(sequence = seq)
  return (res)
}

N <- c(3, 4, 5, 6)
n <- 10^N

library(microbenchmark)
dev.off()
par( mfrow = c( 2, 2 ) )
res <- lapply(n, function(x) {

  ID <-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19)
  ID <- rep(ID, times = x)
  seq<-c('BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',   'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBITTTTBBCX',  'BBBBBBITTTTTTTT',  'BBBBBBOBBBBBBTX',  'BBBBBBOBBBBBBXB',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBITBBBBBBB',  'BBBBBBITBBBRBBX',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB',  'BBBBBBIRRRRRRRR',  'BBBBBBIRBBRBBBB',  'BBBBBBIRRRRRBBB')
  seq  <- rep(seq, times = x)

  m <- microbenchmark( "data.table" = dtWay(ID, seq),
                       "dplyr" = dplyrWay(ID, seq),
                       times = 10, unit = "sc")

  a <- boxplot(m, main = x, xlab ="", ylab = "time")
})