R 将值减去另一列中最近的特定字符串_R

R 将值减去另一列中最近的特定字符串

R 将值减去另一列中最近的特定字符串,r,R,假设我有一个数据df，如下所示。总共有20行，列string中有四种类型的字符串：“A”、“B”、“C”和“D” 通过从上一行减去列位置中的值，我可以通过执行以下命令获得第四列距离： df$distance <- ave(df$position, FUN=function(x) c(0, diff(x))) 然而，我希望得到的是每个字符串在列位置到最近的前一个字符串“C”的距离，例如下面7,8和17的变化： no string position distance 1 B

假设我有一个数据

df

，如下所示。总共有20行，列

string

中有四种类型的字符串：“A”、“B”、“C”和“D”

通过将列

position

中的每个值减去上一行的值，我可以通过执行以下命令获得第四列

distance

：

# Row-to-row difference in position; the first row has no predecessor and
# therefore gets 0.
df$distance <- ave(
  df$position,
  FUN = function(pos) {
    c(0, diff(pos))
  }
)

然而，我希望得到的是每一行在

position 列

中到最近的前一个字符串“C”的距离，例如下面第7、8和17行的变化：

no string position distance 1 B 650 0 2 C 651 1 3 B 659 8 4 C 660 1 5 C 662 2 6 B 663 1 7 D 668 6 8 D 670 8 9 C 671 1 10 B 672 1 11 C 673 1 12 A 681 8 13 C 682 1 14 B 683 1 15 C 684 1 16 D 690 6 17 A 692 8 18 C 693 1 19 D 694 1 20 C 695 1

我怎样才能做到？顺便问一下，我可以知道如何获得与列

string 中最近的下一个“C”的距离吗？
 这可能不是一个理想的解决方案，也许还有办法可以进一步简化：
# Taken from your code: distance to the immediately preceding row (0 for row 1)
df$distance <- ave(df$position, FUN = function(x) c(0, diff(x)))

# Logical values indicating occurrence of "C"
c_occur <- df$string == "C"

# Running count of "C" rows: every "C" opens a new group that lasts until the
# next "C"; group 0 holds the rows that come before the first "C"
c_group <- cumsum(c_occur)

# Position of each row inside its group. The counter is built from integer row
# indices rather than from df$string, so the result stays numeric and the
# comparison with 2 below is a numeric one
pos_in_group <- ave(seq_along(c_occur), c_group, FUN = seq_along)

# We can ignore the first two values in each group since the first value is
# "C" and the second value is correctly calculated from the previous row.
# Rows in group 0 have no previous "C" and keep their row-to-row distance.
# Get the indices where we need to replace the values
inds_to_replace <- which(c_group > 0 & pos_in_group > 2)

# Row index of every "C"; the k-th entry is the "C" that opens group k, i.e.
# the closest previous "C" for all rows of that group
c_inds <- which(c_occur)
c_to_replace <- c_inds[c_group[inds_to_replace]]
# To look up the nearest next "C" instead, use
# c_inds[c_group[inds_to_replace] + 1L]; it is NA after the last "C"

# Replace the values (a no-op when inds_to_replace is empty)
df$distance[inds_to_replace] <- df$position[inds_to_replace] -
  df$position[c_to_replace]

df[inds_to_replace, ]

#   no string position distance
#7   7      D      668        6
#8   8      D      670        8
#17 17      A      692        8

#取自您的代码
df$距离2）
#从inds_到_替换中获取最接近的“C”匹配项
c_to_replace
这是一种 data.table 方式：
# Start from the plain row-to-row difference in position (0 for the first row).
dtt[, distance := c(0, diff(position))]

# Group rows by the running count of "C" rows, skipping the rows that come
# before the first "C". Inside a group the first row is the "C" itself and
# keeps the distance computed above; every other row is measured from that "C".
dtt[
  cumsum(string == "C") > 0,
  distance := {
    from_c <- position - position[1L]
    from_c[1L] <- distance[1L]
    from_c
  },
  by = cumsum(string == "C")
]

#     no string position distance
#  1:  1      B      650        0
# 2:  2      C      651        1
# 3:  3      B      659        8
# 4:  4      C      660        1
# 5:  5      C      662        2
# 6:  6      B      663        1
# 7:  7      D      668        6
# 8:  8      D      670        8
# 9:  9      C      671        1
# 10: 10      B      672        1
# 11: 11      C      673        1
# 12: 12      A      681        8
# 13: 13      C      682        1
# 14: 14      B      683        1
# 15: 15      C      684        1
# 16: 16      D      690        6
# 17: 17      A      692        8
# 18: 18      C      693        1
# 19: 19      D      694        1
# 20: 20      C      695        1

这里是dtt的数据：
# Reproducible definition of the example table `dtt`.
# The raw dput() text of a data.table ends in
# `.internal.selfref = <pointer: ...>`, which is not valid R syntax and cannot
# be pasted back into a session, and it was never assigned to a name. The same
# 20 rows are therefore built directly with data.table::data.table().
dtt <- data.table::data.table(
  no = 1:20,
  string = c(
    "B", "C", "B", "C", "C", "B", "D", "D", "C", "B",
    "C", "A", "C", "B", "C", "D", "A", "C", "D", "C"
  ),
  position = c(
    650L, 651L, 659L, 660L, 662L, 663L, 668L, 670L, 671L, 672L,
    673L, 681L, 682L, 683L, 684L, 690L, 692L, 693L, 694L, 695L
  )
)

以下tidyverse
方法再现了您的预期输出
问题描述：计算当前行的位置
与最近的前一个string=“C”行的位置之差；如果没有先前的string=“C”
行，或者该行本身有string=“C”
，则距离由当前行和前一行之间的位置差给出（与string
无关）
library(tidyverse)

# Distance of every row to the nearest previous "C" row.
# Rows that are themselves "C", and rows that come before the first "C", fall
# back to the plain difference to the previous row (0 for the very first row).
df %>%
  mutate(
    # Row-to-row difference, computed before grouping so that it can look
    # across group boundaries
    .step_prev = c(0, diff(position)),
    # Running count of "C" rows; 0 means no "C" has been seen yet
    nC = cumsum(string == "C")
  ) %>%
  group_by(nC) %>%
  mutate(
    dist = if_else(
      nC == 0 | row_number() == 1,
      .step_prev,
      # The first row of each group with nC > 0 is the "C" that opens it
      as.numeric(position - first(position))
    )
  ) %>%
  ungroup() %>%
  select(-nC, -.step_prev)
## A tibble: 20 x 4
#      no string position  dist
#   <int> <fct>     <int> <dbl>
# 1     1 B           650    0.
# 2     2 C           651    1.
# 3     3 B           659    8.
# 4     4 C           660    1.
# 5     5 C           662    2.
# 6     6 B           663    1.
# 7     7 D           668    6.
# 8     8 D           670    8.
# 9     9 C           671    1.
#10    10 B           672    1.
#11    11 C           673    1.
#12    12 A           681    8.
#13    13 C           682    1.
#14    14 B           683    1.
#15    15 C           684    1.
#16    16 D           690    6.
#17    17 A           692    8.
#18    18 C           693    1.
#19    19 D           694    1.
#20    20 C           695    1.


样本数据
df @RonakShah 我向您道歉。我在表格中有一个输入错误，并且已经纠正了它。“获取每个字符串在position列到最近的前一个字符串“C”的距离，例如下面7、8和17的变化”对不起，我真的很难理解您的预期输出以及您试图计算的内容。您是否在询问字符串距离（例如，D和A之间相隔多少个字母）？字母“C”和它有什么关系？这只是一个例子吗？还是“C”是参照基准？“7、8和17的变化”指的是什么值？我在任何地方都看不到值17。[继续]如果您能详细说明前5行的距离的计算，可能会有所帮助。您好 @MauritsEvers。很抱歉造成了困惑。例如，在第17行，position
值为692，对应字符串
“A”（非“C”）。我希望用它减去最近的前一个“C”的值，即第15行的684。所以我希望列distance
的值为692-684=8，而不是仅减去前一行所得到的2。我希望对每一个非“C”行和它最近的前一个“C”都这样做。希望这能澄清。你不能初始化df$distance吗
# Start from the plain row-to-row difference in position (0 for the first row).
dtt[, distance := c(0, diff(position))]

# Group rows by the running count of "C" rows, skipping the rows that come
# before the first "C". Inside a group the first row is the "C" itself and
# keeps the distance computed above; every other row is measured from that "C".
dtt[
  cumsum(string == "C") > 0,
  distance := {
    from_c <- position - position[1L]
    from_c[1L] <- distance[1L]
    from_c
  },
  by = cumsum(string == "C")
]

#     no string position distance
#  1:  1      B      650        0
# 2:  2      C      651        1
# 3:  3      B      659        8
# 4:  4      C      660        1
# 5:  5      C      662        2
# 6:  6      B      663        1
# 7:  7      D      668        6
# 8:  8      D      670        8
# 9:  9      C      671        1
# 10: 10      B      672        1
# 11: 11      C      673        1
# 12: 12      A      681        8
# 13: 13      C      682        1
# 14: 14      B      683        1
# 15: 15      C      684        1
# 16: 16      D      690        6
# 17: 17      A      692        8
# 18: 18      C      693        1
# 19: 19      D      694        1
# 20: 20      C      695        1

# Reproducible definition of the example table `dtt`.
# The raw dput() text of a data.table ends in
# `.internal.selfref = <pointer: ...>`, which is not valid R syntax and cannot
# be pasted back into a session, and it was never assigned to a name. The same
# 20 rows are therefore built directly with data.table::data.table().
dtt <- data.table::data.table(
  no = 1:20,
  string = c(
    "B", "C", "B", "C", "C", "B", "D", "D", "C", "B",
    "C", "A", "C", "B", "C", "D", "A", "C", "D", "C"
  ),
  position = c(
    650L, 651L, 659L, 660L, 662L, 663L, 668L, 670L, 671L, 672L,
    673L, 681L, 682L, 683L, 684L, 690L, 692L, 693L, 694L, 695L
  )
)

# Start from the plain row-to-row difference in position (0 for the first row).
dtt[, distance := c(0, diff(position))]

# Count "C" rows from the bottom up, so that each group ends with a "C" and
# contains the rows leading up to it; rows after the last "C" get g == 0.
dtt[, g := rev(cumsum(rev(string == "C")))]

# Inside a group the last row is the "C" itself and keeps the distance computed
# above; every other row is measured (as an absolute value) to that next "C".
dtt[
  g > 0,
  distance := {
    to_next_c <- abs(position - position[.N])
    to_next_c[.N] <- distance[.N]
    to_next_c
  },
  by = g
]

# Drop the helper column again.
dtt[, g := NULL]
#     no string position distance
#  1:  1      B      650        1
#  2:  2      C      651        1
#  3:  3      B      659        1
#  4:  4      C      660        1
#  5:  5      C      662        2
#  6:  6      B      663        8
#  7:  7      D      668        3
#  8:  8      D      670        1
#  9:  9      C      671        1
# 10: 10      B      672        1
# 11: 11      C      673        1
# 12: 12      A      681        1
# 13: 13      C      682        1
# 14: 14      B      683        1
# 15: 15      C      684        1
# 16: 16      D      690        3
# 17: 17      A      692        1
# 18: 18      C      693        1
# 19: 19      D      694        1
# 20: 20      C      695        1

library(tidyverse)
# Distance of every row to the nearest previous "C" row.
# Rows that are themselves "C", and rows that come before the first "C", fall
# back to the plain difference to the previous row (0 for the very first row).
df %>%
  mutate(
    # Row-to-row difference, computed before grouping so that it can look
    # across group boundaries
    .step_prev = c(0, diff(position)),
    # Running count of "C" rows; 0 means no "C" has been seen yet
    nC = cumsum(string == "C")
  ) %>%
  group_by(nC) %>%
  mutate(
    dist = if_else(
      nC == 0 | row_number() == 1,
      .step_prev,
      # The first row of each group with nC > 0 is the "C" that opens it
      as.numeric(position - first(position))
    )
  ) %>%
  ungroup() %>%
  select(-nC, -.step_prev)
## A tibble: 20 x 4
#      no string position  dist
#   <int> <fct>     <int> <dbl>
# 1     1 B           650    0.
# 2     2 C           651    1.
# 3     3 B           659    8.
# 4     4 C           660    1.
# 5     5 C           662    2.
# 6     6 B           663    1.
# 7     7 D           668    6.
# 8     8 D           670    8.
# 9     9 C           671    1.
#10    10 B           672    1.
#11    11 C           673    1.
#12    12 A           681    8.
#13    13 C           682    1.
#14    14 B           683    1.
#15    15 C           684    1.
#16    16 D           690    6.
#17    17 A           692    8.
#18    18 C           693    1.
#19    19 D           694    1.
#20    20 C           695    1.

# Sample data: 20 rows with a running number (`no`), a label (`string`, one of
# "A", "B", "C", "D") and an increasing integer `position`.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
df <- read.table(text =
"no  string  position
1   B   650
2   C   651
3   B   659
4   C   660
5   C   662
6   B   663
7   D   668
8   D   670
9   C   671
10  B   672
11  C   673
12  A   681
13  C   682
14  B   683
15  C   684
16  D   690
17  A   692
18  C   693
19  D   694
20  C   695", header = TRUE)