R 从一个表中选取一组,然后使用其他表计算欧氏距离

R 从一个表中选取一组,然后使用其他表计算欧氏距离,r,grouping,euclidean-distance,R,Grouping,Euclidean Distance,我想计算特定轮廓之间的欧几里德距离。最大的问题是如何将特定的行集合在一起以计算它们之间的距离。在第一个表中,我存储了不同表中的行名称的组,这些行应用于距离计算。 第一个表如下所示: Activity Person ValueOfComp 1 Football Mark_1_OUT 4 2 Football Greg_1_OUT 4 3 Football Mark_1_INT 4

我想计算特定轮廓之间的欧几里德距离。最大的问题是如何将特定的行集合在一起以计算它们之间的距离。在第一个表中,我存储了不同表中的行名称的组,这些行应用于距离计算。 第一个表如下所示:

        Activity     Person ValueOfComp
    1   Football Mark_1_OUT           4
    2   Football Greg_1_OUT           4
    3   Football Mark_1_INT           4
    4   Football Greg_1_INT           4
    5 Volleyball  Tim_1_INT          6
    6 Volleyball  Tim_1_OUT          6
    7 Volleyball  Tom_1_INT          6
    8 Volleyball  Tom_1_OUT          6
    9 Volleyball  Sim_1_INT          6
    10 Volleyball  Sim_1_OUT          6
    11 Handball  Karl_1_OUT          8
    12 Handball  Karl_1_INT          8
    13 Handball  Matt_1_OUT          8
    14 Handball  Matt_1_INT          8
    15 Handball  Jake_1_INT          8
    16 Handball  Jake_1_OUT          8
    17 Handball  Sonya_1_OUT          8
    18 Handball  Sonya_1_INT          8
有两个表存储了上述变量的轮廓,应用于欧氏距离计算

表1表示以
INT
结尾的变量:

                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.12345123 0.1160406 0.2847189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.68940000 0.2087267 0.2469333 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.123415551 0.55321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.11234120 0.1755712 0.2344607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.324532121 0.123412666 0.0000000 0.0000000
Karl_1 1 0.123256312 0.34312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.03978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.12423561 0.1775713  1 0.01186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.009915695 0.13451256 0.2211453  1 0.01186404 0.0000000 0.0000000 0.0000000
Jake_1 0.066915225 0.20623498 0.53215713  1 0.01186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.21341411 0.5323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.4311223 0.22343212  0 0.00000000 0.0000000 0.0000000 0.0000000
表2表示以
OUT
结尾的变量:

                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.33345123 0.2530406 0.2147189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.48240000 0.22345726 0.2122233 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.623415551 0.35321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.4122120 0.3755712 0.2324607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.33352121 0.223412666 0.0000000 0.0000000
Karl_1 1 0.553256312 0.24312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.11978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.52423561 0.6775713  1 0.31186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.119915695 0.16451256 0.2433253  1 0.09186404 0.0000000 0.0000000 0.0000000
Jake_1 0.264915225 0.33123498 0.39215713  1 0.11186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.33341411 0.4323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.5511223 0.44343212  0 0.00000000 0.0000000 0.0000000 0.0000000
因此,基于第一个表
Football
Volleyball
etc
中的组,我想从这个组中获取所有的轮廓,并计算它们之间的欧几里德距离。配置文件可在其他表中找到。应计算该组中所有纵断面之间的距离,即使这些纵断面取自同一表格

如果将结果存储为一个单独的表,包含对、活动和计算的距离,那就太好了

我的真实数据由几千行组成,但我也有CPU能力来运行循环

有人能帮我回答吗

编辑:可复制示例:

> dput(repr_data)
structure(list(Activity = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Football", 
"Handball", "Volleyball"), class = "factor"), Person = structure(c(8L, 
7L, 2L, 1L, 15L, 16L, 17L, 18L, 11L, 12L, 6L, 5L, 10L, 9L, 3L, 
4L, 14L, 13L), .Label = c("Greg_1_INT", "Greg_1_OUT", "Jake_1_INT", 
"Jake_1_OUT", "Karl_1_INT", "Karl_1_OUT", "Mark_1_INT", "Mark_1_OUT", 
"Matt_1_INT", "Matt_1_OUT", "Sim_1_INT", "Sim_1_OUT", "Sonya_1_INT", 
"Sonya_1_OUT", "Tim_1_INT", "Tim_1_OUT", "Tom_1_INT", "Tom_1_OUT"
), class = "factor"), ValueOfComp = c(4, 4, 4, 4, 6, 6, 6, 6, 
6, 6, 8, 8, 8, 8, 8, 8, 8, 8)), .Names = c("Activity", "Person", 
"ValueOfComp"), row.names = c(NA, -18L), class = "data.frame")
表1:

> dput(INT_tbl)
structure(c(0, 0, 0, 0, 0, 1, 0.22123412423, 0.0123915695, 0.0126915225, 
0.4312, 1, 0, 0, 0, 0, 0, 0.323256312, 0.32423561, 0.44451256, 
0.33623498, 0.21341411, 0.321223, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 1, 1, 
1, 0, 1, 0, 1, 1, 1, 1, 0, 0.55345123, 0.689875, 0.423415551, 
1, 0.444532121, 0, 0.01186404, 0.22132204, 0.21186404, 0, 0, 
0.234126, 0.33347267, 0.35321234, 0.4123412, 0.333412666, 0, 
0, 0, 0.3123, 0, 0, 0.1147189, 0.12343, 0.3155, 0.2755712, 0.123, 
0, 0, 0, 0, 0, 0, 0.1236836, 0.0058933, 0, 0.1344607, 0, 0, 0, 
0, 0, 0, 0), .Dim = c(11L, 8L), .Dimnames = list(c("Mark_1", 
"Greg_1", "Tim_1", "Tom_1", "Sim_1", "Karl_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))
表2:

> dput(OUT_tbl)
structure(c(0.236915225, 0, 0, 0, 0, 0, 1, 1, 0.22123412423, 
0.0123915695, 0.0126915225, 0.4312, 1, 0.26666498, 0, 0, 0, 0, 
0, 0.323256312, 0.52356312, 0.32423561, 0.44451256, 0.33623498, 
0.21341411, 0.321223, 0.123415713, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.12342332, 0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0.2235404, 0.55345123, 
0.689875, 0.423415551, 1, 0.444532121, 0, 0, 0.01186404, 0.22132204, 
0.21186404, 0, 0, 0.123, 0.234126, 0.33347267, 0.35321234, 0.4123412, 
0.333412666, 0, 0, 0, 0, 0.3123, 0, 0, 0, 0.1147189, 0.12343, 
0.3155, 0.2755712, 0.123, 0, 0, 0, 0, 0, 0, 0, 0, 0.1236836, 
0.0058933, 0, 0.1344607, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(13L, 
8L), .Dimnames = list(c("Karsten_1", "Mark_1", "Greg_1", "Tim_1", 
"Tom_1", "Sim_1", "Karl_1", "Johan_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))
期望输出:

        Activity     Person 1   Person 2    EUC.DIST
    1   Football Mark_1_OUT    Greg_1_OUT      XX
    2   Football Mark_1_OUT    Mark_1_INT      XX
    3   Football Mark_1_OUT    Greg_1_INT      XX
    4   Football Greg_1_INT    Greg_1_OUT      XX
    5   Football Greg_1_INT    Mark_1_INT      XX
    6   Football Greg_1_OUT    Mark_1_INT      XX
    ........
    and so on with other combinations withing rest of the groups.

好吧,这可能会有点混乱,但请容忍我

首先,我们将
INT\u tbl
取出
并对它们做一些工作。我们将它们设为数据帧,将行名称添加为列,并在每个条目中添加后缀。这样做是为了将Out和Int表
rbind
合并到完整的数据帧中,即

library(dplyr)
library(tidyr)

out <- setNames(data.frame(paste0(rownames(OUT_tbl), '_OUT'), OUT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(OUT_tbl)))

int <- setNames(data.frame(paste0(rownames(INT_tbl), '_INT'), INT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(INT_tbl)))

full_d <- rbind(out, int)

#which gives,
rbind(head(full_d, 3), tail(full_d, 3))
#          Person         10        34        59 84       110       134       165       199
#1  Karsten_1_OUT 0.23691523 0.2666650 0.1234157  1 0.2235404 0.1230000 0.0000000 0.0000000
#2     Mark_1_OUT 0.00000000 0.0000000 0.2320000  1 0.5534512 0.2341260 0.1147189 0.1236836
#3     Greg_1_OUT 0.00000000 0.0000000 0.5719200  1 0.6898750 0.3334727 0.1234300 0.0058933
#22    Jake_1_INT 0.01269152 0.3362350 0.6321571  1 0.2118640 0.3123000 0.0000000 0.0000000
#23   Sonya_1_INT 0.43120000 0.2134141 0.4423123  1 0.0000000 0.0000000 0.0000000 0.0000000
#24 Monique_1_INT 1.00000000 0.3212230 0.1322120  0 0.0000000 0.0000000 0.0000000 0.0000000
如果要从最终数据帧中排除
NA
值,只需

final_d <- final_d[!is.na(final_d$EUC.DIST),]
final\u d请检查此项

#Convert to data.frame and cleanup
INT_tbl = as.data.frame(INT_tbl)
OUT_tbl = as.data.frame(OUT_tbl)
INT_tbl$Remarks = "INT"
OUT_tbl$Remarks = "OUT"
INT_tbl$Names = rownames(INT_tbl)
OUT_tbl$Names = rownames(OUT_tbl)
rownames(INT_tbl) = NULL
rownames(OUT_tbl) = NULL

# Initiate empty lists
Name_Pair1 = list()
Name_Pair2 = list()
EDistance = list()

m = 1

#Compute distance between all names in INT and OUT and add to lists
while (m < nrow(INT_tbl)*nrow(OUT_tbl)){
    for (i in 1:nrow(INT_tbl)){
        for (j in 1:nrow(OUT_tbl)){
            Name_Pair1[m] = paste(INT_tbl$Names[i],"_INT-",OUT_tbl$Names[j],"_OUT",sep="")
            Name_Pair2[m] = paste(OUT_tbl$Names[j],"_OUT-",INT_tbl$Names[i],"_INT",sep="")
            EDistance[m] = sqrt((INT_tbl$`10`[i]-OUT_tbl$`10`[i])^2+
                                    (INT_tbl$`34`[i]-OUT_tbl$`34`[i])^2+
                                    (INT_tbl$`59`[i]-OUT_tbl$`59`[i])^2+
                                    (INT_tbl$`84`[i]-OUT_tbl$`84`[i])^2+
                                    (INT_tbl$`110`[i]-OUT_tbl$`110`[i])^2+
                                    (INT_tbl$`134`[i]-OUT_tbl$`134`[i])^2+
                                    (INT_tbl$`165`[i]-OUT_tbl$`165`[i])^2+
                                    (INT_tbl$`199`[i]-OUT_tbl$`199`[i])^2)
            m = m+1
        }
    }
}

#COmbine lists into data.frame and cleanup 
DDistance = data.frame(cbind(Name_Pair1,Name_Pair2,EDistance))
DDistance$Name_Pair1 = as.character(DDistance$Name_Pair1)
DDistance$Name_Pair2 = as.character(DDistance$Name_Pair2)
DDistance$EDistance = as.numeric(DDistance$EDistance)

#Initiate OUTPUT data.frame 
Out.put = data.frame(V1 = NA,V2=NA,Name_Pair=NA,EDistance=NA,Activity=NA)

#Obtain list of unique Activity 
Activity = as.character(unique(repr_data$Activity))

for (i in 1:length(Activity)){
    df = repr_data[repr_data$Activity == Activity[i],] #Subset for unique activity
    x = as.data.frame(combn(df$Person,2,simplify = FALSE)) #Get all combination of names in the subset
    x= t(x)
    rownames(x) = NULL
    x= as.data.frame(x)

    #Lookup distance for each row based on Name1(V1) and Name2(V2)
    for (j in 1:nrow(x)){
        x$Name_Pair[j] = paste(x$V1[j],x$V2[j],sep="-")
        for (k in 1:nrow(DDistance)){
            if (x$Name_Pair[j] == DDistance$Name_Pair1[k] | x$Name_Pair[j] == DDistance$Name_Pair2[k])
                x$EDistance = DDistance$EDistance[k]
            next
        }
        x$Activity = Activity[i]
    }
    Out.put = rbind(Out.put,x) #Append to Out.put
}

Out.put = Out.put[2:nrow(Out.put),] #Cleanup
#转换为data.frame并清除
INT_tbl=as.data.frame(INT_tbl)
OUT\u tbl=as.data.frame(OUT\u tbl)
INT\u tbl$备注=“INT”
OUT\u tbl$备注=“OUT”
INT_tbl$Names=行名(INT_tbl)
OUT\u tbl$Names=行名(OUT\u tbl)
行名称(INT_tbl)=空
行名称(OUT_tbl)=空
#启动空列表
Name_Pair1=列表()
Name_Pair2=list()
eInstance=list()
m=1
#计算INT和OUT中所有名称之间的距离,并添加到列表中
而(m
这里有一个使用
dplyr的替代方法。我认为在相应地更新行名称后,将
INT\u tbl
OUT\u tbl
组合起来效果更好(可能更容易理解):

rownames(INT_tbl) <- paste0(rownames(INT_tbl), "_INT")
rownames(OUT_tbl) <- paste0(rownames(OUT_tbl), "_OUT")
BOTH_tbl <- rbind(INT_tbl, OUT_tbl)
解决方案一 如果您需要通过
NA
距离保持失踪人员可见:

repr_data %>%
  group_by(Activity) %>%
  do({
    people <- as.character(unique(.$Person))
    peoplei <- match(people, allpeople)
    d <- dist(BOTH_tbl[peoplei,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 49 × 4
#    Activity    Person1    Person2      Dist
#      <fctr>      <chr>      <chr>     <dbl>
# 1  Football Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football Greg_1_INT Mark_1_INT 0.3974635
# 6  Football Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball Matt_1_OUT Karl_1_OUT        NA
# 9  Handball Matt_1_INT Karl_1_OUT        NA
# 10 Handball Jake_1_INT Karl_1_OUT 1.4896801
# # ... with 39 more rows
repr\u数据%>%
分组单位(活动)%>%
做({

人们请让你的例子重现。同时添加预期输出更新与你要求的一切。嗨,你能让我知道我的解决方案是否有效吗?谢谢,但有一些个人问题。它有效。谢谢你的解决方案。我会检查其余的。
rownames(INT_tbl) <- paste0(rownames(INT_tbl), "_INT")
rownames(OUT_tbl) <- paste0(rownames(OUT_tbl), "_OUT")
BOTH_tbl <- rbind(INT_tbl, OUT_tbl)
allpeople <- rownames(BOTH_tbl)
library(dplyr)
repr_data %>%
  group_by(Activity) %>%
  do({
    people <- as.character(unique(.$Person))
    peoplei <- match(people, allpeople)
    d <- dist(BOTH_tbl[peoplei,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 49 × 4
#    Activity    Person1    Person2      Dist
#      <fctr>      <chr>      <chr>     <dbl>
# 1  Football Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football Greg_1_INT Mark_1_INT 0.3974635
# 6  Football Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball Matt_1_OUT Karl_1_OUT        NA
# 9  Handball Matt_1_INT Karl_1_OUT        NA
# 10 Handball Jake_1_INT Karl_1_OUT 1.4896801
# # ... with 39 more rows
repr_data %>%
  group_by(Activity) %>%
  do({
    people <- intersect(as.character(unique(.$Person)), allpeople)
    d <- dist(BOTH_tbl[people,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 36 × 4
#    Activity     Person1    Person2      Dist
#      <fctr>       <chr>      <chr>     <dbl>
# 1  Football  Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football  Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football  Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football  Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football  Greg_1_INT Mark_1_INT 0.3974635
# 6  Football  Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball  Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball  Jake_1_INT Karl_1_OUT 1.4896801
# 9  Handball  Jake_1_OUT Karl_1_OUT 1.4896801
# 10 Handball Sonya_1_OUT Karl_1_OUT 1.1628794
# # ... with 26 more rows