R 找出两对之间最常见的组合

R 找出两对之间最常见的组合(标签:r、list、similarity、recommendation-engine)。我有一份活动及参加这些活动的客人的名单,格式如下所示,但实际文件要大得多:


event       guests
birthday    John Doe
birthday    Jane Doe
birthday    Mark White
wedding     John Doe
wedding     Jane Doe
wedding     Matthew Green
bar mitzvah Janet Black
bar mitzvah John Doe
bar mitzvah Jane Doe
bar mitzvah William Hill
retirement  Janet Black
retirement  Matthew Green
John Doe
Jane Doe




# Distinct guest names present in the data
nameAll <- unique(df[["guests"]])
# Number of distinct guests
N <- length(nameAll)

# Similarity between two guests, based on the events each one attended.
#
# Reads the data frame `df` (columns `event` and `guests`) from the calling
# environment rather than taking it as an argument.
#
# Arguments:
#   nameA, nameB  guest names to compare
#   type          "intersect" -> number of events both guests attended
#                 "JC"        -> Jaccard index: shared events / all events
#                                attended by either guest
# Returns a one-row data.frame with columns type, value, nameA, nameB.
# Stops with an error for any other `type`.
getSimilarity <- function(nameA, nameB, type = "intersect") {
    # Subset events for name A
    eventA <- subset(df, guests == nameA)$event
    # Subset events for name B
    eventB <- subset(df, guests == nameB)$event
    # Number of events the two guests have in common
    shared <- length(intersect(eventA, eventB))
    if (type == "intersect") {
        # Find intersect length between events
        res <- shared
    } else if (type == "JC") {
        # Find Jaccard index between events
        res <- shared / length(union(eventA, eventB))
    } else {
        # Fail loudly instead of hitting "object 'res' not found" below
        stop("Unknown similarity type: ", type, call. = FALSE)
    }
    # Return result
    data.frame(type, value = res, nameA, nameB)
}

# Iterate over all possible combinations
# Using double loop for simpler representation
result <- list()
# seq_len() gives an empty outer loop when there are fewer than two names;
# 1:(N-1) would instead count down (1, 0) and index past the end of nameAll.
for (i in seq_len(max(N - 1, 0))) {
    for (j in (i + 1):N) {
        result[[length(result) + 1]] <- getSimilarity(nameAll[i], nameAll[j])
    }
}
# Transform result to data.frame and order by similarity
result <- do.call(rbind, result)
# Show the top-ranked pairs (head() returns the first six rows by default)
head(result[with(result, order(-value)), ])




# Example data: one row per (event, guest) attendance record
df <- data.frame(
  event = rep(c("birthday", "wedding", "bar mitzvah", "retirement"),
              times = c(3, 3, 4, 2)),
  guests = c(
    "John Doe", "Jane Doe", "Mark White",
    "John Doe", "Jane Doe", "Matthew Green",
    "Janet Black", "John Doe", "Jane Doe", "William Hill",
    "Janet Black", "Matthew Green"
  )
)

# Guest-by-event incidence table: rows are guests, columns are events
M <- table(df$guests, df$event)
# Multiplying the incidence table by its transpose gives, for every pair of
# guests, the number of events they both attended (diagonal = own event count)
admat <- M %*% t(M)

  ##################Jane Doe Janet Black John Doe Mark White Matthew Green William Hill
  #Jane Doe             3           1        3          1             1            1
  #Janet Black          1           2        1          0             1            1
  #John Doe             3           1        3          1             1            1
  #Mark White           1           0        1          1             0            0
  #Matthew Green        1           1        1          0             2            0
  #William Hill         1           1        1          0             0            1






# Build a guest-by-event indicator table: one row per guest (guest names
# become the row names) and one 0/1 column per event.
# NOTE(review): `dat` is not defined in this file; presumably it is the same
# event/guests data frame that other snippets call `df` - confirm.
# NOTE(review): %>%, mutate(), spread() and column_to_rownames() are not base
# R (they look like magrittr/dplyr/tidyr/tibble); no library() call is visible.
# The pipeline must end on the last step: a trailing %>% would swallow the
# next statement into the chain.
dat_m <- dat %>%
  mutate(value = 1) %>%
  spread(event, value, fill = 0) %>%
  column_to_rownames(var = "guests")

#               bar mitzvah birthday retirement wedding
# Jane Doe                1        1          0       1
# Janet Black             1        0          1       0
# John Doe                1        1          0       1
# Mark White              0        1          0       0
# Matthew Green           0        0          1       1
# William Hill            1        0          0       0

# Pairwise dissimilarity between guests on presence/absence data; defined
# here so that hclust() below has its input.
# NOTE(review): vegdist() appears to be vegan::vegdist - confirm.
dat_dist <- vegdist(dat_m, binary = TRUE)

# Hierarchical clustering of guests from the dissimilarities
hc <- hclust(dat_dist)


# Example data: one row per (event, guest) attendance record
df <- data.frame(
  event = rep(c("birthday", "wedding", "bar mitzvah", "retirement"),
              times = c(3, 3, 4, 2)),
  guests = c(
    "John Doe", "Jane Doe", "Mark White",
    "John Doe", "Jane Doe", "Matthew Green",
    "Janet Black", "John Doe", "Jane Doe", "William Hill",
    "Janet Black", "Matthew Green"
  )
)

# Guest-by-event incidence table: rows are guests, columns are events
M <- table(df$guests, df$event)
# Multiplying the incidence table by its transpose gives, for every pair of
# guests, the number of events they both attended (diagonal = own event count)
admat <- M %*% t(M)

  ##################Jane Doe Janet Black John Doe Mark White Matthew Green William Hill
  #Jane Doe             3           1        3          1             1            1
  #Janet Black          1           2        1          0             1            1
  #John Doe             3           1        3          1             1            1
  #Mark White           1           0        1          1             0            0
  #Matthew Green        1           1        1          0             2            0
  #William Hill         1           1        1          0             0            1
# Zero the diagonal (a guest paired with themselves) and the upper triangle
# (mirror image of the lower one) so every pair is counted exactly once
admat[upper.tri(admat, diag = TRUE)] <- 0
# Reshape the matrix to long form: one row per (Var1, Var2, value)
# NOTE(review): melt() is not base R; the Var1/Var2/value output below
# suggests reshape2::melt - confirm which package is attached.
dfmatches <- unique(melt(admat))
# Discard pairs that never attended the same event
dfmatches <- dfmatches[with(dfmatches, value != 0), ]
# Most frequent pairs first
dfmatches <- dfmatches[order(-dfmatches[["value"]]), ]

#            Var1        Var2 value
#3       John Doe    Jane Doe     3
#2    Janet Black    Jane Doe     1
#4     Mark White    Jane Doe     1
#5  Matthew Green    Jane Doe     1
#6   William Hill    Jane Doe     1
#9       John Doe Janet Black     1
#11 Matthew Green Janet Black     1
#12  William Hill Janet Black     1
#16    Mark White    John Doe     1
#17 Matthew Green    John Doe     1
#18  William Hill    John Doe     1

# Build a guest-by-event indicator table: one row per guest (guest names
# become the row names) and one 0/1 column per event.
# NOTE(review): `dat` is not defined in this file; presumably it is the same
# event/guests data frame that other snippets call `df` - confirm.
# NOTE(review): %>%, mutate(), spread() and column_to_rownames() are not base
# R (they look like magrittr/dplyr/tidyr/tibble); no library() call is visible.
# The pipeline must end on the last step: a trailing %>% would fuse it with
# the dat_dist assignment below.
dat_m <- dat %>%
  mutate(value = 1) %>%
  spread(event, value, fill = 0) %>%
  column_to_rownames(var = "guests")

#               bar mitzvah birthday retirement wedding
# Jane Doe                1        1          0       1
# Janet Black             1        0          1       0
# John Doe                1        1          0       1
# Mark White              0        1          0       0
# Matthew Green           0        0          1       1
# William Hill            1        0          0       0

# Pairwise dissimilarity between guests on presence/absence data
# (0 = identical attendance, 1 = no event in common, per the output below).
# NOTE(review): vegdist() appears to be vegan::vegdist, whose default method
# is Bray-Curtis - confirm against the package documentation.
dat_dist <- vegdist(dat_m, binary = TRUE)

#                Jane Doe Janet Black  John Doe Mark White Matthew Green
# Janet Black   0.6000000                                               
# John Doe      0.0000000   0.6000000                                   
# Mark White    0.5000000   1.0000000 0.5000000                         
# Matthew Green 0.6000000   0.5000000 0.6000000  1.0000000              
# William Hill  0.5000000   0.3333333 0.5000000  1.0000000     1.0000000

# Hierarchical clustering of guests from the dissimilarities
hc <- hclust(dat_dist)
# Jane Doe   Janet Black      John Doe    Mark White Matthew Green  William Hill 
#        3             2             3             1             2             1