R 从A列中选择唯一值，同时从B列中选择唯一值_R_Dataframe

R 从A列中选择唯一值，同时从B列中选择唯一值

r dataframe

R 从A列中选择唯一值，同时从B列中选择唯一值,r,dataframe,R,Dataframe,我在R中有一个data.frame： user hobby user_profile_url 1 reading "https://...user1" 1 dancing "https://...user1" 2 dancing "https://...user2" 2 gaming "https://...user2" 3 gaming "https://...user3" 4

我在R中有一个data.frame：

user    hobby      user_profile_url
1       reading    "https://...user1"
1       dancing    "https://...user1"
2       dancing    "https://...user2"
2       gaming     "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...

我试图为每个独特的爱好选择一个独特的用户（只要可能）

期望输出：

user    hobby      user_profile_url
1       reading    "https://...user1"
2       dancing    "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...

有人知道怎么做吗？谢谢

编辑：

dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)

somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}

somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1

我想我有一些有用的东西

uniqueUserPerHobby <- function(df){
  vec1Arr <- c()
  vec2Arr <- c()
  used_id <- c()
  for (a_label in unique(df$hobby)){
    if (nrow(df[df$hobby==a_label,])==1) {
      used_id <- c(used_id, df[df$hobby==a_label,]$user )
      vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user)
      vec2Arr <- c(vec2Arr, a_label)
    } else {
      i<-1
      df_multy <- df[df$hobby==a_label,]
      for (a_user in df_multy$user) {
        i<-i+1
        if (nrow(df[df$user==a_user,])==1) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        } else if (i == length(df$user)) {
          if (! a_user %in% used_id){
            used_id <- c(used_id, a_user)
          }
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
        } else if (! a_user %in% used_id) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        }
      }
      if (!a_label %in% vec2Arr){
        if (!df[df$hobby==a_label,]$user[1] %in% used_id){
          used_id <- c(used_id, df[df$hobby==a_label,]$user[1])
        }
        vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user[1])
        vec2Arr <- c(vec2Arr, a_label)
      }
    }
  }
  new.df <- dplyr::left_join(data.frame(user=vec1Arr, hobby=vec2Arr, stringsAsFactors = F), df, by=c("user", "hobby"))
  return(new.df)
}

uniqueUserPerHobby好的，我试着让它尽可能的通用，但是我只在你的数据上测试过它，它会给你一些警告，但不会影响输出（基于当前数据）。我不能保证它会一直工作，但它应该让你开始
从我保存为TIBLE的数据开始：
df <- tibble(user=c(1,1,2,2,3,4,4), hobby=c("reading","dancing","dancing","gaming","gaming","cooking","singing"),user_profile=c("user1","user1","user2","user2","user3","user4","user4"))

我重新格式化原始数据框，以便hobby
为列：
library(tidyverse)
new <- df %>% 
         mutate(dummy=1) %>%
         complete(user,hobby) %>%
         group_by(user) %>%
         spread(.,hobby,dummy) %>%
         filter(!is.na(user_profile))

   user user_profile cooking dancing gaming reading singing
  <dbl>        <chr>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
1     1        user1      NA       1     NA       1      NA
2     2        user2      NA       1      1      NA      NA
3     3        user3      NA      NA      1      NA      NA
4     4        user4       1      NA     NA      NA       1

输出
   user     key user_profile
  <dbl>   <chr>        <chr>
1     1 reading        user1
2     2 dancing        user2
3     3  gaming        user3
4     4 cooking        user4
5     4 singing        user4

用户密钥用户配置文件
1读取用户1
2跳舞用户2
3游戏用户3
4烹饪用户4
5 4歌唱用户4
woooweee不容易。我用base R做了这个，还做了一个函数。试一试：
您的数据：
dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)

somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    

somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1

如果您有任何问题，请告诉我
 为什么User1只获得一个爱好，而User4在输出中获得两个爱好？你的意思是，如果没有其他用户从事这项爱好，请重复相同的用户？尝试df1[！duplicated（df1[1:2]），]
@akrun似乎他想要的输出稍微复杂一点。他希望每个爱好只有一个用户——以最大化用户数量的方式……例如，uniq.dat@EvanFriedland：我同意这更复杂user3
getsgaming
，因为user1
获得了读取，这使得user2
具有舞蹈
。非常复杂…我删除了我的原始注释，因为我看到您已经理解了OP想要的输出内容…谢谢，我尝试了它，但出现了以下错误：lappy中的错误（j，函数（x）do.call（rbind，x））：找不到对象“j”
大编辑，应该适用于不同的情况，请参见我的示例这是grate！它适用于任何2列的情况。经典谢谢！：）我尝试了这个方法，但在tibble（user=c（1,1,2,2,3,4,4），hobby=c（“读取”）：在“frame_data（）”调用中未检测到列名时，出现了一个错误错误您可能需要在开始时加载库（tibble）。
somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    

somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1