Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/81.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 从A列中选择唯一值,同时从B列中选择唯一值_R_Dataframe - Fatal编程技术网

R 从A列中选择唯一值,同时从B列中选择唯一值

R 从A列中选择唯一值,同时从B列中选择唯一值,r,dataframe,R,Dataframe,我在R中有一个data.frame: user hobby user_profile_url 1 reading "https://...user1" 1 dancing "https://...user1" 2 dancing "https://...user2" 2 gaming "https://...user2" 3 gaming "https://...user3" 4

我在R中有一个data.frame:

user    hobby      user_profile_url
1       reading    "https://...user1"
1       dancing    "https://...user1"
2       dancing    "https://...user2"
2       gaming     "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...
我试图为每个独特的爱好选择一个独特的用户(只要可能)

期望输出:

user    hobby      user_profile_url
1       reading    "https://...user1"
2       dancing    "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...
有人知道怎么做吗?谢谢

编辑:

dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)
somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    
somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1
我想我有一些有用的东西

uniqueUserPerHobby <- function(df){
  vec1Arr <- c()
  vec2Arr <- c()
  used_id <- c()
  for (a_label in unique(df$hobby)){
    if (nrow(df[df$hobby==a_label,])==1) {
      used_id <- c(used_id, df[df$hobby==a_label,]$user )
      vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user)
      vec2Arr <- c(vec2Arr, a_label)
    } else {
      i<-1
      df_multy <- df[df$hobby==a_label,]
      for (a_user in df_multy$user) {
        i<-i+1
        if (nrow(df[df$user==a_user,])==1) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        } else if (i == length(df$user)) {
          if (! a_user %in% used_id){
            used_id <- c(used_id, a_user)
          }
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
        } else if (! a_user %in% used_id) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        }
      }
      if (!a_label %in% vec2Arr){
        if (!df[df$hobby==a_label,]$user[1] %in% used_id){
          used_id <- c(used_id, df[df$hobby==a_label,]$user[1])
        }
        vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user[1])
        vec2Arr <- c(vec2Arr, a_label)
      }
    }
  }
  new.df <- dplyr::left_join(data.frame(user=vec1Arr, hobby=vec2Arr, stringsAsFactors = F), df, by=c("user", "hobby"))
  return(new.df)
}

uniqueUserPerHobby好的,我试着让它尽可能的通用,但是我只在你的数据上测试过它,它会给你一些警告,但不会影响输出(基于当前数据)。我不能保证它会一直工作,但它应该让你开始

从我保存为TIBLE的数据开始:

df <- tibble(user=c(1,1,2,2,3,4,4), hobby=c("reading","dancing","dancing","gaming","gaming","cooking","singing"),user_profile=c("user1","user1","user2","user2","user3","user4","user4"))
我重新格式化原始数据框,以便
hobby
为列:

library(tidyverse)
new <- df %>% 
         mutate(dummy=1) %>%
         complete(user,hobby) %>%
         group_by(user) %>%
         spread(.,hobby,dummy) %>%
         filter(!is.na(user_profile))

   user user_profile cooking dancing gaming reading singing
  <dbl>        <chr>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
1     1        user1      NA       1     NA       1      NA
2     2        user2      NA       1      1      NA      NA
3     3        user3      NA      NA      1      NA      NA
4     4        user4       1      NA     NA      NA       1
输出

   user     key user_profile
  <dbl>   <chr>        <chr>
1     1 reading        user1
2     2 dancing        user2
3     3  gaming        user3
4     4 cooking        user4
5     4 singing        user4
用户密钥用户配置文件
1读取用户1
2跳舞用户2
3游戏用户3
4烹饪用户4
5 4歌唱用户4

woooweee不容易。我用base R做了这个,还做了一个函数。试一试:

您的数据:

dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)
somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    
somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1

如果您有任何问题,请告诉我

为什么User1只获得一个爱好,而User4在输出中获得两个爱好?你的意思是,如果没有其他用户从事这项爱好,请重复相同的用户?尝试
df1[!duplicated(df1[1:2]),]
@akrun似乎他想要的输出稍微复杂一点。他希望每个爱好只有一个用户——以最大化用户数量的方式……例如,
uniq.dat@EvanFriedland:我同意这更复杂
user3
gets
gaming
,因为
user1
获得了读取,这使得
user2
具有
舞蹈
。非常复杂…我删除了我的原始注释,因为我看到您已经理解了OP想要的输出内容…谢谢,我尝试了它,但出现了以下错误:
lappy中的错误(j,函数(x)do.call(rbind,x)):找不到对象“j”
大编辑,应该适用于不同的情况,请参见我的示例这是grate!它适用于任何2列的情况。经典谢谢!:)我尝试了这个方法,但在tibble(user=c(1,1,2,2,3,4,4),hobby=c(“读取”):在“frame_data()”调用中未检测到列名时,出现了一个错误
错误
您可能需要在开始时加载
库(tibble)
somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    
somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1