R 从A列中选择唯一值,同时从B列中选择唯一值
标签:r, dataframe
我在 R 中有一个 data.frame:
user hobby user_profile_url
1 reading "https://...user1"
1 dancing "https://...user1"
2 dancing "https://...user2"
2 gaming "https://...user2"
3 gaming "https://...user3"
4 cooking "https://...user4"
4 singing "https://...user4"
...
我试图为每个独特的爱好选择一个独特的用户(只要可能)
期望输出:
user hobby user_profile_url
1 reading "https://...user1"
2 dancing "https://...user2"
3 gaming "https://...user3"
4 cooking "https://...user4"
4 singing "https://...user4"
...
有人知道怎么做吗?谢谢
编辑:
# Example data: one row per (user, hobby) pair, plus the user's profile URL.
# `stringsAsFactors = FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
dat <- data.frame(
  user = c(1, 1, 2, 2, 3, 4, 4),
  hobby = c(
    "reading", "dancing", "dancing", "gaming", "gaming", "cooking", "singing"
  ),
  user_profile_url = c(
    "https://...user1", "https://...user1", "https://...user2",
    "https://...user2", "https://...user3", "https://...user4",
    "https://...user4"
  ),
  stringsAsFactors = FALSE
)
#' Keep one row per distinct `colB` value, using as many distinct `colA`
#' values as possible.
#'
#' Every combination of "one `colA` candidate per `colB` value" is enumerated
#' with `expand.grid()` and the first combination with the most distinct
#' `colA` values wins. The search is exhaustive, so its cost grows with the
#' product of the candidate counts; it is meant for small inputs.
#'
#' @param df A data frame (or tibble).
#' @param colA Column (name or index) whose distinct values are maximised.
#' @param colB Column (name or index) whose every value is kept exactly once.
#' @return The selected rows of `df`, sorted by `colA`, with row names reset.
#'   Rows with a missing `colA` or `colB` value are never selected; an input
#'   without usable rows yields a zero-row result.
somewhatUnique <- function(df, colA, colB) {
  stopifnot(is.data.frame(df), length(colA) == 1L, length(colB) == 1L)

  a_values <- df[[colA]]

  # Row indices per colB value. Groups come out in sorted (factor level)
  # order; unused factor levels and NA keys are dropped.
  usable <- which(!is.na(a_values))
  rows_by_b <- split(usable, df[[colB]][usable], drop = TRUE)
  if (length(rows_by_b) == 0L) {
    return(df[0L, , drop = FALSE])
  }

  # For each colB value: the first row of every distinct colA value, ordered
  # by colA so that ties are broken deterministically.
  candidate_rows <- lapply(rows_by_b, function(rows) {
    first_rows <- rows[!duplicated(a_values[rows])]
    first_rows[order(a_values[first_rows])]
  })

  # All ways to choose one candidate row per colB value (first group varies
  # fastest), as an integer matrix with one combination per row.
  grid <- as.matrix(
    expand.grid(unname(candidate_rows), KEEP.OUT.ATTRS = FALSE)
  )
  n_distinct <- vapply(
    seq_len(nrow(grid)),
    function(i) length(unique(a_values[grid[i, ]])),
    integer(1)
  )

  # which.max() returns the first maximum, i.e. the earliest best combination.
  picked <- grid[which.max(n_distinct), ]

  out <- df[picked, , drop = FALSE]
  out <- out[order(out[[colA]]), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Keep every hobby once while maximising the number of distinct users.
somewhatUnique(df = dat, colA = "user", colB = "hobby")
#   user   hobby user_profile_url
# 1    1 reading https://...user1
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    4 cooking https://...user4
# 5    4 singing https://...user4

# Keep every user once while maximising the number of distinct hobbies.
somewhatUnique(df = dat, colA = "hobby", colB = "user")
#   user   hobby user_profile_url
# 1    4 cooking https://...user4
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    1 reading https://...user1
我想我有一些有用的东西
#' Greedily assign one user to every hobby, preferring users not used yet.
#'
#' Hobbies are processed in order of first appearance. For each hobby the
#' first user of that hobby who has not been assigned to an earlier hobby is
#' chosen; if every candidate is already taken, the hobby's first user is
#' reused. This is a greedy pass, so it does not guarantee the maximum
#' possible number of distinct users.
#'
#' @param df A data frame with columns `user` and `hobby` (other columns are
#'   carried along).
#' @return A data frame with columns `user`, `hobby`, then the remaining
#'   columns of `df`: all rows of `df` matching each chosen (user, hobby)
#'   pair, in hobby order. Zero rows for empty input.
uniqueUserPerHobby <- function(df) {
  stopifnot(is.data.frame(df), all(c("user", "hobby") %in% names(df)))

  users <- df[["user"]]
  hobbies <- df[["hobby"]]
  labels <- unique(as.character(hobbies))

  # user_taken[i] is TRUE once the user in row i has been assigned a hobby.
  user_taken <- logical(nrow(df))
  picked_rows <- vector("list", length(labels))

  for (k in seq_along(labels)) {
    hobby_rows <- which(hobbies %in% labels[k])
    candidates <- users[hobby_rows]

    # First still-unassigned candidate, otherwise fall back to the first one.
    free <- which(!user_taken[hobby_rows])
    chosen <- candidates[if (length(free) > 0L) free[1L] else 1L]

    user_taken[users %in% chosen] <- TRUE
    picked_rows[[k]] <- hobby_rows[candidates %in% chosen]
  }

  # Every pick is a row of `df`, so selecting those rows directly gives the
  # same result as joining the picks back onto `df`.
  keep <- as.integer(unlist(picked_rows))
  other_cols <- setdiff(names(df), c("user", "hobby"))
  out <- as.data.frame(df[keep, c("user", "hobby", other_cols), drop = FALSE])
  rownames(out) <- NULL
  out
}
uniqueUserPerHobby
好的,我尽量把它写得通用,但我只在你的数据上测试过;它会产生一些警告,但(就当前数据而言)不影响输出。我不能保证它总能正常工作,但应该足以让你入手。
从我保存为 tibble 的数据开始:
# Example data as a tibble. The call is namespace-qualified because it appears
# before any package is attached, where a bare `tibble()` would not be found.
df <- tibble::tibble(
  user = c(1, 1, 2, 2, 3, 4, 4),
  hobby = c(
    "reading", "dancing", "dancing", "gaming", "gaming", "cooking", "singing"
  ),
  user_profile = c(
    "user1", "user1", "user2", "user2", "user3", "user4", "user4"
  )
)
我重新整理了原始数据框,使每个 hobby 成为单独的一列:
library(tidyverse)

# Reshape to one row per user with one indicator column per hobby
# (1 = the user has that hobby, NA = not). `complete()` adds the missing
# user/hobby pairs with an NA `user_profile`; the rows they produce are
# removed again by the final filter.
# `pivot_wider()` replaces the superseded `spread()`; `names_sort = TRUE`
# keeps the hobby columns in sorted order. The grouping by `user` is kept so
# that `new` is still a grouped tibble.
new <- df %>%
  mutate(dummy = 1) %>%
  complete(user, hobby) %>%
  group_by(user) %>%
  pivot_wider(
    names_from = hobby,
    values_from = dummy,
    names_sort = TRUE
  ) %>%
  filter(!is.na(user_profile))
user user_profile cooking dancing gaming reading singing
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 user1 NA 1 NA 1 NA
2 2 user2 NA 1 1 NA NA
3 3 user3 NA NA 1 NA NA
4 4 user4 1 NA NA NA 1
输出
user key user_profile
<dbl> <chr> <chr>
1 1 reading user1
2 2 dancing user2
3 3 gaming user3
4 4 cooking user4
5 4 singing user4
user key user_profile
1 1 reading user1
2 2 dancing user2
3 3 gaming user3
4 4 cooking user4
5 4 singing user4
woooweee不容易。我用base R做了这个,还做了一个函数。试一试:
您的数据:
# Example data: one row per (user, hobby) pair, plus the user's profile URL.
# `stringsAsFactors = FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
dat <- data.frame(
  user = c(1, 1, 2, 2, 3, 4, 4),
  hobby = c(
    "reading", "dancing", "dancing", "gaming", "gaming", "cooking", "singing"
  ),
  user_profile_url = c(
    "https://...user1", "https://...user1", "https://...user2",
    "https://...user2", "https://...user3", "https://...user4",
    "https://...user4"
  ),
  stringsAsFactors = FALSE
)
#' Keep one row per distinct `colB` value, using as many distinct `colA`
#' values as possible.
#'
#' Every combination of "one `colA` candidate per `colB` value" is enumerated
#' with `expand.grid()` and the first combination with the most distinct
#' `colA` values wins. The search is exhaustive, so its cost grows with the
#' product of the candidate counts; it is meant for small inputs.
#'
#' @param df A data frame (or tibble).
#' @param colA Column (name or index) whose distinct values are maximised.
#' @param colB Column (name or index) whose every value is kept exactly once.
#' @return The selected rows of `df`, sorted by `colA`, with row names reset.
#'   Rows with a missing `colA` or `colB` value are never selected; an input
#'   without usable rows yields a zero-row result.
somewhatUnique <- function(df, colA, colB) {
  stopifnot(is.data.frame(df), length(colA) == 1L, length(colB) == 1L)

  a_values <- df[[colA]]

  # Row indices per colB value. Groups come out in sorted (factor level)
  # order; unused factor levels and NA keys are dropped.
  usable <- which(!is.na(a_values))
  rows_by_b <- split(usable, df[[colB]][usable], drop = TRUE)
  if (length(rows_by_b) == 0L) {
    return(df[0L, , drop = FALSE])
  }

  # For each colB value: the first row of every distinct colA value, ordered
  # by colA so that ties are broken deterministically.
  candidate_rows <- lapply(rows_by_b, function(rows) {
    first_rows <- rows[!duplicated(a_values[rows])]
    first_rows[order(a_values[first_rows])]
  })

  # All ways to choose one candidate row per colB value (first group varies
  # fastest), as an integer matrix with one combination per row.
  grid <- as.matrix(
    expand.grid(unname(candidate_rows), KEEP.OUT.ATTRS = FALSE)
  )
  n_distinct <- vapply(
    seq_len(nrow(grid)),
    function(i) length(unique(a_values[grid[i, ]])),
    integer(1)
  )

  # which.max() returns the first maximum, i.e. the earliest best combination.
  picked <- grid[which.max(n_distinct), ]

  out <- df[picked, , drop = FALSE]
  out <- out[order(out[[colA]]), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Keep every hobby once while maximising the number of distinct users.
somewhatUnique(df = dat, colA = "user", colB = "hobby")
#   user   hobby user_profile_url
# 1    1 reading https://...user1
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    4 cooking https://...user4
# 5    4 singing https://...user4

# Keep every user once while maximising the number of distinct hobbies.
somewhatUnique(df = dat, colA = "hobby", colB = "user")
#   user   hobby user_profile_url
# 1    4 cooking https://...user4
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    1 reading https://...user1
如果您有任何问题,请告诉我。
评论:为什么 user1 只得到一个爱好,而 user4 在输出中得到两个爱好?你的意思是,如果没有其他用户有这项爱好,就重复使用同一个用户吗?可以尝试 df1[!duplicated(df1[1:2]), ]
@akrun 他想要的输出似乎稍微复杂一点:他希望每个爱好只对应一个用户,并且要让被选中的不同用户数量尽可能多……例如 uniq.dat……
@EvanFriedland:我同意,这确实更复杂。user3 得到 gaming,是因为 user1 得到了 reading,于是 user2 得到 dancing。
非常复杂……我删除了我原来的评论,因为我看到您已经理解了提问者想要的输出。
谢谢,我试了一下,但出现了以下错误:Error in lapply(j, function(x) do.call(rbind, x)) : object 'j' not found
已做大幅修改,现在应该适用于不同的情况,请参见我的示例。
太棒了!它适用于任意两列的情况。非常感谢!:)
我尝试了这个方法,但在调用 tibble(user = c(1,1,2,2,3,4,4), hobby = c("reading", ...) 时出现错误:在 `frame_data()` 调用中未检测到列名。
您可能需要先在开头加载包:library(tibble)。
#' Keep one row per distinct `colB` value, using as many distinct `colA`
#' values as possible.
#'
#' Every combination of "one `colA` candidate per `colB` value" is enumerated
#' with `expand.grid()` and the first combination with the most distinct
#' `colA` values wins. The search is exhaustive, so its cost grows with the
#' product of the candidate counts; it is meant for small inputs.
#'
#' @param df A data frame (or tibble).
#' @param colA Column (name or index) whose distinct values are maximised.
#' @param colB Column (name or index) whose every value is kept exactly once.
#' @return The selected rows of `df`, sorted by `colA`, with row names reset.
#'   Rows with a missing `colA` or `colB` value are never selected; an input
#'   without usable rows yields a zero-row result.
somewhatUnique <- function(df, colA, colB) {
  stopifnot(is.data.frame(df), length(colA) == 1L, length(colB) == 1L)

  a_values <- df[[colA]]

  # Row indices per colB value. Groups come out in sorted (factor level)
  # order; unused factor levels and NA keys are dropped.
  usable <- which(!is.na(a_values))
  rows_by_b <- split(usable, df[[colB]][usable], drop = TRUE)
  if (length(rows_by_b) == 0L) {
    return(df[0L, , drop = FALSE])
  }

  # For each colB value: the first row of every distinct colA value, ordered
  # by colA so that ties are broken deterministically.
  candidate_rows <- lapply(rows_by_b, function(rows) {
    first_rows <- rows[!duplicated(a_values[rows])]
    first_rows[order(a_values[first_rows])]
  })

  # All ways to choose one candidate row per colB value (first group varies
  # fastest), as an integer matrix with one combination per row.
  grid <- as.matrix(
    expand.grid(unname(candidate_rows), KEEP.OUT.ATTRS = FALSE)
  )
  n_distinct <- vapply(
    seq_len(nrow(grid)),
    function(i) length(unique(a_values[grid[i, ]])),
    integer(1)
  )

  # which.max() returns the first maximum, i.e. the earliest best combination.
  picked <- grid[which.max(n_distinct), ]

  out <- df[picked, , drop = FALSE]
  out <- out[order(out[[colA]]), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Keep every hobby once while maximising the number of distinct users.
somewhatUnique(df = dat, colA = "user", colB = "hobby")
#   user   hobby user_profile_url
# 1    1 reading https://...user1
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    4 cooking https://...user4
# 5    4 singing https://...user4

# Keep every user once while maximising the number of distinct hobbies.
somewhatUnique(df = dat, colA = "hobby", colB = "user")
#   user   hobby user_profile_url
# 1    4 cooking https://...user4
# 2    2 dancing https://...user2
# 3    3  gaming https://...user3
# 4    1 reading https://...user1