将两个data.frame中的公共列与第1列到第3个data.frame中的公共字进行平均_R_Mean_Word

按第1列中的公共词对两个data.frame的公共列求平均，并生成第3个data.frame

将两个data.frame中的公共列与第1列到第3个data.frame中的公共字进行平均,r,mean,word,R,Mean,Word,我有两个数据集。两者都包含较大的数据部分，实际数据集约为100万行乘以300列。我想通过两个数据集中的常用词将它们合并在一起。此外，我想将对应于列和常用词的每个单元格平均起来，并生成第三个data.frame。下面是一些示例数据这是第一个数据集。它更小 set.seed(511111) #first data.frame with a smaller datasset df<-matrix(data=rnorm(n=300,mean=10,sd=300),nrow=6,nc

我有两个数据集。两者的数据量都很大，实际数据集约为100万行、300列。我想按两个数据集中共有的词（公共词）把它们合并起来，并对每个公共词在各对应列上的值求平均，生成第三个data.frame。下面是一些示例数据：

这是第一个数据集。它更小

# Build the first (smaller) example dataset: six words, two value columns.
set.seed(511111)
words <- c("a", "by", "the", "hi", "bye", "see")
n_value_cols <- 2L
# Draw exactly one value per cell. Requesting 300 draws for a 6 x 2 matrix
# discards 288 of them and triggers a size-mismatch warning on recent R
# versions; the first 12 draws (and therefore the cell values) are unchanged.
df <- matrix(
  data = rnorm(n = length(words) * n_value_cols, mean = 10, sd = 300),
  nrow = length(words),
  ncol = n_value_cols
)
# cbind() with a character vector coerces the whole matrix to character, so
# V1 and V2 hold numbers stored as strings until they are converted later.
df <- cbind(words, df)
colnames(df) <- c("y", paste0("V", seq_len(n_value_cols)))
df


          y     V1                  V2                 
[1,] "a"   "158.979716349289"  "-16.2574951855564"
[2,] "by"  "164.995114380192"  "-68.1726437428752"
[3,] "the" "720.223066121601"  "1054.04351778352" 
[4,] "hi"  "-288.629142240942" "537.900385284324" 
[5,] "bye" "-581.097490056299" "183.495782507513" 
[6,] "see" "-192.129441997881" "-117.187652711745"

对于第一个值-200.365，通过取df[1,2]（-399.988526255518）和df2[1,2]（“-1.4723244399644”）的平均值计算得出，该行的常用词为“a”。对于第二个值8.64，通过取df[1,3]（16.9236076090913）和df2[1,3]（“-0.520509732658999”）的平均值计算得出，该行的常用词为“a”

df3
# A tibble: 5 x 3
  y         V1      V2
  <chr>  <dbl>   <dbl>
1 a       80.8   -7.79
2 by      82.3  -34.3 
3 bye   -290.    91.8 
4 hi    -144.   270.  
5 see    -94.9  -58.7 

将两个数据框按行绑定在一起，把以 V 开头的列转换为数值，只保留 `common.words` 中的词，按 `y` 分组，并计算各列的平均值：

library(dplyr)

# Stack both data sets, keep only the words listed in common.words, and
# average every value column per word. Uses across(), which needs
# dplyr >= 1.0.0 and replaces the superseded mutate_at()/summarise_all().
bind_rows(df, df2) %>%
  # Go through as.character() first: if a value column is still a factor,
  # as.numeric() alone would return the level codes instead of the numbers.
  mutate(
    across(starts_with("V"), function(x) as.numeric(as.character(x)))
  ) %>%
  filter(y %in% common.words) %>%
  group_by(y) %>%
  summarise(across(everything(), mean))


我们也可以用同样的逻辑，通过基础 R 的 `aggregate` 来实现：

# Stack both data sets row-wise.
df1 <- rbind(df, df2)
# Convert every value column (everything except the key column y) to numeric.
# as.character() comes first so factor columns yield their labels rather than
# their integer level codes. Selecting by name instead of the fixed positions
# 2:3 keeps this correct when there are more than two value columns.
value_cols <- setdiff(names(df1), "y")
df1[value_cols] <- lapply(
  df1[value_cols],
  function(x) as.numeric(as.character(x))
)
# Keep only rows whose word is in common.words, then average per word.
aggregate(
  . ~ y,
  data = df1[df1$y %in% common.words, , drop = FALSE],
  FUN = mean
)

#r查找两个数据集
df1
 #what I want the dataset to look like after its finished merging and averaging columns V1 and V2 for common words

 # Expected result: one row per common word with the averaged V1 and V2.
 # The values are written as numbers (not quoted strings) so both columns are
 # numeric, and the stray "+" console continuation prompt has been removed so
 # the snippet parses when pasted.
 numbers <- data.frame(
   V1 = c(-200.365, 121.227, 91.187, 29.125, 100.76),
   V2 = c(8.64, 80.558, -138.89, 68.11, 86.454)
 )
 df3 <- cbind(common.words, numbers)
 df3


  common.words         V1      V2
1 a       80.8   -7.79
2 by      82.3  -34.3 
3 bye   -290.    91.8 
4 hi    -144.   270.  
5 see    -94.9  -58.7 

# Console transcript: convert both inputs to data frames, then build df3.
# NOTE(review): df2 and common.words are defined outside this excerpt.
df <- data.frame(df)
 df2 <- data.frame(df2)
 library(dplyr)
 #df.list=list(df,df2)
 # Stack the rows, convert the V* columns to numeric, keep only the words in
 # common.words, and average each column per word. The "+" at the start of
 # the following lines appears to be the console continuation prompt, not an
 # operator; strip it before pasting this into a script.
 df3<-bind_rows(df,df2) %>%
+   mutate_at(vars(starts_with("V")), as.numeric) %>%
+   filter(y %in% common.words) %>%
+   group_by(y) %>%
+   summarise_all(mean)
Warning messages:
1: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
2: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
3: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
4: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
5: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
6: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
7: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
8: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
9: In bind_rows_(x, .id) :
  binding character and factor vector, coercing into character vector
> df3
# A tibble: 5 x 3
  y         V1      V2
  <chr>  <dbl>   <dbl>
1 a       80.8   -7.79
2 by      82.3  -34.3 
3 bye   -290.    91.8 
4 hi    -144.   270.  
5 see    -94.9  -58.7 

library(dplyr)

# Stack both data sets, keep only the words listed in common.words, and
# average every value column per word. Uses across(), which needs
# dplyr >= 1.0.0 and replaces the superseded mutate_at()/summarise_all().
bind_rows(df, df2) %>%
  # Go through as.character() first: if a value column is still a factor,
  # as.numeric() alone would return the level codes instead of the numbers.
  mutate(
    across(starts_with("V"), function(x) as.numeric(as.character(x)))
  ) %>%
  filter(y %in% common.words) %>%
  group_by(y) %>%
  summarise(across(everything(), mean))

# Stack both data sets row-wise.
df1 <- rbind(df, df2)
# Convert every value column (everything except the key column y) to numeric.
# as.character() comes first so factor columns yield their labels rather than
# their integer level codes. Selecting by name instead of the fixed positions
# 2:3 keeps this correct when there are more than two value columns.
value_cols <- setdiff(names(df1), "y")
df1[value_cols] <- lapply(
  df1[value_cols],
  function(x) as.numeric(as.character(x))
)
# Keep only rows whose word is in common.words, then average per word.
aggregate(
  . ~ y,
  data = df1[df1$y %in% common.words, , drop = FALSE],
  FUN = mean
)

# Convert both inputs to data frames. stringsAsFactors = FALSE keeps text
# columns as character on every R version (before R 4.0 the default turned
# them into factors), which avoids factor-level mismatches when the two data
# sets are stacked afterwards.
df <- data.frame(df, stringsAsFactors = FALSE)
df2 <- data.frame(df2, stringsAsFactors = FALSE)