在 R 中使用 full_join(完全连接)时产生了额外的行和 NA
我尝试使用full_join连接两个数据帧,以下是我的数据子集:在r中添加额外行和NA的完整连接,r,dataframe,join,dplyr,merge,R,Dataframe,Join,Dplyr,Merge,我尝试使用full_join连接两个数据帧,以下是我的数据子集: df1 <- structure(list(Team = structure(c(4L, 3L, 5L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 14L, 13L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 22L, 23L, 23L, 24L, 24L,
# Sample data for the question, reproduced from dput() output.
# df1 is a plain data.frame whose row.names declare 56 rows, with columns:
#   Team         - factor; most labels carry a trailing space (e.g. "Bucks "),
#                  a few do not ("76ers", "Bucks", "Hornets", "Pistons"), and
#                  the labels include both "Bull "/"Bulls " and "Net "/"Nets "
#   Injury.Count - integer
#   HomevsAway   - factor with labels "0" and "1"
df1 <- structure(list(Team = structure(c(4L, 3L, 5L, 6L, 7L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 14L, 13L, 15L, 15L,
16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L,
32L, 32L, 33L, 33L, 34L, 34L, 2L, 1L, 26L, 27L), .Label = c("76ers",
"76ers ", "Bucks", "Bucks ", "Bull ", "Bulls ", "Cavaliers ",
"Celtics ", "Clippers ", "Grizzlies ", "Hawks ", "Heat ", "Hornets",
"Hornets ", "Jazz ", "Kings ", "Knicks ", "Lakers ", "Magic ",
"Mavericks ", "Net ", "Nets ", "Nuggets ", "Pacers ", "Pelicans ",
"Pistons", "Pistons ", "Raptors ", "Rockets ", "Spurs ", "Thunder ",
"Timberwolves ", "Warriors ", "Wizards "), class = "factor"),
Injury.Count = c(3L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 1L,
0L, 2L, 1L, 0L, 5L, 4L, 3L, 2L, 3L, 0L, 3L, 3L, 4L, 6L, 5L,
0L, 2L, 2L, 1L, 2L, 0L, 1L, 3L, 4L, 2L, 6L, 2L, 1L, 1L, 1L,
3L, 3L, 4L, 5L, 1L, 6L, 4L, 2L, 0L, 2L, 2L, 1L, 5L, 6L, 1L,
1L), HomevsAway = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c(NA,
-56L), class = "data.frame")
# Second sample table, reproduced from dput() output.
# df2 has class c("grouped_df", "tbl_df", "tbl", "data.frame"); its `groups`
# attribute groups the rows by Team (28 groups). row.names declare 56 rows.
# Columns:
#   Team       - factor with 28 labels, none with trailing whitespace
#   HomevsAway - factor with labels "0" and "1"
#   t_1, t_3   - numeric
df2 <- structure(list(Team = structure(c(1L, 1L, 2L, 2L, 3L, 4L, 4L,
5L, 6L, 7L, 8L, 9L, 9L, 10L, 10L, 11L, 12L, 12L, 13L, 13L, 14L,
15L, 15L, 16L, 16L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L,
22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 28L, 28L,
3L, 5L, 6L, 7L, 8L, 11L, 14L, 17L, 27L), .Label = c("76ers",
"Bucks", "Bulls", "Cavaliers", "Celtics", "Clippers", "Grizzlies",
"Hawks", "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers",
"Magic", "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans",
"Pistons", "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves",
"Warriors", "Wizards"), class = "factor"), HomevsAway = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
t_1 = c(55.883, 140.1, 32.2, 37.967, 29.85, 24.317, 57.316,
17.967, 19.05, 36.95, 16.167, 95.317, 86.533, 21.334, 52.567,
40.75, 28.3, 68.15, 97.067, 102.233, 26.866, 71.033, 34.467,
24.233, 42.033, 22.433, 59.033, 41.516, 12.7, 107.996, 6.5,
32.783, 0, 23.217, 13.93, 0, 54.88, 23.617, 83.834, 106.794,
17.56, 27.76, 85.83, 0.017, 35.183, 22.467, 25.033, 0, 0,
0, 0, 0, 0, 0, 0, 0), t_3 = c(197.3164, 388.6827, 126.2663,
111.916, 61.95, 91.55, 167.067, 104.083, 71.067, 135.383,
45.633, 261.317, 267.399, 114.6997, 159.2, 152.034, 84.8337,
204.3003, 351.449, 376.317, 86.333, 213.9, 99.767, 65.1,
131.767, 73.317, 126.416, 129.066, 73.383, 347.0994, 4761,
113.367, 0, 89.933, 59.8, 0, 188.983, 124.384, 215.666, 289.9667,
92, 144.2497, 254.083, 32.0333, 122.1837, 102.533, 82.817,
0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -56L), groups = structure(list(
Team = structure(1:28, .Label = c("76ers", "Bucks", "Bulls",
"Cavaliers", "Celtics", "Clippers", "Grizzlies", "Hawks",
"Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", "Magic",
"Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", "Pistons",
"Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves",
"Warriors", "Wizards"), class = "factor"), .rows = structure(list(
1:2, 3:4, c(5L, 48L), 6:7, c(8L, 49L), c(9L, 50L), c(10L,
51L), c(11L, 52L), 12:13, 14:15, c(16L, 53L), 17:18,
19:20, c(21L, 54L), 22:23, 24:25, c(26L, 55L), 27:28,
29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42, 43:44,
c(45L, 56L), 46:47), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 28L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
问题在于,第一个 data.frame(df1)中的球队名称末尾多了一个空格。也就是说,df1 中存储的并不是字符串 "Bucks",而是字符串 "Bucks "(末尾带一个空格)。
这两个字符串并不相等,因此对应的行无法匹配、也就无法连接。
下面是修复数据的方法:首先删除球队名称末尾的空格(例如使用 sub 或 trimws),并将 Team 转换为字符向量;之后 full_join 就能按预期工作:
# Normalise the join key before joining: most Team labels in df1 carry a
# trailing space ("Bucks "), whereas the labels in df2 do not.
# trimws() strips only leading/trailing whitespace, so a space inside a name
# is preserved; sub(" +", "", x) would instead delete the first run of spaces
# wherever it occurs in the string.
df1_new <- df1 %>%
  as_tibble() %>%
  mutate(Team = trimws(as.character(Team)))

# df2 is a grouped_df grouped by Team; drop the grouping before rewriting the
# grouping column so the result is an ordinary (ungrouped) tibble.
df2_new <- df2 %>%
  ungroup() %>%
  mutate(Team = as.character(Team))

# A full join keeps rows whose (Team, HomevsAway) pair appears in only one
# table and pads the other table's columns with NA -- e.g. the "Bull" and
# "Net" spellings that exist only in df1.
df1_new %>%
  full_join(df2_new, by = c("Team", "HomevsAway"))
# A tibble: 58 x 5
Team Injury.Count HomevsAway t_1 t_3
<chr> <int> <fct> <dbl> <dbl>
1 Bucks 3 0 32.2 126.
2 Bucks 3 1 38.0 112.
3 Bull 1 0 NA NA
4 Bulls 1 1 0 0
5 Cavaliers 1 0 24.3 91.6
6 Cavaliers 2 1 57.3 167.
7 Celtics 0 0 0 0
8 Celtics 2 1 18.0 104.
9 Clippers 1 0 0 0
10 Clippers 1 1 19.0 71.1
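df1_new <- df1 %>% as_tibble() %>%
mutate(Team = sub(" +", "", as.character(Team)))
df2_new <- df2 %>%
mutate(Team = as.character(Team))
df1_new %>% full_join(df2_new, by = c("Team", "HomevsAway"))
# A tibble: 58 x 5
Team Injury.Count HomevsAway t_1 t_3
<chr> <int> <fct> <dbl> <dbl>
1 Bucks 3 0 32.2 126.
2 Bucks 3 1 38.0 112.
3 Bull 1 0 NA NA
4 Bulls 1 1 0 0
5 Cavaliers 1 0 24.3 91.6
6 Cavaliers 2 1 57.3 167.
7 Celtics 0 0 0 0
8 Celtics 2 1 18.0 104.
9 Clippers 1 0 0 0
10 Clippers 1 1 19.0 71.1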
请注意,结果中仍有一些 NA。这是由数据中的拼写错误造成的:"Bull" 与 "Bulls"、"Net" 与 "Nets"。
@Cettt 知道为什么我的实际数据会出现这种情况吗?
@Cettt 有没有一个函数可以把数据从 R 复制/粘贴到 Stack Overflow?
可以使用 dput:运行 dput(mydata),并把结果贴到问题中。
@Cettt 请告诉我这样是否可行,我已在上面更新了问题。
# Normalise the join key before joining: most Team labels in df1 carry a
# trailing space ("Bucks "), whereas the labels in df2 do not.
# trimws() strips only leading/trailing whitespace, so a space inside a name
# is preserved; sub(" +", "", x) would instead delete the first run of spaces
# wherever it occurs in the string.
df1_new <- df1 %>%
  as_tibble() %>%
  mutate(Team = trimws(as.character(Team)))

# df2 is a grouped_df grouped by Team; drop the grouping before rewriting the
# grouping column so the result is an ordinary (ungrouped) tibble.
df2_new <- df2 %>%
  ungroup() %>%
  mutate(Team = as.character(Team))

# A full join keeps rows whose (Team, HomevsAway) pair appears in only one
# table and pads the other table's columns with NA -- e.g. the "Bull" and
# "Net" spellings that exist only in df1.
df1_new %>%
  full_join(df2_new, by = c("Team", "HomevsAway"))
# A tibble: 58 x 5
Team Injury.Count HomevsAway t_1 t_3
<chr> <int> <fct> <dbl> <dbl>
1 Bucks 3 0 32.2 126.
2 Bucks 3 1 38.0 112.
3 Bull 1 0 NA NA
4 Bulls 1 1 0 0
5 Cavaliers 1 0 24.3 91.6
6 Cavaliers 2 1 57.3 167.
7 Celtics 0 0 0 0
8 Celtics 2 1 18.0 104.
9 Clippers 1 0 0 0
10 Clippers 1 1 19.0 71.1