使用dplyr mutate()规范化数据会带来不一致
我试图用下面的代码从这篇博客文章中复制框架,但结果似乎不一致使用dplyr mutate()规范化数据会带来不一致,r,dplyr,mutate,R,Dplyr,Mutate,我试图用下面的代码从这篇博客文章中复制框架,但结果似乎不一致 library(tidyverse) library(magrittr) ids <- c("1617", "1516", "1415", "1314", "1213", "1112", "1011", "0910", "0809", "0708", "0607", "0506") data <- ids %>% map(function(i) {read_csv(paste0("http://www.fo
library(tidyverse)
library(magrittr)
# Season identifiers; each one is spliced into the download URL below.
ids <- c(
  "1617", "1516", "1415", "1314", "1213", "1112",
  "1011", "0910", "0809", "0708", "0607", "0506"
)

# For every season: download its F1.csv, keep the Date..AST column range,
# tag each row with the season identifier, then stack all seasons together.
data <- ids %>%
  map(function(season_id) {
    csv_url <- paste0(
      "http://www.football-data.co.uk/mmz4281/", season_id, "/F1.csv"
    )
    read_csv(csv_url) %>%
      select(Date:AST) %>%
      mutate(season = season_id)
  }) %>%
  bind_rows()

# Drop rows that have a missing value in any of the first three columns.
data <- data[complete.cases(data[, 1:3]), ]
# Home-side view of every match: one row per match, keyed by the home team.
# Columns are renamed to side-neutral names so this table can be stacked with
# the away-side view: BP/BC take the home/away goal columns, TP/TC the home/away
# HS/AS columns and TCP/TCC the home/away HST/AST columns.
# NOTE(review): presumably P/C stand for "for"/"against" -- confirm.
tmp1 <- data %>%
  select(season, HomeTeam, FTHG:FTR, HS:AST) %>%
  rename(
    team = HomeTeam,
    BP = FTHG,
    BC = FTAG,
    TP = HS,
    TC = AS,
    TCP = HST,
    TCC = AST
  ) %>%
  mutate(
    # Pts is 3 when FTR is "H", 0 when it is "A", and 1 for any other
    # non-missing value (a missing FTR yields NA).
    Pts = ifelse(FTR == "A", 0, ifelse(FTR == "H", 3, 1)),
    Terrain = "Domicile"
  )
# Away-side view of every match: one row per match, keyed by the away team.
# Same side-neutral column names as the home-side view, but with the home and
# away source columns swapped (BP comes from FTAG, BC from FTHG, and so on).
tmp2 <- data %>%
  select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
  rename(
    team = AwayTeam,
    BP = FTAG,
    BC = FTHG,
    TP = AS,
    TC = HS,
    TCP = AST,
    TCC = HST
  ) %>%
  mutate(
    # Pts is 3 when FTR is "A", 0 when it is "H", and 1 for any other
    # non-missing value (a missing FTR yields NA).
    Pts = ifelse(FTR == "H", 0, ifelse(FTR == "A", 3, 1)),
    Terrain = "Extérieur"
  )
# Stack the home-side rows on top of the away-side rows (two rows per match).
tmp3 <- list(tmp1, tmp2) %>%
  bind_rows()
# Per-season, per-team totals: row count (j), points, goal and shot sums, and
# the corresponding for-minus-against differences.
#
# NOTE: summarise() only peels off the last grouping variable (team), so this
# result is still a grouped_df grouped by `season`. A later mutate() on it
# evaluates summaries such as mean() within each season, not over the whole
# table; pipe through ungroup() when whole-table statistics are intended.
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(
    j = n(),
    pts = sum(Pts),
    diff_but = sum(BP) - sum(BC),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    diff_t_ca = sum(TCP, na.rm = TRUE) - sum(TCC, na.rm = TRUE),
    diff_t = sum(TP, na.rm = TRUE) - sum(TC, na.rm = TRUE),
    but_p = sum(BP),
    but_c = sum(BC),
    tir_ca_p = sum(TCP, na.rm = TRUE),
    tir_ca_c = sum(TCC, na.rm = TRUE),
    tir_p = sum(TP, na.rm = TRUE),
    tir_c = sum(TC, na.rm = TRUE)
  ) %>%
  # Order by season, then best points and goal difference first.
  arrange(season, desc(pts), desc(diff_but))
例如,当 but_p 的值为 83 时(第 5 行和第 7 行),我得到的 norm_attack 分别是 0.5612738 和 0.5128357。
这正常吗?我原以为 mean(l1_0517$but_p) 是一个固定值,因此对 l1_0517$but_p 中相同的值做对数归一化时,应该得到相同的结果。
更新
我尝试了一个更简单的例子,但我无法重现这个问题:
# Toy example: 200 random values drawn from runif(200, 15, 100) and truncated
# to integers, then the log of each value relative to the column mean.
df <- tibble(a = as.integer(runif(200, min = 15, max = 100)))
df <- df %>%
  mutate(norm_a = log(a / mean(a)))
df <- df %>% mutate(norm_a = a %>% divide_by(mean(a)) %>%
log())
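我查看了 l1_0517 的类型后找到了解决方案:
它是一个 grouped_df(仍按 season 分组的数据框),
因此 mutate() 中的 mean() 是在每个分组内分别计算的,相同的 but_p 值在不同赛季会得到不同的结果。
正确的代码是(在 summarise() 之后加上 ungroup()):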
# Per-season, per-team totals: row count (j), points, goal and shot sums, and
# the corresponding for-minus-against differences.
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(
    j = n(),
    pts = sum(Pts),
    diff_but = sum(BP) - sum(BC),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    diff_t_ca = sum(TCP, na.rm = TRUE) - sum(TCC, na.rm = TRUE),
    diff_t = sum(TP, na.rm = TRUE) - sum(TC, na.rm = TRUE),
    but_p = sum(BP),
    but_c = sum(BC),
    tir_ca_p = sum(TCP, na.rm = TRUE),
    tir_ca_c = sum(TCC, na.rm = TRUE),
    tir_p = sum(TP, na.rm = TRUE),
    tir_c = sum(TC, na.rm = TRUE)
  ) %>%
  # summarise() leaves the result grouped by `season`; drop that grouping so
  # later mutate() calls compute statistics such as mean() over all rows.
  ungroup() %>%
  # Order by season, then best points and goal difference first.
  arrange(season, desc(pts), desc(diff_but))
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
ungroup() %>%
arrange((season), desc(pts), desc(diff_but))
# Toy example: 200 random values drawn from runif(200, 15, 100) and truncated
# to integers, then the log of each value relative to the column mean.
df <- tibble(a = as.integer(runif(200, min = 15, max = 100)))
df <- df %>%
  mutate(norm_a = log(a / mean(a)))
# Per-season, per-team totals: row count (j), points, goal and shot sums, and
# the corresponding for-minus-against differences.
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(
    j = n(),
    pts = sum(Pts),
    diff_but = sum(BP) - sum(BC),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    diff_t_ca = sum(TCP, na.rm = TRUE) - sum(TCC, na.rm = TRUE),
    diff_t = sum(TP, na.rm = TRUE) - sum(TC, na.rm = TRUE),
    but_p = sum(BP),
    but_c = sum(BC),
    tir_ca_p = sum(TCP, na.rm = TRUE),
    tir_ca_c = sum(TCC, na.rm = TRUE),
    tir_p = sum(TP, na.rm = TRUE),
    tir_c = sum(TC, na.rm = TRUE)
  ) %>%
  # summarise() leaves the result grouped by `season`; drop that grouping so
  # later mutate() calls compute statistics such as mean() over all rows.
  ungroup() %>%
  # Order by season, then best points and goal difference first.
  arrange(season, desc(pts), desc(diff_but))