使R计算元素中的字符串数 dflibrary(tidyverse) df%>% 选择(-Column1,-Column2)%>%#删除这些列 收集(年份、值、-ID、`Cut Off`)%>%#重塑数据 na.omit()%>%#删除带有na的行 分隔_行
使R计算元素中的字符串数使R计算元素中的字符串数 dflibrary(tidyverse) df%>% 选择(-Column1,-Column2)%>%#删除这些列 收集(年份、值、-ID、`Cut Off`)%>%#重塑数据 na.omit()%>%#删除带有na的行 分隔_行,r,R,使R计算元素中的字符串数 dflibrary(tidyverse) df%>% 选择(-Column1,-Column2)%>%#删除这些列 收集(年份、值、-ID、`Cut Off`)%>%#重塑数据 na.omit()%>%#删除带有na的行 分隔_行(值)%>%#拆分值(使用逗号) 分组依据(ID,`Cut-Off`)%>%#每个ID和Cut-Off 总结(之前=n_不同(值[as.numeric(`Cut Off`)>as.numeric(year)]),#计算截止日期之后的不同值 Af
library(tidyverse)
df %>%
select(-Column1, -Column2) %>% # 删除这些列
gather(year, value, -ID, -`Cut-Off`) %>% # 重塑数据(宽表转长表)
na.omit() %>% # 删除带有 NA 的行
separate_rows(value) %>% # 拆分值(以逗号分隔)
group_by(ID, `Cut-Off`) %>% # 按每个 ID 和 Cut-Off 分组
summarise(Before = n_distinct(value[as.numeric(`Cut-Off`) > as.numeric(year)]), # 统计年份早于 Cut-Off 的不同值的个数
After = n_distinct(value[as.numeric(`Cut-Off`) < as.numeric(year)])) %>% # 统计年份晚于 Cut-Off 的不同值的个数
ungroup() %>% # 取消分组
select(-`Cut-Off`) %>% # 删除 Cut-Off 列
right_join(df, by = "ID") %>% # 联接回原始数据集
mutate_at(vars(Before, After), ~coalesce(., 0L)) # 将这两列中的 NA 替换为 0
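# # A tibble: 6 x 18
# ID Before After Column1 Column2 `Cut-Off` `2005` `2006` `2007` `2008` `2009` `2010` `2011` `2012`
# <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 2 1 NA 2011 2011 NA NA 15 16 15 NA NA NA
# 2 2 0 0 NA 2015 2015 NA NA NA NA NA NA NA NA
# 3 3 2 0 NA 2015 2015 NA NA 18 NA NA NA NA NA
# 4 4 0 3 NA 2006, 2006,~ 2005 30 NA NA 30, 27 20 30, 20 NA 20, 30
# 5 5 2 1 NA 2014, 2011 2011 18 NA 30, 18 18, 30 30, 18 NA NA NA
# 6 6 0 1 NA 2007 2007 NA NA NA NA NA NA NA 26
# # ... 还有 4 个变量:`2013` <chr>、`2014` <chr>、`2015` <chr>、`2016` <chr>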
说实话,这种数据格式根本没法干净利落地处理。要把单元格中以文本形式存放的多个数字,与以数字命名(同样是文本)的列、以及一列截止年份(Cut-Off)进行比较,是非常困难的。我会尝试把整个数据集重塑为长格式:让 ID/Year/Value 沿行向下排列,每个逗号分隔的值各占一行(ID/Year 随之重复)。我还会把 ID/Cut-Off 放在一张单独的表中,之后可以再合并回来。人生苦短,不值得与当前这种格式较劲。
# Example input table: six rows keyed by ID, all columns stored as character.
# Column2 and the year-named columns (`2005`..`2016`) may hold several
# comma-separated values in one cell; `Cut-Off` holds a single year per row.
# Built as a bare list with data.frame attributes so the stored row-name
# form (c(NA, 6L)) is exactly the one given here.
df <- structure(
  list(
    ID = c("1", "2", "3", "4", "5", "6"),
    Column1 = rep(NA_character_, 6L),
    Column2 = c(
      "2011", "2015", "2015", "2006, 2006, 2005, 2005, 2007",
      "2014, 2011", "2007"
    ),
    `Cut-Off` = c("2011", "2015", "2015", "2005", "2011", "2007"),
    `2005` = c(NA, NA, NA, "30", "18", NA),
    `2006` = rep(NA_character_, 6L),
    `2007` = c("15", NA, "18", NA, "30, 18", NA),
    `2008` = c("16", NA, NA, "30, 27", "18, 30", NA),
    `2009` = c("15", NA, NA, "20", "30, 18", NA),
    `2010` = c(NA, NA, NA, "30, 20", NA, NA),
    `2011` = rep(NA_character_, 6L),
    `2012` = c(NA, NA, NA, "20, 30", NA, "26"),
    `2013` = c("15", NA, "19", NA, NA, NA),
    `2014` = c(NA, NA, "18", NA, NA, NA),
    `2015` = c(NA, NA, "18", NA, "18", NA),
    `2016` = rep(NA_character_, 6L)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
# Table with the same 16 character columns and values as the example input,
# plus two numeric (double) columns, Before and After, one count per ID.
# Presumably the target output the pipeline is meant to reproduce.
# Built as a bare list with data.frame attributes so the stored row-name
# form (c(NA, 6L)) is exactly the one given here.
df_solution <- structure(
  list(
    ID = c("1", "2", "3", "4", "5", "6"),
    Column1 = rep(NA_character_, 6L),
    Column2 = c(
      "2011", "2015", "2015", "2006, 2006, 2005, 2005, 2007",
      "2014, 2011", "2007"
    ),
    `Cut-Off` = c("2011", "2015", "2015", "2005", "2011", "2007"),
    `2005` = c(NA, NA, NA, "30", "18", NA),
    `2006` = rep(NA_character_, 6L),
    `2007` = c("15", NA, "18", NA, "30, 18", NA),
    `2008` = c("16", NA, NA, "30, 27", "18, 30", NA),
    `2009` = c("15", NA, NA, "20", "30, 18", NA),
    `2010` = c(NA, NA, NA, "30, 20", NA, NA),
    `2011` = rep(NA_character_, 6L),
    `2012` = c(NA, NA, NA, "20, 30", NA, "26"),
    `2013` = c("15", NA, "19", NA, NA, NA),
    `2014` = c(NA, NA, "18", NA, NA, NA),
    `2015` = c(NA, NA, "18", NA, "18", NA),
    `2016` = rep(NA_character_, 6L),
    Before = c(2, 0, 2, 0, 2, 0),
    After = c(1, 0, 0, 3, 1, 1)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
library(tidyverse)
# For every ID, count the distinct values found in year columns strictly
# before (`Before`) and strictly after (`After`) that ID's `Cut-Off` year,
# then attach both counts to the original wide table. Values in the cut-off
# year itself are counted on neither side. The result is printed, not
# assigned.
df %>%
  select(-Column1, -Column2) %>% # these columns play no part in the counts
  pivot_longer( # wide -> long: one row per ID / year column
    cols = -c(ID, `Cut-Off`),
    names_to = "year",
    values_to = "value"
  ) %>%
  drop_na() %>% # drop rows with an NA in any column
  separate_rows(value) %>% # one row per comma-separated value
  group_by(ID, `Cut-Off`) %>% # one group per ID and its cut-off
  summarise(
    # distinct values in years earlier than the cut-off
    Before = n_distinct(value[as.numeric(`Cut-Off`) > as.numeric(year)]),
    # distinct values in years later than the cut-off
    After = n_distinct(value[as.numeric(`Cut-Off`) < as.numeric(year)])
  ) %>%
  ungroup() %>% # drop the remaining grouping
  select(-`Cut-Off`) %>% # avoid a duplicated column in the join below
  right_join(df, by = "ID") %>% # bring back every original row and column
  # IDs with no non-NA values are absent from the summary, so the join leaves
  # NA in both counts; report those as 0
  replace_na(list(Before = 0L, After = 0L))
# # A tibble: 6 x 18
# ID Before After Column1 Column2 `Cut-Off` `2005` `2006` `2007` `2008` `2009` `2010` `2011` `2012`
# <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 2 1 NA 2011 2011 NA NA 15 16 15 NA NA NA
# 2 2 0 0 NA 2015 2015 NA NA NA NA NA NA NA NA
# 3 3 2 0 NA 2015 2015 NA NA 18 NA NA NA NA NA
# 4 4 0 3 NA 2006, 2006,~ 2005 30 NA NA 30, 27 20 30, 20 NA 20, 30
# 5 5 2 1 NA 2014, 2011 2011 18 NA 30, 18 18, 30 30, 18 NA NA NA
# 6 6 0 1 NA 2007 2007 NA NA NA NA NA NA NA 26
# # ... with 4 more variables: `2013` <chr>, `2014` <chr>, `2015` <chr>, `2016` <chr>