R中许多fisher.test p值的复杂代码_R

R中许多fisher.test p值的复杂代码

R中许多fisher.test p值的复杂代码,r,R,我是R的初学者，所以下面的内容对我来说非常复杂我有以下数据。frame数据来自纽约市5个行政区以及2012-2015年。每年有两类：P和Q 数据 input_df = data.frame( Manhattan=c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0), Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0),

我是R的初学者，所以下面的内容对我来说非常复杂

我有以下

数据。frame

数据来自纽约市5个行政区以及2012-2015年。每年有两类：P和Q

数据

 input_df = data.frame(
      Manhattan=c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0), 
      Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0), 
      Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0), 
      The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0), 
      Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
      "2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
      "2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"), 
      "2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"), 
      "2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
 check.names=FALSE)

我想使用

fisher.test

系统地确定在任何两个行政区，p类事件（“1”）是否比Q类事件（反之亦然）更频繁地同时发生

例如，2012年，曼哈顿和布鲁克林的事件是否同时发生（同一行中的“1”）p类事件比Q类事件更频繁？这是P的10分之4，Q的9分之0，因此

fisher.test（矩阵（c（4,6,0,9），nrow=2））$P.value

等于

0.08668731

有没有办法系统地做到这一点？下面是一个简单的开始和我理想的输出

data.frame

。我会很高兴有任何接近这个输出。多谢各位

 input_df = data.frame(
      Manhattan=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0), 
      Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0), 
      Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0), 
      The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0), 
      Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
      "2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
      "2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"), 
      "2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"), 
      "2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
 check.names=FALSE)

代码（仅开始）

为@user2974951编辑
用户2974951，请您帮助我在以下可选的
input_df
上顺利运行相同的代码好吗？如果我使用这个
input\u df
，不幸的是它会抛出一个错误，因为
tmp3
不再是一个2x2表。我非常感谢你的帮助。多谢各位

input_df = data.frame( Manhattan=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0), Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0), Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0), The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0), Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), "2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), "2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"), "2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"), "2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), check.names=FALSE)

下面是我使用for循环的尝试

res=vector("list",4) names(res)=colnames(input_df)[6:9] for (k in 1:4) { #years res[[k]]=matrix(NA,5,5) rownames(res[[k]])=colnames(res[[k]])=colnames(input_df)[1:5] for (i in 1:4) { #first in par for (j in (i+1):5) { #second in pair tmp1=which(input_df[,k+5]=="P") tmp2=which(input_df[,k+5]=="Q") tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2], tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T) res[[k]][i,j]=fisher.test(tmp5)$p.value } } }
以及所有p值的输出

res $`2012` Manhattan Brooklyn Queens The_Bronx Staten_Island Manhattan NA 0.08668731 0.3034056 0.3034056 1 Brooklyn NA NA 1.0000000 1.0000000 1 Queens NA NA NA 0.3498452 1 The_Bronx NA NA NA NA 1 Staten_Island NA NA NA NA NA $`2013` Manhattan Brooklyn Queens The_Bronx Staten_Island Manhattan NA 0.6026832 0.6026832 0.30469556 0.3684211 Brooklyn NA NA 1.0000000 0.03611971 0.3684211 Queens NA NA NA 1.00000000 1.0000000 The_Bronx NA NA NA NA 0.1228070 Staten_Island NA NA NA NA NA $`2014` Manhattan Brooklyn Queens The_Bronx Staten_Island Manhattan NA 0.5820433 0.1408669 0.6284830 1 Brooklyn NA NA 0.2105263 1.0000000 1 Queens NA NA NA 0.3498452 1 The_Bronx NA NA NA NA 1 Staten_Island NA NA NA NA NA $`2015` Manhattan Brooklyn Queens The_Bronx Staten_Island Manhattan NA 1 0.6026832 0.6026832 0.4210526 Brooklyn NA NA 0.4853801 1.0000000 0.4210526 Queens NA NA NA 0.3188854 1.0000000 The_Bronx NA NA NA NA 1.0000000 Staten_Island NA NA NA NA NA
或者，如果您希望在一个数据帧中包含所有信息和附加信息

res=matrix(NA,4*choose(5,2),8) colnames(res)=c("borough_1","borough_2","year","P_both_boroughs_1", "P_not_both_boroughs_1","Q_both_boroughs_1", "Q_not_both_boroughs_1","fisher.test.pval") m=1 for (k in 1:4) { #years for (i in 1:4) { #first in par for (j in (i+1):5) { #second in pair tmp1=which(input_df[,k+5]=="P") tmp2=which(input_df[,k+5]=="Q") tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2], tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T) res[m,]=c(colnames(input_df)[i], colnames(input_df)[j], colnames(input_df)[k+5], tmp5[1,1],tmp5[1,2],tmp5[2,1],tmp5[2,2], fisher.test(tmp5)$p.value) m=m+1 } } }
以及输出的前几行

data.frame(res) borough_1 borough_2 year P_both_boroughs_1 P_not_both_boroughs_1 1 Manhattan Brooklyn 2012 4 6 2 Manhattan Queens 2012 4 6 3 Manhattan The_Bronx 2012 4 6 4 Manhattan Staten_Island 2012 1 9 5 Brooklyn Queens 2012 1 9 6 Brooklyn The_Bronx 2012 2 8 Q_both_boroughs_1 Q_not_both_boroughs_1 fisher.test.pval 1 0 9 0.0866873065015479 2 1 8 0.303405572755418 3 1 8 0.303405572755418 4 0 9 1 5 1 8 1 6 1 8 1
编辑：作为缺失级别的修复，您可以使用自己的表格功能

myTable=function(t1,t2) { res=matrix(NA,2,2) res[1,1]=sum(t1==0 & t2==0) res[1,2]=sum(t1==0 & t2==1) res[2,1]=sum(t1==1 & t2==0) res[2,2]=sum(t1==1 & t2==1) return(res) }

用这个代替
表
我将按如下方式解决这个问题。首先，我加载用于分析的包

#包图书馆（dplyr）图书馆（tidyr）图书馆（purrr）
并创建数据集

#数据输入数据：6 x 9 #>曼哈顿布鲁克林皇后区布朗克斯州斯塔顿岛'2012'`2013'`2014` #> #>10110110p #>2110110p #>30000p #>41100pq #>51010P Q #>61100p #> # ... 还有一个变量：`2015`
然后我将数据集从宽结构更改为长结构。列
year
和
borough
采用值
2012
，…，
2015
和
曼哈顿
。。。，
Staten_Island
，而
category
和
flag
列则为数据集中的
borough
和
year
组合取相应的值。我需要这个结构来完成后续的功能

#整理整齐的输入\u df% 收集（“年度”、“类别”、`2012`:`2015`）%>% 聚集（“自治区”、“旗帜”-类别，-年份）整理输入 #>#A tible:380 x 4 #>年份类别区旗 #> #>1 2012页曼哈顿1 #>2012年2月1日 #>2012年3月30日 #>2012年4月1日 #>2012年5月1日 #>2012年6月1日 #>2012年7月1日 #>2012年8月30日 #>2012年9月1日 #>2012年10月1日 #> # ... 还有370行
我还需要一个包含所有行政区名称的向量

自治区年份数据 #> #> 1 2012 #> 2 2013 #> 3 2014 #> 4 2015
并创建一个新函数来执行我上面描述的过程。我现在可以使用所描述的
nest
-
map
方法
函数的第一部分在数据框中创建一个新列，该列表示每个类别和自治区组合的唯一ID，而代码的第二部分创建一个新数据框，其中所有自治区组合每次取2个，并关联标志和类别的相应值（即0/1和p/Q）

创建行政区组合% 变异（ID=1:n（））%>% 解组（） #创建n个行政区的所有组合，每次2个。 t（梳（长度（行政区），2））%>% #以一种不可压缩的方式变换矩阵可修复（.name\u repair=~c（“自治区1”、“自治区2”）%>% #将每个矩阵值与相应的自治区名称关联变异（borough_1=borough[borough_1]，borough_2=borough[borough_2]）%>% #以第一个自治区的名称连接两个数据帧内部连接（数据，由=c（“自治区1”=“自治区”））%>% #将两个数据帧合并到第二列（类别）的名称中 #还有唯一的ID 内部联接（数据，由=c（“自治区2”=“自治区”、“类别”、“ID”））%>% #创建一个新变量，用于检查事件是否同时发生变异（相等=因子（flag.x==1和flag.y==1，levels=c（真、假））） }
现在我可以使用
map
函数将该函数应用于
nested\u输入。我必须使用map ，因为我每年都需要单独应用该函数。这就是结果flag.x 是第一个行政区的flag 值，而flag.y 是第二个行政区的flag 值 unnested_input_df <- nested_input_df %>% mutate(data = map(data, create_boroughs_combinations, borough = borough)) %>% unnest() unnested_input_df #> # A tibble: 760 x 8 #> year borough_1 borough_2 category flag.x ID flag.y equal #> <chr> <chr> <chr> <chr> <dbl> <int> <dbl> <fct> #> 1 2012 Manhattan Brooklyn P 1 1 0 FALSE #> 2 2012 Manhattan Brooklyn P 1 2 0 FALSE #> 3 2012 Manhattan Brooklyn P 0 3 0 FALSE #> 4 2012 Manhattan Brooklyn P 1 4 1 TRUE #> 5 2012 Manhattan Brooklyn P 1 5 0 FALSE #> 6 2012 Manhattan Brooklyn P 1 6 1 TRUE #> 7 2012 Manhattan Brooklyn P 1 7 0 FALSE #> 8 2012 Manhattan Brooklyn P 0 8 0 FALSE #> 9 2012 Manhattan Brooklyn P 1 9 1 TRUE #> 10 2012 Manhattan Brooklyn P 1 10 1 TRUE #> # ... with 750 more rows unested_input_df% 变异（数据=地图（数据，创建区组合，区=区））%>% unnest（）未列出的输入 #>#A tible:760 x 8 #>年份区号1区号2类别标志.x ID标志.y相等 #> #>1 2012曼哈顿布鲁克林第1页1 unnested_input_df <- nested_input_df %>% mutate(data = map(data, create_boroughs_combinations, borough = borough)) %>% unnest() unnested_input_df #> # A tibble: 760 x 8 #> year borough_1 borough_2 category flag.x ID flag.y equal #> <chr> <chr> <chr> <chr> <dbl> <int> <dbl> <fct> #> 1 2012 Manhattan Brooklyn P 1 1 0 FALSE #> 2 2012 Manhattan Brooklyn P 1 2 0 FALSE #> 3 2012 Manhattan Brooklyn P 0 3 0 FALSE #> 4 2012 Manhattan Brooklyn P 1 4 1 TRUE #> 5 2012 Manhattan Brooklyn P 1 5 0 FALSE #> 6 2012 Manhattan Brooklyn P 1 6 1 TRUE #> 7 2012 Manhattan Brooklyn P 1 7 0 FALSE #> 8 2012 Manhattan Brooklyn P 0 8 0 FALSE #> 9 2012 Manhattan Brooklyn P 1 9 1 TRUE #> 10 2012 Manhattan Brooklyn P 1 10 1 TRUE #> # ... with 750 more rows nested_input_df <- unnested_input_df %>% nest(-year, -borough_1, -borough_2) nested_input_df #> # A tibble: 40 x 4 #> year borough_1 borough_2 data #> <chr> <chr> <chr> <list> #> 1 2012 Manhattan Brooklyn <tibble [19 x 5]> #> 2 2012 Manhattan Queens <tibble [19 x 5]> #> 3 2012 Manhattan The_Bronx <tibble [19 x 5]> #> 4 2012 Manhattan Staten_Island <tibble [19 x 5]> #> 5 2012 Brooklyn Queens <tibble [19 x 5]> #> 6 2012 Brooklyn The_Bronx <tibble [19 x 5]> #> 7 2012 Brooklyn Staten_Island <tibble [19 x 5]> #> 8 2012 Queens The_Bronx <tibble [19 x 5]> #> 9 2012 Queens Staten_Island <tibble [19 x 5]> #> 10 2012 The_Bronx Staten_Island <tibble [19 x 5]> #> # ... with 30 more rows