#모든 시트를 하나의 리스트로 불러오는 함수read_excel_allsheets<-function(file,tibble=FALSE){sheets<-readxl::excel_sheets(file)x<-lapply(sheets,function(X)readxl::read_excel(file,sheet=X))if(!tibble)x<-lapply(x,as.data.frame)names(x)<-sheetsx}
11장
11장 연습문제 불러오기
#data_chap11에 연습문제 11장 모든 문제 저장data_chap11<-read_excel_allsheets("data_chap11.xls")#연습문제 각각 데이터 생성for(xin1:length(data_chap11)){assign(paste0('ex11_',c(1,10))[x],data_chap11[x])}#연습문제 데이터 형식을 리스트에서 데이터프레임으로 변환for(xin1:length(data_chap11)){assign(paste0('ex11_',c(1,10))[x],data.frame(data_chap11[x]))}
분산분석을 실시 후 그룹간 차이가 있다는 결론이 났을 때, 어느 그룹간 평균이 차이가 있는지 사후검정을 통해 살펴볼 것이다.
이 문제에서는 튜키 검정을 통해 사후검정을 시행해 보았다.
\[\begin{aligned} H_0 &: \mu_1=\mu_2=\mu_3=\mu_4=\mu_5 \\ H_A &: Mean \ strontium \ concentrations \ are \ not \ the \ same \ in \ all \ five \ bodies \ of \ water \end{aligned}\]
우선 분산분석의 가정인 independence, homoscedasticity, normality 살펴보자.
Appletree Lake-Beaver Lake(4-2), Angler’s Cove-Beaver Lake(3-2), Angler’s Cove-Appletree Lake(3-4) 간에 모평균의 유의한 차이가 없었으며 나머지 수역간에 비교시 유의수준 5% 하에 모평균의 유의한 차이가 있다고 할 수 있다.
Feed1-Feed4(1-4), Feed3-Feed2(3-2) 간에 모평균의 유의한 차이가 없었으며 나머지 사료간의 비교시 유의수준 5% 하에 모평균의 유의한 차이가 있다고 할 수 있다.
EXAMPLE 11.3
이 예제는 Example 11.1 데이터를 사용하여 모평균에 대한 신뢰구간을 구해본다.
library(MBESS)exam11_1.mean<-aggregate(ex11_1$exam11_1.strontium,by=list(ex11_1$exam11_1.pond),mean)exam11_1.n<-aggregate(ex11_1$exam11_1.strontium,by=list(ex11_1$exam11_1.pond),length)ci<-ci.c(means=exam11_1.mean$x,s.anova=sqrt(9.7652),c.weights=c(-3/3,1/3,1/3,1/3,0),n=exam11_1.n$x,N=sum(exam11_1.n$x),conf.level=.95)data.frame(ci)%>%kbl(caption="CI for the Population Means from ex11_1",escape=F)%>%kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))
ci<-ci.c(means=exam11_1.mean$x,s.anova=sqrt(9.7652),c.weights=c(0,-1/3,-1/3,-1/3,1),n=exam11_1.n$x,N=sum(exam11_1.n$x),conf.level=.95)data.frame(ci)%>%kbl(caption="CI for the Population Means from ex11_1",escape=F)%>%kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))
CI for the Population Means from ex11_1
Lower.Conf.Limit.Contrast
Contrast
Upper.Conf.Limit.Contrast
13.46052
16.49444
19.52837
그룹 5의 모평균과 그룹 2,4,3의 모평균의 차의 95% 신뢰구간은 (13.46, 19.53)으로 계산되었다.
중앙값은 7.9 값이 나왔으며, 7.9보다 큰 첫번째 수인 8.1보다 큰 수를 빈도로 잡아 North:3, East:2, South:9, West:6 값이 나온다.
s.size<-c(11,12,11,12)N<-46;n<-round(4/sum(1/s.size),2);se<-round(sqrt(n*N/(4*(N-1))),3)freq<-c()for(iin1:4){freq[i]=length(which(ex11_9$height[ex11_9$side==i]>8.1))}diff<-c(freq[3]-freq[2],freq[3]-freq[1],freq[4]-freq[2])table<-data.frame(Comparison=c("3 vs. 2","3 vs. 1","4 vs. 2"),Difference=diff,SE=se,q=round(diff/se,3),Critical_Value=3.633)table%>%kbl(caption="")%>%kable_classic(full_width=F,html_font="Cambria")
Comparison
Difference
SE
q
Critical_Value
3 vs. 2
7
1.713
4.086
3.633
3 vs. 1
6
1.713
3.503
3.633
4 vs. 2
4
1.713
2.335
3.633
4개의 그룹을 두 그룹 씩 중앙값을 통해 차이 검정을 실시하였다.
귀무가설은 ‘두 모집단의 중앙값은 동일하다.’, 대립가설은 ‘두 모집단의 중앙값은 동일하지 않다.’ 이며, q 통계량이 임계값 3.633 보다 클 경우 귀무가설을 기각한다.
검정 결과 그룹 3과 2를 비교했을 때, q 통계량이 임계값보다 크므로 귀무가설을 기각하여 3번과 2번 모집단의 중앙값이 서로 동일하지 않음을 알 수 있다.
EXAMPLE 11.10
이 예제는 분산이 동일하지 않은 결과에 대한 다중비교를 하는 문제이다.
var<-c(2.74,2.83,2.20,6.42);n<-c(50,48,50,50);v<-n-1;treat<-c("1","2","3","4")q_crit<-3.633;Difference=round(log(c(var[4]/var[3],var[4]/var[1],var[4]/var[2],var[2]/var[3])),4)SE=round(sqrt(c(1/v[4]+1/v[3],1/v[4]+1/v[1],1/v[4]+1/v[2],1/v[2]+1/v[3])),3)table<-data.frame(Comparison=c("4 vs. 3","4 vs. 1","4 vs. 2","2 vs. 3"),Difference,SE,q=round(Difference/SE,3),Critical_Value=q_crit)table%>%kbl(caption="")%>%kable_classic(full_width=F,html_font="Cambria")
Comparison
Difference
SE
q
Critical_Value
4 vs. 3
1.0710
0.202
5.302
3.633
4 vs. 1
0.8515
0.202
4.215
3.633
4 vs. 2
0.8191
0.204
4.015
3.633
2 vs. 3
0.2518
0.204
1.234
3.633
4개의 그룹을 두 그룹 씩 등분산 검정을 실시하였다.
귀무가설은 ‘두 그룹의 모분산은 동일하다.’, 대립가설은 ‘두 그룹의 모분산은 동일하지 않다.’ 이며, 검정 결과 q 통계량이 임계값 3.633 보다 클 경우 귀무가설을 기각한다.
검정 결과 그룹 4는 나머지 그룹 전체와 서로 모분산이 동일하지 않으며, 그 외 그룹들의 모분산들은 서로 동일함을 알 수 있다.
/*정규성 검정*/odsgraphicsoff;odsexcludeall;odsnoresults;PROCUNIVARIATEDATA=ex.ex11_1normal;CLASSpond;VARstrontium;odsoutputTestsForNormality=TestsForNormality;RUN;odsgraphicson;odsexcludenone;odsresults;PROCSORTDATA=TestsForNormality;BYdescendingTest;RUN;title'ex11_1: 정규성 가정';PROCPRINTDATA=TestsForNormalitylabel;RUN;title;/*등분산성 검정*/odsgraphicsoff;odsexcludeall;odsnoresults;PROCGLMDATA=ex.ex11_1;CLASSpond;MODELstrontium=pond/p;MEANSpond/HOVTEST=BARTLETT;odsoutputBartlett=Bartlett;RUN;odsgraphicson;odsexcludenone;odsresults;title"ex11_1 : 등분산 가정 (Bartlett's Test for Homogeneity of weight Variance)";PROCPRINTDATA=Bartlettlabel;RUN;title;/*독립성 검정*/odsgraphicsoff;odsexcludeall;odsnoresults;PROCGLMDATA=ex.ex11_1;CLASSpond;MODELstrontium=pond/p;outputout=out_dsr=resid_var;RUN;DATAout_ds;SETout_ds;time=_n_;odsgraphicson;odsexcludenone;odsresults;PROCGPLOTDATA=out_ds;PLOTresid_var*time;RUN;odsgraphicsoff;odsexcludeall;odsnoresults;PROCANOVAdata=ex.ex11_1;CLASSpond;MODELstrontium=pond;MEANSpond/tukeyCLdifflines;odsoutputCLDiffs=CLDiffs_tukeyOverallANOVA=OverallANOVA_1;RUN;odsgraphicson;odsexcludenone;odsresults;PROCPRINTDATA=OverallANOVA_1label;RUN;/*다중비교*/title"ex11.1 : Tukey 다중비교";PROCSORTDATA=CLDiffs_tukey;BYComparison;RUN;PROCPRINTDATA=CLDiffs_tukeylabel;VARComparisonDifferenceLowerCLUpperCLSignificance;RUN;
SAS 출력
ex11_1: 정규성 가정
OBS
VarName
pond
적합도 검정
적합도 통계량에 대한 레이블
적합도 통계량 값
p-값 레이블
p-값의 부호
p-값
1
strontium
1
Shapiro-Wilk
W
0.956742
Pr < W
0.7943
2
strontium
2
Shapiro-Wilk
W
0.96163
Pr < W
0.8322
3
strontium
3
Shapiro-Wilk
W
0.97181
Pr < W
0.9044
4
strontium
4
Shapiro-Wilk
W
0.978396
Pr < W
0.9433
5
strontium
5
Shapiro-Wilk
W
0.989372
Pr < W
0.9876
6
strontium
1
Kolmogorov-Smirnov
D
0.157345
Pr > D
>
0.1500
7
strontium
2
Kolmogorov-Smirnov
D
0.155105
Pr > D
>
0.1500
8
strontium
3
Kolmogorov-Smirnov
D
0.216182
Pr > D
>
0.1500
9
strontium
4
Kolmogorov-Smirnov
D
0.177547
Pr > D
>
0.1500
10
strontium
5
Kolmogorov-Smirnov
D
0.141423
Pr > D
>
0.1500
11
strontium
1
Cramer-von Mises
W-Sq
0.02661
Pr > W-Sq
>
0.2500
12
strontium
2
Cramer-von Mises
W-Sq
0.0229
Pr > W-Sq
>
0.2500
13
strontium
3
Cramer-von Mises
W-Sq
0.032535
Pr > W-Sq
>
0.2500
14
strontium
4
Cramer-von Mises
W-Sq
0.025067
Pr > W-Sq
>
0.2500
15
strontium
5
Cramer-von Mises
W-Sq
0.0209
Pr > W-Sq
>
0.2500
16
strontium
1
Anderson-Darling
A-Sq
0.186217
Pr > A-Sq
>
0.2500
17
strontium
2
Anderson-Darling
A-Sq
0.171671
Pr > A-Sq
>
0.2500
18
strontium
3
Anderson-Darling
A-Sq
0.195251
Pr > A-Sq
>
0.2500
19
strontium
4
Anderson-Darling
A-Sq
0.16375
Pr > A-Sq
>
0.2500
20
strontium
5
Anderson-Darling
A-Sq
0.145265
Pr > A-Sq
>
0.2500
ex11_1 : 등분산 가정 (Bartlett's Test for Homogeneity of weight Variance)
odsgraphicsoff;odsexcludeall;odsnoresults;PROCGLMDATA=ex.ex11_1;CLASSpond;MODELstrontium=pond/clparm;MEANSpond/tukeyCLdiff;estimate'pond=1 vs pond=2,3,4'pond-31110/divisor=3;estimate'pond=5 vs pond=2,3,4'pond0-1-1-13/divisor=3;odsoutputEstimates=Estimates;RUN;QUIT;odsgraphicson;odsexcludenone;odsresults;TITLECIforthePopulationMeansfromex11_1;PROCPRINTDATA=Estimates;RUN;
%macroKW_MC(source=,groups=,obsname=,gpname=,sig=);** PERFORM THE STANDARD KRUSKAL WALLIS TEST;PROCnpar1waydata=&sourcewilcoxon;outputout=KW_MC_TMP5;CLASS&gpname;VAR&OBSNAME;RUN;* Rank the input data froum the source file;procsortdata=&source;by&gpname;run;procrankdata=&sourceout=KW_MC_TMP6ties=mean;var&OBSNAME;ranksobsrank;run;* Determin if there are tied ranks;procfreqdata=KW_MC_TMP6order=freq;tablesobsrank/out=KW_MC_TMP7;run;* Create macro variable named &ISTIES;data_null_;if_N_=1thensetKW_MC_TMP7;maxtied=count;IFMAXTIEDgt1thenTIED=1;ELSETIED=0;CALLSYMPUT('ISTIES',TIED);run;* calculate SUMT as per Zar formula 10.42;procfreqnoprintdata=KW_MC_TMP6;tableobsrank/out=KW_MC_TMP4sparse;run;dataKW_MC_TMP4;setKW_MC_TMP4;retaint;t=sum(t,(count**3-count));keept;run;dataKW_MC_TMP4;setKW_MC_TMP4end=eof;N=1;if(eof)thenoutput;run;*calculate and output the ranksums;procmeansnoprintsumnmeandata=KW_MC_TMP6;outputout=rankmeansn=nsum=ranksummean=rankmean;varobsrank;by&gpname;run;procsortdata=rankmeans;byrankmean;run;datarankmeans;setrankmeans;labelgp="Rank for Variable &OBSNAME";run;dataKW_MC_TMP5;setKW_MC_TMP5;_label_="Rank for Variable &OBSNAME";keep_label_p_kw;run;proctransposedata=rankmeansout=KW_MC_TMP5prefix=MEAN;varrankmean;run;dataKW_MC_TMP5;setKW_MC_TMP5;N=_N_;keepnmean1-mean&groups;run;proctransposedata=rankmeansout=KW_MC_TMP6prefix=n;varn;run;dataKW_MC_TMP6;setKW_MC_TMP6;N=_N_;keepnn1-n&groups;run;proctransposedata=rankmeansout=KW_MC_TMP7prefix=gp;var&gpname;run;dataKW_MC_TMP7;setKW_MC_TMP7;N=_N_;keepngp1-gp&groups;run;datatransposed;mergeKW_MC_TMP5KW_MC_TMP6KW_MC_TMP7KW_MC_TMP4;byn;run;datatmp4;settransposed;formatmsg$53.;sumt=t;iputrejectmessage=0;msg="Do not test";*set length of variable;msg="Reject";arrayranksum(*)mean1-mean&groups;arrayna(*)n1-n&groups;arraylab(*)gp1-gp&groups;arrayq05(20);arraypair(20,20);* Check to see if all group ns are equal;notequal=0;doi=1to&groups;ifna(1)nena(i)thennotequal=1;end;* if there are ties, use the notequal version of the comparisons;tmp=&isties;iftmpeq1thennotequal=1;ALPHA=&sig;Q05(1)=.;doK=2to20;PQ=1-ALPHA/(K*(k-1));Q05(K)=PROBIT(PQ);end;xx=Q05(3);ifnotequal=0thendoK=2to20;PQ=1-ALPHA;q05(k)=probmc("RANGE",.,PQ,64000,k);end;nsum=0;doi=1to&groups;nsum=nsum+na(i);end;icompare=&groups;qcrit=q05(icompare);* print out the multiple comparison results table;fileprint;gp="&gpname";ifnotequal=1thendo;put@5"Group sample sizes not equal, or some ranks tied. Performed Dunn's test, alpha="alpha;end;elsedo;put@5"Group sample sizes are equal. Performed Nemenyi test, alpha="alpha;end;put' ';put@5'Comparison group = '"&gpname";put' ';put' Compare Diff SE q q('"&sig"') Conclude';put'------------------------------------------------------------';iskiprest=0;* as the table is constructed, determine the correct Conclude message;doi=icompareto1by-1;doj=1toi;pair(i,j)=0;*0=not yet tested, 1=reject 2=accept 3= skip;end;end;doi=icompareto1by-1;doj=1toi;ifinejthendo;ifnotequal=0thendo;* Zar formula 11.22;rs1=ranksum(i)*na(1);rs2=ranksum(j)*na(1);se=round(sqrt((na(1)*(na(1)*&groups)*(na(1)*&groups+1))/12),.01);end;elsedo;rs1=ranksum(i);rs2=ranksum(j);* Zar formula 11.28;setmp=(((nsum*(nsum+1))/12)-(SUMT/(12*(nsum-1))))*((1/na(i))+(1/na(j)));se=round(sqrt(setmp),.01);end;diff=round(rs1-rs2,.01);q=round((rs1-rs2)/se,.01);ifpair(i,j)ne3thendo;if(qgtqcrit)and(pair(i,j)ne2)thenpair(i,j)=1;* REJECT;ifqleqcritthendo;pair(i,j)=2;if(i-j)ge2thendo;dok=jto(i-1);dol=(k+1)toi;ificomparene1thendo;if(pair(l,k)ne2)and(lnek)thenpair(l,k)=3;* set to not test;end;end;end;end;end;end;ifpair(i,j)=1thenmsg='Reject';ifpair(i,j)=2thenmsg='Do not reject';ifpair(i,j)=3thenmsg='Do not reject
(within non-sig. comparison)';ifpair(i,j)=3theniputrejectmessage=1;ifpair(i,j)le2thenput@5lab(i)'vs'@11lab(j)@15diff@25se@35q@45qcrit6.3@55msg;ifpair(i,j)=3thenput@5lab(i)'vs'@11lab(j)@15msg;end;end;end;ifiputrejectmessage=1thendo;put' ';put' Note: "Do not reject (within non-sig.comparison)" indicates that any comparison';put' within the range of a non-significant comparison must also be non-significant.';end;put' ';put' Reference: Biostatistical Analysis, 4th Edition, J. Zar, 2010.';run;%mendKW_MC;%KW_MC(source=ex.ex10_10,groups=3,obsname=abundance,gpname=layer,sig=0.05);
SAS 출력
ex11.5 & 11.6: SCHEFFE
The NPAR1WAY Procedure
Wilcoxon Scores (Rank Sums) for Variable abundance Classified by Variable layer
layer
N
Sum of Scores
Expected Under H0
Std Dev Under H0
Mean Score
1
5
64.0
40.0
8.164966
12.80
2
5
30.0
40.0
8.164966
6.00
3
5
26.0
40.0
8.164966
5.20
Kruskal-Wallis Test
Chi-Square
DF
Pr > ChiSq
8.7200
2
0.0128
ex11.5 & 11.6: SCHEFFE
FREQ 프로시저
변수 abundance의 순위
obsrank
빈도
백분율
누적 빈도
누적 백분율
1
1
6.67
1
6.67
2
1
6.67
2
13.33
3
1
6.67
3
20.00
4
1
6.67
4
26.67
5
1
6.67
5
33.33
6
1
6.67
6
40.00
7
1
6.67
7
46.67
8
1
6.67
8
53.33
9
1
6.67
9
60.00
10
1
6.67
10
66.67
11
1
6.67
11
73.33
12
1
6.67
12
80.00
13
1
6.67
13
86.67
14
1
6.67
14
93.33
15
1
6.67
15
100.00
ex11.5 & 11.6: SCHEFFE
Group sample sizes are equal. Performed Nemenyi test, alpha=0.05
Comparison group = layer
Compare Diff SE q q(0.05) Conclude
------------------------------------------------------------
1 vs 3 38 10 3.8 3.315 Reject
1 vs 2 34 10 3.4 3.315 Reject
2 vs 3 4 10 0.4 3.315 Do not reject
Reference: Biostatistical Analysis, 4th Edition, J. Zar, 2010.
EXAMPLE 11.8
PROCIMPORTDBMS=excelDATAFILE="C:\Biostat\data_chap10"OUT=ex.ex10_11REPLACE;RANGE="exam10_11$";RUN;%LETNUMGROUPS=4;%LETDATANAME=ex.ex10_11;%LETOBSVAR=ph;%LETGROUP=pond;%LETALPHA=0.05;* OPTINALLY DEFINE A TITLE;TITLE'Kruskal-Wallis Tied Ranks';RUN;ODSHTML;%KW_MC(source=&DATANAME,groups=&NUMGROUPS,obsname=&OBSVAR,gpname=&GROUP,sig=&alpha);ODSHTMLCLOSE;
SAS 출력
Kruskal-Wallis Tied Ranks
The NPAR1WAY Procedure
Wilcoxon Scores (Rank Sums) for Variable ph Classified by Variable pond
pond
N
Sum of Scores
Expected Under H0
Std Dev Under H0
Mean Score
Average scores were used for ties.
1
8
55.00
128.0
22.088386
6.875000
2
8
132.50
128.0
22.088386
16.562500
3
7
145.00
112.0
21.106183
20.714286
4
8
163.50
128.0
22.088386
20.437500
Kruskal-Wallis Test
Chi-Square
DF
Pr > ChiSq
11.9435
3
0.0076
Kruskal-Wallis Tied Ranks
FREQ 프로시저
변수 ph의 순위
obsrank
빈도
백분율
누적 빈도
누적 백분율
13.5
4
12.90
4
12.90
6
3
9.68
7
22.58
10
3
9.68
10
32.26
20
3
9.68
13
41.94
26
3
9.68
16
51.61
3.5
2
6.45
18
58.06
23.5
2
6.45
20
64.52
1
1
3.23
21
67.74
2
1
3.23
22
70.97
8
1
3.23
23
74.19
16
1
3.23
24
77.42
17
1
3.23
25
80.65
18
1
3.23
26
83.87
22
1
3.23
27
87.10
28
1
3.23
28
90.32
29
1
3.23
29
93.55
30
1
3.23
30
96.77
31
1
3.23
31
100.00
Kruskal-Wallis Tied Ranks
Group sample sizes not equal, or some ranks tied. Performed Dunn's test, alpha=0.05
Comparison group = pond
Compare Diff SE q q(0.05) Conclude
------------------------------------------------------------
3 vs 1 13.84 4.69 2.95 2.638 Reject
3 vs 2 4.15 4.69 0.89 2.638 Do not reject
3 vs 4 Do not reject(within non-sig. comparison)
4 vs 1 13.56 4.53 2.99 2.638 Reject
4 vs 2 Do not reject(within non-sig. comparison)
2 vs 1 9.69 4.53 2.14 2.638 Do not reject
Note: "Do not reject (within non-sig.comparison)" indicates that any comparison
within the range of a non-significant comparison must also be non-significant.
Reference: Biostatistical Analysis, 4th Edition, J. Zar, 2010.
EXAMPLE 11.9
%macroex11_9(a,b);prociml;f={12,11,12,11};n=4/(1/12+1/11+1/12+1/11);se=sqrt((11.48*sum(f))/(4*(sum(f)-1)));q=(&b-&a)/se;printq;run;quit;%mend;TITLE'3 vs 2';%ex11_9(2,9);TITLE'3 vs 1';%ex11_9(3,9);TITLE'4 vs 2';%ex11_9(2,6);
SAS 출력
3 vs 2
q
4.0868099
3 vs 1
q
3.5029799
4 vs 2
q
2.3353199
EXAMPLE 11.10
%macroex11_10(n1_a,n2_a,s1_a,s2_a);prociml;se=sqrt(1/(&n1_a-1)+1/(&n2_a-1));q=(log(&s1_a)-log(&s2_a))/se;printseq;run;quit;%mend;TITLE'4 vs 3';%ex11_10(50,50,6.42,2.20);TITLE'4 vs 1';%ex11_10(50,50,6.42,2.74);TITLE'4 vs 2';%ex11_10(50,48,6.42,2.83);TITLE'2 vs 3';%ex11_10(48,50,2.83,2.20);
SAS 출력
4 vs 3
se
q
0.2020305
5.3009853
4 vs 1
se
q
0.2020305
4.214513
4 vs 2
se
q
0.2041685
4.012086
2 vs 3
se
q
0.2041685
1.2333901
교재: Biostatistical Analysis (5th Edition) by Jerrold H. Zar