Script 4

#4.1.1 Load datafile
load("misdata.Rdata")
#4.1.2 Frame null and alternative hypotheses
# H0: mean difference sores (Arith1-Arith0) do not significantly differ for method A and B
# Ha: mean difference score (Arith1-Arith0) do significantly differ for method A and B

boxplot(misdata$mis_arith0 ~ misdata$mydata.methodf)
arith0_A <-misdata$mis_arith0[misdata$mydata.method==0]
arith0_B <-misdata$mis_arith0[misdata$mydata.method==1]
arith0_A; arith0_B

par(mfrow=c(2,1))
hist(arith0_A);hist(arith0_B)
par(mfrow=c(1,1))
hist(arith0_A);hist(arith0_B)

#4.2 Compare method A and B visually on Arith0 and IQ
#4.2.1 Arith 0
par(mfrow=c(2,1))
hist(misdata$mis_arith0[misdata$mydata.method==0]);hist(misdata$mis_arith0[misdata$mydata.method==1])
#method A seems negatively skewed; method B seems more equally distributed but has less low values than A
boxplot(misdata$mis_arith0 ~ misdata$mydata.methodf)
# method A has more variance/outliers than method B
#4.2.2 IQ
par(mfrow=c(1,1))
hist(misdata$mis_IQ[misdata$mydata.method==0]);hist(misdata$mis_IQ[misdata$mydata.method==1])
#method B seems positively skewed with more values on the left; method A has a larger range
boxplot(misdata$mis_IQ ~ misdata$mydata.methodf)
#variance is slightly smaller in method B, has 2 outliers, A has one outlier

#4.3
#4.3.1
??tapply
#4.3.2
tapply(misdata$mis_arith0, misdata$mydata.methodf, mean, na.rm=TRUE)
tapply(misdata$mis_arith0, misdata$mydata.methodf, var, na.rm=TRUE)
#meana are almost exactly the same, although variance is larger in method A (Levene's?)
tapply(misdata$mis_IQ, misdata$mydata.methodf, mean, na.rm=TRUE)
tapply(misdata$mis_IQ, misdata$mydata.methodf, var, na.rm=TRUE)
#means differ but variances are equal, so probably significant difference
#This means that arith0 can probably be used as a covariate, as the scores are similar in both methods
#And that IQ is similar in both methods

??t.test
t.test(misdata$mis_arith0 ~ misdata$mydata.methodf)
t.test(misdata$mis_arith0 ~ misdata$mydata.methodf, var.equal=TRUE)
t.test(misdata$mis_arith1, misdata$mis_arith0, paired=TRUE)
t.test(misdata$mis_arith0, mu=125)

#4.4
#4.4.1 Compare differences between methods at baseline for arith0 and IQ
t.test(misdata$mis_arith0 ~ misdata$mydata.methodf, var.equal=TRUE)
#No significant group differences at baseline arithmetic skills (p=.983, CI contains 0)
t.test(misdata$mis_IQ ~ misdata$mydata.methodf, var.equal=TRUE)
#No significant group differences at baseline IQ (p=.587, CI contains 0)
#4.4.2
#Assignment has been random because data is randomly generated
#4.4.3 Test for systematic differences in children with missing data at baseline
#test for arithmetic scores after 1 year
t.test(misdata$mis_arith1 ~ misdata$ind_arith0)
#There are significant differences, p<.001 and CI doesnt contain 0. Arith1 significantly higher for those with missing values at arith0
t.test(misdata$mis_educ ~ misdata$ind_arith0)
#p> .05 but CI contains 0. Hence, Mean parental education is lower in children with missing values at arith0
t.test(misdata$mis_IQ ~ misdata$ind_arith0)
#p<.001 and CI doesnt contain 0. Hence, children with missing value at Arith0 have significantly higher IQ

# If the cases with missing values at arith0 are deleted pairwise, the conclusions about the effect of method will be

#4.4.4
t.test(misdata$mis_arith1, misdata$mis_arith0, paired=TRUE)
#yes, significant differences between mean arith1 and mean arith0 (p<.001; CI 56.51,61.13)

chisq.test(misdata$mydata.sexf, misdata$mydata.methodf, correct=FALSE)
chisq.test(misdata$mydata.sexf, misdata$mydata.methodf, simulate.p.value = TRUE)

#4.5
#4.5.1 Compare methods to sex and to educ.
chisq.test(misdata$mydata.methodf, misdata$mydata.sexf)
#p=.716, no significant differences in sex for both methods
chisq.test(misdata$mydata.methodf, misdata$mis_educf)
#p=.049, significant differences in parental education for both methods
#4.5.2
#Method assignment was random due to the generation of the dataset
#significant differences in parental education were not expected

#4.6
#4.6.1 test for group differences in difference scores over methods
dif.arith <- misdata$mis_arith1-misdata$mis_arith0
t.test(dif.arith ~ misdata$mydata.methodf)
t.test(misdata$mis_arith1-misdata$mis_arith0 ~ misdata$mydata.methodf)
#4.6.2 conclusions?
#in both cases, p=.15, CI (-7.84;1.26) so no significant differences between method A and B
#however, may be affected by missing values:
rm(dif.arith)

#4.7
#4.7.1 Define improve as differencer score
improve<-misdata$mis_arith1-misdata$mis_arith0; improve
#4.7.2 Plot improvement over parental education, both numerical and factor string
plot(improve ~ misdata$mis_educ)
plot(improve ~ misdata$mis_educf)
#the latter one is a boxplot which is preferred
# primary inspection: variance is not equal over groups, low variance in high educ
# median is higher in low educ
#4.7.3
# H0: mean improvement is the same in low, middle and high parental education
# Ha: mean improvement differs significantly in low, middle and high educ

#4.8 ANOVA
anova(lm(improve ~ mis_educf, data=misdata))
??anova
anova(lm(improve ~ misdata$mis_educf))
#yields similar results

anova1 <- anova(lm(improve ~ mis_educf + mydata.methodf, data=misdata)); anova1
summary(anova1)
anova2<-aov(improve ~ mis_educf + mydata.methodf, data=misdata); anova2
summary(anova2)
#No significant main effect for method or parental education, H0 accepted

anova3<- anova(lm(improve ~ mis_educf * mydata.methodf, data=misdata)); anova3
# also no significant interaction effect (if I did this correctly)

#4.9
#4.9.1
anova2<-aov(improve ~ misdata$mis_educf + misdata$mydata.methodf); anova2
anova2a<-aov(improve ~ mydata.methodf + mis_educf, data=misdata); anova2a
# anova2 first tests for main effect of educf and then for main effect of methodf
# vice versa for anova2a
# It's a hierarchical procedure. The order determines the variance

# 4.9.2
anova1 <- anova(lm(improve ~ mis_educf + mydata.methodf, data=misdata)); anova1
anova1a <- anova(lm(improve ~ mydata.methodf + mis_educf, data=misdata)); anova1a

#4.10 Type III SS
#4.10.1
anova3 <- drop1(anova2, test="F"); anova3
anova3a<-drop1(anova2a, test="F");anova3a
#In this case, there is no order effect for adding the factors, F test is the same
#4.10.2
anova3a<-drop1(anova2a, test="F");anova3a
#in anova2a, p =.148 for methof, while in anova3a, p=.140

#4.11
#4.11.1
??TukeyHSD
TukeyHSD(anova2)
#4.11.2
plot(TukeyHSD(anova2, "misdata$mis_educf"))
#It seems like all confidence intervals contain 0, hence do not differ significantly

#4.12
#4.12.1 Make histograms
improve_0 <-improve[misdata$mis_educ==0];improve_0
improve_1 <-improve[misdata$mis_educ==1];improve_1
improve_2 <-improve[misdata$mis_educ==2];improve_2
par(mfrow=c(3,1))
hist(improve_0); hist(improve_1); hist(improve_2)
#The scores seem normally distributed
#QQ plots
library(lattice)
qqmath(~improve|misdata$mis_educ)
shapiro.test(misdata$mis_arith1)
library(car) #werkt niet
leveneTest(improve)

########################################################

#1. Exercise 4.7
#4.7
#4.7.1 Define improve as differencer score
improve<-misdata$mis_arith1-misdata$mis_arith0; improve
#4.7.2 Plot improvement over parental education, both numerical and factor string
plot(improve ~ misdata$mis_educ)
plot(improve ~ misdata$mis_educf)
#the latter one is a boxplot which is preferred
# primary inspection: variance is not equal over groups, less variance in high educ
# median is higher in low educ
#4.7.3
# H0: mean improvement is the same in low, middle and high parental education
# Ha: mean improvement differs significantly in low, middle and high educ

#2. Exercise 4.8
#4.8 ANOVA
anova(lm(improve ~ mis_educf, data=misdata))
??anova
anova(lm(improve ~ misdata$mis_educf))
#yields similar results
anova1 <- anova(lm(improve ~ mis_educf + mydata.methodf, data=misdata)); anova1
summary(anova1)
anova2<-aov(improve ~ mis_educf + mydata.methodf, data=misdata); anova2
summary(anova2)
#No significant main effect for method or parental education, H0 accepted
anova3<- anova(lm(improve ~ mis_educf * mydata.methodf, data=misdata)); anova3
# if I did this correctly, there's also no significant interaction effect

#3.
library(MASS)
data("shoes", package="MASS")
summary(shoes)
#3a. test whether mean of A equals 9
t.test(shoes$A, mu=9)
#p=.064 so it is not significant, hence the mean is not equal to 9
#3b. test whether variance is equal for A and B
var.test(shoes$A,shoes$B)
# No evidence for Ha, hence the variance is equal in both variables
#3c. test for difference in means between A and B
t.test(shoes$A, shoes$B)
#p=.717, hence Ha is rejected, there are no significant differences in the means of A and B
#3d. test whether mean of B is smaller than mean of A
t.test(shoes$A, shoes$B, alternative="greater")
#this test is not significant p=.642, hence amount of material in B is not smaller

#4a
load("compensation.txt")
#I get an error about magic numbers and aroots
compensation<-read.table("compensation.txt", header=TRUE); compensation
#this seems to work
#4a.
summary(compensation$aRoot)
summary(compensation$Fruit)
summary(compensation$Grazing)
boxplot(compensation$aRoot ~ compensation$Grazing)
#variance seems about equal, large difference in median
aRoot_G<-compensation$aRoot[compensation$Grazing=="Ungrazed"]
aRoot_U<-compensation$aRoot[compensation$Grazing=="Grazed"]
par(mfrow=c(2,1))
hist(aRoot_G); hist(aRoot_U)
#variance of aRoot seems more normally distributed in the Grazed group than in Ungrazed
#approximates normal distribution
#4b
#Im assuming this means 2 separate t-tests for the 2 continuous variables
t.test(compensation$Fruit ~ compensation$Grazing)
# p=.027, CI doesnt contain 0; the mean in Grazed is sign. higher than in Ungrazed
t.test(compensation$aRoot ~ compensation$Grazing)
#p<.001, CI doesnt contain 0; the mean in Grazed is sign. higher than in Ungrazed

#5.
install.packages("car")
library(car)
data("Moore", package="car"); Moore
#5a
#partner.status yes (high) or no (low), continuous score for conformity,
#fscores are translated to 3 categories low medium high (fcategory)

#5.b investigate outliers
library(extremevalues)
getOutliers(Moore$conformity[Moore$partner.status=="low"])
#2 outliers on the right side
getOutliers(Moore$conformity[Moore$partner.status=="high"])
#1 outlier on the right side
getOutliers(Moore$conformity[Moore$fcategory=="low"])
#2 outliers on the right side
getOutliers(Moore$conformity[Moore$fcategory=="medium"])
#0 outliers
getOutliers(Moore$conformity[Moore$fcategory=="high"])
# 2 outliers on the right side
# I am not sure whether I interpreted this correctly
# and I am clueless as to what to with these supposed outliers?

#5c check distribution
par(mfrow=c(1,1))
boxplot(Moore$conformity ~ Moore$fcategory)
#variance is way larger in medium group
par(mfrow=c(3,1))
conf_low<-(Moore$conformity[Moore$fcategory=="low"])
conf_medium<-(Moore$conformity[Moore$fcategory=="medium"])
conf_high<-(Moore$conformity[Moore$fcategory=="high"])
hist(conf_low);hist(conf_medium);hist(conf_high)
#seems positively skewed in low conformity group, somewhat negatively skewed in medium
# and just not normally distributed in high group
shapiro.test(Moore$conformity)
#p >.05, so null hypothesis is rejected, hence data is not normally distributed
install.packages("psych")
library(psych)
tapply(Moore$conformity, Moore$fcategory, skew)
# high and low category are moderately skewed (values between |-1| and |-.5|), medium is ok
tapply(Moore$conformity, Moore$fcategory, kurtosi)
#values are between -2 and +2, thus acceptable

#5d check Levene's
leveneTest(Moore$conformity, Moore$fcategory)
#not significant so variance is homogenous

#5e
anova<-anova(lm(Moore$conformity ~ Moore$fcategory)); anova
#no significant differences in conformity for the fcategory groups
anova2<-anova(lm(Moore$conformity ~ Moore$fcategory + Moore$partner.status)); anova2
#Significant differences in conformity for partner status
anova2a<-aov(Moore$conformity ~ Moore$fcategory + Moore$partner.status); anova2a
TukeyHSD(anova2a)
#Those with partner score significantly higher on conformity than those without

Zoeken in deze blog

R introduction

Script 4

Reacties

Een reactie posten

Populaire posts van deze blog

Script 3

Script 5

Script 1