# Script 3

#Exercise
# Restore the objects saved in misdata.Rdata. The rest of the script uses an
# object called `misdata` (presumably a data frame stored in that file).
load(file = "misdata.Rdata")
names(x = misdata)

# Remove the `educ` column, then print it to check the removal.
misdata$educ <- NULL
misdata$educ

# Copy of misdata without the columns at positions 2, 3, 4, 7 and 8.
misdata2 <- misdata[, -c(2:4, 7, 8)]
names(x = misdata2)

# Store the reduced data frame on disk and tabulate the sex variable.
save(misdata2, file = "misdata2.Rdata")
table(misdata$mydata.sex)

#Exercise 3.1
#1. Frequency table of the education factor.
table(misdata$mis_educf)
# By default table() leaves missing values out of the counts.

#2. Cross-tables: gender by education, and gender by method.
table(misdata$mydata.sexf, misdata$mis_educf)
table(misdata$mydata.sexf, misdata$mydata.methodf)

#3. Proportions instead of frequencies.
prop.table(table(misdata$mydata.sexf, misdata$mis_educf), margin = 1)
prop.table(table(misdata$mydata.sexf, misdata$mis_educf), margin = 2)
prop.table(table(misdata$mydata.sexf, misdata$mis_educf))
# margin = 1 divides by the row totals (each row sums to 1), margin = 2 by
# the column totals (each column sums to 1); without a margin every cell is
# divided by the grand total of the table.

#4. Bonus: proportion table that also shows the missing values.
prop.table(table(misdata$mydata.sexf, misdata$mis_educf, exclude = NULL))
# With exclude = NULL the NAs are kept as a category of their own, which
# makes the table more informative.

#5. Check whether sex and parental education are equally distributed over methods
#5.1 Sex: counts of sex (rows) by method (columns)
table(misdata$mydata.sexf, misdata$mydata.methodf)
# ... or, alternatively, the same cross-table as proportions of the grand total.
# (The original had a bare `or` here, which R evaluates as an object name and
# which therefore aborted the script.)
prop.table(table(misdata$mydata.sexf, misdata$mydata.methodf))
# Author's reading of the output: for all sexes, more individuals did methodB.
#5.2 Education: counts of education level (rows) by method (columns)
table(misdata$mis_educf, misdata$mydata.methodf)
# Author's reading of the output: B has far more middle education than A.

# Exercise 3.2
#1. Plot the education-by-method cross-table as a bar plot and as a pie chart.
barplot(height = table(misdata$mis_educf, misdata$mydata.methodf))
pie(x = table(misdata$mis_educf, misdata$mydata.methodf))
# Author's verdict: the bar plot is more suitable, the pie chart is a mess.
#2. Search the help system for pie(), then colour the six slices of the
#   sex-by-education pie chart.
??pie
pie(
  x = table(misdata$mydata.sexf, misdata$mis_educf),
  col = c("green", "purple", "pink", "blue", "yellow", "cyan")
)

# Exercise 3.3
#1. Search the help system for the two location measures.
??mean
??median
#2. Search the help system for the two ways of handling missing values.
??na.omit
??na.rm
# Author's understanding: na.omit() drops a complete row as soon as one
# variable in it is missing, whereas na.rm = TRUE only leaves the missing
# values out of the one calculation it is passed to.

#3. Mean and median of the three continuous variables, ignoring NAs.
mean(x = misdata$mis_arith0, na.rm = TRUE)
median(x = misdata$mis_arith0, na.rm = TRUE)
mean(x = misdata$mis_arith1, na.rm = TRUE)
median(x = misdata$mis_arith1, na.rm = TRUE)
mean(x = misdata$mis_IQ, na.rm = TRUE)
median(x = misdata$mis_IQ, na.rm = TRUE)
# Author's observation: mean and median are close to each other for all three
# variables, which makes sense given that the data are simulated rather than
# collected.

# Exercise 3.4
#1. Count how often each distinct IQ value occurs.
freq <- table(misdata$mis_IQ)
#2. Position of the (first) highest count, named after its IQ value; print it.
mod <- which.max(freq)
mod
#3. create function
# modeX: most frequent value (mode) of a vector or factor.
#   x       vector or factor; table() leaves its NAs out of the counts.
#   returns the mode as a character string (the name of the winning table
#           cell); when several values tie, the first one in table order.
modeX <- function(x) {
  names(which.max(table(x)))
}
#4. Mode of the education factor, i.e. its most frequent level.
modeX(x = misdata$mis_educf)

#Exercise 3.5
#1. Search the help system for the four dispersion measures.
??var()
??sd()
??IQR()
??mad()
# All four accept na.rm = TRUE to leave missing data out of the calculation.
#2. Dispersion measures (variance, standard deviation, interquartile range,
#   median absolute deviation) for arith0, arith1 and IQ.
var(x = misdata$mis_arith0, na.rm = TRUE)
sd(x = misdata$mis_arith0, na.rm = TRUE)
IQR(x = misdata$mis_arith0, na.rm = TRUE)
mad(x = misdata$mis_arith0, na.rm = TRUE)

var(x = misdata$mis_arith1, na.rm = TRUE)
sd(x = misdata$mis_arith1, na.rm = TRUE)
IQR(x = misdata$mis_arith1, na.rm = TRUE)
mad(x = misdata$mis_arith1, na.rm = TRUE)

var(x = misdata$mis_IQ, na.rm = TRUE)
sd(x = misdata$mis_IQ, na.rm = TRUE)
IQR(x = misdata$mis_IQ, na.rm = TRUE)
mad(x = misdata$mis_IQ, na.rm = TRUE)
# Author's note: unsure how to interpret these values; they are expected to be
# plausible because the data were generated rather than collected. The
# variance of arith1 nevertheless looks quite high.

#Exercise 3.6
#0 Install the extremevalues package if it is not available yet, then attach it.
# The original call passed the path of an .R file on the author's machine as
# `lib`; `lib` has to be a library directory, and leaving it out installs into
# the default library. Checking first avoids a re-install on every run.
if (!requireNamespace("extremevalues", quietly = TRUE)) {
  install.packages("extremevalues")
}
library(extremevalues)

#1 Call help functions for described functions
??fivenum # na.rm = TRUE removes NAs before the statistics are computed
??getOutliers # NOTE(review): no NA handling is described and the calls below
              # pass no na.rm switch - confirm for the installed version
??outlierPlot # idem

#2. Apply these functions to continuous variables
fivenum(misdata$mis_arith0, na.rm = TRUE)
# getOutliers()/outlierPlot() are given the observed values only, so that the
# fitted limits and the plotted points refer to exactly the same vector.
arith0_observed <- misdata$mis_arith0[!is.na(misdata$mis_arith0)]
L <- getOutliers(arith0_observed)
outlierPlot(arith0_observed, L, mode = "qq", title = "Outliers", fat = TRUE)

# <- getOutliers(misdata$mis_arith0, method="II",  alpha=c(0.05,0.05), FLim=c(0.1,0.9), distribution="normal", returnResiduals=TRUE)
# I don't understand what kind of error I'm getting, or why,
# but it comes back in the next piece of syntax as well.
# OutlierplotX: detect outliers in a numeric vector and draw the QQ outlier plot.
#   x       numeric vector; missing values are dropped before the analysis.
#   returns whatever outlierPlot() returns (the function is called for its plot).
# The original ignored `x` (it always analysed misdata$mis_arith0) and ended
# with outlierPlot(x), which fails because the outlier limits were not passed.
OutlierplotX <- function(x) {
  observed <- x[!is.na(x)]
  limits <- getOutliers(observed, method = "I")
  # Fit and plot use the same NA-free vector so that both stay aligned.
  outlierPlot(observed, limits, mode = "qq")
}
# Run the outlier-plot helper (OutlierplotX, defined earlier in this script)
# on the first arithmetic score.
OutlierplotX(x = misdata$mis_arith0)

#3. Look at graphs of the continuous variables to judge their distribution:
#   histogram, box plot and stem-and-leaf display per variable.
hist(x = misdata$mis_arith0)
boxplot(x = misdata$mis_arith0)
stem(x = misdata$mis_arith0)
# Author's impression: mis_arith0 looks skewed. NOTE(review): the original
# note read "negatively skewed to the right", which mixes two directions
# (negative skew = longer left tail) - re-check against the plots.
hist(x = misdata$mis_arith1)
boxplot(x = misdata$mis_arith1)
stem(x = misdata$mis_arith1)
# Author's impression: mis_arith1 looks normally distributed.
hist(x = misdata$mis_IQ)
boxplot(x = misdata$mis_IQ)
stem(x = misdata$mis_IQ)
# Author's impression: mis_IQ looks slightly skewed. NOTE(review): the
# original note read "positively skewed to the left", which mixes two
# directions (positive skew = longer right tail) - re-check against the plots.

#Exercise 3.7
#1. Calculate correlations between continuous variables
# Selecting the columns by name keeps their names as row/column labels of the
# correlation matrix. use = "complete.obs" drops every row that has an NA in
# any of the three columns before the correlations are computed.
cor(misdata[, c("mis_arith0", "mis_arith1", "mis_IQ")], use = "complete.obs")
#2. Make matrix scatter plot
# The formula names the columns only; `data = misdata` tells pairs() where to
# find them, which also gives the panels clean variable labels.
pairs(~ mis_arith0 + mis_arith1 + mis_IQ, data = misdata,
      main = "Simple Scatterplot Matrix")

##########################################

#1. Exercise 3.1.5 for all data frames
# NOTE(review): `mydata` and `compdata` are not created or loaded in the part
# of the script shown here; they are assumed to exist in the workspace.
# Every table below shows row proportions (margin = 1), i.e. how the members
# of each sex / education level are spread over the methods.
#1.1 mydata: sex by method, education by method
prop.table(table(mydata$sexf, mydata$methodf), margin = 1)
prop.table(table(mydata$educf, mydata$methodf), margin = 1)
#1.2 misdata
prop.table(table(misdata$mydata.sexf, misdata$mydata.methodf), margin = 1)
prop.table(table(misdata$mis_educf, misdata$mydata.methodf), margin = 1)
#1.3 compdata
prop.table(table(compdata$mydata.sexf, compdata$mydata.methodf), margin = 1)
prop.table(table(compdata$mis_educf, compdata$mydata.methodf), margin = 1)
# Author's note: proportion tables were chosen to see whether all values equal
# 1 / (number of factor levels), which often is not the case. Plain frequency
# tables would have worked as well; those are used for the next question.

#2. Exercise 3.2.1 for all data frames
# Stacked bar plots of sex and of education level per method. The legend is
# passed through the full argument name `legend.text` instead of relying on
# partial matching of `legend`.
# NOTE(review): the legend labels are typed in by hand; confirm that their
# order matches the level order of the factors (rows of each table).
#2.1 Barplot for categorical variables in mydata
barplot(
  table(mydata$sexf, mydata$methodf),
  col = c("purple", "green"),
  main = "Sex per method in 'mydata'",
  legend.text = c("girl", "boy")
)
barplot(
  table(mydata$educf, mydata$methodf),
  col = c("yellow", "cyan", "red"),
  main = "Educf per method in 'mydata'",
  legend.text = c("low", "middle", "high")
)
#2.2 Barplot for categorical variables in misdata
barplot(
  table(misdata$mydata.sexf, misdata$mydata.methodf),
  col = c("purple", "green"),
  main = "Sex per method in 'misdata'",
  legend.text = c("girl", "boy")
)
barplot(
  table(misdata$mis_educf, misdata$mydata.methodf),
  col = c("yellow", "cyan", "red"),
  main = "Educf per method in 'misdata'",
  legend.text = c("low", "middle", "high")
)
#2.3 Barplot for categorical variables in compdata
barplot(
  table(compdata$mydata.sexf, compdata$mydata.methodf),
  col = c("purple", "green"),
  main = "Sex per method in 'compdata'",
  legend.text = c("girl", "boy")
)
barplot(
  table(compdata$mis_educf, compdata$mydata.methodf),
  col = c("yellow", "cyan", "red"),
  main = "Educf per method in 'compdata'",
  legend.text = c("low", "middle", "high")
)

#3. Give location and dispersion measures for continious variable of 1 dataset; misdata
#3.1 Location measures (Exercise 3.3.3), computed with NAs left out
#3.1.1 mis_arith0: mean and median
mean(x = misdata$mis_arith0, na.rm = TRUE)
median(x = misdata$mis_arith0, na.rm = TRUE)
# modeX: mode (most frequent value) of a vector or factor.
#   x       vector or factor; NAs are not counted by table().
#   returns the most frequent value as a character string; ties are resolved
#           in favour of the value that comes first in the frequency table.
modeX <- function(x) {
  counts <- table(x)
  winner <- which.max(counts)
  names(winner)
}
modeX(x = misdata$mis_arith0)
#3.1.2 mis_arith1: mean, median and mode
mean(x = misdata$mis_arith1, na.rm = TRUE)
median(x = misdata$mis_arith1, na.rm = TRUE)
modeX(x = misdata$mis_arith1)
#3.1.3 mis_IQ: mean, median and mode
mean(x = misdata$mis_IQ, na.rm = TRUE)
median(x = misdata$mis_IQ, na.rm = TRUE)
modeX(x = misdata$mis_IQ)
# Author's question: for a proper overview these measures would ideally be
# shown in one table - is there an easy way to do that in R? (One option is to
# collect them per variable with c(mean = ..., median = ...) and stack the
# results with rbind().)

#3.2 Dispersion measures (Exercise 3.5.2): variance, standard deviation,
#    interquartile range and median absolute deviation, with NAs left out
#3.2.1 mis_arith0
var(x = misdata$mis_arith0, na.rm = TRUE)
sd(x = misdata$mis_arith0, na.rm = TRUE)
IQR(x = misdata$mis_arith0, na.rm = TRUE)
mad(x = misdata$mis_arith0, na.rm = TRUE)
#3.2.2 mis_arith1
var(x = misdata$mis_arith1, na.rm = TRUE)
sd(x = misdata$mis_arith1, na.rm = TRUE)
IQR(x = misdata$mis_arith1, na.rm = TRUE)
mad(x = misdata$mis_arith1, na.rm = TRUE)
#3.2.3 mis_IQ
var(x = misdata$mis_IQ, na.rm = TRUE)
sd(x = misdata$mis_IQ, na.rm = TRUE)
IQR(x = misdata$mis_IQ, na.rm = TRUE)
mad(x = misdata$mis_IQ, na.rm = TRUE)

#3.3 Side-by-side box plots of the two arithmetic scores
boxplot(
  misdata$mis_arith0,
  misdata$mis_arith1,
  main = "Variance in 'misdata'",
  names = c("arith0", "arith1")
)

#4. Make a graph of histogram of one variable
# Density-scaled histogram of IQ; `probability` is spelled out in full instead
# of relying on partial matching of `prob`.
hist(mydata$IQ, col = "purple", main = "IQ for mydata", xlab = "IQ",
     probability = TRUE)
# Overlay the normal density that has the sample mean and standard deviation
# of IQ. na.rm = TRUE keeps the curve drawable if IQ contains missing values.
curve(
  dnorm(x, mean = mean(mydata$IQ, na.rm = TRUE), sd = sd(mydata$IQ, na.rm = TRUE)),
  add = TRUE
)

#5. Compute the correlation matrix for continuous variables
# Columns are selected by name so that the matrices carry row/column labels.
cor(misdata[, c("mis_arith0", "mis_arith1", "mis_IQ")], use = "complete.obs")
# The author expected the following matrix to be the same, but it is not:
cor(compdata[, c("mis_arith0", "mis_arith1", "mis_IQ")])
# use = "complete.obs" is listwise (casewise) deletion: a row is dropped when
# any of the three columns is NA. Pairwise deletion is the separate option
# use = "pairwise.complete.obs".
# NOTE(review): the two matrices can only agree if compdata holds exactly the
# rows of misdata that are complete on these three columns; presumably
# compdata was built by also dropping rows with NAs in other columns - verify.

#6. Obtain descriptives of a chosen continuous variable for the levels of a chosen factor
# Split IQ into one vector per level of sex. The original passed the logical
# options as the strings "FALSE" and set `sep`/`lex.order`, which only matter
# when splitting on several factors at once; the defaults give the same groups.
IQbysex <- split(compdata$mis_IQ, compdata$mydata.sexf)
IQbysex
# vapply() states the expected result size per group (1 value, or the 5
# numbers of fivenum) and fails loudly if a group returns something else.
vapply(IQbysex, mean, numeric(1))
vapply(IQbysex, median, numeric(1))
vapply(IQbysex, fivenum, numeric(5))
boxplot(IQbysex, col = "purple", main = "IQ by sex")

#7. Compare the distribution of one continuous variable (IQ) across the three
#   constructed datasets with side-by-side box plots.
boxplot(
  mydata$IQ,
  misdata$mis_IQ,
  compdata$mis_IQ,
  col = "purple",
  main = "IQ distributions",
  names = c("mydata", "misdata", "compdata")
)

Reacties

Populaire posts van deze blog

Script 5

Script 1