R introduction

#Contents:

#General Commands [ST1]
#Make Vector [ST2]
#Matrix Algebra [ST3]
#Creating Random Variables [ST4]
#Converting variable types [ST5]
#Getting information from variables [ST6]
#Calculating with variables [ST7]
#Missing Values [ST8]
#Dataframe Commands [ST9]
#Graphical Options [ST10]
#Descriptive & Exploratory Analyses [ST11]
#Statistical Analyses [ST12]
#Assumptions [ST13]
#MANOVA / Repeated Measures (Probably not needed) [ST14]

##############################

#General Commands [ST1]
getwd() #Shows current working directory
?keyword #Get help for keyword, a specific topic
??keyword #More general search, looking for anything keyword is found in
setwd("X:/R") #Set working directory
data() #Check available datasets
library() #Check library
library(XXX) #Load package XXX
install.packages("xxx") #Installs package xxx
ls() #List of all variables currently loaded in memory
rm(list=ls(all=TRUE)) #Removes all variables from memory
set.seed(number) #Forces random commands into a specific pattern, so random data can be recreated
capture.output(summary(mydata), file="output2.txt") #Saves output which you would normally get from a command (summary(mydata)) to a text file (output2.txt)
X=edit(mydata) #Shows a new screen in which a variable or matrix (mydata) can be edited. This will then be saved to X

worms = read.spss(file="X:/R/Worms.sav", to.data.frame=TRUE) #Loads SPSS file SAV from file=location & places it in a data frame (TRUE), saves it as "worms"
worms = read.spss(file="X:/R/Worms.por", to.data.frame=TRUE) #Loads SPSS file POR from file=location & places it in a data frame (TRUE), saves it as "worms"
data = read.fwf("filename.dat", width=(rep(width,number of variables))) #Loads DAT file (filename.data) with a certain width per variable and number of variables
data = read.table("Carcasse.dat", header=TRUE, sep="") #Loads DAT file (Carcasse.dat) WITH headers (=TRUE) and seperated by spaces ("")
data = read.table("filename.csv", header=TRUE, sep=",") #Loads CSV file (filename.csv) WITH headers (=TRUE) and seperated by commas (sep=",")
data = read.table(file="pollute.txt", header=FALSE, sep="\t") #Loads TXT file (pollute.txt) from working directory, WITHOUT headers (=FALSE), seperated by tabs (sep="\t")

save(nameofdataframe, file="mydata.Rdata") #Saves current data frame to mydata, to a file called "mydata.Rdata".
load("mydata.Rdata") #Loads data frame "mydata.Rdata"
attach(mydata) #Places it into memory so it may be manipulated
detach(mydata) #Removes it from memory to prevent overlap with other data which may have similar variable names. Restaring R works too.

##############################

#Make Vector [ST2]
a = c(1, 2, 5.3, 6, -2,4) #numerical vector
b = c("one", "two", "three") #factor/character vector
c = c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE) #binary/logical vector
IQ[2]=100 #Sets value 2 in vector IQ as 100
x1 = seq(0, 25, 10) #a vector of numbers with (min, max, size of steps), so this will give 0 10 20
x2 = 1:5 #x2 is vector of 5 values of 1 through 5

##############################

#Matrix Algebra [ST3]
A=matrix(c(1, 2, 3, 4), nrow=2, ncol=2, byrow=TRUE) #gives a matrix with the numbers 1234, in 2 rows and 2 columns, byrow sets the 2 to the right of the 1 instead of below it
A2=matrix(c(1, 2, 3, 4), nrow=2, ncol=2, byrow=FALSE) #now 2 will be below 1, and 3 will be on the top right
A*A2 #multiplies each individual element by the amount of the element fromt he other matrix on the same position
A%*%A2 #multiplies matrices "properly" like row1*column1 instead of per element
dim(A) #gives number of rows and columns of a matrix (doesn't work on vectors)
t(A) #transposes matrix
det(A) #gives determinant of matrix
solve(A) #gives inverse of matrix
eigen(A) #gives eigenvector of matrix

##############################

#Creating Random Variables [ST4]
rnorm(100, 0, 1) #NORMAL - Creates a vector of 100 values with mean = 0 & sd
rbinom(100, 50, 0.4) #BINOMIAL - Creates a vector of 100 values with 50 trials (n) and a 0.4 chance (p) to succeed
rmultinom(10,4,c(0.1, 0.6, 0.3)) #MULTINOMIAL - Creates a vector of 10 values with 4 trials (n) and a chance of 0.1 to be in the first group, 0.6 in second and 0.3 in third group.
runif(10, 0, 1) #UNIFORMAL - Creates a vector of 10 (n) random numbers between 0 (min) and 1 (max)
mvrnorm(10,mu,sigma) #MULTIVARIATENORMAL - Creates a matrix for 10 (n) people and x variables. For all x variable their means (mu), variances and covariances (sigma) should be defined first.

##############################

#Converting variable types [ST5]
as.matrix(A) #forces A to be a matrix (instead of a vector for instance)
as.factor(B) #forces B to be a factor (instead of a numerical variable for instance)
as.numeric(C) #forces C to be a numeric variable (instead of a factor for instance)
factor(X, labels=c("low", "middle", "high")) #forces X to be a factor with given labels instead of numeric variable

##############################

#Getting information from variables [ST6]
mean(IQ) #mean IQ
mean(mis_IQ, na.rm=TRUE) #mean IQ, removes all missing cases from analysis
median(IQ) #median IQ
median(mis_IQ, na.rm=TRUE) #median IQ, removes all missing cases from analysis
var(IQ) #variance IQ
var(mis_IQ, na.rm=TRUE) #gives Variance of IQ, removes all missing cases from analysis
sd(mis_IQ, na.rm=TRUE) #gives SD of IQ, removes all missing cases from analysis
mad(mis_IQ, na.rm=TRUE) #gives MAD of IQ (MAD = Median Absolute Deviation, calculated by first calculating the deviance from each score to the median of a scale, then take the absolute values of all deviations and then calculate the median from all those deviation scores.
IQR(mis_IQ, na.rm=TRUE) #gives IQR of IQ (distance score at Q1 to score at Q3)

length(IQ) #number of values in vector
sum(IQ) #sumscore IQ
prod(IQ) #productscore IQ
cumsum(IQ) #every IQ result gets added to the sum of all the previous results
cumprod(IQ) #every IQ result gets multiplied to the product of all the previous results
sort(IQ) #sort the vector from small to large
getOutliers(mis_arith1) #shows any outlier for a particular variable, BASED ON SD'S FROM MEAN, so be careful with non-normal data

IQ[2] #shows value 2 in the vector IQ
IQ[c(1,3)] #shows value 1 & 3 in the vector IQ
IQ[1:3] #shows value 1 through 3 in the vector IQ
IQ[2]==100 #Shows whether value 2 is equal to 100 or not
which(IQ==100) #Shows whether each value of IQ is equal to 100 or not

##############################

#Calculating with variables [ST7]
IQp5 = IQ+5 #plus
IQm10 = IQ-10 #minus
IQ10 = IQ*10 #multiplication
IQd10 = IQ/10 #division
IQsq = IQ^2 #square
IQsqrt = sqrt(IQ) #squareroot
logIQ = log(IQ) #LN of IQ (NOT LOG)
eIQ = exp(IQ) #e to the power of IQ

##############################

#Missing Values [ST8]
A[seq(10, 150, 10)]=NA #forces each 10th value of matrix A to be NA (10, 20, 30, etc.)
is.na(A[seq(10, 150, 10)]) #is.na checks if a value in variable A (variable) to be missing (NA)
mis_IQ[which(IQ<85)]=NA #any IQ below 85 is now missing instead

na.omit(B) #shows data frame B, but entirely leaves out any case which has 1 or more missing values
na.fail(cool) #shows data frame "cool" if there are no missing values, otherwise it will create an error that there is at least 1 missing value

##############################

#Dataframe Commands [ST9]
data.frame(X, Y) #Places variables X & Y in a data.frame which can be saved to an RData file
save(nameofdataframe, file="mydata.Rdata") #Saves current data frame to mydata, to a file called "mydata.Rdata".
load("mydata.Rdata") #Loads data frame "mydata.Rdata"
attach(mydata) #Places it into memory so it may be manipulated
detach(mydata) #Removes it from memory to prevent overlap with other data which may have similar variable names. Restaring R works too.
misdata$nicecolumn #Shows column "nicecolumn" of data frame "misdata"
misdata$nicecolumn=IQ #Adds column "nicecolumn" to misdata, using the values from IQ
est_par=summary(multip)$coefficients[,1] #Saves first column of the coefficients part of the summary of a model to "est_par"

fivenum(mis_arith1) #min Q1 med Q2 max
summary(mydata) #Shows min/Q1/med/mean/Q3/max of continuous variables & frequencies of factors
names(mydata) #List the variables in mydata.
str(mydata) #List the structure of mydata (names of each variable, number of rows/columns).
levels(mydata$educf) #List the levels of factor educf in mydata.
dim(mydata) #Gives the dimensions of mydata (i.e., the number of observations and the number of variables).
class(object) #Gives the class (or type)) of an object (e.g., numeric, matrix, data frame).
mydata #Show mydata (i.e., gives the complete data set).
head(mydata, n = 10) #Show the first 10 rows of mydata.
tail(mydata, n = 5) #Show last 5 rows of mydata.

##############################

#Graphical Options [ST10]
plot(x,y) #plots a scatterplot with variable x on x-axis and variable y on y-axis
abline(mean(y), 0) #creates a line with intercept=mean(y) & slope=0)
abline(lm(y ~ x)) #creates a simple regression line (lm) between variable y & x
abline(regressionmodel) #regressionmodel being the result of a saved lm() command, draws regression line in plot, like command above
abline(h=0) #creates a horizontal line at height 0 of y
abline(v=5) #creates a vertical line at height 5 of x

pmatrix=data.frame(mis_IQ, mis_arith0, mis_arith1); pairs(pmatrix) #Correlation matrix between all 3 variables
pairs(dat_reg, gap = 2, cex.labels=1.5) #data, gap = border between plots, cex = size of text (improve, mis_IQ, etc.)

hist(Wind, col="red", border="purple", density=10) #creates histogram from variable Wind, barcolor=red, bordercoler=purple, line density in bars = 10 (higher is more lines)

hist(compdata$mis_IQ, col="lavenderblush2", main="IQ for compdata", xlab="IQ", prob=TRUE) #Density Histogram (variable, color, title, x-axisname, density=TRUE)
curve(dnorm(x, mean(compdata$mis_IQ), sd(compdata$mis_IQ)), add=TRUE) #Normal Line through histogram (BE SURE TO SET PROB=TRUE IN HISTOGRAM OR THIS WON'T WORK), Change "compdata$mis_IQ" to your variable, keep the rest.
#CURVE DNORM DOES NOT WORK WITH MISSING VALUES SO MAKE IT "mean(na.omit(compdata$mis_IQ))" IF NECESSARY!

boxplot(Pollution, col="rosybrown3", border="violetred2", horizontal=TRUE, outline=FALSE) #creates a horizontal (=TRUE) boxplot from variable Pollution, boxcolor=rosybrown3, bordercolor=violtedred2, without outliers (=FALSE)
x=rnorm(50,0,2); x2=rnorm(50,2.5,2); boxplot(x, x2, notch=TRUE) #because notches don't overlap, box 2 seems significantly different from box 1
boxplot(mis_arith0 ~ method) #Creates a box for each level of method
boxplot(conformity ~ partner.status)$out #adding $out shows outlying values in R Console while making graph

barplot(table(mydata$educf, mydata$methodf), main="Black is low, gray is middle, white is high") #Bar plot of educf & methodf with a title (main)
pie(table(mydata$sexf, mydata$methodf), labels=c("Girl A", "Boy A", "Girl B", "Boy B"), col=c("red", "blue", "green", "yellow")) #Pie diagram with sexf & methodf as factors and corresponding labels + colors
stem(mis_IQ) #Stemplot

#For making two graphs appear next to each other:

par(mfcol=c(1,2))

#The first number of c is the number of rows, the second number is the number of colomns, so this gives two graphs next to each other.

par(mfcol=c(5,10)) #this would give 50 graphs, 5 rows 10 columns.
par(mfcol=c(1,1)) #never forget to reset!

#This is a bit from assignment 5 to get dots for a logistic regression in a plot at specific points in the graph:
plot(mis_IQ, arithbin) # scatter plot
IQgroup <- cut(mis_IQ, seq(85, 125, 5)) # scale subdivision
(tab <- table(IQgroup, arithbin)) # show table
(relfreq <- prop.table(tab,1)[,2]) # proportion of success
points(relfreq ~ seq(87.5,122.5,5), pch = 10, col = "red") #make points in plot

##############################

#Descriptive & Exploratory Analyses [ST11]
describeBy(compdata$mis_IQ, compdata$mis_educf) #Gives all descriptive statistics of IQ for each level of educ
summary(compdata$mis_IQ[compdata$mis_educf=="Low"]) #This works without the package "psych" unlike describeBy
tapply(mis_arith0, methodf, mean, na.rm=TRUE) #Shows mean of arith0 for each seperate level of methodf, ignores missing
tapply(mis_IQ, methodf, var, na.rm=TRUE) #Shows variance of IQ for each seperate level of methodf, ignores missing

table(variable) #Shows table with frequency of each unique value, ignores missing values
table(var1, var2) #Shows 2x2 table with frequency of each unique combination, if one or two values are missing, case is left out.
prop.table(table(sexf, mis_educf),1) #1 gives proportion of row instead of frequencies
prop.table(table(sexf, mis_educf),2) #2 gives proportion of column instead of frequencies
prop.table(table(sexf, mis_educf)) #Nothing gives proportion of total instead of frequencies

cor(mis_arith0, mis_arith1) #Pearson Correlation, only works if no missing values
cor(mis_arith0, mis_arith1, method="spearman") #Spearman Correlation, only works if no missing values
cor(mis_arith0, mis_arith1, use="complete.obs") #Pearson Correlation, also works with missing values

rcorr(cbind(compdata$mis_IQ, compdata$mis_arith0, compdata$mis_arith1)) #Needs Hmisc, also a correlation matrix

##############################

#Statistical Analyses [ST12]
t.test(mis_arith0, mu=125) #one-sample
t.test(mis_arith0 ~ method) #independent two-sample t-test
t.test(mis_arith0 ~ method, var.equal=T) #pooled
t.test(mis_arith0 ~ method, alternative="less") #left-sided
t.test(mis_arith0 ~ method, alternative="greater") #right-sided
t.test(mis_arith1, mis_arith0, paired=TRUE) #paired

chisq.test(sex, method) #default chi-square test, asymptotic in SPSS
chisq.test(sex, method, correct=FALSE) #no continuity correction
chisq.test(sex, method, simulate.p.value=TRUE) #more like exact test, it's monte carlo testing

anova(lm(improve ~ mis_educf)) #one-way ANOVA, type I SS, so order matters
aov(improve ~ mis_educf) #one-way ANOVA, don't forget to get summary() from this afterwards, type I SS, so order matters
anova(lm(improve ~ mis_educf + methodf)) #two-way ANOVA, type I SS, so order matters
aov(improve ~ mis_educf + methodf) #two-way ANOVA, don't forget to get summary() from this afterwards, type I SS, so order matters
drop1(aovresult, test="F") #after doing an ANOVA with the aov-command, saving it to "aovresult", this gives type III sum of squares, so order does not matter anymore
TukeyHSD(aovresult) #after doing an ANOVA with the aov-command, saving it to "aovresult", Post-hoc over everything
TukeyHSD(aovresult, "mis_educf") #after doing an ANOVA with the aov-command, saving it to "aovresult", Post-hoc over just mis_educf
plot(TukeyHSD(anova2a, "mis_educf")) #Because this command is specifically for Tukey, I'm putting it here. It's a plot for looking at the CI's from the Tukey results.

#Contrast is probably not necessary to know, but better safe than sorry. Here is an example how to get them:
c=cbind ( c(-1, 0, 1), c(5, -9, 4) )
contrasts(mis_educf)=c
summary.lm(aov(improve ~ mis_educf + methodf))

lm(improve ~ mis_IQ) #Simple Regression, IQ as Independent Variable predicting improve as Dependent Variable
lm(improve ~ mis_IQ + sexf + mis_educf) #Multiple Regression, Improve as DV, IQ & Sex & Educ as IVs
lm(improve ~ mis_IQ + sexf + mis_IQ*sexf) #Multiple Regression with interaction effect
lm(improve ~ mis_IQ, na.action = na.exclude) #na.action is set to na.omit as default, which removes all missing values. na.exclude keeps missing values as "NA". This does not change results, but it does change:
glm(arithbin ~ mis_IQ, binomial) #Logistic Regression, Binomial, arithbin = dependent, IQ = independent
glm(arithbin ~ mis_IQ, binomial, na.action=na.exclude) #Same as above, but again it keeps missing values as "NA" instead of throwing it out.
drop1(logmodel, test = "Chisq") #after doing logistic regression, shows significance of all variables through model deviance of logistic regression result "logmodel"

residuals(model) #Shows residual for each case in model (if model was made with na.exclude, NA is left in, if it was na.omit, cases with NA are removed)
predict(model) #Shows the result of filling in the model for each individual
predict(model, type = "response") #Predicted value for each case (= equal to fitting for linear regression, not for logistic)
fitted(model) #Same as "predict(x, type = "response")

##############################

#Assumptions [ST13]

#Normality:
qqnorm(mis_arith0); qqline(mis_arith0) #Normality QQ Plot
qqmath(~improve | mis_educf) #Seperate QQ-plot for each level of educf
skew(mis_arith0) #Gives skewness of arith0
kurtosi(mis_arith0) #Gives kurtosis of arith0 (no I don't know why the s misses at the end of kurtosi)
shapiro.test(improve) #Shapiro-Wilk test for normality of a variable

#Homoscedasticity:
leveneTest(mis_arith0, mis_educf) #levene test for educf, arith0 being dependent variable
leveneTest(mis_arith0, interaction(mis_educf, methodf)) #does levene's test over all groups at once (probably not asked in exam)
var.test(A, B) #Compares variance of two scales, IS INFERIOR TO LEVENE as Levene's test is more robust

#Residual plot:
plot(fitted(model), resid(model))
abline(h=0)

##############################

#MANOVA / Repeated Measures (Probably not needed) [ST14]

### MANOVA & REPEATED MEASURES
#manova() does MANOVA, don't forget cbind() for combining multiple
#dependent variables.

### Repeated Measures:
install.packages("nlme")
library(nlme)

Zoeken in deze blog

R introduction

Reacties

Een reactie posten

Populaire posts van deze blog

Script 3

Script 5

Script 1