Biostatistics Tutorial Full course for Beginners to Experts
Module 4 – Describing Data: Variability
Variability
A statistical method to measure and objectively describe the differences that exist within a data set. Together with shape and central tendency, variability describes a distribution.
There are three measures of variability: range, interquartile range, and standard deviation, with standard deviation being the most prominent.
# Attach the PostgreSQL driver package. library() stops right here if the
# package is not installed, whereas require() would only warn and let the
# script fail later at dbDriver().
library("RPostgreSQL")
# Load the PostgreSQL driver, create a connection to the postgres database
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables instead.
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "smokyants", host = "localhost", port = 5432, user = "ruser", password = "ruser")
# Fetch every row of the smokyants table into a data frame.
sql_string <- "SELECT * FROM smokyants"
smokyants <- data.frame(dbGetQuery(con, sql_string))
# Close the connection as soon as the data is in memory.
dbDisconnect(con)
[1] TRUE
Range
The distance between the largest and smallest scores in a distribution.
# One way:
# Wrap the elevation column in a one-column data frame, then ask range() for
# its smallest and largest values while ignoring missing entries.
elevations <- data.frame(smokyants$elevation_m)
range(elevations, na.rm = TRUE)
[1] 379 1828
# Another way:
# Keep only the sites where a given species column is non-zero, then take the
# range of the elevations at those sites.
# Columns 5, 10 and 15 are stigmatomma_pallipes, camponotus_pennsylvanicus and
# crematogaster_minutissima; column 48 is elevation_m.
# multipleelevations1 <- smokyants[c(05,48)]
# multipleelevations2 <- smokyants[c(10,48)]
# multipleelevations3 <- smokyants[c(15,48)]
# subset() has no na.rm argument (rows whose condition is NA are already
# dropped), so none is passed.
multipleelevations1 <- subset(smokyants[c(5, 48)], smokyants[5] != 0)
multipleelevations2 <- subset(smokyants[c(10, 48)], smokyants[10] != 0)
multipleelevations3 <- subset(smokyants[c(15, 48)], smokyants[15] != 0)
# Take the range of the elevation column only. Calling range() on the whole
# two-column frame pools the species column with the elevations, which is how
# a minimum of 1 ended up in the recorded output.
# NOTE(review): the results recorded below predate this fix; re-run to refresh.
range(multipleelevations1$elevation_m)
[1] 1 1530
range(multipleelevations2$elevation_m)
[1] 1 770
range(multipleelevations3$elevation_m)
[1] 1 403
# Another way:
# Flatten the three subsets into a single list of columns; elements 2, 4 and 6
# are the elevation_m columns. Report max - min for each of them.
mutipleelevations <- c(multipleelevations1, multipleelevations2, multipleelevations3)
# Width of the interval covered by `values`, ignoring missing entries.
elevation_spread <- function(values) {
  diff(range(values, na.rm = TRUE))
}
sapply(mutipleelevations[2], elevation_spread)
elevation_m
1151
sapply(mutipleelevations[4], elevation_spread)
elevation_m
51
sapply(mutipleelevations[6], elevation_spread)
elevation_m
0
Interquartile Range
The interquartile range (IQR) of an observation variable is the difference between its upper and lower quartiles. It measures how far the middle half of the data spreads in value.
# One way
# summary() prints the quartiles; IQR() returns their difference directly.
summary(smokyants$elevation_m)
Min. 1st Qu. Median Mean 3rd Qu. Max.
379 625 941 1002 1419 1828
elevation_iqr <- IQR(smokyants$elevation_m)
print(paste("Elevation IQR: ", elevation_iqr))
[1] "Elevation IQR: 794"
# Elevations of the sites where column 5 (stigmatomma_pallipes) is non-zero.
fiveantselevations <- smokyants$elevation_m[which(smokyants[[5]] != 0)]
print(paste("stigmatomma_pallipes elevation IQR: ", IQR(fiveantselevations)))
[1] "stigmatomma_pallipes elevation IQR: 526.5"
# Elevations of the sites where column 10 (camponotus_pennsylvanicus) is non-zero.
teneantselevations <- smokyants$elevation_m[which(smokyants[[10]] != 0)]
print(paste("camponotus_pennsylvanicus elevation IQR: ", IQR(teneantselevations)))
[1] "camponotus_pennsylvanicus elevation IQR: 25.5"
# Elevations of the sites where column 15 (crematogaster_minutissima) is non-zero.
twentyantselevations <- smokyants$elevation_m[which(smokyants[[15]] != 0)]
print(paste("crematogaster_minutissima elevation IQR: ", IQR(twentyantselevations)))
[1] "crematogaster_minutissima elevation IQR: 0"
# Another way
# Flatten the three subsets into one list of columns and compute the IQR of
# every second element, i.e. the three elevation_m columns.
mutipleelevations <- c(multipleelevations1, multipleelevations2, multipleelevations3)
elevation_columns <- mutipleelevations[seq(2, 6, by = 2)]
vapply(elevation_columns, IQR, numeric(1))
elevation_m elevation_m elevation_m
526.5 25.5 0.0
Standard Deviation:
A measure that quantifies the amount of variation or dispersion in a set of data values.
# install.packages("RPostgreSQL")
# Attach the PostgreSQL driver package. library() stops right here if the
# package is not installed, whereas require() would only warn and let the
# script fail later at dbDriver().
library("RPostgreSQL")
# Load the PostgreSQL driver, create a connection to the postgres database
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables instead.
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "nitrofungi", host = "localhost", port = 5432, user = "ruser", password = "ruser")
# sequencing_sql_string <- paste("SELECT * FROM sequencing", sep="")
# sequencing <- data.frame(dbGetQuery(con, sequencing_sql_string))
# Fetch every row of the greenhouse table into a data frame.
greenhouse_sql_string <- "SELECT * FROM greenhouse"
greenhouse <- data.frame(dbGetQuery(con, greenhouse_sql_string))
# Close the connection as soon as the data is in memory.
dbDisconnect(con)
[1] TRUE
# List the distinct species present in the greenhouse data.
unique(greenhouse[["species"]])
[1] "E. barberi" "E. brookeriana" "E. cordata" "E. dalrympleana" "E. globulus" "E. gunnii" "E. johnstonii" "E. morrisbyi"
[9] "E. ovata" "E. perriniana" "E. rodwayi" "E. rubida" "E. subcrenulata" "E. urnigera" "E. viminalis"
# Keep only the "E. globulus" rows, then list the available columns.
e_globulus <- greenhouse[which(greenhouse$species == "E. globulus"), , drop = FALSE]
names(e_globulus)
[1] "pot" "species"
[3] "lineage" "soil"
[5] "nitrogen" "fungi"
[7] "block" "height"
[9] "biomass" "number_of_roots_colonized_by_ecm_fungi"
[11] "number_of_roots_colonized_by_am_fungi" "number_of_roots_colonized_by_dse_fungi"
[13] "number_of_roots_colonized_by_nonfilamentous_fungi"
# List the distinct nitrogen treatment labels.
unique(greenhouse[["nitrogen"]])
[1] "low N" "high N"
# Split the E. globulus rows by nitrogen treatment. The row mask has to be
# built from e_globulus itself: a mask built from greenhouse$nitrogen is
# aligned to the rows of the full table, not to the filtered E. globulus rows,
# so it picks the wrong plants.
low_nitrogen <- subset(e_globulus, e_globulus$nitrogen == "low N")
high_nitrogen <- subset(e_globulus, e_globulus$nitrogen == "high N")
# Standard deviation of biomass within each treatment, ignoring missing values.
SD_e_globulusSD_low <- sd(low_nitrogen$biomass, na.rm = TRUE)
SD_e_globulusSD_high <- sd(high_nitrogen$biomass, na.rm = TRUE)
# NOTE(review): the values recorded below were produced with the earlier
# greenhouse-based mask; re-run to refresh them.
SD_e_globulusSD_low
[1] 3.302322
SD_e_globulusSD_high
[1] 3.686937
# Plot the biomass of every E. globulus row against its position in the vector.
plot(e_globulus$biomass)
# The same view restricted to the "low N" rows, then to the "high N" rows.
plot(subset(e_globulus$biomass, e_globulus$nitrogen == "low N"), main = "Biomass for e_globulus low nitrogen")
plot(subset(e_globulus$biomass, e_globulus$nitrogen == "high N"), main = "Biomass for e_globulus high nitrogen")
Visual Displays of Variability: Error Bars
# NOTE: the author suspects this chart is scientifically questionable.
# Work on the "E. globulus" rows only.
e_globulus <- greenhouse[which(greenhouse$species == "E. globulus"), , drop = FALSE]
# Directory that receives the rendered image file.
FileDirectory <- "/home/daiten/Programming/R/Projects/Biostatistics Tutorial/Media/"
# Start with an empty container for the per-block statistics.
biomasses <- NULL
# Draw one vertical whisker with flat caps at both ends for each position.
#   x      horizontal positions of the whiskers
#   y      one end of each whisker
#   upper  the other end of each whisker
#   lower  accepted (defaults to `upper`) but not used by the body
#   length cap size, handed to arrows()
#   ...    further arguments forwarded to arrows()
error.bar <- function(x, y, upper, lower = upper, length = 0.1, ...) {
  # angle = 90 with code = 3 turns the arrow heads into flat caps at both ends.
  arrows(
    x0 = x, y0 = upper,
    x1 = x, y1 = y,
    length = length, angle = 90, code = 3,
    ...
  )
}
# Build a 4 x 3 matrix with one row per block (1 to 4) holding the mean,
# minimum and maximum biomass: the bar height and the two whisker ends.
block_stats <- function(block_id) {
  values <- e_globulus$biomass[which(e_globulus$block == block_id)]
  c(mean(values), min(values), max(values))
}
# vapply() yields one column per block; transpose to get one row per block.
biomasses <- t(vapply(1:4, block_stats, numeric(3)))
# Draw the per-block mean bars with min-to-max whiskers on the active device
# and return the bar midpoints from barplot(). Both renders below share this
# helper so they use the same y-axis headroom (+5 above the largest maximum).
draw_biomass_bars <- function() {
  midpoints <- barplot(
    biomasses[, 1],
    ylim = c(0, max(biomasses[, 3]) + 5),
    main = "Bar is mean, bottom is min value, top is max value."
  )
  error.bar(midpoints, biomasses[, 2], biomasses[, 3])
  midpoints
}
# Generate inline image
biomassplot <- draw_biomass_bars()
# Generate image file. The target is named ".jpg", so open a JPEG device;
# png() would write PNG data under a JPEG file name.
jpeg(paste0(FileDirectory, "ArrowBars.jpg"), width = 1000, height = 1000)
biomassplot <- draw_biomass_bars()
# Close the file device so the image is flushed to disk.
dev.off()
png
2
Visual Displays of Variability: Box Plot
# Work on the "E. globulus" rows, split into one data frame per block.
e_globulus <- greenhouse[which(greenhouse$species == "E. globulus"), , drop = FALSE]
block1 <- e_globulus[which(e_globulus$block == "1"), , drop = FALSE]
block2 <- e_globulus[which(e_globulus$block == "2"), , drop = FALSE]
block3 <- e_globulus[which(e_globulus$block == "3"), , drop = FALSE]
block4 <- e_globulus[which(e_globulus$block == "4"), , drop = FALSE]
# BoxPlots
# Four 2 x 2 panels, each with one box plot per block:
# heights then biomass, each for the control and the conspecific soil group.
block_frames <- list(block1, block2, block3, block4)
panels <- list(
  list(column = "height", label = "heights", soil = "control"),
  list(column = "height", label = "heights", soil = "conspecific"),
  list(column = "biomass", label = "Biomass", soil = "control"),
  list(column = "biomass", label = "Biomass", soil = "conspecific")
)
for (panel in panels) {
  # Start a fresh 2 x 2 grid for every measure / soil combination.
  par(mfrow = c(2, 2))
  for (block_id in seq_along(block_frames)) {
    block_data <- block_frames[[block_id]]
    values <- block_data[[panel$column]][which(block_data$soil == panel$soil)]
    boxplot(
      values,
      main = paste0("Block ", block_id, ", ", panel$label, " - ", panel$soil, " group")
    )
  }
}