Statistical Modelling in R

During my Master's year at university, I undertook a Statistics elective module, where I conducted data analysis and statistical modelling in MATLAB to predict space mission costs over time. Subsequently, I completed a professional development course in R and recreated this project to showcase my adaptability and proficiency with its libraries and functions.

Abstract

"This investigation focused on modelling the space mission costs in terms of the time index predictor variable by implementing and comparing different regression models, from a 1st degree (linear) model to an 8th degree polynomial.

"A 6th degree polynomial regression model, which was then applied to a subset of the original data, was ultimately found to produce the best results by minimising the amount of information lost in the fitting process according to Akaike’s Information Criterion.

"A bootstrapping technique was also implemented which made it possible to calculate a 95% pointwise confidence band for the fitted response estimated from the data subset."

Below is a snippet of the code that was used for this task. I used the R statistical analysis package along with several of its useful libraries, including readr for data ingestion, dplyr for data cleaning and manipulation, ggplot2 for data visualisation, among others.

A sample of the plots generated for this project is shown in the gallery at the bottom of the page.

library(readr)   #Library to read csv files
library(dplyr)   #Library for data analysis
library(ggplot2) #Library for plotting visualisations

#Import the data
data <- read_csv('data1342785.csv')
head(data)

timeIndex <- data$X
cost <- data$Y

#Exploratory data analysis and histogram
costMean <- mean(cost)      #Mean cost
costMedian <- median(cost)  #Median cost
costTrimMean <- mean(cost, trim=0.1) #10% trimmed mean i.e. remove top and bottom 10% of data
costSTD <- sd(cost)         #Sample standard deviation

histogram <- data.frame(cost) %>%
  ggplot(aes(cost))+
  geom_histogram(bins=13,binwidth=0.0003,color='black', fill=rgb(37/256, 150/256, 190/256), alpha=0.8)+
  labs(title=expression(bold('Histogram for the Space Mission Costs')),
       x=expression(bold('Space Mission Costs/ Billions of Pounds')),
       y=expression(bold('Count')))+
  theme(panel.border = element_rect(colour = "black", fill=NA))+
  scale_y_continuous(expand = c(0, 0), limits = c(0, 175))+
  theme(plot.title = element_text(hjust = 0.5, size=16),
        axis.title=element_text(size=16))
histogram

#.
#.
#.

#Simple linear regression model
model1 <- lm(cost ~ timeIndex)  #Fit a linear regression model to the data

xlin1 <- seq(first(timeIndex), last(timeIndex), length.out=length(timeIndex)) #Create 851 equally spaced timeIndex points (851 because geom_line() below requires same number of points as original data in scatter)
y1_predicted <- predict(model1, list(timeIndex=xlin1))                    #Calculate predicted values of y (cost) based on the linear model fitted

scatter_poly1 <- data.frame(timeIndex) %>%
  ggplot(aes(x=timeIndex, y=cost))+
  geom_point(color=rgb(37/256, 150/256, 190/256), alpha=0.8)+
  labs(title=expression(bold('Scatter Plot and Regression Line of Space Mission Costs vs. Time Index')),
       x=expression(bold('Time Index')),
       y=expression(bold('Space Mission Costs/ Billions of Pounds')))+
  theme(panel.border = element_rect(colour = "black", fill=NA))+
  scale_y_continuous(expand = c(0, 0), limits = c(min(cost)-0.0005, max(cost)+0.0005))+
  scale_x_continuous(expand = c(0, 0), limits = c(0, max(timeIndex)+10))+
  geom_line(aes(y=y1_predicted), colour='red', linewidth=1)+
  theme(plot.title = element_text(hjust = 0.5, size=16),
        axis.title=element_text(size=16))
scatter_poly1

#Higher order polynomial fits were also calculated
#.
#.
#.

#Exploring error terms on 6th order polynomial model
residuals_6 <- model6$residuals

  #Are the errors independent? Are they centred at 0? Do they have constant
  #variance (i.e. 'width') across all x values?
scatter_residuals <- data.frame(stdTimeIndex) %>%
  ggplot(aes(x=stdTimeIndex, y=residuals_6))+
  geom_point(color=rgb(37/256, 150/256, 190/256), alpha=0.8)+
  labs(title=expression(bold('Scatter Plot of Residuals vs. Time Index')),
       x=expression(bold('Standardised Time Index')),
       y=expression(bold('Residuals/ Billions of Pounds')))+
  theme(panel.border = element_rect(colour = "black", fill=NA))+
  scale_y_continuous(expand = c(0, 0), limits = c(-0.0015, 0.0015))+
  scale_x_continuous(expand = c(0, 0), limits = c(min(stdTimeIndex)-0.1, max(stdTimeIndex)+0.1))+
  theme(plot.title = element_text(hjust = 0.5, size=16),
        axis.title=element_text(size=16))
scatter_residuals

#Are the errors normally distributed?
residuals_6_df <- data.frame(residuals_6)
qqplot <- residuals_6_df %>%
  ggplot(aes(sample=residuals_6))+
  stat_qq_line(color='red')+
  stat_qq(color=rgb(37/256, 150/256, 190/256), alpha=0.8)+
  labs(title=expression(bold('Q-Q Plot of Residuals vs. Standard Normal Distribution')),
       x=expression(bold('Quantiles of Normal Distribution')),
       y=expression(bold('Quantiles of Input Sample')))+
  theme(panel.border = element_rect(colour = "black", fill=NA))+
  scale_y_continuous(expand = c(0, 0), limits = c(-0.0015, 0.0015))+
  theme(plot.title = element_text(hjust = 0.5, size=16),
        axis.title=element_text(size=16))
qqplot

#.
#.
#.

#Fit 6th degree model on subset of data
stdTimeIndex_subset <- (timeIndex_subset-mean(timeIndex_subset))/sd(timeIndex_subset)
stdTimeIndex_subset_2 <- stdTimeIndex_subset^2
stdTimeIndex_subset_3 <- stdTimeIndex_subset^3
stdTimeIndex_subset_4 <- stdTimeIndex_subset^4
stdTimeIndex_subset_5 <- stdTimeIndex_subset^5
stdTimeIndex_subset_6 <- stdTimeIndex_subset^6

model6_subset <- lm(cost_subset ~ stdTimeIndex_subset + stdTimeIndex_subset_2 + stdTimeIndex_subset_3 + stdTimeIndex_subset_4 + stdTimeIndex_subset_5 + stdTimeIndex_subset_6)

xlin_subset <- seq(first(stdTimeIndex_subset), last(stdTimeIndex_subset), length.out=length(stdTimeIndex_subset))
y6_predicted_subset <- predict(model6_subset, list(stdTimeIndex_subset=xlin_subset, stdTimeIndex_subset_2=xlin_subset^2, stdTimeIndex_subset_3=xlin_subset^3, stdTimeIndex_subset_4=xlin_subset^4, stdTimeIndex_subset_5=xlin_subset^5, stdTimeIndex_subset_6=xlin_subset^6))

residuals_6_subset <- model6_subset$residuals     #Get residuals from model6_subset of Question 2f
fitted_6_subset <- model6_subset$fitted.values    #Get fitted y values calculated during fitting of model6_subset

#Bootstrapping manually
k <- 1000
predictedNewResponseModel <- array(numeric(), c(k, length(stdTimeIndex_subset)))  #This array will contain all the new response predictions from the new model below. Each row represents one bootstrap repetition, each column corresponds to each value of X

  #Resampling with replacement to generate k sets of refitted values (.i. k lines of best fit)
for (m in 1:k){
  residualsResample <- sample(residuals_6_subset, size=length(residuals_6_subset), replace=TRUE)
  newResponse <- fitted_6_subset + residualsResample
  newResponseModel <- lm(newResponse ~ stdTimeIndex_subset + stdTimeIndex_subset_2 + stdTimeIndex_subset_3 + stdTimeIndex_subset_4 + stdTimeIndex_subset_5 + stdTimeIndex_subset_6)
  predictedNewResponseModel[m,] <- newResponseModel$fitted.values
}

  #Find 95% C.I for each value of X
quants <- array(numeric(), c(length(stdTimeIndex_subset), 2))
  
for (n in 1:length(stdTimeIndex_subset)){
  quants[n,1] <- quantile(predictedNewResponseModel[,n], 0.025)
  quants[n,2] <- quantile(predictedNewResponseModel[,n], 0.975)
}

  #Plot confidence band by joining all lower bounds as a line and all upper bounds as a different line
scatter_subset_bootstrap <- data.frame(stdTimeIndex_subset) %>%
  ggplot(aes(x=stdTimeIndex_subset, y=cost_subset))+
  geom_point(color=rgb(37/256, 150/256, 190/256), alpha=0.8)+
  labs(title=expression(bold('Scatter Plot and Regression Line of Space Mission Costs vs. Time Index for a Data Subset')),
       x=expression(bold('Standardised Time Index')),
       y=expression(bold('Space Mission Costs/ Billions of Pounds')))+
  theme(panel.border = element_rect(colour = "black", fill=NA))+
  scale_y_continuous(expand = c(0, 0), limits = c(min(cost_subset)-0.0005, max(cost_subset)+0.0005))+
  scale_x_continuous(expand = c(0, 0), limits = c(min(stdTimeIndex_subset)-0.1, max(stdTimeIndex_subset)+0.1))+
  geom_line(aes(y=y6_predicted_subset, colour='6th Degree'), linewidth=1)+
  geom_line(aes(y=quants[,1], colour='Bootstrapping 95%\nConfidence Band'), linetype='dashed', linewidth=0.75)+
  geom_line(aes(y=quants[,2], colour='Bootstrapping 95%\nConfidence Band'), linetype='dashed', linewidth=0.75)+
  scale_color_manual(name='Regression \nModel', values=c('red', 'black'))+
  theme(plot.title = element_text(hjust = 0.5, size=16),
        axis.title=element_text(size=16))
scatter_subset_bootstrap

Download report (PDF)

Download