Link to Home

Writing functions to simulate and analyze deer data

# Description -----------------------------------
# This is a script for Homework 9. I am using the data from Homework 8. I am going to focus on my predictions for the infection rate of Plasmodium odocoilei in white tailed deer. Based on previous research, I hypothesize that white-tailed deer across Vermont, New Hampshire, and Maine are infected with Plasmodium odocoilei at a rate of about 25%.

# 23 Mar 2022
# LEB

# Initialize ------------------------------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load Functions --------------------------------

library(boot)
library(ggplot2)
library(broom)
source("Functions/sim_deer_data.R")
source("Functions/calculate_deer_stuff.R")
source("Functions/graph_deer_results.R")

# Global Variables ------------------------------

VT <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))
NH <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))
ME <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))

# Program Body ----------------------------------

Deer_Data <- sim_deer_data() # create fake data frame
head(Deer_Data)
##          VT        NH        ME
## 1 0.4562172 0.4540134 0.5789553
## 2 0.4991290 0.6186916 0.6173265
## 3 0.3658591 0.3898670 0.5297237
## 4 0.4286593 0.5245625 0.5384617
## 5 0.5017154 0.6921359 0.3888316
## 6 0.4798130 0.2873462 0.4934792
Deer_Data <- Deer_Data %>%
      pivot_longer(cols=VT:ME,
                   names_to = "State",
                   values_to = "InfectionRate")

return(head(Deer_Data))
## # A tibble: 6 × 2
##   State InfectionRate
##   <chr>         <dbl>
## 1 VT            0.456
## 2 NH            0.454
## 3 ME            0.579
## 4 VT            0.499
## 5 NH            0.619
## 6 ME            0.617
Deer_ANOVA <- calculate_deer_stuff(Deer_Data)
print(Deer_ANOVA)
##               Df Sum Sq  Mean Sq F value Pr(>F)
## x_var          2  0.011 0.005494   0.388  0.679
## Residuals   1497 21.219 0.014174
graph_deer_results(Deer_Data) # create graph
## data plotted

Changing the sample size

# My function for simulating data is already set up to generate random infection rates every time you run it. So, the data for the following script will be different than that for the one above based on that alone. I can also alter the sample size, mean, and standard deviation of the randomly generated data set, which I do here. 

# Initialize ------------------------------------
library(tidyverse)

# Load Functions --------------------------------

library(boot)
library(ggplot2)
library(broom)
source("Functions/sim_deer_data.R")
source("Functions/calculate_deer_stuff.R")
source("Functions/graph_deer_results.R")

# Global Variables ------------------------------

VT <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))
NH <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))
ME <- inv.logit(rnorm(n=500,mean=0.25,sd=.50))

# Program Body ----------------------------------

Deer_Data <- sim_deer_data(size=1000,mean=0.25,sd=0.50) # The default values for this function are size=500,mean=0.25,sd=0.50. Here I alter the sample size.
head(Deer_Data)
##          VT        NH        ME
## 1 0.5111999 0.6094034 0.4788216
## 2 0.5517366 0.5032753 0.6736983
## 3 0.6306354 0.6917094 0.5723545
## 4 0.4402477 0.5066606 0.7226061
## 5 0.3726307 0.4829441 0.6661718
## 6 0.7525471 0.4281650 0.8316942
Deer_Data <- Deer_Data %>%
      pivot_longer(cols=VT:ME,
                   names_to = "State",
                   values_to = "InfectionRate")

return(head(Deer_Data))
## # A tibble: 6 × 2
##   State InfectionRate
##   <chr>         <dbl>
## 1 VT            0.511
## 2 NH            0.609
## 3 ME            0.479
## 4 VT            0.552
## 5 NH            0.503
## 6 ME            0.674
Deer_ANOVA <- calculate_deer_stuff(Deer_Data)
print(Deer_ANOVA)
##               Df Sum Sq Mean Sq F value Pr(>F)
## x_var          2   0.00 0.00145   0.105    0.9
## Residuals   2997  41.31 0.01378
graph_deer_results(Deer_Data) # create graph
## data plotted

My functions

I chose to save my functions in separate R scripts, then load them into my homework script. I will copy and paste the functions below so you can see what I did.

#####################################
# FUNCTION: sim_deer_data
# purpose: create fake cervid data then make it longer
# input: sample size, mean, sd
# output: data frame
# ----------------------------------

library(boot)

# create initial data
sim_deer_data <- function(size=500,mean=0.25,sd=0.50) {
  
    d_frame <- data.frame(VT=inv.logit(rnorm(n=size,mean=mean,sd=sd)),
                          NH=inv.logit(rnorm(n=size,mean=mean,sd=sd)),
                          ME=inv.logit(rnorm(n=size,mean=mean,sd=sd)))
    
  return(d_frame)
}


#####################################
# FUNCTION: calculate_deer_stuff
# purpose: run an ANOVA
# input: data frame
# output: ANOVA stats
# ----------------------------------

calculate_deer_stuff <- function(d_frame=Deer_Data) {
  names(d_frame) <- c("x_var","y_var")

  DeerANOVA <- aov(y_var ~ x_var, data=d_frame)
  
  return(summary(DeerANOVA))
}


#####################################
# FUNCTION: graph_deer_results
# purpose: graph data
# input: data frame, identify the x and y variables
# output: create boxplot
# ----------------------------------
library(tidyverse)
library(ggplot2)

graph_deer_results <- function(data=Deer_Data) {
  
  bar_plot <- ggplot(data=data) +
    aes(x=State,y=InfectionRate) +
    geom_boxplot()
  print(bar_plot)
  message("data plotted")
}