Link to Home

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# I used this data first then commented it out so I could use my own data. 

# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# summary(z$myVar)

# The data I chose for this assignment was the snout to vent lengths of salamanders measured during a research project at Murray State University. 

# First I found the path to my data to bring it into the markdown file.
z <- read.table(file="mysalamanderdata.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    163 obs. of  4 variables:
##  $ Tank   : int  1 1 1 1 2 2 2 2 2 2 ...
##  $ SVL    : num  34 33 32.5 32 49 49 48 47 46 42 ...
##  $ Morph  : chr  "Paedomorph" "Paedomorph" "Paedomorph" "Paedomorph" ...
##  $ Density: chr  "H" "H" "H" "H" ...
summary(z)
##       Tank            SVL           Morph             Density         
##  Min.   : 1.00   Min.   :31.00   Length:163         Length:163        
##  1st Qu.:11.00   1st Qu.:38.00   Class :character   Class :character  
##  Median :22.00   Median :41.50   Mode  :character   Mode  :character  
##  Mean   :20.33   Mean   :41.32                                        
##  3rd Qu.:31.00   3rd Qu.:45.50                                        
##  Max.   :40.00   Max.   :52.50

Plot histogram of data

# This chunk of code generates a histogram with the SVL data on the x axis and the density of the different SVL values on the y axis. The color is cornsilk and the border is gray60.
p1 <- ggplot(data=z, aes(x=SVL, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add empirical density curve

# Adding a dotted line to show empirical density. 
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars <- fitdistr(z$SVL,"normal")
print(normPars)
##       mean          sd    
##   41.3159509    5.2600931 
##  ( 0.4120023) ( 0.2913296)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 41.32 5.26
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.412 0.291
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.1697 0 0 0.0849
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 163
##  $ loglik  : num -502
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 41.31595

Plot normal probability density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$SVL),len=length(z$SVL))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$SVL), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars <- fitdistr(z$SVL,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$SVL), args = list(rate=rateML))
 p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$SVL), args = list(min=min(z$SVL), max=max(z$SVL)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$SVL), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability density

pSpecial <- ggplot(data=z, aes(x=SVL/(max(SVL + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$SVL/max(z$SVL + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$SVL), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

What is the best fitting distribution?

The normal and the gamma distributions fit the salamander data very well. I think the gamma distribution is preferred, because it can not go below zero and it would not be possible to have a negative salamander length.

Simulate a new data set

# Max likelihood parameters from gamma distribution

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

# Make a data frame that is the same length as my salamander data
x <- rgamma(n= length(z$SVL), shape = 61.31629, rate = 1.484091)
d <- seq(1:length(x))

w <- data.frame(x,d)
print(w)
##            x   d
## 1   41.31033   1
## 2   38.25418   2
## 3   43.49803   3
## 4   42.96262   4
## 5   40.76754   5
## 6   41.37000   6
## 7   41.43949   7
## 8   36.76637   8
## 9   37.29304   9
## 10  42.64329  10
## 11  39.66136  11
## 12  33.22791  12
## 13  45.54481  13
## 14  38.64973  14
## 15  40.19532  15
## 16  45.01525  16
## 17  34.69510  17
## 18  41.75461  18
## 19  47.39548  19
## 20  48.35820  20
## 21  39.29257  21
## 22  40.87920  22
## 23  39.02372  23
## 24  41.76247  24
## 25  43.88657  25
## 26  33.27813  26
## 27  44.18021  27
## 28  33.46629  28
## 29  44.00712  29
## 30  40.38999  30
## 31  31.59694  31
## 32  43.61473  32
## 33  38.10792  33
## 34  34.24055  34
## 35  46.33775  35
## 36  41.97445  36
## 37  40.82371  37
## 38  37.34814  38
## 39  40.32225  39
## 40  48.80446  40
## 41  50.21556  41
## 42  43.94536  42
## 43  39.17344  43
## 44  34.30680  44
## 45  33.54461  45
## 46  45.50979  46
## 47  37.90295  47
## 48  41.78235  48
## 49  38.89282  49
## 50  36.52100  50
## 51  48.84526  51
## 52  41.40780  52
## 53  37.96659  53
## 54  48.78766  54
## 55  39.24496  55
## 56  44.03315  56
## 57  39.44382  57
## 58  41.81479  58
## 59  33.15339  59
## 60  35.94831  60
## 61  46.03616  61
## 62  37.41648  62
## 63  39.93257  63
## 64  44.76032  64
## 65  40.26711  65
## 66  30.85936  66
## 67  33.34479  67
## 68  40.92175  68
## 69  40.45562  69
## 70  38.32099  70
## 71  41.45568  71
## 72  47.39144  72
## 73  32.22193  73
## 74  39.38528  74
## 75  43.32853  75
## 76  36.85656  76
## 77  35.46365  77
## 78  30.49507  78
## 79  45.96554  79
## 80  42.19347  80
## 81  33.99445  81
## 82  50.08578  82
## 83  43.91466  83
## 84  52.34746  84
## 85  33.93599  85
## 86  32.93009  86
## 87  39.47434  87
## 88  47.52658  88
## 89  40.71917  89
## 90  39.75417  90
## 91  37.11067  91
## 92  35.95797  92
## 93  33.85981  93
## 94  40.54876  94
## 95  37.31650  95
## 96  41.58866  96
## 97  45.73985  97
## 98  50.43401  98
## 99  43.93742  99
## 100 46.03053 100
## 101 39.39000 101
## 102 40.42237 102
## 103 34.83339 103
## 104 34.34994 104
## 105 33.89123 105
## 106 38.34387 106
## 107 43.41919 107
## 108 44.02770 108
## 109 41.85346 109
## 110 39.05616 110
## 111 63.24877 111
## 112 35.34968 112
## 113 44.08738 113
## 114 43.13608 114
## 115 39.51851 115
## 116 43.04888 116
## 117 29.88794 117
## 118 35.98040 118
## 119 53.34408 119
## 120 37.45347 120
## 121 43.92195 121
## 122 48.88809 122
## 123 35.23307 123
## 124 43.38850 124
## 125 39.14108 125
## 126 42.94802 126
## 127 29.04348 127
## 128 46.57105 128
## 129 46.06045 129
## 130 39.56143 130
## 131 48.61319 131
## 132 39.78477 132
## 133 46.86014 133
## 134 31.81476 134
## 135 45.23502 135
## 136 51.34368 136
## 137 40.68969 137
## 138 37.72666 138
## 139 38.88463 139
## 140 35.60393 140
## 141 36.97400 141
## 142 37.04688 142
## 143 47.45415 143
## 144 38.91125 144
## 145 36.55610 145
## 146 28.67777 146
## 147 40.55858 147
## 148 38.78853 148
## 149 39.41163 149
## 150 34.18472 150
## 151 36.99409 151
## 152 54.34728 152
## 153 43.37272 153
## 154 33.50107 154
## 155 37.17458 155
## 156 36.77739 156
## 157 36.06425 157
## 158 39.51574 158
## 159 42.00440 159
## 160 38.04964 160
## 161 41.42288 161
## 162 44.48356 162
## 163 46.69289 163
# Make a histogram
p1 <- ggplot(data=w, aes(x=x, y=..density..)) +
  geom_histogram(color="grey60",fill="deepskyblue4",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Add gamma distribution
gammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

new_gamma <- stat_function(aes(x = x, y = ..y..), fun = dgamma, colour="brown", n = nrow(w), args = list(shape=shapeML, rate=rateML))
print(p1+new_gamma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Fresh histogram of original salamander data
p1 <- ggplot(data=z, aes(x=SVL, y=..density..)) +
  geom_histogram(color="grey60",fill="deepskyblue4",size=0.2)

gammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$SVL), args = list(shape=shapeML, rate=rateML))
 p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# I don't know why the model is creating an x-axis that goes all the way to zero for some plots, and for others the axis starts at a higher value. I tried using xlim to change the x-axis, but I had a hard time with that. 

Final questions

  1. How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

Based on this project, the model does an excellent job of simulating realistic data. Because the axes of my graphs are different (I’m not sure why), at first glance it would appear that the simulated data set is more varied than the real data, but both data sets are actually within the range of 30-50 or so.

  1. If you have entered a large data frame with many columns, try running all of the code on a different variable to see how the simulation performs.

This is not really possible for me (as far as I know) because the other columns of my data set are characters.