Link to Home
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# I used this data first then commented it out so I could use my own data.
# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# summary(z$myVar)
# The data I chose for this assignment was the snout to vent lengths of salamanders measured during a research project at Murray State University.
# First I found the path to my data to bring it into the markdown file.
z <- read.table(file="mysalamanderdata.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 163 obs. of 4 variables:
## $ Tank : int 1 1 1 1 2 2 2 2 2 2 ...
## $ SVL : num 34 33 32.5 32 49 49 48 47 46 42 ...
## $ Morph : chr "Paedomorph" "Paedomorph" "Paedomorph" "Paedomorph" ...
## $ Density: chr "H" "H" "H" "H" ...
summary(z)
## Tank SVL Morph Density
## Min. : 1.00 Min. :31.00 Length:163 Length:163
## 1st Qu.:11.00 1st Qu.:38.00 Class :character Class :character
## Median :22.00 Median :41.50 Mode :character Mode :character
## Mean :20.33 Mean :41.32
## 3rd Qu.:31.00 3rd Qu.:45.50
## Max. :40.00 Max. :52.50
# This chunk of code generates a histogram with the SVL data on the x axis and the density of the different SVL values on the y axis. The color is cornsilk and the border is gray60.
p1 <- ggplot(data=z, aes(x=SVL, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Adding a dotted line to show empirical density.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normal
normPars <- fitdistr(z$SVL,"normal")
print(normPars)
## mean sd
## 41.3159509 5.2600931
## ( 0.4120023) ( 0.2913296)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 41.32 5.26
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.412 0.291
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.1697 0 0 0.0849
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 163
## $ loglik : num -502
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 41.31595
normal
probability densitymeanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$SVL),len=length(z$SVL))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$SVL), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
exponential
probability densityexpoPars <- fitdistr(z$SVL,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$SVL), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
uniform
probability densitystat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$SVL), args = list(min=min(z$SVL), max=max(z$SVL)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gamma
probability densitygammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$SVL), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
beta
probability densitypSpecial <- ggplot(data=z, aes(x=SVL/(max(SVL + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$SVL/max(z$SVL + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$SVL), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
The normal
and the gamma
distributions fit the salamander data very well. I think the gamma
distribution is preferred, because it can not go below zero and it would not be possible to have a negative salamander length.
# Max likelihood parameters from gamma distribution
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
# Make a data frame that is the same length as my salamander data
x <- rgamma(n= length(z$SVL), shape = 61.31629, rate = 1.484091)
d <- seq(1:length(x))
w <- data.frame(x,d)
print(w)
## x d
## 1 41.31033 1
## 2 38.25418 2
## 3 43.49803 3
## 4 42.96262 4
## 5 40.76754 5
## 6 41.37000 6
## 7 41.43949 7
## 8 36.76637 8
## 9 37.29304 9
## 10 42.64329 10
## 11 39.66136 11
## 12 33.22791 12
## 13 45.54481 13
## 14 38.64973 14
## 15 40.19532 15
## 16 45.01525 16
## 17 34.69510 17
## 18 41.75461 18
## 19 47.39548 19
## 20 48.35820 20
## 21 39.29257 21
## 22 40.87920 22
## 23 39.02372 23
## 24 41.76247 24
## 25 43.88657 25
## 26 33.27813 26
## 27 44.18021 27
## 28 33.46629 28
## 29 44.00712 29
## 30 40.38999 30
## 31 31.59694 31
## 32 43.61473 32
## 33 38.10792 33
## 34 34.24055 34
## 35 46.33775 35
## 36 41.97445 36
## 37 40.82371 37
## 38 37.34814 38
## 39 40.32225 39
## 40 48.80446 40
## 41 50.21556 41
## 42 43.94536 42
## 43 39.17344 43
## 44 34.30680 44
## 45 33.54461 45
## 46 45.50979 46
## 47 37.90295 47
## 48 41.78235 48
## 49 38.89282 49
## 50 36.52100 50
## 51 48.84526 51
## 52 41.40780 52
## 53 37.96659 53
## 54 48.78766 54
## 55 39.24496 55
## 56 44.03315 56
## 57 39.44382 57
## 58 41.81479 58
## 59 33.15339 59
## 60 35.94831 60
## 61 46.03616 61
## 62 37.41648 62
## 63 39.93257 63
## 64 44.76032 64
## 65 40.26711 65
## 66 30.85936 66
## 67 33.34479 67
## 68 40.92175 68
## 69 40.45562 69
## 70 38.32099 70
## 71 41.45568 71
## 72 47.39144 72
## 73 32.22193 73
## 74 39.38528 74
## 75 43.32853 75
## 76 36.85656 76
## 77 35.46365 77
## 78 30.49507 78
## 79 45.96554 79
## 80 42.19347 80
## 81 33.99445 81
## 82 50.08578 82
## 83 43.91466 83
## 84 52.34746 84
## 85 33.93599 85
## 86 32.93009 86
## 87 39.47434 87
## 88 47.52658 88
## 89 40.71917 89
## 90 39.75417 90
## 91 37.11067 91
## 92 35.95797 92
## 93 33.85981 93
## 94 40.54876 94
## 95 37.31650 95
## 96 41.58866 96
## 97 45.73985 97
## 98 50.43401 98
## 99 43.93742 99
## 100 46.03053 100
## 101 39.39000 101
## 102 40.42237 102
## 103 34.83339 103
## 104 34.34994 104
## 105 33.89123 105
## 106 38.34387 106
## 107 43.41919 107
## 108 44.02770 108
## 109 41.85346 109
## 110 39.05616 110
## 111 63.24877 111
## 112 35.34968 112
## 113 44.08738 113
## 114 43.13608 114
## 115 39.51851 115
## 116 43.04888 116
## 117 29.88794 117
## 118 35.98040 118
## 119 53.34408 119
## 120 37.45347 120
## 121 43.92195 121
## 122 48.88809 122
## 123 35.23307 123
## 124 43.38850 124
## 125 39.14108 125
## 126 42.94802 126
## 127 29.04348 127
## 128 46.57105 128
## 129 46.06045 129
## 130 39.56143 130
## 131 48.61319 131
## 132 39.78477 132
## 133 46.86014 133
## 134 31.81476 134
## 135 45.23502 135
## 136 51.34368 136
## 137 40.68969 137
## 138 37.72666 138
## 139 38.88463 139
## 140 35.60393 140
## 141 36.97400 141
## 142 37.04688 142
## 143 47.45415 143
## 144 38.91125 144
## 145 36.55610 145
## 146 28.67777 146
## 147 40.55858 147
## 148 38.78853 148
## 149 39.41163 149
## 150 34.18472 150
## 151 36.99409 151
## 152 54.34728 152
## 153 43.37272 153
## 154 33.50107 154
## 155 37.17458 155
## 156 36.77739 156
## 157 36.06425 157
## 158 39.51574 158
## 159 42.00440 159
## 160 38.04964 160
## 161 41.42288 161
## 162 44.48356 162
## 163 46.69289 163
# Make a histogram
p1 <- ggplot(data=w, aes(x=x, y=..density..)) +
geom_histogram(color="grey60",fill="deepskyblue4",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Add gamma distribution
gammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
new_gamma <- stat_function(aes(x = x, y = ..y..), fun = dgamma, colour="brown", n = nrow(w), args = list(shape=shapeML, rate=rateML))
print(p1+new_gamma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Fresh histogram of original salamander data
p1 <- ggplot(data=z, aes(x=SVL, y=..density..)) +
geom_histogram(color="grey60",fill="deepskyblue4",size=0.2)
gammaPars <- fitdistr(z$SVL,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$SVL), args = list(shape=shapeML, rate=rateML))
p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# I don't know why the model is creating an x-axis that goes all the way to zero for some plots, and for others the axis starts at a higher value. I tried using xlim to change the x-axis, but I had a hard time with that.
Based on this project, the model does an excellent job of simulating realistic data. Because the axes of my graphs are different (I’m not sure why), at first glance it would appear that the simulated data set is more varied than the real data, but both data sets are actually within the range of 30-50 or so.
This is not really possible for me (as far as I know) because the other columns of my data set are characters.