rm(list=ls())
Draw samples from various distributions and display their histograms.
We define a function so that this can be repeated with different sample sizes:
test_random <- function(n, binom.trials=32, binom.prob=0.5, pois.lambda=4) {
  # uniform on [0, 1], with the theoretical density as a dashed line
  hist(d <- runif(n), breaks=seq(0, 1, by=0.05),
       freq=FALSE, main="Uniform")
  rug(d); abline(h=1, lty=2, col="red")
  summary(d)
  # standard normal, with the theoretical density curve
  hist(d <- rnorm(n, mean=0, sd=1), freq=FALSE,
       main="Normal"); rug(d)
  x <- seq(-3, 3, length=128); y <- dnorm(x, 0, 1)
  lines(x, y, col="red")
  summary(d)
  # binomial, with the theoretical probability mass function
  hist(d <- rbinom(n, size=binom.trials, prob=binom.prob),
       main="Binomial", freq=FALSE,
       breaks=seq(0, binom.trials, by=1))
  x <- seq(0, binom.trials, by=1); y <- dbinom(x, binom.trials, prob=binom.prob)
  points(x+0.5, y, col="red", type="h")
  table(d)
  # Poisson, with the theoretical probability mass function;
  # the breaks must span all simulated values
  d <- rpois(n, lambda=pois.lambda)
  hist(d, freq=FALSE, breaks=seq(0, max(3*pois.lambda, max(d)), by=1),
       main="Poisson")
  x <- seq(0, 3*pois.lambda, by=1); y <- dpois(x, pois.lambda)
  points(x+0.5, y, col="red", type="h")
  table(d)
  return()
}
Run the function once, with one number of sample draws:
test_random(128)
## NULL
Task: run this with different sample sizes, repeating each sample size a few times (one possible loop is sketched after the questions below).
Q: How does the agreement of the empirical with the theoretical distribution change with increasing sample size?
Q: How does the variability of the empirical distribution change with increasing sample size?
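One possible way to organize this task (the sample sizes and the number of repetitions are arbitrary choices):
for (n in c(64, 512, 4096)) {     # a few increasing sample sizes
  for (rep in 1:3) {              # repeat each size a few times
    test_random(n)
  }
}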
The essential idea is to assume a statistical model for the distribution of one or more variables, and then repeatedly sample from that distribution.
This random sample simulates what might be encountered in reality, with appropriate probability.
This is used for Bernoulli trials: independent events with a given probability of occurring. Each trial is either a “success” (1) or a “failure” (0); we simulate the number of successes.
The classic example is flips of a fair coin:
n <- 1024
sample <- rbinom(n, size=24, prob=0.5);
head(sample, n=20)
## [1] 12 11 11 11 13 12 17 11 11 9 11 10 12 14 11 12 11 10 6 10
(table.k <- table(sample))
## sample
## 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 3 8 23 49 71 99 183 153 155 125 73 52 23 5 2
plot(table.k/n, xlab="k", ylab="density")
A more interesting example is whether a certain group of the population is fairly represented in a sample.
For example: in a certain study programme, 30% of the students are male and 70% female. The best 20 students in the class receive a bonus or a scholarship. Do the best 20 reflect this 30/70 split? Is a deviation from 6 males and 14 females (exactly matching the population proportions) due to chance?
n <- 1024
sample <- rbinom(n, size=20, prob=0.3)
(table.k <- table(sample))
## sample
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 1 4 21 74 125 145 202 192 119 77 40 19 4 1
plot(table.k/n, xlab="k", ylab="density")
Q: What is the maximum and minimum number of males selected by chance?
Q: What is the mode (most common value)? Does it match the expected value from the population?
Q: Suppose only 3 males are selected. How strong is the evidence that they are under-represented? (This could be either due to their poor performance or discrimination, we have no way to distinguish the cause.)
length(which(sample <= 3))/n
## [1] 0.09765625
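Optional checks for the questions above (not in the original code): the empirical mode can be read directly from the table, and the exact binomial tail probability can be compared with the simulated proportion.
# most common simulated value, vs. the expected value 20 * 0.3 = 6
as.integer(names(which.max(table.k)))
# exact P(X <= 3) for X ~ Binomial(20, 0.3), about 0.107,
# to compare with the simulation-based estimate above
pbinom(3, size=20, prob=0.3)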
Suppose we want to know how large a random sample should be to reliably estimate a total.
nsim <- 256
# set up to save results
totals <- vector(mode="integer", length=nsim) # save run results
n <- 32; mu <- 100; s <- 1
for (run in 1:nsim) {
totals[run] <- sum(rnorm(n, mean=mu, sd=s))
}
summary(totals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3184 3197 3200 3200 3204 3216
print(paste("The actual total is", mu*n))
## [1] "The actual total is 3200"
print(paste("Sampling standard deviation of the total:", round(sd(totals),4)))
## [1] "Sampling standard deviation of the total: 5.7118"
hist(totals, breaks=16); rug(totals)
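As a check (not part of the original code), the theoretical standard deviation of the total of n independent draws from N(mu, s) is s * sqrt(n), which should be close to the simulated value above.
# theoretical standard deviation of the total: s * sqrt(n) = sqrt(32), about 5.66
s * sqrt(n)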
Practical example: risk of overloading an aircraft.
Risk of an overweight aircraft on a full 19-seat plane
Passenger weights are assumed to follow a normal distribution
Estimate the mean and standard deviation from measurements of the target population
Separate distributions for males and females: a hierarchical model, gender from a binomial distribution → weight from a gender-specific normal
Estimate the mean proportion of female passengers (the binomial parameter)
Simulate the number of females/males among the 19, from the binomial distribution
Simulate each individual’s weight; sum all 19
Compare to the maximum allowable weight; find the proportion of flights that are overweight
# parameters of normal distribution
# estimate these from the target population
# mean, s.d. of fe/male weights, kg
mu.m <- 80; sd.m <- 14; mu.f <- 65; sd.f <- 12
# mean proportion of female passengers: binomial \theta
prop.f.mu <- 0.35
# simulation
nsim <- 2048 # number of simulations
n.females <- vector(mode="integer", length=nsim) # save run results
wt.sum <- vector(mode="integer", length=nsim) # save run results
n <- 19 # aircraft size (seats)
for (run in 1:nsim) {
# number of fe/males
num.f <- rbinom(n=1, size=n, prob=prop.f.mu)
num.m <- n - num.f
n.females[run] <- num.f
wts.f <- rnorm(num.f, mean=mu.f, sd=sd.f)
wts.m <- rnorm(num.m, mean=mu.m, sd=sd.m)
wt.sum[run] <- ceiling(sum(wts.f) + sum(wts.m))
}
summary(wt.sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1216 1375 1420 1421 1464 1666
summary(n.females)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 7.000 6.687 8.000 14.000
# Fairchild Metro II: empty 3380 kg, max takeoff 5670 kg, total load 2290 kg
load.wt <- 2290; pilots.wt <- 200; fuel.wt <- 600
max.load <- load.wt-pilots.wt-fuel.wt
(n.overweight <- sum(wt.sum > max.load))
## [1] 293
# probability of overweight
(prob.overweight <- round(n.overweight/nsim,3))
## [1] 0.143
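A rough measure of the Monte Carlo uncertainty of this estimate (a sketch, treating each simulated flight as an independent Bernoulli trial):
# approximate standard error of the estimated probability of overweight
round(sqrt(prob.overweight * (1 - prob.overweight) / nsim), 4)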
# figures
# number of females per flight
plot(table(n.females), type="h", xlim=c(0,19),
     ylab="frequency", xlab="females per 19 passengers", lwd=2)
grid()
# number of flights overweight
hist(wt.sum, main="", xlab="Total passenger weight, kg", breaks=24); rug(wt.sum)
abline(v=max.load, col="red")  # maximum allowable passenger load
We can also simulate spatial fields.
There are several obvious problems with kriging predictions over an area, with respect to uncertainty:
Kriging is an exact predictor at known points, because all the weight is given to the known point; this is mathematically necessary but not realistic, since just away from the point the prediction becomes a smooth weighted average of several nearby observations;
Kriging prediction maps are by definition smooth, even if there is a nugget component to the model; the actual spatial field is usually much rougher.
Both of these problems are serious issues for spatially-distributed models. An example is the soil hydraulic conductivity in models of groundwater flow: the actual erratic nature of this attribute leads to much different model outputs than a smoothly-varying field.
So, a map produced by kriging gives an unrealistic view of the fine-scale spatial variability. We can recover this with conditional simulation: this shows one (or many) possible realizations of the spatial field:
as defined by the covariance structure (variogram); and
as constrained by the known data values
What is geostatistical simulation?
The theory of regionalized variables assumes that the values we observe come from some random process
simplest case: one expected value (first-order stationarity) with a spatially-correlated error that is the same over the whole area (second-order stationarity).
The one reality we observe is the results of a random process
There are “alternative realities”; that is, spatial patterns that, by this theory, could have occurred in another realization of the same spatial process.
Geostatistical simulation is the construction of a gridded surface corresponding to a random function, i.e., a model of spatial correlation.
The statistical properties of the simulated surface match those of the sample: spatial mean, spatial variance, and semivariogram (model form, partial sill, nugget variance, range parameter).
Gaussian simulation assumes that the target field is multivariate Gaussian, with a defined stationary spatial mean and covariance structure
This generates multiple, equally probable “realities”, i.e., the spatial distribution of the target attribute
There are various geostatistical simulation algorithms; that used in gstat is described by Pebesma (2004).
library(sp)
library(gstat)
library(lattice)
data(jura)
str(jura.pred)
## 'data.frame': 259 obs. of 13 variables:
## $ Xloc : num 2.39 2.54 2.81 4.31 4.38 ...
## $ Yloc : num 3.08 1.97 3.35 1.93 1.08 ...
## $ long : num 6.85 6.85 6.86 6.88 6.88 ...
## $ lat : num 47.1 47.1 47.1 47.1 47.1 ...
## $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 3 2 2 3 3 3 3 3 3 3 ...
## $ Rock : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 3 2 3 2 5 5 5 1 1 3 ...
## $ Cd : num 1.74 1.33 1.61 2.15 1.56 ...
## $ Co : num 9.32 10 10.6 11.92 16.32 ...
## $ Cr : num 38.3 40.2 47 43.5 38.5 ...
## $ Cu : num 25.72 24.76 8.88 22.7 34.32 ...
## $ Ni : num 21.3 29.7 21.4 29.7 26.2 ...
## $ Pb : num 77.4 77.9 30.8 56.4 66.4 ...
## $ Zn : num 92.6 73.6 64.8 90 88.4 ...
str(jura.val)
## 'data.frame': 100 obs. of 13 variables:
## $ Xloc : num 2.67 3.59 4.01 2.94 1.41 ...
## $ Yloc : num 3.56 4.44 4.71 3.14 2.75 ...
## $ long : num 6.85 6.87 6.87 6.86 6.84 ...
## $ lat : num 47.1 47.2 47.2 47.1 47.1 ...
## $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 3 3 2 2 3 1 1 2 3 1 ...
## $ Rock : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 5 1 1 5 3 2 2 3 2 2 ...
## $ Cd : num 1.57 2.045 1.203 0.49 0.692 ...
## $ Co : num 8.28 10.8 12 10.92 8.12 ...
## $ Cr : num 37.1 40.8 53.2 23.4 27.2 ...
## $ Cu : num 18.6 11.48 13.04 5.64 10.32 ...
## $ Ni : num 18.6 21.5 23.9 14.6 14.6 ...
## $ Pb : num 38.2 33.4 26.6 25.9 31.2 ...
## $ Zn : num 65.2 112.8 91.6 41.2 50.4 ...
str(jura.grid)
## 'data.frame': 5957 obs. of 6 variables:
## $ Xloc : num 0.3 0.35 0.35 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ Yloc : num 1.7 1.7 1.75 1.7 1.75 1.8 1.85 1.9 2.1 2.15 ...
## $ long : num 6.82 6.82 6.82 6.82 6.82 ...
## $ lat : num 47.1 47.1 47.1 47.1 47.1 ...
## $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 2 2 2 2 1 1 1 1 3 3 ...
## $ Rock : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 3 3 3 2 2 2 3 3 1 1 ...
coordinates(jura.pred) <- ~ Xloc + Yloc
class(jura.pred)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
coordinates(jura.grid) <- ~ Xloc + Yloc
class(jura.grid)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
gridded(jura.grid) <- TRUE
class(jura.grid)
## [1] "SpatialPixelsDataFrame"
## attr(,"package")
## [1] "sp"
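As an optional check (not in the original code), the geometry of the prediction grid can be inspected with gridparameters():
gridparameters(jura.grid)   # cell offset, cell size, and grid dimensions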
Target variable:
print(xyplot(Yloc ~ Xloc, as.data.frame(jura.pred),
cex=1.2*jura.pred$Co/max(jura.pred$Co), pch=1,
asp="iso", xlab="East", ylab="North",
main="Co concentration in topsoils, Jura"))
Examine local spatial correlation with an empirical variogram:
v <- variogram(Co ~ 1, loc=jura.pred, cutoff=1.6)
plot(v, plot.numbers=TRUE, xlab="separation distance")
Fit a variogram model, i.e., a model of spatial correlation:
(vmf <- fit.variogram(v, vgm(12.5, "Sph", 1.2, 1.5)))
## model psill range
## 1 Nug 1.066698 0.000000
## 2 Sph 12.949998 1.181867
plot(v, plot.numbers=TRUE, xlab="separation distance", model=vmf)
This is a good model fit.
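One numeric indication of the fit quality is the weighted sum of squared errors that fit.variogram attaches to the fitted model object; shown here as an optional check:
attr(vmf, "SSErr")   # weighted sum of squared errors of the variogram fit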
Predict over the study area with this model using OK:
k.o <- krige(Co ~ 1, loc=jura.pred, newdata=jura.grid, model=vmf)
## [using ordinary kriging]
summary(k.o)
## Object of class SpatialPixelsDataFrame
## Coordinates:
## min max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
## cellcentre.offset cellsize cells.dim
## Xloc 0.3 0.05 97
## Yloc 0.1 0.05 117
## Data attributes:
## var1.pred var1.var
## Min. : 2.876 Min. : 1.503
## 1st Qu.: 7.853 1st Qu.: 3.453
## Median :10.118 Median : 4.040
## Mean : 9.547 Mean : 4.308
## 3rd Qu.:11.419 3rd Qu.: 4.450
## Max. :16.296 Max. :13.902
View a map of the predictions and their standard deviations.
spplot(k.o, zcol="var1.pred", main="Co, ppm", sub="Ordinary Kriging prediction")
spplot(k.o, zcol="var1.var", formula=sqrt(var1.var) ~ Xloc + Yloc,
main="Co, ppm", sub="Ordinary Kriging prediction standard deviation",
col.regions=topo.colors(64))
This is the best single map, but we see that there is uncertainty.
Many simulated fields can be created, each equally valid, and used as model inputs.
Make four realizations of a conditional simulation of the Co concentration in Jura soils over the prediction grid jura.grid, using the OK model.
The krige method can also carry out conditional simulation. It requires one additional argument, nsim, the number of conditional simulations.
For each set of simulations in this section we use set.seed so that your results will look the same as ours; in practice you would only do this if you want to reproduce the same starting point.
if (file.exists("./JuraSim4.RData")) {
  load("./JuraSim4.RData")
} else {
  set.seed(621)
  system.time(
    k.sim.4 <- krige(Co ~ 1, loc = jura.pred,
                     newdata = jura.grid,
                     model=vmf, nsim=4, nmax=128)
  )
}
Save the simulation results – this took a lot of time!
if (!file.exists("./JuraSim4.RData"))
save(k.sim.4, file="./JuraSim4.RData")
Compare the means, medians, IQRs, and ranges of the four simulations, and also compare them with the calibration data set on which the simulations were based.
summary(k.sim.4)
## Object of class SpatialPixelsDataFrame
## Coordinates:
## min max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
## cellcentre.offset cellsize cells.dim
## Xloc 0.3 0.05 97
## Yloc 0.1 0.05 117
## Data attributes:
## sim1 sim2 sim3 sim4
## Min. :-1.357 Min. :-3.695 Min. :-0.9183 Min. :-0.8696
## 1st Qu.: 7.627 1st Qu.: 7.311 1st Qu.: 7.3350 1st Qu.: 7.6448
## Median :10.170 Median : 9.995 Median : 9.7845 Median : 9.8988
## Mean : 9.830 Mean : 9.571 Mean : 9.5288 Mean : 9.7063
## 3rd Qu.:12.176 3rd Qu.:12.037 3rd Qu.:11.8271 3rd Qu.:11.8868
## Max. :20.470 Max. :20.250 Max. :20.0087 Max. :19.4015
summary(jura.pred$Co)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.552 6.520 9.760 9.303 11.980 17.720
Q: How do the ranges of the four simulations compare with the range of the data?
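A sketch for answering this question numerically (assuming the simulation columns are named sim1 … sim4, as above):
sapply(k.sim.4@data, range)     # minimum and maximum of each simulated field
range(jura.pred$Co)             # range of the calibration data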
Visualize:
Display the four conditional simulations in one plot, with the locations of the observation points over-printed.
layout.2 <- list("sp.points", jura.pred, pch=1, cex=1.4*jura.pred$Co/max(jura.pred$Co), col="white")
print(spplot(k.sim.4, zcol=1:4, col.regions=bpy.colors(64),
main="Conditional simulations, Jura Co (ppm), OK",
sp.layout=list(layout.2)))
Simulation is supposed to respect the spatial structure; that is, the simulated field should have the same structure as the variogram model. Let’s see how well it was reproduced.
Compute the empirical variograms of the four simulations and plot them, each with the fitted model vmf. The simulations must first be converted from pixels to points.
k.sim.4.pts <- as( k.sim.4, "SpatialPointsDataFrame")
vs.1 <- variogram(sim1 ~ 1, loc=k.sim.4.pts)
vs.2 <- variogram(sim2 ~ 1, loc=k.sim.4.pts)
vs.3 <- variogram(sim3 ~ 1, loc=k.sim.4.pts)
vs.4 <- variogram(sim4 ~ 1, loc=k.sim.4.pts)
gamma.max <- ceiling(max(vs.1$gamma, vs.2$gamma, vs.3$gamma, vs.4$gamma, sum(vmf[,"psill"])))
p.1 <- plot(vs.1, model=vmf, ylim=c(0,gamma.max), main="Simulation 1")
p.2 <- plot(vs.2, model=vmf, ylim=c(0,gamma.max), main="Simulation 2")
p.3 <- plot(vs.3, model=vmf, ylim=c(0,gamma.max), main="Simulation 3")
p.4 <- plot(vs.4, model=vmf, ylim=c(0,gamma.max), main="Simulation 4")
print(p.1, split=c(1,1,2,2), more=T)
print(p.2, split=c(1,2,2,2), more=T)
print(p.3, split=c(2,1,2,2), more=T)
print(p.4, split=c(2,2,2,2), more=F)
How well do the simulations reproduce the variogram?
Do the empirical variograms of the four simulations match the variogram model? What are the differences? Why?
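One way to make this comparison quantitative (a sketch, not part of the original exercise) is to refit the model form to each simulation’s empirical variogram and compare the fitted parameters with vmf:
# refit the fitted model vmf to each simulation's empirical variogram
vmf.sims <- lapply(list(vs.1, vs.2, vs.3, vs.4),
                   function(v.emp) fit.variogram(v.emp, vmf))
t(sapply(vmf.sims, function(m) c(total.sill = sum(m$psill),
                                 range = m[2, "range"])))
c(total.sill = sum(vmf$psill), range = vmf[2, "range"])   # original model, for comparison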
In some situations we would like to simulate a random field defined by an assumed spatial covariance structure (e.g., as represented by a variogram model), without considering any data points. For example, for designing a sampling scheme, if we have an idea of the field structure, we can create many realizations by unconditional simulation, and see how well our proposed scheme would capture the known structure.
This is done as for conditional simulation, using the krige method. However, for unconditional simulation we specify:
that there are no data locations, by setting the loc argument to NULL;
that there are no data values, by setting the dummy argument to TRUE;
the value of the trend surface coefficient, by setting the beta argument to the presumed mean; for OK this is the overall mean, so that OK becomes Simple Kriging (SK).
The beta argument gives the expected value of the stationary field; here it is most reasonably set to the non-spatial mean of the Co concentration at the sample points.
As with conditional simulation, the nmax and/or maxdist optional arguments are often specified as well.
if (file.exists("./JuraSimU4.RData")) {
  load("./JuraSimU4.RData")
} else {
  set.seed(621)
  system.time(
    k.sim.4.u <- krige(z ~ 1,
                       loc=NULL,
                       newdata=jura.grid,
                       model=vmf,
                       nsim=4, nmax=128,
                       beta=mean(jura.pred$Co),
                       dummy=TRUE)
  )
}
if (!file.exists("./JuraSimU4.RData"))
save(k.sim.4.u, file="./JuraSimU4.RData")
Compare the means, medians, IQRs, and ranges of the four simulations:
summary(k.sim.4.u)
## Object of class SpatialPixelsDataFrame
## Coordinates:
## min max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
## cellcentre.offset cellsize cells.dim
## Xloc 0.3 0.05 97
## Yloc 0.1 0.05 117
## Data attributes:
## sim1 sim2 sim3 sim4
## Min. :-4.079 Min. :-6.670 Min. :-2.201 Min. :-3.693
## 1st Qu.: 6.816 1st Qu.: 5.896 1st Qu.: 6.663 1st Qu.: 7.340
## Median : 9.668 Median : 8.286 Median : 9.251 Median : 9.835
## Mean : 9.814 Mean : 8.253 Mean : 9.401 Mean : 9.797
## 3rd Qu.:12.896 3rd Qu.:10.791 3rd Qu.:12.146 3rd Qu.:12.341
## Max. :22.752 Max. :20.409 Max. :21.307 Max. :20.616
print(spplot(k.sim.4.u, zcol=1:4, col.regions=bpy.colors(64),
main="Unconditional simulations, Co variogram model"))
Describe the similarities and differences among the four simulations.
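A sketch to support this description, computing simple summary statistics of each unconditional simulation and of the calibration data:
round(sapply(k.sim.4.u@data, function(s) c(mean = mean(s), sd = sd(s))), 2)
round(c(mean = mean(jura.pred$Co), sd = sd(jura.pred$Co)), 2)   # sample statistics, for reference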
One use of unconditional simulation is to visualize different random fields. We begin by comparing random fields with the same variogram parameters but different forms.
Use the same variogram parameters for four different models of spatial dependence:
Simulate random fields on the Jura grid for the spherical, pentaspherical, exponential, and Gaussian models, all with the same effective range, partial sill, and nugget as the fitted model for Jura cobalt.
Recall that the effective range of the exponential model is \(3\) times its range parameter, and for the Gaussian model it is \(\sqrt{3}\) times its range parameter.
c0 <- vmf[1,"psill"]; c1 <- vmf[2,"psill"]; a <- vmf[2,"range"]
p.1 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a,2), nugget=round(c0,2), models="Sph")
p.2 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a,2), nugget=round(c0,2), models="Pen")
p.3 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a/3,2), nugget=round(c0,2), models="Exp")
p.4 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a/sqrt(3),2), nugget=round(c0,2), models="Gau")
print(p.1, split=c(1,1,2,2), more=T)
print(p.2, split=c(2,1,2,2), more=T)
print(p.3, split=c(1,2,2,2), more=T)
print(p.4, split=c(2,2,2,2), more=F)
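As a numeric check (not in the original), each model can be evaluated at the common effective range a with variogramLine(); all four should give a semivariance close to the total sill c0 + c1:
models <- list(Sph = vgm(c1, "Sph", a, c0),
               Pen = vgm(c1, "Pen", a, c0),
               Exp = vgm(c1, "Exp", a/3, c0),
               Gau = vgm(c1, "Gau", a/sqrt(3), c0))
sapply(models, function(m) variogramLine(m, dist_vector = a)$gamma)
c0 + c1   # total sill, for comparison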
Simulate; we also need to specify the mean value of the field.
mu <- mean(jura.pred$Co)
set.seed(621)
k.sim.u.sph <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
model=vgm(c1,"Sph",a,c0), nsim=1, nmax=128,
beta=mu, dummy=T)
## [using unconditional Gaussian simulation]
k.sim.u.pen <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
model=vgm(c1,"Pen",a,c0), nsim=1, nmax=128,
beta=mu, dummy=T)
## [using unconditional Gaussian simulation]
k.sim.u.exp <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
model=vgm(c1,"Exp",a/3,c0), nsim=1, nmax=128,
beta=mu, dummy=T)
## [using unconditional Gaussian simulation]
k.sim.u.gau <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
model=vgm(c1,"Gau",a/sqrt(3),c0), nsim=1,
nmax=128, beta=mu, dummy=T)
## [using unconditional Gaussian simulation]
Display the simulated fields in one figure.
k.sim.u <- k.sim.u.sph
names(k.sim.u) <- "sim.sph"
k.sim.u$sim.pen <- k.sim.u.pen$sim1
k.sim.u$sim.exp <- k.sim.u.exp$sim1
k.sim.u$sim.gau <- k.sim.u.gau$sim1
spplot(k.sim.u, zcol=c(3,4,1,2),
col.regions=bpy.colors(64))
Describe the differences between the fields simulated by the four models with different structure of spatial correlation.
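To connect the visual differences back to the models, one can (as a sketch) compute the empirical variogram of each simulated field and plot it against the model that generated it; the spherical and Gaussian cases are shown here.
# convert the simulated grid to points, as before, then compute empirical variograms
k.sim.u.pts <- as(k.sim.u, "SpatialPointsDataFrame")
v.sph <- variogram(sim.sph ~ 1, loc=k.sim.u.pts, cutoff=1.6)
v.gau <- variogram(sim.gau ~ 1, loc=k.sim.u.pts, cutoff=1.6)
plot(v.sph, model=vgm(c1, "Sph", a, c0), main="Spherical simulation")
plot(v.gau, model=vgm(c1, "Gau", a/sqrt(3), c0), main="Gaussian simulation")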
E J Pebesma. Multivariable geostatistics in S: the gstat package. Computers & Geosciences, 30(7):683–691, 2004. DOI: 10.1016/j.cageo.2004.03.012