rm(list=ls())

1 Random numbers

Draw samples from various distributions and display their histograms.

A function that can be repeated:

test_random <- function(n, binom.trials=32, binom.prob=0.5, pois.lambda=4) {
  hist(d <- runif(n), breaks=seq(0, 1, by=0.05),
       freq=FALSE, main="Uniform")
  rug(d); abline(h=1, lty=2, col="red")
  summary(d)
  hist(d <- rnorm(n, mean=0, sd=1), freq=FALSE,
       main="Normal"); rug(d)
  x <- seq(-3, 3, length=128); y=dnorm(x,0,1)
  lines(x,y, col="red")
  summary(d)
  hist(d <- rbinom(n, size=binom.trials, prob=binom.prob), 
       main="Binomial",freq=FALSE,
       breaks=seq(0,binom.trials, by=1))
  x <- seq(0, binom.trials, by=1); y=dbinom(x, binom.trials, prob=0.5)
  points(x+0.5,y, col="red", type="h")
  table(d)
  hist(d <- rpois(n, lambda=pois.lambda), freq=FALSE,
       breaks=seq(0, (3*pois.lambda), by=1), main="Poisson")
  x <- seq(0, 3*pois.lambda, by=1); y=dpois(x,pois.lambda)
  points(x+0.5,y, col="red", type="h")
  table(d)
  return()
}

Show these with one number of sample draws:

test_random(128)

## NULL

Task: run this with different sample sizes, each sample size a few times.

Q: How does the agreement of the empirical with the theoretical distribution change with increasing sample size?

Q: How does the variability of the empirical distribution change with increasing sample size?

2 Simulation

The essential idea is to assume a statistical model for the distribution of one or more variables, and then repeatedly sample from that distribution.

This random sample simulates what might be encountered in reality, with appropriate probability.

2.1 Binomial distribution

This is used for Bernoulli trials: independent events with a given probability of occurring. Each trials is either a “success” (1) or “failure” (0); we simulate the number of successes.

The classic example is flips of a fair coin:

n <- 1024
sample <- rbinom(n, size=24, prob=0.5);
head(sample, n=20)

##  [1] 12 11 11 11 13 12 17 11 11  9 11 10 12 14 11 12 11 10  6 10

(table.k <- table(sample))

## sample
##   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
##   3   8  23  49  71  99 183 153 155 125  73  52  23   5   2

plot(table.k/n, xlab="k", ylab="density")

A more interesting example is whether a certain group of the population is fairly represented in a sample.

For example: 30% of the students are male, 70% female, in a certain study. The best 20 students of the class receive a bonus or a scholarship. Do the best 20 represent this 30/70 split? Is the deviation from 6 males, 14 females (exactly matching the population) by chance?

n <- 1024
sample <- rbinom(n, size=20, prob=0.3)
(table.k <- table(sample))

## sample
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13 
##   1   4  21  74 125 145 202 192 119  77  40  19   4   1

plot(table.k/n, xlab="k", ylab="density")

Q: What is the maximum and minimum number of males selected by chance?

Q: What is the mode (most common value)? Does it match the expected value from the population?

Q: Suppose only 3 males are selected. How strong is the evidence that they are under-represented? (This could be either due to their poor performance or discrimination, we have no way to distinguish the cause.)

length(which(sample <= 3))/n

## [1] 0.09765625

2.2 Normal distribution

Suppose we want to know how large a random sample should be to reliably estimate a total.

nsim <- 256
# set up to save results
totals <- vector(mode="integer", length=nsim) # save run results
n <- 32; mu=100; s=1
for (run in 1:nsim) {
  totals[run] <- sum(rnorm(n, mean=mu, sd=s))
}
summary(totals)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3184    3197    3200    3200    3204    3216

print(paste("The actual total is", mu*n))

## [1] "The actual total is 3200"

print(paste("Sampling standard deviation of the mean:", round(sd(totals),4)))

## [1] "Sampling standard deviation of the mean: 5.7118"

hist(totals, breaks=16); rug(totals)

2.3 Mix of distributions

Practical example: risk of overloading an aircraft.

Risk of an overweight airplane on full 19-seat plane

Passengers weights assumed to follow a normal distribution
Estimate mean and standard deviation from measurements from the target population
separate distributions for males/females; hierarchical model gender binomial → gender-specific normal
Estimate mean proportion of female passengers (parameter of binomial)
Simulate number of females/males of the 19, from binomial distribution
Simulate each individual’s weight; sum all 19
Compare to maximum allowable weight; find proportion overweight

# parameters of normal distribution
# estimate these from the target population
# mean, s.d. of fe/male weights, kg
mu.m <- 80; sd.m <- 14; mu.f <- 65; sd.f <- 12
# mean proportion of female passengers: binomial \theta
prop.f.mu <- 0.35

# simulation
nsim <- 2048 # number of simulations
n.females <- vector(mode="integer", length=nsim) # save run results
wt.sum <- vector(mode="integer", length=nsim) # save run results
n <- 19 # aircract size
for (run in 1:nsim) {
  # number of fe/males
  num.f <- rbinom(n=1, size=n, prob=prop.f.mu)
  num.m <- n - num.f
  n.females[run] <- num.f
  wts.f <- rnorm(num.f, mean=mu.f, sd=sd.f)
  wts.m <- rnorm(num.m, mean=mu.m, sd=sd.m)
  wt.sum[run] <- ceiling(sum(wts.f) + sum(wts.m))
  }
summary(wt.sum)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1216    1375    1420    1421    1464    1666

summary(n.females)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   7.000   6.687   8.000  14.000

# Fairchild Metro II: empty 3380 kg, max takeoff 5670kg, total load 2290 kg
load.wt <- 2290; pilots.wt <- 200; fuel.wt <- 600
max.load <- load.wt-pilots.wt-fuel.wt

(n.overweight <- sum(wt.sum > max.load))

## [1] 293

# probabilty of overweight
(prob.overweight <- round(n.overweight/nsim,3))

## [1] 0.143

# figures
# number of females per flight
plot(as.vector(table(n.females)), type="h", xlim=c(0,19),
     ylab="frequency", xlab="females per 19 passengers", lwd=2)
grid()

# number of flights overweight
hist(wt.sum, main="", xlab="Total passenger weight, kg", breaks=24); rug(wt.sum)
abline(v=(2290-800), col="red")

3 Geostatistical simulation

3.1 Sampling

We can simulate the placement of random samples

3.2 Random fields

There are several obvious problems with kriging predictions over an area, with respect to uncertainty:

Kriging is an exact predictor at known points, because all the weight is given to the known point; this is mathematically necessary but not realistic, since just away from the point the predictions are weighted;
Kriging prediction maps are by definition smooth, even if there is a nugget component to the model; the actual spatial field is usually much rougher.

Both of these problems are serious issues for spatially-distributed models. An example is the soil hydraulic conductivity in models of groundwater flow: the actual erratic nature of this attribute leads to much different model outputs than a smoothly-varying field.

So, a map produced by kriging gives an unrealistic view of the fine-scale spatial variability. We can recover this with conditional simulation: this shows one (or many) possible realizations of the spatial field:

as defined by the covariance structure (variogram); and
as constrained by the known data values

What is geostatistical simulation?

The theory of regionalized variables assumes that the values we observe come from some random process
simplest case: one expected value (first-order stationarity) with a spatially-correlated error that is the same over the whole area (second-order stationarity).
The one reality we observe is the results of a random process
There are “alternative realities”; that is, spatial patterns that, by this theory, could have occurred in another realization of the same spatial process.
the construction of a gridded surface corresponding to a random function, i.e., model of spatial correlation
the statistical properties of the surface match those of the sample: spatial mean, spatial variance, semivariogram (model, partial sill, nugget variance, range parameter)
Gaussian simulation assumes that the target field is multivariate Gaussian, with a defined stationary spatial mean and covariance structure
This generates multiple, equally probable “realities”, i.e., the spatial distribution of the target attribute

There are various geostatistical simulation algorithms; that used in gstat is described by Pebesma (2004).

library(sp)
library(gstat)
library(lattice)

data(jura)
str(jura.pred)

## 'data.frame':    259 obs. of  13 variables:
##  $ Xloc   : num  2.39 2.54 2.81 4.31 4.38 ...
##  $ Yloc   : num  3.08 1.97 3.35 1.93 1.08 ...
##  $ long   : num  6.85 6.85 6.86 6.88 6.88 ...
##  $ lat    : num  47.1 47.1 47.1 47.1 47.1 ...
##  $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 3 2 2 3 3 3 3 3 3 3 ...
##  $ Rock   : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 3 2 3 2 5 5 5 1 1 3 ...
##  $ Cd     : num  1.74 1.33 1.61 2.15 1.56 ...
##  $ Co     : num  9.32 10 10.6 11.92 16.32 ...
##  $ Cr     : num  38.3 40.2 47 43.5 38.5 ...
##  $ Cu     : num  25.72 24.76 8.88 22.7 34.32 ...
##  $ Ni     : num  21.3 29.7 21.4 29.7 26.2 ...
##  $ Pb     : num  77.4 77.9 30.8 56.4 66.4 ...
##  $ Zn     : num  92.6 73.6 64.8 90 88.4 ...

str(jura.val)

## 'data.frame':    100 obs. of  13 variables:
##  $ Xloc   : num  2.67 3.59 4.01 2.94 1.41 ...
##  $ Yloc   : num  3.56 4.44 4.71 3.14 2.75 ...
##  $ long   : num  6.85 6.87 6.87 6.86 6.84 ...
##  $ lat    : num  47.1 47.2 47.2 47.1 47.1 ...
##  $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 3 3 2 2 3 1 1 2 3 1 ...
##  $ Rock   : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 5 1 1 5 3 2 2 3 2 2 ...
##  $ Cd     : num  1.57 2.045 1.203 0.49 0.692 ...
##  $ Co     : num  8.28 10.8 12 10.92 8.12 ...
##  $ Cr     : num  37.1 40.8 53.2 23.4 27.2 ...
##  $ Cu     : num  18.6 11.48 13.04 5.64 10.32 ...
##  $ Ni     : num  18.6 21.5 23.9 14.6 14.6 ...
##  $ Pb     : num  38.2 33.4 26.6 25.9 31.2 ...
##  $ Zn     : num  65.2 112.8 91.6 41.2 50.4 ...

str(jura.grid)

## 'data.frame':    5957 obs. of  6 variables:
##  $ Xloc   : num  0.3 0.35 0.35 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ Yloc   : num  1.7 1.7 1.75 1.7 1.75 1.8 1.85 1.9 2.1 2.15 ...
##  $ long   : num  6.82 6.82 6.82 6.82 6.82 ...
##  $ lat    : num  47.1 47.1 47.1 47.1 47.1 ...
##  $ Landuse: Factor w/ 4 levels "Forest","Pasture",..: 2 2 2 2 1 1 1 1 3 3 ...
##  $ Rock   : Factor w/ 5 levels "Argovian","Kimmeridgian",..: 3 3 3 2 2 2 3 3 1 1 ...

coordinates(jura.pred) <- ~ Xloc + Yloc
class(jura.pred)

## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"

coordinates(jura.grid) <- ~ Xloc + Yloc
class(jura.grid)

## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"

gridded(jura.grid) <- TRUE
class(jura.grid)

## [1] "SpatialPixelsDataFrame"
## attr(,"package")
## [1] "sp"

Target variable:

print(xyplot(Yloc ~ Xloc, as.data.frame(jura.pred),
             cex=1.2*jura.pred$Co/max(jura.pred$Co), pch=1,
             asp="iso", xlab="East", ylab="North",
             main="Co concentration in topsoils, Jura"))

Examine local spatial correlation with an empirical variogram:

v <- variogram(Co ~ 1, loc=jura.pred, cutoff=1.6)
plot(v, plot.numbers=TRUE, xlab="separation distance")

Fit a variogrm model, i.e., model of spatial correlation:

(vmf <- fit.variogram(v, vgm(12.5, "Sph", 1.2, 1.5)))

##   model     psill    range
## 1   Nug  1.066698 0.000000
## 2   Sph 12.949998 1.181867

plot(v, plot.numbers=TRUE, xlab="separation distance", model=vmf)

This is a good model fit.

Predict over the study area with this model using OK:

k.o <- krige(Co ~ 1, loc=jura.pred, newdata=jura.grid, model=vmf)

## [using ordinary kriging]

summary(k.o)

## Object of class SpatialPixelsDataFrame
## Coordinates:
##        min   max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA 
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
##      cellcentre.offset cellsize cells.dim
## Xloc               0.3     0.05        97
## Yloc               0.1     0.05       117
## Data attributes:
##    var1.pred         var1.var     
##  Min.   : 2.876   Min.   : 1.503  
##  1st Qu.: 7.853   1st Qu.: 3.453  
##  Median :10.118   Median : 4.040  
##  Mean   : 9.547   Mean   : 4.308  
##  3rd Qu.:11.419   3rd Qu.: 4.450  
##  Max.   :16.296   Max.   :13.902

View a map of the predictions and their standard deviations.

spplot(k.o, zcol="var1.pred", main="Co, ppm", sub="Ordinary Kriging prediction")

spplot(k.o, zcol="var1.var", formula=sqrt(var1.var) ~ Xloc + Yloc,
       main="Co, ppm", sub="Ordinary Kriging prediction standard deviation",
       col.regions=topo.colors(64))

This is the best single map, but we see that there is uncertainty.

3.3 Conditional simulation

Many simulated fields can be created, each equally valid, and used as model inputs.

Make four realizations of a conditional simulation of the Co concentration in Jura soils over the prediction grid jura.grid, using the OK model. •

The krige method can also do conditional simulations. It requires one optional argument, nsim, the number of simulations.

For each set of simulations in this section we use set.seed so your results will look the same as ours; in practice you would only do this if you want to make sure to have the same starting point.

Parameter nsim is the number of conditional simulations.

if (file.exists("./JuraSim4.RData")) 
  { load("./JuraSim4.Rdata") } else
  { set.seed(621)
    system.time(
    k.sim.4 <- krige(Co ~ 1, loc = jura.pred,
                     newdata = jura.grid,
                     model=vmf, nsim=4, nmax=128) 
    )
    }

Save the simulation results – this took a lot of time!

if (!file.exists("./JuraSim4.RData"))
  save(k.sim.4, file="./JuraSim4.RData")

Compare the means, medians, IQR and ranges of the four simulations, also with the calibration data set on which the simulation was based.

summary(k.sim.4)

## Object of class SpatialPixelsDataFrame
## Coordinates:
##        min   max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA 
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
##      cellcentre.offset cellsize cells.dim
## Xloc               0.3     0.05        97
## Yloc               0.1     0.05       117
## Data attributes:
##       sim1             sim2             sim3              sim4        
##  Min.   :-1.357   Min.   :-3.695   Min.   :-0.9183   Min.   :-0.8696  
##  1st Qu.: 7.627   1st Qu.: 7.311   1st Qu.: 7.3350   1st Qu.: 7.6448  
##  Median :10.170   Median : 9.995   Median : 9.7845   Median : 9.8988  
##  Mean   : 9.830   Mean   : 9.571   Mean   : 9.5288   Mean   : 9.7063  
##  3rd Qu.:12.176   3rd Qu.:12.037   3rd Qu.:11.8271   3rd Qu.:11.8868  
##  Max.   :20.470   Max.   :20.250   Max.   :20.0087   Max.   :19.4015

summary(jura.pred$Co)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.552   6.520   9.760   9.303  11.980  17.720

Q: How do the ranges of the four simulations compare with the range of the data?

Visualize:

Display the four conditional simulations in one plot, with the locations of the the observation points over-printed.

layout.2 <- list("sp.points", jura.pred, pch=1, cex=1.4*jura.pred$Co/max(jura.pred$Co), col="white")
print(spplot(k.sim.4, zcol=1:4, col.regions=bpy.colors(64), 
       main="Conditional simulations, Jura Co (ppm), OK",
       sp.layout=list(layout.2)))

Simulation is supposed to respect the spatial structure; that is, the simulated field should have the same structure as the variogram model. Let’s see how well it was reproduced.

Compute the empirical variograms of the four simulations, and plot them, each with the fitted model vmf. These have to be converted to points rather than pixels.

k.sim.4.pts <- as( k.sim.4, "SpatialPointsDataFrame")
vs.1 <- variogram(sim1 ~ 1, loc=k.sim.4.pts)
vs.2 <- variogram(sim2 ~ 1, loc=k.sim.4.pts)
vs.3 <- variogram(sim3 ~ 1, loc=k.sim.4.pts)
vs.4 <- variogram(sim4 ~ 1, loc=k.sim.4.pts)
gamma.max=ceiling(max(vs.1$gamma, vs.2$gamma, vs.3$gamma, vs.4$gamma, sum(vmf[,"psill"])))
p.1 <- plot(vs.1, model=vmf, ylim=c(0,gamma.max), main="Simulation 1")
p.2 <- plot(vs.2, model=vmf, ylim=c(0,gamma.max), main="Simulation 2")
p.3 <- plot(vs.3, model=vmf, ylim=c(0,gamma.max), main="Simulation 3")
p.4 <- plot(vs.4, model=vmf, ylim=c(0,gamma.max), main="Simulation 4")
print(p.1, split=c(1,1,2,2), more=T)
print(p.2, split=c(1,2,2,2), more=T)
print(p.3, split=c(2,1,2,2), more=T)
print(p.4, split=c(2,2,2,2), more=F)

How well do the simulations reproduce the variogram?

Do the empirical variograms of the four simulations match the variogram model? What are the differences? Why?

3.4 Unconditional simulation

In some situations we would like to simulate a random field defined by an assumed spatial covariance structure (e.g., as represented by a variogram model), without considering any data points. For example, for designing a sampling scheme, if we have an idea of the field structure, we can create many realizations by unconditional simulation, and see how well our proposed scheme would capture the known structure.

This is as the conditional simulation, using the krige method. However, for unconditional simulation we specify:

that the data locations are missing, by setting the loc argument to NULL;
that there is no data, by setting the dummy argument to TRUE;
the value of the trend surface coefficients, by setting the beta argument to the presumed mean; for OK this is the mean, so that OK becomes Simple Kriging (SK).

The beta argument gives the expected value of the stationary field; here it is most reasonably set to the non-spatial mean of the Co concentration at the sample points.

As with conditional simulation, the nmax and/or maxdist optional argu- ments are often specified as well.

if (file.exists("./JuraSimU4.RData")) 
  { load("./JuraSimU4.RData") } else
  { set.seed(621)
    system.time(k.sim.4.u <- krige(z ~ 1,
                                   loc=NULL,
                                   newdata=jura.grid,
                                   model=vmf,
                                   nsim=4, nmax=128,
                                   beta=mean(jura.pred$Co),
                                   dummy=T) 
    )
    }

if (!file.exists("./JuraSimU4.RData"))
  save(k.sim.4.u, file="./JuraSimU4.RData")

Compare the means, medians, IQR and ranges of the four simulations

summary(k.sim.4.u)

## Object of class SpatialPixelsDataFrame
## Coordinates:
##        min   max
## Xloc 0.275 5.125
## Yloc 0.075 5.925
## Is projected: NA 
## proj4string : [NA]
## Number of points: 5957
## Grid attributes:
##      cellcentre.offset cellsize cells.dim
## Xloc               0.3     0.05        97
## Yloc               0.1     0.05       117
## Data attributes:
##       sim1             sim2             sim3             sim4       
##  Min.   :-4.079   Min.   :-6.670   Min.   :-2.201   Min.   :-3.693  
##  1st Qu.: 6.816   1st Qu.: 5.896   1st Qu.: 6.663   1st Qu.: 7.340  
##  Median : 9.668   Median : 8.286   Median : 9.251   Median : 9.835  
##  Mean   : 9.814   Mean   : 8.253   Mean   : 9.401   Mean   : 9.797  
##  3rd Qu.:12.896   3rd Qu.:10.791   3rd Qu.:12.146   3rd Qu.:12.341  
##  Max.   :22.752   Max.   :20.409   Max.   :21.307   Max.   :20.616

print(spplot(k.sim.4.u, zcol=1:4, col.regions=bpy.colors(64), 
       main="Unconditional simulations, Co variogram model"))

Describe the similarities and differences among the four simulations.

3.5 Simulation different spatial structures

One use of unconditional simulation is to visualize different random fields. We begin by comparing random fields with the same variogram parameters but different forms.

Use the same variogram parameters for four different models of spatial dependence:

Simulate random fields on the Jura grid for spherical, pentaspher- ical, exponential, Gaussian models, all with the same effective range, partial sill and nugget as the fitted model for Jura cobalt.

Recall that the effective range of the exponential model is $3$ times the range parameter, and for the Gaussian model it is $3 $ times.

c0 <- vmf[1,"psill"]; c1 <- vmf[2,"psill"]; a <- vmf[2,"range"]
p.1 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a,2), nugget=round(c0,2), models="Sph")
p.2 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a,2), nugget=round(c0,2), models="Pen")
p.3 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a/3,2), nugget=round(c0,2), models="Exp")
p.4 <- show.vgms(max=round(a*1.4,2), sill=round(c1,2), range=round(a/sqrt(3),2), nugget=round(c0,2), models="Gau")
print(p.1, split=c(1,1,2,2), more=T)
print(p.2, split=c(2,1,2,2), more=T)
print(p.3, split=c(1,2,2,2), more=T)
print(p.4, split=c(2,2,2,2), more=F)

Simulate; we also need to specify the mean value of the field.

mu <- mean(jura.pred$Co)
set.seed(621)
k.sim.u.sph <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
                     model=vgm(c1,"Sph",a,c0), nsim=1, nmax=128,
                     beta=mu, dummy=T)

## [using unconditional Gaussian simulation]

k.sim.u.pen <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
                     model=vgm(c1,"Pen",a,c0), nsim=1, nmax=128,
                     beta=mu, dummy=T)

## [using unconditional Gaussian simulation]

k.sim.u.exp <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
                     model=vgm(c1,"Exp",a/3,c0), nsim=1, nmax=128,
                     beta=mu, dummy=T)

## [using unconditional Gaussian simulation]

k.sim.u.gau <- krige(z ~ 1, loc=NULL, newdata=jura.grid,
                     model=vgm(c1,"Gau",a/sqrt(3),c0), nsim=1,
                     nmax=128, beta=mu, dummy=T)

## [using unconditional Gaussian simulation]

Display the simulated fields in one figure.

k.sim.u <- k.sim.u.sph
names(k.sim.u) <- "sim.sph"
k.sim.u$sim.pen <- k.sim.u.pen$sim1
k.sim.u$sim.exp <- k.sim.u.exp$sim1
k.sim.u$sim.gau <- k.sim.u.gau$sim1
spplot(k.sim.u, zcol=c(3,4,1,2),
       col.regions=bpy.colors(64))

Describe the differences between the fields simulated by the four models with different structure of spatial correlation.

4 References

E J Pebesma. Multivariable geostatistics in S: the gstat package. Computers & Geosciences, 30(7):683–691, 2004. DOI: 10.1016/j.cageo.2004.03.012

(Geo-)statistical simulation

D G Rossiter

2019-08-04