## ----setup, include=FALSE, cache=FALSE-----------------------------------
# set global chunk options
## this code is only required to set up knitr for reproducible documentation
## it has nothing to do with the exercise itself, please ignore!
opts_knit$set(aliases=c(h = 'fig.height', w = 'fig.width'), out.format='latex', use.highlight=TRUE)
opts_chunk$set(fig.path='kgraph/exRKGLS', fig.align='center', fig.show='hold', prompt=FALSE,
               fig.width=6, fig.height=6, out.width='.7\\linewidth', size='scriptsize')
## comment=NA, 
## this does not comment out the output
                                        # options to be read by formatR
options(replace.assign=TRUE,width=70)


## ----echo=FALSE, results='hide'------------------------------------------
rm(list=ls())


## ----child-data, child='exRKGLS_data.Rnw'--------------------------------

## ----data-set-parent, echo=FALSE, cache=FALSE----------------------------
set_parent('exRKGLS.Rnw')


## ----setwd-mac-no, eval=FALSE--------------------------------------------
## setwd("~/ds/NEweather")


## ----setwd-win, eval=FALSE-----------------------------------------------
## setwd("M://css6200/dgr2/datasets/NEweather")


## ----list-files-no, eval=F-----------------------------------------------
## list.files(".", pattern=".shp$")


## ----list-files-yes, echo=F, eval=T--------------------------------------
## this is the file specification on the author's system; he organizes datasets into a subdirectory named `ds', and then
##   each data source into its own sub-subdirectory
list.files("./ds/NEweather", pattern=".shp$")


## ----load-packages-------------------------------------------------------
library(sp)
library(rgdal)


## ----load-all-usa-no, eval=FALSE-----------------------------------------
## usa <- readOGR(dsn=".", layer="gdd50_7100j",
##                integer64="allow.loss")


## ----load-all-usa-yes, echo=FALSE----------------------------------------
## this is for the author's system
usa <- readOGR(dsn="./ds/NEweather", layer="gdd50_7100j",
               integer64="allow.loss")


## ----str-all-use---------------------------------------------------------
bbox(usa)
names(usa)
strwrap(proj4string(usa))


## ----num-to-char-ids-----------------------------------------------------
sapply(usa@data, "class")
usa$STATION_ID <- as.character(usa$STATION_ID)
usa$STATION_NA <- as.character(usa$STATION_NA)
usa$STN_NAME <- as.character(usa$STN_NAME)
usa$OID_<- as.character(usa$OID_)
usa$COOP_ID <- as.character(usa$COOP_ID)
summary(usa)


## ----list-states---------------------------------------------------------
length(levels(usa$STATE))
levels(usa$STATE)


## ----select-ne-----------------------------------------------------------
dim(usa)[1]  # number of rows in the full dataset
ix <- (usa$STATE %in% c("NY","VT","NJ","PA"))  # logical expression
str(ix)      # each row is either TRUE or FALSE
sum(ix)      # number of rows selected
head(which(ix))  # first six row numbers of stations in these states
ne <- usa[ix,]  # select only these states' records
dim(ne)[1]   # number of rows in the restricted dataset


## ----adjust-levels-------------------------------------------------------
class(ne$STATE)
levels(ne$STATE)
ne$STATE <- factor(ne$STATE)
levels(ne$STATE)


## ----new-bbox------------------------------------------------------------
bbox(ne)


## ----check-zerodist------------------------------------------------------
length(ix <- zerodist(ne))


## ----check-zerodist-usa--------------------------------------------------
length(ix <- zerodist(usa))
usa@data[ix,1:5]


## ----adjust-coords-------------------------------------------------------
coordinates(usa)[ix[1],]; coordinates(usa)[ix[4],]
usa@data[ix[1],4:5]; usa@data[ix[4],4:5]
usa@coords[ix[4],] <- c(-84.99, 40.42)
usa@data[ix[4],4:5] <- c(40.42, -84.99)  # this is the order in the dataframe


## ----eval=FALSE----------------------------------------------------------
## usa <- (usa[-ix[4],])


## ----show-proj4----------------------------------------------------------
strwrap(proj4string(ne))


## ----list-projs----------------------------------------------------------
head(projInfo(type="proj"))


## ----find-albers-parameters----------------------------------------------
round(median(bbox(ne)[1,]))
floor(bbox(ne)[2,1])+1
ceiling(bbox(ne)[2,2])-1


## ----make-albers---------------------------------------------------------
ne.crs <-
  CRS("+proj=aea +lat_0=42.5 +lat_1=39 +lat_2=44 +lon_0=-76 +ellps=WGS84 +units=m")
ne.m <- spTransform(ne, ne.crs)


## ----change-coordnames---------------------------------------------------
coordnames(ne.m) <- c("E","N")
bbox(ne.m)


## ----convert-to-df-------------------------------------------------------
ne.df <- as(ne.m, "data.frame")
names(ne.df)


## ----get-state-boundaries-no, eval=FALSE---------------------------------
## state <- readOGR(dsn=".", "cb_2014_us_state_500k")
## summary(state)


## ----get-state-boundaries-yes, eval=T, echo=F----------------------------
## for the author's system
state <- readOGR(dsn="./ds/NEweather", "cb_2014_us_state_500k")
summary(state)


## ----limit-to-ne---------------------------------------------------------
state.ne <- state[state$STUSPS %in% c('NY','NJ', 'PA', 'VT'),]
summary(state.ne)


## ----state-CRS-----------------------------------------------------------
proj4string(ne.m)
state.ne.m <- spTransform(state.ne, proj4string(ne.m))
proj4string(state.ne.m)
coordnames(state.ne.m) <- c("E","N")
bbox(state.ne.m)


## ----save-projected-stations-no, eval=FALSE------------------------------
## save(ne, ne.m, ne.df, state.ne, state.ne.m, ne.crs, file="./ne_stations.RData")


## ----save-projected-stations-yes, eval=T, echo=F-------------------------
## this is for the author's system
save(ne, ne.m, ne.df, state.ne, state.ne.m, ne.crs, file="./ds/NEweather/ne_stations.RData")


## ----child-data, child='exRKGLS_explore.Rnw'-----------------------------

## ----explore-set-parent, echo=FALSE, cache=FALSE-------------------------
set_parent('exRKGLS.Rnw')


## ----load-packages-explore-----------------------------------------------
library(sp)
library(rgdal)


## ----load-saved-dataset, eval=FALSE--------------------------------------
## load(file="./ne_stations.RData", verbose=TRUE)


## ----load-saved-dataset-yes, eval=TRUE, echo=FALSE-----------------------
rm(list=ls()) # make sure this is all the data we need to go forward
load(file="./ds/NEweather/ne_stations.RData", verbose=TRUE)


## ----summarize-ds--------------------------------------------------------
summary(ne.df[,c("ANN_GDD50","ELEVATION_")])


## ----show-stations, out.width='\\linewidth', w=8, h=8, fig.cap="Location of weather stations"----
plot(coordinates(ne.m), asp=1, xlab="E", ylab="N", pch=20)
text(coordinates(ne.m),
     labels=substr(ne.m$STATION_NA, 1, 4), cex=0.5,
     pos=2, col=as.numeric(ne.m$STATE))
 ## add state boundaries if available
 ## otherwise omit the next command
plot(state.ne.m, add=TRUE, border="darkgray", lwd=2)
grid()


## ----load-ggplot2--------------------------------------------------------
library(ggplot2)


## ----display-annual, out.width='\\textwidth', w=11, h=10, fig.cap="Annual Growing Degree days, 50F"----
ggplot(data=ne.df) +
    aes(x=E, y=N) +
    geom_point(aes(size=ANN_GDD50,
                   colour=ELEVATION_),
               shape=20) +
        xlab("E") +  ylab("N") + coord_fixed() 


## ----export-to-kml-------------------------------------------------------
ne.wgs84 <- spTransform(ne.m, CRS("+init=epsg:4326"))
            # this is the EPSG code for global WGS84 long/lat
library(plotKML)
shape = "http://maps.google.com/mapfiles/kml/pal2/icon18.png"
station.names = substr(ne.wgs84$STATION_NA,1,12)
kml_open(file.name='ne.stations.kml')
kml_layer(ne.wgs84, colour=ANN_GDD50,
   colour_scale=SAGA_pal[[1]], shape=shape,
   points_names=station.names)
kml_close('ne.stations.kml')


## ----child-data, child='exRKGLS_OLS.Rnw'---------------------------------

## ----OLS-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----scatter-covars------------------------------------------------------
p1 <- ggplot() +
    geom_point(
        aes(x=ELEVATION_, y=ANN_GDD50, colour=STATE),
            data=ne.df
    )
p2 <- ggplot() +
    geom_point(
        aes(x=E, y=ANN_GDD50, colour=STATE),
            data=ne.df
    ) + xlab("Easting")
p3 <- ggplot() +
    geom_point(
        aes(x=N, y=ANN_GDD50, colour=STATE),
            data=ne.df
    ) + xlab("Northing")


## ----print-scatter-covars, out.width='0.33\\linewidth'-------------------
print(p3)
print(p2)
print(p1)


## ----scatter-xform, out.width='0.48\\linewidth'--------------------------
ggplot() +
    geom_point(
        aes(x=sqrt(ELEVATION_), y=ANN_GDD50, colour=STATE),
            data=ne.df
    )


## ----cor-preds-----------------------------------------------------------
with(ne.df, cor(ANN_GDD50, N))
with(ne.df, cor(ANN_GDD50, E))
with(ne.df, cor(ANN_GDD50, sqrt(ELEVATION_)))


## ----build-naive-elev----------------------------------------------------
m.ols.elev <- lm(ANN_GDD50 ~ sqrt(ELEVATION_), data=ne.df)
summary(m.ols.elev)


## ----build-naive-two-----------------------------------------------------
m.ols <- lm(ANN_GDD50 ~ sqrt(ELEVATION_) + N, data=ne.df)
summary(m.ols)


## ----plot-lm-fits--------------------------------------------------------
plot(ne.m$ANN_GDD50 ~ fitted(m.ols),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by OLS", ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----dx-naive, out.width='\\textwidth', w=12, h=4------------------------
par(mfrow=c(1,3))
plot(m.ols, which=c(1,2,5))
par(mfrow=c(1,1))


## ----find-min-max-ols-resid----------------------------------------------
ne.df$ols.resid <- residuals(m.ols)
(ix <- c(which.max(ne.df$ols.resid), which.min(ne.df$ols.resid)))
ne.df[ix,]
ne.df[ix,c("LATITUDE_D", "LONGITUDE_")]


## ----correct-it----------------------------------------------------------
ne.m[ix[1],"ELEVATION_"] <- ne.m[ix[1],"ELEV_FT"] <- 1192
ne.df[ix[1],"ELEVATION_"] <- ne.df[ix[1],"ELEV_FT"] <- 1192
ne.m[ix[2],"ELEVATION_"] <- ne.m[ix[2],"ELEV_FT"] <- 1141
ne.df[ix[2],"ELEVATION_"] <- ne.df[ix[2],"ELEV_FT"] <- 1141


## ----re-run-ols----------------------------------------------------------
m.ols <- lm(ANN_GDD50 ~ sqrt(ELEVATION_) + N, data=ne.df)
summary(m.ols)


## ----ols-model-dx2, out.width='\\textwidth', w=12, h=4-------------------
par(mfrow=c(1,3))
plot(m.ols, which=c(1,2,5))
par(mfrow=c(1,1))


## ----plot-lm-fits-2------------------------------------------------------
plot(ne.m$ANN_GDD50 ~ fitted(m.ols),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by OLS (corrected data)",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----ols-resid-summary---------------------------------------------------
summary(residuals(m.ols))


## ----add-east, eval=FALSE, echo=FALSE, results="hide", fig.keep='none'----
## ## this code would be useful to answer the challenge
## m.ols.ts1 <- lm(ANN_GDD50 ~ N + E, data=ne.df)
## m.ols.ts1.z <- lm(ANN_GDD50 ~ N + E + sqrt(ELEVATION_), data=ne.df)
## m.ols.ts1.z.i <- lm(ANN_GDD50 ~ (N + E)*sqrt(ELEVATION_), data=ne.df)
## summary(m.ols.ts1)
## summary(m.ols)
## summary(m.ols.ts1.z)
## summary(m.ols.ts1.z.i)
## anova(m.ols.ts1.z.i, m.ols.ts1.z, m.ols.ts1)
## anova(m.ols.ts1.z, m.ols)
## AIC(m.ols.ts1.z.i, m.ols.ts1.z, m.ols.ts1, m.ols)
## par(mfrow=c(1,3))
## plot(m.ols.ts1.z.i, which=c(1,2,5))
## par(mfrow=c(1,1))
## ix <- which(abs(residuals(m.ols.ts1.z)) > 400)
## ne.m$ols.ts1.z.resid <- ne.df$ols.ts1.z.resid <- residuals(m.ols.ts1.z)
## ne.df[ix,c("STATION_NA","STATE","ELEVATION_","N","E", "ols.ts1.z.resid")]
## bubble(ne.m, zcol="ols.ts1.z.resid", pch=1,
##        main="Residuals from trend surface/elevation model")
## bubble(ne.m, zcol="ols.resid", pch=1,
##        main="Residuals from N/elevation model")
## ne.m$diff.resid.ols.ts1.z <- ne.df$diff.resid.ols.ts1.z <- (ne.m$ols.ts1.z.resid - ne.m$ols.resid)
## bubble(ne.m, zcol="diff.resid.ols.ts1.z", pch=1,
##        main="Difference of residuals: trend surface/elevation - Northing/elevation ")


## ----add-OLS-resid-df----------------------------------------------------
ne.m$ols.resid <- residuals(m.ols)


## ----bubble-naive--------------------------------------------------------
bubble(ne.m, zcol="ols.resid", pch=1,
       main="Residuals from OLS fit, actual - predicted")


## ----load-gstat----------------------------------------------------------
library(gstat)


## ----vgm-ols-resid, out.width='0.5\\textwidth'---------------------------
v.r.ols <- variogram(ols.resid ~ 1, locations=ne.m,
                     cutoff=100000, width=16000)
plot(v.r.ols, pl=T)


## ----vgm-ols-resid-model, out.width='0.5\\textwidth'---------------------
(vmf.r.ols <- fit.variogram(v.r.ols, vgm(15000, "Exp", 20000, 20000)))
plot(v.r.ols, pl=T, model=vmf.r.ols)


## ----vgm-ols-cloud, out.width='0.5\\textwidth'---------------------------
vc <- variogram(ols.resid ~ 1, locations=ne.m, cutoff=12000, cloud=T)
plot(vc, pch=20, cex=2)


## ----why-ols-resid-diff--------------------------------------------------
vc.df <- as.data.frame(vc)
vc.close <- subset(vc.df, vc.df$dist < 8000)
## sort by separation, look for anomalies.
vc.close[order(vc.close$dist),c("dist","gamma","left","right")]


## ------------------------------------------------------------------------
print(ne.m@data[c(106,107),c("STATE","STATION_NA","LATITUDE_D","LONGITUDE_",
                             "ELEVATION_", "ANN_GDD50","ols.resid")])


## ----child-data, child='exRKGLS_GLS.Rnw'---------------------------------

## ----GLS-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----fit-reml------------------------------------------------------------
library(nlme)
vmf.r.ols[1:2,]
m.gls <- gls(model=ANN_GDD50 ~ sqrt(ELEVATION_) + N,
             data=ne.df,
             correlation=corExp(
                 value=c(vmf.r.ols[2,"range"]),
                 form=~E + N,
                 nugget=FALSE))


## ----summary-reml--------------------------------------------------------
summary(m.gls)


## ----intervals-reml------------------------------------------------------
intervals(m.gls, level=0.95)$coef


## ----summary-intervals-coef----------------------------------------------
intervals(m.gls, level=0.95)$corStruct


## ----compare-gls-ols-corr------------------------------------------------
intervals(m.gls, level=0.95)$corStruct
vmf.r.ols[2,"range"]


## ----plot-gls-fits-------------------------------------------------------
plot(ne.m$ANN_GDD50 ~ predict(m.gls),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by GLS",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----fit-reml-resid------------------------------------------------------
ne.m$gls.resid <- residuals(m.gls)
summary(ne.m$gls.resid)
bubble(ne.m, zcol="gls.resid", pch=1,
       main="Residuals from GLS fit, actual - predicted")


## ----gls-resid-vgm, out.width='0.5\\textwidth'---------------------------
v.r.gls <- variogram(gls.resid ~ 1, ne.m, cutoff=120000, width=12000)
(vmf.r.gls <- vgm(psill=var(ne.m$gls.resid),
                      model="Exp",
                      range=intervals(m.gls)$corStruct["range","est."],
                      nugget=TRUE))
plot(v.r.gls, pl=T, model=vmf.r.gls)


## ----show-spherical------------------------------------------------------
plot(v.r.ols, pl=T)


## ----fit-spherical-------------------------------------------------------
(vmf.r.ols.sph <- fit.variogram(v.r.ols,
                                vgm(psill=20000, model="Sph",
                                    range=40000, nugget=15000)))
plot(v.r.ols, pl=T, model=vmf.r.ols.sph)


## ----compute-prop-nugget-------------------------------------------------
(prop.nugget <- vmf.r.ols.sph[1,"psill"]/sum(vmf.r.ols.sph[,"psill"]))


## ----fit-reml-with-nugget------------------------------------------------
m.gls.2 <- gls(model=ANN_GDD50 ~ sqrt(ELEVATION_) + N,
             data=ne.df,
             correlation=corSpher(
                 value=c(vmf.r.ols.sph[2,"range"], prop.nugget),
                 form=~E + N,
                 nugget=TRUE))


## ----compare-corr-structs------------------------------------------------
intervals(m.gls, level=0.95)$corStruct
intervals(m.gls.2, level=0.95)$corStruct


## ----compare-ranges------------------------------------------------------
intervals(m.gls)$corStruct["range", 2]*3
intervals(m.gls.2)$corStruct["range", 2]


## ----compare-regr-coeff--------------------------------------------------
intervals(m.gls, level=0.95)$coef[,2]
intervals(m.gls.2, level=0.95)$coef[,2]


## ----compare-coeff-------------------------------------------------------
coefficients(m.gls)
coefficients(m.ols)
round(100*(coefficients(m.gls) - coefficients(m.ols))/coefficients(m.ols),2)


## ----bubble-diffs--------------------------------------------------------
ne.m$diff.gls.ols.resid <- (ne.m$gls.resid - ne.m$ols.resid)
summary(ne.m$diff.gls.ols.resid)
bubble(ne.m, zcol="diff.gls.ols.resid", pch=1,
       main="GLS residual - OLS residual",
       col=c("#EF8A62","#67A9CF"))


## ----scatter-diff, out.width='\\linewidth', w=10, h=6--------------------
ggplot() + 
    geom_point(aes(x=sqrt(ne.m$ELEVATION_), y=ne.m$diff.gls.ols.resid,
                   size=ne.m$ANN_GDD50, colour=ne.m$STATE)) +
                       xlab("Sqrt(Elevation)") + 
                           ylab("GLS - OLS residual") +
                               geom_hline(yintercept=0, size=1.2)


## ----big-diff-resid------------------------------------------------------
ix <- which.max(ne.m$diff.gls.ols.resid)
ne.df[ix,c("STATION_NA","STATE","ELEVATION_","N","E")]
ix <- which.min(ne.m$diff.gls.ols.resid)
ne.df[ix,c("STATION_NA","STATE","ELEVATION_","N","E")]


## ----child-data, child='exRKGLS_grid.Rnw'--------------------------------

## ----grid-set-parent, echo=FALSE, cache=FALSE----------------------------
set_parent('exRKGLS.Rnw')


## ----load-1km-dem-no, eval=FALSE, echo=TRUE------------------------------
## library(raster)
## dem.1km <- raster("./srtm_1km_48.asc")


## ----load-1km-dem-yes, eval=TRUE, echo=FALSE-----------------------------
## this is for the author's system
library(raster)
dem.1km <- raster("./ds/SRTM_1km_ASC/srtm_1km_48.asc")


## ----show-1km-dem--------------------------------------------------------
class(dem.1km)
extent(dem.1km)
projection(dem.1km)
proj4string(dem.1km)
res(dem.1km)
nrow(dem.1km)
ncol(dem.1km)
ncell(dem.1km)


## ----compute-grid-cells-size---------------------------------------------
round(res(dem.1km)[1]*(10000/90),3)
round(res(dem.1km)[1]*(10000/90)*cos(42*pi/180),3)


## ----crop-1km------------------------------------------------------------
bbox(state.ne)
dem.ne <- crop(dem.1km, state.ne)
bbox(dem.ne)
nrow(dem.ne)
ncol(dem.ne)
ncell(dem.ne)


## ----project-1km---------------------------------------------------------
dem.ne.m <- projectRaster(dem.ne, crs=proj4string(ne.m), method="bilinear")
nrow(dem.ne.m)
ncol(dem.ne.m)
ncell(dem.ne.m)
res(dem.ne.m)
summary(dem.ne.m)


## ----convert-elevations-to-feet------------------------------------------
ft.to.m <- 0.3048
dem.ne.m <- dem.ne.m/ft.to.m
summary(dem.ne.m)


## ----view-1km, out.width='.8\\textwidth'---------------------------------
image(dem.ne.m, col=topo.colors(24))


## ----aggregate-to-4km----------------------------------------------------
out.grid.res <- 4000
dem.ne.m <- aggregate(dem.ne.m,
                      fact=c(floor(out.grid.res/res(dem.ne.m)[1]),
                             floor(out.grid.res/res(dem.ne.m)[2])),
                      fun=mean)
ncell(dem.ne.m)


## ----save-4km-grid-no, eval=FALSE----------------------------------------
## save(dem.ne.m, file="./dem_ne_4km.RData")


## ----save-4km-grid-yes, eval=T, echo=FALSE-------------------------------
## for the author's system
save(dem.ne.m, file="./ds/NEweather/dem_ne_4km.RData")


## ----load-4km-grid-a-no, eval=FALSE--------------------------------------
## load(file="./dem_ne_4km.RData", verbose=TRUE)


## ----load-4km-grid-a-yes, eval=T, echo=F---------------------------------
## for the author's system
load(file="./ds/NEweather/dem_ne_4km.RData", verbose=TRUE)


## ----load-4km-grid, out.width='.8\\textwidth'----------------------------
image(dem.ne.m, col=topo.colors(24))


## ----convert-4km-df------------------------------------------------------
class(dem.ne.m)
dem.ne.m.sp <- as(dem.ne.m, "SpatialPixelsDataFrame")
coordnames(dem.ne.m.sp) <- c("E", "N")
names(dem.ne.m.sp) <- "ELEVATION_"
dem.ne.m.df <- as.data.frame(dem.ne.m.sp)
class(dem.ne.m.df)
dim(dem.ne.m.df)
names(dem.ne.m.df)


## ----adjust-negative-to-zero---------------------------------------------
dem.ne.m.sp$ELEVATION_ <- pmax(0, dem.ne.m.sp$ELEVATION_)
dem.ne.m.df$ELEVATION_ <- pmax(0, dem.ne.m.df$ELEVATION_)


## ----child-data, child='exRKGLS_pred_OLS_GLS.Rnw'------------------------

## ----pred-OLS-GLS-set-parent, echo=FALSE, cache=FALSE--------------------
set_parent('exRKGLS.Rnw')


## ----predict-4km---------------------------------------------------------
dem.ne.m.df$pred.ols <- predict(m.ols, newdata=dem.ne.m.df)
dem.ne.m.df$pred.gls <- predict(m.gls, newdata=dem.ne.m.df)
summary(dem.ne.m.df[,-(1:3)])


## ----define-display-map-function-----------------------------------------
display.prediction.map <- function(.prediction.map.name,
                                   .plot.title, 
                                   .legend.title,
                                   .plot.limits=NULL,
                                   .palette="YlGnBu") {
    ggplot(data=dem.ne.m.df) +
        geom_point(aes_(x=quote(E), y=quote(N),
                        color=as.name(.prediction.map.name))) +
        xlab("E") + ylab("N") + coord_fixed() +
        geom_point(aes(x=E, y=N, color=ANN_GDD50),
                   data=ne.df, shape=I(20)) +
        ggtitle(.plot.title) +
        scale_colour_distiller(name=.legend.title,
                               space="Lab",
                               palette=.palette,
                               limits=.plot.limits)
}


## ----plot-4km, out.width='.8\\textwidth'---------------------------------
(gdd.pred.lim <- round(
    c(min(dem.ne.m.df[,4:5])-10, max(dem.ne.m.df[,4:5])+10)))
display.prediction.map("pred.ols", "Annual GDD, base 50F, OLS prediction",
                       "GDD50", gdd.pred.lim)
display.prediction.map("pred.gls", "Annual GDD, base 50F, GLS prediction",
                       "GDD50", gdd.pred.lim)


## ----diffs-4km-gls-ols---------------------------------------------------
summary(dem.ne.m.df$diff.gls.ols <- 
            dem.ne.m.df$pred.gls - dem.ne.m.df$pred.ols)


## ----define-difference-map-function--------------------------------------
display.difference.map <- function(.diff.map.name,
                                   .diff.map.title,
                                   .legend.name,
                                   .palette="Spectral") {
    ggplot(data=dem.ne.m.df) +
        geom_point(aes_(x=quote(E), y=quote(N),
                            color=as.name(.diff.map.name))) +
        xlab("E") + ylab("N") + coord_fixed() +
        ggtitle(.diff.map.title) +
        scale_colour_distiller(name=.legend.name, 
                               space="Lab",
                               palette=.palette)
    }


## ----display-diffs-4km-gls-ols, out.width='.8\\textwidth'----------------
display.difference.map("diff.gls.ols", 
                       "Annual GDD, base 50F, GLS - OLS predictions", 
                       "+/- GDD50")


## ----child-data, child='exRKGLS_RK.Rnw'----------------------------------

## ----RKGLS-set-parent, echo=FALSE, cache=FALSE---------------------------
set_parent('exRKGLS.Rnw')


## ----print-gls-vmf-------------------------------------------------------
print(vmf.r.gls)


## ----gls-resid-ok-grid, out.width='.6\\textwidth'------------------------
summary(ne.m$gls.resid)
hist(ne.m$gls.resid, freq=F,
     xlab="GDD50",
     main="GLS residuals")
rug(ne.m$gls.resid)


## ----krige-ok-gls-resid-1------------------------------------------------
ok.gls.resid <- krige(gls.resid ~ 1,
                      loc=ne.m, newdata=dem.ne.m.sp,
                      model=vmf.r.gls)
summary(ok.gls.resid)


## ----krige-ok-gls-resid-2------------------------------------------------
dem.ne.m.df$ok.gls.resid <- ok.gls.resid$var1.pred
dem.ne.m.df$ok.gls.resid.var <- ok.gls.resid$var1.var


## ----krige-ok-gls-resid, out.width='.9\\textwidth'-----------------------
ggplot() +
    geom_point(aes(x=E, y=N, colour=ok.gls.resid), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Residuals from GLS trend surface, GDD base 50F") +
    scale_colour_distiller(name="GDD50", space="Lab", palette="RdBu")


## ------------------------------------------------------------------------
mean(dem.ne.m.df$ok.gls.resid)  # spatial mean over the grid
mean(ne.m$gls.resid)            # arithmetic mean at observation points


## ----show-ok-gls-resid-var, out.width='.9\\textwidth'--------------------
ggplot() +
    geom_point(aes(x=E, y=N, colour=sqrt(ok.gls.resid.var)),
               data=dem.ne.m.df) + xlab("E") + ylab("N") + 
    coord_fixed() +
    geom_point(aes(x=E, y=N),  data=ne.df, size=0.5,
               colour="black", shape=I(20)) +
    ggtitle("Residuals from GLS trend, kriging prediction standard deviation") +
    scale_colour_distiller(name="GDD50 base 50F", space="Lab",
                           palette="BuPu", trans="reverse")


## ----rk-gls, out.width='.9\\textwidth'-----------------------------------
summary(dem.ne.m.df$pred.rkgls <- 
            dem.ne.m.df$pred.gls + dem.ne.m.df$ok.gls.resid)
display.prediction.map("pred.rkgls", "Annual GDD, base 50F, GLS-RK prediction",
                       "GDD50")


## ----child-data, child='exRKGLS_KED.Rnw'---------------------------------

## ----KED-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----ked-vgm, out.width='0.6\\textwidth', w=8, h=6-----------------------
v.ked <- variogram(ANN_GDD50 ~ sqrt(ELEVATION_) + N, locations=ne.m,
                     cutoff=100000, width=16000)


## ----ked-vgm-fit, out.width='0.6\\textwidth', w=8, h=6-------------------
(vmf.ked <- fit.variogram(v.ked, vgm(15000, "Exp", 20000, 20000)))
plot(v.ked, plot.numbers=TRUE, model=vmf.ked)


## ----ked-pred-grid-------------------------------------------------------
k.ked <- krige(ANN_GDD50 ~ sqrt(ELEVATION_)+ N, locations=ne.m,
                      newdata=dem.ne.m.sp, model=vmf.ked)
summary(k.ked)


## ----ked-show, out.width='.9\\textwidth'---------------------------------
dem.ne.m.df$pred.ked <- k.ked$var1.pred
dem.ne.m.df$pred.ked.sd <- sqrt(k.ked$var1.var)
display.prediction.map("pred.ked",
                       "Annual GDD, base 50F, KED prediction",
                       "GDD50")
#
ggplot() +
    geom_point(aes(x=E, y=N, colour=pred.ked.sd), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    geom_point(aes(x=E, y=N),  data=ne.df, size=0.5,
               colour="black", shape=I(20)) +
    ggtitle("KED standard deviation, GDD base 50F") +
    scale_colour_distiller(name="GDD50", space="Lab",
                           palette="BuPu", trans="reverse")


## ----diffs-4km-rkgls-ked-------------------------------------------------
summary(dem.ne.m.df$diff.rkgls.ked <- 
            dem.ne.m.df$pred.rkgls - dem.ne.m.df$pred.ked)


## ----display-diffs-4km-rkgls-ked, out.width='.8\\textwidth'--------------
display.difference.map("diff.rkgls.ked", 
                       "Difference annual GDD base 50F, RK-GLS - KED", 
                       "+/- GDD50")


## ----krige-loocv-ked, results='hide'-------------------------------------
kcv.ked <- krige.cv(ANN_GDD50 ~ sqrt(ELEVATION_)+ N, locations=ne.m, model=vmf.ked)


## ----krige-loocv-ked-2---------------------------------------------------
summary(kcv.ked$residual)


## ------------------------------------------------------------------------
(loocv.ked.rmse <- sqrt(sum(kcv.ked$residual^2)/length(kcv.ked$residual)))


## ----ok-loocv-bubble-ked-------------------------------------------------
bubble(kcv.ked, zcol="residual", pch=1, main="LOOCV KED residuals")


## ----ked-local-----------------------------------------------------------
ked.n.max <- 61  # change this to try other numbers of neighbours
k.ked.nn <- krige(ANN_GDD50 ~ sqrt(ELEVATION_)+ N, locations=ne.m,
               newdata=dem.ne.m.sp, model=vmf.ked,
               nmax=ked.n.max)
summary(k.ked.nn@data)


## ----ked-local-display-map-----------------------------------------------
dem.ne.m.df$pred.ked.nn <- k.ked.nn$var1.pred
dem.ne.m.df$pred.ked.nn.sd <- sqrt(k.ked.nn$var1.var)
display.prediction.map("pred.ked.nn",
                       paste("Annual GDD, base 50F, KED prediction,",
                             ked.n.max, "nearest neighbours"),
                       "GDD50")


## ----ked-compare-global-local--------------------------------------------
summary(dem.ne.m.df$diff.ked <- dem.ne.m.df$pred.ked - dem.ne.m.df$pred.ked.nn)
display.difference.map("diff.ked",
                       paste("Difference annual GDD base 50F, KED global - KED",
                             ked.n.max,"nearest neighbours"),
                       "+/- GDD50")


## ----ked-local-cv, results='hide'----------------------------------------
kcv.ked.nn <- krige.cv(ANN_GDD50 ~ sqrt(ELEVATION_)+ N,
                       locations=ne.m, model=vmf.ked,
                       nmax=ked.n.max)


## ----ked-local-cv-2------------------------------------------------------
summary(kcv.ked.nn$residual)
summary(kcv.ked$residual)
(loocv.ked.nn.rmse <- sqrt(sum(kcv.ked.nn$residual^2)/length(kcv.ked.nn$residual)))
(loocv.ked.rmse <- sqrt(sum(kcv.ked$residual^2)/length(kcv.ked$residual)))


## ----ked-global-local-cv-bubble------------------------------------------
summary(kcv.ked$diff <- kcv.ked$residual - kcv.ked.nn$residual)
bubble(kcv.ked, zcol="diff",
       pch=1, main=paste("KED - KED",
                         ked.n.max, "nearest neighbour residuals"),
       col=c("#EF8A62","#67A9CF"))


## ------------------------------------------------------------------------
k.ok <- krige(ANN_GDD50 ~ sqrt(ELEVATION_)+ N, locations=ne.m,
                newdata=dem.ne.m.sp, model=NULL)


## ------------------------------------------------------------------------
k.okr <- krige(ols.resid ~ 1, locations=ne.m,
               newdata=dem.ne.m.sp, model=vmf.r.ols)


## ------------------------------------------------------------------------
k.ok$rk.pred <- k.ok$var1.pred + k.okr$var1.pred


## ------------------------------------------------------------------------
k.ok$diff.pred <- k.ok$rk.pred - k.ked$var1.pred
summary(k.ok$rk.pred); summary(k.ked$var1.pred)
summary(k.ok$diff.pred)


## ----display-okrk-ked-diff, out.width='.8\\textwidth'--------------------
dem.ne.m.df$diff.okrk.ked.pred <- k.ok$diff.pred
display.difference.map("diff.okrk.ked.pred", 
                       "Naive RK vs.\ KED surface, GDD base 50F", 
                       "+/- GDD50")


## ----child-data, child='exRKGLS_GAM.Rnw'---------------------------------

## ----GAM-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----plot-marginal-loess, out.width='\\textwidth'------------------------
g1 <- ggplot(ne.df, aes(x=E, y=ANN_GDD50)) +
  geom_point() +
  geom_smooth(method="loess")
g2 <- ggplot(ne.df, aes(x=N, y=ANN_GDD50)) +
  geom_point() +
  geom_smooth(method="loess")
g3 <- ggplot(ne.df, aes(x=ELEVATION_, y=ANN_GDD50)) +
  geom_point() +
  geom_smooth(method="loess")
g4 <- ggplot(ne.df, aes(x=sqrt(ELEVATION_), y=ANN_GDD50)) +
  geom_point() +
  geom_smooth(method="loess")
require(gridExtra)
grid.arrange(g1, g2, g3, g4, ncol = 2)


## ----load-mgcv-----------------------------------------------------------
require(mgcv)


## ----fit-gam-------------------------------------------------------------
m.g.xy <- gam(ANN_GDD50 ~ s(E, N) + s(ELEVATION_), data=ne.df)
summary(m.g.xy)
summary(residuals(m.g.xy))


## ----gam-resid-bubble----------------------------------------------------
ne.m@data$resid.m.g.xy <- residuals(m.g.xy)
bubble(ne.m, zcol="resid.m.g.xy", pch=1, main="Residuals from GAM")


## ----gam-resid-vgm, out.width='.55\\linewidth'---------------------------
vr <- variogram(resid.m.g.xy ~ 1, loc=ne.m, cutoff=50000, width=5000)
plot(vr, pl=T)


## ----plot-gam-2D, out.width='\\linewidth'--------------------------------
plot.gam(m.g.xy, rug=T, se=T, select=1,
         scheme=1, theta=30+130, phi=30)


## ----plot-gam-2D-se, out.width='\\linewidth'-----------------------------
vis.gam(m.g.xy, plot.type="persp", color="terrain", 
        theta=160, zlab="Annual GDD50", se=1.96)


## ----plot-gam-1D-elev----------------------------------------------------
plot.gam(m.g.xy, select=2, rug=T, se=T, residuals=T, pch=20,
         shade=T,  seWithMean=T, shade.col="lightblue")


## ----plot-gam-fits-------------------------------------------------------
(rmse.gam <- sqrt(sum(residuals(m.g.xy)^2)/length(residuals(m.g.xy))))
plot(ne.m$ANN_GDD50 ~ predict(m.g.xy, newdata=ne.df),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest", ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.df$STATE), pch=20, col=1:4)
grid(); abline(0,1)


## ----predict-gam, out.width='0.8\\textwidth'-----------------------------
tmp <- predict.gam(object=m.g.xy, newdata=dem.ne.m.df, se.fit=TRUE)
summary(tmp$fit)
summary(tmp$se.fit)
dem.ne.m.df$pred.gam <- tmp$fit
dem.ne.m.df$pred.gam.se <- tmp$se.fit
display.prediction.map("pred.gam", "Annual GDD, base 50F, GAM prediction",
                       "GDD50")


## ----predict-gam-se, out.width='0.8\\textwidth'--------------------------
ggplot() +
    geom_point(aes(x=E, y=N, colour=pred.gam.se), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Annual GDD base 50F, Standard error of GAM prediction") +
    scale_colour_distiller(name="GDD50 s.e.", space="Lab", palette="RdYlGn",
                           direction=-1)


## ----diffs-4km-gls-gam---------------------------------------------------
summary(dem.ne.m.df$diff.gls.gam <- 
            dem.ne.m.df$pred.gls - dem.ne.m.df$pred.gam)


## ----display-diffs-4km-gls-gam, out.width='.8\\textwidth'----------------
display.difference.map("diff.gls.gam",
                       "Difference annual GDD base 50F, GLS - GAM",
                       "+/- GDD50")


## ----diffs-4km-rkgls-gam-------------------------------------------------
dem.ne.m.df$diff.rkgls.gam <- dem.ne.m.df$pred.rkgls - dem.ne.m.df$pred.gam
summary(dem.ne.m.df$diff.rkgls.gam)


## ----display-diffs-4km-rkgls-gam, out.width='.8\\textwidth'--------------
ggplot() +
    geom_point(aes(x=E, y=N, colour=diff.rkgls.gam), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Difference annual GDD base 50F, GLS/RK - GAM") +
    scale_colour_distiller(name="GDD50", space="Lab", palette="Spectral")


## ----child-data, child='exRKGLS_GWR.Rnw'---------------------------------

## ----GWR-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----plot.fixed.kernel, out.width='0.6\\textwidth', w=6, h=4, fig.cap="Gaussian kernel"----
require(ggplot2)
h <- 16
ggplot(data=data.frame(x=c(-64, 64)), aes(x)) + 
  stat_function(fun=function(x) { exp(-1/2 * (x/h)^2) }) +
    labs(title = paste("Gaussian kernel, bandwidth parameter",h))
g.kernel <- function(d, h) { exp(-1/2 * (d/h)^2)}


## ----<transform.m.km-----------------------------------------------------
require(rgdal)
(p4str.km <- paste0(substr(proj4string(ne.m), 1,
                           nchar(proj4string(ne.m))-1),"km"))
ne.km <- spTransform(ne.m, p4str.km)
bbox(ne.km)


## ----add.e.n-------------------------------------------------------------
coordnames(ne.km) <- c("x", "y")
ne.km$E <- coordinates(ne.km)[,1]
ne.km$N <- coordinates(ne.km)[,2]


## ----gwr-packages--------------------------------------------------------
require(sp)
require(spgwr)


## ----gwr-sel-fixed-------------------------------------------------------
(bw.Gauss.f <- gwr.sel(ANN_GDD50 ~ E + N + sqrt(ELEVATION_), data = ne.km,
                       adapt = F,  verbose = F,
                       gweight=gwr.Gauss))


## ----gwr-sel-adapt-------------------------------------------------------
(bw.Gauss.a <- gwr.sel(ANN_GDD50 ~ E + N + sqrt(ELEVATION_), data = ne.km,
                       adapt = T,  verbose = F,
                       gweight=gwr.Gauss, method="cv"))


## ----gwr-compute-it------------------------------------------------------
(gwr.Gauss.f <- gwr(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                  data = ne.km,
                  bandwidth = bw.Gauss.f,
                  gweight=gwr.Gauss))
(gwr.Gauss.a <- gwr(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                  data = ne.km,
                  adapt=bw.Gauss.a,
                  gweight=gwr.Gauss))


## ----gwr-results-df------------------------------------------------------
gwr.f <- gwr.Gauss.f$SDF@data
gwr.a <- gwr.Gauss.a$SDF@data
(gwr.global <- gwr.Gauss.f$lm$coefficients)


## ----gwr-coeff-featurespace-b0, fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
xlim.b0 <- range(gwr.f$'(Intercept)', gwr.a$'(Intercept)')
breaks.b0 <- seq(xlim.b0[1], xlim.b0[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$'(Intercept)', main="Gauss fixed bandwidth",
     xlab="Intercept", xlim=xlim.b0, freq=F, breaks=breaks.b0)
abline(v=gwr.global['(Intercept)'], col="red", lwd=2)
rug(gwr.f$'(Intercept)')
hist(gwr.a$'(Intercept)', main="Gauss adaptive bandwidth",
     xlab="Intercept", xlim=xlim.b0, freq=F, breaks=breaks.b0)
abline(v=gwr.global['(Intercept)'], col="red", lwd=2)
rug(gwr.a$'(Intercept)')
par(mfrow=c(1,1))


## ----gwr-coeff-featurespace-N,fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
xlim.N <- range(gwr.f$N, gwr.a$N)
breaks.N <- seq(xlim.N[1], xlim.N[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$N, main="Gauss fixed bandwidth",
     xlab="N coefficient", xlim=xlim.N, freq=F, breaks=breaks.N)
abline(v=gwr.global["N"], col="red", lwd=2)
rug(gwr.f$N)
hist(gwr.a$N, main="Gauss adaptive bandwidth",
     xlab="N coefficient", xlim=xlim.N, freq=F, breaks=breaks.N)
abline(v=gwr.global["N"], col="red", lwd=2)
rug(gwr.a$N)
par(mfrow=c(1,1))


## ----gwr-coeff-featurespace-E, fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
xlim.E <- range(gwr.f$E, gwr.a$E)
breaks.E <- seq(xlim.E[1], xlim.E[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$E, main="Gauss fixed bandwidth",
     xlab="E coefficient", xlim=xlim.E, freq=T, breaks=breaks.E)
abline(v=gwr.global["E"], col="red", lwd=1.5)
rug(gwr.f$E)
hist(gwr.a$E, main="Gauss adaptive bandwidth",
     xlab="E coefficient", xlim=xlim.E, freq=T, breaks=breaks.E)
abline(v=gwr.global["E"], col="red", lwd=1.5)
rug(gwr.a$E)
par(mfrow=c(1,1))


## ----gwr-coeff-featurespace-elev, fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
xlim.elev <- range(gwr.f$'sqrt(ELEVATION_)', gwr.a$'sqrt(ELEVATION_)')
breaks.elev <- seq(xlim.elev[1], xlim.elev[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$'sqrt(ELEVATION_)', main="Gauss fixed bandwidth",
     xlab="sqrt(elevation) coefficient", xlim=xlim.elev, freq=T,
     breaks=breaks.elev)
abline(v=gwr.global['sqrt(ELEVATION_)'], col="red", lwd=1.5)
rug(gwr.f$'sqrt(ELEVATION_)')
hist(gwr.a$'sqrt(ELEVATION_)', main="Gauss adaptive bandwidth",
     xlab="sqrt(elevation) coefficient", xlim=xlim.elev, freq=T,
     breaks=breaks.elev)
abline(v=gwr.global['sqrt(ELEVATION_)'], col="red", lwd=1.5)
rug(gwr.a$'sqrt(ELEVATION_)')
par(mfrow=c(1,1))


## ----gwr-coeff-featurespace-r2, fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
# global fit
global.r2 <- summary(lm(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                        data=ne.km@data))$adj.r.squared
xlim.r2 <- range(gwr.f$localR2, gwr.a$localR2)
breaks.r2 <- seq(xlim.r2[1], xlim.r2[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$localR2, main="Gauss fixed bandwidth",
     xlab="Coefficient of determination", xlim=xlim.r2, freq=T, breaks=breaks.r2)
abline(v=global.r2, col="red", lwd=1.5)
rug(gwr.f$localR2)
hist(gwr.a$localR2, main="Gauss adaptive bandwidth",
     xlab="Coefficient of determination", xlim=xlim.r2, freq=T, breaks=breaks.r2)
abline(v=global.r2, col="red", lwd=1.5)
rug(gwr.a$localR2)
par(mfrow=c(1,1))


## ----gwr-sum-w, fig.width=9, fig.height=5, out.width='0.9\\textwidth'----
xlim.sum.w <- range(gwr.f$sum.w, gwr.a$sum.w)
breaks.sum.w <- seq(xlim.sum.w[1], xlim.sum.w[2], length=16)
par(mfrow=c(1,2))
hist(gwr.f$sum.w, main="Gauss fixed bandwidth",
     xlab="Sum of weights", xlim=xlim.sum.w, freq=T, breaks=breaks.sum.w)
rug(gwr.f$sum.w)
hist(gwr.a$sum.w, main="Gauss adaptive bandwidth",
     xlab="Sum of weights", xlim=xlim.sum.w, freq=T, breaks=breaks.sum.w)
rug(gwr.a$sum.w)
par(mfrow=c(1,1))


## ----gwr-significance----------------------------------------------------
gwr.with.se <- gwr(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                   data = ne.km,
                   adapt = bw.Gauss.a,
                   gweight=gwr.Gauss,
                   hatmatrix=TRUE,
                   se.fit=TRUE)
names(gwr.with.se$SDF@data)


## ----gwr-significance-test-----------------------------------------------
summary(t.n <- gwr.with.se$SDF$N / gwr.with.se$SDF$N_se_EDF)
summary(t.e <- gwr.with.se$SDF$E / gwr.with.se$SDF$E_se_EDF)
summary(t.h <- gwr.with.se$SDF$'sqrt(ELEVATION_)' /
            gwr.with.se$SDF$'sqrt(ELEVATION_)_se_EDF')


## ----gwr-significance-map-N, fig.width=6, fig.height=6, out.width='0.55\\textwidth'----
sig.map <- SpatialPointsDataFrame(ne.km, data.frame(t.n, t.e, t.h))
sig.colours <- c("darkgreen","red","green") 
breaks <- c(min(t.n),-4,4,max(t.n))
spplot(sig.map, col.regions=sig.colours, breaks=breaks,
       zcol="t.n",
       main="Significance N coefficient",
       auto.key=FALSE)


## ----gwr-significance-map-E, fig.width=6, fig.height=6, out.width='0.55\\textwidth'----
breaks <- c(min(t.e),-4,4,max(t.e))
spplot(sig.map, col.regions=sig.colours, breaks=breaks,
       zcol="t.e",
       main="Significance E coefficient",
       auto.key=FALSE)


## ----gwr-significance-map-H, fig.width=6, fig.height=6, out.width='0.55\\textwidth'----
breaks <- c(min(t.h),-3,3,max(t.h))
spplot(sig.map, col.regions=sig.colours, breaks=breaks,
       zcol="t.h",
       main="Significance elevation coefficient",
       auto.key=FALSE)


## ----gwr-sig-any---------------------------------------------------------
table(is.sig <- (abs(t.n) > 3) | (abs(t.e) > 3) | (abs(t.h) > 3))
ne.m@data[which(!is.sig),"STATION_NA"]


## ----gwr-compare-rmse----------------------------------------------------
n <- length(residuals(m.ols))
tmp.rmse <- data.frame(cbind(sqrt(sum(residuals(m.ols)^2)/n),
      sqrt(sum(residuals(m.gls)^2)/n),
      sqrt(sum((gwr.Gauss.f$SDF$pred - ne.m$ANN_GDD50)^2)/n),
      sqrt(sum((gwr.Gauss.a$SDF$pred - ne.m$ANN_GDD50)^2)/n)))
row.names(tmp.rmse) <- "RMSE"


## ----gwr-pred-points-----------------------------------------------------
tmp <- data.frame(round(
    cbind(summary(residuals(m.ols)),
          summary(residuals(m.gls)),
          summary(gwr.f$gwr.e),
          summary(gwr.a$gwr.e)),1))
tmp <- rbind(tmp.rmse, tmp)
names(tmp) <- c("OLS", "GLS", "GWR-fixed", "GWR-adaptive")
print(tmp)
rm(tmp, tmp.rmse)


## ----manipulate-states---------------------------------------------------
require(raster)
dem.ne4.m <- raster::mask(dem.ne.m, state.ne.m)


## ----convert-dem-transform-----------------------------------------------
dem.ne4.m.spdf <- as(dem.ne4.m,"SpatialPointsDataFrame")
## match the coordinate names with the calibration points
coordnames(dem.ne4.m.spdf)
dem.ne4.km.spdf <- spTransform(dem.ne4.m.spdf, p4str.km)
## remove negative elevations so sqrt() is real
names(dem.ne4.km.spdf) <- "ELEVATION_"
dem.ne4.km.spdf[(dem.ne4.km.spdf$ELEVATION_ < 0), "ELEVATION_"] <- 0
## add the coordinates to the data frame
dem.ne4.km.spdf$E <- coordinates(dem.ne4.km.spdf)[,1]
dem.ne4.km.spdf$N <- coordinates(dem.ne4.km.spdf)[,2]
## match the names with the calibration points
summary(dem.ne4.km.spdf)


## ----gwr-predict---------------------------------------------------------
gwr.Gauss.f.pred <- gwr(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                  data=ne.km,
                  predictions=TRUE, # for prediction
                  fit.points=dem.ne4.km.spdf,
                  bandwidth=bw.Gauss.f,
                  gweight=gwr.Gauss)
gwr.Gauss.a.pred <- gwr(ANN_GDD50 ~ E + N + sqrt(ELEVATION_),
                  data=ne.km,
                  predictions=TRUE,  # for prediction
                  fit.points=dem.ne4.km.spdf,
                  adapt=bw.Gauss.a,
                  gweight=gwr.Gauss)
summary(gwr.Gauss.f.pred$SDF$pred)
summary(gwr.Gauss.a.pred$SDF$pred)
dem.ne4.km.spdf$pred.f <- gwr.Gauss.f.pred$SDF$pred
dem.ne4.km.spdf$pred.a <- gwr.Gauss.a.pred$SDF$pred


## ----gwr-pred-coefficients-----------------------------------------------
dem.ne4.km.spdf$b0.f <- gwr.Gauss.f.pred$SDF$"(Intercept)"
dem.ne4.km.spdf$b0.a <- gwr.Gauss.a.pred$SDF$"(Intercept)"
dem.ne4.km.spdf$N.f <- gwr.Gauss.f.pred$SDF$N
dem.ne4.km.spdf$N.a <- gwr.Gauss.a.pred$SDF$N
dem.ne4.km.spdf$E.f <- gwr.Gauss.f.pred$SDF$E
dem.ne4.km.spdf$E.a <- gwr.Gauss.a.pred$SDF$E
dem.ne4.km.spdf$H.f <- gwr.Gauss.f.pred$SDF$"sqrt(ELEVATION_)"
dem.ne4.km.spdf$H.a <- gwr.Gauss.a.pred$SDF$"sqrt(ELEVATION_)"


## ----make-km-grid--------------------------------------------------------
dem.ne4.km.grid <- dem.ne4.km.spdf
# convert to a gridded object
gridded(dem.ne4.km.grid) <- TRUE; fullgrid(dem.ne4.km.grid) <- TRUE


## ----map-gwr-setup-------------------------------------------------------
pred.range <- range(na.omit(dem.ne4.km.grid$pred.f),
                     na.omit(dem.ne4.km.grid$pred.a))
cuts <- seq(pred.range[1], pred.range[2], length=24)


## ----map-gwr-pred, fig.width=14, fig.height=7, out.width='\\textwidth'----
p1 <- spplot(dem.ne4.km.grid, zcol="pred.f",
       main="Annual GDD50, GWR fixed Gaussian kernel", 
       at=cuts, key.space="right", col.regions=bpy.colors(24))
p2 <- spplot(dem.ne4.km.grid, zcol="pred.a",
       main="Annual GDD50, GWR adaptive Gaussian kernel",
       at=cuts, key.space="right")
print(p1, split=c(1,1,2,1), more=T); print(p2, split=c(2,1,2,1), more=F)


## ----map-gwr-setup-bo----------------------------------------------------
pred.range <- range(na.omit(dem.ne4.km.grid$b0.f),
                     na.omit(dem.ne4.km.grid$b0.a))
cuts <- seq(pred.range[1], pred.range[2], length=24)


## ----map-gwr-pred-b0, fig.width=14, fig.height=7, out.width='\\textwidth'----
p1 <- spplot(dem.ne4.km.grid, zcol="b0.f",
       main="Intercept, fixed kernel", 
       at=cuts, key.space="right", col.regions=topo.colors(24))
p2 <- spplot(dem.ne4.km.grid, zcol="b0.a",
       main="Intercept, adaptive kernel",
       at=cuts, key.space="right", col.regions=topo.colors(24))
print(p1, split=c(1,1,2,1), more=T); print(p2, split=c(2,1,2,1), more=F)


## ----map-gwr-setup-N-----------------------------------------------------
pred.range <- range(na.omit(dem.ne4.km.grid$N.f),
                     na.omit(dem.ne4.km.grid$N.a))
cuts <- seq(pred.range[1], pred.range[2], length=24)


## ----map-gwr-pred-N, fig.width=14, fig.height=7, out.width='\\textwidth'----
p1 <- spplot(dem.ne4.km.grid, zcol="N.f",
       main="Northing coefficient, fixed kernel", 
       at=cuts, key.space="right", col.regions=terrain.colors(24))
p2 <- spplot(dem.ne4.km.grid, zcol="N.a",
       main="Northing coefficient, adaptive kernel",
       at=cuts, key.space="right", col.regions=terrain.colors(24))
print(p1, split=c(1,1,2,1), more=T); print(p2, split=c(2,1,2,1), more=F)


## ----map-gwr-setup-E-----------------------------------------------------
pred.range <- range(na.omit(dem.ne4.km.grid$E.f),
                     na.omit(dem.ne4.km.grid$E.a))
cuts <- seq(pred.range[1], pred.range[2], length=24)


## ----map-gwr-pred-E, fig.width=14, fig.height=7, out.width='\\textwidth'----
p1 <- spplot(dem.ne4.km.grid, zcol="E.f",
       main="Easting coefficient, fixed kernel", 
       at=cuts, key.space="right", col.regions=terrain.colors(24))
p2 <- spplot(dem.ne4.km.grid, zcol="E.a",
       main="Easting coefficient, adaptive kernel",
       at=cuts, key.space="right", col.regions=terrain.colors(24))
print(p1, split=c(1,1,2,1), more=T); print(p2, split=c(2,1,2,1), more=F)


## ----map-gwr-setup-H-----------------------------------------------------
pred.range <- range(na.omit(dem.ne4.km.grid$H.f),
                     na.omit(dem.ne4.km.grid$H.a))
cuts <- seq(pred.range[1], pred.range[2], length=24)


## ----map-gwr-pred-H, fig.width=14, fig.height=7, out.width='\\textwidth'----
p1 <- spplot(dem.ne4.km.grid, zcol="H.f",
       main="elevation coefficient, fixed kernel", 
       at=cuts, key.space="right", col.regions=terrain.colors(24))
p2 <- spplot(dem.ne4.km.grid, zcol="H.a",
       main="elevation coefficient, adaptive kernel",
       at=cuts, key.space="right", col.regions=terrain.colors(24))
print(p1, split=c(1,1,2,1), more=T); print(p2, split=c(2,1,2,1), more=F)


## ----child-data, child='exRKGLS_RF.Rnw'----------------------------------

## ----RF-set-parent, echo=FALSE, cache=FALSE------------------------------
set_parent('exRKGLS.Rnw')


## ----load-rpart----------------------------------------------------------
library(rpart)


## ----set-seed-rt, echo=F, results='hide'---------------------------------
# set seed for the random number generator to get a consistent result for the document
# this should *not* be done when following the exercise
set.seed(318316)

## ----trees-comp-rpart----------------------------------------------------
m.rt <- rpart(ANN_GDD50 ~ N + E + ELEVATION_,
                  data=ne.df,
                  minsplit=2,
                  cp=0.003)


## ----summary-rt----------------------------------------------------------
print(m.rt)


## ----trees-plot-unpruned, out.width='\\textwidth'------------------------
library(rpart.plot)
rpart.plot(m.rt, digits=3, type=4, extra=1)


## ----trees-import-rpart--------------------------------------------------
x <- m.rt$variable.importance 
data.frame(variableImportance = 100 * x / sum(x))


## ----trees-plotcp-rp, out.width='\\textwidth'----------------------------
printcp(m.rt)
plotcp(m.rt)


## ----trees-prune-rpart---------------------------------------------------
(m.rt.p <- prune(m.rt, cp=0.0045))


## ----trees-plot-pruned, out.width='\\textwidth'--------------------------
rpart.plot(m.rt.p, digits=3, type=4, extra=1)


## ----trees-predict-cal-pts-----------------------------------------------
summary(p.rt.p <- predict(m.rt.p, newdata=ne.df))


## ----trees-count-unique-pred---------------------------------------------
length(unique(p.rt.p))


## ----trees-residuals-histo-----------------------------------------------
summary(residuals.rt.p <- ne.df$ANN_GDD50 - p.rt.p)
hist(residuals.rt.p, main="Residuals from regression tree fit",
     xlab="ANN_GDD50")
rug(residuals.rt.p)
sqrt(mean(residuals.rt.p^2)/length(residuals.rt.p))


## ----trees-predict-rpart, out.width='0.75\\textwidth', keep.source=TRUE----
plot(ne.df$ANN_GDD50 ~ p.rt.p, asp=1, pch=20,
     xlab="fitted by regression tree", ylab="actual",
     xlim=c(500,4200), ylim=c(500,4200),
     col=ne.df$STATE,
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----pred-rt-area--------------------------------------------------------
dem.ne.m.df$pred.rt <- predict(m.rt.p, newdata=dem.ne.m.df)
summary(dem.ne.m.df$pred.rt)


## ----plot-rt-area--------------------------------------------------------
display.prediction.map("pred.rt", 
                       "Annual GDD, base 50F, regression tree prediction",
                       "GDD50")


## ----set-seed-rf, echo=F, results='hide'---------------------------------
# set seed for the random number generator to get a consistent result for the document
# this should *not* be done when following the exercise
set.seed(318316)


## ----random-forest-------------------------------------------------------
library(randomForest)
m.rf <- randomForest(ANN_GDD50 ~ ELEVATION_ + N + E,
                     data=ne.df, ntree=1200,
                     importance=TRUE)
randomForest::importance(m.rf)[,1]
randomForest::importance(m.rf)[,2]/max(randomForest::importance(m.rf)[,2])


## ----plot-rf-fits--------------------------------------------------------
summary(predict(m.rf, newdata=ne.m))
plot(ne.m$ANN_GDD50 ~ predict(m.rf, newdata=ne.m),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----rf-fits-errors------------------------------------------------------
summary(rf.resid <- ne.m$ANN_GDD50 - predict(m.rf, newdata=ne.m))
(rf.me <- mean(rf.resid))
(rf.rmse <- sqrt(mean(rf.resid^2)/length(rf.resid)))


## ----plot-rf-oob---------------------------------------------------------
summary(predict(m.rf))
plot(ne.m$ANN_GDD50 ~ predict(m.rf),
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest (OOB)",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid()
abline(0,1)


## ----rf-oob-errors-------------------------------------------------------
summary(rf.oob.resid <- ne.m$ANN_GDD50 - predict(m.rf))
(rf.oob.me <- mean(rf.oob.resid))
(rf.oob.rmse <- sqrt(mean(rf.oob.resid^2)/length(rf.oob.resid)))


## ----rf-compare-errors---------------------------------------------------
(rf.oob.me/rf.me)
(rf.oob.rmse/rf.rmse)


## ----rf-resid------------------------------------------------------------
ne.m$rf.resid <- (ne.df$ANN_GDD50 - predict(m.rf, newdata=ne.m))
summary(ne.m$rf.resid)
bubble(ne.m, zcol="rf.resid", pch=1,
       main="Random Forest residuals, actual-predicted")


## ----rf-resid-oob--------------------------------------------------------
ne.m$rf.resid.oob <- (ne.df$ANN_GDD50 - predict(m.rf))
summary(ne.m$rf.resid.oob)
summary(ne.m$rf.resid)
summary(ne.m$rf.resid.oob/ne.m$rf.resid)


## ----which-rf-worst------------------------------------------------------
(ix <- order(abs(ne.m$rf.resid), decreasing=TRUE)[1:8])
ne.m@data[ix,c("STATE","STATION_NA","ELEVATION_",
                     "gls.resid", "rf.resid")]


## ----vgm-rf-resid, out.width='0.5\\textwidth'----------------------------
v.rf <- variogram(rf.resid ~ 1, locations=ne.m, cutoff=100000, width=16000)
plot(v.rf, pl=T)


## ------------------------------------------------------------------------
summary(ne.m$ols.resid)
summary(ne.m$rf.resid)
summary(ne.m$gls.resid)
sd(ne.m$ols.resid)
sd(ne.m$rf.resid)
sd(ne.m$gls.resid)


## ----predict-4km-rf------------------------------------------------------
dem.ne.m.df$pred.rf <- predict(m.rf, newdata=dem.ne.m.df)
summary(dem.ne.m.df$pred.rf)


## ----display-4km-rf, out.width='.8\\textwidth'---------------------------
display.prediction.map("pred.rf", 
                       "Annual GDD, base 50F, random forest prediction",
                       "GDD50")


## ----diffs-4km-gls-rf----------------------------------------------------
summary(dem.ne.m.df$diff.gls.rf <- 
            dem.ne.m.df$pred.gls - dem.ne.m.df$pred.rf)


## ----display-diffs-4km-gls-rf, out.width='.8\\textwidth'-----------------
display.difference.map("diff.gls.rf", 
                       "Annual GDD, base 50F, GLS - RF predictions", 
                       "+/- GDD50")


## ----load-caret----------------------------------------------------------
library(caret)


## ----list-models---------------------------------------------------------
length(names(getModelInfo()))
head(names(getModelInfo()),24)


## ----caret-model-info----------------------------------------------------
getModelInfo("ranger")$ranger$parameters


## ----set-seed-caret, echo=F, results='hide'------------------------------
# set seed for the random number generator to get a consistent result for the document
# this should *not* be done when following the exercise
set.seed(318316)


## ----caret---------------------------------------------------------------
library(ranger)
dim(preds <- ne.df[, c("E", "N", "ELEVATION_")])
length(response <- ne.df[, "ANN_GDD50"])
system.time(
   ranger.tune <- train(x = preds, y = response, method="ranger", 
                 tuneGrid = expand.grid(.mtry = 1:3, 
                                        .splitrule = "variance",
                                        .min.node.size = 1:10), 
                 trControl = trainControl(method = 'cv'))
)


## ----caret-show-results, out.width='.65\\textwidth', fig.width=8, fig.height=5----
print(ranger.tune)
names(ranger.tune$result)
ix <- which.min(ranger.tune$result$RMSE)
ranger.tune$result[ix, c(1,3,4)]
ix <- which.max(ranger.tune$result$Rsquared)
ranger.tune$result[ix, c(1,3,5)]
ix <- which.min(ranger.tune$result$MAE)
ranger.tune$result[ix, c(1,3,6)]
plot.train(ranger.tune, metric="RMSE")
plot.train(ranger.tune, metric="Rsquared")
plot.train(ranger.tune, metric="MAE")


## ----fit-ranger----------------------------------------------------------
(ranger.rf <- ranger(ANN_GDD50 ~ N + E + ELEVATION_, data=ne.df,
                    mtry=2, min.node.size=9))


## ----plot-ranger-fits----------------------------------------------------
summary(ranger.fits <- predict(ranger.rf, data=ne.df)$predictions)
plot(ne.df$ANN_GDD50 ~ ranger.fits,
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by ranger",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)


## ----predict-4km-ranger--------------------------------------------------
summary(dem.ne.m.df$pred.ranger <- 
            predict(ranger.rf, data=dem.ne.m.df)$predictions)


## ----display-4km-ranger, out.width='.8\\textwidth'-----------------------
display.prediction.map("pred.ranger", 
                       "Annual GDD, base 50F, ranger RF prediction",
                       "GDD50")


## ----diffs-4km-ranger-rf-------------------------------------------------
summary(dem.ne.m.df$diff.ranger.rf <- 
            dem.ne.m.df$pred.ranger - dem.ne.m.df$pred.rf)


## ----display-diffs-4km-ranger-rf, out.width='.8\\textwidth'--------------
display.difference.map("diff.ranger.rf", 
                       "Annual GDD, base 50F, ranger - randomForest predictions", 
                       "+/- GDD50")


## ----load-cubist---------------------------------------------------------
library(Cubist)


## ----vignette-cubist, eval=FALSE-----------------------------------------
## vignette("cubist")


## ----set-seed-cubist, echo=F, results='hide'-----------------------------
# set seed for the random number generator to get a consistent result for the document
# this should *not* be done when following the exercise
set.seed(316318)


## ----cubist-train--------------------------------------------------------
require(caret)
all.preds <- ne.df[, c("N", "E", "ELEVATION_")]
all.resp <- ne.df[ , "ANN_GDD50"]
system.time(
       cubist.tune <- train(x = all.preds, y = all.resp, "cubist", 
                     tuneGrid = expand.grid(.committees = 1:12, 
                                            .neighbors = 0:8), 
                     trControl = trainControl(method = 'cv'))
       )


## ----cubist-train-results, fig.width=8, fig.height=5---------------------
print(cubist.tune)
plot(cubist.tune, metric="RMSE")
plot(cubist.tune, metric="Rsquared")
plot(cubist.tune, metric="MAE")


## ----cubist-fit-best-----------------------------------------------------
c.model <- cubist(x = all.preds, y = all.resp, committees=6)
summary(c.model)


## ----cubist-model-eval-fit-----------------------------------------------
# predictive accuracy
cubist.fits <- predict(c.model, newdata=all.preds, 
                       neighbors=cubist.tune$bestTune$neighbors)
## Test set RMSE
sqrt(mean((cubist.fits - all.resp)^2))
cor(cubist.fits, all.resp)^2  # R^2
plot(ne.df$ANN_GDD50 ~ cubist.fits,
     col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by cubist",
     ylab="Actual",
     main="Annual GDD50")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)


## ----cubist-model-predict------------------------------------------------
summary(dem.ne.m.df$pred.cubist <- 
            predict(c.model, newdata=dem.ne.m.df,
                              neighbours=cubist.tune$bestTune$neighbors))


## ----display-4km-cubist, out.width='.8\\textwidth'-----------------------
display.prediction.map("pred.cubist", 
                       "Annual GDD, base 50F, Cubist prediction",
                       "GDD50")


## ----child-data, child='exRKGLS_TPS.Rnw'---------------------------------

## ----TPS-set-parent, echo=FALSE, cache=FALSE-----------------------------
set_parent('exRKGLS.Rnw')


## ----TPS-----------------------------------------------------------------
require(fields)
ne.tps <- ne.df
ne.tps$coords <- matrix(c(ne.tps$E, ne.tps$N), byrow=F, ncol=2)
str(ne.tps$coords)
surf.1 <-Tps(ne.tps$coords, ne.tps$ANN_GDD50)
summary(surf.1)


## ----TPS-grid------------------------------------------------------------
resolution <- 9
bbox(state.ne.m)
(ne.sq.km <- diff(bbox(state.ne.m)[1,]) * diff(bbox(state.ne.m)[2,])/10^6)
(approx.n.grid.cells <- ne.sq.km/(resolution^2))
states.grid <- spsample(state.ne.m, n=approx.n.grid.cells, type="regular")
gridded(states.grid) <- TRUE
states.grid.df <- as.data.frame(states.grid)
states.grid.df$coords <- 
    matrix(c(states.grid.df$x1, states.grid.df$x2), byrow=F, ncol=2)
str(states.grid.df)


## ----TPS-predict---------------------------------------------------------
surf.1.pred <- predict.Krig(surf.1, states.grid.df$coords)
summary(as.vector(surf.1.pred))


## ----TPS-map, out.width='\\textwidth'------------------------------------
plot(states.grid.df$coords, pch=20, asp=1, cex=.6,
     col=bpy.colors(256)[cut(surf.1.pred, 256)],
     xlab="E", ylab="N",
     main="Annual GDD50")
lines(state.ne.m, col="white")
grid(col="gray")


## ----states-grid-spdf----------------------------------------------------
states.grid.spdf <- SpatialPixelsDataFrame(
    states.grid,
     data=data.frame(tps.pred=as.vector(surf.1.pred))
)


## ----get-value-tps-------------------------------------------------------
pt <- SpatialPoints(coords=cbind(-76.402175, 42.453271))
proj4string(pt) <- CRS("+init=EPSG:4326")
pt <- spTransform(pt, CRS(proj4string(states.grid.spdf)))
coordinates(pt)
pt.sp <- SpatialPointsDataFrame(pt, over(pt, states.grid.spdf))
pt.sp$tps.pred


## ----child-data, child='exRKGLS_OK.Rnw'----------------------------------

## ----OK-set-parent, echo=FALSE, cache=FALSE------------------------------
set_parent('exRKGLS.Rnw')


## ----display-ordinary-v, out.width='0.6\\textwidth', w=8, h=6------------
library(gstat)
bbox(ne.m)
(cutoff.default <- (sqrt(diff(bbox(ne.m)["E",])^2 + diff(bbox(ne.m)["N",])^2))/3)
(binwidth.default <- cutoff.default/15)
plot(v.o <- variogram(ANN_GDD50 ~ 1, locations=ne.m),
     plot.numbers=TRUE)


## ----display-ordinary-v-long, out.width='0.6\\textwidth', w=8, h=6-------
plot(v.o <- variogram(ANN_GDD50 ~ 1, locations=ne.m, cutoff=500000, width=20000),
     plot.numbers=TRUE)


## ----show-ordinary-v-short, out.width='0.6\\textwidth', w=8, h=6---------
v.o <- variogram(ANN_GDD50 ~ 1, locations=ne.m,
                    cutoff=220000)
plot(v.o, plot.numbers=TRUE)


## ----model-ordinary-2, out.width='0.6\\textwidth', w=8, h=6--------------
(vmf.o <- fit.variogram(v.o,
                        vgm(psill=210000, model="Sph",
                            range=220000, nugget=40000)))
plot(v.o, plot.numbers=TRUE, model=vmf.o)


## ----krige-ok------------------------------------------------------------
# dem.ne.m.sp was set up in \S6.2 "Adjusting the grid for prediction"
k.ok <- krige(ANN_GDD50 ~ 1, locations=ne.m, newdata=dem.ne.m.sp,
              model=vmf.o, nmax=24, block=c(1000,1000))
summary(k.ok)


## ----close-sta-eith------------------------------------------------------
e.ith.pt <- subset(ne.m, (ne.m$STATION_NA=="ITHACA CORNELL UNIV"))
dist.pt <- spDists(ne.m, e.ith.pt)
(ix <- order(dist.pt)[1:24])
print(cbind(ne.df[ix,c('STN_NAME','STATE','ELEVATION_','ANN_GDD50')],
      dist=round(dist.pt[ix,]/1000,1)))


## ----krige-plot-ok-pred, out.width='0.65\\textwidth'---------------------
spplot(k.ok, zcol="var1.pred")


## ----krige-plot-ok-pred-df, out.width='0.8\\textwidth'-------------------
dim(dem.ne.m.df)
dem.ne.m.df$pred.ok <- k.ok$var1.pred
display.prediction.map("pred.ok",
                       "Annual GDD base 50F, OK prediction",
                       "GDD50")


## ----krige-plot-ok-pred-sd-df, out.width='0.8\\textwidth'----------------
dem.ne.m.df$sd.ok <- sqrt(k.ok$var1.var)
ggplot() +
    geom_point(aes(x=E, y=N, colour=sd.ok), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Annual GDD base 50F, Standard error of OK prediction") +
    scale_colour_distiller(name="GDD50 s.d.", space="Lab", palette="RdYlGn",
                           direction=-1)


## ----krige-min-sd--------------------------------------------------------
summary(dem.ne.m.df$ok.sd)
ix <- which.min(dem.ne.m.df$ok.sd)
k.ok@data[ix,"var1.pred"]
round(100*dem.ne.m.df[ix,"ok.sd"]/k.ok$var1.pred[ix],1)


## ----krige-loocv-ok, results='hide'--------------------------------------
kcv.ok <- krige.cv(ANN_GDD50 ~ 1, locations=ne.m, model=vmf.o)
summary(kcv.ok$residual)


## ------------------------------------------------------------------------
(loocv.ok.rmse <- sqrt(sum(kcv.ok$residual^2)/length(kcv.ok$residual)))


## ----ok-loocv-bubble-ok--------------------------------------------------
bubble(kcv.ok, zcol="residual", pch=1, main="LOOCV OK residuals")


## ------------------------------------------------------------------------
ne.m@data[which.min(kcv.ok$residual),2:6]
ne.m@data[which.max(kcv.ok$residual),2:6]


## ----idw-----------------------------------------------------------------
k.idw <- idw(ANN_GDD50 ~ 1, locations=ne.m, newdata=dem.ne.m.sp,
              idp=2, nmax=24)
summary(k.idw$var1.pred)
summary(k.ok$var1.pred)


## ----idw-plot, out.width='.8\\textwidth'---------------------------------
dem.ne.m.df$pred.idw <- k.idw$var1.pred
display.prediction.map("pred.idw",
                       "Annual GDD base 50F, IDW2 prediction",
                       "GDD50")


## ----idw-loocv-1, results='hide'-----------------------------------------
kcv.idw <- krige.cv(ANN_GDD50 ~ 1, locations=ne.m, model=NULL)


## ----idw-loocv-2---------------------------------------------------------
summary(kcv.idw$residual)
summary(kcv.ok$residual)
(loocv.idw.rmse <- sqrt(sum(kcv.idw$residual^2)/length(kcv.idw$residual)))
(loocv.ok.rmse)


## ----idw-loocv-bubble----------------------------------------------------
bubble(kcv.idw, zcol="residual", pch=1, main="IDW^2 LOOCV residuals")


## ----child-data, child='exRKGLS_Thiessen.Rnw'----------------------------

## ----Thiessen-set-parent, echo=FALSE, cache=FALSE------------------------
set_parent('exRKGLS.Rnw')


## ----voronoi, out.width='0.9\\linewidth'---------------------------------
library(dismo)
v <- voronoi(ne.m, ext=t(bbox(state.ne.m)))
class(v)
plot(v)


## ----union-state---------------------------------------------------------
require(rgeos)
area.ne.m <- gUnaryUnion(state.ne.m)
class(area.ne.m)
plot(area.ne.m)


## ----clip-voronoi, out.width='\\linewidth'-------------------------------
v2 <- gIntersection(v, area.ne.m, byid=TRUE)
class(v2)
plot(v2)


## ----predict-voroni-over-------------------------------------------------
v3 <- over(v2, ne.m)
class(v3)


## ----predict-overlay-spdf------------------------------------------------
v4 <- SpatialPolygonsDataFrame(v2, data=v3)
names(v4)


## ----plot-pred-voronoi, out.width='\\linewidth'--------------------------
spplot(v4, zcol="ANN_GDD50", col.regions=topo.colors(64),
       main="Annual GDD base 50F")


## ----find-point----------------------------------------------------------
pt <- SpatialPoints(coords=cbind(-76.402175, 42.453271))
proj4string(pt) <- CRS("+init=EPSG:4326")
pt <- spTransform(pt, CRS(proj4string(ne.m)))
coordinates(pt)


## ----predict-pt-voronoi--------------------------------------------------
pt.sp <- SpatialPointsDataFrame(pt, over(pt, v4))
pt.sp$ANN_GDD50


## ----compare-voronoi-rkgls-----------------------------------------------
dem.ne.m.spdf <- SpatialPixelsDataFrame(coordinates(dem.ne.m.sp),
                                        data=dem.ne.m.df)
proj4string(dem.ne.m.spdf) <- proj4string(ne.m)
pt.pred <-over(pt, dem.ne.m.spdf)
names(pt.pred)
pt.pred[,c("pred.ols", "pred.gls", "pred.rkgls", "pred.ked", "pred.rf")]


## ----compare-elev--------------------------------------------------------
pt.sp$ELEVATION_
pt.pred$ELEVATION_


## ----inter-station-dist--------------------------------------------------
dm <- spDists(ne.m)
str(dm)


## ----find-nn-------------------------------------------------------------
diag(dm) <- max(dm)*1.1
nn <- apply(dm, 1, which.min)
str(nn)


## ----predict-nn----------------------------------------------------------
nn.gdd <- ne.m@data[nn,"ANN_GDD50"]
str(nn.gdd)


## ----compare-nn-gdd------------------------------------------------------
obs <- ne.m@data[,"ANN_GDD50"]
summary(obs)
summary(nn.gdd)
summary(diff <- (obs - nn.gdd))
hist(diff)
rug(diff)
(me <- mean(diff))
(rmse <- sqrt(sum(diff^2)/length(diff)))


## ------------------------------------------------------------------------
rf.oob.me; rf.oob.rmse


## ----plot-loocv-voronoi, out.width='\\linewidth'-------------------------
v4$resid <- diff
spplot(v4, zcol="resid", col.regions=bpy.colors(32),
       main="Cross-validation error, Thiessen polygon prediction")


## ----child-data, child='exRKGLS_compare.Rnw'-----------------------------

## ----compare-set-parent, echo=FALSE, cache=FALSE-------------------------
set_parent('exRKGLS.Rnw')


## ----read-july-no, eval=FALSE--------------------------------------------
## tmp <- readOGR(dsn=".", layer="maat7100",
##                integer64="allow.loss")


## ----read-july-usa-yes, echo=FALSE---------------------------------------
tmp <- readOGR(dsn="./ds/NEweather", layer="maat7100",
               integer64="allow.loss")


## ----july-4states--------------------------------------------------------
ix <- (tmp$STATE %in% c("NY","NJ","PA","VT"))
tmp <- tmp[ix,]
names(tmp)


## ----july-crs------------------------------------------------------------
strwrap(proj4string(dem.ne.m))
tmp.m  <- spTransform(tmp, ne.crs)
proj4string(tmp.m)


## ----add-july------------------------------------------------------------
names(ne.m)
ne.m <- cbind(ne.m, tmp.m[,7:19])
names(ne.m)
rm(tmp, tmp.m)


## ----add-july-df---------------------------------------------------------
ne.df <- as(ne.m, "data.frame")


## ----standardized--------------------------------------------------------
summary(ne.m$ANN_GDD50)
summary(ne.m$ANN_TNORM_)
gdd50.std <- (ne.m$ANN_GDD50 - mean(ne.m$ANN_GDD50))/sd(ne.m$ANN_GDD50)
t.ann.std <- (ne.m$ANN_TNORM_ - mean(ne.m$ANN_TNORM_))/sd(ne.m$ANN_TNORM_)
summary(gdd50.std)
summary(t.ann.std)


## ----check-corr-t-gdd----------------------------------------------------
cor(ne.m$ANN_GDD50, ne.m$ANN_TNORM_)
cor(gdd50.std, t.ann.std)
plot(gdd50.std, t.ann.std, asp=1,
     xlab="Standardized annual GDD50",
     ylab="Standardized mean annual T")
abline(0,1)


## ----build-two-models----------------------------------------------------
summary(m.ols.t <- lm(t.ann.std~ sqrt(ELEVATION_)+N+E, data=ne.df))
summary(m.ols.gdd <-  lm(gdd50.std ~ sqrt(ELEVATION_)+N+E, data=ne.df))
coef(m.ols.t)
coef(m.ols.gdd)
coef(m.ols.t)/coef(m.ols.gdd)


## ----show-ols-fits-two, out.width='\\textwidth', w=12, h=6---------------
summary(ne.m$ols.resid.t <- residuals(m.ols.t))
summary(ne.m$ols.resid.gdd <- residuals(m.ols.gdd))
p1 <- bubble(ne.m, zcol="ols.resid.t", pch=1,
             main="OLS residuals, std. annual T")
p2 <- bubble(ne.m, zcol="ols.resid.gdd", pch=1,
             main="OLS residuals, std. GDD50")
print(p1, split=c(1,1,2,1), more=T)
print(p2, split=c(2,1,2,1), more=F)


## ----compare-resids-two--------------------------------------------------
summary(ne.m$ols.resid.diff <- ne.m$ols.resid.t - ne.m$ols.resid.gdd)
bubble(ne.m, zcol="ols.resid.diff", pch=1,
       main="Difference of residuals, T - GDD",
       col=c("#EF8A62","#67A9CF"))


## ----model-resid-vgms----------------------------------------------------
v.r.ols.t <- variogram(ols.resid.t ~ 1,
                       locations=ne.m, cutoff=120000, width=12000)
(vmf.r.ols.t <- fit.variogram(v.r.ols.t,
                              vgm(psill=0.1, model="Exp",
                                  range=20000, nugget=0.02)))
p1 <- plot(v.r.ols.t, pl=T, model=vmf.r.ols.t)
#
v.r.ols.gdd <- variogram(ols.resid.gdd ~ 1,
                         locations=ne.m, cutoff=120000, width=12000)
(vmf.r.ols.gdd <- fit.variogram(v.r.ols.gdd,
                                vgm(psill=0.12, model="Exp",
                                    range=20000, nugget=0.02)))
p2 <- plot(v.r.ols.gdd, pl=T, model=vmf.r.ols.gdd)
print(p1, split=c(1,1,1,2), more=T)
print(p2, split=c(1,2,1,2), more=F)


## ----fit-gls-two---------------------------------------------------------
# require(nlme)
(p.nugget <- vmf.r.ols.t[1,"psill"]/sum(vmf.r.ols.t[,"psill"]))
m.gls.t <- gls(model=t.ann.std ~ sqrt(ELEVATION_) + N + E,
             data=ne.df,
             correlation=corExp(
               value=c(vmf.r.ols.t[2,"range"], p.nugget),
               form=~E + N,
               nugget=TRUE))
#
(p.nugget <- vmf.r.ols.gdd[1,"psill"]/sum(vmf.r.ols.gdd[,"psill"]))
m.gls.gdd <- gls(model=gdd50.std ~ sqrt(ELEVATION_) + N + E,
             data=ne.df,
             correlation=corExp(
               value=c(vmf.r.ols.gdd[2,"range"], p.nugget),
               form=~E + N,
               nugget=TRUE))
summary(m.gls.t)
summary(m.gls.gdd)


## ----compare-coeff-gls-two-----------------------------------------------
coefficients(m.gls.t)
coefficients(m.gls.gdd)
coefficients(m.gls.t)/coefficients(m.gls.gdd)


## ----compare-corr-struct-gls-two-----------------------------------------
intervals(m.gls.t)$corStruct
intervals(m.gls.gdd)$corStruct
intervals(m.gls.t)$corStruct/intervals(m.gls.gdd)$corStruct


## ----gls-resid-two-------------------------------------------------------
ne.m$gls.resid.t <- residuals(m.gls.t)
ne.m$gls.resid.gdd <- residuals(m.gls.gdd)


## ----show-vgm-gls-corr-std-----------------------------------------------
(p.nugget <- intervals(m.gls.t)$corStruct["nugget","est."])
(t.sill <- var(ne.m$gls.resid.t))
(vmf.r.gls.t <- vgm(psill=t.sill*(1-p.nugget), model="Exp",
                    range=intervals(m.gls.t)$corStruct["range","est."],
                    nugget=t.sill*p.nugget))
v.r.gls.t <- variogram(gls.resid.t ~ 1,
                         locations=ne.m, cutoff=120000, width=12000)
p1 <- plot(v.r.gls.t, model=vmf.r.gls.t, pl=T)
#
(p.nugget <- intervals(m.gls.gdd)$corStruct["nugget","est."])
(t.sill <- var(ne.m$gls.resid.gdd))
(vmf.r.gls.gdd <- vgm(psill=t.sill*(1-p.nugget), model="Exp",
                    range=intervals(m.gls.gdd)$corStruct["range","est."],
                    nugget=t.sill*p.nugget))
v.r.gls.gdd <- variogram(gls.resid.gdd ~ 1,
                         locations=ne.m, cutoff=120000, width=12000)
p2 <- plot(v.r.gls.gdd, model=vmf.r.gls.gdd, pl=T)
print(p1, split=c(1,1,1,2), more=T)
print(p2, split=c(1,2,1,2), more=F)


## ----gls-predict-two-----------------------------------------------------
dem.ne.m.df$pred.gls.t <- predict(m.gls.t, newdata=dem.ne.m.df)
dem.ne.m.df$pred.gls.gdd <- predict(m.gls.gdd, newdata=dem.ne.m.df)
dem.ne.m.df$diff.gls.t.gdd <- 
    dem.ne.m.df$pred.gls.t - dem.ne.m.df$pred.gls.gdd


## ----plot-4km-gdd-t, out.width='.8\\textwidth'---------------------------
(std.pred.lim <- c(min(dem.ne.m.df[,c("pred.gls.t","pred.gls.gdd")]),
                   max(dem.ne.m.df[,c("pred.gls.t","pred.gls.gdd")])))
display.prediction.map("pred.gls.t",
                       "Mean Annual Temperature, standardized, GLS prediction",
                       "GDD50", std.pred.lim, .palette="YlOrBr")
display.prediction.map("pred.gls.gdd",
                       "Annual GDD, base 50F, standardized, GLS prediction",
                       "GDD50", std.pred.lim, .palette="YlOrBr")


## ----diffs-4km-gls-t-gdd-------------------------------------------------
summary(dem.ne.m.df$diff.gls.t.gdd <- 
            dem.ne.m.df$pred.gls.t - dem.ne.m.df$pred.gls.gdd)


## ----plot-4km-gdd-t-diff, out.width='.8\\textwidth'----------------------
display.difference.map("diff.gls.t.gdd", 
                       "Difference, MAT - GDD50, standardized",
                       "+/- s.d.",
                       .palette="BrBG")


## ----ok-gls-resid-t-gdd-std----------------------------------------------
ok.gls.resid.t <- krige(gls.resid.t ~ 1, loc=ne.m, newdata=dem.ne.m.sp,
                      model=vmf.r.gls.t)
ok.gls.resid.gdd <- krige(gls.resid.gdd ~ 1, loc=ne.m, newdata=dem.ne.m.sp,
                      model=vmf.r.gls.gdd)
dem.ne.m.df$ok.gls.resid.t <- ok.gls.resid.t$var1.pred
dem.ne.m.df$ok.gls.resid.gdd <- ok.gls.resid.gdd$var1.pred


## ----ok-gls-resid-t-gdd-std-maps, out.width='.8\\textwidth'--------------
ggplot() +
    geom_point(aes(x=E, y=N, colour=ok.gls.resid.t), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Residuals from GLS trend surface, MAT (std)") +
    scale_colour_distiller(name="T (std)", space="Lab", palette="RdBu")
ggplot() +
    geom_point(aes(x=E, y=N, colour=ok.gls.resid.gdd), data=dem.ne.m.df) +
    xlab("E") + ylab("N") + coord_fixed() +
    ggtitle("Residuals from GLS trend surface, GDD base 50F (std)") +
    scale_colour_distiller(name="GDD50 (std)", space="Lab", palette="RdBu")


## ----ok-gls-resid-t-gdd-diff, out.width='.8\\textwidth'------------------
summary(dem.ne.m.df$ok.gls.resid.diff.t.gdd <- 
            dem.ne.m.df$ok.gls.resid.t - dem.ne.m.df$ok.gls.resid.gdd)
display.difference.map("ok.gls.resid.diff.t.gdd", 
                       "Difference: kriged residuals from GLS trend surface",
                       "+/- s.d.",
                       .palette="RdGy")


## ----final-gls-rk-std, out.width='.8\\textwidth'-------------------------
summary(dem.ne.m.df$pred.rkgls.std.t <- 
            dem.ne.m.df$pred.gls.t + dem.ne.m.df$ok.gls.resid.t)
summary(dem.ne.m.df$pred.rkgls.std.gdd <- 
            dem.ne.m.df$pred.gls.gdd + dem.ne.m.df$ok.gls.resid.gdd)
display.prediction.map("pred.rkgls.std.t",
                       "GLS-RK prediction, Mean Annual Temperature, standardized",
                       "MAAT  (std)", std.pred.lim, .palette="YlOrBr")
display.prediction.map("pred.rkgls.std.gdd",
                       "GLS-RK prediction, Annual GDD, base 50F, standardized",
                       "GDD50 (std)", std.pred.lim, .palette="YlOrBr")


## ----final-gls-rk-std-diff, out.width='.8\\textwidth'--------------------

#
summary(dem.ne.m.df$diff.rkgls.std.t.gdd <-
            (dem.ne.m.df$pred.rkgls.std.t - dem.ne.m.df$pred.rkgls.std.gdd))
display.difference.map("diff.rkgls.std.t.gdd", 
                       "GLS-RK predictions, difference, MAT - GDD50, standardized",
                       "+/- s.d.",
                       .palette="BrBG")


## ----build-rf-two--------------------------------------------------------
m.rf.std.t <- randomForest(t.ann.std ~ ELEVATION_ + N + E,
                     data=ne.df, ntree=1200,
                     importance=TRUE)
m.rf.std.gdd <- randomForest(gdd50.std ~ ELEVATION_ + N + E,
                     data=ne.df, ntree=1200,
                     importance=TRUE)


## ----compare-rf-importance-----------------------------------------------
randomForest::importance(m.rf.std.t)
randomForest::importance(m.rf.std.gdd)


## ----compare-rf-11-oob, out.width='0.45\\linewidth'----------------------
plot(t.ann.std ~ predict(m.rf.std.t, newdata=ne.m), col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest", ylab="Actual",
     main="Mean Annual T (std)")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)
plot(gdd50.std ~ predict(m.rf.std.t, newdata=ne.m), col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest", ylab="Actual",
     main="Annual GDD50 (std)")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)
plot(t.ann.std ~ predict(m.rf.std.t), col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest (OOB)", ylab="Actual",
     main="Mean Annual T (std)")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)
plot(gdd50.std ~ predict(m.rf.std.t), col=ne.m$STATE, pch=20, asp=1,
     xlab="Fitted by random forest (OOB)", ylab="Actual",
     main="Annual GDD50 (std)")
legend("topleft", levels(ne.m$STATE), pch=20, col=1:4)
grid(); abline(0,1)


## ----predict-rf-two------------------------------------------------------
dem.ne.m.df$pred.rf.std.t <- predict(m.rf.std.t, newdata=dem.ne.m.df)
dem.ne.m.df$pred.rf.std.gdd <- predict(m.rf.std.gdd, newdata=dem.ne.m.df)
(std.pred.lim <- c(min(dem.ne.m.df[,c("pred.rf.std.t","pred.rf.std.gdd")]),
                       max(dem.ne.m.df[,c("pred.rf.std.t","pred.rf.std.gdd")])))
display.prediction.map("pred.rf.std.t",
                       "Mean Annual Temperature, standardized, RF prediction",
                       "GDD50", std.pred.lim, .palette="YlOrBr")
display.prediction.map("pred.rf.std.gdd",
                       "Annual GDD, base 50F, standardized, RF prediction",
                       "GDD50", std.pred.lim, .palette="YlOrBr")


## ----predict-rf-two-diff-------------------------------------------------
summary(dem.ne.m.df$diff.rf.std.t.gdd <-
    dem.ne.m.df$pred.rf.std.t - dem.ne.m.df$pred.rf.std.gdd)
display.difference.map("diff.rf.std.t.gdd", 
                       "RF predictions, difference, MAT - GDD50, standardized",
                       "+/- s.d.",
                       .palette="BrBG")


## ----child-data, child='exRKGLS_answers.Rnw'-----------------------------

## ----answers-set-parent, echo=FALSE, cache=FALSE-------------------------
set_parent('exRKGLS.Rnw')


## ----get-rt-row, echo=FALSE----------------------------------------------
ix.rt.l <- which(row.names(m.rt$frame)=="2")
ix.rt.r <- which(row.names(m.rt$frame)=="3")


## ----child-data, child='exRKGLS_challenge.Rnw'---------------------------

## ----challenge-set-parent, echo=FALSE, cache=FALSE-----------------------
set_parent('exRKGLS.Rnw')


## ----child-data, child='exRKGLS_colours.Rnw'-----------------------------

## ----colours-set-parent, echo=FALSE, cache=FALSE-------------------------
set_parent('exRKGLS.Rnw')


## ----seq-brewers, out.width='\\linewidth', h=6, w=8----------------------
library(RColorBrewer)
display.brewer.all(type="seq")


## ----display-annual-colored, out.width='\\textwidth'---------------------
ggplot(data=ne.df) +
    aes(x=E, y=N) +
    geom_point(aes(size=ANN_GDD50, colour=ANN_GDD50),
               shape=20) +
    scale_colour_distiller(space="Lab",
                           palette="Greens") +
        xlab("E") +  ylab("N") + coord_fixed() 


## ----seq-brewers-div, out.width='\\linewidth', h=6, w=8------------------
display.brewer.all(type="div")