## ----template:intro, echo=FALSE------------------------------------------
# Welcome to the Togaware Data Science End-to-End Template WeatherAUS ----
# 
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Essentials of Data Science from CRC Press.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

## ----template:attach_packages, echo=FALSE, message=FALSE, warning=FALSE----
# Load required packages from local library into R.

library(magrittr)     # Pipe operator %>% %<>% %T>% equals().
library(lubridate)    # Dates and time.
library(rattle)       # normVarNames().
library(ROCR)         # Use prediction() for evaluation.
library(rpart)        # Model: decision tree.
library(scales)       # Include commas in numbers.
library(stringi)      # String concat operator %s+%.
library(tidyverse)    # ggplot2, tibble, tidyr, readr, purr, dplyr, stringr

## ----template:data_source, message=FALSE---------------------------------
# Original dataset source/location.

dsorig <- file.path("https://rattle.togaware.com/weatherAUS.csv")

# Name of the dataset.

dsname <- "weatherAUS"

# Identify the Essentials location of the dataset.

dsloc  <- "https://essentials.togaware.com"
dspath <- file.path(dsloc, dsname %s+% ".csv") %T>% print()

## ----template:data_ingest, message=FALSE---------------------------------
# Ingest the dataset.

dspath %>% read_csv() %>% assign(dsname, ., envir=.GlobalEnv)

## ----template:set_template_variable--------------------------------------
# Store the dataset with a generic template variable name.

dsname %>% get() %T>% print() -> ds

## ----template:norm_var_names---------------------------------------------
# Normalise the variable names.

names(ds) %<>% normVarNames() %T>% print()

# Fix specific variable names.

names(ds)[23] <- c("rainfall_tomorrow")

# Check the names.

names(ds)

## ----template:identifiers------------------------------------------------
# Note any identifiers.

id <- c("date", "location")

# Note the target variable.

target <- "rain_tomorrow"

# Note any risk variable - measures the severity of the outcome.

risk <- "rainfall_tomorrow"

## ----template:variable_roles, out.lines=NULL-----------------------------
# Note available variables ignoring identifiers and risk, with target first.

ds %>%
  names() %>% 
  setdiff(c(id, risk)) %>%
  c(target, .) %>%
  unique() %T>% 
  print() ->
vars

## ----template:ignore-----------------------------------------------------
# We will sometimes want to ignore specific variables.

ignore <- NULL

# Remove variables to ignore from the variable list.

vars %<>% setdiff(ignore) %T>% print()

## ----template:numeric_data-----------------------------------------------
# Fix specific numeric variables.

cvars <- c("evaporation", "sunshine")
ds[cvars] %<>% sapply(as.numeric)

# Remove observations without a value for rain_tomorrow.

ds %<>% drop_na(rain_tomorrow)

## ----template:characters-------------------------------------------------
# Identify the character variables by index.

chari <- ds[vars] %>% sapply(is.character) %>% which() %T>% print()

# Identify the chracter variables by name.

charc <- ds[vars] %>% names() %>% '['(chari) %T>% print()

## ----template:character_levels-------------------------------------------
# Observe the unique levels.

ds[charc] %>% sapply(unique)

## ----template:data_wrangling---------------------------------------------
# Convert all character to factor if determined appropriate.

ds[charc] %<>% map(factor)

## ----template:observe, out.lines=NULL------------------------------------
# A glimpse into the dataset.

glimpse(ds)

## ----template:visualise, fig.height=3------------------------------------
# Visualise relationships in the data.

ds %>%
  filter(location %in% c("Canberra", "Darwin", "Sydney")) %>%
  filter(temp_3pm %>% is.na() %>% not()) %>%
  select(temp_3pm, location) %>%
  ggplot(aes(x=temp_3pm, colour=location, fill=location)) +
  geom_density(alpha=0.55)

## ----template:model_formula----------------------------------------------
# Formula for modelling.

ds[vars] %>% 
  formula() %T>% 
  print() ->
form

## ----template:target_as_categoric----------------------------------------
# Ensure the target is categoric.

ds[[target]] %<>% factor()

## ----template:ipnuts_nobs------------------------------------------------
# Identify the input variables by name.

inputs <- setdiff(vars, target) %T>% print()

# Record the number of observations.

nobs <- nrow(ds) %T>% comcat()

## ----template:model_setup------------------------------------------------
# Initialise random numbers for repeatable results.

seed <- 123
set.seed(seed)

# Partition the full dataset into three: train (70%), validate (15%), test (15%).

nobs %>%
  sample(0.70*nobs) %T>% 
  {length(.) %>% comma() %>% cat("Size of training dataset:", ., "\n")} ->
train

## ----template:validation-------------------------------------------------
# Create a validation dataset of 15% of the observations.

nobs %>%
  seq_len() %>% 
  setdiff(train) %>% 
  sample(0.15*nobs) %T>%
  {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} ->
validate

## ----template:test-------------------------------------------------------
# Create a testing dataset of 15% (the remainder) of the observations.

nobs %>%
  seq_len() %>%
  setdiff(union(train, validate)) %T>%
  {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} ->
test

## ----template:evaluation_data--------------------------------------------
# Cache the various actual values for target and risk.

tr_target <- ds[train,][[target]]    %T>% {head(., 15) %>% print()}
tr_risk   <- ds[train,][[risk]]      %T>% {head(., 15) %>% print()}

va_target <- ds[validate,][[target]] %T>% {head(., 15) %>% print()}
va_risk   <- ds[validate,][[risk]]   %T>% {head(., 15) %>% print()}

te_target <- ds[test,][[target]]     %T>% {head(., 15) %>% print()}
te_risk   <- ds[test,][[risk]]       %T>% {head(., 15) %>% print()}

## ----template:rpart------------------------------------------------------
# Splitting function: "anova" "poisson" "class" "exp"

mthd <- "class"

# Splitting function parameters.

prms <- list(split="information")

# Control the training.

ctrl <- rpart.control(maxdepth=5)

# Build the model

m_rp <- rpart(form, ds[train, vars], method=mthd, parms=prms, control=ctrl)

## ----template:model_generics---------------------------------------------
# Capture the model in generic variables.

model <- m_rp
mtype <- "rpart"
mdesc <- "Decision Tree"

## ----template:model_review, out.lines=NULL-------------------------------
# Basic model structure.

model

## ----template:model_plot-------------------------------------------------
# Visually expose the discovered knowledge.

fancyRpartPlot(model)

## ----template:model_summary, fig.height=6, out.lines=40------------------
# Complete model build summary.

summary(model)

## ----template:varimp, fig.height=3---------------------------------------
# Review which importance of the variables.

ggVarImp(model)

## ----template:evaluate---------------------------------------------------
# Predict on validation dataset to judge performance.

model %>%
  predict(newdata=ds[validate, vars], type="class") %>%
  set_names(NULL) %T>%
  {head(., 20) %>% print()} ->
va_class

model %>% 
  predict(newdata=ds[validate, vars], type="prob") %>%
  .[,2] %>%
  set_names(NULL) %>%
  round(2) %T>%
  {head(., 20) %>% print()} ->
va_prob

## ----template:accuracy_error---------------------------------------------
# Overall accuracy and error.

sum(va_class == va_target) %>%
  divide_by(length(va_target)) %T>%
  {
    percent(.) %>%
      sprintf("Overall accuracy = %s\n", .) %>%
      cat()
  } ->
va_acc

sum(va_class != va_target) %>%
  divide_by(length(va_target)) %T>%
  {
    percent(.) %>%
      sprintf("Overall error = %s\n", .) %>%
      cat()
  } ->
va_err

## ----template:confusion_matrix-------------------------------------------
# Basic comparison of prediction/actual as a confusion matrix.

table(va_target, va_class, useNA="ifany", dnn=c("Actual", "Predicted"))

# Comparison as percentages of all observations.

errorMatrix(va_target, va_class) %T>%
  print() ->
va_matrix

# Error rate and average of the class error rate.

va_matrix %>%
  diag() %>%
  sum(na.rm=TRUE) %>%
  subtract(100, .) %>%
  sprintf("Overall error percentage = %s%%\n", .) %>%
  cat()

va_matrix[,"Error"] %>%
  mean(na.rm=TRUE) %>%
  sprintf("Averaged class error percentage = %s%%\n", .) %>%
  cat()

## ----template:metrics----------------------------------------------------
# Other performance metrics: recall, precision, and the F-score.

va_rec <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[2,1])) %T>%
  {percent(.) %>% sprintf("Recall = %s\n", .) %>% cat()}

va_pre <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[1,2])) %T>%
  {percent(.) %>% sprintf("Precision = %s\n", .) %>% cat()}

va_fsc <- ((2 * va_pre * va_rec)/(va_rec + va_pre))  %T>%
  {sprintf("F-Score = %.3f\n", .) %>% cat()}

## ----template:roc--------------------------------------------------------
# Calculate the area under the curve (AUC).

va_prob %>%
  prediction(va_target) %>%
  performance("auc") %>%
  attr("y.values") %>%
  .[[1]] %T>%
  {
    percent(.) %>%
    sprintf("Percentage area under the ROC curve = %s\n", .) %>%
    cat()
  } ->
va_auc

# Calculate measures required to plot the ROC Curve.

va_prob %>%
  prediction(va_target) %>%
  performance("tpr", "fpr") ->
va_rates

## ----template:roc_plot, fig.height=5-------------------------------------
# Plot the ROC Curve.

data_frame(tpr=attr(va_rates, "y.values")[[1]], 
           fpr=attr(va_rates, "x.values")[[1]]) %>%
  ggplot(aes(fpr, tpr)) +
  geom_line() +
  annotate("text", x=0.875, y=0.125, vjust=0,
           label=paste("AUC =", percent(va_auc))) +
  labs(title="ROC - " %s+% mtype %s+% " - Validation Dataset",
       x="False Positive Rate (1-Specificity)",
       y="True Positive Rate (Sensitivity)")

## ----template:risk_chart, fig.height=6-----------------------------------
# Risk chart.

riskchart(va_prob, va_target, va_risk) +
  labs(title="Risk Chart - " %s+%
         mtype %s+%
         " - Validation Dataset") +
  theme(plot.title=element_text(size=14))