# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# General Setup.
#
# File: 00_setup.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

# Load required packages from local library into R.

library(tidyverse)    # ggplot2, tibble, tidyr, readr, purrr, dplyr
library(rattle)       # comcat(), weatherAUS, normVarNames().
library(magrittr)     # Pipe operator %>% %<>% %T>% equals().
library(lubridate)    # Dates and time.
library(stringi)      # String concat operator %s+%.
library(stringr)      # String manipulation: str_replace().
library(randomForest) # Impute missing values with na.roughfix()
library(FSelector)    # Feature selection: information.gain().
library(scales)       # Include commas in numbers.
library(xtable)       # Generate LaTeX tables.

# Welcome to the Togaware Data Science Data Template
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Ingestion.
#
# File: 10_ingest.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# If you are not familiar yet with literate programming you can
# make use of these R scripts as templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Literate Data Science Using R from CRC Press.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

# Name of the dataset.

dsname <- "weatherAUS"

# Identify the source location of the dataset.

dsloc <- "data"

# Alternatives might include:
#
# dsloc <- "C:/Users/graham/Projects"
# dsloc <- "~/projects"
# dsloc <- "http://rattle.togaware.com"

# Construct the path to the dataset and display some of it.
#
# The preview reads the first ten lines with readLines() rather than
# shelling out to the Unix head command, so it also works on Windows,
# with paths that contain spaces and with the URL form of dsloc shown
# above. readLines() accepts LF, CRLF and CR as line endings, so no
# carriage return clean-up is needed before printing.

dsname %s+% ".csv" %>%
  file.path(dsloc, .) %T>%
  cat("Dataset:", ., "\n\n") %T>%
  {
    readLines(., n=10L) %>%
      print()
  } ->
dspath

# Read the CSV file found at dspath into a data frame.

weatherAUS <- read_csv(dspath)

weatherAUS

# Bind the ingested data to the generic name used by the template.

ds <- get(dsname)

# Keep a binary RData copy of the ingested dataset on disk as a backup.

fpath <- "data"

fname <- file.path(fpath, dsname %s+% ".RData")
cat("Saving:", fname, "\n\n")

# Create the folder if needed and only write the backup when no file
# of that name exists yet.

if (! dir.exists(fpath)) {
  dir.create(fpath)
}
if (! file.exists(fname)) {
  save(weatherAUS, file=fname)
}

# Drop the original object to save on memory (ds still refers to the data).

rm(weatherAUS)

# Check the backup can be restored, printing the names of the objects loaded.

print(load(fname))

# Remove the restored copy again to save on memory.

rm(weatherAUS)

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Observation.
#
# File: 20_observe.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

# Take a quick look at the structure of the dataset.

ds %>% glimpse()

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Wrangling.
#
# File: 30_prepare.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

# List the variable names as ingested to decide whether to normalise them.

names(ds)

# Keep a copy of the original variable names for later reference.

vnames <- names(ds)

# Replace the variable names with their normalised form and show them.

names(ds) <- normVarNames(names(ds))
print(names(ds))

# Key the original names by the normalised names.

names(vnames) <- names(ds)

# The original name can then be looked up from the normalised name.

cat(vnames["min_temp"])

# Alternatively ingest the dataset again and normalise the names of the
# freshly read copy.

weatherAUS <- read_csv(dspath)
names(weatherAUS) <- normVarNames(names(weatherAUS))

weatherAUS

# A compact overview of every variable.

ds %>% glimpse()

# The first few observations, printed as a plain data frame.

print.data.frame(head(ds))

# The last few observations.

print.data.frame(tail(ds))

# Six observations chosen at random.

print.data.frame(sample_n(ds, size=6))

# The traditional per-variable summary as a starting point.

summary(ds)

# Date data type conversion (if required). Set the appropriate date format.
#
# Report the class of the column as ingested. The check is made on the
# column itself: converting to character first would always report
# "character" whatever the column actually holds.

class(ds$date)

# Parse the values as year-month-day dates and store them as Date.

ds$date %<>% as.character() %>% ymd() %>% as.Date()

# Confirm the class after conversion and review the first few values.

class(ds$date)
head(ds$date)

# How many locations are represented in the dataset.

ds$location %>% 
  unique() %>%
  length()

# Review the distribution of observations across levels.

ds %>%
  select(starts_with("rain_")) %>%
  sapply(table)

# Note the names of the rain variables.

ds %>% 
  select(starts_with("rain_")) %>% 
  names() %T>%
  print() ->
vrain

# Confirm these are currently character variables.

ds[vrain] %>% sapply(class)

# Choose to convert these variables from character to factor.
#
# The list returned by lapply() is assigned straight back into the
# selected columns. This avoids the round trip through data.frame()
# and the deprecated tbl_df().

ds[vrain] %<>% lapply(factor)

# Confirm they are now factors.

ds[vrain] %>% sapply(class)

# Verify the distribution has not changed.

ds %>%
  select(starts_with("rain_")) %>%
  sapply(table)

# Build a LaTeX formatted list of the wind direction variable names,
# each wrapped as \verb|name| and separated by commas, with "and"
# inserted before the final name.
#
# In the pattern the R string "\\\\verb" is the regular expression
# \\verb, which matches the literal text \verb. With an extra escaped
# backslash the expression becomes \\ followed by \v, which does not
# match the text and so leaves the list without its "and".

ds %>% 
  select(contains("_dir")) %>%
  names() %>%
  paste(collapse="|, \\verb|") %>%
  paste0("\\verb|", . , "|") %>%
  str_replace(", (\\\\verb[^,]+)$", ", and \\1") ->
wgvars

# Review the distribution of observations across levels.

ds %>%
  select(contains("_dir")) %>%
  sapply(table)

# Levels of wind direction are ordered compass directions.

compass <- c("N", "NNE", "NE", "ENE",
             "E", "ESE", "SE", "SSE",
             "S", "SSW", "SW", "WSW",
             "W", "WNW", "NW", "NNW")

# Note the names of the wind direction variables.

ds %>% 
  select(contains("_dir")) %>% 
  names() %T>%
  print() ->
vdir

# Confirm these are currently character variables.

ds[vdir] %>% sapply(class)

# Convert these variables from character to ordered factor using the
# compass ordering for the levels.
#
# The list returned by lapply() is assigned straight back into the
# selected columns. This avoids the round trip through data.frame()
# and the deprecated tbl_df().

ds[vdir] %<>% lapply(factor, levels=compass, ordered=TRUE)

# Confirm they are now factors.

ds[vdir] %>% sapply(class)

# Verify the distribution has not changed.

ds %>%
  select(contains("_dir")) %>%
  sapply(table)

# Note the remaining variables to be converted to numeric.

cvars <- c("evaporation", "sunshine")

# Review the values.

head(ds[cvars])
tail(ds[cvars])
sample_n(ds[cvars], 6)

# Check the current class of the variables.

ds[cvars] %>% sapply(class)

# Convert to numeric.
#
# lapply() always returns one vector per variable, which is what the
# column assignment expects. sapply() would simplify the result to a
# matrix, or to a plain vector when there is a single observation.

ds[cvars] %<>% lapply(as.numeric)

# Confirm the conversion.

ds[cvars] %>% sapply(class)

## Identifiers and Targets ----------------

# Record the names of all variables currently in the dataset.

vars <- names(ds)
print(vars)

# Name of the variable to be predicted.

target <- "rain_tomorrow"

# Move the target to the front of the variable list, without duplicates.

vars <- unique(c(target, vars))
print(vars)

# Name of the risk variable - measures the severity of the outcome.

risk <- "risk_mm"

# Names of the identifier variables.

id <- c("date", "location")

## Generic Data Wrangling ----------------

# Start the list of ignored variables with the identifiers and, if one
# has been defined, the risk variable.

ignore <- union(id, if (exists("risk")) risk)
print(ignore)

# Heuristic for identifiers to possibly ignore: variables having as many
# distinct values as there are observations.

ids <- names(which(
  sapply(ds[vars], function(column) length(unique(column))) == nrow(ds)
))
print(ids)

# Any such variables are added to those ignored for modelling.

ignore <- union(ignore, ids)
print(ignore)

# Variables for which every observation is missing.

missing <- names(which(
  sapply(ds[vars], function(column) sum(is.na(column))) == nrow(ds)
))
print(missing)

# Any such variables are added to those ignored for modelling.

ignore <- union(ignore, missing)
print(ignore)

# Proportion of missing values above which a variable is discarded.

missing.threshold <- 0.7

# Variables whose count of missing values exceeds that proportion of
# the observations.

mostly <- ds[vars] %>%
  sapply(function(column) sum(is.na(column))) %>%
  is_greater_than(missing.threshold*nrow(ds)) %>%
  which() %>%
  names()
print(mostly)

# Any such variables are added to those ignored for modelling.

ignore <- union(ignore, mostly)
print(ignore)

# Number of levels at or above which a factor has too many levels.

levels.threshold <- 20

# Find the factors amongst the variables and then keep those whose
# number of levels reaches the threshold.

too.many <- ds[vars] %>%
  sapply(is.factor) %>%
  which() %>%
  names() %>%
  sapply(function(v) length(levels(ds[[v]]))) %>%
  is_weakly_greater_than(levels.threshold) %>%
  which() %>%
  names()
print(too.many)

# Any such variables are added to those ignored for modelling.

ignore <- union(ignore, too.many)
print(ignore)

# Variables where every value equals the first value, that is, constants.

constants <- names(which(
  sapply(ds[vars], function(column) all(column == column[1L]))
))
print(constants)

# Any such variables are added to those ignored for modelling.

ignore <- union(ignore, constants)
print(ignore)

# Note which variables are numeric, leaving out those already ignored.

vars %>%
  setdiff(ignore) %>%
  '['(ds, .) %>%
  sapply(is.numeric) %>% 
  which() %>%
  names() %T>%
  print() ->
numc

# For the numeric variables generate a table of correlations.
#
# The upper triangle and diagonal of the correlation matrix are set to
# NA so that each pair of variables is listed once. The matrix is then
# reshaped to one row per pair and ordered by decreasing absolute
# correlation. as_tibble() is used in place of the deprecated tbl_df().

ds[numc] %>%
  cor(use="complete.obs") %>%
  ifelse(upper.tri(., diag=TRUE), NA, .) %>% 
  abs() %>% 
  data.frame() %>%
  as_tibble() %>%
  set_colnames(numc) %>%
  mutate(var1=numc) %>% 
  gather(var2, cor, -var1) %>% 
  na.omit() %>%
  arrange(-abs(cor)) %T>%
  print() ->
mc

# Variables chosen for removal because they are highly correlated
# with others.

correlated <- c("temp_3pm", "pressure_3pm", "temp_9am")

# They are added to the variables ignored for modelling.

ignore <- union(ignore, correlated)
print(ignore)

# Number of variables before removing those to be ignored.

length(vars)

# Drop the ignored variables from the variable list.

vars <- setdiff(vars, ignore)
print(vars)

# Number of variables remaining.

length(vars)

## Variable Selection ----------------

# Formula for modelling: the target against all other variables.

form <- formula(paste0(target, " ~ ."))
print(form)

# Use correlation search to identify key variables.
# Could be useful to decide which variables to retain.

cfs(form, ds[vars])

# Placeholder for dropping variables found not to be useful (none here).

vars <- setdiff(vars, NULL)
print(vars)

# Use information gain to identify variable importance, listed in
# increasing order of importance.

arrange(rownames_to_column(information.gain(form, ds[vars])),
        attr_importance)

# Placeholder for dropping variables found not to be useful (none here).

vars <- setdiff(vars, NULL)
print(vars)

## Further Wrangling ----------------

# Dimensions of the dataset before removing any observations.

comcat(dim(ds))

# Flag the observations with a missing target and report how many.

missing.target <- is.na(ds[[target]])
comcat(sum(missing.target))

# Keep only the observations having a target value.

ds <- filter(ds, !missing.target)

# Dimensions of the dataset after the filter.

comcat(dim(ds))

## Optional: Missing Value Imputation ----------------

# Number of missing values across the selected variables.

comcat(sum(is.na(ds[vars])))

# Fill in the missing values using na.roughfix().

ds[vars] <- na.roughfix(ds[vars])

# Number of missing values remaining after imputation.

comcat(sum(is.na(ds[vars])))

## Optional: Remove Observations With Missing Values ----------------

# Start with no observations marked for removal.

omit <- NULL

# Number of observations and of missing values before removal.

comcat(nrow(ds[vars]))
comcat(sum(is.na(ds[vars])))

# The na.action attribute recorded by na.omit() identifies the
# observations having any missing value.

mo <- attr(na.omit(ds[vars]), "na.action")
print(mo)

# Add them to the observations to omit and report how many there are.

omit <- union(omit, mo)
print(length(omit))

# Remove the marked observations, if any.

if (length(omit)) {
  ds <- ds[-omit, ]
}

# Number of observations and of missing values after removal.

comcat(nrow(ds[vars]))
comcat(sum(is.na(ds[vars])))

## Normalise Factors ----------------

# Names of the selected variables that are factors.

catc <- names(which(sapply(ds[vars], is.factor)))
print(catc)

# Levels before normalisation.

sapply(ds[catc], levels)

# Apply normVarNames() to the levels of each categoric variable.

for (v in catc) {
  levels(ds[[v]]) <- normVarNames(levels(ds[[v]]))
}

# Levels after normalisation.

sapply(ds[catc], levels)

## Categoric Target ----------------

# Report the current class of the target and convert it to a factor.

class(ds[[target]])

ds[[target]] <- as.factor(ds[[target]])

# Tabulate the target values.

ds[target] %>% table()

# Bar chart of the target distribution with comma formatted counts.

ggplot(ds, aes_string(x=target)) +
  geom_bar(width=0.2, fill="grey") +
  scale_y_continuous(labels=comma) +
  theme(text=element_text(size=14))

## Numeric Target - Alternative ----------------

# This section is the alternative to the categoric target section and
# only applies when the target is not a factor. Calling as.numeric() on
# a factor returns the integer level codes rather than the values, so
# running it after the categoric conversion would silently replace the
# class labels with codes. A factor target is therefore left untouched.
#
# Results are printed explicitly because values computed inside the
# if block are not printed automatically.

if (! is.factor(ds[[target]])) {

  # Ensure the target is numeric.

  print(class(ds[[target]]))

  ds[[target]] %<>% as.numeric()

  # Confirm the distribution.

  print(summary(ds[target]))

  print(
    ds %>%
      ggplot(aes_string(x=target)) +
      geom_histogram(fill="grey", col="black", binwidth=20) +
      theme(text=element_text(size=14))
  )
}

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Preparation.
#
# File: 40_meta.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

#### META DATA --------------------------------

# Identify the input variables by name: the selected variables
# other than the target.

inputs <- setdiff(vars, target) %T>% print()

# Identify the input variables by index.
#
# match() always returns an integer vector with one position per input.
# The sapply()/which() form it replaces returns a list when inputs is
# empty or when a name is not found in the dataset.

inputi <- match(inputs, names(ds)) %T>% print()

# Record the number of observations.

nobs <- nrow(ds) %T>% comcat()

# Confirm various subset sizes.

dim(ds)         %>% comcat()
dim(ds[vars])   %>% comcat()
dim(ds[inputs]) %>% comcat()
dim(ds[inputi]) %>% comcat()

# Column indices of the input variables that are numeric.

numi <- intersect(which(sapply(ds, is.numeric)), inputi)
print(numi)

# The corresponding numeric variable names.

numc <- names(ds)[numi]
print(numc)

# Column indices of the input variables that are factors.

cati <- intersect(which(sapply(ds, is.factor)), inputi)
print(cati)

# The corresponding categoric variable names.

catc <- names(ds)[cati]
print(catc)

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Cache the Data to Disk.
#
# File: 50_save.R
#
# This template provides a starting point for the 
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
# 
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader 
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
# 
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the 
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA 
#
# DOCVERSION

#### SAVE THE DATASET --------------------------------

# The general approach stamps the dataset with today's date.

dsdate <- "_" %s+% format(Sys.Date(), "%Y%m%d")
print(dsdate)

# For convenience a fixed timestamp is used here instead.

dsdate <- "_20180702"

# Path of the RData file the dataset is saved to.

dsrdata <- file.path(fpath, dsname %s+% dsdate %s+% ".RData")
print(dsrdata)

# Write the dataset and its metadata, listed by name, to the binary
# RData file.

save(list=c("ds", "dsname", "dspath", "dsdate", "nobs",
            "vars", "target", "risk", "id", "ignore", "omit",
            "inputi", "inputs", "numi", "numc", "cati", "catc"),
     file=dsrdata)