# Welcome to the Togaware Data Science Data Template ----
# XX

# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# General Setup.
#
# File: 00_setup.R

# This template provides a starting point for the
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.

# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.

# The template is under regular revision and improvement
# and is provided as is.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

# Name of the dataset.

dsname <- "weatherAUS"

# Placeholder for the path to the dataset. It is overwritten further
# below once the full path has been constructed from dsloc and dsname.

dspath <- "data/."

# Identify the source location of the dataset.

dsloc <- "data"

# Alternatives might include:
#
# dsloc <- "C:/Users/graham/Projects"
# dsloc <- "~/projects"
# dsloc <- "http://rattle.togaware.com"

# Construct the path to the dataset and display some of it.

dspath <- file.path(dsloc, dsname %s+% ".csv")

cat("Dataset:", dspath, "\n\n")

# Preview the first ten lines of the raw file from within R rather
# than shelling out to the head command. The shell command is not
# available on every platform (e.g., Windows), breaks on paths that
# contain spaces, and cannot read a URL such as the http alternative
# for dsloc suggested above. readLines() accepts file paths and URLs
# and strips the line terminators itself, so no carriage return fix
# up is required.

dspath %>%
  readLines(n=10) %>%
  print()

# Read the CSV file found at dspath into memory.

weatherAUS <- read_csv(dspath)

# Display the freshly ingested dataset.

weatherAUS

# The template works with a generically named copy, ds, which is
# looked up through the dataset name recorded in dsname.

ds <- get(dsname)

# Folder and file name for a binary RData backup of the dataset.

fpath <- "data"
fname <- file.path(fpath, dsname %s+% ".RData")

cat("Saving:", fname, "\n\n")

# Create the folder when it is absent, and write the backup only
# when no such file exists yet.

if (! dir.exists(fpath)) {
  dir.create(fpath)
}

if (! file.exists(fname)) {
  save(weatherAUS, file=fname)
}

# Drop the in-memory copy of the original dataset to save on memory.

rm(weatherAUS)

# Check that the backup loads, printing the names of the objects
# that load() reports.

print(load(fname))

# Remove the reloaded copy again to save on memory.

rm(weatherAUS)

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Observation.
#
# File: 20_observe.R

# This template provides a starting point for the
# data scientist exploring a new dataset.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

# Take a first look at the variables of the dataset and their values.

ds %>% glimpse()

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Data Wrangling.
#
# File: 30_prepare.R

# This template provides a starting point for the
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.

# This R script is automatically extracted from a knitr
# file with a .Rnw extension.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

# Look over the variable names as ingested, before any normalisation.

ds %>% names()

# Keep a copy of the original names so they can be recovered later.

vnames <- ds %>% names()

# Normalise the variable names in place and show the result.

names(ds) <- normVarNames(names(ds))
print(names(ds))

# Key the original variable names by their normalised counterparts.

vnames <- setNames(vnames, names(ds))

# The original name of a variable can now be looked up by its new name.

cat(vnames["min_temp"])

# Ingest the dataset again, this time normalising the variable names
# as part of the ingestion.

weatherAUS <- read_csv(dspath)
weatherAUS <- set_names(weatherAUS, normVarNames(names(weatherAUS)))

weatherAUS

# Take another glimpse, now with the normalised variable names.

ds %>% glimpse()

# First few observations, printed as a plain data frame.

print.data.frame(head(ds))

# Last few observations, printed as a plain data frame.

print.data.frame(tail(ds))

# Six randomly sampled observations, printed as a plain data frame.

print.data.frame(sample_n(ds, size=6))

# The traditional per-variable summary as a starting point.

summary(ds)

# Date data type conversion (if required). Set the appropriate date format.

head(ds$date) %>% as.character() %>% class()

ds$date %<>% as.character() %>% ymd() %>% as.Date()

class(ds$date)
head(ds$date)

# How many locations are represented in the dataset.

ds$location %>%
  unique() %>%
  length()

# Review the distribution of observations across levels.

ds %>%
  select(starts_with("rain_")) %>%
  sapply(table)

# Note the names of the rain variables.

ds %>%
  select(starts_with("rain_")) %>%
  names() %T>%
  print() -> vrain

# Confirm these are currently character variables.

ds[vrain] %>% sapply(class)

# Choose to convert these variables from character to factor.
# as_tibble() is used in place of the deprecated tbl_df().

ds[vrain] %<>%
  lapply(factor) %>%
  data.frame() %>%
  as_tibble()

# Confirm they are now factors.

ds[vrain] %>% sapply(class)

# Verify the distribution has not changed.

ds %>%
  select(starts_with("rain_")) %>%
  sapply(table)

# Collect the names of the wind direction variables into a single
# string marked up with \verb for use in the LaTeX narrative.

ds %>%
  select(contains("_dir")) %>%
  names() %>%
  paste(collapse="|, \\verb|") %>%
  paste0("\\verb|", . , "|") %>%
  str_replace(", (\\\\\\verb[^,]+)$", ", and \\1") -> wgvars

# Review the distribution of observations across levels.

ds %>%
  select(contains("_dir")) %>%
  sapply(table)

# Levels of wind direction are ordered compass directions.

compass <- c("N", "NNE", "NE", "ENE",
             "E", "ESE", "SE", "SSE",
             "S", "SSW", "SW", "WSW",
             "W", "WNW", "NW", "NNW")

# Note the names of the wind direction variables.

ds %>%
  select(contains("_dir")) %>%
  names() %T>%
  print() -> vdir

# Confirm these are currently character variables.

ds[vdir] %>% sapply(class)

# Convert these variables from character to ordered factor.

ds[vdir] %<>%
  lapply(factor, levels=compass, ordered=TRUE) %>%
  data.frame() %>%
  as_tibble()

# Confirm they are now factors.

ds[vdir] %>% sapply(class)

# Verify the distribution has not changed.

ds %>%
  select(contains("_dir")) %>%
  sapply(table)

# Note the remaining character variables to be dealt with.

cvars <- c("evaporation", "sunshine")

# Review the values.

head(ds[cvars])
tail(ds[cvars])
sample_n(ds[cvars], 6)

# Check the current class of the variables.

ds[cvars] %>% sapply(class)

# Convert to numeric. lapply() is used so that a list with one
# numeric vector per column is assigned back, whatever the number
# of observations; sapply() would simplify to a matrix or vector.

ds[cvars] %<>% lapply(as.numeric)

# Confirm the conversion.

ds[cvars] %>% sapply(class)

## Identifiers and Targets ----------------

# Note the available variables.

ds %>%
  names() %T>%
  print() -> vars

# Note the target variable.

target <- "rain_tomorrow"

# Place the target variable at the beginning of the vars.

c(target, vars) %>%
  unique() %T>%
  print() -> vars

# Note the risk variable - measures the severity of the outcome.

risk <- "risk_mm"

# Note any identifiers.

id <- c("date", "location")

## Generic Data Wrangling ----------------

# Initialise ignored variables: identifiers and risk.

ignore <- union(id, if (exists("risk")) risk) %T>% print()

# Heuristic for identifiers to possibly ignore: variables having as
# many unique values as there are observations.

ds[vars] %>%
  sapply(function(x) x %>% unique() %>% length()) %>%
  equals(nrow(ds)) %>%
  which() %>%
  names() %T>%
  print() -> ids

# Add them if any to the variables to be ignored for modelling.

ignore <- union(ignore, ids) %T>% print()

# Identify variables with only missing values.

ds[vars] %>%
  sapply(function(x) x %>% is.na %>% sum) %>%
  equals(nrow(ds)) %>%
  which() %>%
  names() %T>%
  print() -> missing

# Add them if any to the variables to be ignored for modelling.

ignore <- union(ignore, missing) %T>% print()

# Identify a threshold above which proportion missing is fatal.

missing.threshold <- 0.7

# Identify variables that are mostly missing.

ds[vars] %>%
  sapply(function(x) x %>% is.na() %>% sum()) %>%
  '>'(missing.threshold*nrow(ds)) %>%
  which() %>%
  names() %T>%
  print() -> mostly

# Add them if any to the variables to be ignored for modelling.

ignore <- union(ignore, mostly) %T>% print()

# Identify a threshold above which we have too many levels.

levels.threshold <- 20

# Identify variables that have too many levels.

ds[vars] %>%
  sapply(is.factor) %>%
  which() %>%
  names() %>%
  sapply(function(x) ds %>% extract2(x) %>% levels() %>% length()) %>%
  '>='(levels.threshold) %>%
  which() %>%
  names() %T>%
  print() -> too.many

# Add them if any to the variables to be ignored for modelling.

ignore <- union(ignore, too.many) %T>% print()

# Identify variables that have a single value.

ds[vars] %>%
  sapply(function(x) all(x == x[1L])) %>%
  which() %>%
  names() %T>%
  print() -> constants

# Add them if any to the variables to be ignored for modelling.

ignore <- union(ignore, constants) %T>% print()

# Note which variables are numeric.

vars %>%
  setdiff(ignore) %>%
  '['(ds, .) %>%
  sapply(is.numeric) %>%
  which() %>%
  names() %T>%
  print() -> numc

# For the numeric variables generate a table of correlations,
# keeping each pair once and listing the strongest first.

ds[numc] %>%
  cor(use="complete.obs") %>%
  ifelse(upper.tri(., diag=TRUE), NA, .) %>%
  abs %>%
  data.frame %>%
  as_tibble %>%
  set_colnames(numc) %>%
  mutate(var1=numc) %>%
  gather(var2, cor, -var1) %>%
  na.omit %>%
  arrange(-abs(cor)) %T>%
  print() -> mc

# Any variables could be removed because highly correlated?

# NOTE(review): the variables collected in ignore are not removed from
# vars anywhere in the visible script, while inputs is later derived
# from vars - confirm whether a step is missing here.

# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

#### META DATA --------------------------------

# Identify the input variables by name: every variable except the target.

inputs <- setdiff(vars, target) %T>% print()

# Identify the input variables by index. match() is vectorised and
# always returns an integer vector with one position per input,
# whereas sapply() over which() yields a list when there are no
# inputs or when a name is not found amongst the names of ds.

inputi <- match(inputs, names(ds)) %T>% print()

# Keep the number of observations and report it.

nobs <- nrow(ds)
comcat(nobs)

# Report the dimensions of the full dataset and of its subsets by
# variable name, input name, and input index.

comcat(dim(ds))
comcat(dim(ds[vars]))
comcat(dim(ds[inputs]))
comcat(dim(ds[inputi]))

# Indices of the input variables that are numeric.

numi <- intersect(which(sapply(ds, is.numeric)), inputi)
print(numi)

# Names of the input variables that are numeric.

numc <- names(ds)[numi]
print(numc)

# Indices of the input variables that are categoric (factors).

cati <- intersect(which(sapply(ds, is.factor)), inputi)
print(cati)

# Names of the input variables that are categoric (factors).

catc <- names(ds)[cati]
print(catc)

# Welcome to the Togaware Data Science Data Template ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Australian Weather Dataset.
# Cache the Data to Disk.
#
# File: 50_save.R

# This template provides a starting point for the
# data scientist exploring a new dataset.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

# Load required packages from local library into R.

library(tidyverse)      # ggplot2, tibble, tidyr, readr, purr, dplyr
library(rattle)         # comcat(), weatherAUS, normVarNames().
library(magrittr)       # Pipe operator %>% %<>% %T>% equals().
library(lubridate)      # Dates and time.
library(stringi)        # String concat operator %s+%.
library(stringr)        # String manipulation: str_replace().
library(randomForest)   # Impute missing values with na.roughfix()
library(FSelector)      # Feature selection: information.gain().
library(scales)         # Include commas in numbers.
library(xtable)         # Generate LaTeX tables.

# It is published as an appendix to the
# book, Quick Start Data Science in R from CRC Press (pending).
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION

#### SAVE THE DATASET --------------------------------

# Timestamp for the dataset - this is the general approach.

dsdate <- "_" %s+% format(Sys.Date(), "%Y%m%d") %T>% print()

# We will use a fixed timestamp to identify our file for convenience.

dsdate <- "_20180702"

# Filename for the saved dataset.

dsrdata <- file.path(fpath, dsname %s+% dsdate %s+% ".RData") %T>% print()

# The save() below includes omit, yet no assignment to omit appears
# in the visible script. Default it to NULL when it is absent so that
# save() does not fail on a missing object.
# NOTE(review): confirm that NULL is the intended default for omit.

if (! exists("omit", inherits=FALSE)) omit <- NULL

# Save relevant R objects to the binary RData file.

save(ds, dsname, dspath, dsdate, nobs,
     vars, target, risk, id, ignore, omit,
     inputi, inputs, numi, numc, cati, catc,
     file=dsrdata)