## ----template:intro, echo=FALSE------------------------------------------ # Welcome to the Togaware Data Science End-to-End Template WeatherAUS ---- # # Refer to the book, The Essentials of Data Science available from # Amazon at http://bit.ly/essentials_data_science, and the web site # https://essentials.togaware.com for more details. # # Australian Weather Dataset # # This template provides a starting point for the # data scientist exploring a new dataset. By no means # is it the end point of the data science journey. # # This R script is automatically extracted from a knitr # file with a .Rnw extension. That file includes a broader # narrative and explanation of the journey through our data. # Before our own journey into literate programming we can # make use of these R scripts as our templates for data science. # # The template is under regular revision and improvement # and is provided as is. It is published as an appendix to the # book, Essentials of Data Science from CRC Press. # # Copyright (c) 2014-2018 Togaware.com # Authored by and feedback to Graham.Williams@togaware.com # License: Creative Commons Attribution-ShareAlike CC BY-SA # # DOCVERSION ## ----template:attach_packages, echo=FALSE, message=FALSE, warning=FALSE---- # Load required packages from local library into R. library(magrittr) # Pipe operator %>% %<>% %T>% equals(). library(lubridate) # Dates and time. library(rattle) # normVarNames(). library(ROCR) # Use prediction() for evaluation. library(rpart) # Model: decision tree. library(scales) # Include commas in numbers. library(stringi) # String concat operator %s+%. library(tidyverse) # ggplot2, tibble, tidyr, readr, purr, dplyr, stringr ## ----template:data_source, message=FALSE--------------------------------- # Original dataset source/location. dsorig <- file.path("https://rattle.togaware.com/weatherAUS.csv") # Name of the dataset. dsname <- "weatherAUS" # Identify the Essentials location of the dataset. dsloc <- "https://essentials.togaware.com" dspath <- file.path(dsloc, dsname %s+% ".csv") %T>% print() ## ----template:data_ingest, message=FALSE--------------------------------- # Ingest the dataset. dspath %>% read_csv() %>% assign(dsname, ., envir=.GlobalEnv) ## ----template:set_template_variable-------------------------------------- # Store the dataset with a generic template variable name. dsname %>% get() %T>% print() -> ds ## ----template:norm_var_names--------------------------------------------- # Normalise the variable names. names(ds) %<>% normVarNames() %T>% print() # Fix specific variable names. names(ds)[23] <- c("rainfall_tomorrow") # Check the names. names(ds) ## ----template:identifiers------------------------------------------------ # Note any identifiers. id <- c("date", "location") # Note the target variable. target <- "rain_tomorrow" # Note any risk variable - measures the severity of the outcome. risk <- "rainfall_tomorrow" ## ----template:variable_roles, out.lines=NULL----------------------------- # Note available variables ignoring identifiers and risk, with target first. ds %>% names() %>% setdiff(c(id, risk)) %>% c(target, .) %>% unique() %T>% print() -> vars ## ----template:ignore----------------------------------------------------- # We will sometimes want to ignore specific variables. ignore <- NULL # Remove variables to ignore from the variable list. vars %<>% setdiff(ignore) %T>% print() ## ----template:numeric_data----------------------------------------------- # Fix specific numeric variables. cvars <- c("evaporation", "sunshine") ds[cvars] %<>% sapply(as.numeric) # Remove observations without a value for rain_tomorrow. ds %<>% drop_na(rain_tomorrow) ## ----template:characters------------------------------------------------- # Identify the character variables by index. chari <- ds[vars] %>% sapply(is.character) %>% which() %T>% print() # Identify the chracter variables by name. charc <- ds[vars] %>% names() %>% '['(chari) %T>% print() ## ----template:character_levels------------------------------------------- # Observe the unique levels. ds[charc] %>% sapply(unique) ## ----template:data_wrangling--------------------------------------------- # Convert all character to factor if determined appropriate. ds[charc] %<>% map(factor) ## ----template:observe, out.lines=NULL------------------------------------ # A glimpse into the dataset. glimpse(ds) ## ----template:visualise, fig.height=3------------------------------------ # Visualise relationships in the data. ds %>% filter(location %in% c("Canberra", "Darwin", "Sydney")) %>% filter(temp_3pm %>% is.na() %>% not()) %>% select(temp_3pm, location) %>% ggplot(aes(x=temp_3pm, colour=location, fill=location)) + geom_density(alpha=0.55) ## ----template:model_formula---------------------------------------------- # Formula for modelling. ds[vars] %>% formula() %T>% print() -> form ## ----template:target_as_categoric---------------------------------------- # Ensure the target is categoric. ds[[target]] %<>% factor() ## ----template:ipnuts_nobs------------------------------------------------ # Identify the input variables by name. inputs <- setdiff(vars, target) %T>% print() # Record the number of observations. nobs <- nrow(ds) %T>% comcat() ## ----template:model_setup------------------------------------------------ # Initialise random numbers for repeatable results. seed <- 123 set.seed(seed) # Partition the full dataset into three: train (70%), validate (15%), test (15%). nobs %>% sample(0.70*nobs) %T>% {length(.) %>% comma() %>% cat("Size of training dataset:", ., "\n")} -> train ## ----template:validation------------------------------------------------- # Create a validation dataset of 15% of the observations. nobs %>% seq_len() %>% setdiff(train) %>% sample(0.15*nobs) %T>% {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} -> validate ## ----template:test------------------------------------------------------- # Create a testing dataset of 15% (the remainder) of the observations. nobs %>% seq_len() %>% setdiff(union(train, validate)) %T>% {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} -> test ## ----template:evaluation_data-------------------------------------------- # Cache the various actual values for target and risk. tr_target <- ds[train,][[target]] %T>% {head(., 15) %>% print()} tr_risk <- ds[train,][[risk]] %T>% {head(., 15) %>% print()} va_target <- ds[validate,][[target]] %T>% {head(., 15) %>% print()} va_risk <- ds[validate,][[risk]] %T>% {head(., 15) %>% print()} te_target <- ds[test,][[target]] %T>% {head(., 15) %>% print()} te_risk <- ds[test,][[risk]] %T>% {head(., 15) %>% print()} ## ----template:rpart------------------------------------------------------ # Splitting function: "anova" "poisson" "class" "exp" mthd <- "class" # Splitting function parameters. prms <- list(split="information") # Control the training. ctrl <- rpart.control(maxdepth=5) # Build the model m_rp <- rpart(form, ds[train, vars], method=mthd, parms=prms, control=ctrl) ## ----template:model_generics--------------------------------------------- # Capture the model in generic variables. model <- m_rp mtype <- "rpart" mdesc <- "Decision Tree" ## ----template:model_review, out.lines=NULL------------------------------- # Basic model structure. model ## ----template:model_plot------------------------------------------------- # Visually expose the discovered knowledge. fancyRpartPlot(model) ## ----template:model_summary, fig.height=6, out.lines=40------------------ # Complete model build summary. summary(model) ## ----template:varimp, fig.height=3--------------------------------------- # Review which importance of the variables. ggVarImp(model) ## ----template:evaluate--------------------------------------------------- # Predict on validation dataset to judge performance. model %>% predict(newdata=ds[validate, vars], type="class") %>% set_names(NULL) %T>% {head(., 20) %>% print()} -> va_class model %>% predict(newdata=ds[validate, vars], type="prob") %>% .[,2] %>% set_names(NULL) %>% round(2) %T>% {head(., 20) %>% print()} -> va_prob ## ----template:accuracy_error--------------------------------------------- # Overall accuracy and error. sum(va_class == va_target) %>% divide_by(length(va_target)) %T>% { percent(.) %>% sprintf("Overall accuracy = %s\n", .) %>% cat() } -> va_acc sum(va_class != va_target) %>% divide_by(length(va_target)) %T>% { percent(.) %>% sprintf("Overall error = %s\n", .) %>% cat() } -> va_err ## ----template:confusion_matrix------------------------------------------- # Basic comparison of prediction/actual as a confusion matrix. table(va_target, va_class, useNA="ifany", dnn=c("Actual", "Predicted")) # Comparison as percentages of all observations. errorMatrix(va_target, va_class) %T>% print() -> va_matrix # Error rate and average of the class error rate. va_matrix %>% diag() %>% sum(na.rm=TRUE) %>% subtract(100, .) %>% sprintf("Overall error percentage = %s%%\n", .) %>% cat() va_matrix[,"Error"] %>% mean(na.rm=TRUE) %>% sprintf("Averaged class error percentage = %s%%\n", .) %>% cat() ## ----template:metrics---------------------------------------------------- # Other performance metrics: recall, precision, and the F-score. va_rec <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[2,1])) %T>% {percent(.) %>% sprintf("Recall = %s\n", .) %>% cat()} va_pre <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[1,2])) %T>% {percent(.) %>% sprintf("Precision = %s\n", .) %>% cat()} va_fsc <- ((2 * va_pre * va_rec)/(va_rec + va_pre)) %T>% {sprintf("F-Score = %.3f\n", .) %>% cat()} ## ----template:roc-------------------------------------------------------- # Calculate the area under the curve (AUC). va_prob %>% prediction(va_target) %>% performance("auc") %>% attr("y.values") %>% .[[1]] %T>% { percent(.) %>% sprintf("Percentage area under the ROC curve = %s\n", .) %>% cat() } -> va_auc # Calculate measures required to plot the ROC Curve. va_prob %>% prediction(va_target) %>% performance("tpr", "fpr") -> va_rates ## ----template:roc_plot, fig.height=5------------------------------------- # Plot the ROC Curve. data_frame(tpr=attr(va_rates, "y.values")[[1]], fpr=attr(va_rates, "x.values")[[1]]) %>% ggplot(aes(fpr, tpr)) + geom_line() + annotate("text", x=0.875, y=0.125, vjust=0, label=paste("AUC =", percent(va_auc))) + labs(title="ROC - " %s+% mtype %s+% " - Validation Dataset", x="False Positive Rate (1-Specificity)", y="True Positive Rate (Sensitivity)") ## ----template:risk_chart, fig.height=6----------------------------------- # Risk chart. riskchart(va_prob, va_target, va_risk) + labs(title="Risk Chart - " %s+% mtype %s+% " - Validation Dataset") + theme(plot.title=element_text(size=14))