\documentclass[a4paper]{article} \usepackage[british]{babel} \usepackage[colorlinks=true]{hyperref} \usepackage{geometry} \usepackage[parfill]{parskip} % Space between paragraphs and no indent. \usepackage{xcolor} <>= Module <- sub(".Rnw", "", current_input()) cat(paste0("\\newcommand{\\Module}{", Module, "}")) @ \begin{document} <>= # Welcome to the Togaware Data Science End-to-End Template WeatherAUS ---- # # Refer to the book, The Essentials of Data Science available from # Amazon at http://bit.ly/essentials_data_science, and the web site # https://essentials.togaware.com for more details. # # Australian Weather Dataset # # This template provides a starting point for the # data scientist exploring a new dataset. By no means # is it the end point of the data science journey. # # This R script is automatically extracted from a knitr # file with a .Rnw extension. That file includes a broader # narrative and explanation of the journey through our data. # Before our own journey into literate programming we can # make use of these R scripts as our templates for data science. # # The template is under regular revision and improvement # and is provided as is. It is published as an appendix to the # book, Essentials of Data Science from CRC Press. # # Copyright (c) 2014-2018 Togaware.com # Authored by and feedback to Graham.Williams@togaware.com # License: Creative Commons Attribution-ShareAlike CC BY-SA # # DOCVERSION @ <>= # Load required packages from local library into R. library(magrittr) # Pipe operator %>% %<>% %T>% equals(). library(lubridate) # Dates and time. library(rattle) # normVarNames(). library(ROCR) # Use prediction() for evaluation. library(rpart) # Model: decision tree. library(scales) # Include commas in numbers. library(stringi) # String concat operator %s+%. library(tidyverse) # ggplot2, tibble, tidyr, readr, purr, dplyr, stringr @ % We actually set up the dataset here so that within the text that % follows we can refer to the dataset name. We insert the actual code % into the produced document below. <>= dsname <- "weatherAUS" @ \title{Data Science Template\\ End-to-End \textbf{\Sexpr{dsname}} Analysis} \author{Graham Williams} \date{\today} \maketitle\thispagestyle{empty} This template provides a minimal example of a data science template. We will build a decision tree model to predict future events from historic observations of that event. In this case the event is whether it will rain tomorrow. The concept of templates for Data Science are developed in the book \href{https://bit.ly/essentials_data_science}{The Essentials of Data Science} (2017). The actual source files and scripts, with regular updates, available from the \href{htts://essentials.togaware.com}{Essentials web site} (\href{htts://essentials.togaware.com}{essentials.togaware.com}). As with all of our templates and reports we collect up front here the packages used to support the creation of this document. <>= @ \clearpage \section{Data Source} <>= # Original dataset source/location. dsorig <- file.path("https://rattle.togaware.com/weatherAUS.csv") # Name of the dataset. dsname <- "weatherAUS" # Identify the Essentials location of the dataset. dsloc <- "https://essentials.togaware.com" dspath <- file.path(dsloc, dsname %s+% ".csv") %T>% print() @ \section{Data Ingestion} <>= # Ingest the dataset. dspath %>% read_csv() %>% assign(dsname, ., envir=.GlobalEnv) @ \clearpage \section{Generic Template Variable and Initial View} <>= # Store the dataset with a generic template variable name. dsname %>% get() %T>% print() -> ds @ \clearpage \section{Normalise Variable Names} <>= # Normalise the variable names. names(ds) %<>% normVarNames() %T>% print() # Fix specific variable names. names(ds)[23] <- c("rainfall_tomorrow") # Check the names. names(ds) @ \section{Key Variables} <>= # Note any identifiers. id <- c("date", "location") # Note the target variable. target <- "rain_tomorrow" # Note any risk variable - measures the severity of the outcome. risk <- "rainfall_tomorrow" @ \clearpage \section{Variables for Analysis} <>= # Note available variables ignoring identifiers and risk, with target first. ds %>% names() %>% setdiff(c(id, risk)) %>% c(target, .) %>% unique() %T>% print() -> vars @ \section{Variables to Ignore} <>= # We will sometimes want to ignore specific variables. ignore <- NULL # Remove variables to ignore from the variable list. vars %<>% setdiff(ignore) %T>% print() @ \clearpage \section{Deal with Numeric Variables} In our dataset we might notice that evaporation and sunshine have come through as character variables. This is because there are only missing values in the first many observations and the heuristics interprets that as character variables. <>= # Fix specific numeric variables. cvars <- c("evaporation", "sunshine") ds[cvars] %<>% sapply(as.numeric) # Remove observations without a value for rain_tomorrow. ds %<>% drop_na(rain_tomorrow) @ \clearpage \section{Deal with Character Variables} <>= # Identify the character variables by index. chari <- ds[vars] %>% sapply(is.character) %>% which() %T>% print() # Identify the chracter variables by name. charc <- ds[vars] %>% names() %>% '['(chari) %T>% print() @ \section{Character Variable Levels} <>= # Observe the unique levels. ds[charc] %>% sapply(unique) @ \section{Characters to Factors} <>= # Convert all character to factor if determined appropriate. ds[charc] %<>% map(factor) @ \clearpage \section{Data Observation} <>= # A glimpse into the dataset. glimpse(ds) @ \clearpage \section{Data Visualisation} <>= # Visualise relationships in the data. ds %>% filter(location %in% c("Canberra", "Darwin", "Sydney")) %>% filter(temp_3pm %>% is.na() %>% not()) %>% select(temp_3pm, location) %>% ggplot(aes(x=temp_3pm, colour=location, fill=location)) + geom_density(alpha=0.55) @ \clearpage \section{Model Formula} <>= # Formula for modelling. ds[vars] %>% formula() %T>% print() -> form @ \section{Target as a Categoric} <>= # Ensure the target is categoric. ds[[target]] %<>% factor() @ \section{Variables and Observations} <>= # Identify the input variables by name. inputs <- setdiff(vars, target) %T>% print() # Record the number of observations. nobs <- nrow(ds) %T>% comcat() @ \clearpage \section{Training Dataset} <>= # Initialise random numbers for repeatable results. seed <- 123 set.seed(seed) # Partition the full dataset into three: train (70%), validate (15%), test (15%). nobs %>% sample(0.70*nobs) %T>% {length(.) %>% comma() %>% cat("Size of training dataset:", ., "\n")} -> train @ \section{Validation Dataset} <>= # Create a validation dataset of 15% of the observations. nobs %>% seq_len() %>% setdiff(train) %>% sample(0.15*nobs) %T>% {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} -> validate @ \section{Test Dataset} <>= # Create a testing dataset of 15% (the remainder) of the observations. nobs %>% seq_len() %>% setdiff(union(train, validate)) %T>% {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} -> test @ \clearpage \section{Evaluation Subsets} <>= # Cache the various actual values for target and risk. tr_target <- ds[train,][[target]] %T>% {head(., 15) %>% print()} tr_risk <- ds[train,][[risk]] %T>% {head(., 15) %>% print()} va_target <- ds[validate,][[target]] %T>% {head(., 15) %>% print()} va_risk <- ds[validate,][[risk]] %T>% {head(., 15) %>% print()} te_target <- ds[test,][[target]] %T>% {head(., 15) %>% print()} te_risk <- ds[test,][[risk]] %T>% {head(., 15) %>% print()} @ \section{Build Model: Decision Tree} <>= # Splitting function: "anova" "poisson" "class" "exp" mthd <- "class" # Splitting function parameters. prms <- list(split="information") # Control the training. ctrl <- rpart.control(maxdepth=5) # Build the model m_rp <- rpart(form, ds[train, vars], method=mthd, parms=prms, control=ctrl) @ \clearpage \section{Model Generic Variables} <>= # Capture the model in generic variables. model <- m_rp mtype <- "rpart" mdesc <- "Decision Tree" @ \section{Review Model} <>= # Basic model structure. model @ \clearpage \section{Visualise the Model} <>= # Visually expose the discovered knowledge. fancyRpartPlot(model) @ \clearpage \section{Summary of Model} <>= # Complete model build summary. summary(model) @ \clearpage \section{Variable Importance} <>= # Review which importance of the variables. ggVarImp(model) @ \clearpage \section{Model Predictions on Validation} <>= # Predict on validation dataset to judge performance. model %>% predict(newdata=ds[validate, vars], type="class") %>% set_names(NULL) %T>% {head(., 20) %>% print()} -> va_class model %>% predict(newdata=ds[validate, vars], type="prob") %>% .[,2] %>% set_names(NULL) %>% round(2) %T>% {head(., 20) %>% print()} -> va_prob @ \section{Overall Accuracy and Error} <>= # Overall accuracy and error. sum(va_class == va_target) %>% divide_by(length(va_target)) %T>% { percent(.) %>% sprintf("Overall accuracy = %s\n", .) %>% cat() } -> va_acc sum(va_class != va_target) %>% divide_by(length(va_target)) %T>% { percent(.) %>% sprintf("Overall error = %s\n", .) %>% cat() } -> va_err @ \section{Confusion Matrix} <>= # Basic comparison of prediction/actual as a confusion matrix. table(va_target, va_class, useNA="ifany", dnn=c("Actual", "Predicted")) # Comparison as percentages of all observations. errorMatrix(va_target, va_class) %T>% print() -> va_matrix # Error rate and average of the class error rate. va_matrix %>% diag() %>% sum(na.rm=TRUE) %>% subtract(100, .) %>% sprintf("Overall error percentage = %s%%\n", .) %>% cat() va_matrix[,"Error"] %>% mean(na.rm=TRUE) %>% sprintf("Averaged class error percentage = %s%%\n", .) %>% cat() @ \clearpage \section{Recall, Precision, F-Score} <>= # Other performance metrics: recall, precision, and the F-score. va_rec <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[2,1])) %T>% {percent(.) %>% sprintf("Recall = %s\n", .) %>% cat()} va_pre <- (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[1,2])) %T>% {percent(.) %>% sprintf("Precision = %s\n", .) %>% cat()} va_fsc <- ((2 * va_pre * va_rec)/(va_rec + va_pre)) %T>% {sprintf("F-Score = %.3f\n", .) %>% cat()} @ \section{ROC Curve} <>= # Calculate the area under the curve (AUC). va_prob %>% prediction(va_target) %>% performance("auc") %>% attr("y.values") %>% .[[1]] %T>% { percent(.) %>% sprintf("Percentage area under the ROC curve = %s\n", .) %>% cat() } -> va_auc # Calculate measures required to plot the ROC Curve. va_prob %>% prediction(va_target) %>% performance("tpr", "fpr") -> va_rates @ \clearpage \section{ROC Curve Plot} <>= # Plot the ROC Curve. data_frame(tpr=attr(va_rates, "y.values")[[1]], fpr=attr(va_rates, "x.values")[[1]]) %>% ggplot(aes(fpr, tpr)) + geom_line() + annotate("text", x=0.875, y=0.125, vjust=0, label=paste("AUC =", percent(va_auc))) + labs(title="ROC - " %s+% mtype %s+% " - Validation Dataset", x="False Positive Rate (1-Specificity)", y="True Positive Rate (Sensitivity)") @ \clearpage \section{Risk Chart} <>= # Risk chart. riskchart(va_prob, va_target, va_risk) + labs(title="Risk Chart - " %s+% mtype %s+% " - Validation Dataset") + theme(plot.title=element_text(size=14)) @ \end{document}