\documentclass[a4paper]{article}
\usepackage[british]{babel}
\usepackage[colorlinks=true]{hyperref}
\usepackage{geometry}
\usepackage[parfill]{parskip} % Space between paragraphs and no indent.
\usepackage{xcolor}

% NOTE(review): the chunk headers in the received source were reduced to
% "<>=", which knitr does not recognise. They are restored here in the
% "<<label, options>>=" form. Labels and options are reconstructed from
% what each chunk does and what the surrounding text says - please
% confirm against the original file.

% Emits a LaTeX macro, so the output must be passed through as-is and
% nothing may be echoed before \begin{document}.

<<module, echo=FALSE, results="asis">>=
# Name of this module: the input file name without its .Rnw extension.
# The dot is escaped and the pattern anchored so that only a trailing
# ".Rnw" is removed.

Module <- sub("\\.Rnw$", "", current_input())
cat(paste0("\\newcommand{\\Module}{", Module, "}"))
@

\begin{document}

<<banner, echo=FALSE>>=
# Welcome to the Togaware Data Science End-to-End Template Audit ----
#
# Refer to the book, The Essentials of Data Science available from
# Amazon at http://bit.ly/essentials_data_science, and the web site
# https://essentials.togaware.com for more details.
#
# Financial Audit Dataset.
#
# This template provides a starting point for the
# data scientist exploring a new dataset. By no means
# is it the end point of the data science journey.
#
# This R script is automatically extracted from a knitr
# file with a .Rnw extension. That file includes a broader
# narrative and explanation of the journey through our data.
# Before our own journey into literate programming we can
# make use of these R scripts as our templates for data science.
#
# The template is under regular revision and improvement
# and is provided as is. It is published as an appendix to the
# book, Essentials of Data Science from CRC Press.
#
# Copyright (c) 2014-2018 Togaware.com
# Authored by and feedback to Graham.Williams@togaware.com
# License: Creative Commons Attribution-ShareAlike CC BY-SA
#
# DOCVERSION
@

<<packages, echo=FALSE, message=FALSE>>=
# Load required packages from local library into R.

library(magrittr)     # Pipe operator %>% %<>% %T>% equals().
library(lubridate)    # Dates and time.
library(rattle)       # normVarNames().
library(ROCR)         # Use prediction() for evaluation.
library(rpart)        # Model: decision tree.
library(scales)       # Include commas in numbers.
library(stringi)      # String concat operator %s+%.
library(tidyverse)    # ggplot2, tibble, tidyr, readr, purrr, dplyr, stringr
@

% We actually set up the dataset here so that within the text that
% follows we can refer to the dataset name. We insert the actual code
% into the produced document below.

<<dsname, echo=FALSE>>=
dsname <- "audit"
@

\title{Data Science Template\\
  End-to-End \textbf{\Sexpr{dsname}} Analysis}
\author{Graham Williams}
\date{\today}
\maketitle\thispagestyle{empty}

This template provides a minimal example of a data science
template. We will build a decision tree model to predict future events
from historic observations of that event. In this case the event is
whether a customer was found non-compliant in an audit.

The concept of templates for Data Science is developed in the book
\href{https://bit.ly/essentials_data_science}{The Essentials of Data
  Science} (2017). The actual source files and scripts, with regular
updates, are available from the
\href{https://essentials.togaware.com}{Essentials web site}
(\href{https://essentials.togaware.com}{essentials.togaware.com}).

As with all of our templates and reports we collect up front here the
packages used to support the creation of this document.

% Display (without re-running) the code of the packages chunk above.

<<packages-show, ref.label="packages", eval=FALSE>>=
@

\clearpage
\section{Data Source}

<<data-source>>=
# Original dataset source/location.

dsorig <- file.path("https://rattle.togaware.com/audit.csv")

# Name of the dataset.

dsname <- "audit"

# Identify the Essentials location of the dataset.

dsloc  <- "https://essentials.togaware.com"
dspath <- file.path(dsloc, dsname %s+% ".csv") %T>% print()
@

\section{Data Ingestion}

<<data-ingestion>>=
# Ingest the dataset, storing it in a variable named after the dataset.

dspath %>%
  read_csv() %>%
  assign(dsname, ., envir=.GlobalEnv)
@

\section{Generic Template Variable and Initial View}

<<generic-variable>>=
# Store the dataset with a generic template variable name.

dsname %>%
  get() %T>%
  print() ->
ds
@

\section{Normalise Variable Names}

<<normalise-names>>=
# Normalise the variable names.

names(ds) %<>% normVarNames() %T>% print()

# Fix specific variable names.

names(ds)[11:13] <- c("accounts", "adjustment", "adjusted")

# Check the names.

names(ds)
@

\section{Key Variables}

<<key-variables>>=
# Note any identifiers.

id <- c("id")

# Note the target variable.

target <- "adjusted"

# Note any risk variable - measures the severity of the outcome.

risk <- "adjustment"
@

\clearpage
\section{Variables for Analysis}

<<variables>>=
# Note available variables ignoring identifiers and risk, with target first.

ds %>%
  names() %>%
  setdiff(c(id, risk)) %>%
  c(target, .) %>%
  unique() %T>%
  print() ->
vars
@

\section{Variables to Ignore}

<<ignore>>=
# We will sometimes want to ignore specific variables.

ignore <- c("accounts", "marital", "occupation", "education")

# Remove variables to ignore from the variable list.

vars %<>% setdiff(ignore) %T>% print()
@

\clearpage
\section{Deal with Character Variables}

<<character-variables>>=
# Identify the character variables by index. vapply() guarantees a
# logical vector even when there are no variables to test.

chari <-
  ds[vars] %>%
  vapply(is.character, logical(1)) %>%
  which() %T>%
  print()

# Identify the character variables by name.

charc <-
  ds[vars] %>%
  names() %>%
  '['(chari) %T>%
  print()
@

\section{Character Variable Levels}

<<character-levels>>=
# Observe the unique levels.

ds[charc] %>% sapply(unique)
@

\section{Characters to Factors}

<<characters-to-factors>>=
# Convert all character to factor if determined appropriate.

ds[charc] %<>% map(factor)
@

\clearpage
\section{Data Observation}

<<glimpse>>=
# A glimpse into the dataset.

glimpse(ds)
@

\section{Data Visualisation}

<<visualise>>=
# Visualise relationships in the data.

ds %>%
  select(gender, income) %>%
  ggplot(aes(x=income, colour=gender, fill=gender)) +
  geom_density(alpha=0.55) +
  scale_y_continuous() +
  scale_x_continuous(labels=comma)
@

\clearpage
\section{Model Formula}

<<formula>>=
# Formula for modelling. The target is the first entry of vars and so
# becomes the left hand side of the formula.

ds[vars] %>%
  formula() %T>%
  print() ->
form
@

\section{Target as a Categoric}

<<target-factor>>=
# Ensure the target is categoric.

ds[[target]] %<>% factor()
@

\section{Variables and Observations}

<<inputs-nobs>>=
# Identify the input variables by name.

inputs <- setdiff(vars, target) %T>% print()

# Record the number of observations.

nobs <- nrow(ds) %T>% comcat()
@

\clearpage
\section{Training Dataset}

<<train>>=
# Initialise random numbers for repeatable results.

seed <- 123
set.seed(seed)

# Partition the full dataset into three: train (70%), validate (15%), test (15%).

nobs %>%
  sample(0.70*nobs) %T>%
  {length(.) %>% comma() %>% cat("Size of training dataset:", ., "\n")} ->
train
@

\section{Validation Dataset}

<<validate>>=
# Create a validation dataset of 15% of the observations.

nobs %>%
  seq_len() %>%
  setdiff(train) %>%
  sample(0.15*nobs) %T>%
  {length(.) %>% comma() %>% cat("Size of validation dataset:", ., "\n")} ->
validate
@

\section{Test Dataset}

<<test>>=
# Create a testing dataset of 15% (the remainder) of the observations.

nobs %>%
  seq_len() %>%
  setdiff(union(train, validate)) %T>%
  {length(.) %>% comma() %>% cat("Size of testing dataset:", ., "\n")} ->
test
@

\clearpage
\section{Evaluation Subsets}

<<evaluation-subsets>>=
# Cache the various actual values for target and risk.

tr_target <- ds[train,][[target]]    %T>% {head(., 15) %>% print()}
tr_risk   <- ds[train,][[risk]]      %T>% {head(., 15) %>% print()}
va_target <- ds[validate,][[target]] %T>% {head(., 15) %>% print()}
va_risk   <- ds[validate,][[risk]]   %T>% {head(., 15) %>% print()}
te_target <- ds[test,][[target]]     %T>% {head(., 15) %>% print()}
te_risk   <- ds[test,][[risk]]       %T>% {head(., 15) %>% print()}
@

\section{Build Model: Decision Tree}

<<build-rpart>>=
# Splitting function: "anova" "poisson" "class" "exp"

mthd <- "class"

# Splitting function parameters.

prms <- list(split="information")

# Control the training.

ctrl <- rpart.control(maxdepth=5)

# Build the model

m_rp <- rpart(form, ds[train, vars], method=mthd, parms=prms, control=ctrl)
@

\clearpage
\section{Model Generic Variables}

<<model-generic>>=
# Capture the model in generic variables.

model <- m_rp
mtype <- "rpart"
mdesc <- "Decision Tree"
@

\section{Review Model}

<<review-model>>=
# Basic model structure.

model
@

\clearpage
\section{Visualise the Model}

<<plot-model>>=
# Visually expose the discovered knowledge.

fancyRpartPlot(model)
@

\clearpage
\section{Summary of Model}

<<summary-model>>=
# Complete model build summary.

summary(model)
@

\clearpage
\section{Variable Importance}

<<variable-importance>>=
# Review the importance of the variables.

ggVarImp(model)
@

\clearpage
\section{Model Predictions on Validation}

<<predict-validation>>=
# Predict on validation dataset to judge performance.

model %>%
  predict(newdata=ds[validate, vars], type="class") %>%
  set_names(NULL) %T>%
  {head(., 20) %>% print()} ->
va_class

# Probability of the second class, rounded to two decimal places.

model %>%
  predict(newdata=ds[validate, vars], type="prob") %>%
  .[,2] %>%
  set_names(NULL) %>%
  round(2) %T>%
  {head(., 20) %>% print()} ->
va_prob
@

\section{Overall Accuracy and Error}

<<accuracy-error>>=
# Overall accuracy and error.

sum(va_class == va_target) %>%
  divide_by(length(va_target)) %T>%
  {
    percent(.) %>%
      sprintf("Overall accuracy = %s\n", .) %>%
      cat()
  } ->
va_acc

sum(va_class != va_target) %>%
  divide_by(length(va_target)) %T>%
  {
    percent(.) %>%
      sprintf("Overall error = %s\n", .) %>%
      cat()
  } ->
va_err
@

\section{Confusion Matrix}

<<confusion-matrix>>=
# Basic comparison of prediction/actual as a confusion matrix.

table(va_target, va_class, useNA="ifany", dnn=c("Actual", "Predicted"))

# Comparison as percentages of all observations.

errorMatrix(va_target, va_class) %T>%
  print() ->
va_matrix

# Error rate and average of the class error rate.

va_matrix %>%
  diag() %>%
  sum(na.rm=TRUE) %>%
  subtract(100, .) %>%
  sprintf("Overall error percentage = %s%%\n", .) %>%
  cat()

va_matrix[,"Error"] %>%
  mean(na.rm=TRUE) %>%
  sprintf("Averaged class error percentage = %s%%\n", .) %>%
  cat()
@

\clearpage
\section{Recall, Precision, F-Score}

<<recall-precision>>=
# Other performance metrics: recall, precision, and the F-score.
# Rows of va_matrix are the actual classes, columns the predicted.

va_rec <-
  (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[2,1])) %T>%
  {percent(.) %>% sprintf("Recall = %s\n", .) %>% cat()}

va_pre <-
  (va_matrix[2,2]/(va_matrix[2,2]+va_matrix[1,2])) %T>%
  {percent(.) %>% sprintf("Precision = %s\n", .) %>% cat()}

va_fsc <-
  ((2 * va_pre * va_rec)/(va_rec + va_pre)) %T>%
  {sprintf("F-Score = %.3f\n", .) %>% cat()}
@

\section{ROC Curve}

<<auc>>=
# Calculate the area under the curve (AUC).

va_prob %>%
  prediction(va_target) %>%
  performance("auc") %>%
  attr("y.values") %>%
  .[[1]] %T>%
  {
    percent(.) %>%
      sprintf("Percentage area under the ROC curve = %s\n", .) %>%
      cat()
  } ->
va_auc

# Calculate measures required to plot the ROC Curve.

va_prob %>%
  prediction(va_target) %>%
  performance("tpr", "fpr") ->
va_rates
@

\clearpage
\section{ROC Curve Plot}

<<roc-plot>>=
# Plot the ROC Curve. tibble() replaces the deprecated data_frame().

tibble(tpr=attr(va_rates, "y.values")[[1]],
       fpr=attr(va_rates, "x.values")[[1]]) %>%
  ggplot(aes(fpr, tpr)) +
  geom_line() +
  annotate("text", x=0.875, y=0.125, vjust=0,
           label=paste("AUC =", percent(va_auc))) +
  labs(title="ROC - " %s+% mtype %s+% " - Validation Dataset",
       x="False Positive Rate (1-Specificity)",
       y="True Positive Rate (Sensitivity)")
@

\clearpage
\section{Risk Chart}

<<risk-chart>>=
# Risk chart.

riskchart(va_prob, va_target, va_risk) +
  labs(title="Risk Chart - " %s+% mtype %s+% " - Validation Dataset") +
  theme(plot.title=element_text(size=14))
@

\end{document}