\documentclass[a4paper]{article} \usepackage[british]{babel} \usepackage[colorlinks=true]{hyperref} \usepackage{geometry} \usepackage[parfill]{parskip} % Space between paragraphs and no indent. \usepackage{xcolor} <>= Module <- sub(".Rnw", "", current_input()) cat(paste0("\\newcommand{\\Module}{", Module, "}")) @ \begin{document} <>= # Welcome to the Togaware Data Science End-to-End Template Ports ---- # # Refer to the book, The Essentials of Data Science available from # Amazon at http://bit.ly/essentials_data_science, and the web site # https://essentials.togaware.com for more details. # # Ports Dataset. # # This template provides a starting point for the # data scientist exploring a new dataset. By no means # is it the end point of the data science journey. # # This R script is automatically extracted from a knitr # file with a .Rnw extension. That file includes a broader # narrative and explanation of the journey through our data. # Before our own journey into literate programming we can # make use of these R scripts as our templates for data science. # # The template is under regular revision and improvement # and is provided as is. It is published as an appendix to the # book, Essentials of Data Science in R from CRC Press. # # Copyright (c) 2014-2018 Togaware.com # Authored by and feedback to Graham.Williams@togaware.com # License: Creative Commons Attribution-ShareAlike CC BY-SA # # DOCVERSION @ <>= # Load required packages from local library into R. library(directlabels) # Dodging labels for ggplot2. library(grid) # Layout of plots: viewport(). library(magrittr) # Pipe operator %>% %<>% %T>% equals(). library(rattle) # normVarNames(). library(readxl) # read_excel(). library(scales) # Include commas in numbers. library(stringi) # String concat operator %s+%. library(tidyverse) # ggplot2, tibble, tidyr, readr, purr, dplyr, stringr @ % We actually set up the dataset here so that within the text that % follows we can refer to the dataset name. We insert the actual code % into the produced document below. <>= dsname <- "ports" @ \title{Data Science Template\\ End-to-End \textbf{\Sexpr{dsname}} Analysis} \author{Graham Williams} \date{\today} \maketitle\thispagestyle{empty} This template provides an example of a data science template for visualising data. Through visualisation we are able to gain insights into the data and to begin to tell the story that the data supports. The concept of templates for Data Science are developed in the book \href{https://bit.ly/essentials_data_science}{The Essentials of Data Science} (2017). The actual source files and scripts, with regular updates, are available from the \href{htts://essentials.togaware.com}{Essentials web site} (\href{htts://essentials.togaware.com}{essentials.togaware.com}). As with all our templates and reports we collect up front here the packages used to support the creation of this document. <>= @ \section{Data Source} <>= # Name of the dataset. dsname <- "ports" # Identify the Essentials location of the dataset. dsloc <- "https://essentials.togaware.com" dspath <- dsname %s+% ".xlsx" dsurl <- file.path(dsloc, dspath) %T>% print() @ \section{Data Ingestion} <>= # Download the file locally. download.file(dsurl, destfile=dspath, mode="wb") # Ingest the dataset. dspath %>% read_xlsx(sheet=1, col_names=FALSE) %>% assign(dsname, ., envir=.GlobalEnv) get(dsname) @ \section{Generic Template Variables} <>= # Prepare the dataset for usage with template. ds <- get(dsname) @ \section{Normalise Variable Names} This is not really required for this dataset as we will not be referring to these variable names but we will do so simply to maintain the template. <>= # Normalise the variable names. names(ds) %<>% normVarNames() %T>% print() @ \clearpage \section{Initial Observations} <>= # A glimpse into the dataset. glimpse(ds) @ We note that the spreadsheet contains multiple small tables on the one sheet. We will be treating each table separately. \clearpage \section{Bar Chart: Value/Weight of Sea Trade} <>= # Confirm the row and column span for the table of interest. ds[72:93, 1:4] @ <>= # Wrangle the dataset: Rename columns informatively. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) @ \clearpage <>= # Wrangle the dataset: Numeric variable conversion. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% mutate( export = as.numeric(export), import = as.numeric(import) ) @ <>= # Otherwise it is using tidyr::extract? extract <- magrittr::extract @ <>= # Generate indicies that will be useful for indexing the data. seq(1,21,2) %>% rep(2) %>% sort() # Confirm this achieves the desired outcome. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% extract2("period") %>% extract(seq(1,21,2) %>% rep(2) %>% sort()) @ \clearpage <>= # Wrangle the dataset: Repair the period column. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% mutate( export = as.numeric(export), import = as.numeric(import), period = period[seq(1, 21, 2) %>% rep(2) %>% sort()] ) @ <>= # Wrangle the dataset: Reshape the datset. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% mutate( export = as.numeric(export), import = as.numeric(import), period = period[seq(1, 21, 2) %>% rep(2) %>% sort()] ) %>% gather(type, value, -c(period, location)) @ <>= # Identify specific colors required for the organisational style. cols <- c('#F6A01A', # Primary Yellow '#0065A4', # Primary Blue '#455560', # Primary Accent Grey '#B2BB1E', # Secondary Green '#7581BF', # Secondary Purple '#BBB0A3', # Secondary Light Grey '#E31B23', # Secondary Red '#C1D2E8') # Variant Grey # Create a ggplot2 theme using these colours. theme_bitre <- scale_fill_manual(values=cols) @ <>= # Facetted bar plot coparing import/export value across years. ds[72:93, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% mutate( export = as.numeric(export), import = as.numeric(import), period = period[seq(1, 21, 2) %>% rep(2) %>% sort()] ) %>% gather(type, value, -c(period, location)) %>% ggplot(aes(x=location, y=value/1000, fill=type)) + geom_bar(stat="identity", position=position_dodge(width=1)) + facet_grid(~period) + labs(y="Billion dollars", x="", fill="") + theme(axis.text.x=element_text(angle=45, hjust=1, size=10)) + theme_bitre @ \clearpage <>= # Facetted bar plot coparing import/export weight across years. ds[96:117, 1:4] %>% set_names(c("period", "location", "export", "import")) %>% mutate( export = as.numeric(export), import = as.numeric(import), period = period[seq(1, 21, 2) %>% rep(2) %>% sort()] ) %>% gather(type, value, -c(period, location)) %>% ggplot(aes(x=location, y=value/1000, fill=type)) + geom_bar(stat="identity",position=position_dodge(width = 1)) + facet_grid(~period) + labs(y="Million tonnes", x="", fill="") + theme(axis.text.x=element_text(angle=45, hjust=1, size=10)) + theme_bitre @ \clearpage \section{Scatter Plot: Throughput versus Annual Growth} <>= # Confirm the table of interest from the dataset. ds[2:4, 2:18] @ <>= # Wrangle the dataset: Transpose and retain as a dataset. ds[2:4, 2:18] %>% t() %>% data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% tbl_df() @ \clearpage <>= # Wrangle the dataset: Renaming columns informatively. ds[2:4, 2:18] %>% t() %>% data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% tbl_df() %>% set_names(c("port", "weight", "rate")) @ \clearpage <>= # Wrangle the dataset: Numeric variable conversion. ds[2:4, 2:18] %>% t() %>% data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% tbl_df() %>% set_names(c("port", "weight", "rate")) %>% mutate( weight = as.numeric(weight), rate = as.numeric(rate) ) @ \clearpage <>= # Identify port types from appropriate data in the sheet. ds[6, 1:2] ds[7:17, 1:2] @ <>= # Construct a port type table. ds[7:17, 1:2] %>% set_names(ds[6, 1:2]) @ \clearpage <>= # Tidy the dataset into a more useful structure. ds[7:17, 1:2] %>% set_names(ds[6, 1:2]) %>% gather(type, port) %>% na.omit() @ \clearpage <>= # Wrangle the dataset: Join to port type. ds[2:4, 2:18] %>% t() %>% data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% tbl_df() %>% set_names(c("port", "weight", "rate")) %>% mutate( weight = as.numeric(weight), rate = as.numeric(rate) ) %>% left_join(ds[7:17, 1:2] %>% set_names(ds[6, 1:2]) %>% gather(type, port) %>% na.omit(), by="port") @ \clearpage <>= # Labelled scatter plot with inset. ds[2:4, 2:18] %>% t() %>% data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% tbl_df() %>% set_names(c("port", "weight", "rate")) %>% mutate(weight = as.numeric(weight), rate = as.numeric(rate)) %>% left_join(ds[7:17, 1:2] %>% set_names(ds[6, 1:2]) %>% gather(type, port) %>% na.omit(), by="port") %>% mutate(type=factor(type, levels=c("Mixed", "Bulk"))) %>% filter(port != "Darwin") -> tds tds %>% ggplot(aes(x=weight, y=rate)) + geom_point(aes(colour=type, shape=type), size=4) + xlim(0, 300) + ylim(0, 13) + labs(shape="Port Type", colour="Port Type", x="Throughput 2011-12 (million tonnes)", y="Throughput average annual growth rate") + geom_text(data=filter(tds, type=="Bulk"), aes(label=port), vjust=2) + annotate("rect", xmin=0, xmax=45, ymin=3, ymax=6.5, alpha = .1) + annotate("text", label="See inset", x=28, y=3.3, size=4) + theme(legend.position="bottom") @ <>= # Labelled scatter plot - the inset. above <- c("Townsville", "Fremantle") tds %<>% filter(port != "Darwin" & type == "Mixed") tds %>% ggplot(aes(x=weight, y=rate, label=port)) + geom_point(aes(colour=type, shape=type), size=4) + labs(shape="Port Type", colour="Port Type") + xlim(0, 40) + ylim(3, 6) + labs(x="Throughput 2011-12 (million tonnes)", y="Throughput average annual growth rate") + geom_text(data=filter(tds, !port%in%above), vjust= 2.0) + geom_text(data=filter(tds, port%in%above), vjust=-1.0) + theme(legend.position="bottom") @ \clearpage \section{Combined Plots: Port Calls} <>= # Wrangle the dataset: Name columns informatively. ds[20:36, 1:13] %>% set_names(c("port", ds[19, 2:13])) @ <>= # Wrangle the dataset: Dataset reshape and convert integer. ds[20:36, 1:13] %>% set_names(c("port", ds[19, 2:13])) %>% gather(period, calls, -port) %>% mutate(calls=as.integer(calls)) @ \clearpage <>= # Wrangle the dataset: Avg calculation. ds[20:36, 1:13] %>% set_names(c("port", ds[19, 2:13])) %>% select(port, 2, 13) %>% set_names(c('port', 'start', 'end')) %>% mutate( start = as.integer(start), end = as.integer(end), avg = 100*(exp(log(end/start)/11)-1) ) @ \clearpage <>= # Build the main faceted plot. p1 <- ds[20:36, 1:13] %>% set_names(c("port", ds[19, 2:13])) %>% gather(period, calls, -port) %>% mutate(calls=as.integer(calls)) %>% ggplot(aes(x=period, y=calls)) + geom_bar(stat="identity", position="dodge", fill="#6AADD6") + facet_wrap(~port) + labs(x="", y="Number of Calls") + theme(axis.text.x=element_text(angle=90, hjust=1, size=8)) + scale_y_continuous(labels=comma) @ <>= # Generate the second plot. p2 <- ds[20:36, 1:13] %>% set_names(c("port", ds[19, 2:13])) %>% select(port, 2, 13) %>% set_names(c('port', 'start', 'end')) %>% mutate( start = as.integer(start), end = as.integer(end), avg = 100*(exp(log(end/start)/11)-1) ) %>% ggplot(aes(x=port, y=avg)) + geom_bar(stat="identity", position="identity", fill="#6AADD6") + theme(axis.text.x=element_text(angle=45, hjust=1, size=8), axis.text.y=element_text(size=8), axis.title=element_text(size=10), plot.title=element_text(size=8), plot.background = element_blank()) + labs(x="", y="Per cent", title="Average Annual Growth, 2001-02 to 2012-13") @ \clearpage <>= # Combine the plots into a single faceted bar plot with embedded bar plot. print(p1) print(p2, vp=viewport(x=0.72, y=0.13, height=0.28, width=0.54)) @ \clearpage \section{Horizontal Bar Chart} <>= # Horizontal bar chart. ds[48:56, 1:2] %>% set_names(c("occupation", "percent")) %>% mutate(percent = as.numeric(percent), occupation = factor(occupation, levels=occupation[order(percent)])) %>% ggplot(aes(x=occupation, y=percent)) + geom_bar(stat="identity", fill="#6AADD6", width=0.8) + theme(axis.title.x=element_text(size=10)) + labs(x="", y="Per cent") + coord_flip() @ \clearpage \section{Stacked Horizontal Bar Chart} <>= tds <- ds[39:40, 2:9] %>% set_names(ds[38, 2:9]) %>% mutate(type=c("Mixed Ports", "Bulk Ports")) %>% gather(occupation, percent, -type) %>% mutate( percent = as.numeric(percent), occupation = factor(occupation, levels=c("Managers", "Professionals", "Technicians and Trades Workers", "Community and Personal Service Workers", "Clerical and Administrative Workers", "Sales Workers", "Machinery Operators and Drivers", "Labourers")) ) %T>% print() @ \clearpage <>= mv <- tds %>% filter(type=="Mixed Ports") %>% extract2("percent") %>% rev() my <- (mv/2) + c(0, head(cumsum(mv), -1)) bv <- tds %>% filter(type=="Bulk Ports") %>% extract2("percent") %>% rev() by <- (bv/2) + c(0, head(cumsum(bv), -1)) lbls <- data.frame(x=c(rep(1, length(mv)), rep(2, length(bv))), y=c(by, my), v=round(c(bv, mv))) %T>% print() @ \clearpage <>= # Horizontal bar chart with multiple stacks. tds %>% ggplot(aes(x=type, y=percent, fill=occupation)) + geom_bar(stat="identity", width=0.5) + labs(x="", y="Per cent", fill="") + annotate("text", x=lbls$x, y=lbls$y, label=lbls$v, colour="white") + coord_flip() + scale_y_reverse() + theme_bitre @ \clearpage <>= # Simple bar chart with dodged and labelled bars. ds[43:45, 1:3] %>% set_names(c("type", ds[42, 2:3])) %>% gather(var, count, -type) %>% mutate(count = as.integer(count), type = factor(type, levels=c("Bulk", "Mixed", "Australia"))) -> tds lbls <- data.frame(x=c(.7, 1, 1.3, 1.7, 2, 2.3), y=tds$count-3, lbl=round(tds$count)) tds %>% ggplot(aes(x=var, y=count)) + geom_bar(stat="identity", position="dodge", aes(fill=type)) + labs(x="", y="Per cent", fill="") + ylim(0, 100) + geom_text(data=lbls, aes(x=x, y=y, label=lbl), colour="white") + theme_bitre @ \end{document}