# Load required packages from local library into the R session.

library(dplyr)        # glimpse().
library(ggplot2)      # Visualise data.
library(magrittr)     # Data pipelines: %>% %<>% %T>% equals(), extract().
library(randomForest) # na.roughfix() for missing data.
library(rattle)       # normVarNames().
library(rattle.data)  # weatherAUS.
library(scales)       # comma(), percent().
library(stringr)      # str_replace_all().

# Warm-up examples on iris: the same scatter plot with progressively
# more layers (points, a connecting line, a loess smoother) and finally
# with the points coloured by species.

iris %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point()

iris %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_line()

iris %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  stat_smooth(method = "loess")

iris %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point()

# Initialise the dataset as per the template. The dataset is looked up
# by name so that only dsname needs to change to reuse the script.
# (rattle is already attached above, so it is not loaded a second time.)

dsname <- "weatherAUS"
ds     <- get(dsname)
glimpse(ds)

# Review the variables before normalising their names.

names(ds)

# Capture the original variable names for use in plots.

vnames <- names(ds)

# Normalise the variable names.

names(ds) %<>% normVarNames()

# Confirm the results are as expected.

names(ds)

# Index the original variable names by the new names, so that
# vnames["new_name"] yields the original name for axis labels.

names(vnames) <- names(ds)
vnames

# Note the available variables.

vars <- names(ds) %T>% print()

# Note the target variable.

target <- "rain_tomorrow"

# Place the target variable at the beginning of the vars.

vars <- c(target, vars) %>% unique() %T>% print()

# Note the risk variable which measures the severity of the outcome.

risk <- "risk_mm"

# Note the identifiers.

id <- c("date", "location")

# Initialise ignored variables: identifiers.

ignore <- c(risk, id)

# Remove the variables to ignore.

vars <- setdiff(vars, ignore)

# Identify the input variables for modelling.

inputs <- setdiff(vars, target) %T>% print()

# Also record them by indices. match() always returns one integer
# position per input, whereas sapply() over which() silently degrades
# to a list when a name is missing or duplicated.

inputi <- match(inputs, names(ds)) %T>% print()

# Identify the numeric variables by index.
numi <-
  ds %>%
  vapply(is.numeric, logical(1)) %>%
  which() %>%
  intersect(inputi) %T>%
  print()

# Identify the numeric variables by name.

numc <-
  ds %>%
  names() %>%
  extract(numi) %T>%
  print()

# Identify the categoric variables by index.

cati <-
  ds %>%
  vapply(is.factor, logical(1)) %>%
  which() %>%
  intersect(inputi) %T>%
  print()

# Identify the categoric variables by name.

catc <-
  ds %>%
  names() %>%
  extract(cati) %T>%
  print()

# Normalise the levels of all categoric variables.

for (v in catc) {
  levels(ds[[v]]) %<>% normVarNames()
}

# Count the number of missing values.

ds[vars] %>% is.na() %>% sum()

# Impute missing values.

ds[vars] %<>% na.roughfix()

# Confirm that no missing values remain.

ds[vars] %>% is.na() %>% sum()

glimpse(ds)

# Scatter plot of a random sample of observations. No seed is set in
# the visible code, so the sampled rows differ from run to run.

ds %>%
  sample_n(1000) %>%
  ggplot(aes(x = min_temp, y = max_temp, colour = rain_tomorrow)) +
  geom_point() +
  scale_colour_brewer(palette = "Set2") +
  labs(x      = vnames["min_temp"],
       y      = vnames["max_temp"],
       colour = vnames["rain_tomorrow"])

# Maximum temperature over time for a single location, with a GAM
# smoother overlaid.

ds %>%
  filter(location == "Canberra") %>%
  ggplot(aes(x = date, y = max_temp)) +
  geom_point(shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# The same plot for every location, one facet per location.

ds %>%
  ggplot(aes(x = date, y = max_temp)) +
  geom_point(alpha = 0.05, shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~location) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# Line variant of the faceted plot.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size`
# for line thickness; `size` is kept so the script still runs on older
# installations - confirm the minimum ggplot2 version before changing.

ds %>%
  ggplot(aes(x = date, y = max_temp)) +
  geom_line(alpha = 0.1, size = 0.05) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~location) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# Expand compass abbreviations into words for facet labels.
#
# x: character vector made up of the letters n, s, e and w.
# Returns a character vector of the same length with each letter
# replaced by its capitalised word and the trailing space removed.
#
# The replacements are applied one after another, so the order matters:
# "East " and "West " contain a lower-case "s" and "e", and would be
# expanded again if "s" or "e" were substituted after them.

lblr <- function(x) {
  x %>%
    str_replace_all("n", "North ") %>%
    str_replace_all("s", "South ") %>%
    str_replace_all("e", "East ") %>%
    str_replace_all("w", "West ") %>%
    str_replace(" $", "")
}

# Temperature scatter plot faceted by 3pm wind direction, using lblr()
# to label each facet.

ds %>%
  sample_n(10000) %>%
  ggplot(aes(x = min_temp, y = max_temp, colour = rain_tomorrow)) +
  geom_point(shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~wind_dir_3pm, labeller = labeller(wind_dir_3pm = lblr)) +
  labs(x      = vnames["min_temp"],
       y      = vnames["max_temp"],
       colour = vnames["rain_tomorrow"])

## lblr <- function(x)
## {
##   x %>%
##     str_replace_all("n", "North ") %>%
##     str_replace_all("s", "South ") %>%
##     str_replace_all("e", "East ") %>%
##     str_replace_all("w", "West ") %>%
##     str_replace(" $", "")
## }
##
## ds %>%
##   sample_n(10000) %>%
##   ggplot(aes(x=min_temp, y=max_temp, colour=rain_tomorrow)) +
##   geom_point(shape=".") +
##   geom_smooth(method="gam", formula=y~s(x, bs="cs")) +
##   facet_wrap(~wind_dir_3pm, labeller=labeller(wind_dir_3pm=lblr)) +
##   labs(x      = vnames["min_temp"],
##        y      = vnames["max_temp"],
##        colour = vnames["rain_tomorrow"])

# Pie chart of the target distribution: a single stacked bar drawn in
# polar coordinates. The label mid-points are computed by hand from the
# running total of `per`, so they depend on the row order produced by
# arrange(per) - presumably chosen to line up with the order in which
# the bar segments are stacked; re-check if the sort order changes.

ds %>%
  group_by(rain_tomorrow) %>%
  count() %>%
  ungroup() %>%
  mutate(per = round(n / sum(n), 2)) %>%
  mutate(label = paste(rain_tomorrow, percent(per))) %>%
  arrange(per) %>%
  ggplot(aes(x = 1, y = per, fill = rain_tomorrow)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = 'y') +
  theme_void() +
  theme(legend.position = "none") +
  geom_text(aes(x = 1, y = cumsum(per) - per / 2, label = label), size = 8)

# Bar chart of observation counts per 3pm wind direction.

ds %>%
  ggplot(aes(x = wind_dir_3pm)) +
  geom_bar() +
  scale_y_continuous(labels = comma) +
  labs(x = vnames["wind_dir_3pm"], y = "Count")

#ds.smpl <- ds %>% sample_frac(0.1)
#ds.nol <- ds.smpl %>% select(-rain_tomorrow)
#ds.smpl %>%
#  ggplot(aes(x=wind_dir_3pm)) +
#  geom_bar(data=ds.nol, fill="grey", alpha=0.5) +
#  geom_bar() +
#  facet_wrap(~ rain_tomorrow)

# Faceted histograms with the full distribution as a grey background.
# Dropping the faceting column from the background data means it is
# not split by facet, so the same grey histogram appears in each panel.

d    <- iris    # Full data set
d_bg <- d[, -5] # Background Data - full without the 5th column (Species)

ggplot(d, aes(x = Sepal.Width, fill = Species)) +
  geom_histogram(data = d_bg, fill = "grey", alpha = .5) +
  geom_histogram(colour = "black") +
  facet_wrap(~ Species) +
  guides(fill = "none") + # to remove the legend (FALSE is deprecated)
  theme_bw()              # for clean look overall