# Load required packages from local library into the R session.

library(dplyr)        # glimpse().
library(ggplot2)      # Visualise data.
library(magrittr)     # Data pipelines: %>% %<>% %T>% equals().
library(randomForest) # na.roughfix() for missing data.
library(rattle)       # normVarNames().
library(rattle.data)  # weatherAUS.
library(scales)       # comma(), percent().
library(stringr)      # str_replace_all().

# Four views of sepal width against sepal length in the iris data.

# Points only.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point()

# Points with a line layer added on top.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_line()

# Points with a loess smoother added on top.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  stat_smooth(method = "loess")

# Points coloured by Species.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point()

# Initialise the dataset as per the template: the dataset is fetched by
# the name held in dsname and stored under the generic name ds.

library(rattle)

dsname <- "weatherAUS"
ds     <- get(dsname)

# Take a first look at the structure of the data.

ds %>% glimpse()

# Show the variable names as supplied, before they are normalised.

ds %>% names()

# Keep a copy of the original variable names for use in plots.

vnames <- ds %>% names()

# Replace the variable names with their normalised forms.

names(ds) <- normVarNames(names(ds))

# Show the normalised names to confirm the results are as expected.

ds %>% names()

# Name each original variable name by its normalised counterpart so that
# the original can be looked up from the new name.

vnames <- setNames(vnames, names(ds))

vnames

# Start from every variable in the dataset.

vars <- names(ds)
print(vars)

# The target variable.

target <- "rain_tomorrow"

# Move the target to the front of vars; unique() drops its second
# occurrence further along the vector.

vars <- unique(c(target, vars))
print(vars)

# The risk variable, which measures the severity of the outcome.

risk <- "risk_mm"

# The identifiers.

id <- c("date", "location")

# Variables to ignore: the risk variable and the identifiers.

ignore <- c(risk, id)

# Drop the ignored variables from vars.

vars <- vars[!vars %in% ignore]

# The input variables for modelling are whatever remains besides the target.

inputs <- vars[!vars %in% target]
print(inputs)

# Also record them by indices.
#
# match() gives the position of each input within names(ds) and always
# returns an integer vector, including integer(0) when inputs is empty.
# The previous sapply()/which() form silently returned a list in that
# case, or whenever a name failed to match exactly one column.

inputi <-
  inputs %>%
  match(names(ds)) %T>%
  print()

# Identify the numeric variables by index.
#
# vapply() with a logical(1) template guarantees a logical vector for
# which(), whatever the number of columns; sapply() does not.
# Intersecting with inputi restricts the result to input variables.

numi <-
  ds %>%
  vapply(is.numeric, logical(1)) %>%
  which() %>%
  intersect(inputi) %T>%
  print()

# Identify the numeric variables by name.

numc <-
  ds %>%
  names() %>%
  extract(numi) %T>%
  print()

# Identify the categoric variables by index, again restricted to the
# input variables.

cati <-
  ds %>%
  vapply(is.factor, logical(1)) %>%
  which() %>%
  intersect(inputi) %T>%
  print()

# Identify the categoric variables by name.

catc <-
  ds %>%
  names() %>%
  extract(cati) %T>%
  print()

# Apply the same normalisation used for the variable names to the levels
# of every categoric input variable.

for (v in catc) {
  levels(ds[[v]]) <- normVarNames(levels(ds[[v]]))
}


# How many values are missing across the variables of interest?

sum(is.na(ds[vars]))

# Fill in the missing values using na.roughfix().

ds[vars] <- na.roughfix(ds[vars])

# The count of missing values should now be zero.

sum(is.na(ds[vars]))

ds %>% glimpse()

# max_temp against min_temp for a random sample of 1,000 observations,
# coloured by rain_tomorrow. Axis and legend titles use the original
# variable names recorded in vnames.

ggplot(sample_n(ds, 1000),
       aes(x = min_temp, y = max_temp, colour = rain_tomorrow)) +
  geom_point() +
  scale_colour_brewer(palette = "Set2") +
  labs(x = vnames["min_temp"],
       y = vnames["max_temp"],
       colour = vnames["rain_tomorrow"])

# max_temp over time for Canberra only, with a GAM smoother.

ggplot(filter(ds, location == "Canberra"), aes(x = date, y = max_temp)) +
  geom_point(shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# max_temp over time as faint points, one panel per location.

ggplot(ds, aes(x = date, y = max_temp)) +
  geom_point(alpha = 0.05, shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~location) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# The same panels drawn with thin, faint lines instead of points.

ggplot(ds, aes(x = date, y = max_temp)) +
  geom_line(alpha = 0.1, size = 0.05) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~location) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = vnames["date"], y = vnames["max_temp"])

# Expand compass abbreviations into words: each "n", "s", "e" and "w" in
# x becomes "North ", "South ", "East " and "West " respectively, and a
# single trailing space is then removed.
#
# The replacements must run in this order: "East " and "West " themselves
# contain "s" and "e", so those letters have to be expanded before the
# words that contain them are inserted.
lblr <- function(x) {
  with_north <- str_replace_all(x, "n", "North ")
  with_south <- str_replace_all(with_north, "s", "South ")
  with_east  <- str_replace_all(with_south, "e", "East ")
  with_west  <- str_replace_all(with_east, "w", "West ")
  str_replace(with_west, " $", "")
}

# max_temp against min_temp for a random sample of 10,000 observations,
# coloured by rain_tomorrow, with one panel per wind_dir_3pm value. The
# panel titles are passed through lblr().

ggplot(sample_n(ds, 10000),
       aes(x = min_temp, y = max_temp, colour = rain_tomorrow)) +
  geom_point(shape = ".") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~wind_dir_3pm, labeller = labeller(wind_dir_3pm = lblr)) +
  labs(x = vnames["min_temp"],
       y = vnames["max_temp"],
       colour = vnames["rain_tomorrow"])

## lblr <- function(x)
## {
##   x %>%
##     str_replace_all("n", "North ") %>%
##     str_replace_all("s", "South ") %>%
##     str_replace_all("e", "East ") %>%
##     str_replace_all("w", "West ") %>%
##     str_replace(" $", "")
## }
## 
## ds %>%
##   sample_n(10000) %>%
##   ggplot(aes(x=min_temp, y=max_temp, colour=rain_tomorrow)) +
##   geom_point(shape=".") +
##   geom_smooth(method="gam", formula=y~s(x, bs="cs")) +
##   facet_wrap(~wind_dir_3pm, labeller=labeller(wind_dir_3pm=lblr)) +
##   labs(x      = vnames["min_temp"],
##        y      = vnames["max_temp"],
##        colour = vnames["rain_tomorrow"])

# Pie chart of the proportion of observations in each rain_tomorrow class.
#
# The proportions are rounded to two decimals and each slice is labelled
# with its class and percentage. Labels are centred with
# position_stack(vjust = 0.5), so they follow the stacking order of the
# bars. The previous cumsum(per) - per/2 arithmetic only lined up when the
# row order of the summary happened to match the stacking order.

ds %>%
  count(rain_tomorrow) %>%
  mutate(per = round(n / sum(n), 2)) %>%
  mutate(label = paste(rain_tomorrow, percent(per))) %>%
  arrange(per) %>%
  ggplot(aes(x = 1, y = per, fill = rain_tomorrow)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y") +
  theme_void() +
  theme(legend.position = "none") +
  geom_text(aes(label = label),
            position = position_stack(vjust = 0.5),
            size = 8)

# Number of observations per wind_dir_3pm value, with the counts on the
# y axis formatted by comma.

ggplot(ds, aes(x = wind_dir_3pm)) +
  geom_bar() +
  labs(x = vnames["wind_dir_3pm"], y = "Count") +
  scale_y_continuous(labels = comma)

#ds.smpl <- ds %>% sample_frac(0.1)
#ds.nol  <- ds.smpl %>% select(-rain_tomorrow)
#ds.smpl %>%
#  ggplot(aes(x=wind_dir_3pm)) +
#  geom_bar(data=ds.nol, fill="grey", alpha=0.5) +
#  geom_bar() +
#  facet_wrap(~ rain_tomorrow)
# Histogram of Sepal.Width per species, drawn over a grey histogram of the
# full data in every panel.

d <- iris                              # Full data set.

# Background data: the full data without the faceting variable, so the
# grey layer is repeated in every panel. The column is dropped by name
# rather than by position so the code does not depend on column order.
d_bg <- d[, names(d) != "Species"]

ggplot(d, aes(x = Sepal.Width, fill = Species)) +
  geom_histogram(data = d_bg, fill = "grey", alpha = .5) +
  geom_histogram(colour = "black") +
  facet_wrap(~ Species) +
  guides(fill = "none") +  # Remove the legend; FALSE is deprecated here.
  theme_bw()               # For a clean look overall.