Tidy data

#------------------------------------------------------------------------#
# Tidy data script
# Demonstrates cleaning up the messiest data / Excel files you can find
#------------------------------------------------------------------------#

# Install these packages
install.packages("readxl")    # For importing Excel data
install.packages("tidyr")     # For re-organizing your data frames
install.packages("lubridate") # For dates
install.packages("stringr") # Cleans spaces and character strings
install.packages("forcats")   # For assigning an order to character data c("Gold", "Silver", "Bronze")
                 
# Load these packages
library("dplyr")
library("readr")
library("ggplot2")
library("tidyr")     
library("readxl")    
library("lubridate") 
library("stringr")
library("forcats")   


#------------------------------------------------------------------------#
# Wide tables
#------------------------------------------------------------------------#

# Let's start with a really WIDEEEEE Excel table with lots of columns

#1. Load emissions Excel file
folder <- "messy_data/"

wide_data <- read_excel(paste0(folder, "emissions_2015.xlsx"))

#2. Lowercase names
names(wide_data) <- tolower(names(wide_data))

names(wide_data)

#3. Gather all the pollutant columns into one
long_data <- wide_data %>% 
                gather(key = pollutant, value = emissions, `222trifluoroetha`:`solv in coat`)

#4. Drop the missing values and zero emissions
long_data <- filter(long_data, !is.na(emissions), emissions > 0)


#5. Plot emissions for *ALL* pollutants
long_data %>% ggplot(aes(x = pollutant, y = emissions)) + 
                geom_boxplot() + 
                scale_y_log10()

#6. Plot GHGs
long_data <- filter(long_data, pollutant %in% c("co2", "co2-b", "methane", "nitrous oxid", "hfc-134a"))

long_data %>% 
  ggplot(aes(x = pollutant, y = emissions)) + 
    geom_boxplot() + 
    scale_y_log10()


#6. Arrange plots in descending order
long_data <- long_data %>% arrange(desc(emissions))

# See the new pollutant order
unique(long_data$pollutant)

# Use fct_inorder() to tell ggplot to respect your pollutant order.
long_data %>% 
  ggplot(aes(x = fct_inorder(pollutant), y = emissions)) + 
    geom_boxplot() + 
    scale_y_log10() +
    labs(x = "Pollutant")


# **Bonus** 
# Flip your data back to wide format using 
## spread(data, column_to_spread, column_to_use_for_values)
wide_again <- spread(long_data, pollutant, emissions)  


#------------------------------------------------------------------------#
# Looping through files
#------------------------------------------------------------------------#

# Example "for loop"
num <- 1:10

num

# Pi loop
for (i in num) {
  
  print(i + pi)
  
}


# Save answers in Pi loop
answers <- c()

for (i in num) {
  
  temp <- i + pi
  
  print(temp)
  
  answers <- c(temp, answers)
  
}

answers

# For loop of air monitoring files
files_2016 <- list.files("messy_data/60min/")


# Loop through each file
for (i in files_2016) {
  
  print(i)

}

# Read each file and add to master table "data_2016"
data_2016 <- data_frame()


for (i in files_2016) {
  
  print(i)
  
  # Read in a single file
  temp <- read_csv(paste0("messy_data/60min/", i))
 
  # Select only 2 columns and date/time
  temp <- select(temp, `Date&Time`, `BC1(370)`, `BC6(880)`)
  
  # Add to master table of all files  
  data_2016 <- bind_rows(temp, data_2016)

}


# Fix column names
names(data_2016)

# str_trim() removes empty spaces at the beginning and end of a string
str_trim(" excel_column ")

# gsub() finds and replaces values in strings. 
# Use ?gsub for more information on how to supply arguments.

# gsub(character_to_search_for, what_toreplace_with, your_string)
# The line below searches for any space and replaces it with nothing "".
gsub(" ", "", " excel column ")


# Parenthesis are special.
# When replacing special characters, you must use [ ] around them or you'll get an error like this.
gsub("(", "", names(data_2016))

# Drop left
gsub("[(]", "", names(data_2016))

# Drop right
gsub("[)]", "", names(data_2016))

# Drop both with the "or" operator |
# Hint: it's the vertical line above the Enter key on the keybord
names(data_2016) <- gsub("[()]|[)]", "", names(data_2016))

names(data_2016)

# Drop the M&M symbol
names(data_2016) <- gsub("&", "", names(data_2016))

names(data_2016)

# Make lowercase
names(data_2016) <- tolower(names(data_2016))

names(data_2016)

# Check date coverage
range(data_2016$datetime)


# Use function from lubridate package to convert to "date"
# Use mdy_hm because the dates are in Month, Day, Year, Hour, Minute order
data_2016 <- mutate(data_2016, date = mdy_hm(datetime)) 

range(data_2016$date)

# Now extract the individual time components
data_2016 <- mutate(data_2016,
                    year    = year(date),
                    quarter = quarter(date),
                    month   = month(date),
                    week    = week(date),
                    day     = day(date),
                    hour    = hour(date),
                    minute  = minute(date))


# Now you can group_by the different time components
group_by(data_2016, quarter) %>% summarize(med_bc1270 = median(bc1370, na.rm = T),
                                           med_bc6880 = median(bc6880, na.rm = T)) 



# View final table
View(data_2016)



#------------------------------------------------------------------------#
# Anne's scary landfill data ;( 
#------------------------------------------------------------------------#

# Excel range example and sheet selection
folder <- "messy_data/"

permitted <- read_excel(paste0(folder, "2016 Demo Retrac Report.xlsx"), 
                        sheet = "2014", 
                        range = "A4:I102", 
                        col_names = T)

ultimate <- read_excel(paste0(folder, "2016 Demo Retrac Report.xlsx"), 
                       sheet = "2014", 
                       range = "K4:N102", 
                       col_names = T)

# left_join()
one_table <- left_join(permitted, ultimate)


#------------------------------------------------------------------------#
# Blank rows in Excel
#------------------------------------------------------------------------#

# See what directory you're in
list.files("messy_data/")

# Read the PTE tab - "Potential emissions"
emits <- read_excel("messy_data/emission_calcs.xlsx", "PTE - Pollutant Summary")


# Where do the column names start? Let's "skip" the top 3 info rows.


# Skip first 3 rows
emits <- read_excel("messy_data/emission_calcs.xlsx", 
                    sheet = "PTE - Pollutant Summary", 
                    skip = 3)

View(emits)

# Why are pollutant names only listed once in table? Maybe it seemed like a good idea at the time, but can create problems when sorting.

#-------------------------------------------#
# How can we fill in the empty rows?
#-------------------------------------------#

# -> Don't worry you don't need to paste in Excel! 
# -> R is your friend and will help you.


# Fill empty rows using fill() from the tidyr package
emits <- fill(emits, Pollutant)

View(emits)

#------------------------------------------------------------------------#
# Look out messy data, R is coming for you.
#------------------------------------------------------------------------#


############
# THE END  #
############
R Workshops | Tidy data

Tidy data