This workshop is available online at
The messy data examples can be downloaded from here
https://github.com/MPCA-air/RCamp/tree/master/data/messy_data
#------------------------------------------------------------------------#
# Tidy data script
# Demonstrates cleaning up the messiest data / Excel files you can find
#------------------------------------------------------------------------#
# Install these packages
install.packages("readxl") # For importing Excel data
install.packages("tidyr") # For re-organizing your data frames
install.packages("lubridate") # For dates
install.packages("stringr") # Cleans spaces and character strings
install.packages("forcats") # For assigning an order to character data c("Gold", "Silver", "Bronze")
# Load these packages
library("dplyr")
library("readr")
library("ggplot2")
library("tidyr")
library("readxl")
library("lubridate")
library("stringr")
library("forcats")
#------------------------------------------------------------------------#
# Wide tables
#------------------------------------------------------------------------#
# Let's start with a really WIDEEEEE Excel table with lots of columns
#1. Load emissions Excel file
folder <- "messy_data/"
wide_data <- read_excel(paste0(folder, "emissions_2015.xlsx"))
#2. Lowercase names
names(wide_data) <- tolower(names(wide_data))
names(wide_data)
#3. Gather all the pollutant columns into one
long_data <- wide_data %>%
gather(key = pollutant, value = emissions, `222trifluoroetha`:`solv in coat`)
#4. Drop the missing values and zero emissions
long_data <- filter(long_data, !is.na(emissions), emissions > 0)
#5. Plot emissions for *ALL* pollutants
long_data %>% ggplot(aes(x = pollutant, y = emissions)) +
geom_boxplot() +
scale_y_log10()
#6. Plot GHGs
long_data <- filter(long_data, pollutant %in% c("co2", "co2-b", "methane", "nitrous oxid", "hfc-134a"))
long_data %>%
ggplot(aes(x = pollutant, y = emissions)) +
geom_boxplot() +
scale_y_log10()
#6. Arrange plots in descending order
long_data <- long_data %>% arrange(desc(emissions))
# See the new pollutant order
unique(long_data$pollutant)
# Use fct_inorder() to tell ggplot to respect your pollutant order.
long_data %>%
ggplot(aes(x = fct_inorder(pollutant), y = emissions)) +
geom_boxplot() +
scale_y_log10() +
labs(x = "Pollutant")
# **Bonus**
# Flip your data back to wide format using
## spread(data, column_to_spread, column_to_use_for_values)
wide_again <- spread(long_data, pollutant, emissions)
#------------------------------------------------------------------------#
# Looping through files
#------------------------------------------------------------------------#
# Example "for loop"
num <- 1:10
num
# Pi loop
for (i in num) {
print(i + pi)
}
# Save answers in Pi loop
answers <- c()
for (i in num) {
temp <- i + pi
print(temp)
answers <- c(temp, answers)
}
answers
# For loop of air monitoring files
files_2016 <- list.files("messy_data/60min/")
# Loop through each file
for (i in files_2016) {
print(i)
}
# Read each file and add to master table "data_2016"
data_2016 <- data_frame()
for (i in files_2016) {
print(i)
# Read in a single file
temp <- read_csv(paste0("messy_data/60min/", i))
# Select only 2 columns and date/time
temp <- select(temp, `Date&Time`, `BC1(370)`, `BC6(880)`)
# Add to master table of all files
data_2016 <- bind_rows(temp, data_2016)
}
# Fix column names
names(data_2016)
# str_trim() removes empty spaces at the beginning and end of a string
str_trim(" excel_column ")
# gsub() finds and replaces values in strings.
# Use ?gsub for more information on how to supply arguments.
# gsub(character_to_search_for, what_toreplace_with, your_string)
# The line below searches for any space and replaces it with nothing "".
gsub(" ", "", " excel column ")
# Parenthesis are special.
# When replacing special characters, you must use [ ] around them or you'll get an error like this.
gsub("(", "", names(data_2016))
# Drop left
gsub("[(]", "", names(data_2016))
# Drop right
gsub("[)]", "", names(data_2016))
# Drop both with the "or" operator |
# Hint: it's the vertical line above the Enter key on the keybord
names(data_2016) <- gsub("[()]|[)]", "", names(data_2016))
names(data_2016)
# Drop the M&M symbol
names(data_2016) <- gsub("&", "", names(data_2016))
names(data_2016)
# Make lowercase
names(data_2016) <- tolower(names(data_2016))
names(data_2016)
# Check date coverage
range(data_2016$datetime)
# Use function from lubridate package to convert to "date"
# Use mdy_hm because the dates are in Month, Day, Year, Hour, Minute order
data_2016 <- mutate(data_2016, date = mdy_hm(datetime))
range(data_2016$date)
# Now extract the individual time components
data_2016 <- mutate(data_2016,
year = year(date),
quarter = quarter(date),
month = month(date),
week = week(date),
day = day(date),
hour = hour(date),
minute = minute(date))
# Now you can group_by the different time components
group_by(data_2016, quarter) %>% summarize(med_bc1270 = median(bc1370, na.rm = T),
med_bc6880 = median(bc6880, na.rm = T))
# View final table
View(data_2016)
#------------------------------------------------------------------------#
# Anne's scary landfill data ;(
#------------------------------------------------------------------------#
# Excel range example and sheet selection
folder <- "messy_data/"
permitted <- read_excel(paste0(folder, "2016 Demo Retrac Report.xlsx"),
sheet = "2014",
range = "A4:I102",
col_names = T)
ultimate <- read_excel(paste0(folder, "2016 Demo Retrac Report.xlsx"),
sheet = "2014",
range = "K4:N102",
col_names = T)
# left_join()
one_table <- left_join(permitted, ultimate)
#------------------------------------------------------------------------#
# Blank rows in Excel
#------------------------------------------------------------------------#
# See what directory you're in
list.files("messy_data/")
# Read the PTE tab - "Potential emissions"
emits <- read_excel("messy_data/emission_calcs.xlsx", "PTE - Pollutant Summary")
# Where do the column names start? Let's "skip" the top 3 info rows.
# Skip first 3 rows
emits <- read_excel("messy_data/emission_calcs.xlsx",
sheet = "PTE - Pollutant Summary",
skip = 3)
View(emits)
# Why are pollutant names only listed once in table? Maybe it seemed like a good idea at the time, but can create problems when sorting.
#-------------------------------------------#
# How can we fill in the empty rows?
#-------------------------------------------#
# -> Don't worry you don't need to paste in Excel!
# -> R is your friend and will help you.
# Fill empty rows using fill() from the tidyr package
emits <- fill(emits, Pollutant)
View(emits)
#------------------------------------------------------------------------#
# Look out messy data, R is coming for you.
#------------------------------------------------------------------------#
############
# THE END #
############