MW_DataWrangling.Rmd

---
title: "MeadoWatch Data Wrangling"
author: "Janneke Hille Ris Lambers, Aji John, Meera Sethi, Elli Theobald"
date: "06/27/2021"
output: html_document
---

# Setup for R script and R markdown

1. Set knitr chunk options
2. Specify string behavior
3. Load packages

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(stringsAsFactors = FALSE)
library(tidyverse)
#library(leaflet)
#library(lubridate)
library(boot)
#library(readr) 
```


# Read in data, reconfigure

1. Read in raw data, remove rows with an NA in any phenophase
2. Remove unneeded columns
3. Calculate and add Day of Year (DOY) to data
4. Save the result as PhenoSite

```{r}
# Read in the raw phenology observations
PhenoDatall <- read.csv("data/MW_PhenoDat_2013_2019.csv", header = TRUE)

# Keep only rows in which all four phenophases were scored (no NA)
PhenoPhases <- PhenoDatall[, c("Bud", "Flower", "Fruit", "Disperse")]
PhenoDat <- PhenoDatall[complete.cases(PhenoPhases), ]

# Extract data columns of interest
PhenoSite_0 <- PhenoDat[, c("Year", "Transect", "Site_Code",
                            "Date", "Observer", "QA.QC",
                            "Species", "Snow", "Flower")]

# Order by year (order() is stable, so rows keep their relative order
# within a year)
PhenoSite_0 <- PhenoSite_0[order(PhenoSite_0$Year), ]

# Years present in the data
yrs <- unique(PhenoSite_0$Year)

# Convert the observed 'Date' (month/day/year text) to day of year (DOY).
# Computed row by row against January 1 of that row's own year, so the result
# is aligned with PhenoSite_0 by construction rather than by concatenating
# per-year pieces.
# Note: this is the number of days elapsed since January 1, so January 1
# itself has DOY 0 (same convention as julian() with origin = January 1).
ObsDate <- as.Date(as.character(PhenoSite_0$Date), "%m/%d/%Y")
Jan1 <- as.Date(paste0(PhenoSite_0$Year, "-01-01"), format = "%Y-%m-%d")
DOY <- as.numeric(difftime(ObsDate, Jan1, units = "days"))

# Now add day of year (DOY) to the PhenoSite data
PhenoSite <- cbind(PhenoSite_0, DOY)

# Examine the data
head(PhenoSite)


```


# Calculate observer effort per plot / year / species, remove those with too few observations

Add information to PhenoSite to allow for outlier detection:

1. Calculate total observations per plot / year / species
2. Filter out plots with too little data (see below for parameters that can be modified)
   + *totobs* (min # observations for fitting, currently set at 30)
   + *totyesobs* (min # yes observations of flowering, currently set at 5)

```{r}
# Work from the DOY-augmented observation table
PhenoSite_0 <- PhenoSite

# Minimum effort required to keep a plot / year / species combination
totobs <- 30     # total observations
totyesobs <- 5   # observations with Flower == 1

# Columns identifying one plot / year / species combination
effort_keys <- c("Year", "Transect", "Site_Code", "Species")

# Total observations per combination
Nobs <- as.data.frame(
  count(PhenoSite_0, Transect, Year, Site_Code, Species, name = "Nobs")
)

# 'Yes' (flowering) observations per combination
PhenoSite_1 <- PhenoSite_0[PhenoSite_0$Flower == 1, ]
Nobsyes <- as.data.frame(
  count(PhenoSite_1, Transect, Year, Site_Code, Species, name = "NobsY")
)

# Attach both counts to every observation. The second merge keeps only
# combinations that have at least one 'yes' observation.
PhenoSite_2 <- merge(PhenoSite_0, Nobs, by = effort_keys)
PhenoSite_3 <- merge(PhenoSite_2, Nobsyes, by = effort_keys)

# Keep combinations that meet both thresholds
has_enough_obs <- PhenoSite_3$Nobs >= totobs
has_enough_yes <- PhenoSite_3$NobsY >= totyesobs
PhenoSite_4 <- PhenoSite_3[has_enough_obs & has_enough_yes, ]
head(PhenoSite_4)
     
```

# Calculate outliers

For each observation, calculate the distance (in days) to the closest other 'yes' flowering observation and to the closest other 'no' flowering observation in the same plot / year / species. We will use this to detect outliers (i.e. 'yes' observations that are more than a certain number of days separated from other 'yes' observations).
```{r}
# For each observation, calculate the distance (in days) to the nearest OTHER
# 'yes' (Flower == 1) and nearest OTHER 'no' (Flower == 0) observation of the
# same Year / Site_Code / Species.
# The work is done one group at a time on a small pairwise-distance matrix
# instead of re-subsetting the whole data frame for every row.

# Distance from every element of `doy` to the closest other element flagged in
# `is_target`. An observation is never compared with itself. Returns Inf where
# no other target observation exists (what min() of an empty set yields).
nearest_other_obs <- function(doy, is_target) {
  n_obs <- length(doy)
  target_idx <- which(is_target)
  if (n_obs == 0L) {
    return(numeric(0))
  }
  if (length(target_idx) == 0L) {
    return(rep(Inf, n_obs))
  }
  # rows = all observations, columns = target observations
  gaps <- abs(outer(doy, doy[target_idx], "-"))
  # exclude the current observation from its own comparison set
  gaps[cbind(target_idx, seq_along(target_idx))] <- Inf
  apply(gaps, 1, min)
}

nrows_P4 <- nrow(PhenoSite_4)
distyes <- rep(NA_real_, times = nrows_P4)
distno <- rep(NA_real_, times = nrows_P4)

# Row indices of each Year / Site_Code / Species group
obs_groups <- split(seq_len(nrows_P4),
                    list(PhenoSite_4$Year,
                         PhenoSite_4$Site_Code,
                         PhenoSite_4$Species),
                    drop = TRUE)

for (rows in obs_groups) {
  doy <- PhenoSite_4$DOY[rows]
  flower <- PhenoSite_4$Flower[rows]
  distyes[rows] <- nearest_other_obs(doy, flower == 1)
  distno[rows] <- nearest_other_obs(doy, flower == 0)
}

# Add the two distance columns to the data
PhenoSite_5 <- cbind(PhenoSite_4, distyes, distno)

# Examine file
head(PhenoSite_5)

# Write data file
write.csv(PhenoSite_5, "data/PhenoSite_Dirty.csv", quote = TRUE,
          row.names = FALSE)

```

# Examine observation effort over time

Create graphs showing observation effort across the growing season. These are the graphs in Appendix D.
```{r}

# Read in data
PhenoSite_6 <- read.csv("data/PhenoSite_Dirty.csv", header = TRUE)
trails <- unique(PhenoSite_6$Transect)

# The loop below stores the first transect as RL_nobs and the second as
# GB_nobs, so at least two transects are required.
# NOTE(review): this assumes the first transect encountered in the file is
# Reflection Lakes and the second is Glacier Basin - confirm against the data.
if (length(trails) < 2) {
  stop("Expected at least two transects in data/PhenoSite_Dirty.csv",
       call. = FALSE)
}

#file name for figure
filenames <- c("FigD1.tif", "FigD2.tif")

# For loop to create graph (one per trail)
for(i in 1:2){

  # Extract trail specific data
  PhenoSite_trail <- PhenoSite_6[PhenoSite_6$Transect==trails[i],]

  # Define years and season (range of observed DOY) for trail
  yr1 <- min(PhenoSite_trail$Year); yr2 <- max(PhenoSite_trail$Year)
  nyr_trail <- yr2 - yr1
  trail_years <- yr1:yr2
  minDOY <- min(PhenoSite_trail$DOY); maxDOY <- max(PhenoSite_trail$DOY)
  season <- seq(minDOY, maxDOY)

  # One vector of per-day visit counts per year, bound into a matrix at the end
  weekly_counts <- vector("list", length(trail_years))

  #Set figure for tiff
  #tiff(file=paste("output/figures/",filenames[i], sep=""),
  #     width=6, height=4, units="in", res=600)

  #Make a dummy plot to add points to
  par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.6,0.5,0.5),
      tck=-0.02, mgp=c(1.1,0.5,0))
  plot(1,1, type="n", xlim=c(135, 285),
       ylim=c(0,(nyr_trail+2)), xaxt="n", yaxt="n",
       xlab="observation date", ylab="")
  title(trails[i])

  # x axis labels (same for every year, so drawn once per figure)
  posy2 <- (yr2-yr1)*-0.1
  text(c(135, 165, 195, 225, 255, 285), posy2,
       labels=c("May", "Jun", "Jul", "Aug", "Sept", "Oct"), xpd=NA)

  # x position of the year labels, just left of the plotting region
  xposlab <- 135 - (285-135)*1/15

  #now add points and labels, one row of points per year
  for(jj in seq_along(trail_years)){
    j <- trail_years[jj]

    #Determine unique observer / date combinations (= visits)
    PhenoSite_trailyr <- PhenoSite_trail[PhenoSite_trail$Year==j,]
    ObsDates <- paste(PhenoSite_trailyr$Observer,PhenoSite_trailyr$DOY)
    Nobs_year <- length(unique(ObsDates))
    ObsDOY <- unique(data.frame(Observer = ObsDates,
                                DOY = PhenoSite_trailyr$DOY))

    #print out # visits
    print(paste(trails[i],j,sep="-"))
    print(Nobs_year)

    #Determine y value (jittered around the year's row)
    posy <- (j-yr1+1)
    obsy <- jitter(rep(posy,times=nrow(ObsDOY)),
                   amount = 0.5/nyr_trail)
    points(ObsDOY$DOY, obsy, pch=21, bg="grey")

    # y axis label
    text(xposlab,posy,labels=j,cex=0.5, adj=1, xpd=NA)

    #Now calculate how many visits fall within the window around each day:
    #visits strictly less than 4 days away. Days outside this year's first
    #and last visit stay NA.
    nobsseasontmp <- rep(NA_integer_, times=length(season))
    if(nrow(ObsDOY) > 0){
      in_season <- season >= min(ObsDOY$DOY) & season <= max(ObsDOY$DOY)
      nobsseasontmp[in_season] <- vapply(
        season[in_season],
        function(day) sum(abs(ObsDOY$DOY - day) < 4),
        integer(1)
      )
    }
    weekly_counts[[jj]] <- nobsseasontmp
  }

  # Matrix: first column DOY, then one column of visit counts per year
  nobsseason <- do.call(cbind, c(list(season), weekly_counts))
  colnames(nobsseason) <- c("DOY", trail_years)

  if(i==1){RL_nobs <- nobsseason}
  if(i==2){GB_nobs <- nobsseason}

  #uncomment if you want to make a tif figure
  #dev.off()
}

# Print the observation-effort summary for one trail.
#   nobs  - matrix whose first column is DOY and whose remaining columns
#           (named by year) hold the number of visits in the window around
#           each day, NA outside that year's data collection season
#   trail - trail name used in the printed headings
# Prints, per year: first / last day and length of the collection season,
# mean, min and max visits per window, and the number of days with no visit
# in the window. Returns NULL invisibly.
print_effort_summary <- function(nobs, trail) {
  yr_labels <- colnames(nobs)[-1]
  # drop = FALSE keeps a matrix even when there is a single year column
  counts <- nobs[, -1, drop = FALSE]

  # Data collection season per year
  print(paste0(trail, "; data collection season"))
  for (i in seq_along(yr_labels)) {
    surveyed <- nobs[!is.na(counts[, i]), 1]
    frst <- min(surveyed); lst <- max(surveyed)
    ttl <- lst - frst
    print(paste0(yr_labels[i], "- first, last, length"))
    print(c(frst, lst, ttl))
  }

  # Average number of observations per week
  print(paste0(trail, "; average observations per week"))
  print(colMeans(counts, na.rm = TRUE))

  print(paste0(trail, "; min and max # observations per week"))
  print(apply(counts, 2, min, na.rm = TRUE))
  print(apply(counts, 2, max, na.rm = TRUE))

  # Days within the season whose window contained no visit
  print(paste0(trail, "; days with 1 week gap"))
  for (i in seq_along(yr_labels)) {
    ndaysgap <- sum(counts[, i] == 0, na.rm = TRUE)
    print(paste(yr_labels[i], "-", ndaysgap))
  }

  invisible(NULL)
}

##Reflection Lakes
print_effort_summary(RL_nobs, "Reflection Lakes")

#Glacier basin
print_effort_summary(GB_nobs, "Glacier Basin")


```


# Examine outliers, eliminate, write clean data file

Make graphs for each species and plot to examine outliers, then eliminate 'yes' observations whose nearest other 'yes' observation is more than a threshold number of days away. The threshold is set by `out_thresh` (currently 21 days, i.e. 3 weeks).
```{r}
# Read in data with distyes / distno
PhenoSite_6 <- read.csv("data/PhenoSite_Dirty.csv", header=TRUE)

# Define threshold (max distance between yes observations): a 'yes'
# observation is an outlier when its nearest other 'yes' observation is
# more than out_thresh days away
out_thresh <- 21

# Create plots of outliers: one png per species, one row of points per
# year / site combination
species <- unique(PhenoSite_6$Species)

for(i in seq_along(species)){
  PhenoSite_Species <- PhenoSite_6[PhenoSite_6$Species==species[i],]

  #make unique identifier - year, site code
  yrsite <- paste(PhenoSite_Species$Year, PhenoSite_Species$Site_Code)
  yrsites <- unique(yrsite); nyrsite <- length(yrsites)

  #assess number of yes obs and outlier obs; outliers are counted with the
  #same rule (distyes > out_thresh) that is used to remove them from the data
  is_yes <- PhenoSite_Species$Flower==1
  nobsyes_sp <- nrow(PhenoSite_Species[is_yes,])
  noutliers_sp <- nrow(PhenoSite_Species[is_yes
                       & PhenoSite_Species$distyes>out_thresh,])

  #Open the png device for this species
  namegraph <- paste("figs/",species[i],"_outliers.png", sep="", collapse=NULL)

  png(filename = namegraph,
    width = 480, height = 720, units = "px")

  #Make a dummy plot to add points to
  mxy <- max(PhenoSite_Species$distyes[PhenoSite_Species$Flower==1])
  par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.6,0.5,0.5),
    tck=-0.02, mgp=c(1.1,0.5,0))
  plot(1,1, type="n", xlim=c(0,mxy), ylim=c(0,(nyrsite+1)), yaxt="n",
       xlab="Time between yes observations", ylab="")
  title(paste(species[i],"-", noutliers_sp, "outlier / ",
              nobsyes_sp, "total yes"))
  abline(v=out_thresh)

  # x offset of the year / site labels, left of the plotting region
  xposlab <- mxy*1/15

  #now add points and labels; for every 'yes' observation plot the distance
  #to the nearest 'no' (grey) and to the nearest other 'yes' (black)
  for(j in rev(seq_len(nyrsite))){
    PhenoSite_plt <- PhenoSite_Species[yrsite==yrsites[j],]
    PhenoSite_plt_ys <- PhenoSite_plt[PhenoSite_plt$Flower==1,]
    noy <- jitter(rep(j,times=nrow(PhenoSite_plt_ys)), amount = 1/nyrsite)
    yesy<- jitter(rep(j,times=nrow(PhenoSite_plt_ys)), amount = 1/nyrsite)
    points(PhenoSite_plt_ys$distno, noy, pch=21, bg="grey")
    points(PhenoSite_plt_ys$distyes, yesy, pch=21, bg="black", cex=1.5)
    #labels
    text(-xposlab,j,labels=yrsites[j],cex=0.5, adj=1, xpd=NA)
  }

  #legend drawn once, on top of all points
  legend(x="topright", legend = c("yes.yes","yes.no"),
         pch=21, pt.bg=c("black","grey"))
  dev.off()
}

# Flag outliers as defined by out_thresh: 'yes' observations whose nearest
# other 'yes' observation is more than out_thresh days away
is_outlier <- PhenoSite_6$Flower == 1 & PhenoSite_6$distyes > out_thresh

# Save the flagged observations so they can be checked by hand
# NOTE(review): quote = FALSE will produce a malformed csv if any text field
# (e.g. Observer) contains a comma - confirm against the data
outliercheck <- PhenoSite_6[is_outlier, ]
filename <- paste0("output/outliers", "-", out_thresh + 1, ".csv")
write.csv(outliercheck, file = filename, quote = FALSE, row.names = FALSE)

# 0 / 1 indicator per observation (1 = outlier)
outlierid <- numeric(nrow(PhenoSite_6))
outlierid[is_outlier] <- 1

# Drop the outliers and save the clean data
PhenoSite_7 <- PhenoSite_6[outlierid == 0, ]
write.csv(PhenoSite_7, "cleandata/PhenoSite_Clean.csv", quote = TRUE,
          row.names = FALSE)

# Tally what is left and what was removed
Nobs <- nrow(PhenoSite_7)
Nobsyes <- nrow(PhenoSite_7[PhenoSite_7$Flower == 1, ])
Noutliers <- nrow(outliercheck)

# Report the totals - note this is for all species (some spp eliminated)
print("Total (all species), Total Yes, Total outliers")
print(c(Nobs, Nobsyes, Noutliers))
```


# Distance between yes observations for all species (Figure E1)

Make a graph for all species showing the distribution of the distance between yes observations, together with the cutoff and the number of observations excluded. This is supplemental Figure E1.
```{r}
# Load the observations that carry distyes / distno
PhenoSite_6 <- read.csv("data/PhenoSite_Dirty.csv", header = TRUE)

# Threshold: 'yes' observations with distyes > out_thresh are counted as
# outliers; the cutoff line is drawn at out_thresh + 1
# NOTE(review): the cleaning chunk uses out_thresh = 21 with the same ">"
# rule, so the counts shown here also include observations with
# distyes == 21 - confirm which cutoff is intended
out_thresh <- 20

# Leave out a few species
not_dropped <- PhenoSite_6$Species != "ANAR" &
  PhenoSite_6$Species != "LICA" &
  PhenoSite_6$Species != "MEPA"
PhenoSite_7 <- PhenoSite_6[not_dropped, ]

# Species to plot, in reverse alphabetical order, without MIAL
species <- sort(unique(PhenoSite_7$Species), decreasing = TRUE)
species <- species[species != "MIAL"]
nsp <- length(species)

# Largest distance between yes observations, used to scale the x axis
# NOTE(review): this maximum is taken over PhenoSite_7, which still
# contains MIAL even though MIAL is not plotted - confirm this is intended
mxy <- max(PhenoSite_7$distyes[PhenoSite_7$Flower == 1])

#set parameters for figure if writing to tiff
#tiff(file="output/figures/FigE1.tif", width=6, height=5, units="in", res=600)

# Empty plotting region: one row per species
par(mfrow = c(1, 1), omi = c(0, 0.75, 0, 0.75), mai = c(0.5, 0.5, 0.5, 0.4),
    tck = -0.01, mgp = c(1.1, 0.35, 0))
plot(1, 1, type = "n", xlim = c(0, (mxy * 1.2)), ylim = c(0.5, (nsp + 0.5)),
     yaxt = "n", xlab = "Time between yes observations", ylab = "")

# Cutoff line
abline(v = (out_thresh + 1))

# Horizontal positions of the species label (left) and the count (right)
label_x <- -(mxy * 1 / 15)
count_x <- mxy * 1.2

for (sp_row in seq_along(species)) {
  sp_obs <- PhenoSite_6[PhenoSite_6$Species == species[sp_row], ]
  sp_yes <- sp_obs[sp_obs$Flower == 1, ]
  sp_yes_out <- sp_yes[sp_yes$distyes > out_thresh, ]

  # number of outliers and of yes observations for this species
  nout <- nrow(sp_yes_out)
  ntotF <- nrow(sp_yes)

  # one point per yes observation, jittered around the species row
  yesy <- jitter(rep(sp_row, times = ntotF), amount = 1.5 / nsp)
  points(sp_yes$distyes, yesy, pch = 21, bg = "grey", cex = 1.5)

  # species code to the left of the axis
  text(label_x, sp_row, labels = species[sp_row], cex = 0.75, adj = 1,
       xpd = NA)

  # "outliers/total yes" at the right edge
  text(count_x, sp_row, labels = paste(nout, "/", ntotF, sep = ""),
       cex = 0.75, adj = 1)
}

# uncomment below if writing to tiff file
#dev.off()

     
```