# data_carpentry_2016-08-24.R

# Introductory R session: arithmetic, reading a CSV, inspecting a data frame

# R as a calculator
2016 - 1969

# read the survey data straight from a URL into a data frame
surveys <- read.csv("http://kbroman.org/datacarp/portal_data_joined.csv")

# first few rows
head(surveys)

# last few rows
tail(surveys)

# compact overview of the columns and their types
str(surveys)

# per-column summary
summary(surveys)

# size and labels of the data frame
dim(surveys)
ncol(surveys)
nrow(surveys)
names(surveys)
colnames(surveys)
rownames(surveys)

# download file from web as local file
# (create the target directory first: download.file() cannot write into a
#  directory that does not exist; showWarnings = FALSE keeps this quiet when
#  the directory is already there)
dir.create("CleanData", showWarnings = FALSE)
download.file("http://kbroman.org/datacarp/portal_data_joined.csv",
              "CleanData/portal_data_joined.csv")

# read data from local file (the path is relative to the working directory)
surveys <- read.csv("CleanData/portal_data_joined.csv")

# current working directory
getwd()

# indexing a data frame: [row, column]
surveys[1, 1]
surveys[1, 7]
surveys[10001, 7]
surveys[2, ]   # blank column index: the whole row
surveys[2,]  # don't _need_ the spaces
surveys[, 7]   # blank row index: the whole column

# several ways to pull a single column out as a vector
sex <- surveys[, 7]
sex <- surveys[, "sex"]
sex <- surveys$sex
sex <- surveys[["sex"]]

# a single element of a vector
sex[1]
sex[10001]

# c() combines values into a vector
c(1, 4, 6)

# index with a vector to get several elements at once
sex[c(1, 4, 6)] # <- ouch, 4 and 6 are "" and aren't seen

sex[c(1, 10001)] # <- pull out 1st and 10001th.

# the colon operator builds integer sequences
1:10
10:1
sex[1:10]

# every 2nd value
seq(1, 10, by = 2)
sex[seq(1, 10, by = 2)]

# a few rows of surveys (first three, three from the middle, rows by columns)
surveys[1:3, ]
surveys[10001:10003, ]
surveys[5:7, 1:7]


# seq function: arguments by position or by name
seq(1, 11)
?seq # help
seq(1, 11, 2)
seq(1, 11, by = 2)
seq(to = 11, from = 1, by = 2)

# challenge # 2: keep every 10th row
nrow(surveys)
indexes <- seq(10, nrow(surveys), by = 10)
surveys_by_10 <- surveys[indexes, ]

# the same thing without the intermediate vector
surveys_by_10 <- surveys[seq(10, nrow(surveys), by = 10), ]

# those awful blanks in the data file:
# re-read it, treating empty fields as missing values
surveys <- read.csv("CleanData/portal_data_joined.csv",
                    na.strings = "")

surveys[, "sex"]

### dplyr

# install packages, but only when they are not already available
# (an unconditional install.packages() would re-install them on every run)
if (!requireNamespace("dplyr", quietly = TRUE)) {
    install.packages("dplyr")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
    install.packages("ggplot2")
}

# load the dplyr package
library(dplyr)

# keep only a few columns
selected_col <- select(surveys, sex, species_id, plot_type, weight)
head(selected_col)

# keep only the rows that satisfy a condition
selected_row <- filter(surveys, year == 2002)
head(selected_row)

# several conditions: a row must satisfy all of them
selected_row <- filter(surveys, year == 2002, weight > 78)
head(selected_row)

selected_row <- filter(surveys, sex == "F", weight > 78)
head(selected_row)

# pipe operator: pass the result of one step on to the next
surveys %>%
    filter(weight < 5) %>%
    select(species_id, sex, weight)

# the same two steps written with an intermediate object
selected_rows <- filter(surveys, weight < 5)
result <- select(selected_rows, species_id, sex, weight)

# challenge #4: records from before 1995, keeping year, sex and weight
surveys %>%
    filter(year < 1995) %>%
    select(year, sex, weight)

# variation: add a second filter step on weight
surveys %>%
    filter(year < 1995) %>%
    filter(weight > 78) %>%
    select(year, sex, weight)

# equivalent: both conditions in a single filter()
surveys %>%
    filter(year < 1995, weight > 78) %>%
    select(year, sex, weight)

# additionally restrict to species_id "DM", and save the result
selected_stuff <- surveys %>%
    filter(year < 1995, weight > 78, species_id == "DM") %>%
    select(year, sex, weight)

# assigning at the end of the pipeline with -> also works
# (but please don't)
surveys %>%
    filter(year < 1995, weight > 78, species_id == "DM") %>%
    select(year, sex, weight) -> selected_stuff

# mutate: add a new column computed from existing ones
surveys %>%
    mutate(weight_kg = weight / 1000) %>%
    tail()

# mutate + filter
surveys %>%
    filter(weight > 78) %>%
    mutate(weight_kg = weight / 1000) %>%
    tail()

# mutate + filter + select
surveys %>%
    filter(weight > 78) %>%
    mutate(weight_kg = weight / 1000) %>%
    select(weight, weight_kg) %>%
    tail()

# this won't work: select() asks for weight_kg before mutate() has created it.
# Wrapped in try() so the error message is still shown, but the script keeps
# going when it is run non-interactively (e.g. with source()).
try(
    surveys %>%
        filter(weight > 78) %>%
        select(weight, weight_kg) %>%
        mutate(weight_kg = weight / 1000) %>%
        tail()
)

# save the filtered data with the extra column as a new object
surveys_plus_weight_kg <- surveys %>%
    filter(weight > 78) %>%
    mutate(weight_kg = weight / 1000)

# add column and write over the surveys data
surveys <- surveys %>%
    mutate(weight_kg = weight / 1000)

# sqrt() takes the square root
sqrt(5)

# challenge 5: square root of hindfoot length, keeping only values below 3
result <- surveys %>%
    mutate(hindfoot_sqrt = sqrt(hindfoot_length)) %>%
    filter(hindfoot_sqrt < 3) %>%
    select(species_id, hindfoot_sqrt)

# variation on challenge 5: also keep the rows where the value is NA
result2 <- surveys %>%
    mutate(hindfoot_sqrt = sqrt(hindfoot_length)) %>%
    filter(is.na(hindfoot_sqrt) | hindfoot_sqrt < 3) %>%
    select(species_id, hindfoot_sqrt)

# number of records for each sex
surveys %>%
    group_by(sex) %>%
    tally()

# mean weight for each sex (missing weights are ignored)
surveys %>%
    group_by(sex) %>%
    summarize(mean_weight = mean(weight, na.rm = TRUE))

# same, after dropping records whose sex is missing or blank
surveys %>%
    filter(!is.na(sex), sex != "") %>%  # <- need one or the other condition
    group_by(sex) %>%
    summarize(mean_weight = mean(weight, na.rm = TRUE))

# mean weight for each combination of sex and species_id
surveys %>%
    group_by(sex, species_id) %>%
    summarize(mean_weight = mean(weight, na.rm = TRUE))

# mean weight by sex and species_id,
#   sorted by mean weight (ascending)
surveys %>%
    group_by(sex, species_id) %>%
    summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
    arrange(mean_weight)

# mean weight by sex and species_id,
#   sorted by mean weight (descending); rows with a missing sex or a
#   missing mean are dropped, and only the last few rows are shown
surveys %>%
    filter(!is.na(sex)) %>%
    group_by(sex, species_id) %>%
    summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
    arrange(desc(mean_weight)) %>%
    filter(!is.na(mean_weight)) %>%
    tail()

# challenge: number of records for each plot type
surveys %>%
    group_by(plot_type) %>%
    tally()


###############

# keep only rows that have complete data
# (no missing or blank weight, hindfoot_length, sex or species_id)
surveys_complete <- surveys %>%
    filter(!is.na(weight)) %>%
    filter(!is.na(hindfoot_length)) %>%
    filter(sex != "", !is.na(sex)) %>%
    filter(species_id != "", !is.na(species_id))

# count species: one row per species_id, with the count in column n
species_counts <- surveys_complete %>%
    group_by(species_id) %>%
    tally()

# frequent species...counts >= 10
frequent_species <- species_counts %>%
    filter(n >= 10) %>%
    select(species_id)

# filter out less-frequent species
reduced <- surveys_complete %>%
    filter(species_id %in% frequent_species$species_id)

# save the reduced data to a file
# row.names = FALSE: by default write.csv() also writes the row names as a
# first column, which comes back as an extra column "X" when the file is
# read in again with read.csv()
write.csv(reduced, "CleanData/portal_data_reduced.csv", row.names = FALSE)

## Data visualization with ggplot2
library(ggplot2)

# a first scatterplot: hindfoot length against weight
ggplot(reduced, aes(x = weight, y = hindfoot_length)) +
    geom_point()

# store the plot in an object rather than drawing it right away
# (note that x and y are the other way round here)
p <- ggplot(reduced, aes(y = weight, x = hindfoot_length)) +
    geom_point()

# build a plot in two steps: data + aesthetics first, then add a layer
p1 <- ggplot(reduced, aes(x = weight, y = hindfoot_length))
p2 <- p1 + geom_point()

# challenge 9: the scatterplot for species_id "DM" only
reduced_DM <- reduced %>%
    filter(species_id == "DM")

ggplot(reduced_DM, aes(x = weight, y = hindfoot_length)) +
    geom_point()

# or pipe the filtered data straight into ggplot()
reduced %>%
    filter(species_id == "DM") %>%
    ggplot(aes(x = weight, y = hindfoot_length)) +
    geom_point()

# set fixed aesthetics on the layer: color, size, transparency
p <- reduced %>%
    filter(species_id == "DM") %>%
    ggplot(aes(x = weight, y = hindfoot_length))
p + geom_point(color = "slateblue")
p + geom_point(color = "slateblue", size = 2)
p + geom_point(color = "slateblue", size = 0.5)
p + geom_point(color = "slateblue", alpha = 0.1)

# map a column to an aesthetic: color the points by species_id
ggplot(reduced, aes(x = weight, y = hindfoot_length)) +
    geom_point(aes(color = species_id))

# challenge 10: per-species mean weight and mean hindfoot length,
# with point size showing the number of records
summaries <- reduced %>%
    group_by(species_id) %>%
    summarize(
        mean_weight = mean(weight),
        mean_hfl = mean(hindfoot_length),
        sample_size = n()  # number of rows in the group
    )

ggplot(summaries, aes(x = mean_weight, y = mean_hfl)) +
    geom_point(aes(size = sample_size))

# geom_line to make a line plot: number of records per year
count_by_year <- reduced %>%
    group_by(year) %>%
    tally()
ggplot(count_by_year, aes(x = year, y = n)) +
    geom_line(color = "slateblue") +
    geom_point()
ggplot(count_by_year, aes(x = year, y = n)) +
    geom_point(aes(color = year)) +
    geom_line()

# challenge 11: yearly counts for species DM and DS, one line per species
dmds_counts <- reduced %>%
    filter(species_id %in% c("DM", "DS")) %>%
    group_by(year, species_id) %>%
    tally()

ggplot(dmds_counts, aes(x = year, y = n, group = species_id)) +
    geom_line()

# faceting: one panel per species
ggplot(reduced, aes(x = weight, y = hindfoot_length)) +
    geom_point() +
    facet_wrap(~ species_id)

# one panel per year, laid out as columns
ggplot(reduced, aes(x = weight, y = hindfoot_length)) +
    geom_point() +
    facet_grid(~ year)

# DM and DS before 1983, one row of panels per species
reduced %>%
    filter(year < 1983, species_id %in% c("DM", "DS")) %>%
    ggplot(aes(x = weight, y = hindfoot_length)) +
    geom_point() +
    facet_grid(species_id ~ .)

# univariate plots: histogram of weight, one panel per species
ggplot(reduced, aes(x = weight)) +
    geom_histogram() +
    facet_wrap(~ species_id)

# boxplot of weight for each species
ggplot(reduced, aes(x = species_id, y = weight)) +
    geom_boxplot()

# the same boxplot with the black-and-white theme
ggplot(reduced, aes(x = species_id, y = weight)) +
    geom_boxplot() +
    theme_bw()

# save a plot to a file: store it in an object, then hand it to ggsave()
# (the file extension determines the output format)
p <- ggplot(reduced, aes(x = species_id, y = weight)) +
    geom_boxplot() +
    theme_bw()
ggsave("~/Desktop/my_ggplot.png", p,
       height = 8, width = 10)
ggsave("~/Desktop/my_ggplot.pdf", p,
       height = 8, width = 10)