John Karlen
4/25/17
# install.packages('nycflights13')
# install.packages('lubridate')
# install.packages('dplyr')
library(nycflights13)
library(lubridate)
library(dplyr)
year, month, day
# Each parser is named after the order of the components in its input string.
# ymd(): year, month, day -- this input has no separators at all.
ymd("20100604")
[1] "2010-06-04"
# mdy(): month, day, year, here separated by dashes.
mdy("06-04-2011")
[1] "2011-06-04"
# dmy(): day, month, year, here separated by slashes; the printed result is
# the same date as the mdy() call above.
dmy("04/06/2011")
[1] "2011-06-04"
year, month, day, hour, minute, second, datetime
ymd_hms("2015-09-09 14:00:00")
[1] "2015-09-09 14:00:00 UTC"
# Parse an ISO 8601 timestamp (with the "T" separator and trailing "Z") using
# the explicit ymd_hms() parser. When this transcript was rendered,
# as_datetime() on the same string printed 04:00:00, i.e. the 16:37:00 time of
# day from the input was lost; ymd_hms() keeps it.
ymd_hms("2015-09-03T16:37:00Z")
[1] "2015-09-03 16:37:00 UTC"
# Store a parsed timestamp, then pull individual components back out of it.
johns_bday <- ymd_hms("1991-01-20T00:20:00Z")
# date() keeps only the calendar date of the timestamp.
date(johns_bday)
[1] "1991-01-20"
# hour() returns the hour component (0 here, because the time is 00:20:00).
hour(johns_bday)
[1] 0
Take a look at the flights data set
# Print the column names of flights on a single line, separated by spaces.
cat(colnames(flights))
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour
# Add a Date column and a date-time column built from the numeric
# year / month / day / hour / minute columns of flights.
# make_date() and make_datetime() assemble the values directly from the
# numbers, so nothing has to be pasted into a string and parsed again;
# make_datetime() uses UTC by default, matching what ymd_hm() returns.
flights <- flights %>%
  mutate(
    date = make_date(year, month, day),
    date_time = make_datetime(year, month, day, hour, minute)
  )
# Show the new date-time column on its own.
flights %>%
  select(date_time)
# A tibble: 336,776 × 1
date_time
<dttm>
1 2013-01-01 05:15:00
2 2013-01-01 05:29:00
3 2013-01-01 05:40:00
4 2013-01-01 05:45:00
5 2013-01-01 06:00:00
6 2013-01-01 05:58:00
7 2013-01-01 06:00:00
8 2013-01-01 06:00:00
9 2013-01-01 06:00:00
10 2013-01-01 06:00:00
# ... with 336,766 more rows
# Keep only the date column, sort it ascending and show the first six values.
flights %>%
  select(date) %>%
  arrange(date) %>%
  head(n = 6)
# A tibble: 6 × 1
date
<date>
1 2013-01-01
2 2013-01-01
3 2013-01-01
4 2013-01-01
5 2013-01-01
6 2013-01-01
What was the time of the first flight in 2013?
# Sort the date-times ascending; the first row is the earliest flight of the
# year.
flights %>%
  select(date_time) %>%
  arrange(date_time)
# A tibble: 336,776 × 1
date_time
<dttm>
1 2013-01-01 05:15:00
2 2013-01-01 05:29:00
3 2013-01-01 05:40:00
4 2013-01-01 05:45:00
5 2013-01-01 05:58:00
6 2013-01-01 05:59:00
7 2013-01-01 06:00:00
8 2013-01-01 06:00:00
9 2013-01-01 06:00:00
10 2013-01-01 06:00:00
# ... with 336,766 more rows
What day of the week was Dana born on?
# Dana's birthday as a Date; the accessor functions below pull pieces out of it.
danas_bday <- ymd("1992-06-15")
# month() gives the month as a number.
month(danas_bday)
[1] 6
# wday() gives the day of the week as a number; with the level order printed
# further down (Sun first), 2 is Monday.
wday(danas_bday)
[1] 2
# label = TRUE returns the day as an ordered factor instead of a number.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
wday(danas_bday, label = TRUE)
[1] Mon
Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat
Aggregator functions!
# First row of every month once the table is sorted by date.
# Grouping on the expression month(date_time) adds a helper column of that
# name; the final select() keeps only date_time and tailnum.
flights %>%
  arrange(date) %>%
  group_by(month(date_time)) %>%
  filter(row_number() == 1L) %>%
  ungroup() %>%
  select(date_time, tailnum)
# A tibble: 12 × 2
date_time tailnum
<dttm> <chr>
1 2013-01-01 05:15:00 N14228
2 2013-02-01 05:00:00 N197UW
3 2013-03-01 21:59:00 N706JB
4 2013-04-01 05:00:00 N566UW
5 2013-05-01 16:55:00 N628VA
6 2013-06-01 23:59:00 N618JB
7 2013-07-01 20:29:00 N653JB
8 2013-08-01 21:30:00 N618JB
9 2013-09-01 23:59:00 N663JB
10 2013-10-01 05:00:00 N538UW
11 2013-11-01 23:59:00 N568JB
12 2013-12-01 23:59:00 N715JB
What was the tail number of the 20th flight in September?
# Tail number of the 20th September flight, in date order.
# Restricting to September first means no grouping is needed and the 20th row
# of the other eleven months is never computed just to be thrown away.
# NOTE(review): arrange(date) leaves flights on the same date in the table's
# existing row order, so which flight counts as "20th" depends on that order --
# confirm whether ordering by date_time was intended.
flights %>%
  filter(month(date) == 9) %>%
  arrange(date) %>%
  filter(row_number() == 20) %>%
  select(date_time, tailnum)
# A tibble: 1 × 2
date_time tailnum
<dttm> <chr>
1 2013-09-01 06:10:00 N320US
Get all flights of a certain plane, “N14228”
# Every flight of the plane with tail number N14228, in chronological order.
flights %>%
  filter(tailnum == "N14228") %>%
  select(date_time, tailnum) %>%
  arrange(date_time)
# A tibble: 111 × 2
date_time tailnum
<dttm> <chr>
1 2013-01-01 05:15:00 N14228
2 2013-01-08 14:40:00 N14228
3 2013-01-09 07:00:00 N14228
4 2013-01-09 11:44:00 N14228
5 2013-01-13 08:24:00 N14228
6 2013-01-16 17:30:00 N14228
7 2013-01-22 18:08:00 N14228
8 2013-01-23 10:56:00 N14228
9 2013-01-23 15:29:00 N14228
10 2013-01-25 07:20:00 N14228
# ... with 101 more rows
What day of the week was N0EGMQ's 100th flight?
# Day of the week of the 100th flight (in date_time order) of plane N0EGMQ.
# The mutate() expression is left unnamed on purpose, so the new column is
# called `wday(date_time)`; it holds the numeric day of the week.
flights %>%
  filter(tailnum == "N0EGMQ") %>%
  select(date_time) %>%
  arrange(date_time) %>%
  filter(row_number() == 100) %>%
  mutate(wday(date_time))
# A tibble: 1 × 2
date_time `wday(date_time)`
<dttm> <dbl>
1 2013-04-04 20:55:00 5
What if we want the date today?
# today() returns the current date.
# The result is stored as todays_date rather than date so the variable does not
# shadow the date() function or share a name with the date column of flights.
todays_date <- today()
todays_date
[1] "2017-06-21"
How old is John?
# Redefine the birthday as a plain Date (no time of day).
johns_bday <- ymd("1991-01-20")
# Subtracting two dates gives a time difference, printed in days.
today() - johns_bday
Time difference of 9649 days
How old is John, as a duration?
johns_bday <- ymd("1991-01-20")
# as.duration() turns the date difference into a lubridate duration, which
# prints as a number of seconds with an approximate size in years.
johns_duration <- as.duration(today() - johns_bday)
johns_duration
[1] "833673600s (~26.42 years)"
How old is John… in weeks?
johns_bday <- ymd("1991-01-20")
johns_duration <- as.duration(today() - johns_bday)
# Dividing by a one-week duration expresses the age as a number of weeks.
johns_duration / dweeks(1)
[1] 1378.429
You can combine lubridate with R's plotting functionality
# Histogram of flights per day of the week. The breaks sit halfway between the
# integers so that each weekday number 1-7 falls into its own bar.
hist(wday(flights$date), breaks = seq(from = 0.5, to = 7.5, by = 1))
You can combine lubridate with ggplot too
library(ggplot2)
# Line chart of the mean flight distance for every date in the data.
flights %>%
  group_by(date) %>%
  summarise(avg_dist = mean(distance)) %>%
  ggplot(aes(x = date, y = avg_dist)) +
  geom_line()
Let's look at one month
# Same daily mean-distance line chart, restricted to July (month 7).
flights %>%
  filter(month(date) == 7) %>%
  group_by(date) %>%
  summarise(avg_dist = mean(distance)) %>%
  ggplot(aes(x = date, y = avg_dist)) +
  geom_line()
Show me a plot of average flight distance vs day of the week (1-7)
# Hint: wday() returns the day of the week as a number from 1 to 7, which can
# be used directly as a grouping value.
wday(johns_bday)
[1] 1
Let's look at how distance depends on weekday
# Mean flight distance for each day of the week, drawn as one point per day.
# group_by() creates the weekday column and groups on it in a single step.
flights %>%
  group_by(weekday = wday(date)) %>%
  summarise(avg_dist = mean(distance)) %>%
  ggplot(aes(x = weekday, y = avg_dist)) +
  geom_point()
Give me the smallest non-zero time between consecutive flights of the same plane
# Gaps between consecutive flights of the same plane, smallest non-zero gap
# first.
#
# Flights with a missing tail number are dropped up front: NA is not one
# aircraft, so pairing those rows up compares unrelated planes. Because of
# this, the <NA> rows and the total row count in the rendered output that
# follows no longer apply to this version.
flights %>%
  filter(!is.na(tailnum)) %>%
  arrange(tailnum, date_time) %>%
  group_by(tailnum) %>%
  # lead() within the group pairs each flight with the same plane's next
  # flight; a plane's last flight gets NA instead of the next plane's first.
  mutate(next_time = lead(date_time)) %>%
  ungroup() %>%
  select(tailnum, date_time, next_time) %>%
  # Fix the unit so the whole column is in seconds regardless of the data.
  mutate(
    diff_between_flights = difftime(next_time, date_time, units = "secs")
  ) %>%
  # Drops each plane's last flight (NA gap) and flights sharing a date_time
  # (zero gap).
  filter(diff_between_flights > 0) %>%
  arrange(diff_between_flights)
# A tibble: 332,323 × 4
tailnum date_time next_time diff_between_flights
<chr> <dttm> <dttm> <time>
1 N11164 2013-05-09 19:30:00 2013-05-09 19:31:00 60 secs
2 N11551 2013-05-11 11:59:00 2013-05-11 12:00:00 60 secs
3 N13964 2013-12-29 17:34:00 2013-12-29 17:35:00 60 secs
4 N309JB 2013-02-09 22:49:00 2013-02-09 22:50:00 60 secs
5 N713EV 2013-11-27 10:56:00 2013-11-27 10:57:00 60 secs
6 N750EV 2013-09-02 15:59:00 2013-09-02 16:00:00 60 secs
7 <NA> 2013-02-08 13:59:00 2013-02-08 14:00:00 60 secs
8 <NA> 2013-02-08 14:45:00 2013-02-08 14:46:00 60 secs
9 <NA> 2013-02-08 14:55:00 2013-02-08 14:56:00 60 secs
10 <NA> 2013-02-08 15:05:00 2013-02-08 15:06:00 60 secs
# ... with 332,313 more rows