-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path01_combine.R
51 lines (42 loc) · 1.53 KB
/
01_combine.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Combine CSVs and calculate daily totals
# Data files downloaded from Los Angeles Metro Bike Share
# https://bikeshare.metro.net/about/data/
library(tidyverse)
library(lubridate)
# Get the paths of the raw CSV files.
# `pattern` is a regular expression, so it is anchored to the ".csv" extension;
# a bare "csv" would also match names such as "csv_notes.txt" or
# "trips.csv.bak". `full.names = TRUE` prepends the directory, giving the same
# "./data/raw/<file>" paths without a separate paste0() step.
filelist <- list.files("./data/raw", pattern = "\\.csv$", full.names = TRUE)
# Read every file in `filelist`, stack the results, and keep only the trip
# start timestamp. The listed columns are read as character (presumably so
# files whose guessed column types differ can still be row-bound -- confirm).
# `start` is the mdy_hm parse of `start_time`; strings in any other layout
# come back as NA and are handled in a later step.
df <- map(
  filelist,
  read_csv,
  col_types = cols(
    start_time = col_character(),
    end_time = col_character(),
    bike_id = col_character(),
    end_lat = col_character(),
    end_lon = col_character()
  )
) %>%
  bind_rows() %>%
  transmute(start_time, start = mdy_hm(start_time))
# At some point the start_time layout switched from mdy_hm to ymd_hms.
# Take the rows the mdy_hm parse left as NA and re-parse them as ymd_hms,
# keeping only the resulting `start` column.
df_no_secs <- df %>%
  filter(is.na(start)) %>%
  transmute(start = ymd_hms(start_time))
# Combine both sets: rows parsed as mdy_hm followed by rows re-parsed as
# ymd_hms. A timestamp that matched neither layout is still NA at this point;
# drop it here so it cannot become an NA "date" row in anything derived from
# `starts`. The single NA filter after binding also removes the mdy_hm
# failures from `df`, so row order is the same as filtering before the bind.
starts <- df %>%
  select(start) %>%
  bind_rows(df_no_secs) %>%
  filter(!is.na(start))
# Calculate daily rides: one row per calendar date of `start`, with the
# number of trips that began on that date in `rides`.
daily_rides <- count(starts, date = as.Date(start), name = "rides")
# Visualize: line chart of rides per day over time.
ggplot(daily_rides, aes(date, rides)) +
  geom_line()
# The plot shows three extreme spikes, one per year around late September to
# early October. Their cause is unknown, so for now they are removed by keeping
# only days with fewer than 1485 rides (cutoff presumably read off the plot --
# confirm), and the remaining daily totals are written to a CSV file.
daily_rides <- filter(daily_rides, rides < 1485)
write_csv(daily_rides, "./data/metrobike_daily_rides.csv")