forked from udacity/pdsnd_github
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbikeshare_2.py
382 lines (309 loc) · 14.6 KB
/
bikeshare_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
import time
import calendar
import pandas as pd
import numpy as np
# Globally switch off pandas' chained-assignment check for this script
# (the option is set to None here; its default value is 'warn').
pd.options.mode.chained_assignment = None # default='warn'
# Each supported city and the CSV file that holds its trip records.
CITY_DATA = {
    'chicago': './chicago.csv',
    'new york city': './new_york_city.csv',
    'washington': './washington.csv',
}

# Menu number typed by the user -> city choice (0 stands for every city).
city_dict = dict(enumerate(['all', 'chicago', 'new york city', 'washington']))

# Menu number <-> month name, in both directions; 0 is the "all" entry and
# only January through June are offered.
element = ['all']
monthlist = [name.lower() for name in calendar.month_name[1:7]]
values = element + monthlist
keys = list(range(len(values)))
month_dict = dict(zip(keys, values))
month_dict_rev = {name: number for number, name in month_dict.items()}

# Menu number <-> weekday name, in both directions; 0 is the "all" entry and
# 1..7 run Monday..Sunday.
daylist = [name.lower() for name in list(calendar.day_name)]
values = element + daylist
keys = list(range(len(values)))
day_dict = dict(zip(keys, values))
day_dict_rev = {name: number for number, name in day_dict.items()}
def _ask_for_choice(prompt, choices):
    """Keep prompting until the user types an integer that is a key of `choices`; return it."""
    while True:
        try:
            selection = int(input(prompt))
        except ValueError:
            # Only a non-numeric entry is a "try again" case. Ctrl-C
            # (KeyboardInterrupt) and a closed stdin (EOFError) must propagate,
            # otherwise the user cannot abort and EOF would loop forever.
            print("Sorry, not a valid input. Please try again with a single digit number")
            continue
        if selection in choices:
            return selection
        print("Sorry, not a valid input. Please try again")


def get_filters():
    """
    Asks user to select a city, month, and day (by entering an integer key) to analyze.

    Each question is repeated until a valid menu number is entered. The chosen
    numbers are translated to names through the module-level city_dict,
    month_dict and day_dict mappings, and the selection is echoed back.

    Returns:
        (str) city - name of the city to analyze, or "all" to get the joint data of the cities
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('\n')
    print('Hello! Let\'s explore some US bikeshare data!\n')
    print('\nNote: Data is available for the first six months of 2017 only.\n')

    # get user input for city (all, chicago, new york city or washington)
    city_input = _ask_for_choice(
        '\nSelect one of more cities by entering a number\n0 - ALL\n1 - Chicago\n2 - New York City\n3 - Washington\n',
        city_dict,
    )
    # get user input for month (all, january, february etc.)
    month_input = _ask_for_choice(
        '\nSelect your filter for month by entering a number\n0 - ALL\n1 - January\n2 - February\n3 - March\n4 - April\n5 - May\n6 - June\n',
        month_dict,
    )
    # get user input for day of week (all, monday, tuesday, etc.)
    day_input = _ask_for_choice(
        '\nSelect your filter for day of the week by entering a number\n0 - ALL\n1 - Monday\n2 - Tuesday\n3 - Wednesday\n4 - Thursday\n5 - Friday\n6 - Saturday\n7 - Sunday\n',
        day_dict,
    )

    # translate the numeric answers to the string form the rest of the script uses
    city = city_dict[city_input]
    month = month_dict[month_input]
    day = day_dict[day_input]

    print('-'*40)
    print("\nThanks for your inputs! You have made the following selection:\n" + "\nCity: " + city.title() + "\nMonth: " + month.title() + "\nDay: " + day.title())
    print('-'*40)
    return city, month, day
def _format_birth_year(value):
    """Render one birth-year cell as text: 'Unknown' if missing, '1989' rather than '1989.0'."""
    if pd.isna(value):
        return 'Unknown'
    if isinstance(value, float) and value.is_integer():
        # a column that contains missing values is read as float; drop the ".0"
        return str(int(value))
    return str(value)


def streamline_df(raw, city, month, day, month_input, day_input):
    """
    Data cleansing and filtering based on initial user inputs will transform raw data into a more useful DataFrame.

    The caller's `raw` frame is left untouched; all work happens on a copy.

    Args:
        raw - DataFrame holding at least 'Start Time', 'End Time', 'Trip Duration',
            'Start Station', 'End Station' and 'User Type' (plus 'Gender' and
            'Birth Year' unless city is "washington")
        (str) city - city the data belongs to; "washington" gets 'Gender' and
            'Birth Year' set to 'Unknown'
        (str) month - name of the selected month (kept for interface compatibility)
        (str) day - name of the selected weekday (kept for interface compatibility)
        (int) month_input - month number to keep (1 = January), or 0 for no month filter
        (int) day_input - weekday number to keep (1 = Monday ... 7 = Sunday), or 0 for no day filter
    Returns:
        df - new DataFrame with the columns 'Start Time', 'End Time', 'Trip Duration',
            'Start Station', 'End Station', 'User Type', 'Gender', 'Birth Year',
            'month', 'day', 'hour'; rows that still contain missing values are dropped
    """
    lean_columns = ['Start Time', 'End Time', 'Trip Duration', 'Start Station', 'End Station',
                    'User Type', 'Gender', 'Birth Year', 'month', 'day', 'hour']

    # work on a private copy so the caller's frame is never modified
    data = raw.copy()
    data['Start Time'] = pd.to_datetime(data['Start Time'])
    data['End Time'] = pd.to_datetime(data['End Time'])

    # derive the time-based helper columns from the start of each trip
    started = data['Start Time'].dt
    data['hour'] = started.hour
    data['month'] = started.month
    data['day'] = started.weekday + 1  # shift so that 1 = Monday ... 7 = Sunday

    # as this particular city is missing two columns of data, adding them in order to streamline
    if city == "washington":
        data['Gender'] = 'Unknown'
        data['Birth Year'] = 'Unknown'

    # build one row mask from the requested filters (0 means "no filter")
    keep = pd.Series(True, index=data.index)
    if month_input > 0:
        keep &= data['month'] == month_input
    if day_input > 0:
        keep &= data['day'] == day_input

    # explicit copy: the assignments below then never write into a slice
    lean = data.loc[keep, lean_columns].copy()
    lean['Gender'] = lean['Gender'].fillna('Unknown')
    lean['Birth Year'] = lean['Birth Year'].map(_format_birth_year)

    df = lean.dropna(axis=0)
    return df
def display_data(df):
    """
    Repeatedly offers to print more rows of `df`, five at a time.

    The first five rows are taken as already printed. Each 'yes' answer prints
    the frame from the top with five more rows than before; any other answer
    stops. The question is no longer asked once every row has been shown.

    Args:
        df - DataFrame whose rows are displayed
    """
    step = 5
    shown = step  # rows already on screen before the first question
    while shown < len(df):
        # option for the user to see 5 more rows of data
        answer = input('\nWould you like to see 5 more rows? Enter yes or no.\n')
        if answer.lower() != 'yes':
            break
        shown += step
        print(df.iloc[:shown])
def view_first_rows_option(df):
    """
    Asks whether the first rows of the selected data should be shown and, if so, prints them.

    'yes' prints the head of `df` as is; 'yess' prints the head of `df` sorted
    ascending by its first column. In both cases display_data is then called
    with the frame that was shown. Any other answer is taken as a no.
    """
    answer = input('\nWould you like to see the first 5 rows of data in addition to the stats? Enter yes or no. Or enter yess to also sort by Start Date\n').lower()

    if answer == 'yes':
        print('\nFirst rows of data:\n')
        rows = df
    elif answer == 'yess':
        print('\nFirst rows of data sorted by Start Date:\n')
        rows = df.sort_values(df.columns[0], ascending = True)
    else:
        print('\nI will take that as a no.\n')
        return

    print(rows.head())
    display_data(rows)
def load_data(city, month, day):
    """
    Loads data based on city selection and filters by month and day if applicable.

    After loading, the user is asked (via view_first_rows_option) whether the
    first rows should be displayed.

    Args:
        (str) city - name of the city to analyze, or 'all' to get the joint data
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city (or all cities) data filtered by month and day
    Raises:
        AssertionError - if no rows are left after filtering
    """
    # utilizing globally defined mapping between user input and month/day of week choice
    month_input = month_dict_rev[month]
    day_input = day_dict_rev[day]

    # normalise once so the 'all' test and the CITY_DATA lookup agree on casing
    city = city.lower()

    if city != 'all':
        # a single city is selected: load its raw data and streamline it
        raw = pd.read_csv(CITY_DATA[city])
        df = streamline_df(raw, city, month, day, month_input, day_input)
    else:
        # ALL cities have been selected: streamline each one, then stack them
        names = [city_dict[i] for i in range(1, 4)]
        frames = [
            streamline_df(pd.read_csv(CITY_DATA[name]), name, month, day, month_input, day_input)
            for name in names
        ]
        # concatenating to one dataframe and adding some keys for each city
        df = pd.concat(frames, keys=['chicago', 'new_york_city', 'washington'])

    # making sure that our dataframe is not empty; raised explicitly (same
    # exception type as before) so the check also runs under `python -O`
    if len(df) == 0:
        raise AssertionError("No rides match the selected city, month and day.")

    # asking for user input on whether to display the first few rows, and, if so,
    # whether it should be sorted by start date
    view_first_rows_option(df)
    return df
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Prints the total number of rides in `df`, followed by the most common
    month, day of week and start hour, each with its number of occurrences and
    its share of all rides in percent (one decimal place).
    """
    def most_frequent(column, total):
        # most common value of the column, its count, and its share as "12.3"-style text
        top_value = df[column].mode()[0]
        occurrences = df[column].value_counts().max()
        share = "{0:.1f}".format(occurrences / total * 100)
        return top_value, occurrences, share

    frequency_note = " - with {} occurrences ({} percent)"

    print('-'*40)
    print('\nCalculating The Most Frequent Times of Travel...\n')
    began = time.time()

    # total rides, the denominator for every relative frequency below
    ride_total = len(df.index)
    print("Total rides based on your initial selection: {}\n".format(ride_total))

    # most common month
    month_number, month_hits, month_share = most_frequent('month', ride_total)
    month_name = month_dict[month_number].title()
    print('Most Common Month: ' + month_name + frequency_note.format(month_hits, month_share))

    # most common day of week
    day_number, day_hits, day_share = most_frequent('day', ride_total)
    day_name = day_dict[day_number]
    print('\nMost Common Day of the Week: ' + day_name.title() + frequency_note.format(day_hits, day_share))

    # most common start hour, shown as a one-hour window that wraps at midnight
    hour_from, hour_hits, hour_share = most_frequent('hour', ride_total)
    hour_to = (hour_from + 1) % 24
    print('\nMost Common Start Hour: {}-{}'.format(hour_from, hour_to) + frequency_note.format(hour_hits, hour_share))

    print("\nThis took %s seconds." % (time.time() - began))
    print('-'*40)
def station_stats(df):
    """
    Displays statistics on the most popular stations and trip.

    Prints the most common start station, the most common end station and the
    most common start/end combination. `df` is only read: the combined trip
    label is built as a local Series instead of being added to the frame.

    Args:
        df - DataFrame with string columns 'Start Station' and 'End Station'
    """
    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    # display most commonly used start station
    popular_start_st = df['Start Station'].mode()[0]
    print("\nThe most popular start station is:\n" + popular_start_st)

    # display most commonly used end station
    popular_end_st = df['End Station'].mode()[0]
    print("\nThe most popular end station is:\n" + popular_end_st)

    # display most frequent combination of start station and end station trip;
    # kept local so the caller's frame does not gain a 'combined' column
    trips = df['Start Station'] + " - " + df['End Station']
    popular_comb_st = trips.mode()[0]
    print("\nThe most popular trip is:\n" + popular_comb_st)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def seconds_to_sentence(seconds):
    """
    Takes a number of seconds and translates it to a natural language string
    that could contain minutes, hours, days and years.

    The sentence starts at the largest unit that fits and then lists every
    smaller unit, including zeros (3600 -> "1 hours 0 minutes 0 seconds").
    A year counts as 365 days. Fractions of a second are dropped in every
    case, including values below one minute.

    Args:
        seconds - duration in seconds (int or float)
    Returns:
        (str) the duration spelled out in years/days/hours/minutes/seconds
    """
    units = (
        ("years", 365*24*60*60),
        ("days", 24*60*60),
        ("hours", 60*60),
        ("minutes", 60),
    )

    try:
        remaining = int(seconds)
    except (ValueError, OverflowError):
        # NaN and infinity cannot be split into units; report the raw value
        return "{} seconds".format(seconds)

    parts = []
    for label, size in units:
        # once a larger unit has been emitted, every smaller one is listed too
        if parts or remaining >= size:
            count, remaining = divmod(remaining, size)
            parts.append("{} {}".format(count, label))
    parts.append("{} seconds".format(remaining))
    return " ".join(parts)
def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    Prints the sum, mean and median of the 'Trip Duration' column, each
    rendered through seconds_to_sentence, followed by the elapsed time.
    """
    print('\nCalculating Trip Duration...\n')
    began = time.time()

    durations = df['Trip Duration']
    # label to print, paired with the aggregate that produces its value
    summaries = (
        ("Total travel time: ", durations.sum),
        ("\nMean travel time: ", durations.mean),
        ("\nMedian travel time: ", durations.median),
    )
    for label, aggregate in summaries:
        print(label + seconds_to_sentence(aggregate()))

    print("\nThis took %s seconds.." % (time.time() - began))
    print('-'*40)
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    Prints the value counts of 'User Type' and of 'Gender', then the most
    common entry of 'Birth Year'. When the most common entry is 'Unknown' and
    other values exist, the runner-up is reported instead.
    """
    print('\nCalculating User Stats...\n')
    began = time.time()

    # one count table per categorical column, each under its own heading
    tables = (
        ('Counts of user type:\n', 'User Type'),
        ('\nCounts of gender:\n', 'Gender'),
    )
    for heading, column in tables:
        counts = df[column].value_counts().to_frame()
        print(heading)
        print(counts)

    # birth years ordered from most to least frequent
    ranked_years = df['Birth Year'].value_counts().index.tolist()
    top_year = ranked_years[0]
    # skip 'Unknown' when a real year is available; keep it when it is all
    # there is (the washington case)
    if top_year == 'Unknown' and len(ranked_years) > 1:
        top_year = ranked_years[1]
    print('\nMost common birth year among customers: ' + str(top_year))

    print("\nThis took %s seconds." % (time.time() - began))
    print('-'*40)
def main():
    """
    Runs the interactive session: ask for filters, load the data, print every
    statistics report, then repeat for as long as the user answers 'yes'.
    """
    again = 'yes'
    while again == 'yes':
        city, month, day = get_filters()
        df = load_data(city, month, day)

        # the reports run in a fixed order on the same frame
        for report in (time_stats, station_stats, trip_duration_stats, user_stats):
            report(df)

        # option for the user to restart
        again = input('\nWould you like to restart? Enter yes or no.\n').lower()


if __name__ == "__main__":
    main()