Analyze
Now all the required information is in one place and ready for
exploration.
Key tasks
- Aggregate your data so it’s useful and accessible.
- Organise and format your data.
- Perform calculations.
- Identify trends and relationships.
Deliverable
Checking the unique types of ride available
unique(all_rides$rideable_type)
## [1] "classic_bike" "docked_bike" "electric_bike"
Checking the unique values in member_casual
unique(all_rides$member_casual)
## [1] "member" "casual"
Verifying if there are any NA in dataset
sum(is.na(all_rides))
## [1] 0
Checking if there are any duplicates
# NOTE(review): the distinct() call below is commented out, so no duplicate
# check is actually performed here; dim() only reports rows and columns.
#distinct(all_rides)
dim(all_rides)
## [1] 5853088 22
# Persist the combined rides to disk, then load the same file back as `df`.
rides_csv <- '/home/arjit/Projects/Case Study Cyclistic/Data/all_rides_modified.csv'
write_csv(all_rides, rides_csv)
df <- read_csv(rides_csv)
## Rows: 5853088 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): ride_id, rideable_type, start_station_name, start_station_id, end...
## dbl (7): start_lat, start_lng, end_lat, end_lng, minutes, year, hour
## dttm (2): started_at, ended_at
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 22
## ride_id rideable_type started_at ended_at
## <chr> <chr> <dttm> <dttm>
## 1 EC2DE40644C6B0F4 classic_bike 2022-05-23 23:06:58 2022-05-23 23:40:19
## 2 1C31AD03897EE385 classic_bike 2022-05-11 08:53:28 2022-05-11 09:31:22
## 3 1542FBEC830415CF classic_bike 2022-05-26 18:36:28 2022-05-26 18:58:18
## 4 6FF59852924528F8 classic_bike 2022-05-10 07:30:07 2022-05-10 07:38:49
## 5 483C52CAAE12E3AC classic_bike 2022-05-10 17:31:56 2022-05-10 17:36:57
## 6 C0A3AA5A614DCE01 classic_bike 2022-05-04 14:48:55 2022-05-04 14:56:04
## # ℹ 18 more variables: start_station_name <chr>, start_station_id <chr>,
## # end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## # start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>,
## # minutes <dbl>, date <date>, month <chr>, day <chr>, year <dbl>,
## # day_of_week <chr>, hour <dbl>, season <chr>, time_of_day <chr>
dim(df)
## [1] 5853088 22
Checking the structure of dataset
str(df)
## spc_tbl_ [5,853,088 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5853088] "EC2DE40644C6B0F4" "1C31AD03897EE385" "1542FBEC830415CF" "6FF59852924528F8" ...
## $ rideable_type : chr [1:5853088] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:5853088], format: "2022-05-23 23:06:58" "2022-05-11 08:53:28" ...
## $ ended_at : POSIXct[1:5853088], format: "2022-05-23 23:40:19" "2022-05-11 09:31:22" ...
## $ start_station_name: chr [1:5853088] "Wabash Ave & Grand Ave" "DuSable Lake Shore Dr & Monroe St" "Clinton St & Madison St" "Clinton St & Madison St" ...
## $ start_station_id : chr [1:5853088] "TA1307000117" "13300" "TA1305000032" "TA1305000032" ...
## $ end_station_name : chr [1:5853088] "Halsted St & Roscoe St" "Field Blvd & South Water St" "Wood St & Milwaukee Ave" "Clark St & Randolph St" ...
## $ end_station_id : chr [1:5853088] "TA1309000025" "15534" "13221" "TA1305000030" ...
## $ start_lat : num [1:5853088] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:5853088] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:5853088] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:5853088] -87.6 -87.6 -87.7 -87.6 -87.7 ...
## $ member_casual : chr [1:5853088] "member" "member" "member" "member" ...
## $ minutes : num [1:5853088] 33.4 37.9 21.8 8.7 5 7.2 8.9 12.2 16.7 1.5 ...
## $ date : Date[1:5853088], format: "2022-05-23" "2022-05-11" ...
## $ month : chr [1:5853088] "May" "May" "May" "May" ...
## $ day : chr [1:5853088] "23" "11" "26" "10" ...
## $ year : num [1:5853088] 2022 2022 2022 2022 2022 ...
## $ day_of_week : chr [1:5853088] "Monday" "Wednesday" "Thursday" "Tuesday" ...
## $ hour : num [1:5853088] 23 8 18 7 17 14 12 19 17 7 ...
## $ season : chr [1:5853088] "Summer" "Summer" "Summer" "Summer" ...
## $ time_of_day : chr [1:5853088] "Evening" "Morning" "Evening" "Morning" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character(),
## .. minutes = col_double(),
## .. date = col_date(format = ""),
## .. month = col_character(),
## .. day = col_character(),
## .. year = col_double(),
## .. day_of_week = col_character(),
## .. hour = col_double(),
## .. season = col_character(),
## .. time_of_day = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Analyzing the summary of the dataset
summary(df)
## ride_id rideable_type started_at
## Length:5853088 Length:5853088 Min. :2022-05-01 00:00:06
## Class :character Class :character 1st Qu.:2022-07-03 11:25:11
## Mode :character Mode :character Median :2022-08-28 13:00:14
## Mean :2022-09-19 14:02:11
## 3rd Qu.:2022-11-08 07:01:29
## Max. :2023-04-30 23:59:05
## ended_at start_station_name start_station_id
## Min. :2022-05-01 00:05:17 Length:5853088 Length:5853088
## 1st Qu.:2022-07-03 11:48:44 Class :character Class :character
## Median :2022-08-28 13:20:19 Mode :character Mode :character
## Mean :2022-09-19 14:18:00
## 3rd Qu.:2022-11-08 07:12:01
## Max. :2023-05-01 08:06:56
## end_station_name end_station_id start_lat start_lng
## Length:5853088 Length:5853088 Min. :41.64 Min. :-87.84
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.07 Max. :-87.52
## end_lat end_lng member_casual minutes
## Min. : 0.00 Min. :-88.14 Length:5853088 Min. :-10353.40
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character 1st Qu.: 5.70
## Median :41.90 Median :-87.64 Mode :character Median : 10.00
## Mean :41.90 Mean :-87.65 Mean : 15.83
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.: 17.90
## Max. :42.37 Max. : 0.00 Max. : 32035.40
## date month day year
## Min. :2022-05-01 Length:5853088 Length:5853088 Min. :2022
## 1st Qu.:2022-07-03 Class :character Class :character 1st Qu.:2022
## Median :2022-08-28 Mode :character Mode :character Median :2022
## Mean :2022-09-18 Mean :2022
## 3rd Qu.:2022-11-08 3rd Qu.:2022
## Max. :2023-04-30 Max. :2023
## day_of_week hour season time_of_day
## Length:5853088 Min. : 0.00 Length:5853088 Length:5853088
## Class :character 1st Qu.:11.00 Class :character Class :character
## Mode :character Median :15.00 Mode :character Mode :character
## Mean :14.21
## 3rd Qu.:18.00
## Max. :23.00
Calculating the ride length (point-to-point distance between the start and end coordinates) in kilometers
# Distance between each ride's start and end coordinates (lng, lat pairs),
# computed with distGeo() and divided by 1000 to express it in kilometers.
start_points <- cbind(df$start_lng, df$start_lat)
end_points <- cbind(df$end_lng, df$end_lat)
df$ride_length <- distGeo(start_points, end_points) / 1000
Removing the rows with ride length as negative or
zero
# Keep only rides whose start-to-end distance is strictly positive.
# NOTE(review): rides ending at (0, 0) still pass this filter; summary(df)
# shows end_lat / end_lng values of 0, which yield very large distances.
df <- df[df$ride_length > 0, ]
Removing the NA in the dataset
sum(is.na(df))
## [1] 3020593
# Drop every row that contains an NA in any column.
df <- na.omit(df)
Doing descriptive analysis, studying:
avg_ride_length -> straight average(total ride length / total
rides)
median_length -> midpoint number of ride length
max_ride_length -> longest ride
min_ride_length -> shortest ride
# Overall descriptive statistics of ride_length: mean, median, max and min.
summarise(
  df,
  avg_ride_length = mean(ride_length),
  median_length = median(ride_length),
  max_ride_length = max(ride_length),
  min_ride_length = min(ride_length)
)
## # A tibble: 1 × 4
## avg_ride_length median_length max_ride_length min_ride_length
## <dbl> <dbl> <dbl> <dbl>
## 1 2.18 1.61 9817. 0.0000185
Compare members and casual riders
- Member vs casual riders difference depending on the total rides
taken
# Number of rides per rider type and its share (percent) of all rows in df.
df %>%
  group_by(member_casual) %>%
  summarise(
    ride_count = n(),
    ride_percentage = n() / nrow(df) * 100
  )
## # A tibble: 2 × 3
## member_casual ride_count ride_percentage
## <chr> <int> <dbl>
## 1 casual 1668876 38.5
## 2 member 2663765 61.5
# Bar chart of the number of rides for each rider type.
ggplot(df) +
  geom_bar(aes(x = member_casual, fill = member_casual)) +
  labs(
    title = "Casuals vs Members distribution",
    x = "Casuals vs Members",
    y = "Number Of Rides"
  )
The Casuals vs Members distribution chart shows that members account for
~61.5% and casual riders for ~38.5% of the rides in the dataset. So over the
twelve months covered (May 2022 to April 2023), members' share of rides was
about 23 percentage points higher than that of casual riders.
Comparison between Member Casual riders depending on ride
length
# Descriptive statistics of ride_length, computed separately per rider type.
df %>%
  group_by(member_casual) %>%
  summarise(
    avg_ride_length = mean(ride_length),
    median_length = median(ride_length),
    max_ride_length = max(ride_length),
    min_ride_length = min(ride_length)
  )
## # A tibble: 2 × 5
## member_casual avg_ride_length median_length max_ride_length min_ride_length
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 casual 2.29 1.74 9817. 0.0000185
## 2 member 2.12 1.53 9817. 0.0000202
From the above table we can conclude that casual riders covered slightly
longer distances per ride than members, as the average ride length of
members (2.12 km) is lower than that of casual riders (2.29 km). Note that
ride_length here is the distance between the start and end points, not the trip duration.
Ordering the days of the week (Sunday to Saturday)
# Turn day_of_week into an ordered factor running from Sunday to Saturday so
# that tables and plots list the days in calendar order.
weekday_levels <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
df$day_of_week <- factor(df$day_of_week, levels = weekday_levels, ordered = TRUE)
# Ride count and mean ride_length for every rider type / weekday combination.
df %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    number_of_rides = n(),
    avg_ride_length = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, day_of_week)
## # A tibble: 14 × 4
## member_casual day_of_week number_of_rides avg_ride_length
## <chr> <ord> <int> <dbl>
## 1 casual Sunday 277155 2.39
## 2 casual Monday 194373 2.20
## 3 casual Tuesday 191531 2.18
## 4 casual Wednesday 200016 2.28
## 5 casual Thursday 223623 2.22
## 6 casual Friday 245092 2.24
## 7 casual Saturday 337086 2.42
## 8 member Sunday 299215 2.17
## 9 member Monday 374705 2.03
## 10 member Tuesday 422636 2.07
## 11 member Wednesday 429342 2.22
## 12 member Thursday 428473 2.09
## 13 member Friday 374112 2.05
## 14 member Saturday 335282 2.21
Analyzing the total rides by members and casual riders in a
particular day in a week
# Dodged bars: number of rides per weekday, one bar per rider type.
plain_number <- function(x) format(x, scientific = FALSE)
df %>%
  group_by(member_casual, day_of_week) %>%
  summarise(number_of_rides = n(), .groups = "drop") %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  # Print axis values as plain numbers rather than in scientific notation.
  scale_y_continuous(labels = plain_number) +
  labs(title = "Total rides by Members and Casual riders Vs. Day of the week")
From the above graph, we can conclude that members took consistent
rides throughout the week with least on Sunday. For the casual riders,
the most rides were taken on weekends.
Analyzing the average ride length of members and casual riders on
a particular day of the week
# Dodged bars: mean ride_length per weekday, one bar per rider type.
# ride_length is the start-to-end distance in kilometers (not a duration), so
# the title and y-axis label say "length" / "Km" rather than "time".
df %>%
  group_by(member_casual, day_of_week) %>%
  summarise(average_ride_length = mean(ride_length), .groups = "drop") %>%
  ggplot(aes(x = day_of_week, y = average_ride_length, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  labs(
    title = "Average ride length by Members and Casual riders Vs. Day of the week",
    y = "Average ride length In Km"
  )
The average ride length for members is comparatively less than that
of casual riders. It can also be seen that, for casual riders, the weekend
average ride length is higher, as is the total number of rides, so both of
these facts can be correlated for casual riders. For members, the average
ride length is about the same throughout the week.
Grouping the rides according to member casual and months of
a year
# Ride count and mean ride_length for every rider type / month combination.
df %>%
  group_by(member_casual, month) %>%
  summarise(
    number_of_rides = n(),
    average_ride_length = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, month)
## # A tibble: 24 × 4
## member_casual month number_of_rides average_ride_length
## <chr> <chr> <int> <dbl>
## 1 casual April 102551 2.23
## 2 casual August 252507 2.33
## 3 casual December 30197 1.80
## 4 casual February 30825 1.87
## 5 casual January 28189 1.73
## 6 casual July 289054 2.37
## 7 casual June 269373 2.39
## 8 casual March 44359 1.89
## 9 casual May 200562 2.43
## 10 casual November 69553 2.27
## # ℹ 14 more rows
Analyzing the total rides by members and casual riders in a
month
# Turn month into an ordered factor in calendar order; base R's `month.name`
# constant holds the twelve English month names, "January" to "December".
df$month <- factor(df$month, levels = month.name, ordered = TRUE)
# Dodged bars: number of rides per month, one bar per rider type.
df %>%
  group_by(member_casual, month) %>%
  summarise(number_of_rides = n(), .groups = "drop") %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  # Print axis values as plain numbers rather than in scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Total rides by Members and Casual riders Vs. Month",
    x = "Month",
    y = "Number Of Rides"
  ) +
  theme(axis.text.x = element_text(angle = 45))
June, July, August and September are the busiest months of the year for
both members and casual riders. Probably because of winter, there is a
significant drop in total rides in November, December, January and
February for both types of customers. We can also see that members' total
rides are higher than casual riders' throughout the year, except in June,
July and August.
Analyzing the ride length by members casual riders in a
month
# Dodged bars: mean ride_length per month, one bar per rider type.
df %>%
  group_by(member_casual, month) %>%
  summarise(average_ride_length = mean(ride_length), .groups = "drop") %>%
  ggplot(aes(x = month, y = average_ride_length, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  theme(axis.text.x = element_text(angle = 30)) +
  labs(title = "Average ride length by Members and Casual riders Vs. Month")
Average ride length of members is about the same throughout the year.
While casual riders average ride length is greater than the members
throughout the year.
Analyzing the average distance traveled by member casual
rider
# One bar per rider type showing the mean ride_length; the legend is hidden
# because the x axis already names the rider type.
df %>%
  group_by(member_casual) %>%
  summarise(avg_ride_length = mean(ride_length)) %>%
  ggplot(aes(x = member_casual, y = avg_ride_length, fill = member_casual)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Mean travel distance by Members and Casual riders",
    x = "Member and Casual riders",
    y = "Average distance In Km"
  )
From the above chart we can see that both rider types travel about the
same average distance. This similarity could be because members take
rides of similar length throughout the week, while casual riders ride
mostly on weekends, when their rides are longer.
Analysis and visualize the bike demand by hour in a
day
# Stacked bars: number of rides started in each hour, split by rider type.
ggplot(df, aes(x = hour, fill = member_casual)) +
  geom_bar() +
  labs(x = "Hour of the day", title = "Cyclistic's Bike demand by hour in a day")
From the above chart we can see more members between 7am and 11am and
more casual riders between 3pm and 12am. Also there is bigger volume
rise in the afternoon for both type of riders. This information needs to
be checked on day basis.
Analysis and visualize the bikes demand by hour in a day of
a week
# Same hourly ride counts as a stacked bar chart, one panel per day of week.
ggplot(df, aes(x = hour, fill = member_casual)) +
  geom_bar() +
  facet_wrap(~ day_of_week) +
  labs(
    x = "Hour of the day",
    title = "Cyclistic's bike demand per hour by day of the week"
  )
There is a lot of difference between the weekdays and weekends. There
is a big increase of volume in the weekdays between 7am to 10am and
another volume increase from 5pm to 7pm. We can hypothesize that members
use the bikes as daily routine like going to work (same behavior
throughout the weekdays) and go back from work (5pm - 7pm). Weekends are
completely different for members and casual riders, Friday, Saturday and
Sunday there is huge peak in volume for casual riders, from this we can
hypothesize that casual riders mostly use bike share for leisure
activity in the weekends.
Analyzing the preferred bike by member casual
riders
# Number of rides recorded for each bike type.
df %>%
  group_by(rideable_type) %>%
  summarise(count = n())
## # A tibble: 3 × 2
## rideable_type count
## <chr> <int>
## 1 classic_bike 2481004
## 2 docked_bike 137651
## 3 electric_bike 1713986
# Stacked bars: rides per bike type, split by rider type.
df %>%
  ggplot(aes(x = rideable_type, fill = member_casual)) +
  geom_bar() +
  labs(
    x = "Rideable type",
    title = "Rideable type Vs. total rides by Members and casual riders"
  )
From the above viz we can see that members mostly use classic bikes,
followed by electric bikes. Docked bikes mostly used by casual riders.
Electric bikes are more favored by members.
Analyzing and visualizing the top 5 starting stations for
casual riders
# Five most frequent start stations among casual riders, busiest first.
df %>%
  filter(!is.na(start_station_name), member_casual == "casual") %>%
  group_by(start_station_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(5)
## Selecting by count
## # A tibble: 5 × 2
## start_station_name count
## <chr> <int>
## 1 Streeter Dr & Grand Ave 46527
## 2 DuSable Lake Shore Dr & Monroe St 25609
## 3 Millennium Park 20893
## 4 Michigan Ave & Oak St 20459
## 5 DuSable Lake Shore Dr & North Blvd 20302
# Horizontal bar chart of the five busiest start stations for casual riders.
df %>%
  filter(!(is.na(start_station_name))) %>%
  filter(member_casual == "casual") %>%
  group_by(start_station_name) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  top_n(5) %>%
  # Reorder the station factor by count so the bars are sorted by size.
  mutate(start_station_name = fct_reorder(start_station_name, count)) %>%
  ggplot(aes(x = start_station_name, y = count, fill = count)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the station and y the
  # ride count, even though coord_flip() draws the stations vertically.
  labs(
    x = "Start Station Name",
    y = "Number of Rides",
    title = "Top 5 starting stations for casual riders"
  )
## Selecting by count
Analyzing and visualizing the top 5 ending stations for
casual riders
# Five most frequent end stations among casual riders, busiest first.
df %>%
  filter(!is.na(end_station_name), member_casual == "casual") %>%
  group_by(end_station_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(5)
## Selecting by count
## # A tibble: 5 × 2
## end_station_name count
## <chr> <int>
## 1 Streeter Dr & Grand Ave 49616
## 2 DuSable Lake Shore Dr & North Blvd 23507
## 3 DuSable Lake Shore Dr & Monroe St 23431
## 4 Millennium Park 22762
## 5 Michigan Ave & Oak St 22148
# Horizontal bar chart of the five busiest end stations for casual riders.
df %>%
  filter(!(is.na(end_station_name))) %>%
  filter(member_casual == "casual") %>%
  group_by(end_station_name) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  top_n(5) %>%
  # Reorder the station factor by count so the bars are sorted by size.
  mutate(end_station_name = fct_reorder(end_station_name, count)) %>%
  ggplot(aes(x = end_station_name, y = count, fill = count)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the station and y the
  # ride count, even though coord_flip() draws the stations vertically.
  labs(
    x = "End Station Name",
    y = "Number of Rides",
    title = "Top 5 ending stations for casual riders"
  )
## Selecting by count
Analyzing and visualizing the top 5 starting stations for
member riders
# Five most frequent start stations among member riders, busiest first.
df %>%
  filter(!is.na(start_station_name), member_casual == "member") %>%
  group_by(start_station_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(5)
## Selecting by count
## # A tibble: 5 × 2
## start_station_name count
## <chr> <int>
## 1 Kingsbury St & Kinzie St 23442
## 2 Clark St & Elm St 20969
## 3 Clinton St & Washington Blvd 20260
## 4 Wells St & Concord Ln 19897
## 5 Loomis St & Lexington St 18979
# Horizontal bar chart of the five busiest start stations for member riders.
df %>%
  filter(!(is.na(start_station_name))) %>%
  filter(member_casual == "member") %>%
  group_by(start_station_name) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  top_n(5) %>%
  # Reorder the station factor by count so the bars are sorted by size.
  mutate(start_station_name = fct_reorder(start_station_name, count)) %>%
  ggplot(aes(x = start_station_name, y = count, fill = count)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the station and y the
  # ride count, even though coord_flip() draws the stations vertically.
  labs(
    x = "Start Station Name",
    y = "Number of Rides",
    title = "Top 5 starting stations for member riders"
  )
## Selecting by count
Analyzing and visualizing the top 5 ending stations for
member riders
# Five most frequent end stations among member riders, busiest first.
df %>%
  filter(!is.na(end_station_name), member_casual == "member") %>%
  group_by(end_station_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(5)
## Selecting by count
## # A tibble: 5 × 2
## end_station_name count
## <chr> <int>
## 1 Kingsbury St & Kinzie St 23231
## 2 Clark St & Elm St 21304
## 3 Clinton St & Washington Blvd 21181
## 4 Wells St & Concord Ln 20662
## 5 University Ave & 57th St 19424
# Horizontal bar chart of the five busiest end stations for member riders.
df %>%
  filter(!(is.na(end_station_name))) %>%
  filter(member_casual == "member") %>%
  group_by(end_station_name) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  top_n(5) %>%
  # Reorder the station factor by count so the bars are sorted by size.
  mutate(end_station_name = fct_reorder(end_station_name, count)) %>%
  ggplot(aes(x = end_station_name, y = count, fill = count)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the station and y the
  # ride count, even though coord_flip() draws the stations vertically.
  labs(
    x = "End Station Name",
    y = "Number of Rides",
    title = "Top 5 ending stations for member riders"
  )
## Selecting by count
Analyze and visualize the dataset on coordinate
basis
# Count rides per (start point, end point, rider type, bike type) combination
# and keep only the combinations that occur more than 200 times.
# NOTE(review): the filter keeps rides that differ in BOTH longitude and
# latitude, so rides sharing exactly one coordinate are dropped as well;
# confirm this is intended rather than "start point differs from end point".
coordinates_df <- df %>%
  filter(start_lng != end_lng, start_lat != end_lat) %>%
  count(
    start_lng, start_lat, end_lng, end_lat, member_casual, rideable_type,
    name = "total_rides"
  ) %>%
  filter(total_rides > 200)

# Split the frequent combinations by rider type for the two map plots.
casual_riders <- filter(coordinates_df, member_casual == "casual")
member_riders <- filter(coordinates_df, member_casual == "member")
Setting up the ggmap for Chicago
# Bounding box (longitude / latitude limits) of the map area to download.
chicago <- c(left = -87.700424, bottom = 41.790769, right = -87.554855, top = 41.990119)
# Download "terrain" map tiles for that bounding box at zoom level 12.
# NOTE(review): get_stamenmap() depends on Stamen-hosted tiles; newer ggmap
# releases provide get_stadiamap() (API key required) instead. Verify that
# this call still works with the installed ggmap version.
chicago_map <- get_stamenmap(bbox = chicago, zoom = 12, maptype = "terrain")
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
Visualization on the map
# Draw the start points of the frequent casual-rider coordinate pairs on the
# lightened Chicago base map, coloured by bike type (legend hidden).
ggmap(chicago_map, darken = c(0.1, "white")) +
  geom_point(
    data = casual_riders,
    mapping = aes(x = start_lng, y = start_lat, color = rideable_type),
    size = 2
  ) +
  coord_fixed(0.8) +
  theme(legend.position = "none") +
  labs(title = "Most used routes by Casual riders", x = NULL, y = NULL)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## Warning: Removed 8 rows containing missing values (`geom_point()`).
# Draw the start points of the frequent member-rider coordinate pairs on the
# lightened Chicago base map, coloured by bike type (legend hidden).
ggmap(chicago_map, darken = c(0.1, "white")) +
  geom_point(
    data = member_riders,
    mapping = aes(x = start_lng, y = start_lat, color = rideable_type),
    size = 2
  ) +
  coord_fixed(0.8) +
  theme(legend.position = "none") +
  labs(title = "Most used routes by Member riders", x = NULL, y = NULL)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## Warning: Removed 37 rows containing missing values (`geom_point()`).
We can clearly see that the casual rides are mostly located around the
center of the city (near the lakefront). Having all their trips located
around that area points towards their bike usage pattern, which is for
leisure, probably tourist or sightseeing related rides.
Members mostly use bikes all over the city, including the main city
area and outside the main center. We can hypothesize that they travel for
work purposes.