-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBike rental prediction-project using R.R
More file actions
1371 lines (998 loc) · 54.9 KB
/
Bike rental prediction-project using R.R
File metadata and controls
1371 lines (998 loc) · 54.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###Bike Rental Prediction - Project###
# Section: Loading Packages and Libraries for Visualizing Dataset and Performing ML Analysis
## Loading Important Libraries
#Before proceeding, ensure to install all the packages listed inside the `library()` function.
# Install ranger package if not already installed
# install.packages("ranger")
# ggplot2: A powerful and flexible plotting system for creating visualizations in R.
library(ggplot2)
# tidyverse: A collection of packages (including ggplot2) that provide a consistent and efficient data manipulation workflow.
library(tidyverse)
# explore: A package for exploratory data analysis, offering various visualizations and summary statistics.
library(explore)
# GGally: Extension to ggplot2, designed for creating ggplot2 visualizations with multiple plots.
library(GGally)
# ggridges: Creates ridge plots for visualizing the distribution of a numeric value by one or more categorical factors.
library(ggridges)
# Metrics: Provides various metrics for evaluating machine learning models, including RMSE and MAE.
library(Metrics)
# car: Companion to Applied Regression package, offering various regression-related functions.
library(car)
# corrgram: Package for creating correlograms to visualize correlation matrices.
library(corrgram)
# corrplot: Package for creating correlation plots.
library(corrplot)
# caret: Classification and Regression Training package, a comprehensive framework for building predictive models.
library(caret)
# randomForest: Implements random forest algorithms for classification and regression.
library(randomForest)
# ranger: Another implementation of random forest algorithms for faster performance.
library(ranger)
# DMwR2: Data Mining with R, provides functions and data sets for teaching data mining.
library(DMwR2)
# ipred: Improved Predictors package, provides bagging and bootstrapping algorithms for predictive modeling.
library(ipred)
# caTools: Provides various tools for data splitting and manipulation, often used in predictive modeling.
library(caTools)
# viridis: A colorblind-friendly color palette for data visualizations.
library(viridis)
# lubridate: Simplifies date and time handling in R.
library(lubridate)
# readxl: Package for reading Excel files.
library(readxl)
#Section: Adding File into R Programe
# Install and load the readxl package
install.packages("readxl")
library(readxl)
# Provide the correct path to your Excel file
file_path <- "C:/Users/anami/OneDrive/Desktop/bike rental dataset/Dataset/1657875746_day.xlsx"
# Read the Excel file into a data frame
bike_df <- read_excel(file_path)
#Section : Exploring Dataset Given
# Print the first few rows of the data frame using the `head` function
head(bike_df)
# Alternatively, you can use `head.matrix` to print the first few rows
head.matrix(bike_df)
# Explanation:
# The `head` function is commonly used to display the first few rows of a data frame.
# It helps to quickly inspect the structure and contents of the dataset.
# Print the last few rows of the data frame using the `tail` function
tail(bike_df)
# Alternatively, you can use `tail.matrix` to print the last few rows
tail.matrix(bike_df)
# Explanation:
# The `tail` function is commonly used to display the last few rows of a data frame.
# It helps to check the end of the dataset and ensures data has been read correctly.
# Using the paste function to concatenate strings and print information about the dataset dimensions
# dim(bikes_df) returns a vector with the number of rows and columns in the dataset
# The concatenated string includes "Dimension of dataset: " followed by the number of rows and columns
paste("Dimension of dataset: ", dim(bike_df))
#Section:Performing EDA: Exploratory Data Analysis
#Rename the columns
names(bike_df)<-c('record_id','datetime','season','year','month','holiday','weekday','workingday','weather_condition','temp','atemp','humidity','windspeed','casual_users','registerd_users','count')
head(bike_df)
#Section : Descriptive Analysis
# Summary of the dataset using the summary() function to get a quick overview of key statistics.
summary(bike_df)
#section: Date type conversion of attributes
# Display the structure of the dataset using the str() function to understand variable types and their distribution.
str(bike_df)
#Typecasting the datetime and numerical attributes to category
# Convert 'datetime' to Date format
bike_df$datetime<- as.Date(bike_df$datetime)
bike_df$year<-as.factor(bike_df$year)
bike_df$month<-as.factor(bike_df$month)
bike_df$season <- as.factor(bike_df$season)
bike_df$holiday<- as.factor(bike_df$holiday)
bike_df$weekday<- as.factor(bike_df$weekday)
bike_df$workingday<- as.factor(bike_df$workingday)
bike_df$weather_condition<- as.factor(bike_df$weather_condition)
# Display the structure of the dataset using the str() function to understand variable types and their distribution.
str(bike_df)
#Section: Missing Value ANalysis:
#Missing values in dataset
missing_val<-data.frame(apply(bike_df,2,function(x){sum(is.na(x))}))
names(missing_val)[1]='missing_val'
missing_val
# Section: Visualization of Numerical Variables
# Visualize the relationships and distributions of key numerical variables using a pairs plot.
# This plot includes temperature ('temp'), "feels-like" temperature ('atemp'), humidity ('hum'),
# windspeed ('windspeed'), casual rentals ('casual'), registered rentals ('registered'),
# and the total number of rentals ('cnt'). The plot helps identify potential patterns, correlations,
# and outliers among these variables.
# Reduce the outer margins to avoid "figure margins too large" error
par(mar = c(0.1, 0.1, 0.1, 0.1))
num_vars <- c("temp", "atemp", "humidity", "windspeed", "casual_users", "registerd_users", "count")
pairs(bike_df[, num_vars],
main = "Pairs Plot of Numerical Variables",
col = bike_df$season, # Color points based on the season for additional insights
pch = 16) # Use filled circles for better visibility
# Adding custom titles for each panel of the pairs plot.
for (i in 1:length(num_vars)) {
for (j in 1:length(num_vars)) {
panel_var1 <- num_vars[i]
panel_var2 <- num_vars[j]
panel_title <- paste("Relationship between", panel_var1, "and", panel_var2)
panel_subtitle <- ifelse(i == j, "Distribution", paste("Colored by Season:", unique(bike_df$season)))
title(panel_title, line = 2.5, cex.main = 0.8)
title(panel_subtitle, line = 4, cex.sub = 0.6)
}
}
# Reset the outer margins to default after creating the plot
par(mar = c(5, 4, 4, 2) + 0.1)
# Section: Exploring Bike Rentals Distribution
# Explore the distribution of bike rentals using a histogram.
# This plot provides an overview of the distribution of total bike rentals ('cnt') per observation.
# Plotting the histogram to visualize the distribution of bike rentals.
hist(bike_df$count,
main = "Distribution of Bike Rentals",
xlab = "Number of Rentals",
col = "#75AADB", # Custom color for better visibility
border = "#333333", # Border color
breaks = 30, # Adjust the number of bins
xlim = c(0, max(bike_df$count) + 50), # Set x-axis limits for better readability
las = 1, # Keep axis labels horizontal
cex.main = 1.5, # Increase main title size
cex.lab = 1.2, # Increase axis label size
cex.axis = 1.2) # Increase axis tick label size
# Adding informative elements to the plot.
abline(v = mean(bike_df$count), col = "red", lwd = 2, lty = 2) # Add a vertical line for mean
text(mean(bike_df$count) + 10, 250, "Mean", col = "red", font = 2, cex = 1.2) # Label for the mean
# Adding additional information inside the plot
mtext(" The distribution is slightly right-skewed.", side = 1, line = 2, cex = 1.1)
mtext("The mean number of rentals is marked by a red dashed line.", side = 3, line = 3, cex = 1.1)
# Section:Histogram of Target Variable- "count"
# Load the required library for data visualization
library(ggplot2)
# Create a histogram to explore the distribution of the target variable 'count'.
ggplot(bike_df, aes(x = count)) +
geom_histogram(bins = 30, colour = "black", fill = "#56B4E9") +
ggtitle("Distribution of Bike Rentals ('count')") +
xlab("Count Variable") + ylab("Density of the Sample") +
theme(
plot.title = element_text(color = "blue", size = 18, face = "bold"),
axis.title.x = element_text(color = "blue", size = 14),
axis.title.y = element_text(color = "blue", size = 14)
) +
# Adding informative elements on the plot
geom_vline(aes(xintercept = mean(bike_df$count)),
color = "red", linetype = "dashed", linewidth = 1.2) +
annotate("text", x = mean(bike_df$cnt) + 10, y = 30,
label = "Mean", color = "red", size = 5)
# Section: Log Transformation of Bike Rentals
# Create a histogram and density plot after applying log transformation to 'count'
ggplot(bike_df, aes(x = log(count))) +
geom_histogram(aes(y = after_stat(density)), bins = 30, colour = "black", fill = "grey") +
geom_density(alpha = 0.2, fill = "cyan") +
ggtitle("Distribution of Log-Transformed Bike Rentals ('count')") +
xlab("Log-Transformed Count Variable") + ylab("Density of the Sample") +
theme(
plot.title = element_text(color = "black", size = 18, face = "bold"),
axis.title.x = element_text(color = "blue", size = 14),
axis.title.y = element_text(color = "blue", size = 14)
) +
# Adding additional information on the plot
geom_vline(aes(xintercept = mean(log(bike_df$count))),
color = "red", linetype = "dashed", linewidth = 1.2) +
annotate("text", x = mean(log(bike_df$count)) + 0.1, y = 0.2,
label = "Mean (log-transformed)", color = "red", size = 4) +
geom_vline(aes(xintercept = median(log(bike_df$count))),
color = "green", linetype = "dashed", linewidth = 1.2) +
annotate("text", x = median(log(bike_df$count)) + 0.1, y = 0.4,
label = "Median (log-transformed)", color = "green", size = 4) +
# Adding insights on the plot
annotate("text", x = 6, y = 0.6,
label = "Insights:", color = "red", size = 4, fontface = "bold") +
annotate("text", x = 6, y = 0.55,
label = "1. Applying a log transformation has helped in reducing skewness.", color = "red", size = 2) +
annotate("text", x = 6, y = 0.5,
label = "2. The mean and median values are marked for reference.", color = "red", size = 2)
# Section: Creating Correlogram with ggpairs
# Create a new plotting device
dev.new()
# Load necessary libraries
library(GGally)
library(viridis)
# Create a correlogram with ggpairs
ggpairs(bike_df, title = "Correlogram with ggpairs()")
ggcorr(bike_df, method = c("everything", "pearson"), label=TRUE) +
ggtitle("Plot of correlation variables") +
theme(plot.title = element_text(color="blue", size=18, face="bold"))
cat("With the correlation plot, we can see how the variables that most interfere/correlate with the target are temperature, apparent temperature (atemp), weather, year, season, and finally, wind.\n")
# Section: Exploring Data with 'explore' Package
# Load required packages
library(explore)
library(lubridate)
# Extract the year from the 'datetime' variable
bike_df$year_from_date <- year(bike_df$datetime)
# Set display format to show full numeric values
options(scipen = 999)
# Use 'explore' to analyze the dataset, focusing on the 'cnt' variable with the extracted 'year_from_date' as the target
explore(bike_df, count, target = year_from_date)
# Section: Exploring Correlation Among Numerical Variables
# Explore the correlation matrix for numerical variables.
# The matrix and the subsequent heatmap help identify strong correlations among variables.
# Calculate the correlation matrix
cor_matrix <- cor(bike_df[, c("temp", "atemp", "humidity", "windspeed", "casual_users", "registerd_users", "count")])
# Display the correlation matrix
cat("Correlation Matrix:\n")
print(cor_matrix)
# Create a correlation heatmap for a visual representation of variable relationships.
# Strong positive or negative correlations are visually highlighted.
library(corrplot)
# Customize the correlation plot
corrplot(
cor_matrix,
method = "color",
font = 4,
col = colorRampPalette(c("#4575b4", "#91bfdb", "#e0f3f8", "#fee090", "#d73027"))(100), # Attractive color scheme
tl.col = "black",
tl.srt = 40,
addCoef.col = "black",
order = "hclust",
addrect = 3 # Highlight cells with correlations above a certain threshold
)
# Add informative elements to the plot
title("Correlation heatmap", line = 0.5, cex.main = 1.8) # Lowercase title
mtext("Strong Positive Correlation", side = 3, line = 2, col = "#d73027", cex = 0.8)
mtext("Strong Negative Correlation", side = 1, line = 3, col = "#4575b4", cex = 0.8)
# Section: Season and Weekday Analysis
# Load the required library
library(dplyr)
# Plot season-wise monthly distribution of counts
season_month_plot <- ggplot(bike_df, aes(x = month, y = count, fill = season)) +
theme_minimal() +
geom_col(position = "dodge", color = "black", alpha = 0.8) +
labs(x = 'Month', y = 'Total Count',
title = 'Season-wise Monthly Distribution of Counts',
subtitle = 'Comparison of counts across different seasons',
fill = 'Season') +
scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) # Custom colors
# Add data labels on top of each bar
season_month_plot <- season_month_plot +
geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3)
# Highlight the peak month for each season
peak_months <- bike_df %>%
group_by(season) %>%
slice(which.max(count))
season_month_plot <- season_month_plot +
geom_point(data = peak_months, aes(x = month, y = count), color = "red", size = 3) +
geom_text(data = peak_months, aes(x = month, y = count, label = paste("Peak\nMonth")),
vjust = -0.5, hjust = 1, color = "red", size = 3)
# Customize legend
season_month_plot <- season_month_plot +
guides(fill = guide_legend(title = "Season"))
# Print the enhanced season-wise plot
print(season_month_plot)
# Plot weekday-wise monthly distribution of counts
weekday_month_plot <- ggplot(bike_df, aes(x = month, y = count, fill = factor(weekday))) +
theme_minimal() +
geom_col(position = "dodge", color = "white", alpha = 0.8) +
labs(x = 'Month', y = 'Total Count',
title = 'Weekday-wise Monthly Distribution of Counts',
subtitle = 'Comparison of counts across different weekdays',
fill = 'Weekday') +
scale_fill_discrete(name = 'Weekday', labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) + # Customize weekday labels
# Add data labels on top of each bar
geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3)
# Highlight the peak day of the week
peak_days <- bike_df %>%
group_by(weekday) %>%
slice_max(order_by = count)
# Customize legend
weekday_month_plot <- weekday_month_plot +
guides(fill = guide_legend(title = "Weekday"))
# Print the enhanced weekday-wise plot
print(weekday_month_plot)
# Section: Bike Rentals by Season
# Explore the distribution of bike rentals based on the season using a boxplot.
# This boxplot shows how the total number of rentals ('cnt') varies across different seasons.
# Create a boxplot to visualize bike rentals by season
ggplot(bike_df, aes(x = season, y = count, fill = as.factor(season))) +
geom_boxplot() +
ggtitle("Bike Rentals by Season") +
xlab("Season") + ylab("Number of Rentals") +
scale_fill_manual(values = c("#FF6666", "#FFCC66", "#99FF99", "#6666FF"), name = "Season") +
theme_minimal() +
theme(
plot.title = element_text(color = "blue", size = 18, face = "bold"),
axis.title.x = element_text(color = "blue", size = 14),
axis.title.y = element_text(color = "blue", size = 14),
legend.position = "top",
legend.title = element_text(color = "blue", size = 14)
) +
# Adding additional information on the plot
annotate("text", x = c(1, 2, 3, 4), y = 6000,
label = c("Spring", "Summer", "Fall", "Winter"),
color = "blue", size = 4, fontface = "bold") +
annotate("text", x = 1, y = 7000, label = "Median", color = "red", size = 4) +
annotate("text", x = 2, y = 7000, label = "Median", color = "red", size = 4) +
annotate("text", x = 3, y = 7000, label = "Median", color = "red", size = 4) +
annotate("text", x = 4, y = 7000, label = "Median", color = "red", size = 4) +
geom_hline(yintercept = median(bike_df$count), linetype = "dashed", color = "red", linewidth = 1.2)
# Section: Violin Plot for Yearly Distribution of Counts
library(ggplot2)
# Violin plot for Yearly wise distribution of counts
yearly_violin_plot <- ggplot(bike_df, aes(x = year, y = count, fill = as.factor(year))) +
geom_violin() +
theme_bw() +
labs(x = 'Year', y = 'Total Count',
title = 'Yearly Distribution of Counts',
subtitle = 'Comparison of counts across different years',
fill = 'Year') +
# Add data points on top of the violin plot for better insight
geom_jitter(aes(color = as.factor(year)), width = 0.2, alpha = 0.5) +
# Customize legend
scale_fill_manual(values = c('#1F78B4', '#33A02C'), name = 'Year') +
scale_color_manual(values = c('#1F78B4', '#33A02C'), name = 'Year') +
# Add a box plot inside the violin plot
geom_boxplot(width = 0.1, fill = 'white', color = 'black', alpha = 0.7) +
# Improve plot appearance
theme(legend.position = 'top') +
# Add informative annotations
annotate('text', x = c(0.8, 1.2), y = c(6000, 6000), label = c('2011', '2012'), size = 4, color = 'black') +
annotate('text', x = 1, y = 8000, label = 'Data Distribution', size = 6, color = 'black', fontface = 'bold') +
annotate('text', x = 1, y = 7000, label = 'Comparison of counts across different years using violin plots and box plots.', size = 3, color = 'black') +
annotate('text', x = 0.8, y = 6800, label = 'Data points are added for better insight.', size = 3, color = 'black')
# Print the Violin plot
print(yearly_violin_plot)
print("From the voilin plot, we can analysis that the bike rental count distribution is highest in year 2012 then the previous year.In the graph, year 0 represent 2011 and year 1 represent 2012 respectively\n")
# Section: Exploring Bike Rentals During Holidays
# Explore summary statistics for the 'count' variable based on the 'holiday' status.
# This analysis provides insights into whether there are notable differences in rentals during holidays.
# Use tapply to calculate summary statistics for 'count' grouped by 'holiday' status
cnt_summary_by_holiday <- tapply(bike_df$count, bike_df$holiday, summary)
# Display the summary statistics
cat("Summary Statistics for 'cnt' Variable Based on 'holiday' Status:\n")
print(cnt_summary_by_holiday)
# Create an analytical plot to visualize the distribution of rentals during holidays and non-holidays
par(mfrow = c(1, 3)) # Set up a 1x3 grid for side-by-side plots
# Boxplot for Rentals by Holiday Status
boxplot(bike_df$count ~ bike_df$holiday, main = "Boxplot of Rentals by Holiday Status", xlab = "Holiday", ylab = "Number of Rentals", col = "#75AADB")
# Add statistics on the boxplot
# Calculate boxplot statistics
stats <- boxplot.stats(bike_df$count[bike_df$holiday == 1])
stats_text <- paste("Median:", round(stats$stats[3], 2), "\nIQR:", round(IQR(bike_df$count[bike_df$holiday == 1]), 2))
mtext(stats_text, side = 3, line = 0.5, at = 1.8, adj = 0, font = 1, cex = 0.8)
# Histogram for Distribution of Rentals on Non-Holidays
hist(bike_df$count[bike_df$holiday == 0], main = "Distribution of Rentals on Non-Holidays", xlab = "Number of Rentals", col = "#75AADB", border = "#333333", xlim = c(0, max(bike_df$count) + 50))
# Add statistics on the histogram
hist_stats_text <- paste("Mean:", round(mean(bike_df$count[bike_df$holiday == 0]), 2), "\nSD:", round(sd(bike_df$count[bike_df$holiday == 0]), 2))
mtext(hist_stats_text, side = 3, line = 0.5, at = max(bike_df$count[bike_df$holiday == 0]) * 0.8, adj = 0, font = 1, cex = 0.8)
# Add mean and sd lines
abline(v = mean(bike_df$count[bike_df$holiday == 0]), col = "red", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 0]) - sd(bike_df$count[bike_df$holiday == 0]), col = "purple", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 0]) + sd(bike_df$count[bike_df$holiday == 0]), col = "purple", lty = 2, lwd = 2)
# Mark the observation points on the histogram
rug(bike_df$count[bike_df$holiday == 0], col = "blue", lwd = 1.5)
# Histogram for Distribution of Rentals on Holidays
hist(bike_df$count[bike_df$holiday == 1], main = "Distribution of Rentals on Holidays", xlab = "Number of Rentals", col = "#75AADB", border = "#333333", xlim = c(0, max(bike_df$count) + 50))
# Add statistics on the histogram
hist_stats_text_holiday <- paste("Mean:", round(mean(bike_df$count[bike_df$holiday == 1]), 2), "\nSD:", round(sd(bike_df$count[bike_df$holiday == 1]), 2))
mtext(hist_stats_text_holiday, side = 3, line = 0.5, at = max(bike_df$count[bike_df$holiday == 1]) * 0.8, adj = 0, font = 1, cex = 0.8)
# Add mean and sd lines
abline(v = mean(bike_df$count[bike_df$holiday == 1]), col = "red", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 1]) - sd(bike_df$count[bike_df$holiday == 1]), col = "purple", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 1]) + sd(bike_df$count[bike_df$holiday == 1]), col = "purple", lty = 2, lwd = 2)
# Mark the observation points on the histogram
rug(bike_df$count[bike_df$holiday == 1], col = "blue", lwd = 1.5)
# Section: Analyzing Bike Rentals on Holidays
# Column plot for holiday-wise distribution of counts
holiday_plot <- ggplot(bike_df, aes(x = factor(holiday), y = count, fill = season)) +
geom_col(position = "dodge", color = "black", alpha = 0.8) +
theme_minimal() +
labs(
x = 'Holiday',
y = 'Total Count',
title = 'Bike Rentals Analysis: Holidays vs. Non-Holidays',
subtitle = 'Comparing counts on holidays and non-holidays across different seasons',
fill = 'Season'
) +
scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) + # Custom colors +
geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
scale_x_discrete(labels = c("Non-Holiday", "Holiday")) + # Adding scale for 0 and 1 +
theme(legend.position = "top") +
guides(fill = guide_legend(title = "Season")) +
annotate("text", x = 1.5, y = max(bike_df$count) * 1.2,
label = "Insight: During non-holidays, bike rental counts are highest compared to holidays for different seasons.",
color = "red", size = 3, fontface = "bold")
# Print the informative and analytical plot
print(holiday_plot)
# Section: Analyzing Bike Rentals on neiher weekend nor holiday vs other days
# Column plot for working day-wise distribution of counts
workingday_plot <- ggplot(bike_df, aes(x = factor(workingday), y = count, fill = season)) +
geom_col(position = "dodge", color = "black", alpha = 0.8) +
theme_minimal() +
labs(
x = 'Working Day',
y = 'Total Count',
title = 'Bike Rentals Analysis: neiher weekend nor holiday vs. other days',
subtitle = 'Comparing counts on working days and non-working days across different seasons',
fill = 'Season'
) +
scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) + # Custom colors +
geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
scale_x_discrete(labels = c("neiher weekend nor holiday", "other days")) + # Adding scale for 0 and 1 +
theme(legend.position = "top") +
guides(fill = guide_legend(title = "Season")) +
annotate("text", x = 1.5, y = max(bike_df$count) * 1.1,
label = "Insight: neiher weekend nor holiday - Bike rental count is higher.\n other day - Bike rental count is lower",
color = "red", size = 3, fontface = "bold")
print(workingday_plot)
# Section: Impact of Weather Conditions on Bike Rentals
# Column plot for weather condition-wise distribution of counts
weather_condition_plot <- ggplot(bike_df, aes(x = factor(weather_condition), y = count, fill = season)) +
geom_col(position = "dodge", color = "black", alpha = 1) +
theme_minimal() +
labs(
x = 'Weather Condition',
y = 'Total Count',
title = 'Impact of Weather Conditions on Bike Rentals',
subtitle = 'Comparing bike rental counts under different weather conditions across different seasons',
fill = 'Season'
) +
scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) + # Custom colors +
geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
scale_x_discrete(labels = c("Clear", "Mist + Cloudy", "Light Snow/Light Rain", "Heavy Rain")) + # Adding scale +
theme(legend.position = "top") +
guides(fill = guide_legend(title = "Season")) +
annotate(
"text",
x = 3.0,
y = max(bike_df$count) * 1.1,
label = "Insight: 1. Clear - Bike rental count is very high.\n 2. Mist + Cloudy - Bike rental count is second highest.\n 3. Light Snow/Light Rain - Bike rental count is third highest.\n 4. Heavy Rain - there is no data of Bike renting.",
color = "red",
size = 3,
fontface = "bold"
)
# Print plot
print(weather_condition_plot)
# Section: Temperature Analysis
# Convert temperature variables to Celsius
bike_df$temp_celsius <- bike_df$temp * 41 # Assuming temp is in normalized units, adjust accordingly
bike_df$atemp_celsius <- bike_df$atemp * 50 # Assuming atemp is in normalized units, adjust accordingly
# Section: Combined Temperature Analysis
# Convert temperature variables to Celsius
bike_df$temp_celsius <- bike_df$temp * 41 # Assuming temp is in normalized units, adjust accordingly
bike_df$atemp_celsius <- bike_df$atemp * 50 # Assuming atemp is in normalized units, adjust accordingly
# Scatter plot for bike rentals against temperature and apparent temperature in Celsius
combined_temp_plot <- ggplot(bike_df, aes(x = temp_celsius, y = count, color = "Temperature")) +
geom_point() +
geom_point(aes(x = atemp_celsius, color = "Apparent Temperature"), alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black",
aes(group = 1), formula = y ~ x) + # Add a linear trend line
theme_minimal() +
labs(
x = 'Temperature (Celsius)',
y = 'Total Count',
title = 'Bike Rentals vs. Temperature and Apparent Temperature',
subtitle = 'Scatter plot showing the relationship between bike rentals and temperature variables',
color = 'Variable'
) +
scale_color_manual(values = c("Temperature" = "blue", "Apparent Temperature" = "red")) +
annotate("text", x = 25, y = 8000, label = "Trend Line: Linear Regression", color = "black", size = 4) +
annotate("text", x = 5, y = 3000, label = "Temperature", color = "blue", size = 4) +
annotate("text", x = 30, y = 6000, label = "Apparent Temperature", color = "red", size = 4)
# Print the combined temperature plot
print(combined_temp_plot)
#Outlier Analysis
# Section: Boxplot for Bike Rental Count with Outliers
# Boxplot for bike rental count with outliers
boxplot(bike_df$count, main = 'Bike Rental Count', sub = ifelse(length(boxplot.stats(bike_df$count)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$count)$out)),
ylab = 'Count', col = "cyan", border = "blue")
# Add statistical values
text(1, boxplot.stats(bike_df$count)$stats[1], paste("Min:", round(boxplot.stats(bike_df$count)$stats[1], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[2], paste("1st Quartile:", round(boxplot.stats(bike_df$count)$stats[2], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[3], paste("Median:", round(boxplot.stats(bike_df$count)$stats[3], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[4], paste("Mean:", round(mean(bike_df$count), 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[5], paste("3rd Quartile:", round(boxplot.stats(bike_df$count)$stats[5], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[6], paste("Max:", round(boxplot.stats(bike_df$count)$stats[6], 2)), pos = 4, cex = 1)
# Section: Boxplots for Outliers in Temperature,feel-like temperature Humidity, and Windspeed
# Set up the layout for multiple boxplots
par(mfrow = c(2, 2))
# Box plot for temperature outliers
boxplot(bike_df$temp, main = "Temperature", sub = ifelse(length(boxplot.stats(bike_df$temp)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$temp)$out)),
col = "#FF6347", border = "#8B0000", notch = TRUE, outline = FALSE)
# Box plot for temperature outliers
boxplot(bike_df$atemp, main = "Feellike-Temperature", sub = ifelse(length(boxplot.stats(bike_df$atemp)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$atemp)$out)),
col = "pink", border = "red", notch = TRUE, outline = FALSE)
# Box plot for humidity outliers
boxplot(bike_df$humidity, main = "Humidity", sub = ifelse(length(boxplot.stats(bike_df$humidity)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$humidity)$out)),
col = "#87CEEB", border = "#1E90FF", notch = TRUE, outline = FALSE)
# Box plot for windspeed outliers
boxplot(bike_df$windspeed, main = "Windspeed", sub = ifelse(length(boxplot.stats(bike_df$windspeed)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$windspeed)$out)),
col = "#98FB98", border = "#008000", notch = TRUE, outline = FALSE)
# Outlier Replacement and Imputation
# Section: Replacing and Imputing Outliers in Humidity and Windspeed
# Install the dplyr package if not installed
install.packages("dplyr")
# Load the required libraries
library(DMwR2)
library(dplyr)
# Create a subset for windspeed and humidity variables
wind_hum <- subset(bike_df, select = c('windspeed', 'humidity'))
# Function to replace outliers with NA
replace_outliers <- function(x) {
q <- quantile(x, c(0.25, 0.75))
iqr <- q[2] - q[1]
lower_bound <- q[1] - 1.5 * iqr
upper_bound <- q[2] + 1.5 * iqr
x[x < lower_bound | x > upper_bound] <- NA
return(x)
}
# Apply the function to each column
wind_hum <- wind_hum %>% mutate(across(everything(), replace_outliers))
wind_hum
# Impute missing values using mean imputation method
wind_hum$windspeed[is.na(wind_hum$windspeed)] <- mean(wind_hum$windspeed, na.rm = TRUE)
wind_hum$humidity[is.na(wind_hum$humidity)] <- mean(wind_hum$humidity, na.rm = TRUE)
#Section : Combining the imputed dataset and original dataset
new_df <- subset(bike_df, select = -c(windspeed, humidity)) # Remove original windspeed and humidity
bike_df <- cbind(new_df, wind_hum) # Combine new_df and wind_hum data frames
# Display the first 5 rows of the updated dataset
head(bike_df)
summary(bike_df)
#Section: plot for numerical variables in combined dataset
# Load the required libraries
library(car)
library(ggplot2)
# Select numerical columns for probability plots in combined dataset
numerical_columns <- sapply(bike_df, is.numeric)
for (column in names(bike_df[, numerical_columns])) {
hist(bike_df[, column], main = paste("Histogram for", column),
xlab = column, col = "skyblue", border = "black")
}
# Create normal probability plots for numerical variables in combined dataset
for (column in names(bike_df[, numerical_columns])) {
qqnorm(bike_df[, column], main = paste("Normal Probability Plot for", column))
qqline(bike_df[, column], col = 2)
# Add insight annotation
annotation <- "Some data points are deviating from normality in a good way."
text(quantile(bike_df[, column], 1.0), quantile(bike_df[, column], 0.1), annotation, adj = c(0, 1), cex = 0.8, col = "darkgreen")
}
# Section: Correlation Analysis of in combined dataset
# Load the corrgram package for correlation analysis
library(corrgram)
# Identify numeric columns for correlation analysis
numeric_columns <- sapply(bike_df[, 8:19], is.numeric)
# Create a correlation plot
corrgram(bike_df[, 8:19][, numeric_columns], order = FALSE, upper.panel = panel.pie, text.panel = panel.txt, main = 'Correlation Plot')
# Add insight on positive and negative correlations
cat("Positive Correlations: temp, atemp, and year have positive correlations with the target variable.\n")
cat("Negative Correlations: weather_condition, humidity, and windspeed have negative correlations with the target variable.\n")
# Identify variables that may not be needed for further analysis based on correlation
cat("\nVariables with weak correlation (abs(correlation) <= 0.1) with the target variable:\n")
weak_corr_vars <- names(bike_df[, 8:19][, numeric_columns])[sapply(bike_df[, 8:19][, numeric_columns], function(x) abs(cor(x, bike_df$count)) <= 0.1)]
print(weak_corr_vars)
# Section: Splitting Dataset for Training and Testing
# Load the purrr library for functions and vectors
library(purrr)
# Split the dataset based on simple random resampling
train_index <- sample(1:nrow(bike_df), 0.7 * nrow(bike_df))
train_data <- bike_df[train_index,]
test_data <- bike_df[-train_index,]
# Display dimensions of the training and testing datasets
cat("Dimensions of Training Data:", dim(train_data), "\n")
cat("Dimensions of Testing Data:", dim(test_data), "\n")
#splitted data exploration
head(train_data)
head(test_data)
# Section: Creating Subsets for Training and Testing
# Create a new subset for train attributes
train <- subset(train_data, select = c('season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weather_condition', 'temp','atemp', 'humidity', 'windspeed', 'count'))
# Create a new subset for test attributes
test <- subset(test_data, select = c('season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weather_condition', 'temp', 'atemp','humidity', 'windspeed', 'count'))
# Display the first few rows of the training subset
cat("Training Subset:")
head(train)
# Display the first few rows of the testing subset
cat("\nTesting Subset:")
head(test)
# Section: Creating Subsets for Training and Testing
# Create a new subset for train categorical attributes
train_cat <- subset(train, select = c('season', 'holiday', 'workingday', 'weather_condition', 'year'))
# Create a new subset for test categorical attributes
test_cat <- subset(test, select = c('season', 'holiday', 'workingday', 'weather_condition', 'year'))
# Create a new subset for train numerical attributes
train_num <- subset(train, select = c('weekday', 'month', 'temp', 'atemp','humidity', 'windspeed', 'count'))
# Create a new subset for test numerical attributes
test_num <- subset(test, select = c('weekday', 'month', 'temp', 'atemp','humidity', 'windspeed', 'count'))
# Display the first few rows of the training categorical attributes subset
cat("Training Categorical Attributes Subset:")
head(train_cat)
# Display the first few rows of the testing categorical attributes subset
cat("\nTesting Categorical Attributes Subset:")
head(test_cat)
# Display the first few rows of the training numerical attributes subset
cat("\nTraining Numerical Attributes Subset:")
head(train_num)
# Display the first few rows of the testing numerical attributes subset
cat("\nTesting Numerical Attributes Subset:")
head(test_num)
# Section: Feature Engineering: Encoding Categorical Features
# Load the required libraries
library(caret)
# Define variables for dummy encoding
othervars <- c('month', 'weekday', 'temp', 'atemp', 'humidity', 'windspeed', 'count')
set.seed(2626)
# Identify categorical variables
vars <- setdiff(colnames(train), c(train$count, othervars))
vars
# Create a formula for encoding
f <- paste('~', paste(vars, collapse = ' + '))
# Use dummyVars to encode categorical variables
encoder <- dummyVars(as.formula(f), train)
encode_attributes <- predict(encoder, train)
# Combine numerical and encoded attributes
train_encoded_attributes <- cbind(train_num, encode_attributes)
# Display the head of the encoded dataset
head(train_encoded_attributes, 5)
# Section: Encoding Categorical Features (Test Dataset)
# Load the required libraries
library(caret)
# Define variables for dummy encoding
othervars <- c('month', 'weekday', 'temp', 'atemp', 'humidity', 'windspeed', 'count')
set.seed(5662)
# Identify categorical variables in the test dataset
vars <- setdiff(colnames(test), c(test$count, othervars))
vars
# Create a formula for encoding
f <- paste('~', paste(vars, collapse = ' + '))
# Use dummyVars to encode categorical variables
encoder <- dummyVars(as.formula(f), test)
encode_attributes <- predict(encoder, test)
# Combine numerical and encoded attributes for the test dataset
test_encoded_attributes <- cbind(test_num, encode_attributes)
# Display the head of the encoded test dataset
head(test_encoded_attributes, 5)
# Section: Modeling the Training Dataset with Linear Regression
# Set seed to reproduce the results of random sampling
set.seed(672)
# Train the Linear Regression model
lr_model <- lm(count ~ ., data = train_encoded_attributes[, -c(6)])
# Display the summary of the Linear Regression model
summary(lr_model)
# Section: Cross Validation Prediction with Linear Regression
# Ignore warning messages
options(warn = -1)
# Set seed to reproduce results of random sampling
set.seed(623)
# Cross validation resampling method
train_control <- trainControl(method = 'cv', number = 3)
# Cross validation prediction
CV_predict <- train(count ~ .,data = train_encoded_attributes[, -c(6)],
method = 'lm', trControl = train_control)
# Display summary of cross validation prediction
summary(CV_predict)
# Observations from the output
cat("\n**Observations:**\n")
cat("- The model's residuals range from -3713.0 to 3165.0, indicating the prediction performance.\n")
cat("- Coefficients and p-values provide insights into variable significance.\n")
cat("- The adjusted R-squared (0.8316) indicates the model's goodness of fit.\n")
cat("- Variables with smaller p-values are considered statistically significant.\n")
cat("- Overall, the linear regression model captures relationships and offers insights into predictor importance and model fit.\n")
#Section: Cross Validation Prediction Plot with Linear Regression
# Increase the size of the plot
par(mfrow=c(1, 1), mar=c(5, 5, 2, 2))
# Cross-validation prediction plot
residuals <- resid(CV_predict)
y_train <- train_encoded_attributes$count
# Scatter plot with residuals
plot(y_train, residuals, ylab = 'Residuals', xlab = 'Observed', main = 'Cross Validation Prediction Plot', pch = 16, col = 'blue')
# Add a reference line at zero
abline(h = 0, col = 'red', lwd = 2)
# Observations from the plot
text(5000, -1500, 'Residuals spread around zero indicates a good model fit', col = 'darkgreen', cex = 0.8)
text(5000, -2500, 'Scatter points should be random and evenly distributed', col = 'darkgreen', cex = 0.8)
# Section: Model Performance on Test Data Analysis
# Set seed for reproducibility
set.seed(6872)
# Suppress warning messages
options(warn = -1)
# Predict using the lr_model on the test_encoded_attributes
lm_predictions <- predict(lr_model, test_encoded_attributes[, -c(6)])
# Display the first 10 predictions
head(lm_predictions, 10)
# Section: Model Performance Visualization using Linear Regressor Model
# Extract actual and predicted values
actual_values <- test_encoded_attributes$count
predicted_values <- lm_predictions
# Plot past actual values and future 10 predicted values
plot(1:length(actual_values), actual_values, type = "l", col = "blue",
xlab = "Sample Index", ylab = "Count", main = "Actual vs Future Predicted Values",
xlim = c(1, length(actual_values) + 10), ylim = c(0, max(actual_values, predicted_values)))
lines(length(actual_values) + 1:length(predicted_values), predicted_values, col = "orange")
# Add lines for future predicted values
lines(length(actual_values) + 1:length(predicted_values), predicted_values, col = "orange")
# Add legend
legend("topright", legend = c("Actual", "Predicted"), col = c("blue", "orange"), lty = 1)
# Highlight the last observed data point
points(length(actual_values), actual_values[length(actual_values)], pch = 19, col = "red")
# Highlight the starting point of predicted values
points(length(actual_values) + 1, predicted_values[1], pch = 19, col = "green")
# Section: Model Evaluation Metrics on Rootmean squred error and mean abosolute error
# Set seed for reproducibility
set.seed(688)
# Root mean squared error (RMSE)
rmse <- RMSE(lm_predictions, test_encoded_attributes$count)
print(paste("Root Mean Squared Error (RMSE):", round(rmse, 2)))
# Mean absolute error (MAE)
mae <- MAE(lm_predictions, test_encoded_attributes$count)
print(paste("Mean Absolute Error (MAE):", round(mae, 2)))
# Section: Residual Analysis
# Set heading for the code block
cat("Residual Analysis:\n")
# Calculate residuals
y_test <- test_encoded_attributes$count
residuals_lm <- y_test - lm_predictions
residuals_lm
# Create an informative residual plot
plot(y_test, residuals_lm, xlab = 'Observed Count', ylab = 'Residuals', main = 'Residual Plot',
col = ifelse(residuals >= 0, 'blue', 'red'), pch = 16, cex = 1.2)
# Add a reference line at y = 0
abline(h = 0, col = 'black', lty = 2, lw = 2)
# Add legend for positive and negative residuals
legend('topright', legend = c('Positive Residuals', 'Negative Residuals'), col = c('blue', 'red'), pch = 16)
# Interpretation:
cat("\nInterpretation:\n")
cat("The residual plot shows the difference between observed and predicted counts.\n")
cat("Positive residuals (in blue) indicate underpredictions, while negative residuals (in red) indicate overpredictions.\n")
cat("Overall, the model seems to perform well, with residuals centered around zero.\n")
# Section: Decision Tree Regressor
# Set heading for the code block
cat("Decision Tree Regressor:\n")