# -*- coding: utf-8 -*-
"""Yulu Project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1ChVHFS-hk8AgYl_08uHRcONGaOajsRtf
# Business Case Study - Yulu
---
~ By Abhinav Gupta <br>
Mail: [email protected]
<br>
## Overview of Yulu
Yulu stands as India's foremost micro-mobility service provider, offering distinctive vehicles tailored for daily commutes. Initially conceived to combat traffic congestion in India, Yulu presents the safest commuting solution through a user-friendly mobile application, facilitating shared, solo, and sustainable commuting.
Strategically positioned Yulu zones span various locations including metro stations, bus stands, office spaces, residential areas, and corporate offices, ensuring seamless, economical, and convenient first and last-mile connectivity.
Recently, Yulu has encountered significant declines in its revenues. In response, they've engaged a consulting firm to dissect the factors influencing demand for shared electric cycles within the Indian market.
## Problem Statement
This project endeavors to dissect the factors underpinning the demand for shared electric cycles in the Indian market. Specifically, the objective is to pinpoint the significant variables predictive of demand for these cycles and assess how effectively these variables elucidate electric cycle demand.
## Dataset
The dataset utilized for this analysis contains pertinent information regarding bike rentals and encompasses factors such as temperature, humidity, weather conditions, user types, and counts of bike rentals.
### Column Profiling
1. `datetime`: Date and time stamp.
2. `season`:
- `1`: Spring
- `2`: Summer
- `3`: Fall
- `4`: Winter
3. `holiday`: Indicates whether the day is a holiday or not, derived from the DC Government Holiday Schedule.
4. `workingday`: Binary indicator (1 for workdays, 0 for weekends/holidays).
5. `weather`:
- `1`: Clear, few clouds, or partly cloudy.
- `2`: Mist with cloudy, broken clouds, or few clouds.
- `3`: Light snow, light rain with thunderstorms and scattered clouds, or light rain with scattered clouds.
- `4`: Heavy rain with ice pellets and thunderstorms, mist with heavy rain, or snow with fog.
6. `temp`: Temperature in Celsius.
7. `atemp`: "Feels like" temperature in Celsius.
8. `humidity`: Humidity level.
9. `windspeed`: Wind speed.
10. `casual`: Count of casual users.
11. `registered`: Count of registered users.
12. `count`: Total count of rental bikes, including both casual and registered users.
You can access the dataset [here](https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/428/original/bike_sharing.csv?1642089089).
`Note that AI has been used for syntactical assistance and a paraphraser has been used to improve the English/explanations.`
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind,f_oneway, levene, kruskal, shapiro, chi2_contingency, chi2
from statsmodels.graphics.gofplots import qqplot
# was getting a lot of them so learned the way to remove them
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/428/original/bike_sharing.csv?1642089089")
data.head()
data.info()
"""#### Observation
1. The dataframe has 10886 rows and 12 columns (features).
2. Most columns contain numbers.
3. Only one column (`datetime`) is of object type.
4. `season`, `holiday`, `workingday`, `weather`, `humidity`, `casual`, `registered`, and `count` are numerical columns.
"""
# Checking for null values
data.isna().sum()
"""All columns have 10886 non-null values, indicating there are no missing values.
There are 10886 rows and 12 columns in total.
Since the data contains no nulls, no missing-data handling is needed.
"""
# Unique values of each column
data.nunique()
data.describe().T
"""* The data spans the dates 2011-01-01 to 2012-12-19.
* The temperature ranges from 0.82 to 41.0 degrees Celsius, with a mean of 20.23.
* The humidity ranges from 0 to 100, with a mean of 61.88.
* Windspeed ranges from 0 to 56.
* `count` ranges from 1 to a maximum of 977.
"""
categorical_cols = [ "workingday", "weather", "season", "holiday"]
continuous_cols = ["temp", "atemp", "humidity", "windspeed", "casual", "registered", "count"]
sns.set_style('darkgrid')
color_palette = ['skyblue', 'lightgreen', 'salmon', 'purple']
# Create subplots
fig, axs = plt.subplots(2, 2, figsize=(15, 7))
# Continuous columns without registered and count for now
continuous_columns = ['temp', 'atemp', 'humidity', 'windspeed']
for i, col in enumerate(continuous_columns):
    sns.histplot(data=data, x=col, ax=axs.flatten()[i], kde=True, color=color_palette[i])
    axs.flatten()[i].set_ylabel('Frequency')
    axs.flatten()[i].set_title('Histogram of ' + col)
plt.tight_layout()
plt.show()
"""Analysis of Continuous Variables:
1. **Temperature** (`temp`):
- The temperature distribution appears to follow a single peak, resembling a normal distribution or slightly skewed to the right.
- The majority of temperatures fall within the range of 10°C to 30°C.
2. **Perceived Temperature** (`atemp`):
- The distribution of perceived temperature also shows a single peak and resembles the distribution of actual temperature.
- Perceived temperature spans from approximately 5°C to 35°C.
3. **Humidity** (`humidity`):
- Humidity distribution appears to exhibit two peaks, indicating a bimodal distribution.
- Humidity values are spread across a wide range, suggesting considerable variability in humidity levels.
4. **Wind Speed** (`windspeed`):
- The wind speed distribution is strongly skewed to the right, suggesting the presence of outliers or extreme values.
- Most wind speed values are concentrated around 20 units or lower, with some instances of higher wind speeds.
"""
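"""The right-skew in `windspeed` suggests outliers; Tukey's IQR rule gives a quick count of them.
A minimal sketch on a synthetic right-skewed stand-in for the column (the gamma parameters and
size below are illustrative assumptions, not fitted to the real data):
"""
import numpy as np

rng = np.random.default_rng(7)
windspeed_demo = rng.gamma(shape=2.0, scale=6.5, size=10_886)  # hypothetical stand-in

# Tukey's rule: points above Q3 + 1.5 * IQR are flagged as outliers.
q1, q3 = np.percentile(windspeed_demo, [25, 75])
upper_fence = q3 + 1.5 * (q3 - q1)
n_outliers = int((windspeed_demo > upper_fence).sum())
print(f"Upper fence: {upper_fence:.1f}, points above it: {n_outliers}")

"""The same few lines applied to `data['windspeed']` would count the real outliers."""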
data[continuous_cols].describe()
fig, axs = plt.subplots(2,2, figsize=(12,10))
for i, col in enumerate(categorical_cols):
    sns.countplot(data=data, x=col, hue=col, ax=axs[i//2][i%2])
plt.show()
num_rows = 4
num_cols = 2
fig, axs = plt.subplots(num_rows, num_cols, figsize=(12, 10))
for i, col in enumerate(categorical_cols):
    sns.boxplot(data=data, x="count", hue=col, ax=axs[i][0])
    sns.histplot(data=data, x="count", hue=col, ax=axs[i][1])
plt.tight_layout()
plt.show()
data['datetime'] = pd.to_datetime(data['datetime']) # convert to datetime
data['date'] = data['datetime'].dt.date # extract date
data['date'] = data['date'].astype('datetime64[ns]')
## Converting the data types of the columns to category
for col in ['season','holiday','workingday','weather']:
    data[col] = data[col].astype('category')
plt.figure(figsize=(10, 6))
# Restrict to the continuous columns; data.corr() would fail on the datetime/category columns in recent pandas
sns.heatmap(data[continuous_cols].corr(), annot=True, cmap='viridis', fmt='.2f', linewidths=2)
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.show()
"""# Hypothesis Testing
## Problem 1: Check if working day has an effect on the number of electric cycles rented
"""
# visual analysis
sns.kdeplot(data = data, x="count", hue="workingday")
plt.show()
"""**Null Hypothesis (H0):** Working day has no effect on the number of cycles being rented.
**Alternate Hypothesis (H1):** Working day has effect on the number of cycles being rented.
**Significance level (alpha):** 0.05
We will use the 2-Sample T-Test to test the hypothesis defined above.
### Assumptions:
1. Random Sampling: We draw 30 random samples from each group of the population data.
2. Independence: The observations are independent of each other.
3. Normality: We are testing means with a sample size of 30, so by the CLT the sample means are approximately normally distributed.
4. Equal Variances: We will assume the two groups have nearly equal variance.
"""
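"""The equal-variance assumption above is stated without a check, but `levene` (already imported)
tests it directly. A minimal sketch on synthetic stand-ins for the two working-day groups
(the exponential scales and sizes below are illustrative assumptions, not the real samples):
"""
import numpy as np
from scipy.stats import levene

rng = np.random.default_rng(42)
g0 = rng.exponential(scale=180, size=30)  # hypothetical non-working-day counts
g1 = rng.exponential(scale=190, size=30)  # hypothetical working-day counts

lev_stat, lev_p = levene(g0, g1)
print("Levene statistic:", lev_stat)
print("Levene p-value:", lev_p)
# lev_p >= 0.05 would give no evidence against equal variances, supporting the
# pooled t-test; otherwise Welch's test (ttest_ind(..., equal_var=False)) is safer.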
# random sampling
s1 = np.random.choice(data[data['workingday'] == 0]['count'].values, 30)
s2 = np.random.choice(data[data['workingday'] == 1]['count'].values, 30)
# independent two-sample t-test
t_stat, pval = ttest_ind(s1, s2, alternative="two-sided")
print("T-statistic:", t_stat)
print("P-value:", pval)
alpha = 0.05
if pval >= alpha:
    print("Fail to reject null hypothesis. Working Day does not have an effect on the number of electric cycles rented.")
else:
    print("Reject null hypothesis. Working Day has an effect on the number of electric cycles rented.")
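"""A p-value alone does not measure how big any difference is; Cohen's d gives a scale-free
effect size to go with the t-test. A minimal sketch on hypothetical 30-point samples
(stand-ins for the random draws above, not the real ones):
"""
import numpy as np

rng = np.random.default_rng(3)
a = rng.exponential(scale=180, size=30)  # hypothetical sample 1
b = rng.exponential(scale=190, size=30)  # hypothetical sample 2

# Cohen's d with the pooled standard deviation (equal group sizes).
pooled_sd = np.sqrt((a.var(ddof=1) + b.var(ddof=1)) / 2)
cohens_d = (a.mean() - b.mean()) / pooled_sd
print("Cohen's d:", cohens_d)  # rough guide: |d| ~ 0.2 small, 0.5 medium, 0.8 large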
"""### **Conclusion:**
The test suggests that the status of the day (whether it is a working day or not) does not have a statistically significant impact on the number of electric cycles rented.
-----
## Problem 2: Check if weather has a significant effect on the number of cycles being rented
"""
sns.kdeplot(data = data, x="count", hue="weather")
plt.show()
"""From the above graph we can see that the data is not normal; it is strongly right-skewed.
**Null Hypothesis (H0)** - No. of cycles rented is similar in different weathers.
**Alternate Hypothesis (H1)** - No. of cycles rented is different in different weathers.
"""
data_1 = data[data['weather'] == 1]['count'].values
data_2 = data[data['weather'] == 2]['count'].values
data_3 = data[data['weather'] == 3]['count'].values
data_4 = data[data['weather'] == 4]['count'].values
"""Assumptions
1. Independence: The observations are independent of each other.
2. Normality: We can test this using QQ-plots or the Shapiro-Wilk test (significance level = 0.05).
3. Equal Variances: We will assume the groups have equal variances.
"""
fig, axs = plt.subplots(1, 4, figsize=(15, 3))
for i, sample in enumerate([data_1, data_2, data_3, data_4]):  # avoid shadowing the `data` dataframe
    qqplot(sample, ax=axs[i])
    axs[i].set_title(f"QQ Plot for data_{i+1}")
plt.tight_layout()
plt.show()
"""The plots further hint that the data are not normally distributed. Let us confirm using the Shapiro-Wilk test."""
# Define a dictionary to hold the data and weather types
weather_data = {
"Weather 1": data_1,
"Weather 2": data_2,
"Weather 3": data_3
}
# Perform Shapiro-Wilk test for each weather type
for weather, sample in weather_data.items():  # avoid shadowing the `data` dataframe
    stat, p_value = shapiro(sample)
    print(f"Shapiro-Wilk Test for {weather}:")
    print("Test Statistic:", stat)
    print("p-value:", p_value)
    print("Data is normally distributed" if p_value > 0.05 else "Data is not normally distributed")
    print()
# Skipping weather 4 since it has only one data point
"""This confirms that the data is not normally distributed.
Although we will perform ANOVA here, we will cross-check our hypothesis using the Kruskal-Wallis test, as it is more appropriate than ANOVA when the assumptions are not met.
"""
def perform_anova(data_list, alpha=0.05):
    """
    Perform ANOVA test and print results.
    Parameters:
    - data_list (list of array-like): List of datasets to compare.
    - alpha (float): Significance level (default is 0.05).
    """
    f_stat, p_value = f_oneway(*data_list)
    print("ANOVA Results:")
    print("F-Statistic:", f_stat)
    print("P-value:", p_value)
    if p_value < alpha:
        print("\nReject the null hypothesis.")
        print("The number of cycles rented is different in different weathers.")
    else:
        print("\nFail to reject null hypothesis.")
        print("The number of cycles rented is similar in different weathers.")
# Call the function with the list of data
perform_anova([data_1, data_2, data_3, data_4])
def perform_kruskal(data_list, alpha=0.05):
    """
    Perform Kruskal-Wallis test and print results.
    Parameters:
    - data_list (list of array-like): List of datasets to compare.
    - alpha (float): Significance level (default is 0.05).
    """
    # Perform Kruskal-Wallis test
    kruskal_stat, p_value = kruskal(*data_list)
    # Print results
    print("Kruskal-Wallis Results:")
    print("Kruskal-Statistic:", kruskal_stat)
    print("P-value:", p_value)
    # Interpret results
    if p_value < alpha:
        print("\nReject the null hypothesis.")
        print("The number of cycles rented is different in different weathers according to Kruskal-Wallis.")
    else:
        print("\nFail to reject null hypothesis.")
        print("The number of cycles rented is similar in different weathers according to Kruskal-Wallis.")
# Call the function with the list of data
perform_kruskal([data_1, data_2, data_3, data_4])
"""### Conclusion
Weather does have an effect on the number of cycles being rented; the counts differ across weather types.
----
## Problem 3: Check if season has an effect on the number of cycles being rented
"""
data = pd.read_csv("https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/428/original/bike_sharing.csv?1642089089")
sns.kdeplot(data = data, x="count", hue="season")
plt.show()
"""Observation: The data doesn't look normally distributed!
**Null Hypothesis (H0)** - No. of cycles rented is similar in different seasons.
**Alternate Hypothesis (H1)** - No. of cycles rented is different in different seasons.
"""
data_1 = data[data['season'] == 1]['count'].values
data_2 = data[data['season'] == 2]['count'].values
data_3 = data[data['season'] == 3]['count'].values
data_4 = data[data['season'] == 4]['count'].values
"""Assumptions
1. Independence: The observations are independent of each other.
2. Normality: We can test this using QQ-plots or the Shapiro-Wilk test (significance level = 0.05).
3. Equal Variances: We will assume the groups have equal variances.
Test for normality:
"""
# normality test using qqplots
fig, axs = plt.subplots(1, 4, figsize=(16, 4))
qqplot(data_1, line='s', ax=axs[0])
axs[0].set_title('QQ Plot for data_1')
qqplot(data_2, line='s', ax=axs[1])
axs[1].set_title('QQ Plot for data_2')
qqplot(data_3, line='s', ax=axs[2])
axs[2].set_title('QQ Plot for data_3')
qqplot(data_4, line='s', ax=axs[3])
axs[3].set_title('QQ Plot for data_4')
plt.tight_layout()
plt.show()
"""The plots further hint that the data are not normally distributed. Let us confirm using the Shapiro-Wilk test."""
data_list = [data_1, data_2, data_3, data_4]
seasons = ["Season 1", "Season 2", "Season 3", "Season 4"]
for season, sample in zip(seasons, data_list):  # avoid shadowing the `data` dataframe
    stat, p_value = shapiro(sample)
    print(f"Shapiro-Wilk Test for {season}:")
    print("Test Statistic:", stat)
    print("p-value:", p_value)
    print("Data is normally distributed" if p_value > 0.05 else "Data is not normally distributed")
    print()
"""The Shapiro-Wilk test confirms that the data is not normally distributed.
We'll use ANOVA first, but we'll double-check our findings with the Kruskal-Wallis test because it is more robust when ANOVA's assumptions are not met.
"""
def perform_anova(*args, alpha=0.05):
    f_stat, p_value = f_oneway(*args)
    print("F-Statistic:", f_stat)
    print("P-value:", p_value)
    if p_value < alpha:
        print("\nReject the null hypothesis.")
        print("Number of cycles rented is different in different seasons.")
    else:
        print("\nFail to reject null hypothesis.")
        print("Number of cycles rented is similar in different seasons.")
perform_anova(data_1, data_2, data_3, data_4)
def perform_kruskal(*args, alpha=0.05):
    kruskal_stat, p_value = kruskal(*args)
    print("Kruskal-Statistic:", kruskal_stat)
    print("P-value:", p_value)
    if p_value < alpha:
        print("\nReject the null hypothesis.")
        print("Number of cycles rented is different in different seasons.")
    else:
        print("\nFail to reject null hypothesis.")
        print("Number of cycles rented is similar in different seasons.")
perform_kruskal(data_1, data_2, data_3, data_4)
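"""Kruskal-Wallis only says that at least one season differs, not which ones. Pairwise
Mann-Whitney U tests with a Bonferroni correction are a common follow-up. A minimal sketch
on synthetic seasonal samples (the distributions below are illustrative assumptions):
"""
from itertools import combinations
import numpy as np
from scipy.stats import mannwhitneyu

rng = np.random.default_rng(1)
demo_groups = {f"season_{s}": rng.exponential(scale=100 + 60 * s, size=200)
               for s in range(1, 5)}  # hypothetical seasonal `count` samples

pairs = list(combinations(demo_groups, 2))
adj_alpha = 0.05 / len(pairs)  # Bonferroni-adjusted significance level
for g1_name, g2_name in pairs:
    u_stat, p = mannwhitneyu(demo_groups[g1_name], demo_groups[g2_name], alternative="two-sided")
    verdict = "differ" if p < adj_alpha else "no evidence of difference"
    print(f"{g1_name} vs {g2_name}: p = {p:.3g} -> {verdict}")

"""Substituting data_1 .. data_4 for the synthetic groups applies the same post-hoc to the real seasons."""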
"""### Conclusion
Season does have an effect on the number of cycles being rented!
## Problem 4: Check if weather is dependent on the season
"""
# visual analysis
data = pd.read_csv("https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/428/original/bike_sharing.csv?1642089089")
sns.kdeplot(data = data, x="weather", hue="season")
plt.show()
sns.countplot(data = data, x="weather", hue="season")
plt.show()
data_table = pd.crosstab(data['season'], data['weather'])
data_table
"""**Null Hypothesis (H0):** Weather is independent of the season
**Alternate Hypothesis (H1):** Weather is not independent of the season
**Significance level (alpha):** 0.05
We will use the **chi-square** test of independence to test the hypothesis defined above.
Assumptions:
1. Random Sampling: Not needed, as we have the full population.
2. Independence: The observations are independent of each other.
3. Large enough sample size: For weather type 4, we do not have sufficiently large data.
"""
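"""One common remedy for the sparse weather-4 cell is to pool it into a neighbouring category
so every expected count stays at or above 5. A minimal sketch on a hypothetical contingency
table (the counts below are illustrative, not the real season-by-weather crosstab):
"""
import pandas as pd
from scipy.stats import chi2_contingency

demo_table = pd.DataFrame(
    {1: [1500, 1600, 1550, 1700],
     2: [700, 650, 720, 680],
     3: [200, 210, 190, 205],
     4: [1, 0, 0, 0]},   # very sparse column, like weather 4
    index=[1, 2, 3, 4],  # seasons
)

# Pool the sparse weather-4 column into weather 3, then run the test.
demo_table[3] = demo_table[3] + demo_table[4]
demo_table = demo_table.drop(columns=4)

chi2_demo_stat, demo_p, demo_dof, demo_expected = chi2_contingency(demo_table)
print("chi2 =", round(chi2_demo_stat, 2), "p =", demo_p, "dof =", demo_dof)
# All expected counts now satisfy the >= 5 rule of thumb.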
def chi_square_test(data, variable1, variable2, alpha=0.05):
    data_table = pd.crosstab(data[variable1], data[variable2])
    val = chi2_contingency(data_table)
    expected_values = val[3]  # expected frequencies
    nrows, ncols = data_table.shape
    dof = (nrows - 1) * (ncols - 1)
    chi_sqr_statistic = sum((data_table.values.flatten() - expected_values.flatten())**2 / expected_values.flatten())
    critical_val = chi2.ppf(q=1 - alpha, df=dof)
    p_val = 1 - chi2.cdf(x=chi_sqr_statistic, df=dof)
    if p_val <= alpha:
        print(f"\nSince the p-value is less than alpha ({alpha}), \nwe reject the Null Hypothesis, \nmeaning that {variable2} is dependent on {variable1}.")
    else:
        print(f"\nSince the p-value is greater than alpha ({alpha}), \nwe do not reject the Null Hypothesis.")
chi_square_test(data, 'season', 'weather')