-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexploratory_analysis.py
231 lines (199 loc) · 9.21 KB
/
exploratory_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.path as mplPath
from scipy.stats import ttest_ind, f_oneway, lognorm, levy, skew, chisquare
import numpy as np
import seaborn as sns
df = pd.read_csv('2015_Green_Taxi_Trip_Data.csv')
#print df.describe()
# List unique values in the df['pickup_datetime'] column
# df.pickup_datetime.unique()
# Remove some features (columns) from dataframe
df = df.drop('vendorid', 1)
df = df.drop('Extra', 1)
df = df.drop('MTA_tax', 1)
df = df.drop('Ehail_fee', 1)
df = df.drop('Improvement_surcharge', 1)
df = df.drop('Store_and_fwd_flag', 1)
df = df.drop('Payment_type', 1)
#-------------------------------------------------------------
import datetime as dt
df['Pickup_dt'] = df.pickup_datetime.apply(lambda x:dt.datetime.strptime(x,"%m/%d/%Y %I:%M:%S %p"))
# datetime format: 01/10/2015 05:24:59 PM
df['Pickup_hour'] = df.Pickup_dt.apply(lambda x:x.hour)
#print df.describe()
#print list(df) # Print a list of all the dataframe columns :)
# Mean and Median of trip distance by pickup hour
# I will generate the table but also generate a plot for a better visualization
generate_trip_distance_graph = False
if generate_trip_distance_graph:
fig,ax = plt.subplots(1,1,figsize=(9,5)) # prepare fig to plot mean and media values
# use a pivot table to aggregate Trip_distance by hour
table1 = df.pivot_table(index='Pickup_hour', values='Trip_distance',aggfunc=('mean','median',np.std)).reset_index()
# rename columns
table1.columns = ['Hour','Mean_distance','Median_distance','Std_distance']
#table1[['Mean_distance','Median_distance']].plot(ax=ax)
plt.errorbar(table1.Hour, table1.Mean_distance, table1.Std_distance, fmt='-o')
plt.ylabel('Metric (miles)')
plt.xlabel('Hours after midnight')
plt.title('Distribution of trip distance by pickup hour')
#plt.xticks(np.arange(0,30,6)+0.35,range(0,30,6))
plt.xlim([0,23])
plt.savefig('Question3_1.jpeg',format='jpeg')
plt.show()
print '-----Trip distance by hour of the day-----\n'
print tabulate(table1.values.tolist(),["Hour","Mean distance","Median distance"])
generate_trip_distance_graph_day = False
if generate_trip_distance_graph_day:
df['Pickup_day'] = df.Pickup_dt.apply(lambda x:x.day)
fig,ax = plt.subplots(1,1,figsize=(9,5)) # prepare fig to plot mean and media values
# use a pivot table to aggregate Trip_distance by hour
table2 = df.pivot_table(index='Pickup_day', values='Trip_distance',aggfunc=('mean','median')).reset_index()
# rename columns
table2.columns = ['Day','Mean_distance','Median_distance']
table2[['Mean_distance','Median_distance']].plot(ax=ax)
plt.ylabel('Metric (miles)')
plt.xlabel('Day')
plt.title('Distribution of trip distance by pickup week day')
#plt.xticks(np.arange(0,30,6)+0.35,range(0,30,6))
plt.xlim([0,23])
plt.savefig('distance_by_day.jpeg',format='jpeg')
plt.show()
# Histogram of times
if False:
df = df[df.Trip_distance < 20]
G = sns.distplot( df.Pickup_hour.dropna() ) #set_axis_labels( "Trip Distance","Counts")
G.set_xlabel("Hours after midnight",size = 14,color="black")
G.set_ylabel("Count",size = 14,color="black")
G.tick_params(labelsize=12,labelcolor="black")
plt.xlim([0,23])
#G.figure.set_size_inches(14,9)
sns.plt.show()
# Joint HexBin plot
if False:
nstd=1.5
df = df[df.Trip_distance <2.882150+nstd*2.947592]
df = df[df.Trip_distance >2.882150-nstd*2.947592]
df = df[df.Total_amount < 14.795530+nstd* 12.065141]
df = df[df.Total_amount > 14.795530-nstd* 12.065141]
jp1 = sns.jointplot( x="Trip_distance" , y="Total_amount" , data=df, kind="hex", color = 'k').set_axis_labels("Trip distance (miles)","Total amount (USD)")
sns.plt.show()
jp1.savefig('jointplot_dist_amount.png')
# Joint HexBin plot
if False:
nstd=1
df = df[df.Trip_distance <2.882150+nstd*2.947592]
df = df[df.Trip_distance >2.882150-nstd*2.947592]
df = df[df.Tip_amount < 1.1 + nstd*1.75]
df = df[df.Tip_amount > 1.1 - nstd*1.75]
df = df[df.Tip_amount > 0]
jp1 = sns.jointplot(x="Trip_distance", y="Tip_amount",data=df,kind="hex",color='k').set_axis_labels("Trip distance (miles)","Tip amount (USD)")
sns.plt.show()
#jp1.savefig('jointplot_hours_amount.png')
# Correlation plot
if False:
df['Pickup_day'] = df.Pickup_dt.apply(lambda x:x.day)
#df = df.drop('pickup_datetime')
#df = df.drop('dropoff_datetime')
#df = df.drop('Pickup_dt')
sns.linearmodels.corrplot(df, annot=False, diag_names=False)
sns.plt.show()
quit()
#-------------------AIRPORT TRIPS-------------------------------
# select airport trips
airports_trips = df[(df.rate_code==2) | (df.rate_code==3)]
print "Number of trips to/from NYC airports: ", airports_trips.shape[0]
print "Average fare (calculated by the meter) of trips to/from NYC airports: $", airports_trips.Fare_amount.mean(),"per trip"
print "Average total charged amount (before tip) of trips to/from NYC airports: $", airports_trips.Total_amount.mean(),"per trip"
# create a vector to contain Trip Distance for
v2 = airports_trips.Trip_distance # airport trips
v3 = df.loc[~df.index.isin(v2.index),'Trip_distance'] # non-airport trips
# remove outliers: exclude any data point located further than 3 standard deviations of the
# median point and plot the histogram with 30 bins
v2 = v2[~((v2-v2.median()).abs()>3*v2.std())]
v3 = v3[~((v3-v3.median()).abs()>3*v3.std())]
# define bins boundaries
bins = np.histogram(v2,normed=True)[1]
h2 = np.histogram(v2,bins=bins,normed=True)
h3 = np.histogram(v3,bins=bins,normed=True)
# plot distributions of trip distance normalized among groups
#fig,ax = plt.subplots(1,2,figsize = (15,4))
fig,ax = plt.subplots(2,1,figsize = (8,10))
w = .4*(bins[1]-bins[0])
ax[0].bar(bins[:-1],h2[0],alpha=1,width=w,color='b')
ax[0].bar(bins[:-1]+w,h3[0],alpha=1,width=w,color='g')
ax[0].legend(['Airport trips','Non-airport trips'],loc='best',title='group')
ax[0].set_xlabel('Trip distance (miles)')
ax[0].set_ylabel('Group normalized trips count')
ax[0].set_title('A. Trip distance distribution')
#ax[0].set_yscale('log')
airports_trips.Pickup_hour.value_counts(normalize=True).sort_index().plot(ax=ax[1])
df.loc[~df.index.isin(v2.index),'Pickup_hour'].value_counts(normalize=True).sort_index().plot(ax=ax[1])
ax[1].set_xlabel('Hours after midnight')
ax[1].set_ylabel('Group normalized trips count')
ax[1].set_title('B. Hourly distribution of trips')
ax[1].legend(['Airport trips','Non-airport trips'],loc='best',title='group')
plt.savefig('Question3_2.jpeg',format='jpeg')
plt.show()
#---------------------------------------------------------------------
if False:
nstd=2
df = df[df.Trip_distance <2.882150+nstd*2.947592]
df = df[df.Trip_distance >2.882150-nstd*2.947592]
df = df[df.Tip_percentage >0]
df = df[abs(df.Tip_percentage-df.Tip_percentage.mean() ) < 3*df.Tip_percentage.std() ]
jp1 = sns.jointplot(x="Trip_distance",y="Tip_percentage",data=df,kind="hex",color='k').set_axis_labels("Trip_distance","Tip Percentage (%)")
sns.plt.show()
jp1.savefig('jointplot_dist_tip_percentage.png')
quit()
if False:
nstd=2
df = df[df.Trip_distance <2.882150+nstd*2.947592]
df = df[df.Trip_distance >2.882150-nstd*2.947592]
df = df[df.Total_amount < 14.795530+nstd* 12.065141]
df = df[df.Total_amount > 14.795530-nstd* 12.065141]
jp1 = sns.jointplot( x="Trip_distance" , y="Total_amount" , data=df , kind="reg").set_axis_labels("Trip_distance","Total_amount")
sns.plt.show()
jp1.savefig('jointplot_dist_amount.png')
"""
CORRELATION TABLE
sns.linearmodels.corrplot(df, annot=False, diag_names=False)
sns.plt.show()
"""
"""
TRIP DISTANCE DISTRIBUTION PLOT
df = df[df.Trip_distance < 20]
G = sns.distplot( df.Trip_distance.dropna() ) #set_axis_labels( "Trip Distance","Counts")
G.set_xlabel("Trip Distance (miles)",size = 14,color="black")
G.set_ylabel("Count",size = 14,color="black")
G.tick_params(labelsize=12,labelcolor="black")
#G.figure.set_size_inches(14,9)
sns.plt.show()
"""
"""
jp1 = sns.jointplot( x="Optical_GAP" , y="cGap" , data=df , kind="reg").set_axis_labels("Exp. optical gap", "b3lyp gap")
jp1.savefig('jointplot_Ogap_b3lypgap.png')
## Correlation Plot
sns.linearmodels.corrplot(df, annot=False, diag_names=False)
sns.plt.show()
"""
# df=pd.io.gbq.read_gbq("""
# SELECT ROUND(pickup_latitude, 4) as lat, ROUND(pickup_longitude, 4) as long, COUNT(*) as num_pickups
# FROM [nyc-tlc:yellow.trips_2015]
# WHERE (pickup_latitude BETWEEN 40.61 AND 40.91) AND (pickup_longitude BETWEEN -74.06 AND -73.77 )
# GROUP BY lat, long
# """, project_id='taxi-1029')
df = df[df.Dropoff_latitude > 35]
df = df[df.Dropoff_longitude > -76]
pd.options.display.mpl_style = 'default' #Better Styling
new_style = {'grid': False} #Remove grid
matplotlib.rc('axes', **new_style)
from matplotlib import rcParams
rcParams['figure.figsize'] = (17.5, 17) #Size of figure
rcParams['figure.dpi'] = 250
#P.set_axis_bgcolor('black') #Background Color
#P = df.plot(kind='scatter', x='Dropoff_longitude', y='Dropoff_latitude',color='red',xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.6)
P = df.plot(kind='scatter', x='Dropoff_longitude', y='Dropoff_latitude',color='red',xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.3)
P.set_axis_bgcolor('black')
plt.show()