-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall_career_length.py
124 lines (97 loc) · 3.73 KB
/
all_career_length.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd
# Console display settings: no limit on the number of columns shown, no
# wrapping of wide frames across several lines, and the legacy -1 value for
# the column-width option.
# NOTE(review): -1 for max_colwidth is the old "unlimited" spelling; newer
# pandas releases expect None instead -- confirm against the installed version.
for _option, _value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', -1),
):
    pd.set_option(_option, _value)

# Location1: ride records; the rest of the script reads its "requested_at"
# and "driver" columns.  NOTE(review): absolute, machine-specific path.
Location1 = "/Users/xinhao/Downloads/lyft_data_challenge/final/erin_code.csv"
# Location2: driver records with "driver_id" and "driver_onboard_date".
Location2 = "driver_ids.csv"

drive_df = pd.read_csv(Location1)
driver_df = pd.read_csv(Location2)
# Map each driver_id to its driver_onboard_date.
# Built straight from the two columns instead of row-by-row iterrows():
# same mapping (a later duplicate driver_id still overwrites an earlier one),
# without constructing a Series per row.
drivers = dict(zip(driver_df["driver_id"], driver_df["driver_onboard_date"]))
# Rewrite selected "requested_at" values in place, then order the rides so
# that the largest "requested_at" value comes first.
# Author's note on the data: most recent ride is No. 365,
# 2016-06-26 23:57:45; the other timestamp noted with it is 2016-03-28 05:48:18.
for idx, stamp in drive_df["requested_at"].items():
    # Only the first 10 characters are inspected, split on "-".
    fields = stamp[:10].split("-")
    if len(fields[2]) == 1:
        # The value is replaced by its first three fields re-joined with "-",
        # which discards anything after them.
        # NOTE(review): the fields are re-joined unchanged (no zero padding);
        # confirm this is the intended "date format" change.
        # NOTE(review): the source indentation was lost; the write is assumed
        # to belong inside this `if`, since the new value is only built here.
        # `.at` replaces DataFrame.set_value, which current pandas no longer
        # provides.
        drive_df.at[idx, "requested_at"] = (
            fields[0] + '-' + fields[1] + '-' + fields[2]
        )

# Sorts on the column values as stored (no date parsing is done here).
Sorted = drive_df.sort_values(['requested_at'], ascending=False)
# find inactive drivers
# 20 days ago: 6/06/16
all_drivers = driver_df["driver_id"]
on_board_time = driver_df["driver_onboard_date"]
current_drivers_workhr = {}

# Active drivers: walk the rides in sorted (descending) order and collect
# drivers until the first ride whose timestamp starts with the cut-off.
# NOTE(review): the cut-off literal uses "/" while the rest of the script
# splits dates on "-"; if it never matches, every driver counts as active.
active_drivers = set()
for idx, time in Sorted["requested_at"].items():
    if time.startswith('6/06/16'):
        break
    active_drivers.add(Sorted["driver"][idx])

# Current drivers: every driver with at least one ride, restricted to those
# that have an onboard date.  The filter builds a new list instead of calling
# remove() on the list being iterated, which skipped the element following
# each removal and so left some drivers without an onboard date in place.
current_drivers = [dr for dr in set(Sorted["driver"]) if dr in drivers]

# Counts recorded by the original author (before the filtering fix above):
# 625 active drivers, 837 current drivers, 212 current-but-not-active.
def compute_length(on_board_date, current_date, year=2016):
    """Return the number of days from ``on_board_date`` to ``current_date``.

    Both dates are strings whose first two "-"-separated fields are the
    month and the day, in that order (e.g. ``"3-28"`` or ``"3-28-16"``);
    any further fields are ignored.  Since no year is read from the
    strings, both dates are placed in ``year``.

    Real calendar arithmetic is used, so any pair of months works.  The
    previous hand-written table only covered March through June and fell
    back to ``31 - s_day + c_day`` for every other combination.

    Args:
        on_board_date: start date string, month first then day.
        current_date: end date string, month first then day.
        year: calendar year both dates are assumed to fall in.

    Returns:
        The day difference as an int; negative when ``current_date`` is
        earlier in the year than ``on_board_date``.

    Raises:
        IndexError: if a string has fewer than two "-"-separated fields.
        ValueError: if a field is not an integer or the month/day pair is
            not a valid date in ``year``.
    """
    from datetime import date  # function-scope import keeps the block self-contained

    # Trace output of the two raw inputs, kept from the original.
    print(on_board_date, current_date)

    start_month, start_day = (int(part) for part in on_board_date.split("-")[:2])
    end_month, end_day = (int(part) for part in current_date.split("-")[:2])

    return (date(year, end_month, end_day) - date(year, start_month, start_day)).days
# find the last ride of every current driver
# Only drivers that also have an onboard date are considered, so the lookup
# in the second loop cannot fail; a set makes the per-ride membership test
# constant time.
_eligible = {dr for dr in current_drivers if dr in drivers}
for c_dr, time in zip(Sorted["driver"], Sorted["requested_at"]):
    # Sorted is in descending "requested_at" order, so the FIRST ride seen for
    # a driver is the last (largest) one.  Keeping the first hit fixes the
    # earlier behaviour of overwriting on every ride, which ended up storing
    # each driver's smallest "requested_at" value instead.
    if c_dr in _eligible and c_dr not in current_drivers_workhr:
        current_drivers_workhr[c_dr] = time

# find the career length: days from the onboard date to that last ride.
# Each stored ride timestamp is replaced by the computed day count.
for dr in current_drivers_workhr:
    print(drivers[dr], current_drivers_workhr[dr])
    current_drivers_workhr[dr] = compute_length(drivers[dr], current_drivers_workhr[dr])

# Author's note: an earlier run restricted to drop-out drivers reported an
# average career length of 27.995283018867923.
# Write each driver's career length to a CSV file: one row per driver with
# the columns driver_id and career_length (the DataFrame index is written too,
# as to_csv is called with its defaults).
# The rows are built with a comprehension so that no module-level loop
# variable named `id` shadows the builtin.
data = [[driver, days] for driver, days in current_drivers_workhr.items()]
inact_df = pd.DataFrame(data, columns=['driver_id', 'career_length'])
inact_df.to_csv(r'all_career.csv')