-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
151 lines (105 loc) · 5.18 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#PRATHAMESH CHANDRAKANT CHAUDHARI
#Associated with Binghamton University
import pandas as pd
import my_connection
from datetime import datetime
def get_monthy_cost_distribution(user_email):
# Get the current date
current_date = datetime.now()
# Extract the month from the current date
current_month = current_date.month
print("Current Month:", current_month)
query = "select * from Transactions where customer_id = (select customer_id from Customer where email_address = '" + user_email + "')"
transactions = my_connection.select_all(query)
headers = ['Date', 'Description', 'Category', 'Cost', 'Currency', 'customer_id']
df = pd.DataFrame(transactions, columns=headers)
print(df)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
# Replace empty strings with NaN in the 'Cost' column
df['Cost'] = df['Cost'].replace('', pd.NA)
# Convert the 'Cost' column to float
df['Cost'] = pd.to_numeric(df['Cost'], errors='coerce')
# Get the current month
current_month = datetime.now().month
# Filter the DataFrame for rows in the current month
filtered_df = df[df['Date'].dt.month == current_month]
unique_categories = filtered_df['Category'].unique()
# print(unique_categories)
total_sum = 0
categorical_data = {}
for i in unique_categories:
cat_df = filtered_df[filtered_df['Category'] == i]
total_cost_per_cat = cat_df['Cost'].sum()
total_sum += total_cost_per_cat
categorical_data[i] = total_cost_per_cat
categorical_data_percentage = {}
for i in unique_categories:
total_cost_per_cat = categorical_data[i]
categorical_data_percentage[i] = (total_cost_per_cat/total_sum) * 100
print("categorical_data=", categorical_data)
print("categorical_data_percentage=", categorical_data_percentage)
# # # Display the total cost for each category
return categorical_data_percentage
def calculate_category_budgets(user_income, unique_categories, savings=0):
num_categories = len(unique_categories)
total_budget = (user_income - savings) / num_categories
category_budgets = {}
for i in unique_categories:
category_budgets[i] = total_budget
return category_budgets
def Expense_management_score(user_email):
query = "select customer_id, Income_amount, Saving_Percentage from Customer where email_address = '" + user_email + "'"
customer = my_connection.select_all(query)
user_income = float(customer[0][1])
Saving_Percentage = float(customer[0][2])
# savings = user_income * Saving_Percentage
query = "select * from Transactions where customer_id = (select customer_id from Customer where email_address = '" + user_email + "')"
transactions = my_connection.select_all(query)
headers = ['Date', 'Description', 'Category', 'Cost', 'Currency', 'customer_id']
df = pd.DataFrame(transactions, columns=headers)
print("df=", df)
unique_categories = df['Category'].unique()
# Calculate total expenses for each category
# category_expenses = df.groupby('Category')['Cost'].sum()
# Initialize total score
total_score = 0
# num_categories = len(unique_categories)
# print("num_categories=", num_categories)
# category_budgets = calculate_category_budgets(user_income, unique_categories, savings)
# print("category_budgets=", category_budgets)
# Get the current date
current_date = datetime.now()
# Extract the month from the current date
current_month = current_date.month
print("Current Month:", current_month)
filtered_df = df[df['Date'].dt.month == current_month]
print()
total_sum = 0
categorical_data = {}
for i in unique_categories:
cat_df = filtered_df[filtered_df['Category'] == i]
total_cost_per_cat = cat_df['Cost'].sum()
total_sum += total_cost_per_cat
categorical_data[i] = total_cost_per_cat
print("total_sum=", total_sum)
final_savings = user_income - total_sum
final_savings_percentage = final_savings/user_income
print(final_savings_percentage)
targeted_saving_percentage = Saving_Percentage
# # Iterate over each category
# for category, budget in category_budgets.items():
# # If the category exists in the dataset
# if category in unique_categories:
# print("total_score= ", total_score)
# expenses = category_expenses[category]
# # If expenses exceed the budget
# if expenses > budget:
# # Deduct points (you can adjust this based on your preference)
# deduction = (expenses - budget) * 100 / user_income
# total_score -= deduction
# else:
# # Give some positive points for staying within budget
# total_score += 10 # You can adjust this as needed
# else:
# print(f"Category '{category}' not found in the dataset.")
return float(final_savings_percentage), targeted_saving_percentage