AyushMann29 committed on
Commit
3908f31
·
verified ·
1 Parent(s): bbb5b01

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +346 -0
main.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
2
+ from sklearn.metrics import accuracy_score
3
+ from flask import Flask, request, jsonify
4
+ from flask_cors import CORS
5
+ import pandas as pd
6
+ import numpy as np
7
+ from sklearn.model_selection import train_test_split
8
+ from xgboost import XGBClassifier
9
+ from sklearn.metrics import precision_score, recall_score, f1_score
10
+ from io import StringIO
11
+ import os
12
+
13
+ # ===============================================================================
14
+ # Input Validation Functions
15
+ # ===============================================================================
16
def _scalar_range_error(value, low, high, label):
    """Return an error message if scalar ``value`` is outside ``[low, high]``, else None."""
    if value is None:
        # A missing field is not a range violation.
        return None
    try:
        out_of_range = value < low or value > high
    except TypeError:
        # Strings, lists, dicts, ... cannot be ordered against numbers.
        return f"Invalid {label}: {value!r}. Must be a number between {low} and {high}."
    if out_of_range:
        return f"Invalid {label}: {value}. Must be between {low} and {high}."
    return None


def _column_range_error(frame, column, low, high, label):
    """Return an error message if any row of ``frame[column]`` is outside ``[low, high]``, else None."""
    if column not in frame.columns:
        return None
    values = frame[column]
    try:
        out_of_range = (values < low) | (values > high)
    except TypeError:
        # Raised when the column holds values that cannot be compared to numbers.
        return f"Invalid {label}: column '{column}' contains non-numeric values."
    offending = frame[out_of_range]
    if not offending.empty:
        return f"Invalid {label} in rows: {offending.index.tolist()}"
    return None


def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
    """
    Validate trip counts and earnings against inclusive bounds.

    Args:
        data: A dict describing one record, or a pandas DataFrame of records.
            Any other type is not inspected and is reported as valid.
        trips_col: Key/column holding the number of trips.
        earnings_col: Key/column holding the earnings.
        min_trips, max_trips: Inclusive bounds for the number of trips.
        min_earnings, max_earnings: Inclusive bounds for the earnings.

    Returns:
        (True, None) if valid, else (False, error_message). Trips are checked
        before earnings and the first failure is returned. Missing keys or
        columns, None values and NaN values are not treated as errors.
        Values that cannot be compared with numbers are reported as invalid
        instead of raising TypeError.
    """
    checks = (
        (trips_col, min_trips, max_trips, 'number of trips'),
        (earnings_col, min_earnings, max_earnings, 'earnings'),
    )

    if isinstance(data, dict):
        for column, low, high, label in checks:
            message = _scalar_range_error(data.get(column, None), low, high, label)
            if message is not None:
                return False, message
    elif isinstance(data, pd.DataFrame):
        for column, low, high, label in checks:
            message = _column_range_error(data, column, low, high, label)
            if message is not None:
                return False, message
    return True, None
39
+
40
+ # ==============================================================================
41
+ # Step 1: Initialize Flask App and Model Variables
42
+ # ==============================================================================
43
# Flask application object; the route decorators further down register on it.
app = Flask(__name__)
# Enable CORS so the frontend can call this API.
CORS(app)

# Module-level state shared by the request handlers. main() rebinds all three
# once training has finished.
model = train_features_columns = None  # trained classifier / training feature columns
evaluation_metrics = {}  # metric name -> score
50
+
51
+ # ==============================================================================
52
+ # Step 2: Core ML Functions (from your original script)
53
+ # ==============================================================================
54
def load_and_preprocess_data(csv_path):
    """
    Load the training CSV and turn it into a model-ready DataFrame.

    Steps: drop the 'Partner ID' column if present, one-hot encode the
    object-dtype feature columns (first level dropped), coerce every feature
    column to numeric, and drop rows that contain NaN.

    The target column 'Creditworthy' is never one-hot encoded or coerced, so
    it is still present under its own name even when its labels are strings.

    Args:
        csv_path: Anything accepted by ``pandas.read_csv``.

    Returns:
        (DataFrame, target_column_name), or (None, None) if the file was not
        found.
    """
    target_column = 'Creditworthy'

    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file {csv_path} was not found.")
        return None, None

    # Drop columns that are not features for the model
    df = df.drop(columns=['Partner ID'], errors='ignore')

    # Encode only feature columns: encoding the target would replace it with
    # dummy columns and leave nothing to train against.
    categorical_cols = [
        col
        for col in df.select_dtypes(include=['object']).columns
        if col != target_column
    ]
    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Ensure all remaining feature columns are numeric
    feature_cols = [col for col in df.columns if col != target_column]
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop any rows that now have NaN values after the coercion
    df = df.dropna()

    return df, target_column
84
+
85
def train_model(df, target_column):
    """
    Fit an XGBoost classifier on an 80/20 split of ``df``.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target_column: Name of the label column in ``df``.

    Returns:
        (fitted classifier, held-out features, held-out labels).
    """
    features = df.drop(columns=[target_column])
    labels = df[target_column]

    # Fixed random_state keeps the split reproducible between runs.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    train_features, holdout_features, train_labels, holdout_labels = split

    classifier = XGBClassifier(eval_metric='logloss')
    classifier.fit(train_features, train_labels)

    return classifier, holdout_features, holdout_labels
98
+
99
def evaluate_model(model, X_test, y_test):
    """
    Score the trained model on the held-out split.

    Also prints per-group accuracy and selection rate (Fairlearn) when
    ``X_test`` contains one of the known gender column names; those group
    metrics are printed only and are not part of the return value.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1_score'.
    """
    predicted = model.predict(X_test)

    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }
    scores = {name: scorer(y_test, predicted) for name, scorer in scorers.items()}

    # Use the first matching column name as the sensitive attribute.
    candidate_names = ('gender', 'Gender', 'partner_gender', 'Partner Gender')
    group_column = next(
        (name for name in candidate_names if name in X_test.columns),
        None,
    )
    if group_column is None:
        print("No sensitive attribute found for group fairness metrics.")
        return scores

    group_frame = MetricFrame(
        metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
        y_true=y_test,
        y_pred=predicted,
        sensitive_features=X_test[group_column],
    )
    print("\nFairness metrics by group (Fairlearn):")
    print(group_frame.by_group)
    return scores
129
+
130
def preprocess_user_data(user_df, train_columns):
    """
    Prepare user data so its columns match the training feature matrix.

    Categorical columns are one-hot encoded WITHOUT dropping a level. Dropping
    the "first" level here would drop whichever category happens to sort first
    in the request (for a single row: its only category), not the baseline
    that was dropped at training time, silently zeroing the feature.
    Aligning to ``train_columns`` afterwards removes the baseline dummy and
    any unknown columns, and adds training columns the request lacks as 0.

    Args:
        user_df: DataFrame of raw user records (not modified).
        train_columns: Feature column labels, in training order.

    Returns:
        A new DataFrame whose columns are exactly ``train_columns``, in order.
    """
    # With no explicit ``columns``, get_dummies encodes every object, string
    # and category dtype column and leaves the other columns untouched.
    encoded = pd.get_dummies(user_df, drop_first=False)

    # One call: add missing columns as 0, drop extras, reorder to training order.
    return encoded.reindex(columns=train_columns, fill_value=0)
154
+
155
+ # ==============================================================================
156
+ # Step 2.5: New Function to Save Data to CSV
157
+ # ==============================================================================
158
def save_to_csv(data_df, filename='online_testcases.csv'):
    """
    Append ``data_df`` to ``filename`` as CSV rows.

    Columns that contain only missing values (including an empty
    'Creditworthy' column) are left out. The header row is written only when
    the file does not exist yet; the index is never written.
    """
    label_col = 'Creditworthy'
    label_is_blank = label_col in data_df.columns and data_df[label_col].isnull().all()
    if label_is_blank:
        data_df = data_df.drop(columns=[label_col])

    # Remove every remaining column that holds nothing but NaN.
    cleaned = data_df.dropna(axis=1, how='all')

    write_header = not os.path.isfile(filename)
    cleaned.to_csv(filename, mode='a', header=write_header, index=False)
    print(f"Data successfully saved to {filename}")
171
+
172
+ # ==============================================================================
173
+ # Step 3: API Endpoint for Prediction (Single Input)
174
+ # ==============================================================================
175
@app.route('/predict', methods=['POST'])
def predict():
    """
    Predict eligibility for a single JSON record.

    Responses:
        200: {'prediction': 'Eligible' | 'Not Eligible', 'metrics': {...}}
        400: the body is not a JSON object, or validate_input rejected it.
        500: the model has not been trained, or prediction/logging failed.
    """
    # evaluation_metrics starts as {} and is never None, so only the model and
    # the training columns indicate whether training has run.
    if model is None or train_features_columns is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    # silent=True yields None for a missing, malformed or non-JSON body rather
    # than raising, so a client mistake is answered with 400 instead of being
    # swallowed by the generic handler below and reported as 500.
    user_input = request.get_json(silent=True)
    if not isinstance(user_input, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    try:
        # Input validation
        valid, error_msg = validate_input(user_input)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_df = pd.DataFrame([user_input])
        # Preprocess a copy so the raw input stays intact for logging
        user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns)
        prediction = model.predict(user_features_processed)
        result = "Eligible" if prediction[0] == 1 else "Not Eligible"

        # Log the original user input together with the prediction
        user_df['Creditworthy_Prediction'] = result
        save_to_csv(user_df)

        return jsonify({
            'prediction': result,
            'metrics': evaluation_metrics
        })

    except Exception as e:
        # Top-level boundary: report the failure as JSON instead of an HTML error page
        return jsonify({'error': str(e)}), 500
210
+
211
+ # ==============================================================================
212
+ # Step 4: API Endpoint for Bulk Prediction (CSV Upload)
213
+ # ==============================================================================
214
def _fairness_report(scored_df, y_true, sensitive_col='Partner Type'):
    """Return (per-group fairness metrics dict, observation text) for scored rows."""
    y_pred = (scored_df['Creditworthy_Prediction'] == 'Eligible').astype(int)
    # If Creditworthy is string, convert to binary
    if y_true.dtype == object:
        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)
    else:
        y_true_bin = y_true

    mf = MetricFrame(
        metrics={
            'selection_rate': selection_rate,
            'equal_opportunity': true_positive_rate
        },
        y_true=y_true_bin,
        y_pred=y_pred,
        sensitive_features=scored_df[sensitive_col]
    )
    by_group = mf.by_group
    fairness_metrics = {
        'selection_rate': by_group['selection_rate'].to_dict(),
        'equal_opportunity': by_group['equal_opportunity'].to_dict()
    }

    # Compare the groups with the highest and lowest approval (selection) rate
    rates = by_group['selection_rate']
    max_group = rates.idxmax()
    min_group = rates.idxmin()
    diff = rates[max_group] - rates[min_group]
    observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
    if abs(diff) > 0.1:
        observation += " Mitigation recommended: Consider reweighting or post-processing."
    return fairness_metrics, observation


@app.route('/predict_csv', methods=['POST'])
def predict_csv():
    """
    Predict eligibility for every row of an uploaded CSV file.

    The file is read from the multipart field 'file'. If it contains a
    'Creditworthy' column with at least one value and a 'Partner Type'
    column, per-group fairness metrics are included in the response.

    Responses:
        200: {'predictions': [...], 'metrics': {...},
              'fairness_metrics': {...}, 'fairness_observation': str}
        400: no file, undecodable/unparsable file, or failed validation.
        500: the model has not been trained, or processing failed.
    """
    # Same guard as /predict: without it an untrained model surfaces as an
    # opaque AttributeError message.
    if model is None or train_features_columns is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not file:
        return jsonify({'error': 'An unknown error occurred.'}), 500

    try:
        # 'utf-8-sig' strips a leading byte-order mark; with plain 'utf-8'
        # the mark would become part of the first column name.
        try:
            csv_text = file.read().decode('utf-8-sig')
        except UnicodeDecodeError:
            return jsonify({'error': 'Error processing file: the upload must be a UTF-8 encoded CSV.'}), 400
        try:
            input_df = pd.read_csv(StringIO(csv_text))
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as parse_error:
            return jsonify({'error': f"Error processing file: {str(parse_error)}"}), 400

        # A 'Creditworthy' column with no values at all is not ground truth.
        has_label_column = 'Creditworthy' in input_df.columns
        has_ground_truth = has_label_column and bool(input_df['Creditworthy'].notna().any())

        # Remove 'Creditworthy' column from features for prediction
        if has_label_column:
            y_true = input_df['Creditworthy']
            input_df_features = input_df.drop(columns=['Creditworthy'])
        else:
            y_true = None
            input_df_features = input_df

        # Remove any other empty columns
        input_df_features = input_df_features.dropna(axis=1, how='all')

        # Input validation for all rows
        valid, error_msg = validate_input(input_df_features)
        if not valid:
            return jsonify({'error': error_msg}), 400

        # Preprocess a copy, predict, and attach the labels to the original rows
        user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)
        predictions = model.predict(user_features_processed)
        input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')

        # Remove any empty columns again before saving/returning
        input_df = input_df.dropna(axis=1, how='all')

        # Save the entire DataFrame to the CSV file
        save_to_csv(input_df)

        # --- Fairness & Bias Reporting ---
        sensitive_col = 'Partner Type'
        fairness_metrics = {}
        if not has_ground_truth:
            fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
        elif sensitive_col not in input_df.columns:
            fairness_observation = f"Fairness metrics require a '{sensitive_col}' column and are not available for this upload."
        else:
            fairness_metrics, fairness_observation = _fairness_report(input_df, y_true, sensitive_col)

        # Missing cells become None (JSON null); a bare NaN is not valid JSON
        # and cannot be parsed by the browser.
        results = [
            {key: (None if pd.isna(value) else value) for key, value in record.items()}
            for record in input_df.to_dict('records')
        ]
        return jsonify({
            'predictions': results,
            'metrics': evaluation_metrics,
            'fairness_metrics': fairness_metrics,
            'fairness_observation': fairness_observation
        })
    except Exception as e:
        # Top-level boundary: log the traceback server-side, answer with JSON
        import traceback
        print(traceback.format_exc())
        return jsonify({'error': f"Error processing file: {str(e)}"}), 500
313
+
314
+
315
+ # ==============================================================================
316
+ # Step 5: Main function to train the model once and run the server
317
+ # ==============================================================================
318
def main():
    """
    Train the model once, publish it through the module globals, and start
    the Flask server. Returns early if the training data cannot be loaded.
    """
    global model, train_features_columns, evaluation_metrics

    print("--- Starting the Nova Backend ---")
    print("Step 1: Loading and preprocessing data...")
    training_csv = 'catalyst_train.csv'
    train_df, target_column = load_and_preprocess_data(training_csv)

    if train_df is None:
        print("Please ensure 'catalyst_train.csv' exists. Exiting.")
        return

    print("Step 2: Training the model and evaluating performance...")
    model, X_test, y_test = train_model(train_df, target_column)
    # Feature layout that incoming requests are aligned to.
    train_features_columns = train_df.drop(columns=[target_column]).columns
    evaluation_metrics = evaluate_model(model, X_test, y_test)

    print("\nModel trained successfully! Metrics:")
    for metric_name, score in evaluation_metrics.items():
        print(f"- {metric_name.capitalize()}: {score:.4f}")

    print("\n--- Starting Flask server on http://127.0.0.1:5000 ---")
    # NOTE(review): debug=True is passed to app.run here; confirm this script
    # is only ever used for local development.
    app.run(debug=True, port=5000, use_reloader=False)


if __name__ == "__main__":
    main()