# AyushMann29 / Project_Nova

import os
import traceback
from io import StringIO
from numbers import Real

import numpy as np
import pandas as pd
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
from flask import Flask, request, jsonify
from flask_cors import CORS
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
# ===============================================================================
# Input Validation Functions
# ===============================================================================
def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
    """
    Validate the trip-count and earnings fields of a single record or a table.

    Args:
        data: A dict (one record) or a pandas DataFrame (many records).
            Any other type is not inspected and is reported as valid.
        trips_col: Key / column name holding the number of trips.
        earnings_col: Key / column name holding the earnings.
        min_trips, max_trips: Inclusive bounds for the number of trips.
        min_earnings, max_earnings: Inclusive bounds for the earnings.

    Returns:
        (True, None) if every present value is a number inside its bounds,
        otherwise (False, error_message) for the first failing field.

    Missing fields, None and NaN are treated as "not provided" and pass.
    Non-numeric values (for example a JSON string, a boolean, or a text cell
    in a CSV column) are reported as invalid instead of raising TypeError
    from the range comparison.
    """
    checks = (
        (trips_col, 'number of trips', min_trips, max_trips),
        (earnings_col, 'earnings', min_earnings, max_earnings),
    )

    if isinstance(data, dict):
        for column, label, low, high in checks:
            value = data.get(column)
            if value is None:
                continue
            # bool is a subclass of int, but True/False is not a meaningful amount.
            if isinstance(value, bool) or not isinstance(value, Real):
                return False, f"Invalid {label}: {value!r}. Must be a number between {low} and {high}."
            if value < low or value > high:
                return False, f"Invalid {label}: {value}. Must be between {low} and {high}."
    elif isinstance(data, pd.DataFrame):
        for column, label, low, high in checks:
            if column not in data.columns:
                continue
            raw = data[column]
            numeric = pd.to_numeric(raw, errors='coerce')
            # A cell that held something but could not be parsed is invalid;
            # a cell that was empty to begin with is only "missing".
            unparseable = raw.notna() & numeric.isna()
            out_of_range = (numeric < low) | (numeric > high)
            bad_rows = data.index[(unparseable | out_of_range).to_numpy()].tolist()
            if bad_rows:
                return False, f"Invalid {label} in rows: {bad_rows}"
    return True, None
# ==============================================================================
# Step 1: Initialize Flask App and Model Variables
# ==============================================================================
# Shared state filled in by main() once training has finished; the request
# handlers below only read these names.
model = None
train_features_columns = None
evaluation_metrics = {}

# The Flask application, with CORS enabled so the frontend can call this API.
app = Flask(__name__)
CORS(app)
# ==============================================================================
# Step 2: Core ML Functions
# ==============================================================================
def load_and_preprocess_data(csv_path):
    """
    Load the training CSV and turn it into an all-numeric feature table.

    Args:
        csv_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        (df, target_column) where ``df`` holds the encoded features plus the
        untouched target column, or (None, None) if the file does not exist.

    The target column is deliberately excluded from one-hot encoding and from
    numeric coercion: encoding a text-valued target would replace it with
    dummy columns and the target column itself would disappear.
    """
    target_column = 'Creditworthy'
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file {csv_path} was not found.")
        return None, None

    # The identifier is not a feature for the model.
    df = df.drop(columns=['Partner ID'], errors='ignore')

    # Text-valued feature columns. The check is done on the dtype so it covers
    # both the classic object dtype and pandas' dedicated string dtype.
    categorical_cols = [
        name
        for name, dtype in df.dtypes.items()
        if name != target_column and pd.api.types.is_string_dtype(dtype)
    ]
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Force every remaining feature to be numeric; unparseable cells become NaN.
    for col in df.columns:
        if col != target_column:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rows with any missing value (including failed coercions) are discarded.
    df = df.dropna()
    return df, target_column
def train_model(df, target_column):
    """
    Fit an XGBoost classifier on an 80/20 train/test split of ``df``.

    Args:
        df: Preprocessed table containing the features and the target.
        target_column: Name of the column in ``df`` to predict.

    Returns:
        (classifier, held_out_features, held_out_labels); the held-out part
        is the 20% of rows not used for fitting (split seeded with 42).
    """
    labels = df[target_column]
    features = df.drop(target_column, axis=1)

    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    fit_features, held_out_features, fit_labels, held_out_labels = split

    classifier = XGBClassifier(eval_metric='logloss')
    classifier.fit(fit_features, fit_labels)
    return classifier, held_out_features, held_out_labels
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature table.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1_score'.

    As a side effect, prints per-group accuracy and selection rate (via
    Fairlearn's MetricFrame) when ``X_test`` has a column named 'gender',
    'Gender', 'partner_gender' or 'Partner Gender'; otherwise prints a notice
    that no sensitive attribute was found.
    """
    predicted = model.predict(X_test)

    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    scores = {label: scorer(y_test, predicted) for label, scorer in scorers}

    # First matching candidate wins, in the order listed.
    candidate_names = ('gender', 'Gender', 'partner_gender', 'Partner Gender')
    sensitive_name = next(
        (name for name in candidate_names if name in X_test.columns), None
    )
    if sensitive_name is None:
        print("No sensitive attribute found for group fairness metrics.")
        return scores

    group_frame = MetricFrame(
        metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
        y_true=y_test,
        y_pred=predicted,
        sensitive_features=X_test[sensitive_name],
    )
    print("\nFairness metrics by group (Fairlearn):")
    print(group_frame.by_group)
    return scores
def preprocess_user_data(user_df, train_columns):
    """
    Encode user-supplied rows so they line up with the training features.

    Args:
        user_df: Raw input rows (a single request or an uploaded table).
            The caller's frame is not modified.
        train_columns: Feature names, in order, that the model was fitted on.

    Returns:
        A DataFrame with exactly ``train_columns`` as its columns, in that
        order. Features absent from the input are filled with 0 and input
        columns unknown to the model are dropped.

    Categorical values are one-hot encoded WITHOUT dropping the first level.
    Dropping it here would remove whichever category happens to sort first
    in the request (for a single row: the only category), rather than the
    baseline chosen at training time, so the row would be encoded as the
    baseline no matter what was sent. Keeping all levels and then aligning to
    ``train_columns`` discards the training baseline automatically.
    """
    train_columns = list(train_columns)
    known_features = set(train_columns)
    prepared = user_df.copy()

    text_columns = [
        name
        for name, dtype in prepared.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    ]

    categorical_cols = []
    for name in text_columns:
        if name in known_features:
            # The model knows this column under its raw name, i.e. it was not
            # one-hot encoded during training: parse the text as a number
            # instead of turning it into an unknown dummy column.
            prepared[name] = pd.to_numeric(prepared[name], errors='coerce')
        else:
            categorical_cols.append(name)

    if categorical_cols:
        prepared = pd.get_dummies(prepared, columns=categorical_cols, drop_first=False)

    # Drop extras, add missing features as 0 and fix the column order in one step.
    return prepared.reindex(columns=train_columns, fill_value=0)
# ==============================================================================
# Step 2.5: Save Data to CSV
# ==============================================================================
def save_to_csv(data_df, filename='online_testcases.csv'):
    """
    Append the rows of ``data_df`` to a CSV log, keeping the file well-formed.

    Args:
        data_df: Rows to store. Columns that are entirely empty (for example
            an unfilled 'Creditworthy' column) are left out.
        filename: Path of the CSV file to create or extend.

    Behaviour:
        * New or zero-byte file: written with a header row.
        * Existing file with the same column names in the same order: rows
          are appended without repeating the header.
        * Existing file with different columns: the file is rewritten with
          the union of both column sets so every value stays under its own
          header. Plain appending would silently shift values under the
          wrong column names.
    """
    # Removes every all-NaN column, which includes an empty 'Creditworthy'.
    data_df = data_df.dropna(axis=1, how='all')
    # dropna returned a new frame, so renaming here does not touch the caller's.
    data_df.columns = [str(name) for name in data_df.columns]

    has_content = os.path.isfile(filename) and os.path.getsize(filename) > 0
    if not has_content:
        data_df.to_csv(filename, mode='w', header=True, index=False)
    else:
        existing_columns = pd.read_csv(filename, nrows=0).columns.tolist()
        if list(data_df.columns) == existing_columns:
            data_df.to_csv(filename, mode='a', header=False, index=False)
        else:
            # Read the old rows back as plain text so rewriting them does not
            # change how previously logged values are formatted.
            existing = pd.read_csv(filename, dtype=str, keep_default_na=False)
            combined = pd.concat(
                [existing, data_df.astype(object)], ignore_index=True, sort=False
            )
            combined.to_csv(filename, mode='w', header=True, index=False)
    print(f"Data successfully saved to {filename}")
# ==============================================================================
# Step 3: API Endpoint for Prediction (Single Input)
# ==============================================================================
@app.route('/predict', methods=['POST'])
def predict():
    """
    Score a single record sent as a JSON object.

    Responses:
        200: {'prediction': 'Eligible' | 'Not Eligible', 'metrics': {...}}
        400: {'error': ...} when the body is not a non-empty JSON object or
             fails input validation.
        500: {'error': ...} when no model is loaded or processing fails.

    The record and its prediction are also appended to the CSV log via
    save_to_csv.
    """
    if model is None or train_features_columns is None or evaluation_metrics is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    # silent=True yields None for a missing, non-JSON or malformed body, so a
    # bad request becomes a 400 here instead of a 500 from the handler below.
    user_input = request.get_json(silent=True)
    if not isinstance(user_input, dict) or not user_input:
        return jsonify({'error': 'Request body must be a non-empty JSON object.'}), 400

    try:
        valid, error_msg = validate_input(user_input)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_df = pd.DataFrame([user_input])
        # Work on a copy so the logged frame keeps the raw, un-encoded input.
        user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns)

        prediction = model.predict(user_features_processed)
        result = "Eligible" if prediction[0] == 1 else "Not Eligible"

        user_df['Creditworthy_Prediction'] = result
        save_to_csv(user_df)
    except Exception as e:
        # Top-level boundary of the request: log the traceback for the
        # operator and return a JSON error rather than an HTML error page.
        print(traceback.format_exc())
        return jsonify({'error': str(e)}), 500

    return jsonify({
        'prediction': result,
        'metrics': evaluation_metrics
    })
# ==============================================================================
# Step 4: API Endpoint for Bulk Prediction (CSV Upload)
# ==============================================================================
def _json_safe(value):
    """Map a missing value (NaN/NaT/None) to None so the JSON output stays valid."""
    return None if pd.isna(value) else value


def _fairness_report(input_df, y_true, sensitive_col='Partner Type'):
    """Return (metrics_dict, observation) per group of sensitive_col, or None if that column is absent."""
    if sensitive_col not in input_df.columns:
        return None

    y_pred = (input_df['Creditworthy_Prediction'] == 'Eligible').astype(int)
    # Text labels are mapped to 1/0; numeric labels are used as they are.
    if pd.api.types.is_string_dtype(y_true.dtype):
        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)
    else:
        y_true_bin = y_true

    mf = MetricFrame(
        metrics={
            'selection_rate': selection_rate,
            'equal_opportunity': true_positive_rate
        },
        y_true=y_true_bin,
        y_pred=y_pred,
        sensitive_features=input_df[sensitive_col]
    )
    metrics = {
        name: {group: _json_safe(score) for group, score in mf.by_group[name].items()}
        for name in ('selection_rate', 'equal_opportunity')
    }

    rates = mf.by_group['selection_rate']
    max_group = rates.idxmax()
    min_group = rates.idxmin()
    diff = rates[max_group] - rates[min_group]
    observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
    if abs(diff) > 0.1:
        observation += " Mitigation recommended: Consider reweighting or post-processing."
    return metrics, observation


@app.route('/predict_csv', methods=['POST'])
def predict_csv():
    """
    Score every row of an uploaded CSV file (multipart field 'file').

    Responses:
        200: {'predictions': [...], 'metrics': {...},
              'fairness_metrics': {...}, 'fairness_observation': str}
        400: {'error': ...} when no file was sent or validation fails.
        500: {'error': ...} when no model is loaded or processing fails.

    Fairness metrics are computed per 'Partner Type' group, and only when the
    upload has a 'Creditworthy' column with at least one value. The scored
    rows are also appended to the CSV log via save_to_csv.
    """
    if model is None or train_features_columns is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    upload = request.files['file']
    if not upload.filename:
        return jsonify({'error': 'No selected file'}), 400

    try:
        # 'utf-8-sig' also strips a leading byte-order mark, which would
        # otherwise end up inside the first column name.
        csv_data = StringIO(upload.read().decode('utf-8-sig'))
        input_df = pd.read_csv(csv_data)

        # An entirely empty 'Creditworthy' column is a placeholder, not labels.
        has_ground_truth = (
            'Creditworthy' in input_df.columns
            and input_df['Creditworthy'].notna().any()
        )
        y_true = input_df['Creditworthy'] if has_ground_truth else None

        # The label is never a feature; other fully empty columns are dropped too.
        input_df_features = input_df.drop(columns=['Creditworthy'], errors='ignore')
        input_df_features = input_df_features.dropna(axis=1, how='all')

        valid, error_msg = validate_input(input_df_features)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)
        predictions = model.predict(user_features_processed)

        input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')
        input_df = input_df.dropna(axis=1, how='all')
        save_to_csv(input_df)

        # --- Fairness & Bias Reporting ---
        fairness_metrics = {}
        fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
        if has_ground_truth:
            report = _fairness_report(input_df, y_true)
            if report is not None:
                fairness_metrics, fairness_observation = report

        # Empty cells are NaN in pandas; NaN is not valid JSON, so send null.
        results = [
            {key: _json_safe(value) for key, value in row.items()}
            for row in input_df.to_dict('records')
        ]
        return jsonify({
            'predictions': results,
            'metrics': evaluation_metrics,
            'fairness_metrics': fairness_metrics,
            'fairness_observation': fairness_observation
        })
    except Exception as e:
        # Top-level boundary of the request: log and answer with a JSON error.
        print(traceback.format_exc())
        return jsonify({'error': f"Error processing file: {str(e)}"}), 500
# ==============================================================================
# Step 5: Main function to train the model once and run the server
# ==============================================================================
def main():
    """
    Train the model once, publish it through the module globals, and serve.

    Loads 'catalyst_train.csv', fits and evaluates the classifier, stores the
    model, its feature columns and its metrics in the module-level variables
    read by the request handlers, then starts the Flask server on port 5000.
    Returns early, without starting the server, if the data cannot be loaded.
    """
    global model, train_features_columns, evaluation_metrics

    print("--- Starting the Nova Backend ---")

    print("Step 1: Loading and preprocessing data...")
    loaded = load_and_preprocess_data('catalyst_train.csv')
    train_df, target_column = loaded
    if train_df is None:
        print("Please ensure 'catalyst_train.csv' exists. Exiting.")
        return

    print("Step 2: Training the model and evaluating performance...")
    fitted = train_model(train_df, target_column)
    model, X_test, y_test = fitted
    train_features_columns = train_df.drop(columns=[target_column]).columns
    evaluation_metrics = evaluate_model(model, X_test, y_test)

    print("\nModel trained successfully! Metrics:")
    for metric_name in evaluation_metrics:
        metric_value = evaluation_metrics[metric_name]
        print(f"- {metric_name.capitalize()}: {metric_value:.4f}")

    print("\n--- Starting Flask server on http://127.0.0.1:5000 ---")
    # NOTE(review): debug=True turns on Flask's debug mode; confirm this is
    # only ever run for local development. The reloader stays off so that
    # main() does not run a second time and retrain the model.
    app.run(debug=True, port=5000, use_reloader=False)


if __name__ == "__main__":
    main()