I am self-learning on a Kaggle dataset (Loan Defaulter Prediction) and trying to build an ML model that predicts loan defaults and then evaluates the predictions with confusion_matrix and classification_report. But I am still confused about why my code is not running.
# Import statements here
import pandas as pd
import matplotlib.pyplot as plt # For plotting, if needed later
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import warnings
import boto3
from sagemaker import get_execution_role
import joblib # For saving the model
import tempfile # For saving the model
warnings.filterwarnings('ignore')
#role = get_execution_role() # If in SageMaker
print("Libraries imported.")
# Task I - Data Loading
# Import the dataset from S3
bucket_name = 'loan-dataXYZXYZ' # IMPORTANT: Replace XYZXYZ or use your full S3 bucket name
folder_name = 'loan_cleaned_data' # As per instructions
data_key = 'loan_cleaned_data.csv' # The name of your data file
data_location = f's3://{bucket_name}/{folder_name}/{data_key}'
print(f"Attempting to load data from: {data_location}")
# Load the dataset
try:
    data = pd.read_csv(data_location)
    print(f"Dataset loaded successfully. Shape: {data.shape}")
    # print(data.head())  # Uncomment to verify the first few rows
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please check bucket name, file path, and S3 permissions.")
    data = None  # Ensure data is None if loading fails
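One thing worth checking if the read fails: pandas only understands s3:// paths when the optional s3fs package is installed. If that is the problem, a fallback that streams the object through boto3 directly looks roughly like this (a sketch; it reuses bucket_name, folder_name, and data_key from above):
import io

if data is None:
    try:
        s3 = boto3.client('s3')
        # Fetch the raw object and parse the CSV from an in-memory buffer
        obj = s3.get_object(Bucket=bucket_name, Key=f'{folder_name}/{data_key}')
        data = pd.read_csv(io.BytesIO(obj['Body'].read()))
        print(f"Dataset loaded via boto3 fallback. Shape: {data.shape}")
    except Exception as e:
        print(f"boto3 fallback also failed: {e}")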
# Task II - Feature Engineering
if data is not None and 'Purpose' in data.columns:
    print("\nPerforming One-Hot Encoding on 'Purpose' column...")
    purpose_dummies = pd.get_dummies(data['Purpose'], prefix='Purpose', dtype=int)
    data = pd.concat([data, purpose_dummies], axis=1)
    data.drop('Purpose', axis=1, inplace=True)
    print("'Purpose' column encoded. New columns added.")
    print(data.head())  # Verify the new columns
    # The updated dataframe is stored back in 'data'
else:
    print("\nSkipping Feature Engineering: Dataframe not loaded or 'Purpose' column missing.")
# Task III - Data Preprocessing
df = data # Assuming 'data' is the dataframe from the previous step
df_minority_upsampled = None
if df is not None and 'not_fully_paid' in df.columns:
    print(f"\nInitial class distribution for 'not_fully_paid':\n{df['not_fully_paid'].value_counts()}")
    df_majority = df[df.not_fully_paid == 0]
    df_minority = df[df.not_fully_paid == 1]
    if not df_minority.empty and len(df_minority) < len(df_majority):
        print(f"Oversampling minority class (count: {len(df_minority)}) to match majority (count: {len(df_majority)})...")
        df_minority_upsampled = resample(df_minority,
                                         replace=True,  # Sample with replacement
                                         n_samples=len(df_majority),  # To match majority class
                                         random_state=42)
        df = pd.concat([df_majority, df_minority_upsampled])
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
        print(f"Resampled class distribution for 'not_fully_paid':\n{df['not_fully_paid'].value_counts()}")
    elif df_minority.empty:
        print("Minority class is empty. No upsampling done.")
    else:
        print("Data is already balanced or the minority class is larger. No upsampling done.")
else:
    print("\nSkipping Data Preprocessing: Dataframe not available or 'not_fully_paid' column missing.")
# Task IV - Model Training
X = None
y = None
X_train, X_test, y_train, y_test = None, None, None, None # Initialize
rf = None # Initialize
if df is not None and 'not_fully_paid' in df.columns:
    print("\nPreparing data for model training...")
    # Create X and y data for the train-test split.
    # Drop the target and any ID columns; 'sl_no' was mentioned in the instructions.
    columns_to_drop = ['not_fully_paid']
    if 'sl_no' in df.columns:  # Check that 'sl_no' exists before trying to drop it
        columns_to_drop.append('sl_no')
    X = df.drop(columns=columns_to_drop)
    y = df['not_fully_paid']
    print(f"Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")
    # Split the data using a 60:40 split with random_state=42, as per the instructions
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42, stratify=y)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    # Train a Random Forest Classifier model
    print("Training Random Forest Classifier model...")
    rf = RandomForestClassifier(random_state=42)  # Use random_state=42
    rf.fit(X_train, y_train)
    print("Model training complete.")
else:
    print("\nSkipping Model Training: Dataframe 'df' is not available or the target column is missing.")
# Task V - Model Evaluation
if rf is not None and X_test is not None and y_test is not None:
    print("\nEvaluating model performance...")
    # Predict using the trained Random Forest Classifier model
    y_pred_test = rf.predict(X_test)
    # Evaluate the predictions and print the classification report
    print("Classification Report on Test Data:")
    print(classification_report(y_test, y_pred_test))
    print("\nConfusion Matrix on Test Data:")
    print(confusion_matrix(y_test, y_pred_test))
else:
    print("\nSkipping Model Evaluation: Model not trained or test data unavailable.")
# Task for Saving the Model to S3
print("\nAttempting to save the model to S3...")
if rf is not None:
    s3_client = boto3.client('s3')
    # Reuse the bucket_name from Task I for simplicity, or point at a different bucket if desired
    model_s3_bucket_name = bucket_name
    model_s3_key = 'model.pkl'  # Name of the model file in S3 (can include a path like 'my_models/model.pkl')
    try:
        with tempfile.TemporaryFile() as temp_file:
            joblib.dump(rf, temp_file)
            temp_file.seek(0)  # Reset file pointer to the beginning for reading
            s3_client.upload_fileobj(temp_file, model_s3_bucket_name, model_s3_key)
        print(f"Model saved successfully to s3://{model_s3_bucket_name}/{model_s3_key}")
    except Exception as e:
        print(f"Error saving model to S3: {e}")
        print(f"Please check S3 bucket ('{model_s3_bucket_name}') permissions and existence.")
else:
    print("\nSkipping model saving: Model 'rf' is not trained.")
Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
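For completeness, a sketch of the inverse operation: downloading the saved model from S3 and restoring it with joblib (assumes the same s3_client, model_s3_bucket_name, and model_s3_key as above):
if rf is not None:
    try:
        with tempfile.TemporaryFile() as temp_file:
            # Stream model.pkl from S3 into the temp file, then rewind and unpickle
            s3_client.download_fileobj(model_s3_bucket_name, model_s3_key, temp_file)
            temp_file.seek(0)
            rf_loaded = joblib.load(temp_file)
        print(f"Reloaded model: {type(rf_loaded).__name__}")
    except Exception as e:
        print(f"Error loading model from S3: {e}")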