Can someone help me figure out where my code has a problem? (Kaggle dataset)

I am self-learning on a Kaggle dataset (Loan Defaulter Prediction Dataset) and trying to create an ML model, then produce a confusion_matrix and classification_report from it. But I am still confused about why my code is not running.


# Import statements here
import pandas as pd
import matplotlib.pyplot as plt # For plotting, if needed later
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import warnings
import boto3
from sagemaker import get_execution_role
import joblib # For saving the model
import tempfile # For saving the model

warnings.filterwarnings('ignore')
#role = get_execution_role() # If in SageMaker
print("Libraries imported.")

# Task I - Data Loading
# Import the dataset from S3

bucket_name = 'loan-dataXYZXYZ' # IMPORTANT: Replace XYZXYZ or use your full S3 bucket name
folder_name = 'loan_cleaned_data' # As per instructions
data_key = 'loan_cleaned_data.csv' # The name of your data file
data_location = f's3://{bucket_name}/{folder_name}/{data_key}'

print(f"Attempting to load data from: {data_location}")

# Load the dataset
try:
    data = pd.read_csv(data_location)
    print(f"Dataset loaded successfully. Shape: {data.shape}")
    # print(data.head()) # Uncomment to verify first few rows
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please check bucket name, file path, and S3 permissions.")
    data = None # Ensure data is None if loading fails

# Task II - Feature Engineering

if data is not None and 'Purpose' in data.columns:
    print("\nPerforming One-Hot Encoding on 'Purpose' column...")
    purpose_dummies = pd.get_dummies(data['Purpose'], prefix='Purpose', dtype=int)
    data = pd.concat([data, purpose_dummies], axis=1)
    data.drop('Purpose', axis=1, inplace=True)
    print("'Purpose' column encoded. New columns added.")
    print(data.head()) # Verify the encoded columns
    # Store the updated dataframe below (data is already updated)
else:
    print("\nSkipping Feature Engineering: Dataframe not loaded or 'purpose' column missing.")

# Task III - Data Preprocessing

df = data # Assuming 'data' is the dataframe from the previous step
df_minority_upsampled = None

if df is not None and 'not_fully_paid' in df.columns:
    print(f"\nInitial class distribution for 'not_fully_paid':\n{df['not_fully_paid'].value_counts()}")

    df_majority = df[df.not_fully_paid == 0]
    df_minority = df[df.not_fully_paid == 1]

    if not df_minority.empty and len(df_minority) < len(df_majority):
        print(f"Oversampling minority class (count: {len(df_minority)}) to match majority (count: {len(df_majority)})...")
        df_minority_upsampled = resample(df_minority,
                                         replace=True, # Sample with replacement
                                         n_samples=len(df_majority), # To match majority class
                                         random_state=42)

        df = pd.concat([df_majority, df_minority_upsampled])
        df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle
        print(f"Resampled class distribution for 'not_fully_paid':\n{df['not_fully_paid'].value_counts()}")
    elif df_minority.empty:
        print("Minority class is empty. No upsampling done.")
    else:
        print("Data is already balanced or minority is larger. No upsampling done.")
else:
    print("\nSkipping Data Preprocessing: Dataframe not available or 'not_fully_paid' column missing.")

# Task IV - Model Training

X = None
y = None
X_train, X_test, y_train, y_test = None, None, None, None # Initialize
rf = None # Initialize

if df is not None:
    print("\nPreparing data for model training...")
    # Create X and y data for train-test split
    # Drop target and any ID columns. 'sl_no' was mentioned in instructions.
    columns_to_drop = ['not_fully_paid']
    if 'sl_no' in df.columns: # Check if 'sl_no' exists before trying to drop
        columns_to_drop.append('sl_no')

    X = df.drop(columns=columns_to_drop, axis=1)
    y = df['not_fully_paid']
    print(f"Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")

    # Split the data
    # Using 60:40 split, random_state=42 as per instructions
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42, stratify=y)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

    # Train a Random Forest Classifier model
    print("Training Random Forest Classifier model...")
    rf = RandomForestClassifier(random_state=42) # Use random_state=42
    rf.fit(X_train, y_train)
    print("Model training complete.")
else:
    print("\nSkipping Model Training: Dataframe 'df' is not available.")

# Task V - Model Evaluation

if rf is not None and X_test is not None and y_test is not None:
    print("\nEvaluating model performance...")
    # Predict using the trained Random Forest Classifier model
    y_pred_test = rf.predict(X_test)

    # Evaluate the predictions and Print the classification report
    print("Classification Report on Test Data:")
    print(classification_report(y_test, y_pred_test))
    # print("\nConfusion Matrix on Test Data:") # Uncomment if you want to see it
    # print(confusion_matrix(y_test, y_pred_test)) # Uncomment if you want to see it
else:
    print("\nSkipping Model Evaluation: Model not trained or test data unavailable.")

# Task for Saving the Model to S3

print("\nAttempting to save the model to S3...")
if rf is not None:
    s3_client = boto3.client('s3')
    # Use the same bucket_name as defined in Task I for simplicity, or a different one if desired
    model_s3_bucket_name = bucket_name
    model_s3_key = 'model.pkl' # Name of the model file in S3 (can include a path like 'my_models/model.pkl')

    try:
        with tempfile.TemporaryFile() as temp_file:
            joblib.dump(rf, temp_file)
            temp_file.seek(0) # Reset file pointer to the beginning for reading
            s3_client.upload_fileobj(temp_file, model_s3_bucket_name, model_s3_key)
        print(f"Model saved successfully to s3://{model_s3_bucket_name}/{model_s3_key}")
    except Exception as e:
        print(f"Error saving model to S3: {e}")
        print(f"Please check S3 bucket ('{model_s3_bucket_name}') permissions and existence.")
else:
    print("\nSkipping model saving: Model 'rf' is not trained.")

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
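
For reference, the link above documents put_object; here is a minimal sketch of the same model upload using put_object instead of upload_fileobj, reusing the bucket and key placeholders defined in the saving step:

import io

if rf is not None:
    buffer = io.BytesIO()
    joblib.dump(rf, buffer)  # serialize the trained model into an in-memory buffer
    s3_client = boto3.client('s3')
    s3_client.put_object(Bucket=model_s3_bucket_name,
                         Key=model_s3_key,
                         Body=buffer.getvalue())  # upload the serialized bytes
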
# EMR Spark Script
#Task 0. Confirm the imports and general structure.

import os
import shutil
import pyspark
from pyspark.sql import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import traceback

#Task 1: read_data (Note: the problem statement names this function read_data, not redshift_load_data.)
#This function reads data from S3.

#Problem Statement:
#mention the bucket name inside the bucket_name variable.
#The dataset will be in the s3 location inside the inputfile folder.
# Read the CSV file into a Dataframe. Ensure header is true and apply the customSchema.

# Solution for read_data:

def read_data(spark, customSchema):
	# spark: the active SparkSession
	# customSchema: the schema to apply when reading the CSV

	print("----------------------")
	print("Starting read_data")
	print("----------------------")

	# Mention the bucket name inside the bucket_name variable.
	# Replace 'your-s3-bucket-name' with your actual S3 bucket name.
	bucket_name = "your-s3-bucket-name"
	s3_input_path = "s3://" + bucket_name + "/inputfile/loan_data.csv"

	# Read the CSV file into a DataFrame, with header=True and the customSchema applied.
	df = spark.read.csv(s3_input_path, header=True, schema=customSchema)
	return df

#Task 2: clean_data.
#This function cleans the DataFrame.

# Problem Statement:
# Drop any rows containing null values.
# Remove duplicates rows.
# Drop rows where the "Purpose" column contains the string "null".

# Solution for clean_data:

def clean_data(input_df):
	# input_df is the output of the read_data function
	print("------------------------")
	print("Starting clean_data")
	print("------------------------")
	df = input_df

	#1. Drop any rows containing null values.
	df = df.na.drop()

	#2. Remove duplicate rows.
	df = df.dropDuplicates()
	
	#3. Drop rows where the "purpose" column contains the string "null"
	#Assuming "null" is a string literal, not a null value in Spark's sense. 
	df = df.filter(col("purpose") != "null")
	
	return df

#Task 3: s3_load_data
#This function stores the processed DataFrame to S3.

#Problem Statement:
# mention the S3 bucket name in the bucket_name variable.
# Write the code to store the outputs to the respective locations using the output_path parameter.
# Output files should be a single partition CSV file with header.

#Solution for s3_load_data.

def s3_load_data(data, file_name):
	#data : the output data of result_1 and result_2 function.
	# file_name : the name of the output to be stored inside the s3.

	# mention the bucket name inside the bucket_name variable.
	#Replace 'your-s3-bucket-name' with your actual s3 bucket name.
	
	bucket_name = "your-s3-bucket-name"
	output_path = "s3://" + bucket_name + "/output/" + file_name

	# Write the DataFrame to s3 as a single CSV file with header.

	data.coalesce(1).write.csv(output_path, mode="overwrite", header=True)
	print(f"Data successfully loaded to S3 at: {output_path}")

#Task 4: result_1
#This function performs transformations for result_1.

#Problem statement.
# Filter the rows where the "purpose" is either "educational" or "small_business".
# Create a new column:
## income_to_installment_ratio, which is the ratio of log_annual_inc to installment.
## Create a new column int_rate_category which categorizes the "int_rate" as:
  ### low: int_rate < 0.1
  ### medium: 0.1 <= int_rate < 0.15
  ### high: int_rate >= 0.15
# Create a new column
  ##high_risk_borrower which flags high-risk borrowers with a value of "1" based on the following conditions (otherwise "0"):
    ## dti > 20
    ## fico < 700
    ## revol_util > 80

# Solution for result_1:

def result_1(input_df):
print("------------------------")
	print("Starting result_1")

print("----------------------------")

	df = input_df
	
	#1. Filters the rows where the "purpose" is either "educational" or "small_business"
	df = df.filter((col("purpose") == "educational") | (col("purpose") == "small_business"))

	#2. Create a new column income_to_installment_ratio
	df = df.withColumn("income_to_installment_ratio", col("log_annual_inc") / col("installment"))

	#3. Create a new column int_rate_category
	df = df.withColumn("int_rate_category", when(col("int_rate") < 0.1, "low")
		.when((col("int_rate") >= 0.1) & (col("int_rate") < 0.15), "medium")
		.otherwise("high"))

	#4. Create a new column high_risk_borrower
	df = df.withColumn("high_risk_borrower",
		when((col("dti") > 20) & (col("fico") < 700) & (col("revol_util") > 80), lit(1))
		.otherwise(lit(0)))

	return df
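
#Optional sanity check for result_1 on a hypothetical one-row DataFrame (a sketch only; the
#column names and values below are made up to mirror the cleaned loan data, and this helper
#is not part of the original tasks).
def _sanity_check_result_1(spark):
	from pyspark.sql import Row
	toy = spark.createDataFrame([Row(
		purpose="educational", log_annual_inc=11.0, installment=220.0,
		int_rate=0.12, dti=25.0, fico=650, revol_util=90.0)])
	# Expect int_rate_category == "medium" and high_risk_borrower == 1 for this row.
	result_1(toy).select("purpose", "int_rate_category", "high_risk_borrower").show()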

#Task 5: result_2
#This function calculates default rate per purpose.

#Problem statement.
#Calculate the default_rate for each purpose, defined as the count of loans that are not_fully_paid (i.e. not_fully_paid == 1) divided by the total count of loans.
#Now, round the default_rate values to two decimal places.

#Solution for result_2:

def result_2(input_df):
	#for input file: input_df is the output of clean_data function.

print("--------------------")
	print("Starting result_2")
print("--------------------")

	df = input_df

	#Calculate count of not_fully_paid loans per purpose.
	not_fully_paid_counts = df.filter(col("not_fully_paid") == 1).groupBy("purpose").agg(count(lit(1)).alias("not_fully_paid_count"))

	#Calculate total count of loans per purpose
	total_loans_counts = df.groupBy("purpose").agg(count(lit(1)).alias("total_loan_count"))

	#Join the two aggregated DataFrames to calculate default_rate.
	#Fill 0 for purposes with no not_fully_paid loans.
	df = (total_loans_counts.join(not_fully_paid_counts, on="purpose", how="left")
		.fillna(0, subset=["not_fully_paid_count"]))

	#Calculate default_rate, handling division by Zero.
	df = df.withColumn("default_rate", when(col("total_loan_count") == 0, lit(0.0)).otherwise(col("not_fully_paid_count") / col("total_loan_count")))
	
	#Round the default_rate to two decimal places.
	df = df.withColumn("default_rate", round(col("default_rate"), 2))

	#Select only the required columns for the final output. 
	df = df.select("purpose", "default_rate")

	return df
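
#Note: an equivalent, more compact aggregation (a sketch, assuming not_fully_paid is a numeric
#0/1 column) is the per-purpose mean of that column, rounded; avg, round, and col come from
#the pyspark.sql.functions import at the top of the script.
def result_2_compact(input_df):
	return (input_df.groupBy("purpose")
		.agg(round(avg(col("not_fully_paid")), 2).alias("default_rate")))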

#Task 6: redshift_load_data
#This function loads the result_2 output data into a Redshift table.

#Problem statement.
#Provide the jdbcUrl, username, password, and table_name for your redshift cluster.
#Write the command to load the data into the result_2 table in Redshift.

#Solution for redshift_load_data:
#To load data to Redshift using PySpark, you typically use the spark-redshift connector. 

def redshift_load_data(data):
	#data - the output of result_2 function.

	#1. jdbcUrl - Replace with your actual Redshift JDBC URL.
	#Example format: jdbc:redshift://<cluster-endpoint>:<port>/<database-name>
	jdbcUrl = "jdbc:redshift://your-redshift-cluster-endpoint:5439/your-database-name"
	
	#2. username - replace with your redshift username.
	username = "your-redshift-username"

	#3. password - replace with your redshift password.
	password = "your-redshift-password"

	#4. table_name - the table to load the data into.
	table_name = "result_2" #As per problem statement, load into 'result_2' table.

	#Redshift connection properties
	#Ensure you have the redshift JDBC driver available to spark.
	#It's usually included in EMR; if not, you may need to add it via --jars or Spark config.
	redshift_properties = {
				"user" : username, "password" : password, "driver" : "com.amazon.redshift.jdbc42.Driver"
				#or com.amazon.redshift.jdbc4.Driver for older versions
				}

	print(f"Attempting to write data to Redshift table: {table_name}")
	try:
		data.write.format("jdbc").option("url", jdbcUrl).option("dbtable", table_name)
					.option("user", username).option("password", password)
					.option("driver", "com.amazon.redshift.jdbc42.Driver")
					.mode("overwrite").save()
		print(f"Data successfully loaded to Redshift table: {table_name}")
	
	except Exception as e:
		print(f"Error loading data toredshift: {e}")

		traceback.print_exc()
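
#The original post does not show how these functions are invoked; below is a minimal driver
#sketch (an assumption, not part of the graded tasks) matching Task 0's imports and general
#structure. The customSchema placeholder must be replaced with the real StructType.
if __name__ == "__main__":
	spark = SparkSession.builder.appName("LoanDataEMRJob").getOrCreate()

	customSchema = None  # Placeholder: define the StructType for loan_data.csv here.

	raw_df = read_data(spark, customSchema)
	cleaned_df = clean_data(raw_df)

	s3_load_data(result_1(cleaned_df), "result_1")

	result_2_df = result_2(cleaned_df)
	s3_load_data(result_2_df, "result_2")
	redshift_load_data(result_2_df)

	spark.stop()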

Please read Getting good answers to your questions and follow it as a guide to posting here. It is a good general guide to posting in these types of forums, too.

This does not seem pertinent to this forum. What makes you think the issue is related to Jupyter? I suspect that if you ran this code directly with Python, by placing it in a text file with a .py extension and calling it with Python, or ran it in a Python console, you'd have the same issue, and thus it is not pertinent to this forum.