library(knitr)
library(kableExtra)
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library(caret)
library(rpart)
library(rpart.plot)
library(DALEX)
library(lime)
library(ggplot2)
# Fetch the Statlog German credit data directly from the UCI repository.
# The file has no header row, so read.table() assigns its default column
# names; text fields are kept as character vectors rather than factors.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
data <- read.table(
  file = url,
  header = FALSE,
  stringsAsFactors = FALSE,
  strip.white = TRUE
)
# Descriptive column names, applied positionally to the columns of `data`.
col_names <- c(
  "Status", "Duration", "CreditHistory", "Purpose", "CreditAmount", "Savings",
  "EmploymentDuration", "InstallmentRate", "PersonalStatus", "OtherDebtors",
  "ResidenceDuration", "Property", "Age", "OtherInstallmentPlans", "Housing",
  "ExistingCreditsCount", "Job", "Dependents", "Telephone", "ForeignWorker",
  "CreditRisk"
)
# Fail fast if the table does not have the expected shape (for example a
# truncated or substituted download). Without this check, `names<-` pads the
# missing names with NA when there are more columns than names, and raises a
# much less descriptive error when there are fewer.
stopifnot(is.data.frame(data))
if (ncol(data) != length(col_names)) {
  stop(
    "Expected ", length(col_names), " columns in the German credit data, ",
    "found ", ncol(data), ".",
    call. = FALSE
  )
}
names(data) <- col_names
# Recode the numeric CreditRisk codes as a factor: code 2 becomes "Bad" and
# code 1 becomes "Good". Listing 2 first makes "Bad" the first factor level,
# i.e. the class of interest.
data[["CreditRisk"]] <- factor(
  data[["CreditRisk"]],
  levels = c(2, 1),
  labels = c("Bad", "Good")
)
# Columns to be treated as categorical predictors.
# NOTE(review): InstallmentRate, ResidenceDuration and ExistingCreditsCount
# appear to be numeric attributes in the UCI documentation; they are treated
# as categorical here -- confirm this is intended.
categorical_cols <- c(
  "Status", "CreditHistory", "Purpose", "Savings", "EmploymentDuration",
  "InstallmentRate", "PersonalStatus", "OtherDebtors", "ResidenceDuration",
  "Property", "OtherInstallmentPlans", "Housing", "ExistingCreditsCount",
  "Job", "Telephone", "ForeignWorker"
)
# Turn each of those columns into a factor and replace its level names with
# the generic labels L1, L2, ... The labels are numbered in the order in which
# as.factor() sorts the observed values, so the original codes are no longer
# visible in the level names afterwards.
data[categorical_cols] <- lapply(
  data[categorical_cols],
  function(column) {
    as_factor <- as.factor(column)
    levels(as_factor) <- paste0("L", seq_len(nlevels(as_factor)))
    as_factor
  }
)