# Goal: build a churn classifier with a Decision Tree and a Random Forest.
# Practice preprocessing (encoding, missing values), model training,
# evaluation, and interpretability (feature importance).
# Data: Telecom Churn CSV (customerID, gender, SeniorCitizen, tenure,
# MonthlyCharges, Contract, Churn, ...). Any churn dataset with similar
# columns works.
# 1) Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# 2) Load & basic clean
df = pd.read_csv("churn.csv")  # replace with your path

# Drop rows with a missing target -- we cannot train on them.
df = df.dropna(subset=["Churn"])

# Encode the target as 1/0. Any label other than "Yes"/"No" maps to NaN;
# drop those rows instead of letting astype(int) raise on NaN.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df = df.dropna(subset=["Churn"])
df["Churn"] = df["Churn"].astype(int)
# 3) Features/target
# "Churn" is the label; "customerID" is a row identifier with no predictive
# value, so both are removed from the feature matrix (errors="ignore"
# tolerates a dataset without a customerID column).
y = df["Churn"]
X = df.drop(columns=["Churn", "customerID"], errors="ignore")

# 4) Identify column types
# Numeric columns pass through untouched; every remaining column is
# treated as categorical and will be one-hot encoded.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = [col for col in X.columns if col not in num_cols]
# 5) Preprocessor: One-hot for categoricals, pass-through for numerics
preprocess = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
],
remainder="passthrough"
)
# 6) Models
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
# 7) Pipelines
pipe_dt = Pipeline([("prep", preprocess), ("clf", dt)])
pipe_rf = Pipeline([("prep", preprocess), ("clf", rf)])
# 8) Train/test split -- stratifying on y preserves the churn/no-churn
# ratio in both partitions.
splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_tr, X_te, y_tr, y_te = splits
# 9) Hyperparameter search (quick).
# Grids are intentionally small; the "clf__" prefix routes each parameter
# to the classifier step inside the pipeline.
param_dt = {"clf__max_depth": [4, 6, 8, None], "clf__min_samples_leaf": [1, 2, 5]}
param_rf = {
    "clf__max_depth": [None, 8, 12],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"],
}

# Churn data is typically imbalanced, so select hyperparameters by F1
# rather than the default accuracy (which rewards always predicting the
# majority "no churn" class).
grid_dt = GridSearchCV(pipe_dt, param_dt, cv=5, n_jobs=-1, scoring="f1")
grid_rf = GridSearchCV(pipe_rf, param_rf, cv=5, n_jobs=-1, scoring="f1")
grid_dt.fit(X_tr, y_tr)
grid_rf.fit(X_tr, y_tr)
best_dt = grid_dt.best_estimator_
best_rf = grid_rf.best_estimator_
# 10) Evaluate both tuned models on the held-out test set.
# (The original loop body was unindented, which is a syntax error.)
for name, model in [("Decision Tree", best_dt), ("Random Forest", best_rf)]:
    y_pred = model.predict(X_te)
    print(f"\n== {name} ==")
    print(confusion_matrix(y_te, y_pred))
    print(classification_report(y_te, y_pred, digits=3))
# NOTE: churn data is usually imbalanced, so monitor Recall (how many true
# churners are caught) and F1, not accuracy alone. Optionally set
# class_weight="balanced" on the classifiers.
# 11) Interpretability: top feature importances from the Random Forest.
# Ask the fitted ColumnTransformer for the post-transform feature names;
# this keeps names aligned with the importance vector and works even when
# there are no categorical columns, unlike assembling the list by hand
# from the OneHotEncoder and num_cols.
prep = best_rf.named_steps["prep"]
feature_names = list(prep.get_feature_names_out())
importances = best_rf.named_steps["clf"].feature_importances_

# Indices of the 10 largest importances, highest first.
idx = np.argsort(importances)[::-1][:10]
print("Top features:")
for i in idx:
    print(feature_names[i], "->", round(importances[i], 4))
