Learn a lightweight, surprisingly strong baseline for text and tabular classification, built on Bayes’ Theorem with the “naive” assumption of feature independence.
P(y|x) ∝ P(x|y) · P(y). Naive Bayes assumes features are conditionally independent given the class, so the likelihood factorizes as P(x|y) = ∏ᵢ P(xᵢ|y) — one simple per-feature estimate instead of an intractable joint distribution.
# Step 1: Imports
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
# Step 2: Load two newsgroups for a binary classification demo
categories = ['sci.space', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),  # strip metadata that leaks the label
)
texts, labels = newsgroups.data, newsgroups.target
# Step 3: Model = TF-IDF features feeding a Multinomial Naive Bayes classifier
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
classifier = MultinomialNB(alpha=1.0)  # alpha=1.0 is Laplace smoothing
model = Pipeline([
    ("tfidf", vectorizer),
    ("nb", classifier),
])
# Step 4: Hold out 20% for evaluation; stratify preserves class proportions
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels,
)
model.fit(train_texts, train_labels)
# Step 5: Confusion matrix plus per-class precision/recall/F1
predictions = model.predict(test_texts)
print(confusion_matrix(test_labels, predictions))
print(classification_report(test_labels, predictions, target_names=categories))
Tip: Tune alpha (the Laplace/Lidstone smoothing strength) and ngram_range to boost performance; keep stratify=y so the train/test split preserves each class's proportion of examples.
