Fit PCA with pipelines, pick the number of components, and transform data for downstream ML tasks.
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Load the wine dataset and reduce it with a scale-then-PCA pipeline.
X, y = load_wine(return_X_y=True)

# Scaling first matters: PCA is variance-driven, so unscaled features with
# large ranges would dominate the components. A float n_components keeps the
# fewest components whose cumulative explained variance reaches 95%.
steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95, random_state=42)),  # keep 95% variance
]
pipe = Pipeline(steps)
X_pca = pipe.fit_transform(X)

# Pull the fitted PCA step back out to inspect what was selected.
pca = pipe.named_steps["pca"]
print("n_components_:", pca.n_components_)
print("Explained variance ratio (first 5):", pca.explained_variance_ratio_[:5].round(3))
Tip: passing a float such as n_components=0.95 makes PCA automatically choose the smallest number of components whose cumulative explained variance reaches 95%.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Classification pipeline: standardize, compress to 10 principal components,
# then fit a logistic-regression classifier; evaluate with 5-fold CV.
scaler_step = ("scaler", StandardScaler())
pca_step = ("pca", PCA(n_components=10, random_state=42))
model_step = ("clf", LogisticRegression(max_iter=5000))
clf_pipe = Pipeline([scaler_step, pca_step, model_step])

# cross_val_score refits the whole pipeline per fold, so scaling and PCA
# are learned only on each fold's training split (no data leakage).
scores = cross_val_score(clf_pipe, X, y, cv=5, scoring="accuracy")
print("CV Accuracy:", scores.mean().round(3), "+/-", scores.std().round(3))
When should n_components be an integer versus a float? Use an integer for an exact, fixed output dimensionality (e.g. when a downstream model expects it), and a float in (0, 1) to let PCA pick the component count from the data's explained-variance profile.
