Exploring the Basics of scikit-learn with Code Examples
Data Preprocessing
Clean and prepare your data for machine learning with scikit-learn’s preprocessing tools.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = [[1,2],[3,4]]
x_scaled = scaler.fit_transform(x)
print(x_scaled)
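A fitted scaler stores the mean and standard deviation it learned, so it can apply the same transformation to new data (a short continuation of the example above):
print(scaler.mean_)  # per-feature mean learned from x
print(scaler.transform([[5, 6]]))  # scales new data using the stored statistics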
Train-Test Split
Divide data into training and testing sets for model assessment.
from sklearn.model_selection import train_test_split
x = [[1,2],[3,4],[5,6],[7,8]]
y = [0,1,1,0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)
print(x_train)
print(x_test)
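When classes are imbalanced, passing stratify=y keeps the class proportions the same in both splits (a small variant of the call above):
# Stratified split: both halves keep the class balance of y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, stratify=y, random_state=42)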
Linear Regression
Use linear regression to predict continuous target variables.
from sklearn.linear_model import LinearRegression
x = [[1],[2],[3],[4]]
y = [2,4,6,8]
model = LinearRegression()
model.fit(x,y)
y_pred = model.predict([[5]])
print(y_pred)
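After fitting, the learned slope and intercept are available as attributes; for this data (y = 2x) they should come out close to 2 and 0:
print(model.coef_)  # learned slope, approximately [2.]
print(model.intercept_)  # learned intercept, approximately 0.0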
Logistic Regression
Use logistic regression to classify binary outcomes.
from sklearn.linear_model import LogisticRegression
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = LogisticRegression()
model.fit(x,y)
y_pred = model.predict([[2,2]])
print(y_pred)
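Logistic regression also exposes class probabilities, which are often more informative than the hard prediction (continuing the example above):
print(model.classes_)  # the class labels the model learned
print(model.predict_proba([[2,2]]))  # probability of each class for the input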
K-Nearest Neighbors
Use the k-nearest neighbors algorithm for classification and regression tasks.
from sklearn.neighbors import KNeighborsClassifier
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x,y)
y_pred = model.predict([[2,2]])
print(y_pred)
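To see which training points drive a prediction, kneighbors returns the distances and indices of the nearest neighbors (continuing the example above):
distances, indices = model.kneighbors([[2,2]])
print(distances)  # distance to each of the 3 nearest training points
print(indices)  # row indices of those points in x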
Decision Trees
Build decision trees for classification and regression problems.
from sklearn.tree import DecisionTreeClassifier
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = DecisionTreeClassifier()
model.fit(x,y)
y_pred = model.predict([[2,2]])
print(y_pred)
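The learned tree can be printed as text with export_text; the feature names here are just illustrative labels for the two columns of x:
from sklearn.tree import export_text
print(export_text(model, feature_names=['feature_0', 'feature_1']))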
Random Forests
Build robust random forest models by combining many decision trees.
from sklearn.ensemble import RandomForestClassifier
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = RandomForestClassifier()
model.fit(x,y)
y_pred = model.predict([[2,2]])
print(y_pred)
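A fitted forest reports how much each feature contributed to its splits (continuing the example above):
print(model.feature_importances_)  # one importance score per feature, summing to 1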
Cross-Validation
Use cross-validation techniques to evaluate model performance.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = LogisticRegression()
scores = cross_val_score(model, x, y, cv=2)
print(f"Accuracy: {scores.mean():.2f} +/- {scores.std():.2f}")
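For classifiers, cross_val_score defaults to accuracy; the scoring parameter selects another metric (a small variant of the call above):
# Score the same folds with F1 instead of accuracy
f1_scores = cross_val_score(model, x, y, cv=2, scoring='f1')
print(f1_scores)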
Model Evaluation Metrics
Evaluate model performance with multiple metrics.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
x = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
model = LogisticRegression()
model.fit(x, y)
y_pred = model.predict(x)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")
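classification_report prints precision, recall, and F1 for every class in one table (continuing the example above):
from sklearn.metrics import classification_report
print(classification_report(y, y_pred))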
Model Persistence
Save and load trained models for later use.
import joblib
from sklearn.linear_model import LogisticRegression
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
model = LogisticRegression()
model.fit(x,y)
# Save the model
joblib.dump(model, 'model.joblib')
# Load the model
loaded_model = joblib.load('model.joblib')
y_pred = loaded_model.predict([[2,2]])
print(y_pred)
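joblib.dump also accepts a compress level, which helps with large models; the filename here is just an example:
# Save a compressed copy of the same model
joblib.dump(model, 'model_compressed.joblib', compress=3)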
Feature Selection
Use scikit-learn’s feature selection algorithms to choose the most relevant features from your dataset.
from sklearn.feature_selection import SelectKBest, f_classif
x = [[0,0],[1,1],[2,2],[3,3]]
y = [0,0,1,1]
selector = SelectKBest(f_classif, k=1)
X_new = selector.fit_transform(x,y)
print(X_new)
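The fitted selector exposes the per-feature scores and a mask of which features were kept (continuing the example above):
print(selector.scores_)  # F-statistic for each feature
print(selector.get_support())  # True for the features SelectKBest kept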
Dimensionality Reduction
Use PCA to reduce the number of features in your dataset.
from sklearn.decomposition import PCA
x = [[0,0],[1,1],[2,2],[3,3]]
pca = PCA(n_components=1)
X_pca = pca.fit_transform(x)
print(X_pca)
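The fitted PCA reports how much variance each kept component explains; since these points lie on a single line, the first component captures all of it:
print(pca.explained_variance_ratio_)  # fraction of variance per component, here [1.]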