Build ML models with scikit-learn, covering preprocessing, model selection, and evaluation.
# Machine Learning with Scikit-Learn
Build production-ready machine learning pipelines with Scikit-Learn using Google Antigravity IDE. This comprehensive guide covers data preprocessing, model training, evaluation, and deployment.
## Why Scikit-Learn?
Scikit-Learn provides consistent APIs for machine learning workflows. Google Antigravity IDE's Gemini 3 engine suggests optimal algorithms and hyperparameters for your data.
## Data Preprocessing Pipeline
```python
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from typing import Tuple, List
def create_preprocessing_pipeline(
    numeric_features: List[str],
    categorical_features: List[str]
) -> ColumnTransformer:
    """Build a ColumnTransformer covering numeric and categorical columns.

    Numeric columns get median imputation followed by standard scaling;
    categorical columns get most-frequent imputation followed by a dense
    one-hot encoding that ignores categories unseen during fit.
    Columns in neither list are dropped.
    """
    num_steps = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_steps = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    # remainder="drop" means only the listed feature columns survive.
    return ColumnTransformer(
        transformers=[
            ("num", num_steps, numeric_features),
            ("cat", cat_steps, categorical_features),
        ],
        remainder="drop",
    )
def prepare_data(
    df: pd.DataFrame,
    target_column: str,
    test_size: float = 0.2
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split a DataFrame into stratified train/test features and targets.

    Args:
        df: Input data containing features and the target column.
        target_column: Name of the column to predict.
        test_size: Fraction of rows held out for the test split.

    Returns:
        (X_train, X_test, y_train, y_test). Note: splitting a DataFrame
        yields DataFrames/Series, not ndarrays — the original annotation
        of ``np.ndarray`` was incorrect.

    Notes:
        ``stratify=y`` assumes a classification target; it raises for
        continuous targets or classes with fewer than 2 members.
        ``random_state=42`` makes the split reproducible.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
```
## Model Training Pipeline
```python
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
class ModelTrainer:
    """Train, compare, and tune classification models behind one preprocessor.

    Attributes:
        preprocessor: ColumnTransformer applied before every classifier.
        models: Candidate classifiers keyed by short name.
        best_model: Name of the best model found by ``compare_models``
            (``None`` until ``compare_models`` has run).
        best_score: Best mean cross-validated accuracy seen so far.
    """

    def __init__(self, preprocessor: ColumnTransformer):
        self.preprocessor = preprocessor
        self.models = {
            "logistic_regression": LogisticRegression(max_iter=1000),
            "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "gradient_boosting": GradientBoostingClassifier(random_state=42),
            "svm": SVC(kernel="rbf", probability=True)
        }
        self.best_model = None
        self.best_score = 0

    def compare_models(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        cv: int = 5
    ) -> dict:
        """Compare all candidate models via cross-validated accuracy.

        Side effect: records the winner in ``self.best_model`` /
        ``self.best_score`` for later use by ``hyperparameter_tuning``.

        Returns:
            Mapping of model name -> {"mean_score", "std_score", "scores"}.
        """
        results = {}
        for name, model in self.models.items():
            # Preprocessing is fit inside each CV fold, avoiding leakage.
            pipeline = Pipeline([
                ("preprocessor", self.preprocessor),
                ("classifier", model)
            ])
            scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="accuracy")
            results[name] = {
                "mean_score": scores.mean(),
                "std_score": scores.std(),
                "scores": scores
            }
            if scores.mean() > self.best_score:
                self.best_score = scores.mean()
                self.best_model = name
        return results

    def hyperparameter_tuning(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        param_grid: dict,
        cv: int = 5,
        model_name: str = None
    ) -> GridSearchCV:
        """Grid-search hyperparameters for a chosen (or the best) model.

        Args:
            param_grid: Classifier parameter grid; keys are automatically
                prefixed with ``classifier__`` for the pipeline.
            model_name: Model to tune; defaults to the winner recorded by
                ``compare_models``.

        Raises:
            RuntimeError: If no model name is given and ``compare_models``
                has not been run yet (the original code raised a cryptic
                ``KeyError: None`` in that case).
            KeyError: If ``model_name`` is not a known model.
        """
        name = model_name if model_name is not None else self.best_model
        if name is None:
            raise RuntimeError(
                "No model selected: run compare_models() first "
                "or pass model_name explicitly."
            )
        model = self.models[name]
        pipeline = Pipeline([
            ("preprocessor", self.preprocessor),
            ("classifier", model)
        ])
        # GridSearchCV addresses pipeline-step params as "<step>__<param>".
        prefixed_params = {
            f"classifier__{k}": v for k, v in param_grid.items()
        }
        grid_search = GridSearchCV(
            pipeline,
            prefixed_params,
            cv=cv,
            scoring="accuracy",
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, y_train)
        return grid_search
```
## Model Evaluation
```python
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, roc_curve, precision_recall_curve
)
import matplotlib.pyplot as plt
def evaluate_model(
    model: Pipeline,
    X_test: np.ndarray,
    y_test: np.ndarray
) -> dict:
    """Evaluate a fitted classifier with several metrics.

    Computes accuracy plus weighted precision/recall/F1, and — for binary
    problems only — ROC-AUC from the positive-class probability. Also
    prints sklearn's classification report as a side effect.

    Returns:
        Dict of metric name -> float ("roc_auc" present only when the
        model exposes ``predict_proba`` and the problem is binary).
    """
    y_pred = model.predict(X_test)
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted"),
        "recall": recall_score(y_test, y_pred, average="weighted"),
        "f1": f1_score(y_test, y_pred, average="weighted"),
    }
    # Taking predict_proba(...)[:, 1] is only meaningful for binary
    # classification; the original code applied it unconditionally and
    # broke (or silently mis-scored) on multiclass targets.
    if hasattr(model, "predict_proba") and len(np.unique(y_test)) == 2:
        y_prob = model.predict_proba(X_test)[:, 1]
        metrics["roc_auc"] = roc_auc_score(y_test, y_prob)
    print(classification_report(y_test, y_pred))
    return metrics
def plot_roc_curve(
    model: Pipeline,
    X_test: np.ndarray,
    y_test: np.ndarray,
    output_path: str = "roc_curve.png"
) -> None:
    """Plot and save the ROC curve for a binary classifier.

    Args:
        model: Fitted pipeline exposing ``predict_proba``.
        output_path: Where to save the figure (default preserves the
            original hard-coded "roc_curve.png").

    Note:
        Assumes a binary problem — column 1 of ``predict_proba`` is
        treated as the positive-class probability.
    """
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    fig = plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
    plt.plot([0, 1], [0, 1], "k--")  # chance-level diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.savefig(output_path)
    # Close the figure: the original leaked one open figure per call,
    # which matplotlib warns about and which grows memory in loops.
    plt.close(fig)
```
## Model Persistence
```python
import joblib
from pathlib import Path
def save_model(model: Pipeline, path: str) -> None:
    """Persist a fitted pipeline (preprocessing + estimator) to disk.

    Creates any missing parent directories before writing.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, destination)
def load_model(path: str) -> Pipeline:
    """Load a pipeline previously saved with ``save_model``."""
    model = joblib.load(path)
    return model
```
## Best Practices
- Use pipelines for reproducible workflows
- Apply cross-validation for reliable estimates
- Perform hyperparameter tuning systematically
- Evaluate with multiple metrics
- Save preprocessing with model
- Version control your experiments
Google Antigravity IDE provides ML workflow suggestions and automatically recommends algorithms based on your data characteristics. This Python prompt is ideal for developers working on machine learning pipelines, data preprocessing, and model evaluation.
By using this prompt, you can save hours of manual coding and ensure best practices are followed from the start. It's particularly valuable for teams looking to maintain consistency across their Python implementations.
Yes! All prompts on Antigravity AI Directory are free to use for both personal and commercial projects. No attribution required, though it's always appreciated.
This prompt works excellently with Claude, ChatGPT, Cursor, GitHub Copilot, and other modern AI coding assistants. For best results, use models with large context windows.
You can modify the prompt by adding specific requirements, constraints, or preferences. For Python projects, consider mentioning your framework version, coding style, and any specific libraries you're using.