# Python Data Analysis with Pandas
Master data analysis with Pandas in Google Antigravity IDE. This comprehensive guide covers data manipulation, aggregation, visualization, and performance optimization for large datasets.
## Why Pandas?
Pandas is the de facto standard for data analysis in Python. Google Antigravity IDE's Gemini 3 engine provides intelligent suggestions for data transformations and analysis patterns.
## DataFrame Operations
```python
import numpy as np
import pandas as pd


# Load data with proper types
def load_sales_data(filepath: str) -> pd.DataFrame:
    """Load sales data with optimized dtypes."""
    dtype_spec = {
        "product_id": "category",
        "category": "category",
        "region": "category",
        "quantity": "int32",
        "unit_price": "float32",
    }
    df = pd.read_csv(
        filepath,
        dtype=dtype_spec,
        parse_dates=["order_date", "ship_date"],
        usecols=lambda col: col != "unused_column",
    )
    return df


# Chained transformations
def process_sales(df: pd.DataFrame) -> pd.DataFrame:
    """Process sales data with method chaining."""
    return (
        df
        .assign(
            total_amount=lambda x: x["quantity"] * x["unit_price"],
            order_month=lambda x: x["order_date"].dt.to_period("M"),
            is_large_order=lambda x: x["quantity"] > x["quantity"].quantile(0.9),
        )
        .query("total_amount > 0")
        .drop_duplicates(subset=["order_id"])
        .sort_values("order_date")
        .reset_index(drop=True)
    )
```
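To see the chaining pattern in action, here is a minimal sketch on a small synthetic frame (the column names and values are invented for illustration): each step returns a new DataFrame, so the pipeline reads top to bottom with no intermediate variables.

```python
import pandas as pd

# Tiny synthetic sales frame: one duplicate order and one zero-value row
df = pd.DataFrame({
    "order_id": [1, 1, 2, 3],
    "order_date": pd.to_datetime(["2024-01-02", "2024-01-02", "2024-01-01", "2024-01-03"]),
    "quantity": [2, 2, 5, 1],
    "unit_price": [10.0, 10.0, 3.0, 0.0],
})

out = (
    df
    .assign(total_amount=lambda x: x["quantity"] * x["unit_price"])  # derive revenue
    .query("total_amount > 0")                 # drop the zero-value order
    .drop_duplicates(subset=["order_id"])      # keep first row per order
    .sort_values("order_date")
    .reset_index(drop=True)
)
# out now holds order 2 (2024-01-01) followed by order 1 (2024-01-02)
```

Because every intermediate result is a fresh DataFrame, the original `df` is left untouched, which makes the pipeline safe to re-run.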
## Grouping and Aggregation
```python
def analyze_sales_by_region(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate sales metrics by region."""
    agg_funcs = {
        "total_amount": ["sum", "mean", "std"],
        "quantity": ["sum", "count"],
        "order_id": "nunique",
    }
    result = (
        df
        # observed=True: with categorical keys, only emit combinations
        # that actually occur in the data
        .groupby(["region", "category"], observed=True)
        .agg(agg_funcs)
        .round(2)
    )
    # Flatten the MultiIndex column names
    result.columns = ["_".join(col) for col in result.columns]
    return result.reset_index()


def calculate_rolling_metrics(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Calculate rolling statistics."""
    df = df.set_index("order_date").sort_index()
    rolling = df["total_amount"].rolling(window=window)
    return df.assign(
        rolling_mean=rolling.mean(),
        rolling_std=rolling.std(),
        rolling_min=rolling.min(),
        rolling_max=rolling.max(),
    )


def pivot_sales_report(df: pd.DataFrame) -> pd.DataFrame:
    """Create pivot table for sales analysis."""
    return pd.pivot_table(
        df,
        values="total_amount",
        index="region",
        columns="category",
        aggfunc=["sum", "mean", "count"],
        fill_value=0,
        margins=True,
        margins_name="Total",
    )
```
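As an alternative to flattening a MultiIndex after the fact, named aggregation lets you choose flat output column names up front. A minimal sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({
    "region": ["East", "East", "West"],
    "total_amount": [100.0, 50.0, 30.0],
    "order_id": ["a", "b", "b"],
})

# Named aggregation: output_name=(input_column, aggregation)
summary = (
    df.groupby("region")
    .agg(
        revenue=("total_amount", "sum"),
        avg_order=("total_amount", "mean"),
        n_orders=("order_id", "nunique"),
    )
    .reset_index()
)
# summary columns: region, revenue, avg_order, n_orders
```

This avoids the `"_".join(...)` flattening step entirely, at the cost of listing each output column explicitly.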
## Data Cleaning
```python
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and validate DataFrame."""
    # Handle missing values
    df = df.assign(
        # Fill numeric with median
        quantity=df["quantity"].fillna(df["quantity"].median()),
        # Fill categorical with mode
        category=df["category"].fillna(df["category"].mode()[0]),
        # Forward fill dates
        order_date=df["order_date"].ffill(),
    )
    # Remove outliers using IQR
    Q1 = df["total_amount"].quantile(0.25)
    Q3 = df["total_amount"].quantile(0.75)
    IQR = Q3 - Q1
    df = df[
        (df["total_amount"] >= Q1 - 1.5 * IQR)
        & (df["total_amount"] <= Q3 + 1.5 * IQR)
    ]
    # Validate data types
    df = df.astype({
        "product_id": "category",
        "quantity": "int32",
    })
    return df


def merge_datasets(
    orders: pd.DataFrame,
    customers: pd.DataFrame,
    products: pd.DataFrame,
) -> pd.DataFrame:
    """Merge multiple datasets efficiently."""
    return (
        orders
        .merge(customers, on="customer_id", how="left", suffixes=("", "_customer"))
        .merge(products, on="product_id", how="left", suffixes=("", "_product"))
        .pipe(lambda df: df.loc[:, ~df.columns.duplicated()])
    )
```
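When left-joining, it is worth auditing how many rows failed to match. The `indicator=True` option on `merge` adds a `_merge` column for exactly this; a small sketch with invented IDs:

```python
import pandas as pd

orders = pd.DataFrame({"order_id": [1, 2, 3], "customer_id": [10, 20, 99]})
customers = pd.DataFrame({"customer_id": [10, 20], "name": ["Ada", "Bo"]})

# indicator=True adds a _merge column: "both", "left_only", or "right_only"
merged = orders.merge(customers, on="customer_id", how="left", indicator=True)

# Rows with no matching customer record
unmatched = merged[merged["_merge"] == "left_only"]
```

A non-empty `unmatched` frame usually signals stale foreign keys or dirty source data, and is cheaper to catch here than downstream.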
## Performance Optimization
```python
import dask.dataframe as dd


def process_large_dataset(filepath: str) -> pd.DataFrame:
    """Process large datasets with Dask."""
    # Use Dask for out-of-memory computation
    ddf = dd.read_csv(
        filepath,
        dtype={"category": "category"},
        blocksize="64MB",
    )
    result = (
        ddf
        .groupby("category")
        .agg({"amount": "sum", "quantity": "mean"})
        .compute()  # triggers the actual (lazy) computation
    )
    return result


def optimize_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce DataFrame memory usage."""
    df = df.copy()  # avoid mutating the caller's frame
    for col in df.select_dtypes(include=["int"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in df.select_dtypes(include=["float"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    for col in df.select_dtypes(include=["object"]).columns:
        # Only convert low-cardinality columns to category
        if df[col].nunique() / len(df) < 0.5:
            df[col] = df[col].astype("category")
    return df
```
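The payoff from categorical dtypes is easy to measure directly with `memory_usage(deep=True)`. A quick sketch on synthetic data:

```python
import pandas as pd

# 100,000 rows of repeated strings: high duplication, low cardinality
df = pd.DataFrame({"region": ["East", "West"] * 50_000})

before = df.memory_usage(deep=True).sum()
df["region"] = df["region"].astype("category")
after = df.memory_usage(deep=True).sum()

# The category dtype stores two unique strings plus small integer codes,
# so `after` is a fraction of `before`
```

The same before/after check, run per column, is a simple way to verify that a conversion like `optimize_memory` is actually paying off.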
## Best Practices
- Use method chaining for readable transformations
- Apply categorical dtypes for memory efficiency
- Prefer vectorized operations over Python loops
- Use `query()` for readable filtering expressions
- Reach for Dask when datasets exceed available memory
- Profile memory with `df.info(memory_usage="deep")`
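The vectorization advice above is worth making concrete: column-level operations run in compiled code, so both arithmetic and conditional logic can be expressed without `apply()` or an explicit loop. A minimal sketch with invented values:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"quantity": [1, 2, 3], "unit_price": [10.0, 5.0, 2.0]})

# Vectorized arithmetic: one array multiplication, no Python-level loop
df["total"] = df["quantity"] * df["unit_price"]

# Vectorized conditionals: np.where evaluates the whole column at once
df["tier"] = np.where(df["total"] >= 10, "high", "low")
```

On large frames the vectorized form is typically orders of magnitude faster than an equivalent `iterrows()` loop, and it reads as a column-level statement of intent rather than row-level bookkeeping.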
Google Antigravity IDE provides intelligent Pandas suggestions and automatically optimizes your data analysis code for performance.