Source code for stellium.io.dataframe

"""
Parser for pandas DataFrames containing birth data.

This module provides the same flexible parsing as the CSV module,
but works directly with in-memory pandas DataFrames. This is useful
when data comes from databases, Excel files, or other pandas-compatible
sources.

Example usage:
    >>> import pandas as pd
    >>> from stellium.io import parse_dataframe, read_dataframe
    >>>
    >>> # Load data from any source
    >>> df = pd.read_excel("birth_data.xlsx")
    >>> # Or: df = pd.read_sql("SELECT * FROM births", connection)
    >>> # Or: df = pd.read_parquet("data.parquet")
    >>>
    >>> # Auto-detect columns
    >>> natives = parse_dataframe(df)
    >>>
    >>> # Or specify columns explicitly
    >>> natives = read_dataframe(
    ...     df,
    ...     name="Full Name",
    ...     date="DOB",
    ...     time="Birth Time",
    ...     latitude="Lat",
    ...     longitude="Long",
    ... )
"""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

from stellium.core.native import Native
from stellium.exceptions import DataQualityWarning
from stellium.io.csv import (
    CSVColumnMapping,
    _auto_detect_mapping,
    _row_to_native,
)

if TYPE_CHECKING:
    import pandas as pd


def _check_pandas_available() -> None:
    """Check if pandas is available, raise helpful error if not."""
    try:
        import pandas  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "pandas is required for DataFrame parsing. "
            "Install it with: pip install pandas"
        ) from e



[docs]
def parse_dataframe(
    df: pd.DataFrame,
    mapping: CSVColumnMapping | None = None,
    *,
    skip_errors: bool = True,
) -> list[Native]:
    """
    Parse a pandas DataFrame containing birth data into Native objects.

    This function supports flexible DataFrame formats through column mapping.
    If no mapping is provided, it will auto-detect columns based on
    the DataFrame's column names.

    Every cell is converted to a string before the row is handed to the
    shared CSV row parser. Missing cells (``None``, ``NaN``, ``NaT``,
    ``pd.NA``) are converted to the empty string rather than to text such
    as ``"nan"``.

    Args:
        df: pandas DataFrame with birth data
        mapping: Optional column mapping configuration. If None, auto-detects
                 columns from DataFrame column names.
        skip_errors: If True, skip rows that fail to parse and continue;
                     a single DataQualityWarning summarising the skipped
                     rows (first five in detail) is emitted at the end.
                     If False, raise an exception on the first error.

    Returns:
        List of Native objects, one per row that parsed successfully

    Raises:
        ImportError: If pandas is not installed
        ValueError: If skip_errors=False and a row fails to parse. The
                    original exception is chained as the cause.

    Example:
        >>> import pandas as pd
        >>> from stellium.io import parse_dataframe
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Kate Louie", "Albert Einstein"],
        ...     "date": ["1994-01-06", "1879-03-14"],
        ...     "time": ["11:47", "11:30"],
        ...     "latitude": [37.3861, 48.4011],
        ...     "longitude": [-122.0839, 9.9876],
        ... })
        >>> natives = parse_dataframe(df)
        >>> len(natives)
        2

        >>> # With custom column mapping
        >>> from stellium.io.csv import CSVColumnMapping
        >>> custom_df = pd.DataFrame({
        ...     "Full Name": ["Kate Louie"],
        ...     "DOB": ["1994-01-06"],
        ...     "Lat": [37.3861],
        ...     "Lon": [-122.0839],
        ... })
        >>> mapping = CSVColumnMapping(
        ...     name="Full Name",
        ...     date="DOB",
        ...     latitude="Lat",
        ...     longitude="Lon",
        ... )
        >>> natives = parse_dataframe(custom_df, mapping=mapping)
    """
    _check_pandas_available()
    # Local import: at module level pandas is only imported for type checking.
    import pandas as pd

    is_scalar = pd.api.types.is_scalar
    is_missing = pd.isna

    natives: list[Native] = []
    # Row labels come from the DataFrame index and need not be integers.
    errors: list[tuple[object, str]] = []

    # Auto-detect mapping from the column names if none was provided
    if mapping is None:
        mapping = _auto_detect_mapping(list(df.columns))

    for idx, row in df.iterrows():
        # Stringify every cell for the CSV row parser. pandas marks missing
        # cells with NaN/NaT/pd.NA rather than None, so those must become ""
        # as well. The is_scalar guard keeps pd.isna from returning an array
        # when a cell holds a list-like object.
        row_dict = {
            col: ""
            if val is None or (is_scalar(val) and is_missing(val))
            else str(val)
            for col, val in row.items()
        }

        try:
            natives.append(_row_to_native(row_dict, mapping))
        except Exception as e:
            # Deliberately broad: any per-row failure is either collected
            # (skip_errors) or re-raised as ValueError with the row label.
            if not skip_errors:
                raise ValueError(f"Error parsing row {idx}: {e}") from e
            errors.append((idx, str(e)))

    if errors:
        # Keep the warning readable: list at most the first five failures.
        detail = "\n".join(f"  Row {row_idx}: {error}" for row_idx, error in errors[:5])
        if len(errors) > 5:
            detail += f"\n  ... and {len(errors) - 5} more"
        warnings.warn(
            f"Skipped {len(errors)} row(s) with errors:\n{detail}",
            DataQualityWarning,
            stacklevel=2,
        )

    return natives




[docs]
def read_dataframe(
    df: pd.DataFrame,
    *,
    name: str | tuple[str, str] | None = None,
    datetime: str | None = None,
    date: str | None = None,
    time: str | None = None,
    location: str | None = None,
    latitude: str | None = None,
    longitude: str | None = None,
    date_format: str | None = None,
    time_format: str | None = None,
) -> list[Native]:
    """
    Keyword-argument front end for parse_dataframe().

    Column names are given directly as keyword arguments, bundled into a
    CSVColumnMapping, and passed on to parse_dataframe(). When none of the
    column arguments (name, datetime, date, time, location, latitude,
    longitude) is set, no mapping is passed and parse_dataframe()
    auto-detects the columns instead.

    Note:
        date_format and time_format do not count as an explicit mapping on
        their own. If they are the only arguments supplied, the mapping
        built here (including those formats) is not passed on.

    Args:
        df: pandas DataFrame with birth data
        name: Column name for person/event name, or tuple of (first, last)
        datetime: Column name for combined datetime
        date: Column name for date
        time: Column name for time
        location: Column name for location string
        latitude: Column name for latitude
        longitude: Column name for longitude
        date_format: strptime format for dates (e.g., "%d/%m/%Y")
        time_format: strptime format for times (e.g., "%I:%M %p")

    Returns:
        List of Native objects

    Example:
        >>> import pandas as pd
        >>> from stellium.io import read_dataframe
        >>>
        >>> df = pd.DataFrame({
        ...     "Person": ["Kate Louie"],
        ...     "Birthday": ["1994-01-06"],
        ...     "Birth Time": ["11:47"],
        ...     "Lat": [37.3861],
        ...     "Long": [-122.0839],
        ... })
        >>>
        >>> natives = read_dataframe(
        ...     df,
        ...     name="Person",
        ...     date="Birthday",
        ...     time="Birth Time",
        ...     latitude="Lat",
        ...     longitude="Long",
        ... )
    """
    _check_pandas_available()

    explicit_mapping = CSVColumnMapping(
        name=name,
        datetime=datetime,
        date=date,
        time=time,
        location=location,
        latitude=latitude,
        longitude=longitude,
        date_format=date_format,
        time_format=time_format,
    )

    # Only the column selectors decide between explicit mapping and
    # auto-detection; the two format arguments are not consulted here.
    column_selectors = (name, datetime, date, time, location, latitude, longitude)
    if any(column_selectors):
        return parse_dataframe(df, explicit_mapping)

    # Nothing was specified: let parse_dataframe() auto-detect the columns.
    return parse_dataframe(df, None)




[docs]
def dataframe_from_natives(
    natives: list[Native],
    *,
    include_coords: bool = True,
    include_timezone: bool = False,
) -> pd.DataFrame:
    """
    Convert a list of Native objects back to a pandas DataFrame.

    This is useful for exporting processed data or for round-trip operations.

    The result always has the columns ``name``, ``date``, ``time`` and
    ``location``, followed by ``latitude``/``longitude`` and ``timezone``
    when requested. The columns are present even when ``natives`` is empty,
    so downstream code can select them without special-casing.

    NOTE(review): ``date`` and ``time`` are formatted from
    ``native.datetime.utc_datetime``, so they presumably hold UTC values
    rather than local wall-clock time. Confirm this matches what the parsers
    expect before relying on a round-trip.

    Args:
        natives: List of Native objects to convert
        include_coords: Include latitude/longitude columns (default: True)
        include_timezone: Include timezone column (default: False)

    Returns:
        pandas DataFrame with one row per Native

    Raises:
        ImportError: If pandas is not installed

    Example:
        >>> from stellium.io import parse_csv, dataframe_from_natives
        >>>
        >>> natives = parse_csv("birth_data.csv")
        >>> df = dataframe_from_natives(natives)
        >>> df.to_excel("birth_data.xlsx")  # Export to Excel
    """
    _check_pandas_available()
    import pandas as pd

    # Fix the schema up front so an empty input still yields named columns.
    # The order matches the key insertion order of the row dicts below.
    columns = ["name", "date", "time", "location"]
    if include_coords:
        columns += ["latitude", "longitude"]
    if include_timezone:
        columns.append("timezone")

    rows = []
    for native in natives:
        moment = native.datetime.utc_datetime
        place = native.location

        row = {
            "name": native.name or "",
            "date": moment.strftime("%Y-%m-%d"),
            "time": moment.strftime("%H:%M:%S"),
            "location": place.name or "",
        }

        if include_coords:
            row["latitude"] = place.latitude
            row["longitude"] = place.longitude

        if include_timezone:
            row["timezone"] = place.timezone

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)