Source code for stellium.io.csv

"""
Parser for CSV files containing birth data.

CSV files are a common format for batch chart data. This module provides
flexible parsing with configurable column mapping to accommodate different
CSV formats and naming conventions.

Example CSV formats supported:

    # Standard format (auto-detected):
    name,date,time,location
    Kate Louie,1994-01-06,11:47,Mountain View CA

    # Combined datetime:
    name,datetime,latitude,longitude
    Kate,1994-01-06 11:47,37.3861,-122.0839

    # Separate date components:
    first_name,last_name,year,month,day,hour,minute,latitude,longitude
    Kate,Louie,1994,1,6,11,47,37.3861,-122.0839

    # With timezone:
    Name,Birth Date,Birth Time,City,Timezone
    Kate Louie,01/06/1994,11:47 AM,Mountain View CA,America/Los_Angeles
"""

import csv
import datetime as dt
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from stellium.core.models import ChartLocation
from stellium.core.native import Native
from stellium.exceptions import DataQualityWarning



[docs]
@dataclass
class CSVColumnMapping:
    """
    Configuration for mapping CSV columns to Native fields.

    Each column attribute holds the header of the CSV column that supplies
    that piece of data, or None when no such column is used. When a mapping
    is produced by auto-detection, headers are matched case-insensitively
    against a list of aliases. Column names set explicitly are looked up in
    each row exactly as written, so they should match the CSV headers.

    Attributes:
        name: Column(s) for person/event name. Can be a single column name
              or a tuple for (first_name, last_name) to combine.
        datetime: Column for combined datetime string (e.g., "1994-01-06 11:47")
        date: Column for date only (when datetime is split)
        time: Column for time only (when datetime is split)
        year: Column for year (when date is split into components)
        month: Column for month
        day: Column for day
        hour: Column for hour (when time is split into components)
        minute: Column for minute
        second: Column for second
        location: Column for location string (geocoded if no lat/lon, or used
                  as display name if lat/lon are provided)
        latitude: Column for latitude (when using coordinates)
        longitude: Column for longitude (when using coordinates)
        timezone: Column for timezone name (e.g., "America/Los_Angeles").
                  NOTE(review): the row conversion in this module does not
                  appear to read this field; confirm where it is consumed.
        time_unknown: Column indicating if birth time is unknown (bool/flag)
        date_format: Optional strptime format tried first for the date column
        time_format: Optional strptime format tried first for the time column
        datetime_format: Optional strptime format tried first for the
                         combined datetime column

    Date/time handling:
        - The combined datetime column takes precedence, then the date
          column, then the year/month/day columns.
        - When no time value is available the time defaults to 12:00:00.
        - A format hint that does not match a value is not an error; the
          parsers fall back to a list of common formats.

    Location handling:
        - If latitude + longitude are provided: Uses coordinates directly.
          If location is also provided, it's used as the display name.
        - If only location is provided (no lat/lon): Geocodes the string.
    """

    # Name field(s)
    name: str | tuple[str, str] | None = None

    # Datetime options (in order of precedence)
    datetime: str | None = None  # Combined datetime
    date: str | None = None  # Date only
    time: str | None = None  # Time only

    # Date components (used if date column not found)
    year: str | None = None
    month: str | None = None
    day: str | None = None

    # Time components (used if time column not found)
    hour: str | None = None
    minute: str | None = None
    second: str | None = None

    # Location options
    location: str | None = None  # String to geocode, or display name with coords
    latitude: str | None = None  # Numeric latitude
    longitude: str | None = None  # Numeric longitude

    # Optional fields
    timezone: str | None = None  # Timezone name
    time_unknown: str | None = None  # Flag for unknown birth time

    # Date/time format hints (strptime patterns, tried before the built-in ones)
    date_format: str | None = None  # e.g., "%m/%d/%Y" or "%d.%m.%Y"
    time_format: str | None = None  # e.g., "%I:%M %p" (12-hour with AM/PM)
    datetime_format: str | None = None  # e.g., "%Y-%m-%d %H:%M"



# Default column name aliases (case-insensitive).
#
# Keys are logical field names; values are the header spellings accepted for
# that field, listed in priority order (the first alias present in the CSV
# headers wins).
#
# NOTE: "dob" is listed under both "datetime" and "date", so a "dob" header
# is detected as the combined datetime column *and* as the date column.
DEFAULT_ALIASES: dict[str, list[str]] = {
    "name": ["name", "full_name", "fullname", "person", "subject", "native"],
    "first_name": ["first_name", "firstname", "first", "given_name", "givenname"],
    "last_name": ["last_name", "lastname", "last", "surname", "family_name"],
    "datetime": ["datetime", "date_time", "birth_datetime", "birthdatetime", "dob"],
    "date": ["date", "birth_date", "birthdate", "dob", "birthday"],
    "time": ["time", "birth_time", "birthtime", "tob", "time_of_birth"],
    "year": ["year", "birth_year", "yr"],
    "month": ["month", "birth_month", "mon", "mo"],
    "day": ["day", "birth_day", "dy"],
    "hour": ["hour", "hr", "hours"],
    "minute": ["minute", "min", "minutes"],
    "second": ["second", "sec", "seconds"],
    "location": [
        "location",
        "place",
        "birthplace",
        "birth_place",
        "city",
        "address",
        "pob",
        "location_name",
        "place_name",
        "city_name",
        "birth_city",
        "birth_location",
    ],
    "latitude": ["latitude", "lat", "birth_latitude"],
    "longitude": ["longitude", "lon", "lng", "long", "birth_longitude"],
    "timezone": ["timezone", "tz", "time_zone", "tzname"],
    "time_unknown": [
        "time_unknown",
        "unknown_time",
        "no_time",
        "time_uncertain",
        "approximate_time",
    ],
}


def _find_column(
    headers: list[str], target: str, aliases: dict[str, list[str]] | None = None
) -> str | None:
    """
    Find a column name in headers using aliases.

    Args:
        headers: List of CSV column headers
        target: The field we're looking for (e.g., "name", "date")
        aliases: Optional custom aliases dict

    Returns:
        The matching header name, or None if not found
    """
    aliases = aliases or DEFAULT_ALIASES
    target_aliases = aliases.get(target, [target])

    # Normalize headers for case-insensitive matching
    header_map = {h.lower().strip(): h for h in headers}

    for alias in target_aliases:
        if alias.lower() in header_map:
            return header_map[alias.lower()]

    return None


def _auto_detect_mapping(headers: list[str]) -> CSVColumnMapping:
    """
    Build a column mapping by matching CSV headers against known aliases.

    Args:
        headers: List of CSV column headers

    Returns:
        CSVColumnMapping whose column fields hold the detected headers
        (None for every field that could not be matched)
    """

    def detect(field_name: str) -> str | None:
        return _find_column(headers, field_name)

    # A single name column wins; otherwise fall back to first (+ last) name.
    name_spec: str | tuple[str, str] | None = None
    full_name = detect("name")
    if full_name:
        name_spec = full_name
    else:
        given = detect("first_name")
        family = detect("last_name")
        if given and family:
            name_spec = (given, family)
        elif given:
            name_spec = given

    return CSVColumnMapping(
        name=name_spec,
        # Combined or split date/time columns
        datetime=detect("datetime"),
        date=detect("date"),
        time=detect("time"),
        # Individual date components
        year=detect("year"),
        month=detect("month"),
        day=detect("day"),
        # Individual time components
        hour=detect("hour"),
        minute=detect("minute"),
        second=detect("second"),
        # Location, either as text or as coordinates
        location=detect("location"),
        latitude=detect("latitude"),
        longitude=detect("longitude"),
        # Optional extras
        timezone=detect("timezone"),
        time_unknown=detect("time_unknown"),
    )


def _parse_date_string(
    date_str: str, format_hint: str | None = None
) -> tuple[int, int, int]:
    """
    Parse a date string into (year, month, day).

    Supports various common formats:
    - ISO: 1994-01-06
    - US: 01/06/1994, 1/6/1994
    - EU: 06.01.1994, 6.1.1994
    - Text: January 6, 1994

    Args:
        date_str: The date string to parse
        format_hint: Optional strptime format string

    Returns:
        Tuple of (year, month, day)
    """
    date_str = date_str.strip()

    # Try explicit format first
    if format_hint:
        try:
            parsed = dt.datetime.strptime(date_str, format_hint)
            return (parsed.year, parsed.month, parsed.day)
        except ValueError:
            pass  # Fall through to auto-detection

    # Common formats to try
    formats = [
        "%Y-%m-%d",  # ISO: 1994-01-06
        "%Y/%m/%d",  # 1994/01/06
        "%m/%d/%Y",  # US: 01/06/1994
        "%m-%d-%Y",  # US: 01-06-1994
        "%d/%m/%Y",  # EU: 06/01/1994
        "%d-%m-%Y",  # EU: 06-01-1994
        "%d.%m.%Y",  # EU: 06.01.1994
        "%B %d, %Y",  # January 6, 1994
        "%b %d, %Y",  # Jan 6, 1994
        "%d %B %Y",  # 6 January 1994
        "%d %b %Y",  # 6 Jan 1994
    ]

    for fmt in formats:
        try:
            parsed = dt.datetime.strptime(date_str, fmt)
            return (parsed.year, parsed.month, parsed.day)
        except ValueError:
            continue

    raise ValueError(f"Could not parse date: {date_str}")


def _parse_time_string(
    time_str: str, format_hint: str | None = None
) -> tuple[int, int, int]:
    """
    Parse a time string into (hour, minute, second).

    Supports:
    - 24-hour: 11:47, 11:47:30
    - 12-hour: 11:47 AM, 11:47:30 PM

    Args:
        time_str: The time string to parse
        format_hint: Optional strptime format string

    Returns:
        Tuple of (hour, minute, second)
    """
    time_str = time_str.strip()

    # Try explicit format first
    if format_hint:
        try:
            parsed = dt.datetime.strptime(time_str, format_hint)
            return (parsed.hour, parsed.minute, parsed.second)
        except ValueError:
            pass

    # Common formats to try
    formats = [
        "%H:%M:%S",  # 11:47:30
        "%H:%M",  # 11:47
        "%I:%M:%S %p",  # 11:47:30 AM
        "%I:%M %p",  # 11:47 AM
        "%I:%M:%S%p",  # 11:47:30AM (no space)
        "%I:%M%p",  # 11:47AM
    ]

    for fmt in formats:
        try:
            parsed = dt.datetime.strptime(time_str, fmt)
            return (parsed.hour, parsed.minute, parsed.second)
        except ValueError:
            continue

    raise ValueError(f"Could not parse time: {time_str}")


def _parse_datetime_string(
    datetime_str: str, format_hint: str | None = None
) -> dt.datetime:
    """
    Parse a combined datetime string.

    Args:
        datetime_str: The datetime string to parse
        format_hint: Optional strptime format string

    Returns:
        datetime object
    """
    datetime_str = datetime_str.strip()

    # Try explicit format first
    if format_hint:
        try:
            return dt.datetime.strptime(datetime_str, format_hint)
        except ValueError:
            pass

    # Common formats
    formats = [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%dT%H:%M:%S",  # ISO with T
        "%Y-%m-%dT%H:%M",
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%m/%d/%Y %I:%M %p",
        "%d/%m/%Y %H:%M:%S",
        "%d/%m/%Y %H:%M",
        "%d.%m.%Y %H:%M:%S",
        "%d.%m.%Y %H:%M",
    ]

    for fmt in formats:
        try:
            return dt.datetime.strptime(datetime_str, fmt)
        except ValueError:
            continue

    raise ValueError(f"Could not parse datetime: {datetime_str}")


def _get_value(row: dict[str, str], col: str | None) -> str | None:
    """Get a value from a row, handling missing columns gracefully."""
    if col is None:
        return None
    return row.get(col, "").strip() or None


def _parse_bool(value: str | None) -> bool:
    """Parse a boolean value from various string representations."""
    if value is None:
        return False
    value = value.lower().strip()
    return value in ("true", "yes", "1", "y", "t", "x", "unknown")


# Shared TimezoneFinder, created on first use so that its lookup data is
# loaded once per process instead of once per CSV row.
_TIMEZONE_FINDER: Any = None


def _timezone_for_coordinates(latitude: float, longitude: float) -> str:
    """Return the timezone name at the given coordinates, or "UTC" if none is found."""
    global _TIMEZONE_FINDER
    if _TIMEZONE_FINDER is None:
        from timezonefinder import TimezoneFinder

        _TIMEZONE_FINDER = TimezoneFinder()
    return _TIMEZONE_FINDER.timezone_at(lat=latitude, lng=longitude) or "UTC"


def _resolve_time(
    row: dict[str, str], mapping: CSVColumnMapping
) -> tuple[int, int, int] | None:
    """Return (hour, minute, second) from the row's time columns, or None if it has no time."""
    time_str = _get_value(row, mapping.time)
    if time_str:
        return _parse_time_string(time_str, mapping.time_format)

    # Fall back to the individual components; minute and second are optional.
    hour_str = _get_value(row, mapping.hour)
    if hour_str:
        return (
            int(hour_str),
            int(_get_value(row, mapping.minute) or 0),
            int(_get_value(row, mapping.second) or 0),
        )

    return None


def _row_to_native(row: dict[str, str], mapping: CSVColumnMapping) -> Native:
    """
    Convert a CSV row to a Native object using the column mapping.

    The datetime is taken from the first source that yields a value:
    the combined datetime column, then the date column, then the
    year/month/day columns. The last two take their time from the time
    column or, failing that, the hour/minute/second columns. When the row
    carries no time, 12:00:00 is used and the Native is marked
    ``time_unknown``.

    Args:
        row: Dictionary of column_name -> value
        mapping: The column mapping configuration

    Returns:
        Native object

    Raises:
        ValueError: If required fields are missing or invalid
    """
    # === Parse Name ===
    name = None
    if mapping.name:
        if isinstance(mapping.name, tuple):
            first = _get_value(row, mapping.name[0]) or ""
            last = _get_value(row, mapping.name[1]) or ""
            name = f"{first} {last}".strip() or None
        else:
            name = _get_value(row, mapping.name)

    # === Parse DateTime ===
    datetime_obj: dt.datetime | None = None
    time_found = False  # True only when this row really supplied a time
    datetime_error: ValueError | None = None

    # Option 1: Combined datetime column
    if mapping.datetime:
        datetime_str = _get_value(row, mapping.datetime)
        if datetime_str:
            try:
                datetime_obj = _parse_datetime_string(
                    datetime_str, mapping.datetime_format
                )
            except ValueError as exc:
                # The same header can be detected as both the datetime and
                # the date column (e.g. "dob"), and its cells may be
                # date-only. Give the other date sources a chance before
                # rejecting the row.
                if not (mapping.date or mapping.year):
                    raise
                datetime_error = exc
            else:
                # Heuristic: a value without ":" is treated as date-only.
                time_found = ":" in datetime_str

    # Option 2: Separate date and time columns
    if datetime_obj is None and mapping.date:
        date_str = _get_value(row, mapping.date)
        if date_str:
            year, month, day = _parse_date_string(date_str, mapping.date_format)
            time_parts = _resolve_time(row, mapping)
            time_found = time_parts is not None
            hour, minute, second = time_parts or (12, 0, 0)  # Default to noon
            datetime_obj = dt.datetime(year, month, day, hour, minute, second)

    # Option 3: Individual date components
    if datetime_obj is None and mapping.year:
        year_str = _get_value(row, mapping.year)
        month_str = _get_value(row, mapping.month)
        day_str = _get_value(row, mapping.day)

        if year_str and month_str and day_str:
            time_parts = _resolve_time(row, mapping)
            time_found = time_parts is not None
            hour, minute, second = time_parts or (12, 0, 0)  # Default to noon
            datetime_obj = dt.datetime(
                int(year_str), int(month_str), int(day_str), hour, minute, second
            )

    if datetime_obj is None:
        if datetime_error is not None:
            # Nothing else produced a date; report the original parse failure.
            raise datetime_error
        raise ValueError("Could not determine datetime from row")

    # === Parse Location ===
    # NOTE(review): mapping.timezone is not read here; the timezone for
    # coordinate rows always comes from the coordinates themselves.
    location_input: Any = None

    # Location name (used as display name or for geocoding)
    location_str = _get_value(row, mapping.location)

    # Option 1: Latitude and longitude columns
    lat_str = _get_value(row, mapping.latitude)
    lon_str = _get_value(row, mapping.longitude)
    if lat_str and lon_str:
        latitude = float(lat_str)
        longitude = float(lon_str)

        if location_str:
            # Coordinates plus a display name: build the location ourselves.
            location_input = ChartLocation(
                latitude=latitude,
                longitude=longitude,
                name=location_str,
                timezone=_timezone_for_coordinates(latitude, longitude),
            )
        else:
            # Just use coordinates tuple (Native will handle timezone lookup)
            location_input = (latitude, longitude)

    # Option 2: Location string to geocode (no coordinates)
    elif location_str:
        location_input = location_str

    if location_input is None:
        raise ValueError("Could not determine location from row")

    # === Parse Optional Fields ===
    # Unknown when the row says so explicitly, or when the time above had to
    # be defaulted because the row did not provide one.
    time_unknown = (
        _parse_bool(_get_value(row, mapping.time_unknown)) or not time_found
    )

    # Create and return Native
    return Native(
        datetime_input=datetime_obj,
        location_input=location_input,
        name=name,
        time_unknown=time_unknown,
    )



[docs]
def parse_csv(
    path: str | Path,
    mapping: CSVColumnMapping | None = None,
    *,
    delimiter: str = ",",
    encoding: str = "utf-8",
    skip_errors: bool = True,
) -> list[Native]:
    """
    Parse a CSV file containing birth data into Native objects.

    This function supports flexible CSV formats through column mapping.
    If no mapping is provided, it will auto-detect columns based on
    common naming conventions.

    Args:
        path: Path to the CSV file
        mapping: Optional column mapping configuration. If None, auto-detects
                 columns from headers. A supplied mapping is used as given;
                 its column names are not checked against the headers.
        delimiter: CSV delimiter character (default: comma)
        encoding: File encoding (default: utf-8). For UTF-8, a leading
                  byte-order mark is skipped.
        skip_errors: If True, skip rows that fail to parse and continue;
                     a single DataQualityWarning summarizing the skipped
                     rows is issued at the end.
                     If False, raise an exception on the first error.

    Returns:
        List of Native objects, one per valid row in the CSV

    Raises:
        FileNotFoundError: If the file doesn't exist
        ValueError: If the file has no header row, or skip_errors=False and
                    a row fails to parse

    Example:
        # Auto-detect columns
        >>> natives = parse_csv("birth_data.csv")

        # Custom column mapping
        >>> mapping = CSVColumnMapping(
        ...     name="Full Name",
        ...     date="DOB",
        ...     time="Birth Time",
        ...     location="Birth Place",
        ... )
        >>> natives = parse_csv("birth_data.csv", mapping=mapping)

        # With date format hint for ambiguous dates
        >>> mapping = CSVColumnMapping(
        ...     date="date",
        ...     date_format="%d/%m/%Y",  # European format
        ... )
        >>> natives = parse_csv("european_data.csv", mapping=mapping)
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"CSV file not found: {path}")

    # Spreadsheet exports often start UTF-8 files with a byte-order mark.
    # Decoded as plain "utf-8" the mark is glued onto the first header
    # ("\ufeffname"), which then matches nothing. "utf-8-sig" reads BOM-less
    # files identically and drops the mark when it is present.
    if encoding.lower().replace("_", "-") in ("utf-8", "utf8"):
        read_encoding = "utf-8-sig"
    else:
        read_encoding = encoding

    natives: list[Native] = []
    errors: list[tuple[int, str]] = []

    with open(path, encoding=read_encoding, newline="") as f:
        reader = csv.DictReader(f, delimiter=delimiter)

        if reader.fieldnames is None:
            raise ValueError("CSV file has no headers")

        # Auto-detect mapping if not provided
        if mapping is None:
            mapping = _auto_detect_mapping(list(reader.fieldnames))

        for i, row in enumerate(reader, start=2):  # Start at 2 (header is row 1)
            try:
                natives.append(_row_to_native(row, mapping))
            except Exception as e:
                # Deliberately broad: one malformed row must not abort a
                # batch import. The failure is reported, never dropped.
                if not skip_errors:
                    raise ValueError(f"Error parsing row {i}: {e}") from e
                errors.append((i, str(e)))

    if errors:
        # Show only the first few failures to keep the warning readable.
        detail = "\n".join(f"  Row {row_num}: {error}" for row_num, error in errors[:5])
        if len(errors) > 5:
            detail += f"\n  ... and {len(errors) - 5} more"
        warnings.warn(
            f"Skipped {len(errors)} row(s) with errors:\n{detail}",
            DataQualityWarning,
            stacklevel=2,
        )

    return natives



# Convenience function for simple usage

[docs]
def read_csv(
    path: str | Path,
    *,
    name: str | tuple[str, str] | None = None,
    datetime: str | None = None,
    date: str | None = None,
    time: str | None = None,
    location: str | None = None,
    latitude: str | None = None,
    longitude: str | None = None,
    date_format: str | None = None,
    time_format: str | None = None,
) -> list[Native]:
    """
    Simple interface for reading CSV files with common column configurations.

    This is a convenience wrapper around parse_csv() that allows specifying
    column names as keyword arguments. When no column name is given, the
    columns are auto-detected from the headers; format hints are still
    honored in that case.

    Args:
        path: Path to the CSV file
        name: Column name for person/event name, or tuple of (first, last)
        datetime: Column name for combined datetime
        date: Column name for date
        time: Column name for time
        location: Column name for location string
        latitude: Column name for latitude
        longitude: Column name for longitude
        date_format: strptime format for dates (e.g., "%d/%m/%Y")
        time_format: strptime format for times (e.g., "%I:%M %p")

    Returns:
        List of Native objects

    Example:
        # Simple auto-detection
        >>> natives = read_csv("data.csv")

        # Auto-detection with a hint for ambiguous dates
        >>> natives = read_csv("data.csv", date_format="%d/%m/%Y")

        # Specify key columns
        >>> natives = read_csv(
        ...     "data.csv",
        ...     name="Full Name",
        ...     date="DOB",
        ...     time="Birth Time",
        ...     location="City",
        ... )

        # Combined first/last name
        >>> natives = read_csv(
        ...     "data.csv",
        ...     name=("First Name", "Last Name"),
        ...     datetime="Birth DateTime",
        ...     latitude="Lat",
        ...     longitude="Long",
        ... )
    """
    has_explicit_mapping = any(
        [name, datetime, date, time, location, latitude, longitude]
    )

    if has_explicit_mapping:
        return parse_csv(
            path,
            CSVColumnMapping(
                name=name,
                datetime=datetime,
                date=date,
                time=time,
                location=location,
                latitude=latitude,
                longitude=longitude,
                date_format=date_format,
                time_format=time_format,
            ),
        )

    if not (date_format or time_format):
        # Nothing specified at all: plain auto-detection.
        return parse_csv(path)

    # Only format hints were given. parse_csv() auto-detects columns only
    # when it receives no mapping at all, which would discard the hints, so
    # detect the columns here and attach the hints to the result.
    csv_path = Path(path)
    if csv_path.exists():
        # Same delimiter and encoding defaults as parse_csv().
        with open(csv_path, encoding="utf-8", newline="") as handle:
            headers = next(csv.reader(handle), None)
        if headers:
            detected = _auto_detect_mapping(headers)
            detected.date_format = date_format
            detected.time_format = time_format
            return parse_csv(csv_path, detected)

    # Missing or header-less file: let parse_csv() report it as usual.
    return parse_csv(path)