Source code for stellium.io.csv

"""
Parser for CSV files containing birth data.

CSV files are a common format for batch chart data. This module provides
flexible parsing with configurable column mapping to accommodate different
CSV formats and naming conventions.

Example CSV formats supported:

    # Standard format (auto-detected):
    name,date,time,location
    Kate Louie,1994-01-06,11:47,Mountain View CA

    # Combined datetime:
    name,datetime,place
    Kate,1994-01-06 11:47,37.3861,-122.0839

    # Separate date components:
    first_name,last_name,year,month,day,hour,minute,latitude,longitude
    Kate,Louie,1994,1,6,11,47,37.3861,-122.0839

    # With timezone:
    Name,Birth Date,Birth Time,City,Timezone
    Kate Louie,01/06/1994,11:47 AM,Mountain View CA,America/Los_Angeles
"""

import csv
import datetime as dt
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from stellium.core.models import ChartLocation
from stellium.core.native import Native


[docs] @dataclass class CSVColumnMapping: """ Configuration for mapping CSV columns to Native fields. This allows flexible handling of different CSV formats. All column names are case-insensitive and support multiple aliases. Attributes: name: Column(s) for person/event name. Can be a single column name or a tuple for (first_name, last_name) to combine. datetime: Column for combined datetime string (e.g., "1994-01-06 11:47") date: Column for date only (when datetime is split) time: Column for time only (when datetime is split) year: Column for year (when date is split into components) month: Column for month day: Column for day hour: Column for hour (when time is split into components) minute: Column for minute second: Column for second location: Column for location string (geocoded if no lat/lon, or used as display name if lat/lon are provided) latitude: Column for latitude (when using coordinates) longitude: Column for longitude (when using coordinates) timezone: Column for timezone name (e.g., "America/Los_Angeles") time_unknown: Column indicating if birth time is unknown (bool/flag) Location handling: - If latitude + longitude are provided: Uses coordinates directly. If location is also provided, it's used as the display name. - If only location is provided (no lat/lon): Geocodes the string. """ # Name field(s) name: str | tuple[str, str] | None = None # Datetime options (in order of precedence) datetime: str | None = None # Combined datetime date: str | None = None # Date only time: str | None = None # Time only # Date components (used if date column not found) year: str | None = None month: str | None = None day: str | None = None # Time components (used if time column not found) hour: str | None = None minute: str | None = None second: str | None = None # Location options location: str | None = None # String to geocode, or display name with coords latitude: str | None = None # Numeric latitude longitude: str | None = None # Numeric longitude # Optional fields timezone: str | None = None # Timezone name time_unknown: str | None = None # Flag for unknown birth time # Date/time format hints date_format: str | None = None # e.g., "%m/%d/%Y" or "%d.%m.%Y" time_format: str | None = None # e.g., "%I:%M %p" (12-hour with AM/PM) datetime_format: str | None = None # e.g., "%Y-%m-%d %H:%M"
# Default column name aliases (case-insensitive) DEFAULT_ALIASES: dict[str, list[str]] = { "name": ["name", "full_name", "fullname", "person", "subject", "native"], "first_name": ["first_name", "firstname", "first", "given_name", "givenname"], "last_name": ["last_name", "lastname", "last", "surname", "family_name"], "datetime": ["datetime", "date_time", "birth_datetime", "birthdatetime", "dob"], "date": ["date", "birth_date", "birthdate", "dob", "birthday"], "time": ["time", "birth_time", "birthtime", "tob", "time_of_birth"], "year": ["year", "birth_year", "yr"], "month": ["month", "birth_month", "mon", "mo"], "day": ["day", "birth_day", "dy"], "hour": ["hour", "hr", "hours"], "minute": ["minute", "min", "minutes"], "second": ["second", "sec", "seconds"], "location": [ "location", "place", "birthplace", "birth_place", "city", "address", "pob", "location_name", "place_name", "city_name", "birth_city", "birth_location", ], "latitude": ["latitude", "lat", "birth_latitude"], "longitude": ["longitude", "lon", "lng", "long", "birth_longitude"], "timezone": ["timezone", "tz", "time_zone", "tzname"], "time_unknown": [ "time_unknown", "unknown_time", "no_time", "time_uncertain", "approximate_time", ], } def _find_column( headers: list[str], target: str, aliases: dict[str, list[str]] | None = None ) -> str | None: """ Find a column name in headers using aliases. Args: headers: List of CSV column headers target: The field we're looking for (e.g., "name", "date") aliases: Optional custom aliases dict Returns: The matching header name, or None if not found """ aliases = aliases or DEFAULT_ALIASES target_aliases = aliases.get(target, [target]) # Normalize headers for case-insensitive matching header_map = {h.lower().strip(): h for h in headers} for alias in target_aliases: if alias.lower() in header_map: return header_map[alias.lower()] return None def _auto_detect_mapping(headers: list[str]) -> CSVColumnMapping: """ Auto-detect column mapping from CSV headers. Args: headers: List of CSV column headers Returns: CSVColumnMapping with detected column names """ mapping = CSVColumnMapping() # Try to find name column name_col = _find_column(headers, "name") if name_col: mapping.name = name_col else: # Try first_name + last_name first = _find_column(headers, "first_name") last = _find_column(headers, "last_name") if first and last: mapping.name = (first, last) elif first: mapping.name = first # Datetime detection mapping.datetime = _find_column(headers, "datetime") mapping.date = _find_column(headers, "date") mapping.time = _find_column(headers, "time") # Date components mapping.year = _find_column(headers, "year") mapping.month = _find_column(headers, "month") mapping.day = _find_column(headers, "day") # Time components mapping.hour = _find_column(headers, "hour") mapping.minute = _find_column(headers, "minute") mapping.second = _find_column(headers, "second") # Location mapping.location = _find_column(headers, "location") mapping.latitude = _find_column(headers, "latitude") mapping.longitude = _find_column(headers, "longitude") # Optional mapping.timezone = _find_column(headers, "timezone") mapping.time_unknown = _find_column(headers, "time_unknown") return mapping def _parse_date_string( date_str: str, format_hint: str | None = None ) -> tuple[int, int, int]: """ Parse a date string into (year, month, day). Supports various common formats: - ISO: 1994-01-06 - US: 01/06/1994, 1/6/1994 - EU: 06.01.1994, 6.1.1994 - Text: January 6, 1994 Args: date_str: The date string to parse format_hint: Optional strptime format string Returns: Tuple of (year, month, day) """ date_str = date_str.strip() # Try explicit format first if format_hint: try: parsed = dt.datetime.strptime(date_str, format_hint) return (parsed.year, parsed.month, parsed.day) except ValueError: pass # Fall through to auto-detection # Common formats to try formats = [ "%Y-%m-%d", # ISO: 1994-01-06 "%Y/%m/%d", # 1994/01/06 "%m/%d/%Y", # US: 01/06/1994 "%m-%d-%Y", # US: 01-06-1994 "%d/%m/%Y", # EU: 06/01/1994 "%d-%m-%Y", # EU: 06-01-1994 "%d.%m.%Y", # EU: 06.01.1994 "%B %d, %Y", # January 6, 1994 "%b %d, %Y", # Jan 6, 1994 "%d %B %Y", # 6 January 1994 "%d %b %Y", # 6 Jan 1994 ] for fmt in formats: try: parsed = dt.datetime.strptime(date_str, fmt) return (parsed.year, parsed.month, parsed.day) except ValueError: continue raise ValueError(f"Could not parse date: {date_str}") def _parse_time_string( time_str: str, format_hint: str | None = None ) -> tuple[int, int, int]: """ Parse a time string into (hour, minute, second). Supports: - 24-hour: 11:47, 11:47:30 - 12-hour: 11:47 AM, 11:47:30 PM Args: time_str: The time string to parse format_hint: Optional strptime format string Returns: Tuple of (hour, minute, second) """ time_str = time_str.strip() # Try explicit format first if format_hint: try: parsed = dt.datetime.strptime(time_str, format_hint) return (parsed.hour, parsed.minute, parsed.second) except ValueError: pass # Common formats to try formats = [ "%H:%M:%S", # 11:47:30 "%H:%M", # 11:47 "%I:%M:%S %p", # 11:47:30 AM "%I:%M %p", # 11:47 AM "%I:%M:%S%p", # 11:47:30AM (no space) "%I:%M%p", # 11:47AM ] for fmt in formats: try: parsed = dt.datetime.strptime(time_str, fmt) return (parsed.hour, parsed.minute, parsed.second) except ValueError: continue raise ValueError(f"Could not parse time: {time_str}") def _parse_datetime_string( datetime_str: str, format_hint: str | None = None ) -> dt.datetime: """ Parse a combined datetime string. Args: datetime_str: The datetime string to parse format_hint: Optional strptime format string Returns: datetime object """ datetime_str = datetime_str.strip() # Try explicit format first if format_hint: try: return dt.datetime.strptime(datetime_str, format_hint) except ValueError: pass # Common formats formats = [ "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%dT%H:%M:%S", # ISO with T "%Y-%m-%dT%H:%M", "%m/%d/%Y %H:%M:%S", "%m/%d/%Y %H:%M", "%m/%d/%Y %I:%M %p", "%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M", "%d.%m.%Y %H:%M:%S", "%d.%m.%Y %H:%M", ] for fmt in formats: try: return dt.datetime.strptime(datetime_str, fmt) except ValueError: continue raise ValueError(f"Could not parse datetime: {datetime_str}") def _get_value(row: dict[str, str], col: str | None) -> str | None: """Get a value from a row, handling missing columns gracefully.""" if col is None: return None return row.get(col, "").strip() or None def _parse_bool(value: str | None) -> bool: """Parse a boolean value from various string representations.""" if value is None: return False value = value.lower().strip() return value in ("true", "yes", "1", "y", "t", "x", "unknown") def _row_to_native(row: dict[str, str], mapping: CSVColumnMapping) -> Native: """ Convert a CSV row to a Native object using the column mapping. Args: row: Dictionary of column_name -> value mapping: The column mapping configuration Returns: Native object Raises: ValueError: If required fields are missing or invalid """ # === Parse Name === name = None if mapping.name: if isinstance(mapping.name, tuple): first = _get_value(row, mapping.name[0]) or "" last = _get_value(row, mapping.name[1]) or "" name = f"{first} {last}".strip() or None else: name = _get_value(row, mapping.name) # === Parse DateTime === datetime_obj: dt.datetime | None = None # Option 1: Combined datetime column if mapping.datetime: datetime_str = _get_value(row, mapping.datetime) if datetime_str: datetime_obj = _parse_datetime_string(datetime_str, mapping.datetime_format) # Option 2: Separate date and time columns if datetime_obj is None and mapping.date: date_str = _get_value(row, mapping.date) if date_str: year, month, day = _parse_date_string(date_str, mapping.date_format) # Get time hour, minute, second = 12, 0, 0 # Default to noon if mapping.time: time_str = _get_value(row, mapping.time) if time_str: hour, minute, second = _parse_time_string( time_str, mapping.time_format ) datetime_obj = dt.datetime(year, month, day, hour, minute, second) # Option 3: Individual date/time components if datetime_obj is None and mapping.year: year_str = _get_value(row, mapping.year) month_str = _get_value(row, mapping.month) day_str = _get_value(row, mapping.day) if year_str and month_str and day_str: year = int(year_str) month = int(month_str) day = int(day_str) # Get time components hour = int(_get_value(row, mapping.hour) or 12) minute = int(_get_value(row, mapping.minute) or 0) second = int(_get_value(row, mapping.second) or 0) datetime_obj = dt.datetime(year, month, day, hour, minute, second) if datetime_obj is None: raise ValueError("Could not determine datetime from row") # === Parse Location === location_input: Any = None # Get location name (used as display name or for geocoding) location_str = _get_value(row, mapping.location) if mapping.location else None # Option 1: Latitude and longitude columns lat_str = _get_value(row, mapping.latitude) lon_str = _get_value(row, mapping.longitude) if lat_str and lon_str: latitude = float(lat_str) longitude = float(lon_str) # If we also have a location name, create a ChartLocation with it if location_str: # Use timezonefinder to get timezone from coordinates from timezonefinder import TimezoneFinder tf = TimezoneFinder() timezone_name = tf.timezone_at(lat=latitude, lng=longitude) or "UTC" location_input = ChartLocation( latitude=latitude, longitude=longitude, name=location_str, timezone=timezone_name, ) else: # Just use coordinates tuple (Native will handle timezone lookup) location_input = (latitude, longitude) # Option 2: Location string to geocode (no coordinates) elif location_str: location_input = location_str if location_input is None: raise ValueError("Could not determine location from row") # === Parse Optional Fields === time_unknown = False if mapping.time_unknown: time_unknown = _parse_bool(_get_value(row, mapping.time_unknown)) # If no time column was found/provided, mark as unknown if not time_unknown: has_time = bool( mapping.time or mapping.hour or (mapping.datetime and ":" in str(_get_value(row, mapping.datetime))) ) if not has_time: time_unknown = True # Create and return Native return Native( datetime_input=datetime_obj, location_input=location_input, name=name, time_unknown=time_unknown, )
[docs] def parse_csv( path: str | Path, mapping: CSVColumnMapping | None = None, *, delimiter: str = ",", encoding: str = "utf-8", skip_errors: bool = True, ) -> list[Native]: """ Parse a CSV file containing birth data into Native objects. This function supports flexible CSV formats through column mapping. If no mapping is provided, it will auto-detect columns based on common naming conventions. Args: path: Path to the CSV file mapping: Optional column mapping configuration. If None, auto-detects columns from headers. delimiter: CSV delimiter character (default: comma) encoding: File encoding (default: utf-8) skip_errors: If True, skip rows that fail to parse and continue. If False, raise an exception on the first error. Returns: List of Native objects, one per valid row in the CSV Raises: FileNotFoundError: If the file doesn't exist ValueError: If required columns are missing or skip_errors=False and a row fails to parse Example: # Auto-detect columns >>> natives = parse_csv("birth_data.csv") # Custom column mapping >>> mapping = CSVColumnMapping( ... name="Full Name", ... date="DOB", ... time="Birth Time", ... location="Birth Place", ... ) >>> natives = parse_csv("birth_data.csv", mapping=mapping) # With date format hint for ambiguous dates >>> mapping = CSVColumnMapping( ... date="date", ... date_format="%d/%m/%Y", # European format ... ) >>> natives = parse_csv("european_data.csv", mapping=mapping) """ path = Path(path) if not path.exists(): raise FileNotFoundError(f"CSV file not found: {path}") natives: list[Native] = [] errors: list[tuple[int, str]] = [] with open(path, encoding=encoding, newline="") as f: reader = csv.DictReader(f, delimiter=delimiter) if reader.fieldnames is None: raise ValueError("CSV file has no headers") headers = list(reader.fieldnames) # Auto-detect mapping if not provided if mapping is None: mapping = _auto_detect_mapping(headers) else: # Validate that mapped columns exist # (User-provided mapping should use actual column names) pass for i, row in enumerate(reader, start=2): # Start at 2 (header is row 1) try: native = _row_to_native(row, mapping) natives.append(native) except Exception as e: if skip_errors: errors.append((i, str(e))) else: raise ValueError(f"Error parsing row {i}: {e}") from e if errors: print(f"Warning: Skipped {len(errors)} row(s) with errors:") for row_num, error in errors[:5]: # Show first 5 errors print(f" Row {row_num}: {error}") if len(errors) > 5: print(f" ... and {len(errors) - 5} more") return natives
# Convenience function for simple usage
[docs] def read_csv( path: str | Path, *, name: str | tuple[str, str] | None = None, datetime: str | None = None, date: str | None = None, time: str | None = None, location: str | None = None, latitude: str | None = None, longitude: str | None = None, date_format: str | None = None, time_format: str | None = None, ) -> list[Native]: """ Simple interface for reading CSV files with common column configurations. This is a convenience wrapper around parse_csv() that allows specifying column names as keyword arguments. Args: path: Path to the CSV file name: Column name for person/event name, or tuple of (first, last) datetime: Column name for combined datetime date: Column name for date time: Column name for time location: Column name for location string latitude: Column name for latitude longitude: Column name for longitude date_format: strptime format for dates (e.g., "%d/%m/%Y") time_format: strptime format for times (e.g., "%I:%M %p") Returns: List of Native objects Example: # Simple auto-detection >>> natives = read_csv("data.csv") # Specify key columns >>> natives = read_csv( ... "data.csv", ... name="Full Name", ... date="DOB", ... time="Birth Time", ... location="City", ... ) # Combined first/last name >>> natives = read_csv( ... "data.csv", ... name=("First Name", "Last Name"), ... datetime="Birth DateTime", ... latitude="Lat", ... longitude="Long", ... ) """ mapping = CSVColumnMapping( name=name, datetime=datetime, date=date, time=time, location=location, latitude=latitude, longitude=longitude, date_format=date_format, time_format=time_format, ) # If all mapping fields are None, use auto-detection has_explicit_mapping = any( [name, datetime, date, time, location, latitude, longitude] ) return parse_csv(path, mapping if has_explicit_mapping else None)