#!/usr/bin/env python3
"""Compute per-site suitability indicators from existing pipeline outputs.

The script is intentionally schema-tolerant: it prints one site's discovered JSON
structure first, then uses a small set of common field-name conventions to compute
SNR, S2 archive density, and S2-S3 GCC coherence.
"""

from __future__ import annotations

import argparse
import json
import math
import re
from collections.abc import Iterable
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from scipy.interpolate import UnivariateSpline
from scipy.stats import pearsonr


# File name of the JSON report that main() writes inside the base directory.
OUTPUT_NAME = "suitability_screening.json"
# A site's "snr_pass" flag is true when its SNR is >= this value (see compute_site).
SNR_THRESHOLD = 2.0
# Largest date gap, in days, allowed when compute_coherence pairs S2 and S3 dates.
MATCH_TOLERANCE_DAYS = 2


def load_json(path: Path) -> Any | None:
    """Load a JSON document, or return ``None`` when it cannot be used.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value, or ``None`` when ``path`` is not a regular
        file or when reading/decoding fails (a ``[WARN]`` line is printed in
        the failure case).
    """
    if not path.is_file():
        return None
    try:
        with path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (file is not valid UTF-8); either way the file is
        # unusable and must not abort processing of the remaining sites.
        print(f"[WARN] Could not read JSON {path}: {exc}")
        return None


def jsonable_float(value: Any) -> float | None:
    """Convert ``value`` to a finite float, or return ``None``.

    Booleans are rejected explicitly (``float(True)`` would otherwise be 1.0).
    Values that cannot be converted, that overflow a float, or that convert to
    NaN / infinity also yield ``None``.
    """
    if isinstance(value, bool):
        return None
    try:
        out = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large for a double, e.g. 10**400.
        return None
    return out if math.isfinite(out) else None


def parse_date(value: Any) -> pd.Timestamp | None:
    """Parse ``value`` into a timezone-naive timestamp at midnight.

    Parsing order for non-Timestamp input (converted with ``str``):
      1. every standalone 8-digit run is tried as ``YYYYMMDD`` until one is a
         valid date (so an unrelated numeric id earlier in a file name does
         not hide the real date);
      2. otherwise the whole text is handed to ``pd.to_datetime``.

    Timezone-aware results, including ``pd.Timestamp`` inputs, are converted
    with ``tz_convert(None)`` before normalising, so every returned value is
    naive and comparable with the others.

    Returns:
        The normalised timestamp, or ``None`` when nothing could be parsed.
    """
    if value is None:
        return None

    if isinstance(value, pd.Timestamp):
        ts = value
    else:
        text = str(value).strip()
        if not text:
            return None
        ts = None
        for match in re.finditer(r"(?<!\d)(\d{8})(?!\d)", text):
            try:
                ts = pd.to_datetime(match.group(1), format="%Y%m%d")
            except (TypeError, ValueError):
                continue  # not a calendar date; try the next 8-digit run
            break
        if ts is None:
            try:
                ts = pd.to_datetime(text, errors="coerce")
            except (TypeError, ValueError, OverflowError):
                return None

    if pd.isna(ts):
        return None
    if getattr(ts, "tzinfo", None) is not None:
        ts = ts.tz_convert(None)
    return pd.Timestamp(ts).normalize()


def compact(value: Any, *, max_text: int = 220) -> Any:
    """Shrink ``value`` so it can be printed in the discovery log.

    Lists keep their first two items and dicts their first twelve entries,
    each shrunk recursively. Any other value is returned unchanged unless its
    ``repr`` is longer than ``max_text``, in which case the truncated ``repr``
    (ending in ``...``) is returned instead.
    """
    if isinstance(value, list):
        return [compact(item, max_text=max_text) for item in value[:2]]

    if isinstance(value, dict):
        shortened = {}
        for position, (key, item) in enumerate(value.items()):
            if position >= 12:
                break
            shortened[key] = compact(item, max_text=max_text)
        return shortened

    rendered = repr(value)
    if len(rendered) <= max_text:
        return value
    return rendered[: max_text - 3] + "..."


def top_keys(data: Any) -> list[str]:
    """List the keys visible at the top of a JSON value.

    For a dict the keys are returned in their original order. For a list whose
    first item is a dict, the sorted union of keys over the dict items among
    the first five entries is returned; non-dict items in that window are
    ignored rather than raising. Anything else yields an empty list.
    """
    if isinstance(data, dict):
        return list(data.keys())
    if isinstance(data, list) and data and isinstance(data[0], dict):
        keys: set[str] = set()
        for entry in data[:5]:
            if isinstance(entry, dict):
                keys.update(entry.keys())
        return sorted(keys)
    return []


def normalize_records(data: Any) -> list[dict[str, Any]]:
    """Convert common JSON shapes into a list of record dictionaries.

    Handled shapes, in order:
      * ``None`` or an empty dict -> no records;
      * a list -> one record per item (non-dict items become ``{"value": item}``);
      * a non-dict scalar -> a single ``{"value": data}`` record;
      * a dict holding a list under one of the known container keys -> that
        list, normalised recursively;
      * a dict with no list/tuple values -> treated as keyed by date or file
        name: each key becomes the record's ``"date"`` unless the value is a
        dict that already has one;
      * any other dict -> a single record that is a shallow copy of it.
    """
    if data is None:
        return []
    if isinstance(data, list):
        return [dict(item) if isinstance(item, dict) else {"value": item} for item in data]
    if not isinstance(data, dict):
        return [{"value": data}]
    if not data:
        # An empty object carries no acquisitions; returning [{}] would create
        # a phantom record that downstream counting treats as a usable entry.
        return []

    for key in ("timeseries", "time_series", "data", "entries", "results", "records"):
        value = data.get(key)
        if isinstance(value, list):
            return normalize_records(value)

    # Dict keyed by date or filename.
    if all(not isinstance(v, (list, tuple)) for v in data.values()):
        records = []
        for key, value in data.items():
            if isinstance(value, dict):
                record = dict(value)
                record.setdefault("date", key)
            else:
                record = {"date": key, "value": value}
            records.append(record)
        return records

    return [dict(data)]


def first_records(data: Any, count: int = 2) -> list[Any]:
    """Return at most ``count`` leading records of ``data`` after normalisation."""
    return normalize_records(data)[:count]


def recursive_snr_candidates(data: Any, prefix: str = "") -> list[tuple[str, Any]]:
    """Collect ``(path, value)`` pairs for every key whose name contains "snr".

    Dicts are walked in key order and paths are dot-joined; lists contribute
    only their first ten items, addressed as ``prefix[i]``. A matching key is
    reported and its value is still descended into. Scalars yield nothing.
    """
    if isinstance(data, list):
        hits: list[tuple[str, Any]] = []
        for index, item in enumerate(data[:10]):
            hits += recursive_snr_candidates(item, f"{prefix}[{index}]")
        return hits

    if not isinstance(data, dict):
        return []

    hits = []
    for key, item in data.items():
        here = str(key) if not prefix else f"{prefix}.{key}"
        if "snr" in str(key).lower():
            hits.append((here, item))
        hits += recursive_snr_candidates(item, here)
    return hits


def find_numeric_snr(data: Any) -> float | None:
    """Return the first usable numeric SNR found anywhere in ``data``.

    Candidates whose final dotted path segment is exactly "snr" (case-insensitive)
    are tried before other snr-like keys; within each group discovery order
    is kept. A dict-valued candidate is also probed for a direct ``"snr"`` entry.
    """

    def rank(candidate: tuple[str, Any]) -> int:
        leaf = candidate[0].split(".")[-1]
        return int(leaf.lower() != "snr")

    # sorted() is stable, so ties keep the order produced by the recursive scan.
    for _path, raw in sorted(recursive_snr_candidates(data), key=rank):
        direct = jsonable_float(raw)
        if direct is not None:
            return direct
        if isinstance(raw, dict):
            inner = jsonable_float(raw.get("snr"))
            if inner is not None:
                return inner
    return None


def find_site_roots(base_dir: Path) -> list[tuple[str, Path]]:
    """Locate site directories one or two levels below ``base_dir``.

    A directory counts as a site root when it contains ``metrics.json``,
    ``raw/preselection``, ``phenocam`` or ``raw/phenocam``. Direct children are
    checked first; a child that is not itself a site root has its own
    sub-directories checked instead. A nested root is named after the parent
    when the sub-directory name is all digits, otherwise ``parent_child``.

    Returns:
        ``(site_name, path)`` pairs in sorted directory order; empty when
        ``base_dir`` is not a directory.
    """
    if not base_dir.is_dir():
        return []

    markers = (
        ("metrics.json",),
        ("raw", "preselection"),
        ("phenocam",),
        ("raw", "phenocam"),
    )

    def looks_like_site_root(path: Path) -> bool:
        for parts in markers:
            if path.joinpath(*parts).exists():
                return True
        return False

    def subdirectories(parent: Path) -> list[Path]:
        return sorted(p for p in parent.iterdir() if p.is_dir())

    found: list[tuple[str, Path]] = []
    for site_dir in subdirectories(base_dir):
        if looks_like_site_root(site_dir):
            found.append((site_dir.name, site_dir))
            continue
        for nested in subdirectories(site_dir):
            if not looks_like_site_root(nested):
                continue
            if nested.name.isdigit():
                label = site_dir.name
            else:
                label = f"{site_dir.name}_{nested.name}"
            found.append((label, nested))
    return found


def find_s2_preselection(site_root: Path) -> Path | None:
    """Return the site's ``s2_preselection.json`` if present.

    ``raw/preselection`` is checked before ``preselection``; ``None`` is
    returned when neither location holds the file.
    """
    for parent in (site_root / "raw" / "preselection", site_root / "preselection"):
        candidate = parent / "s2_preselection.json"
        if candidate.is_file():
            return candidate
    return None


def find_s3_timeseries(site_root: Path) -> Path | None:
    """Return the S3 GCC ``timeseries.json`` of the aggressive sigma20 run.

    The two known directory names are tried first. Failing that, any
    ``processed*aggressive*sigma20*`` directory is accepted and the first match
    in sorted order wins. Directories for a non-aggressive run
    (``nonaggressive``, ``non_aggressive``, ``non-aggressive``) also match that
    glob because they contain the word "aggressive"; they are filtered out so
    the wrong strategy is never returned.

    Returns:
        Path to the time-series file, or ``None`` when none is found.
    """
    for folder in ("processed_aggressive_sigma20", "processed_aggressive_itb_sigma20"):
        candidate = site_root / folder / "gcc" / "s3" / "timeseries.json"
        if candidate.is_file():
            return candidate

    matches = sorted(
        path
        for path in site_root.glob("processed*aggressive*sigma20*/gcc/s3/timeseries.json")
        if path.is_file()
        and not re.search(
            r"non[_-]?aggressive",
            path.relative_to(site_root).parts[0],  # the processed* directory name
            re.IGNORECASE,
        )
    )
    return matches[0] if matches else None


def find_metrics(site_root: Path) -> Path | None:
    """Return ``site_root/metrics.json`` when it is a regular file, else ``None``."""
    candidate = site_root.joinpath("metrics.json")
    if candidate.is_file():
        return candidate
    return None


def find_phenocam(site_root: Path) -> Path | None:
    """Return the site's PhenoCam GCC JSON file, or ``None`` if there is none.

    Exact names (``gcc_90.json`` then ``phenocam_gcc.json``) are checked under
    ``phenocam`` and then ``raw/phenocam``. If none exists, progressively looser
    glob patterns are tried in order and the alphabetically first match of the
    first pattern that matches anything is returned.
    """
    for folder in (site_root / "phenocam", site_root / "raw" / "phenocam"):
        for filename in ("gcc_90.json", "phenocam_gcc.json"):
            exact = folder / filename
            if exact.is_file():
                return exact

    fallback_patterns = (
        "phenocam/*gcc*90*.json",
        "phenocam/*gcc*.json",
        "raw/phenocam/*gcc*90*.json",
        "raw/phenocam/*gcc*.json",
        "raw/phenocam/*.json",
    )
    for pattern in fallback_patterns:
        # min() picks the same element that sorting and taking index 0 would.
        first_hit = min(site_root.glob(pattern), default=None)
        if first_hit is not None:
            return first_hit
    return None


def print_structure(label: str, path: Path | None) -> None:
    """Print a short description of one input file for discovery mode.

    Prints the label, then either "missing" or the file's path, top-level JSON
    type and keys. For ``metrics.json`` the phenocam-like top-level keys and
    all snr-like keys are listed; for every other label the first two
    normalised records are dumped in compact form.
    """
    print(f"\n[{label}]")
    if path is None:
        print("missing")
        return

    data = load_json(path)
    print(f"path: {path}")
    print(f"type: {type(data).__name__}")
    print(f"keys: {top_keys(data)}")

    if label != "metrics.json":
        preview = first_records(data, 2)
        if preview:
            noun = "entry" if len(preview) == 1 else "entries"
            print(f"first {len(preview)} {noun}:")
            print(json.dumps(compact(preview), indent=2, default=str))
        return

    snr_hits = recursive_snr_candidates(data)
    if isinstance(data, dict):
        phenocam_keys = [
            (key, top_keys(value))
            for key, value in data.items()
            if "phenocam" in str(key).lower()
        ]
    else:
        phenocam_keys = []
    print(f"phenocam-like keys: {phenocam_keys}")
    print(f"snr-like keys: {[(where, compact(found)) for where, found in snr_hits]}")


def run_discovery(site_name: str, site_root: Path) -> None:
    """Print the discovered structure of each input file for one site."""
    print("\n=== Discovery mode ===")
    print(f"Using site: {site_name} ({site_root})")

    inputs = (
        ("s2_preselection.json", find_s2_preselection),
        ("S3 timeseries.json", find_s3_timeseries),
        ("metrics.json", find_metrics),
        ("PhenoCam gcc_90 file", find_phenocam),
    )
    for label, locate in inputs:
        print_structure(label, locate(site_root))

    print("\n=== Computing indicators ===")


def choose_discovery_site(site_roots: list[tuple[str, Path]]) -> tuple[str, Path]:
    """Pick the site that has the most of the four input files available.

    Ties go to the earliest site in ``site_roots``. The list must not be empty
    (``max`` raises ``ValueError`` otherwise).
    """
    finders = (find_s2_preselection, find_s3_timeseries, find_metrics, find_phenocam)

    def available_inputs(candidate: tuple[str, Path]) -> int:
        root = candidate[1]
        return len([locate for locate in finders if locate(root) is not None])

    return max(site_roots, key=available_inputs)


def truthy_status(value: Any, *, field_name: str | None = None) -> bool | None:
    """Interpret a status-like value as pass (``True``) or fail (``False``).

    Args:
        value: The raw field value.
        field_name: Name of the field the value came from. Names containing
            "reject" or "exclude" invert boolean-like values; names containing
            "reason" or "status" make any unrecognised text count as a failure.

    Returns:
        ``True`` / ``False`` when the value can be interpreted, ``None`` when
        it carries no recognisable pass/fail meaning.
    """
    lowered_name = (field_name or "").lower()
    inverted = any(word in lowered_name for word in ("reject", "exclude"))

    if isinstance(value, bool):
        return (not value) if inverted else value
    if value is None:
        return True
    if isinstance(value, (int, float)):
        # A float NaN is a missing marker, not a flag: treat it like None
        # (and like the string "nan" below) instead of as a truthy number.
        if isinstance(value, float) and math.isnan(value):
            return True
        flag = bool(value)
        return (not flag) if inverted else flag

    text = str(value).strip().lower()
    if text in {"true", "false"}:
        # Stringified booleans follow the same inversion rule as real ones.
        flag = text == "true"
        return (not flag) if inverted else flag
    if text in {"", "none", "null", "nan", "ok", "pass", "passed", "keep", "kept", "valid", "selected"}:
        return True
    if text in {
        "fail",
        "failed",
        "reject",
        "rejected",
        "exclude",
        "excluded",
        "invalid",
        "cloud",
        "cloudy",
        "dark",
        "bad",
    }:
        return False
    if any(word in lowered_name for word in ("reason", "status")):
        # Any other non-empty reason/status text is taken as a rejection.
        return False
    return None


def acquisition_passes(entry: dict[str, Any], strategy: str) -> bool:
    """Decide whether an acquisition record survives the given filter strategy.

    Checks, in order, and returns at the first one that gives an answer:
      1. strategy-specific flags such as ``excluded_<strategy>`` or
         ``passed_<strategy>`` (negative prefixes before positive ones);
      2. a value stored under the strategy name itself (dict or scalar);
      3. generic status fields (``excluded``, ``passed``, ``status``, ...);
      4. a bare ``"value"`` field on a record with at most three keys.
    A record with no marker at all is considered usable.

    Args:
        entry: One acquisition record.
        strategy: Strategy name; "nonaggressive" is also looked up as
            "non_aggressive" and "non-aggressive".
    """
    # Ordered and de-duplicated: a set here would make the precedence between
    # conflicting alias keys depend on string-hash iteration order.
    strategy_aliases = tuple(
        dict.fromkeys(
            (
                strategy,
                strategy.replace("nonaggressive", "non_aggressive"),
                strategy.replace("nonaggressive", "non-aggressive"),
            )
        )
    )
    negative_prefixes = ("excluded", "exclude", "rejected", "reject")
    positive_prefixes = ("passed", "pass", "keep", "kept", "valid", "selected")

    for alias in strategy_aliases:
        for prefix in negative_prefixes:
            key = f"{prefix}_{alias}"
            if key in entry:
                return not bool(entry[key])
        for prefix in positive_prefixes:
            key = f"{prefix}_{alias}"
            if key in entry:
                return bool(entry[key])

    for alias in strategy_aliases:
        nested = entry.get(alias)
        if isinstance(nested, dict):
            for key, value in nested.items():
                passed = truthy_status(value, field_name=key)
                if passed is not None:
                    return passed
        elif nested is not None:
            passed = truthy_status(nested, field_name=alias)
            if passed is not None:
                return passed

    # Generic status fields.
    for key in (*negative_prefixes, *positive_prefixes, "status", "strategy", "reason", "rejection_reason"):
        if key in entry:
            passed = truthy_status(entry[key], field_name=key)
            if passed is not None:
                return passed

    # Dict keyed by date with a scalar rejection reason.
    if "value" in entry and len(entry) <= 3:
        passed = truthy_status(entry.get("value"), field_name="value")
        if passed is not None:
            return passed

    # Existing pipeline entries with band means and no rejection marker are usable.
    return True


def band_value(entry: dict[str, Any], names: Iterable[str]) -> float | None:
    """Look up the first finite numeric value stored under any of ``names``.

    Keys of ``entry`` are matched case-insensitively. If no direct key yields
    a number, the known band-container sub-dicts (``bands``, ``band_means``,
    ...) are searched the same way, in order.
    """
    by_lower_key = {str(key).lower(): stored for key, stored in entry.items()}

    for wanted in names:
        wanted_lower = wanted.lower()
        if wanted_lower not in by_lower_key:
            continue
        number = jsonable_float(by_lower_key[wanted_lower])
        if number is not None:
            return number

    containers = ("bands", "band_means", "reflectance", "reflectances", "means", "window_means")
    for holder in containers:
        nested = entry.get(holder)
        if not isinstance(nested, dict):
            continue
        number = band_value(nested, names)
        if number is not None:
            return number
    return None


def entry_date(entry: dict[str, Any]) -> pd.Timestamp | None:
    """Extract a record's date from its date-like or file-name-like fields.

    Explicit date fields are consulted first, then file-name style fields; the
    first field that is present and parses successfully wins.
    """
    date_fields = ("date", "datetime", "time", "timestamp", "acquisition_date")
    name_fields = ("filename", "file", "path", "name")

    for field in date_fields + name_fields:
        if field not in entry:
            continue
        parsed = parse_date(entry[field])
        if parsed is not None:
            return parsed
    return None


def s2_gcc_series(s2_data: Any) -> pd.DataFrame:
    """Build a per-date S2 GCC series from the preselection records.

    Only records passing the "aggressive" strategy with a date and all three
    band values are used. GCC is ``green / (blue + green + red)``; records with
    a non-positive band sum are dropped. Values sharing a date are averaged.

    Returns:
        DataFrame with columns ``date`` and ``s2_gcc`` sorted by date (empty,
        with those columns, when nothing qualifies).
    """
    blue_names = ("b02", "blue", "B02", "band_1", "band1")
    green_names = ("b03", "green", "B03", "band_2", "band2")
    red_names = ("b04", "red", "B04", "band_3", "band3")

    rows = []
    for entry in normalize_records(s2_data):
        usable = isinstance(entry, dict) and acquisition_passes(entry, "aggressive")
        if not usable:
            continue
        when = entry_date(entry)
        blue = band_value(entry, blue_names)
        green = band_value(entry, green_names)
        red = band_value(entry, red_names)
        if any(item is None for item in (when, blue, green, red)):
            continue
        band_sum = blue + green + red
        if band_sum > 0:
            rows.append({"date": when, "s2_gcc": green / band_sum})

    if rows:
        frame = pd.DataFrame(rows)
        daily = frame.groupby("date", as_index=False)["s2_gcc"].mean()
        return daily.sort_values("date")
    return pd.DataFrame(columns=["date", "s2_gcc"])


def value_from_record(entry: dict[str, Any], preferred: Iterable[str]) -> float | None:
    """Return a record's greenness value as a finite float, if it has one.

    The ``preferred`` field names are tried first (case-insensitively, in
    order). Failing that, the first field whose lower-cased name contains
    "gcc" or "greenness" and holds a finite number is used.
    """
    by_lower_key = {str(key).lower(): stored for key, stored in entry.items()}

    for wanted in preferred:
        number = jsonable_float(by_lower_key.get(wanted.lower()))
        if number is not None:
            return number

    for key, stored in by_lower_key.items():
        if "gcc" not in key and "greenness" not in key:
            continue
        number = jsonable_float(stored)
        if number is not None:
            return number
    return None


def gcc_timeseries(data: Any, value_name: str) -> pd.DataFrame:
    """Turn a JSON time series into a per-date DataFrame.

    Records lacking a parseable date or a numeric greenness value are skipped;
    values sharing a date are averaged.

    Returns:
        DataFrame with columns ``date`` and ``value_name`` sorted by date
        (empty, with those columns, when no record qualifies).
    """
    preferred_fields = ("greenness_index", "gcc_90", "gcc", "value", "mean", "site_value")

    rows = []
    for entry in normalize_records(data):
        if not isinstance(entry, dict):
            continue
        when = entry_date(entry)
        reading = value_from_record(entry, preferred_fields)
        if when is None or reading is None:
            continue
        rows.append({"date": when, value_name: reading})

    if rows:
        frame = pd.DataFrame(rows)
        daily = frame.groupby("date", as_index=False)[value_name].mean()
        return daily.sort_values("date")
    return pd.DataFrame(columns=["date", value_name])


def compute_archive_density(s2_data: Any | None) -> tuple[int | None, int | None]:
    """Count S2 records passing the aggressive and nonaggressive strategies.

    Returns:
        ``(n_aggressive, n_nonaggressive)``, or ``(None, None)`` when there is
        no S2 data or it contains no records.
    """
    if s2_data is None:
        return None, None

    records = [record for record in normalize_records(s2_data) if isinstance(record, dict)]
    if not records:
        return None, None

    kept_aggressive = len([r for r in records if acquisition_passes(r, "aggressive")])
    kept_nonaggressive = len([r for r in records if acquisition_passes(r, "nonaggressive")])
    return kept_aggressive, kept_nonaggressive


def compute_coherence(s2_data: Any | None, s3_data: Any | None) -> tuple[int | None, float | None, float | None]:
    """Correlate S2 and S3 GCC on dates matched within the tolerance window.

    Each S2 date is paired with the nearest S3 date no more than
    ``MATCH_TOLERANCE_DAYS`` away; unmatched S2 dates are dropped.

    Returns:
        ``(n_matched, pearson_r, p_value)``. All three are ``None`` when either
        input is missing; ``n_matched`` is 0 when either series is empty; the
        statistics are ``None`` with fewer than two pairs or when they are not
        finite.
    """
    if s2_data is None or s3_data is None:
        return None, None, None

    s2_daily = s2_gcc_series(s2_data)
    s3_daily = gcc_timeseries(s3_data, "s3_gcc")
    if s2_daily.empty or s3_daily.empty:
        return 0, None, None

    window = pd.Timedelta(days=MATCH_TOLERANCE_DAYS)
    paired = pd.merge_asof(
        s2_daily.sort_values("date"),
        s3_daily.sort_values("date"),
        on="date",
        direction="nearest",
        tolerance=window,
    )
    paired = paired.dropna(subset=["s2_gcc", "s3_gcc"])

    n_pairs = int(len(paired))
    if n_pairs < 2:
        return n_pairs, None, None

    statistic, p_value = pearsonr(paired["s2_gcc"].to_numpy(), paired["s3_gcc"].to_numpy())
    return n_pairs, jsonable_float(statistic), jsonable_float(p_value)


def phenocam_series(data: Any | None) -> pd.DataFrame:
    """Build a per-date PhenoCam GCC series.

    Records without a parseable date or a numeric value are skipped; values
    sharing a date are averaged.

    Returns:
        DataFrame with columns ``date`` and ``gcc`` sorted by date (empty, with
        those columns, when ``data`` is ``None`` or nothing qualifies).
    """
    empty = pd.DataFrame(columns=["date", "gcc"])
    if data is None:
        return empty

    preferred_fields = ("gcc_90", "greenness_index", "gcc", "gcc_mean", "value")
    rows = []
    for entry in normalize_records(data):
        if isinstance(entry, dict):
            when = entry_date(entry)
            reading = value_from_record(entry, preferred_fields)
        else:
            # A bare scalar has no date, so it can never produce a row.
            when, reading = None, jsonable_float(entry)
        if when is None or reading is None:
            continue
        rows.append({"date": when, "gcc": reading})

    if not rows:
        return empty
    daily = pd.DataFrame(rows).groupby("date", as_index=False)["gcc"].mean()
    return daily.sort_values("date")


def compute_snr_from_phenocam(phenocam_data: Any | None) -> float | None:
    """Estimate SNR as GCC amplitude divided by the RMSE around a cubic spline.

    The series needs at least five distinct days. Amplitude is ``max - min`` of
    the daily GCC values; noise is the root-mean-square residual of a
    ``UnivariateSpline`` (k=3) fitted to day offsets.

    Returns:
        The SNR, or ``None`` when there is too little data, the fit fails, the
        RMSE is not positive, or the result is not a finite number (the report
        is written with ``allow_nan=False``, so NaN/inf must never be returned).
    """
    series = phenocam_series(phenocam_data)
    if len(series) < 5:
        return None
    x = (series["date"] - series["date"].min()).dt.days.to_numpy(dtype=float)
    y = series["gcc"].to_numpy(dtype=float)
    if len(np.unique(x)) < 5:
        return None
    try:
        # NOTE(review): no smoothing factor ``s`` is passed, so SciPy's default
        # applies; SciPy describes that default as suitable when the weights
        # are 1/std(y). Confirm it is appropriate for GCC-scale values.
        spline = UnivariateSpline(x, y, k=3)
        residual = y - spline(x)
    except Exception as exc:
        print(f"[WARN] Could not fit PhenoCam smoothing spline: {exc}")
        return None
    rmse = float(np.sqrt(np.mean(residual**2)))
    amplitude = float(np.max(y) - np.min(y))
    # ``not rmse > 0`` also rejects a NaN rmse, which ``rmse <= 0`` lets through.
    if not rmse > 0:
        return None
    snr = amplitude / rmse
    return snr if math.isfinite(snr) else None


def compute_snr(metrics_data: Any | None, phenocam_data: Any | None) -> float | None:
    """Return the SNR recorded in the metrics, else one derived from PhenoCam data."""
    reported = find_numeric_snr(metrics_data)
    if reported is None:
        return compute_snr_from_phenocam(phenocam_data)
    return reported


def compute_site(site_root: Path) -> dict[str, Any]:
    """Compute all suitability indicators for one site directory.

    Each of the four inputs is optional; an input that is not found is treated
    as ``None`` and the indicators depending on it come out as ``None``.

    Returns:
        Dict with keys ``snr``, ``snr_pass``, ``n_s2_aggressive``,
        ``n_s2_nonaggressive``, ``coherence_n_matched``,
        ``coherence_pearson_r`` and ``coherence_p_value``.
    """

    def load_optional(path: Path | None) -> Any | None:
        # Skip loading outright when the file was not found; substituting a
        # placeholder relative path would be resolved against the current
        # working directory and could pick up an unrelated file.
        return None if path is None else load_json(path)

    s2_data = load_optional(find_s2_preselection(site_root))
    s3_data = load_optional(find_s3_timeseries(site_root))
    metrics_data = load_optional(find_metrics(site_root))
    phenocam_data = load_optional(find_phenocam(site_root))

    snr = compute_snr(metrics_data, phenocam_data)
    n_s2_aggressive, n_s2_nonaggressive = compute_archive_density(s2_data)
    n_matched, pearson_r, p_value = compute_coherence(s2_data, s3_data)

    return {
        "snr": snr,
        "snr_pass": None if snr is None else snr >= SNR_THRESHOLD,
        "n_s2_aggressive": n_s2_aggressive,
        "n_s2_nonaggressive": n_s2_nonaggressive,
        "coherence_n_matched": n_matched,
        "coherence_pearson_r": pearson_r,
        "coherence_p_value": p_value,
    }


def print_summary(results: dict[str, dict[str, Any]]) -> None:
    """Print the per-site indicators as an aligned plain-text table.

    Missing values are shown as ``null``, count columns as integers, booleans
    as ``true``/``false`` and other numbers with four significant digits.
    """
    print("\nSuitability summary")
    if not results:
        print("(no sites found)")
        return

    columns = [
        ("site", "site"),
        ("snr", "snr"),
        ("snr_pass", "pass"),
        ("n_s2_aggressive", "n_s2_agg"),
        ("n_s2_nonaggressive", "n_s2_nonagg"),
        ("coherence_n_matched", "n_match"),
        ("coherence_pearson_r", "pearson_r"),
        ("coherence_p_value", "p_value"),
    ]
    headers = [title for _, title in columns]
    value_keys = [key for key, _ in columns[1:]]  # "site" comes from the dict key

    def fmt(value: Any, key: str) -> str:
        if value is None:
            return "null"
        is_count = key == "coherence_n_matched" or key.startswith("n_")
        if is_count:
            return str(int(value))
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return f"{float(value):.4g}"
        return str(value)

    rows = [
        [site] + [fmt(values.get(key), key) for key in value_keys]
        for site, values in results.items()
    ]
    widths = [
        max([len(title)] + [len(row[index]) for row in rows])
        for index, title in enumerate(headers)
    ]

    print("  ".join(title.ljust(width) for title, width in zip(headers, widths)))
    print("  ".join("-" * width for width in widths))
    for row in rows:
        print("  ".join(cell.ljust(width) for cell, width in zip(row, widths)))


def main() -> int:
    """Run discovery and indicator computation for every site under --base-dir.

    Prints the discovered structure of the best-populated site, computes the
    indicators for all sites, writes them to ``OUTPUT_NAME`` inside the base
    directory and prints a summary table.

    Returns:
        Process exit status: 0 on success, 1 when the base directory does not
        exist (nothing is written in that case).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--base-dir",
        required=True,
        type=Path,
        help="Pipeline output root containing one subdirectory per site.",
    )
    args = parser.parse_args()

    base_dir = args.base_dir.expanduser().resolve()
    if not base_dir.is_dir():
        # Without this guard the report could not be opened for writing below
        # and the run would end in an unhandled FileNotFoundError.
        print(f"[ERROR] Base directory does not exist or is not a directory: {base_dir}")
        return 1

    site_roots = find_site_roots(base_dir)
    if site_roots:
        run_discovery(*choose_discovery_site(site_roots))
    else:
        print(f"[WARN] No site directories found under {base_dir}")

    results = {site_name: compute_site(site_root) for site_name, site_root in site_roots}
    output_path = base_dir / OUTPUT_NAME
    with output_path.open("w", encoding="utf-8") as f:
        # allow_nan=False keeps the report strict JSON (no NaN/Infinity tokens).
        json.dump(results, f, indent=2, allow_nan=False)
        f.write("\n")
    print_summary(results)
    print(f"\nWrote {output_path}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())