efast-phenocam-validation/gap_validation/temporal_pc.py

"""Full-season gap-degraded fusion → temporal NSE_PC vs PhenoCam (tier after spatial validation)."""

from __future__ import annotations

import argparse
import json
import re
from datetime import datetime
from pathlib import Path

from metrics_indices import _get_gcc_from_original
from metrics_stats import (
    WHITTAKER_LAMBDA_DAYS_SQ,
    _norm_date_key,
    _s2_gcc_series_from_preselection,
    _whittaker_smooth_dict,
    calculate_temporal_metrics,
    load_timeseries,
)

from gap_validation.calendar import TRANSITIONS, load_manifest, validation_dir, write_manifest
from gap_validation.fusion_masked import run_masked_fusion_season
from gap_validation.run import (
    _filter_entries,
    _scenario_key,
    _withheld_iso,
    _yyyymmdd_from_withheld_filename,
)
from gap_validation.whittaker_compare import first_gap_where_fusion_below_whittaker


def _fusion_gcc_timeseries(
    fusion_dir: Path, site_position: tuple[float, float], mode: str
) -> dict[str, float]:
    """3×3 mean GCC at site from fused REFL/GCC rasters in ``fusion_dir``."""
    pattern = "REFL_*.tif" if mode == "bti" else "GCC_*.tif"
    out: dict[str, float] = {}
    for p in sorted(fusion_dir.glob(pattern)):
        m = re.search(r"_(\d{8})\.tif$", p.name)
        if not m:
            continue
        d = datetime.strptime(m.group(1), "%Y%m%d").date().isoformat()
        gcc = _get_gcc_from_original(p, site_position)
        if gcc is not None:
            out[d] = float(gcc)
    return out


def whittaker_timeseries_gap_degraded(
    base: Path,
    strategy: str,
    window_start_iso: str,
    window_end_iso: str,
    withheld_iso: str,
    lam: float = WHITTAKER_LAMBDA_DAYS_SQ,
) -> dict[str, float]:
    """Daily Whittaker GCC on S2 preselection with gap window + withheld day removed."""
    all_gcc, flags = _s2_gcc_series_from_preselection(base)
    if not all_gcc:
        return {}
    idx = 0 if strategy == "aggressive" else 1
    w0 = datetime.strptime(window_start_iso[:10], "%Y-%m-%d").date()
    w1 = datetime.strptime(window_end_iso[:10], "%Y-%m-%d").date()
    wh_k = _norm_date_key(withheld_iso)

    def in_window(dk: str) -> bool:
        try:
            d = datetime.strptime(dk[:10], "%Y-%m-%d").date()
        except ValueError:
            return False
        return w0 <= d <= w1

    kept = sorted(
        (d, g)
        for d, g in all_gcc.items()
        if d in flags
        and not flags[d][idx]
        and _norm_date_key(d) != wh_k
        and not in_window(_norm_date_key(d) or "")
    )
    if len(kept) < 2:
        return {}
    obs_d, obs_v = zip(*kept)
    return _whittaker_smooth_dict(obs_d, obs_v, lam)


def run_temporal_pc(
    site_name: str,
    season: int,
    site_position: tuple[float, float],
    strategy: str,
    sigma: int | None,
    mode: str,
    *,
    skip_manifest: bool,
    skip_fusion: bool,
    gap_days_filter: list[int] | None,
    transition_filter: list[str] | None,
    s2_calendar_strategy: str,
) -> Path:
    """Run full-season gap fusion + NSE_PC; write ``gap_metrics.json``."""
    base = Path(f"data/{site_name}/{season}")
    vdir = validation_dir(site_name, season)
    vdir.mkdir(parents=True, exist_ok=True)

    if not skip_manifest:
        write_manifest(
            site_name,
            season,
            site_position,
            s2_calendar_strategy=s2_calendar_strategy,
        )

    manifest = load_manifest(site_name, season)
    entries = _filter_entries(manifest["entries"], gap_days_filter, transition_filter)
    phenocam_ts_path = base / "raw" / "phenocam" / "phenocam_gcc.json"
    phenocam_ts = load_timeseries(phenocam_ts_path)

    nogap_metrics_path = base / "metrics.json"
    nogap_nse: dict[str, float | None] = {}
    if nogap_metrics_path.is_file():
        m = json.loads(nogap_metrics_path.read_text(encoding="utf-8"))
        sk = _scenario_key(strategy, sigma, mode)
        block = (m.get("temporal") or {}).get(sk) or {}
        nogap_nse["nse_pc"] = block.get("nse_pc")

    results: list[dict] = []
    crossover_rows: list[dict] = []

    for entry in entries:
        transition = entry.get("transition", "green_up")
        gap_days = entry["gap_days"]
        pred = entry["prediction_date"]
        w0, w1 = entry["window_start"], entry["window_end"]
        fn = entry.get("withheld_s2_filename")
        if not fn:
            results.append(
                {"transition": transition, "gap_days": gap_days, "error": "no_withheld_s2"}
            )
            continue
        wh_ymd = _yyyymmdd_from_withheld_filename(fn)
        if not wh_ymd:
            results.append(
                {
                    "transition": transition,
                    "gap_days": gap_days,
                    "error": "bad_withheld_filename",
                }
            )
            continue
        withheld_iso = _withheld_iso(entry) or f"{wh_ymd[:4]}-{wh_ymd[4:6]}-{wh_ymd[6:8]}"

        temporal_dir = (
            vdir / "temporal" / f"gap_{gap_days}_{transition}" / _scenario_key(strategy, sigma, mode)
        )
        if not skip_fusion:
            try:
                run_masked_fusion_season(
                    season,
                    site_position,
                    site_name,
                    strategy,
                    sigma,
                    mode,
                    w0,
                    w1,
                    wh_ymd,
                    temporal_dir,
                )
            except RuntimeError as e:
                results.append(
                    {
                        "transition": transition,
                        "gap_days": gap_days,
                        "error": str(e),
                    }
                )
                continue
            fusion_ts = _fusion_gcc_timeseries(temporal_dir, site_position, mode)
        else:
            fusion_ts = _fusion_gcc_timeseries(temporal_dir, site_position, mode)

        fused_metrics = calculate_temporal_metrics(fusion_ts, phenocam_ts)
        wh_ts = whittaker_timeseries_gap_degraded(
            base, strategy, w0, w1, withheld_iso
        )
        wh_metrics = calculate_temporal_metrics(wh_ts, phenocam_ts)

        row: dict = {
            "transition": transition,
            "gap_days": gap_days,
            "prediction_date": pred,
            "window_start": w0,
            "window_end": w1,
            "withheld_s2_filename": fn,
            "temporal": {
                "fused": fused_metrics,
                "whittaker": wh_metrics,
            },
            "fusion_dir": str(temporal_dir),
        }
        if fused_metrics and nogap_nse.get("nse_pc") is not None:
            g_rmse = fused_metrics.get("rmse")
            ng_rmse = None
            if nogap_metrics_path.is_file():
                sk = _scenario_key(strategy, sigma, mode)
                ng_rmse = (
                    (json.loads(nogap_metrics_path.read_text()).get("temporal") or {})
                    .get(sk, {})
                    .get("rmse")
                )
            n_g = fused_metrics.get("nse_pc")
            n_ng = nogap_nse["nse_pc"]
            if g_rmse is not None and ng_rmse is not None:
                row["delta_rmse"] = float(g_rmse - ng_rmse)
            if n_g is not None and n_ng is not None:
                row["delta_nse"] = float(n_ng - n_g)

        fn_pc = (fused_metrics or {}).get("nse_pc")
        wh_pc = (wh_metrics or {}).get("nse_pc")
        row["utility_crossover_row"] = {
            "transition": transition,
            "gap_days": gap_days,
            "nse_pc_fusion": fn_pc,
            "nse_pc_whittaker": wh_pc,
        }
        crossover_rows.append(row["utility_crossover_row"])
        results.append(row)

    scenario = _scenario_key(strategy, sigma, mode)
    payload = {
        "site_name": site_name,
        "season": season,
        "scenario": scenario,
        "tier": "temporal_nse_pc",
        "manifest": str(vdir / "gap_manifest.json"),
        "results": results,
        "utility_crossover": {
            scenario: {
                "metric": "nse_pc_vs_phenocam_gcc90",
                "first_gap_days_fusion_below_whittaker": first_gap_where_fusion_below_whittaker(
                    crossover_rows,
                    fusion_key="nse_pc_fusion",
                    whittaker_key="nse_pc_whittaker",
                ),
                "by_gap": crossover_rows,
            }
        },
    }
    out_path = vdir / f"gap_metrics_{mode}.json"
    out_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    if mode == "bti":
        # Legacy alias for backward-compatible readers.
        (vdir / "gap_metrics.json").write_text(
            json.dumps(payload, indent=2) + "\n", encoding="utf-8"
        )
    return out_path


def main() -> None:
    ap = argparse.ArgumentParser(description="Gap-degraded full-season NSE_PC tier.")
    ap.add_argument("--site", required=True)
    ap.add_argument("--season", type=int, required=True)
    ap.add_argument("--lat", type=float, required=True)
    ap.add_argument("--lon", type=float, required=True)
    ap.add_argument("--strategy", default="aggressive")
    ap.add_argument("--sigma", type=int, default=20, choices=[20, 30])
    ap.add_argument("--mode", default="bti", choices=["bti", "itb"])
    ap.add_argument("--gap-days", type=int, action="append")
    ap.add_argument("--transition", choices=list(TRANSITIONS), action="append")
    ap.add_argument("--skip-manifest", action="store_true")
    ap.add_argument("--skip-fusion", action="store_true")
    ap.add_argument("--s2-calendar-strategy", default="aggressive")
    args = ap.parse_args()
    sigma_kw = 30 if args.sigma == 30 else None
    out = run_temporal_pc(
        args.site,
        args.season,
        (args.lat, args.lon),
        args.strategy,
        sigma_kw,
        args.mode,
        skip_manifest=args.skip_manifest,
        skip_fusion=args.skip_fusion,
        gap_days_filter=args.gap_days,
        transition_filter=args.transition,
        s2_calendar_strategy=args.s2_calendar_strategy,
    )
    print(out)


if __name__ == "__main__":
    main()