efast-phenocam-validation/5-metrics.py

"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp.

Inputs (``data/``, ``{year}`` = ``--evaluation-year``):

- ``phenocam_screening/{year}.json`` — qualifying sites + metadata
- ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries
- ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters
- ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters
- ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters
- ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters

Outputs (``data/metrics/``):

- ``manifest.json`` — years + per-site metadata
- ``{year}/{site}/gcc_phenocam.json`` — PhenoCam ``gcc_90`` at matched dates
- ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes)
- ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC
- ``{year}/{site}/gcc_s3.json`` — S3 composite GCC
- ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average
- ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC
- ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC
- ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer
- ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view)
- ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view)
- ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view)
- ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view)
- ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths
- ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths
- ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series
- ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene
- ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite
- ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats

CLI:

- ``--evaluation-year`` (default 2025)
- ``--site`` (optional; default: all qualifying sites with sentinel data)
"""

from __future__ import annotations

import argparse
import csv
import json
import re
from pathlib import Path
from typing import Any

import datetime
import numpy as np
import rasterio
from rasterio.crs import CRS
from rasterio.transform import rowcol
from pyproj import Transformer
from scipy.stats import pearsonr
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

DATA_DIR = Path("data")
DEFAULT_YEAR = 2025

# GCC smoothing window for S3 moving average (days)
S3_SMOOTH_WINDOW = 5

# Whittaker lambda (penalised smoothing strength for S2)
WHITTAKER_LAMBDA = 400.0

# Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel)
SPATIAL_CV_HALF_M = 150

# PhenoCam archive image URL pattern
PHENOCAM_IMAGE_URL = (
    "https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}"
)


# ---------------------------------------------------------------------------
# Helpers: raster pixel extraction
# ---------------------------------------------------------------------------


def _window_mean(data: np.ndarray) -> float | None:
    """Mean of a pixel window; ``None`` when every value is NaN."""
    valid = data[~np.isnan(data)]
    if valid.size == 0:
        return None
    return float(np.mean(valid))


def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
    """Return the 3×3 mean GCC value at (lat, lon) from a single-band raster.

    Returns ``None`` when the pixel is masked/zero/NaN.
    """
    try:
        with rasterio.open(path) as src:
            transformer = Transformer.from_crs(
                CRS.from_epsg(4326), src.crs, always_xy=True
            )
            x, y = transformer.transform(lon, lat)
            row, col = rowcol(src.transform, x, y)
            h, w = src.height, src.width
            r0, r1 = max(0, row - 1), min(h, row + 2)
            c0, c1 = max(0, col - 1), min(w, col + 2)
            window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
            data = src.read(1, window=window).astype(float)
            nodata = src.nodata
        if nodata is not None:
            data = np.where(data == nodata, np.nan, data)
        data[data == 0] = np.nan
        return _window_mean(data)
    except Exception:
        return None


# ---------------------------------------------------------------------------
# Helpers: date extraction from filenames
# ---------------------------------------------------------------------------


def _date_from_gcc_tif(path: Path) -> str | None:
    """Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``."""
    m = re.search(r"(\d{8})", path.stem)
    return m.group(1) if m else None


def _date_from_s2_tif(path: Path) -> str | None:
    """Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``."""
    parts = path.stem.split("_")
    if len(parts) >= 3:
        m = re.match(r"(\d{8})", parts[2])
        return m.group(1) if m else None
    return None


# ---------------------------------------------------------------------------
# Helpers: Whittaker smoother (2nd-order differences, tridiagonal solver)
# ---------------------------------------------------------------------------


def _whittaker_smooth(
    values: list[float | None], lam: float = WHITTAKER_LAMBDA
) -> list[float | None]:
    """Penalised least-squares smoother (Whittaker, 2nd-order differences).

    Masked (None) values are filled via the smooth and then re-set to None in
    the output so the caller can distinguish observed from gap-filled points.
    """
    n = len(values)
    if n < 4:
        return values[:]

    obs_mask = [v is not None for v in values]
    y = np.array([v if v is not None else 0.0 for v in values], dtype=float)
    w = np.array([1.0 if m else 0.0 for m in obs_mask], dtype=float)

    W = np.diag(w)
    D = np.diff(np.eye(n), n=2, axis=0)  # (n-2) x n second-difference matrix
    A = W + lam * D.T @ D
    try:
        z = np.linalg.solve(A, w * y)
    except np.linalg.LinAlgError:
        return values[:]

    result: list[float | None] = []
    for i, m in enumerate(obs_mask):
        result.append(float(z[i]) if m else None)
    return result


# ---------------------------------------------------------------------------
# Helpers: PhenoCam CSV parsing
# ---------------------------------------------------------------------------


def _parse_phenocam_csv(
    csv_path: Path, year: int, site: str
) -> tuple[list[dict], list[dict]]:
    """Return (gcc_series, image_list) filtered to ``year``.

    ``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}``
    ``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}``
    """
    gcc_series: list[dict] = []
    image_list: list[dict] = []
    year_str = str(year)

    if not csv_path.is_file():
        return gcc_series, image_list

    with csv_path.open() as f:
        lines = [line for line in f if not line.startswith("#")]

    reader = csv.DictReader(lines)
    for row in reader:
        if row.get("year") != year_str:
            continue
        date = row.get("date", "")
        gcc_raw = row.get("gcc_90")
        if gcc_raw and gcc_raw not in ("NA", ""):
            try:
                gcc_series.append({"date": date, "gcc_90": float(gcc_raw)})
            except ValueError:
                pass
        fn = row.get("midday_filename", "").strip()
        if fn and fn != "NA" and date:
            month = date[5:7]
            url = PHENOCAM_IMAGE_URL.format(
                site=site, year=year_str, month=month, filename=fn
            )
            image_list.append({"date": date, "url": url})

    return gcc_series, image_list


# ---------------------------------------------------------------------------
# Helpers: moving average
# ---------------------------------------------------------------------------


def _moving_average(series: list[dict], value_key: str, window: int) -> list[dict]:
    """Compute centred moving average; returns new list with ``_smooth`` suffix key."""
    if not series:
        return []
    vals = [p[value_key] for p in series]
    half = window // 2
    smoothed = []
    for i, pt in enumerate(series):
        chunk = [v for v in vals[max(0, i - half) : i + half + 1] if v is not None]
        smoothed.append(
            {
                "date": pt["date"],
                value_key + "_smooth": (sum(chunk) / len(chunk)) if chunk else None,
            }
        )
    return smoothed


# ---------------------------------------------------------------------------
# Helpers: validation metrics
# ---------------------------------------------------------------------------

MATCH_TOLERANCE_DAYS = 5


def compute_metrics(
    ref: list[dict],
    ref_key: str,
    pred: list[dict],
    pred_key: str,
) -> dict | None:
    """Compute NSE, RMSE, nRMSE, Pearson r between pred and ref.

    Each pred point is matched to the nearest ref date within
    ``MATCH_TOLERANCE_DAYS``.  Returns a dict or ``None`` if fewer than
    2 matched pairs exist.
    """
    ref_lookup: dict[str, float] = {
        p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
    }
    if not ref_lookup:
        return None

    ref_dates = sorted(ref_lookup)

    obs, sim = [], []
    for pt in pred:
        v = pt.get(pred_key)
        if v is None:
            continue
        nearest = min(
            ref_dates,
            key=lambda d: abs(
                (np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D")
            ),
        )
        gap = abs(
            (np.datetime64(pt["date"]) - np.datetime64(nearest))
            / np.timedelta64(1, "D")
        )
        if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup:
            obs.append(ref_lookup[nearest])
            sim.append(v)

    if len(obs) < 2:
        return None

    obs_arr = np.array(obs)
    sim_arr = np.array(sim)
    obs_mean = obs_arr.mean()

    rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2)))
    nrmse = rmse / obs_mean if obs_mean else None
    ss_res = float(np.sum((obs_arr - sim_arr) ** 2))
    ss_tot = float(np.sum((obs_arr - obs_mean) ** 2))
    nse = (1.0 - ss_res / ss_tot) if ss_tot else None
    r, _ = pearsonr(obs_arr, sim_arr)

    def _r4(v: float | None) -> float | None:
        return round(v, 4) if v is not None else None

    return {
        "n": len(obs),
        "rmse": _r4(rmse),
        "nrmse": _r4(nrmse),
        "nse": _r4(nse),
        "r": _r4(float(r)),
    }


S2_BAND_NAMES = ["B02", "B03", "B04"]
S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"]


def _read_multiband_center(
    path: Path, lat: float, lon: float, band_names: list[str]
) -> dict[str, float | None]:
    """Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None."""
    try:
        with rasterio.open(path) as src:
            transformer = Transformer.from_crs(
                CRS.from_epsg(4326), src.crs, always_xy=True
            )
            x, y = transformer.transform(lon, lat)
            row, col = rowcol(src.transform, x, y)
            h, w = src.height, src.width
            r0, r1 = max(0, row - 1), min(h, row + 2)
            c0, c1 = max(0, col - 1), min(w, col + 2)
            window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
            nodata = src.nodata
            result = {}
            for i, name in enumerate(band_names, 1):
                if i > src.count:
                    result[name] = None
                    continue
                data = src.read(i, window=window).astype(float)
                if nodata is not None:
                    data = np.where(data == nodata, np.nan, data)
                data[data == 0] = np.nan
                val = _window_mean(data)
                result[name] = None if val is None else round(val, 6)
        return result
    except Exception:
        return {name: None for name in band_names}


def _multiband_series(
    tif_paths: list[Path],
    date_fn,
    lat: float,
    lon: float,
    band_names: list[str],
    desc: str,
) -> list[dict]:
    """Extract center-pixel values for all bands; return ``[{date, band1, band2, …}]``."""
    result = []
    for p in tqdm(tif_paths, desc=desc, leave=False):
        date = date_fn(p)
        if date is None:
            continue
        vals = _read_multiband_center(p, lat, lon, band_names)
        if any(v is not None for v in vals.values()):
            result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", **vals})
    return sorted(result, key=lambda x: x["date"])


# ---------------------------------------------------------------------------
# Helpers: spatial heterogeneity + observation density
# ---------------------------------------------------------------------------


def _read_footprint_stats(
    path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M
) -> tuple[float, float] | tuple[None, None]:
    """Return (mean, std) of valid GCC pixels within a ±half_m metre square window.

    Returns ``(None, None)`` on any error or when fewer than 4 valid pixels exist.
    """
    try:
        with rasterio.open(path) as src:
            transformer = Transformer.from_crs(
                CRS.from_epsg(4326), src.crs, always_xy=True
            )
            x, y = transformer.transform(lon, lat)
            res = abs(src.transform.a)  # pixel size in CRS units (metres for UTM)
            half_px = max(1, int(round(half_m / res)))
            row, col = rowcol(src.transform, x, y)
            h, w = src.height, src.width
            r0, r1 = max(0, row - half_px), min(h, row + half_px + 1)
            c0, c1 = max(0, col - half_px), min(w, col + half_px + 1)
            window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
            data = src.read(1, window=window).astype(float)
            nodata = src.nodata
        if nodata is not None:
            data = np.where(data == nodata, np.nan, data)
        data[data <= 0] = np.nan
        valid = data[~np.isnan(data)]
        if len(valid) < 4:
            return None, None
        return float(np.mean(valid)), float(np.std(valid))
    except Exception:
        return None, None


def compute_covariates(
    s2_gcc_paths: list[Path],
    s2_series: list[dict],
    s3_series: list[dict],
    n_gcc_points: int | None,
    lat: float,
    lon: float,
) -> dict:
    """Compute spatial heterogeneity and temporal observation density covariates."""
    # Spatial GCC statistics over ~300 m footprint
    means, stds = [], []
    for p in s2_gcc_paths:
        m, s = _read_footprint_stats(p, lat, lon)
        if m is not None and m > 0:
            means.append(m)
            stds.append(s)

    spatial_gcc_cv = (
        round(float(np.mean([s / m for s, m in zip(stds, means)])), 4)
        if means
        else None
    )
    spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None

    # S2 temporal gap statistics
    s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series]
    if len(s2_dates) >= 2:
        gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)]
        s2_mean_gap = round(float(np.mean(gaps)), 1)
        s2_max_gap = int(max(gaps))
    else:
        s2_mean_gap = None
        s2_max_gap = None

    return {
        "spatial_gcc_cv": spatial_gcc_cv,
        "spatial_gcc_std": spatial_gcc_std,
        "s2_scene_count": len(s2_series),
        "s2_mean_gap_days": s2_mean_gap,
        "s2_max_gap_days": s2_max_gap,
        "s3_composite_count": len(s3_series),
        "n_gcc_points": n_gcc_points,
    }


# ---------------------------------------------------------------------------
# Per-site export
# ---------------------------------------------------------------------------


def _write_json(path: Path, data: Any) -> None:
    path.write_text(json.dumps(data, separators=(",", ":")))


def _raster_series(
    tif_paths: list[Path],
    date_fn,
    lat: float,
    lon: float,
    desc: str,
) -> list[dict]:
    """Extract center-pixel GCC from each tif, return ``[{date, gcc}]`` sorted."""
    result = []
    for p in tqdm(tif_paths, desc=desc, leave=False):
        date = date_fn(p)
        if date is None:
            continue
        val = _read_center_pixel(p, lat, lon)
        if val is not None:
            result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", "gcc": val})
    return sorted(result, key=lambda x: x["date"])


def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]:
    """Build raster index: ``[{date, path}]`` sorted by date."""
    result = []
    for p in tif_paths:
        date = date_fn(p)
        if date is None:
            continue
        try:
            rel = str(p.relative_to(rel_root))
        except ValueError:
            rel = str(p)
        result.append({"date": date, "path": rel})
    return sorted(result, key=lambda x: x["date"])


def export_site(
    site: str,
    year: int,
    lat: float,
    lon: float,
    out_dir: Path,
    n_gcc_points: int | None = None,
) -> bool:
    """Export timeseries.json and rasters.json for one site. Returns True on success."""
    sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared"
    fusion_base = DATA_DIR / "fusion" / str(year) / site

    s2_gcc_dir = sentinel_base / "s2"
    s3_gcc_dir = sentinel_base / "gcc_s3"
    bti_gcc_dir = fusion_base / "bti" / "gcc"
    itb_gcc_dir = fusion_base / "itb" / "fusion"

    # Raster slider sources
    s2_refl_dir = sentinel_base / "s2"
    s3_comp_dir = sentinel_base / "s3"
    bti_refl_dir = fusion_base / "bti" / "fusion"

    has_fusion = bti_gcc_dir.is_dir() and any(bti_gcc_dir.glob("GCC_*.tif"))
    if not has_fusion:
        return False

    out_dir.mkdir(parents=True, exist_ok=True)

    # --- GCC timeseries from rasters ---
    s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif"))
    s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif"))
    bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif"))
    itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif"))

    s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2")
    s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3")
    bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI")
    itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB")

    # Whittaker on S2
    s2_vals = [p["gcc"] for p in s2_series]
    s2_smooth_vals = _whittaker_smooth(s2_vals)
    s2_whittaker = [
        {"date": p["date"], "gcc": v}
        for p, v in zip(s2_series, s2_smooth_vals)
        if v is not None
    ]

    # S3 5-day moving average
    s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW)

    # PhenoCam CSV
    csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv"
    phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site)

    s3_smooth_series = [
        {"date": p["date"], "gcc": p["gcc_smooth"]}
        for p in s3_smooth
        if p.get("gcc_smooth") is not None
    ]

    # Band reflectance timeseries (multi-band center-pixel)
    bands_s2 = _multiband_series(
        sorted(s2_refl_dir.glob("*_REFL.tif")),
        _date_from_s2_tif,
        lat,
        lon,
        S2_BAND_NAMES,
        f"{site} S2 bands",
    )
    bands_s3 = _multiband_series(
        sorted(s3_comp_dir.glob("composite_*.tif")),
        _date_from_gcc_tif,
        lat,
        lon,
        S3_BAND_NAMES,
        f"{site} S3 bands",
    )

    # --- Per-metric JSON outputs ---
    _write_json(out_dir / "gcc_phenocam.json", phenocam_series)
    _write_json(out_dir / "gcc_s2.json", s2_series)
    _write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker)
    _write_json(out_dir / "gcc_s3.json", s3_series)
    _write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series)
    _write_json(out_dir / "gcc_fusion_bti.json", bti_series)
    _write_json(out_dir / "gcc_fusion_itb.json", itb_series)
    _write_json(out_dir / "phenocam_images.json", image_list)
    _write_json(out_dir / "bands_s2.json", bands_s2)
    _write_json(out_dir / "bands_s3.json", bands_s3)

    # --- Raster index for slider ---
    rel_root = DATA_DIR.parent  # paths relative to project root

    # Valid-pixel sets: only show S2/S3 rasters where the center pixel had
    # usable data (non-zero GCC). This excludes cloud-masked / snow-covered
    # scenes that would render as black or visually nonsensical.
    s2_valid_dates = {p["date"].replace("-", "") for p in s2_series}
    s3_valid_dates = {p["date"].replace("-", "") for p in s3_series}

    s2_refl = [
        r
        for r in _raster_index(
            sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, rel_root
        )
        if r["date"] in s2_valid_dates
    ]
    s3_comp = [
        r
        for r in _raster_index(
            sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
        )
        if r["date"] in s3_valid_dates
    ]
    s2_gcc = [
        r
        for r in _raster_index(
            sorted(s2_gcc_dir.glob("*_GCC.tif")), _date_from_s2_tif, rel_root
        )
        if r["date"] in s2_valid_dates
    ]
    s3_gcc = [
        r
        for r in _raster_index(
            sorted(s3_gcc_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
        )
        if r["date"] in s3_valid_dates
    ]
    bti_refl = _raster_index(
        sorted(bti_refl_dir.glob("REFL_*.tif")), _date_from_gcc_tif, rel_root
    )
    itb_gcc = _raster_index(
        sorted(itb_gcc_dir.glob("GCC_*.tif")), _date_from_gcc_tif, rel_root
    )

    _write_json(out_dir / "rasters_s2_refl.json", s2_refl)
    _write_json(out_dir / "rasters_s3_composite.json", s3_comp)
    _write_json(out_dir / "rasters_s2_gcc.json", s2_gcc)
    _write_json(out_dir / "rasters_s3_gcc.json", s3_gcc)
    _write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl)
    _write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc)

    # --- Site covariates (heterogeneity + observation density) ---
    _write_json(
        out_dir / "covariates.json",
        compute_covariates(s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon),
    )

    # --- Validation metrics vs PhenoCam gcc_90 ---
    _write_json(
        out_dir / "metrics.json",
        {
            "bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"),
            "itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"),
            "s2_whittaker": compute_metrics(
                phenocam_series, "gcc_90", s2_whittaker, "gcc"
            ),
            "s3_smooth": compute_metrics(
                phenocam_series, "gcc_90", s3_smooth_series, "gcc"
            ),
            "s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"),
            "s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"),
        },
    )

    # Remove legacy bundled outputs if present
    for legacy in ("timeseries.json", "rasters.json"):
        (out_dir / legacy).unlink(missing_ok=True)
    return True


# ---------------------------------------------------------------------------
# Manifest
# ---------------------------------------------------------------------------

VEG_TYPE_LABELS = {
    "AG": "Agriculture",
    "DB": "Deciduous broadleaf",
    "DN": "Deciduous needleleaf",
    "EB": "Evergreen broadleaf",
    "EN": "Evergreen needleleaf",
    "GR": "Grassland",
    "MX": "Mixed",
    "SH": "Shrubland",
    "TN": "Tundra",
    "UN": "Unknown",
    "WL": "Wetland",
    "RF": "Reference",
}


def build_manifest(years: list[int], filter_site: str | None = None) -> dict:
    manifest: dict[str, Any] = {"years": years, "sites": {}}

    for year in years:
        screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json"
        if not screening_path.is_file():
            continue
        data = json.loads(screening_path.read_text())
        sites_meta: dict[str, Any] = {}
        for entry in data.get("sites", []):
            if entry.get("calculations", {}).get("status") != "PASS":
                continue
            cam = entry.get("response", {}).get("camera", {})
            roi = entry.get("response", {}).get("roi", {})
            calc = entry.get("calculations", {})
            site = cam.get("Sitename", "")
            if not site:
                continue
            if filter_site and site != filter_site:
                continue
            sm = cam.get("sitemetadata", {})
            veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN"
            fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc"
            has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif"))
            sites_meta[site] = {
                "lat": cam.get("Lat"),
                "lon": cam.get("Lon"),
                "veg_type": veg_raw,
                "veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw),
                "description": sm.get("site_description", ""),
                "dominant_species": sm.get("dominant_species", ""),
                "group": sm.get("group", ""),
                "snr": calc.get("snr"),
                "n_gcc_points": calc.get("n_gcc_points"),
                "has_fusion": has_fusion,
            }
        manifest["sites"][str(year)] = sites_meta

    return manifest


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
    parser.add_argument("--site", type=str, default=None)
    args = parser.parse_args()

    year = args.evaluation_year
    filter_site = args.site

    out_base = DATA_DIR / "metrics"
    out_base.mkdir(parents=True, exist_ok=True)

    # Determine years with screening data
    screening_dir = DATA_DIR / "phenocam_screening"
    years = sorted(
        int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit()
    )
    if not years:
        years = [year]

    print(f"Building manifest for years: {years}")
    manifest = build_manifest(years, filter_site)

    # Export per-site data for the requested year
    year_sites = manifest["sites"].get(str(year), {})
    fusion_sites = {s: m for s, m in year_sites.items() if m["has_fusion"]}

    print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}")
    for site, meta in tqdm(fusion_sites.items(), desc="Sites"):
        out_dir = out_base / str(year) / site
        ok = export_site(
            site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points")
        )
        if ok:
            print(f"  ✓ {site}")
        else:
            print(f"  ✗ {site} — no fusion data found")

    manifest_path = out_base / "manifest.json"
    if filter_site and manifest_path.is_file():
        # Merge: update only the filtered site(s); preserve all other entries.
        existing: dict = json.loads(manifest_path.read_text())
        for year_key, year_sites_new in manifest["sites"].items():
            existing.setdefault("sites", {}).setdefault(year_key, {}).update(
                year_sites_new
            )
        all_years = sorted(set(existing.get("years", [])) | set(manifest["years"]))
        existing["years"] = all_years
        manifest_path.write_text(json.dumps(existing, separators=(",", ":")))
    else:
        manifest_path.write_text(json.dumps(manifest, separators=(",", ":")))
    print(f"Manifest written → {manifest_path}")


if __name__ == "__main__":
    main()