Added gcc suitability stats.

2026-06-17 12:29:35 +02:00 · 2026-06-17 12:29:35 +02:00 · a593683314
commit a593683314
parent d55ee31e8d
3 changed files with 907 additions and 15 deletions
--- a/7-gcc-suitability.py
+++ b/7-gcc-suitability.py
@ -0,0 +1,742 @@
+"""Step 7: PhenoCam GCC suitability as a fusion-accuracy reference.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``metrics/{year}/{site}/gcc_s2.json``, ``gcc_phenocam.json`` — Step 5 timeseries
+- ``metrics/manifest.json`` — site lat/lon
+- ``sentinel_data/{year}/{site}/prepared/s2/`` — S2 REFL/GCC/DIST_CLOUD (Steps 3–4)
+- ``sentinel_data/{year}/{site}/prepared/gcc_s3/``, ``prepared/s3_rgb/`` — Step 4
+
+Outputs (``data/gcc_suitability/``):
+
+- ``{year}.json`` — representativeness (Line A), LOOCV concordance (Line B),
+  per-site and aggregate suitability verdict
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--min-cloudfree-s2`` (default 10) — minimum cloud-free S2 dates for LOOCV
+- ``--alpha`` (default 0.05) — reserved for future significance tests
+
+Full-sample aggregate; does not accept ``--site``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import rasterio
+from pyproj import Transformer
+from rasterio.crs import CRS
+from rasterio.transform import rowcol
+from scipy.stats import linregress, pearsonr, spearmanr
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Root of all pipeline inputs and outputs (relative to the working directory).
+DATA_DIR = Path("data")
+# Default for ``--evaluation-year``.
+DEFAULT_YEAR = 2025
+# Default for ``--alpha``; the value is only echoed into the output payload.
+DEFAULT_ALPHA = 0.05
+# Default for ``--min-cloudfree-s2``: sites with fewer S2 dates skip LOOCV.
+MIN_CLOUDFREE_S2 = 10
+# Minimum Pearson r for a site to be flagged ``representative``.
+REPR_R_THRESHOLD = 0.7
+# Largest date gap (in days) allowed when pairing two series.
+MATCH_TOLERANCE_DAYS = 5
+
+# Passed to ``efast.fusion`` as ``ratio``, ``max_days`` and
+# ``minimum_acquisition_importance`` respectively.
+RESOLUTION_RATIO = 30
+MAX_DAYS = 100
+MINIMUM_ACQUISITION_IMPORTANCE = 0
+
+# Below this number of sites the summary prints a small-sample caution.
+SMALL_SAMPLE_SITES = 6
+
+
+# ---------------------------------------------------------------------------
+# efast import
+# ---------------------------------------------------------------------------
+
+
+def _import_efast():
+    """Import ``efast.efast`` on demand and return the module.
+
+    Raises:
+        ImportError: when the package is missing; the message carries the
+            install hint and the original error is chained as the cause.
+    """
+    try:
+        import efast.efast as efast_module
+    except ImportError as exc:
+        raise ImportError("efast not found. Install with: uv sync") from exc
+    else:
+        return efast_module
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _r4(v: float | None) -> float | None:
+    """Round ``v`` to four decimals for the JSON payload.
+
+    ``None`` and non-finite values (NaN, +/-inf) both map to ``None``.
+    ``json.dumps`` would otherwise serialise a NaN as the bare token ``NaN``,
+    which is not valid JSON and cannot be read by ``JSON.parse`` in a browser.
+
+    Args:
+        v: value to round, or ``None``.
+
+    Returns:
+        The rounded value, or ``None`` when there is no finite value.
+    """
+    if v is None or not np.isfinite(v):
+        return None
+    return round(v, 4)
+
+
+def _window_mean(data: np.ndarray) -> float | None:
+    """Return the mean of the non-NaN entries of ``data``, or ``None`` if none exist."""
+    keep = np.logical_not(np.isnan(data))
+    if not keep.any():
+        return None
+    return float(data[keep].mean())
+
+
+def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
+    """Mean of the 3x3 band-1 window centred on (``lat``, ``lon``).
+
+    The EPSG:4326 coordinate is reprojected into the raster CRS.  Pixels equal
+    to the raster nodata value, zero pixels and NaN pixels are left out of
+    the mean.
+
+    Args:
+        path: raster file to sample.
+        lat: latitude in EPSG:4326.
+        lon: longitude in EPSG:4326.
+
+    Returns:
+        The window mean, or ``None`` when the point lies outside the raster,
+        no valid pixel remains, or the file cannot be read.  Read errors are
+        deliberately mapped to ``None`` (best-effort sampling).
+    """
+    try:
+        with rasterio.open(path) as src:
+            transformer = Transformer.from_crs(
+                CRS.from_epsg(4326), src.crs, always_xy=True
+            )
+            x, y = transformer.transform(lon, lat)
+            row, col = rowcol(src.transform, x, y)
+            h, w = src.height, src.width
+            # Without this guard a point just outside the grid is clamped onto
+            # an edge window that is not centred on the site at all.
+            if not (0 <= row < h and 0 <= col < w):
+                return None
+            r0, r1 = max(0, row - 1), min(h, row + 2)
+            c0, c1 = max(0, col - 1), min(w, col + 2)
+            window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
+            data = src.read(1, window=window).astype(float)
+            nodata = src.nodata
+        if nodata is not None:
+            data = np.where(data == nodata, np.nan, data)
+        # Zero is treated as missing in addition to the declared nodata value.
+        data[data == 0] = np.nan
+        return _window_mean(data)
+    except Exception:
+        return None
+
+
+def _date_from_s2_tif(path: Path) -> str | None:
+    """Return the leading eight digits of the third ``_``-separated stem field.
+
+    ``None`` is returned when the stem has fewer than three fields or the
+    third field does not start with eight digits.
+    """
+    fields = path.stem.split("_")
+    if len(fields) < 3:
+        return None
+    found = re.match(r"(\d{8})", fields[2])
+    if found is None:
+        return None
+    return found.group(1)
+
+
+def _iso_to_yyyymmdd(iso: str) -> str:
+    """Strip every ``-`` from an ISO date string (``2025-06-12`` -> ``20250612``)."""
+    return "".join(iso.split("-"))
+
+
+def _yyyymmdd_to_iso(d: str) -> str:
+    """Insert dashes after the 4th and 6th character (``20250612`` -> ``2025-06-12``)."""
+    year, month, day = d[:4], d[4:6], d[6:]
+    return "-".join((year, month, day))
+
+
+def _day_gap(a: str, b: str) -> float:
+    """Absolute distance between two date strings, expressed in days."""
+    one_day = np.timedelta64(1, "D")
+    delta = np.datetime64(a) - np.datetime64(b)
+    return abs(delta / one_day)
+
+
+def _match_series(
+    ref: list[dict],
+    ref_key: str,
+    pred: list[dict],
+    pred_key: str,
+    tolerance_days: int = MATCH_TOLERANCE_DAYS,
+) -> tuple[list[float], list[float], list[str]]:
+    """Pair every usable ``pred`` point with its nearest ``ref`` date.
+
+    Points whose value under their key is ``None`` or missing are ignored on
+    both sides.  A pair is kept only when the date gap is at most
+    ``tolerance_days``.
+
+    Returns:
+        ``(ref_vals, pred_vals, ref_dates)`` as three parallel lists; all
+        three are empty when ``ref`` has no usable point.
+    """
+    reference = {row["date"]: row[ref_key] for row in ref if row.get(ref_key) is not None}
+    if not reference:
+        return [], [], []
+
+    candidates = sorted(reference)
+    ref_vals: list[float] = []
+    pred_vals: list[float] = []
+    ref_dates: list[str] = []
+    for row in pred:
+        value = row.get(pred_key)
+        if value is None:
+            continue
+        target = row["date"]
+        closest = min(candidates, key=lambda d: _day_gap(target, d))
+        if _day_gap(target, closest) <= tolerance_days:
+            ref_vals.append(reference[closest])
+            pred_vals.append(value)
+            ref_dates.append(closest)
+    return ref_vals, pred_vals, ref_dates
+
+
+def _accuracy_metrics(obs: list[float], sim: list[float]) -> dict[str, Any] | None:
+    """Summarise the error of ``sim`` against ``obs``.
+
+    Returns:
+        ``None`` for fewer than two observations, otherwise a dict with the
+        pair count ``n`` and the rounded ``rmse``, ``mae``, ``bias``
+        (mean of ``sim - obs``) and Pearson ``r``.
+    """
+    n_pairs = len(obs)
+    if n_pairs < 2:
+        return None
+    reference = np.array(obs, dtype=float)
+    predicted = np.array(sim, dtype=float)
+    errors = predicted - reference
+    pearson_r, _pvalue = pearsonr(reference, predicted)
+    return {
+        "n": n_pairs,
+        "rmse": _r4(float(np.sqrt(np.mean(errors**2)))),
+        "mae": _r4(float(np.mean(np.abs(errors)))),
+        "bias": _r4(float(np.mean(errors))),
+        "r": _r4(float(pearson_r)),
+    }
+
+
+def _gcc_from_refl_file(refl_path: Path, gcc_path: Path) -> None:
+    """Write a single-band float32 GCC raster derived from a reflectance file.
+
+    GCC is ``band2 / (band1 + band2 + band3)``; the code reads bands 1-3 as
+    blue, green, red.  Pixels with a negative band, a band equal to the source
+    nodata value, or a zero band sum are written as NaN.
+
+    Args:
+        refl_path: input reflectance raster with at least three bands.
+        gcc_path: output path; the raster profile is copied from the input
+            with ``count=1`` and ``dtype="float32"``.
+    """
+    with rasterio.open(refl_path) as src:
+        # Promote to float64 before any arithmetic: integer rasters would
+        # otherwise be summed in their storage dtype and could wrap around.
+        b, g, r = (src.read(band).astype("float64") for band in (1, 2, 3))
+        profile = src.profile
+        nodata = src.nodata
+    total = b + g + r
+    invalid = (b < 0) | (g < 0) | (r < 0) | (total == 0)
+    if nodata is not None:
+        invalid |= (b == nodata) | (g == nodata) | (r == nodata)
+    # ``where=`` skips masked pixels, which keep the NaN fill value.
+    gcc = np.full(total.shape, np.nan)
+    np.divide(g, total + 1e-10, out=gcc, where=~invalid)
+    profile.update(count=1, dtype="float32")
+    with rasterio.open(gcc_path, "w", **profile) as dst:
+        dst.write(gcc[np.newaxis].astype("float32"))
+
+
+def _load_json_series(path: Path) -> list[dict]:
+    """Parse the JSON document at ``path``; return ``[]`` when it is not a file."""
+    return json.loads(path.read_text()) if path.is_file() else []
+
+
+def _load_site_coords(year: int) -> dict[str, tuple[float, float]]:
+    """Read ``(lat, lon)`` per site for ``year`` from ``metrics/manifest.json``.
+
+    Sites lacking either coordinate are left out.  A missing manifest yields
+    an empty mapping.
+    """
+    manifest_path = DATA_DIR / "metrics" / "manifest.json"
+    if not manifest_path.is_file():
+        return {}
+    manifest = json.loads(manifest_path.read_text())
+    per_year = manifest.get("sites", {}).get(str(year), {})
+    located = {
+        name: (meta.get("lat"), meta.get("lon")) for name, meta in per_year.items()
+    }
+    return {
+        name: (float(lat), float(lon))
+        for name, (lat, lon) in located.items()
+        if lat is not None and lon is not None
+    }
+
+
+def _discover_sites(year: int) -> list[str]:
+    """List, sorted, the site directories for ``year`` that hold a ``gcc_s2.json``."""
+    metrics_dir = DATA_DIR / "metrics" / str(year)
+    if not metrics_dir.is_dir():
+        return []
+    found: list[str] = []
+    for entry in metrics_dir.iterdir():
+        if entry.is_dir() and (entry / "gcc_s2.json").is_file():
+            found.append(entry.name)
+    found.sort()
+    return found
+
+
+def _build_hr_symlink_dir(s2_dir: Path, holdout_yyyymmdd: str, dest: Path) -> None:
+    """Populate ``dest`` with symlinks to the S2 inputs, minus one acquisition.
+
+    Files matching the REFL / GCC / DIST_CLOUD patterns are linked unless the
+    first eight characters of their third ``_``-separated stem field equal
+    ``holdout_yyyymmdd``.  An existing entry of the same name is replaced.
+    """
+    dest.mkdir(parents=True, exist_ok=True)
+    for pattern in ("*_REFL.tif", "*_GCC.tif", "*_DIST_CLOUD.tif"):
+        for src in sorted(s2_dir.glob(pattern)):
+            fields = src.stem.split("_")
+            date_token = fields[2][:8] if len(fields) >= 3 else ""
+            if date_token != holdout_yyyymmdd:
+                link = dest / src.name
+                # ``is_symlink`` also catches a dangling link, which
+                # ``exists`` reports as absent.
+                if link.exists() or link.is_symlink():
+                    link.unlink()
+                link.symlink_to(src.resolve())
+
+
+# ---------------------------------------------------------------------------
+# Line A — representativeness
+# ---------------------------------------------------------------------------
+
+
+def compute_representativeness(phenocam: list[dict], s2: list[dict]) -> dict[str, Any]:
+    """Compare PhenoCam ``gcc_90`` with co-located observed S2 ``gcc`` (Line A).
+
+    The two series are paired by nearest date.  Errors are computed as
+    ``s2 - phenocam`` and the regression fits PhenoCam on S2.
+
+    Args:
+        phenocam: points with ``date`` and ``gcc_90``.
+        s2: points with ``date`` and ``gcc``.
+
+    Returns:
+        A dict with ``n``, ``r``, ``spearman``, ``slope``, ``intercept``,
+        ``rmse``, ``bias``, ``peak_offset_days`` and ``representative``.
+        With fewer than two pairs every statistic stays ``None``.  A constant
+        series leaves the statistics that are undefined for it at ``None``.
+    """
+    obs, sim, _ = _match_series(phenocam, "gcc_90", s2, "gcc")
+    result: dict[str, Any] = {
+        "n": len(obs),
+        "r": None,
+        "spearman": None,
+        "slope": None,
+        "intercept": None,
+        "rmse": None,
+        "bias": None,
+        "peak_offset_days": None,
+        "representative": False,
+    }
+    if len(obs) < 2:
+        return result
+
+    obs_arr = np.array(obs, dtype=float)
+    sim_arr = np.array(sim, dtype=float)
+    diff = sim_arr - obs_arr
+    result["rmse"] = _r4(float(np.sqrt(np.mean(diff**2))))
+    result["bias"] = _r4(float(np.mean(diff)))
+
+    obs_varies = bool(np.ptp(obs_arr) > 0)
+    sim_varies = bool(np.ptp(sim_arr) > 0)
+    # Correlations are NaN when either side is constant; skipping them keeps
+    # NaN out of the JSON payload.
+    if obs_varies and sim_varies:
+        r, _ = pearsonr(obs_arr, sim_arr)
+        sp, _ = spearmanr(obs_arr, sim_arr)
+        result["r"] = _r4(float(r))
+        result["spearman"] = _r4(float(sp))
+        result["representative"] = float(r) >= REPR_R_THRESHOLD
+    # linregress raises ValueError when all x values are identical, which
+    # would abort the run for every site.
+    if sim_varies:
+        reg = linregress(sim_arr, obs_arr)
+        result["slope"] = _r4(float(reg.slope))
+        result["intercept"] = _r4(float(reg.intercept))
+
+    # The peak is searched over valid points only, not via a sentinel value.
+    pc_valid = [p for p in phenocam if p.get("gcc_90") is not None]
+    s2_valid = [p for p in s2 if p.get("gcc") is not None]
+    if pc_valid and s2_valid:
+        pc_peak = max(pc_valid, key=lambda p: p["gcc_90"])["date"]
+        s2_peak = max(s2_valid, key=lambda p: p["gcc"])["date"]
+        result["peak_offset_days"] = int(_day_gap(pc_peak, s2_peak))
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Line B — LOOCV
+# ---------------------------------------------------------------------------
+
+
+def _phenocam_lookup(phenocam: list[dict]) -> dict[str, float]:
+    """Map date -> ``gcc_90`` for every point that has a non-``None`` value."""
+    by_date: dict[str, float] = {}
+    for point in phenocam:
+        if point.get("gcc_90") is None:
+            continue
+        by_date[point["date"]] = point["gcc_90"]
+    return by_date
+
+
+def _nearest_phenocam(iso_date: str, lookup: dict[str, float]) -> float | None:
+    """Value at the lookup date closest to ``iso_date``.
+
+    Returns ``None`` for an empty lookup or when the closest date is more
+    than ``MATCH_TOLERANCE_DAYS`` away.
+    """
+    if not lookup:
+        return None
+    closest = min(sorted(lookup), key=lambda d: _day_gap(iso_date, d))
+    within_reach = _day_gap(iso_date, closest) <= MATCH_TOLERANCE_DAYS
+    return lookup[closest] if within_reach else None
+
+
+def run_loocv_site(
+    site: str,
+    year: int,
+    lat: float,
+    lon: float,
+    s2_series: list[dict],
+    phenocam: list[dict],
+    efast,
+) -> list[dict[str, Any]]:
+    """Leave-one-out EFAST for each cloud-free S2 date; return per-date records.
+
+    For every date in ``s2_series`` the matching S2 files are withheld, the
+    fusion is run twice for that date (``product="GCC"`` and
+    ``product="REFL"``), and both predictions are sampled at the site
+    coordinate.  The REFL output is converted to GCC before sampling.
+
+    Args:
+        site: site directory name under ``sentinel_data/{year}/``.
+        year: evaluation year.
+        lat: site latitude, passed to the pixel reader.
+        lon: site longitude, passed to the pixel reader.
+        s2_series: points with ``date`` and ``gcc``; each date is held out once.
+        phenocam: points with ``date`` and ``gcc_90``.
+        efast: module providing ``fusion`` (see ``_import_efast``).
+
+    Returns:
+        One record per processed date with keys ``date``, ``s2_truth``,
+        ``pred_bti``, ``pred_itb`` and ``phenocam``; a prediction or PhenoCam
+        value is ``None`` when it could not be obtained.
+    """
+    s2_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s2"
+    gcc_s3_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "gcc_s3"
+    s3_rgb_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s3_rgb"
+
+    pc_lookup = _phenocam_lookup(phenocam)
+    s2_truth = {p["date"]: p["gcc"] for p in s2_series}
+    fusion_kwargs = dict(
+        ratio=RESOLUTION_RATIO,
+        max_days=MAX_DAYS,
+        minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
+    )
+
+    records: list[dict[str, Any]] = []
+    dates = [p["date"] for p in s2_series]
+
+    # All scratch output lives in one temporary tree that is removed on exit.
+    with tempfile.TemporaryDirectory(prefix=f"loocv_{site}_") as tmp_root:
+        tmp = Path(tmp_root)
+        hr_dir = tmp / "hr"
+        itb_out = tmp / "itb"
+        bti_out = tmp / "bti"
+        bti_gcc = tmp / "bti_gcc"
+        itb_out.mkdir()
+        bti_out.mkdir()
+        bti_gcc.mkdir()
+
+        for iso_date in tqdm(dates, desc=f"{site} LOOCV", leave=False):
+            yyyymmdd = _iso_to_yyyymmdd(iso_date)
+            truth = s2_truth.get(iso_date)
+            # A date without an observed value cannot serve as a held-out truth.
+            if truth is None:
+                continue
+
+            # Rebuild the high-resolution input directory without this date.
+            if hr_dir.exists():
+                shutil.rmtree(hr_dir)
+            _build_hr_symlink_dir(s2_dir, yyyymmdd, hr_dir)
+
+            pred_date = datetime.strptime(yyyymmdd, "%Y%m%d")
+
+            # Clear the previous iteration's outputs so a failed fusion cannot
+            # be mistaken for a fresh result.
+            for f in itb_out.glob("*.tif"):
+                f.unlink()
+            for f in bti_out.glob("*.tif"):
+                f.unlink()
+            for f in bti_gcc.glob("*.tif"):
+                f.unlink()
+
+            # ItB: fuse the GCC product directly.
+            efast.fusion(
+                pred_date,
+                gcc_s3_dir,
+                hr_dir,
+                itb_out,
+                product="GCC",
+                **fusion_kwargs,
+            )
+            # BtI: fuse reflectance; GCC is derived from the result below.
+            efast.fusion(
+                pred_date,
+                s3_rgb_dir,
+                hr_dir,
+                bti_out,
+                product="REFL",
+                **fusion_kwargs,
+            )
+
+            itb_path = itb_out / f"GCC_{yyyymmdd}.tif"
+            refl_path = bti_out / f"REFL_{yyyymmdd}.tif"
+            bti_path = bti_gcc / f"GCC_{yyyymmdd}.tif"
+
+            pred_itb = (
+                _read_center_pixel(itb_path, lat, lon) if itb_path.is_file() else None
+            )
+            pred_bti = None
+            if refl_path.is_file():
+                _gcc_from_refl_file(refl_path, bti_path)
+                if bti_path.is_file():
+                    pred_bti = _read_center_pixel(bti_path, lat, lon)
+
+            pc_val = _nearest_phenocam(iso_date, pc_lookup)
+
+            records.append(
+                {
+                    "date": iso_date,
+                    "s2_truth": truth,
+                    "pred_bti": pred_bti,
+                    "pred_itb": pred_itb,
+                    "phenocam": pc_val,
+                }
+            )
+
+    return records
+
+
+def _method_accuracy(records: list[dict], pred_key: str, ref_key: str) -> dict | None:
+    """Accuracy of ``pred_key`` against ``ref_key`` over records that have both."""
+    complete = [
+        (rec.get(ref_key), rec.get(pred_key))
+        for rec in records
+        if rec.get(pred_key) is not None and rec.get(ref_key) is not None
+    ]
+    obs = [ref for ref, _ in complete]
+    sim = [pred for _, pred in complete]
+    return _accuracy_metrics(obs, sim)
+
+
+def _winner(rmse_bti: float | None, rmse_itb: float | None) -> str | None:
+    """Name the method with the lower RMSE.
+
+    Returns ``"bti"`` or ``"itb"``, ``"tie"`` when neither is strictly lower,
+    and ``None`` when either RMSE is missing.
+    """
+    if None in (rmse_bti, rmse_itb):
+        return None
+    if rmse_bti > rmse_itb:
+        return "itb"
+    return "bti" if rmse_bti < rmse_itb else "tie"
+
+
+def summarize_loocv(records: list[dict]) -> dict[str, Any]:
+    """Condense per-date LOOCV records into per-method accuracy and winners.
+
+    Each method (BtI, ItB) is scored against the S2 truth and against
+    PhenoCam.  ``winner_agreement`` is a bool only when both references name
+    a decisive winner; otherwise it is ``None``.
+    """
+
+    def _rmse(metrics: dict | None) -> float | None:
+        return metrics["rmse"] if metrics else None
+
+    bti = {
+        "vs_s2": _method_accuracy(records, "pred_bti", "s2_truth"),
+        "vs_phenocam": _method_accuracy(records, "pred_bti", "phenocam"),
+    }
+    itb = {
+        "vs_s2": _method_accuracy(records, "pred_itb", "s2_truth"),
+        "vs_phenocam": _method_accuracy(records, "pred_itb", "phenocam"),
+    }
+
+    winner_s2 = _winner(_rmse(bti["vs_s2"]), _rmse(itb["vs_s2"]))
+    winner_pc = _winner(_rmse(bti["vs_phenocam"]), _rmse(itb["vs_phenocam"]))
+
+    decisive = ("bti", "itb")
+    if winner_s2 in decisive and winner_pc in decisive:
+        agreement = winner_s2 == winner_pc
+    else:
+        agreement = None
+
+    return {
+        "n_dates": len(records),
+        "bti": bti,
+        "itb": itb,
+        "winner_s2": winner_s2,
+        "winner_phenocam": winner_pc,
+        "winner_agreement": agreement,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Aggregate concordance
+# ---------------------------------------------------------------------------
+
+
+def _pooled_concordance(
+    all_records: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Pooled metrics across all held-out dates.
+
+    For every record that has an S2 truth and a PhenoCam value, the absolute
+    error of each available prediction (BtI, ItB) is taken against both
+    references.  The two error vectors are then correlated.
+
+    Args:
+        all_records: per-site results carrying ``records``, ``eligible`` and
+            ``winner_agreement``.
+
+    Returns:
+        A dict with ``pooled_spearman`` and ``residual_corr`` (``None`` with
+        fewer than three error pairs or when the statistic is undefined),
+        ``winner_agreement_rate`` (``None`` when no eligible site has a
+        decisive agreement flag) and ``n_loocv_dates``.
+    """
+    err_vs_s2: list[float] = []
+    err_vs_pc: list[float] = []
+
+    for site_data in all_records:
+        for rec in site_data.get("records", []):
+            truth = rec.get("s2_truth")
+            pc = rec.get("phenocam")
+            if truth is None or pc is None:
+                continue
+            for key in ("pred_bti", "pred_itb"):
+                pred = rec.get(key)
+                if pred is None:
+                    continue
+                err_vs_s2.append(abs(pred - truth))
+                err_vs_pc.append(abs(pred - pc))
+
+    pooled_spearman = None
+    residual_corr = None
+    if len(err_vs_s2) >= 3:
+        # Both statistics are NaN for constant input; NaN must not reach the
+        # JSON payload, so it is reported as None instead.
+        sp, _ = spearmanr(err_vs_s2, err_vs_pc)
+        if np.isfinite(sp):
+            pooled_spearman = _r4(float(sp))
+        rc, _ = pearsonr(np.array(err_vs_s2), np.array(err_vs_pc))
+        if np.isfinite(rc):
+            residual_corr = _r4(float(rc))
+
+    agreements = [
+        s.get("winner_agreement")
+        for s in all_records
+        if s.get("eligible") and s.get("winner_agreement") is not None
+    ]
+    winner_agreement_rate = (
+        _r4(sum(1 for a in agreements if a) / len(agreements)) if agreements else None
+    )
+
+    n_loocv_dates = sum(len(s.get("records", [])) for s in all_records)
+
+    return {
+        "pooled_spearman": pooled_spearman,
+        "residual_corr": residual_corr,
+        "winner_agreement_rate": winner_agreement_rate,
+        "n_loocv_dates": n_loocv_dates,
+    }
+
+
+def _suitability_verdict(
+    n_repr_pass: int,
+    n_eligible: int,
+    n_total: int,
+    pooled: dict[str, Any],
+) -> str:
+    """Turn the aggregate numbers into a verdict string.
+
+    Four criteria are counted, each against its own threshold: the share of
+    representative sites (0.6), winner agreement rate (0.7), pooled Spearman
+    (0.8) and residual correlation (0.5).  Three or more passes give
+    ``"suitable"``; at least one pass, or a representative share of 0.4 or
+    more, gives ``"partially suitable"``; otherwise ``"not suitable"``.
+    With no eligible site the verdict is ``"insufficient data"``.
+    """
+    if n_eligible == 0:
+        return "insufficient data"
+    repr_rate = n_repr_pass / n_total if n_total else 0
+
+    criteria = (
+        (repr_rate, 0.6),
+        (pooled.get("winner_agreement_rate"), 0.7),
+        (pooled.get("pooled_spearman"), 0.8),
+        (pooled.get("residual_corr"), 0.5),
+    )
+    strong = sum(1 for value, bar in criteria if value is not None and value >= bar)
+
+    if strong >= 3:
+        return "suitable"
+    if strong >= 1 or repr_rate >= 0.4:
+        return "partially suitable"
+    return "not suitable"
+
+
+# ---------------------------------------------------------------------------
+# Per-site processing
+# ---------------------------------------------------------------------------
+
+
+def process_site(
+    site: str,
+    year: int,
+    lat: float,
+    lon: float,
+    min_cloudfree: int,
+    efast,
+) -> dict[str, Any]:
+    """Run Line A for a site and, when it has enough S2 dates, Line B as well.
+
+    A site is eligible for LOOCV when its S2 series has at least
+    ``min_cloudfree`` entries.  For an ineligible site the LOOCV fields stay
+    ``None`` and ``records`` stays empty.
+    """
+    site_metrics = DATA_DIR / "metrics" / str(year) / site
+    phenocam = _load_json_series(site_metrics / "gcc_phenocam.json")
+    s2_series = _load_json_series(site_metrics / "gcc_s2.json")
+
+    n_cloudfree = len(s2_series)
+    eligible = n_cloudfree >= min_cloudfree
+    result: dict[str, Any] = {
+        "eligible": eligible,
+        "n_cloudfree_s2": n_cloudfree,
+        "representativeness": compute_representativeness(phenocam, s2_series),
+        "loocv": None,
+        "winner_s2": None,
+        "winner_phenocam": None,
+        "winner_agreement": None,
+        "records": [],
+    }
+
+    if eligible:
+        records = run_loocv_site(site, year, lat, lon, s2_series, phenocam, efast)
+        loocv = summarize_loocv(records)
+        result["loocv"] = loocv
+        # Winner fields are mirrored at the top level of the site result.
+        for field in ("winner_s2", "winner_phenocam", "winner_agreement"):
+            result[field] = loocv[field]
+        result["records"] = records
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Output / summary
+# ---------------------------------------------------------------------------
+
+
+def _compact_site_payload(site_result: dict[str, Any]) -> dict[str, Any]:
+    """Build the JSON view of a site result without the raw LOOCV records.
+
+    The ``loocv`` summary is included only when it is present and non-empty.
+    """
+    compact: dict[str, Any] = {
+        key: site_result[key]
+        for key in ("eligible", "n_cloudfree_s2", "representativeness")
+    }
+    for key in ("winner_s2", "winner_phenocam", "winner_agreement"):
+        compact[key] = site_result.get(key)
+    loocv = site_result.get("loocv")
+    if loocv:
+        compact["loocv"] = site_result["loocv"]
+    return compact
+
+
+def _print_summary(payload: dict[str, Any]) -> None:
+    """Print the aggregate verdict and a per-site table to stdout.
+
+    Missing values (``None``) are shown as a dash.  The aggregate keys are
+    always present in the payload, so a ``dict.get`` default would never be
+    used for them; the ``None`` check is done explicitly instead.
+
+    Args:
+        payload: the output document assembled in ``main``.
+    """
+
+    def _cell(value: Any) -> Any:
+        # Only a missing value becomes a dash; a legitimate 0.0 is kept.
+        return "—" if value is None else value
+
+    year = payload["year"]
+    agg = payload["aggregate"]
+    print(
+        f"\nPhenoCam GCC suitability — {year} "
+        f"({payload['n_sites_total']} site(s), "
+        f"{payload['n_sites_eligible']} LOOCV-eligible, "
+        f"{payload['n_sites_repr_pass']} representative)"
+    )
+    print(f"Verdict: {agg['suitability_verdict']}")
+    print(
+        f"  pooled Spearman (method errors): {_cell(agg.get('pooled_spearman'))}  "
+        f"residual corr: {_cell(agg.get('residual_corr'))}  "
+        f"winner agreement: {_cell(agg.get('winner_agreement_rate'))}  "
+        f"LOOCV dates: {_cell(agg.get('n_loocv_dates'))}"
+    )
+    print(f"\n{'site':<28} {'repr r':>8} {'pass':>5} {'LOOCV n':>8} {'win agree':>10}")
+    print("-" * 65)
+    for site, data in sorted(payload["sites"].items()):
+        rep = data["representativeness"]
+        loocv = data.get("loocv")
+        loocv_n = _cell(loocv.get("n_dates")) if loocv else "—"
+        agree = data.get("winner_agreement")
+        agree_s = "yes" if agree else ("no" if agree is False else "—")
+        pass_s = "yes" if rep.get("representative") else "no"
+        print(
+            f"{site:<28} {_cell(rep.get('r')):>8} {pass_s:>5} "
+            f"{loocv_n!s:>8} {agree_s:>10}"
+        )
+    if payload["n_sites_total"] < SMALL_SAMPLE_SITES:
+        print(
+            f"\nNote: only {payload['n_sites_total']} site(s); "
+            "interpret cross-site aggregates cautiously."
+        )
+    if payload.get("dropped_sites"):
+        print(f"Dropped/ineligible: {', '.join(payload['dropped_sites'])}")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """CLI entry point: process every discovered site and write the summary JSON.
+
+    Exits via ``SystemExit`` when no Step 5 metrics exist for the year.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+    parser.add_argument(
+        "--min-cloudfree-s2",
+        type=int,
+        default=MIN_CLOUDFREE_S2,
+        help="Minimum cloud-free S2 dates for LOOCV (default 10)",
+    )
+    parser.add_argument(
+        "--alpha",
+        type=float,
+        default=DEFAULT_ALPHA,
+        help="Significance threshold (reserved; default 0.05)",
+    )
+    args = parser.parse_args()
+
+    year = args.evaluation_year
+    min_cloudfree = args.min_cloudfree_s2
+
+    sites = _discover_sites(year)
+    if not sites:
+        raise SystemExit(
+            f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
+        )
+
+    coords = _load_site_coords(year)
+    efast = _import_efast()
+
+    site_results: dict[str, dict[str, Any]] = {}
+    # Sites without coordinates and sites that are not LOOCV-eligible.
+    dropped: list[str] = []
+
+    for site in tqdm(sites, desc="Sites"):
+        # Without coordinates the site is skipped entirely (no result entry).
+        if site not in coords:
+            dropped.append(site)
+            continue
+        lat, lon = coords[site]
+        site_results[site] = process_site(site, year, lat, lon, min_cloudfree, efast)
+        # Ineligible sites keep their result entry but are also listed as dropped.
+        if not site_results[site]["eligible"]:
+            dropped.append(site)
+
+    n_eligible = sum(1 for s in site_results.values() if s["eligible"])
+    n_repr_pass = sum(
+        1
+        for s in site_results.values()
+        if s["representativeness"].get("representative")
+    )
+
+    pooled = _pooled_concordance(list(site_results.values()))
+    # The total passed here counts every discovered site, including those
+    # skipped for missing coordinates.
+    verdict = _suitability_verdict(n_repr_pass, n_eligible, len(sites), pooled)
+
+    payload = {
+        "year": year,
+        "alpha": args.alpha,
+        "repr_r_threshold": REPR_R_THRESHOLD,
+        "min_cloudfree_s2": min_cloudfree,
+        "n_sites_total": len(sites),
+        "n_sites_eligible": n_eligible,
+        "n_sites_repr_pass": n_repr_pass,
+        "aggregate": {
+            "suitability_verdict": verdict,
+            **pooled,
+        },
+        "sites": {
+            site: _compact_site_payload(data)
+            for site, data in sorted(site_results.items())
+        },
+        "dropped_sites": sorted(set(dropped)),
+    }
+
+    out_dir = DATA_DIR / "gcc_suitability"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / f"{year}.json"
+    # Compact separators keep the file small.
+    out_path.write_text(json.dumps(payload, separators=(",", ":")))
+
+    _print_summary(payload)
+    print(f"\nWritten → {out_path}")
+
+if __name__ == "__main__":
+    main()
--- a/README.md
+++ b/README.md
@ -14,6 +14,7 @@ End-to-end pipeline from selecting sites from the global [PhenoCam Network](http
 | 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site |
 | 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest |
 | 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites |
+| 7 | `7-gcc-suitability.py` | PhenoCam GCC suitability as a fusion-accuracy reference (representativeness + LOOCV concordance) |

 ---

@ -43,9 +44,10 @@ uv run python 3-sentinel-data.py --evaluation-year 2025
 uv run python 4-fusion.py --evaluation-year 2025
 uv run python 5-metrics.py --evaluation-year 2025
 uv run python 6-statistics-fusion-order.py --evaluation-year 2025
+uv run python 7-gcc-suitability.py --evaluation-year 2025
 ```

-Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Step 6 is a full-sample aggregate and only accepts `--evaluation-year` and `--alpha` (default `0.05`). Steps 3–5 are resumable — existing output files are skipped.
+Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Steps 6–7 are full-sample aggregates: they do not accept `--site`, only `--evaluation-year` and `--alpha` (default `0.05`); Step 7 additionally accepts `--min-cloudfree-s2` (default `10`). Steps 3–5 are resumable — existing output files are skipped.

 ```bash
 # single site
@ -73,6 +75,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an
 | `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON |
 | `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) |
 | `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) |
+| `gcc_suitability/{year}.json` | 7 | PhenoCam GCC suitability summary (representativeness + LOOCV concordance) |

 ---

@ -80,7 +83,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an



-`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`).
+`python3 -m http.server 8080` runs the webapp on [http://localhost:8080/index.html](http://localhost:8080/index.html). Requires step 5 output (`data/metrics/manifest.json`). The Statistics overlay GCC suitability tab uses step 7 output (`data/gcc_suitability/{year}.json`).

 ---

--- a/index.html
+++ b/index.html
@ -181,6 +181,13 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
 .stat-badge.bti { background: #e3f2fd; color: #0d47a1; }
 .stat-badge.none { background: #f5f5f5; color: #777; font-weight: 400; }
 .stat-badge.insuf { background: #fce4ec; color: #b71c1c; font-weight: 400; }
+.stat-badge.pass { background: #e8f5e9; color: #1b5e20; }
+.stat-badge.partial { background: #fff3e0; color: #e65100; font-weight: 400; }
+.stat-badge.fail { background: #fce4ec; color: #b71c1c; font-weight: 400; }
+.stat-site-table { width: 100%; border-collapse: collapse; font-size: 12px; margin-top: 8px; }
+.stat-site-table th, .stat-site-table td { padding: 4px 8px; text-align: left; border-bottom: 1px solid #f0f0f0; }
+.stat-site-table th { color: #888; font-weight: 500; }
+.stat-site-table .sval { font-variant-numeric: tabular-nums; }
 .stat-row-table { width: 100%; border-collapse: collapse; font-size: 12px; }
 .stat-row-table td { padding: 3px 0; vertical-align: top; }
 .stat-row-table .slabel { color: #888; width: 46%; }
@ -243,12 +250,7 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
      <span class="overlay-meta" id="statsMeta"></span>
      <button type="button" class="overlay-close" id="statsClose">Close</button>
    </div>
-    <div id="statsTabGcc" class="stats-tab-panel" style="display:none">
-      <div class="stat-placeholder">
-        <p>GCC suitability</p>
-        Coming soon.
-      </div>
-    </div>
+    <div id="statsTabGcc" class="stats-tab-panel" style="display:none"></div>
    <div id="statsTabComparison" class="stats-tab-panel"></div>
    <div id="statsTabSites" class="stats-tab-panel" style="display:none">
      <div class="stat-placeholder">
@ -352,6 +354,8 @@ let statsOverlayOpen = false;
 let statsTab = "comparison";
 let statsData = null;
 let statsYear = null;
+let gccSuitabilityData = null;
+let gccSuitabilityYear = null;
 const maps3 = {};         // { s2, fusion, s3 } Leaflet instances
 const overlays3 = {};     // current ImageOverlay per map
 const markers3 = {};      // site dot markers per map
@ -620,15 +624,23 @@ function betterOrderLabel(order) {
 }

 function updateStatsMeta() {
-  if (statsTab !== "comparison" || !statsData) {
-    qs("#statsMeta").textContent = `${currentYear}`;
-    return;
-  }
+  if (statsTab === "comparison" && statsData) {
    const nPairs = statsData.metrics?.nse?.n_pairs;
    const alpha = statsData.alpha ?? 0.05;
    qs("#statsMeta").textContent = nPairs != null
      ? `${nPairs} paired site${nPairs === 1 ? "" : "s"} · α=${alpha} · ${currentYear}`
      : `${currentYear}`;
+    return;
+  }
+  if (statsTab === "gcc" && gccSuitabilityData) {
+    const nEl = gccSuitabilityData.n_sites_eligible;
+    const nRep = gccSuitabilityData.n_sites_repr_pass;
+    qs("#statsMeta").textContent = nEl != null
+      ? `${nEl} eligible · ${nRep} representative · ${currentYear}`
+      : `${currentYear}`;
+    return;
+  }
+  qs("#statsMeta").textContent = `${currentYear}`;
 }

 function escHtml(s) {
@ -720,6 +732,130 @@ async function loadStatsPanel() {
  }
 }

+/** Map a suitability verdict string to its capitalised display label. */
+function suitabilityVerdictLabel(v) {
+  switch (v) {
+    case "suitable":
+      return "Suitable";
+    case "partially suitable":
+      return "Partially suitable";
+    case "not suitable":
+      return "Not suitable";
+    case "insufficient data":
+      return "Insufficient data";
+    default:
+      // Unrecognised verdicts pass through unchanged; falsy values become a dash.
+      return v || "—";
+  }
+}
+
+/** Pick the stat-badge CSS modifier class for a suitability verdict string. */
+function suitabilityBadgeClass(v) {
+  switch (v) {
+    case "suitable":
+      return "pass";
+    case "partially suitable":
+      return "partial";
+    case "not suitable":
+      return "fail";
+    default:
+      // "insufficient data", unknown and missing verdicts all share one class.
+      return "insuf";
+  }
+}
+
+/** Render a strict boolean as "yes"/"no"; any other value becomes a dash. */
+function yesNo(v) {
+  if (typeof v !== "boolean") return "—";
+  return v ? "yes" : "no";
+}
+
+/** Translate a winner code ("bti" / "itb" / "tie") into its display label. */
+function winnerLabel(w) {
+  switch (w) {
+    case "bti":
+      return "BtI";
+    case "itb":
+      return "ItB";
+    case "tie":
+      return "tie";
+    default:
+      // Missing or unrecognised codes render as a dash.
+      return "—";
+  }
+}
+
+/**
+ * Build the "Excluded sites" HTML block from data.dropped_sites.
+ * Returns an empty string when the list is missing or empty; otherwise one
+ * escaped .stat-site-link button per site under a heading with the count.
+ */
+function gccDroppedSitesList(data) {
+  const excluded = data.dropped_sites || [];
+  if (!excluded.length) return "";
+  const buttons = excluded.map(name => {
+    const attr = escHtml(name);
+    const text = escHtml(name);
+    return `<button type="button" class="stat-site-link" data-site="${attr}">${text}</button>`;
+  }).join("");
+  return `<div class="stat-dropped">
+    <h4>Excluded sites (${excluded.length})</h4>
+    <p class="stat-dropped-note">Missing coordinates or fewer cloud-free S2 dates than the LOOCV minimum.</p>
+    <div class="stat-dropped-list">${buttons}</div>
+  </div>`;
+}
+
+/**
+ * Render the GCC suitability tab (#statsTabGcc) from a loaded payload.
+ *
+ * Builds three cards (aggregate verdict, Line A per-site representativeness,
+ * Line B LOOCV concordance), appends the excluded-sites list, then refreshes
+ * the overlay header via updateStatsMeta().
+ *
+ * @param {object} data Parsed contents of data/gcc_suitability/{year}.json.
+ */
+function renderGccSuitabilityPanel(data) {
+  const panel = qs("#statsTabGcc");
+  const agg = data.aggregate || {};
+  // Everything below is concatenated into innerHTML, so JSON-derived values are
+  // escaped the same way site names are; null/undefined render as a dash.
+  const txt = v => escHtml(String(v ?? "—"));
+  const threshold = txt(data.repr_r_threshold ?? 0.7);
+  const totalSites = txt(data.n_sites_total);
+  const verdict = agg.suitability_verdict;
+  // Unknown verdict strings pass through suitabilityVerdictLabel() verbatim,
+  // hence the escape on the label.
+  const badge = `<span class="stat-badge ${suitabilityBadgeClass(verdict)}">${txt(suitabilityVerdictLabel(verdict))}</span>`;
+  const row = (label, val) =>
+    `<tr><td class="slabel">${label}</td><td class="sval">${val}</td></tr>`;
+
+  const siteRows = Object.entries(data.sites || {}).sort(([a], [b]) => a.localeCompare(b)).map(([site, s]) => {
+    // The flag may sit at the top level or inside the representativeness object.
+    const rep = s.representative ?? s.representativeness?.representative;
+    // A missing flag means "not determined", which is not the same as a fail.
+    let repBadge;
+    if (rep == null) {
+      repBadge = '<span class="stat-badge insuf">—</span>';
+    } else if (rep) {
+      repBadge = '<span class="stat-badge pass">pass</span>';
+    } else {
+      repBadge = '<span class="stat-badge fail">fail</span>';
+    }
+    const loocv = s.loocv || {};
+    return `<tr>
+      <td><button type="button" class="stat-site-link" data-site="${escHtml(site)}">${escHtml(site)}</button></td>
+      <td class="sval">${fmtStat(s.representativeness?.r)}</td>
+      <td>${repBadge}</td>
+      <td class="sval">${txt(loocv.n_dates)}</td>
+      <td class="sval">${winnerLabel(s.winner_s2)} / ${winnerLabel(s.winner_phenocam)}</td>
+      <td class="sval">${yesNo(s.winner_agreement)}</td>
+    </tr>`;
+  }).join("");
+
+  const aggregateCard = `<div class="stat-card">
+    <h3>Aggregate verdict ${badge}</h3>
+    <table class="stat-row-table">
+      ${row("Pooled Spearman (errors)", fmtStat(agg.pooled_spearman))}
+      ${row("Residual correlation", fmtStat(agg.residual_corr))}
+      ${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
+      ${row("LOOCV dates (pooled)", txt(agg.n_loocv_dates))}
+      ${row("Representative sites", `${txt(data.n_sites_repr_pass)} / ${totalSites}`)}
+      ${row("LOOCV-eligible sites", `${txt(data.n_sites_eligible)} / ${totalSites}`)}
+    </table>
+  </div>`;
+
+  const reprCard = `<div class="stat-card">
+    <h3>Line A — PhenoCam vs S2 representativeness</h3>
+    <p style="font-size:11px;color:#999;margin:0 0 8px">Pass when Pearson r ≥ ${threshold} (oblique footprint tracks co-located S2 GCC).</p>
+    <table class="stat-site-table">
+      <thead><tr>
+        <th>Site</th><th>r</th><th>Pass</th><th>LOOCV n</th><th>Winner S2 / PC</th><th>Agree</th>
+      </tr></thead>
+      <tbody>${siteRows || '<tr><td colspan="6">No sites</td></tr>'}</tbody>
+    </table>
+  </div>`;
+
+  const concordanceCard = `<div class="stat-card">
+    <h3>Line B — LOOCV concordance</h3>
+    <p style="font-size:11px;color:#999;margin:0 0 8px">Same EFAST predictions scored against held-out S2 truth vs PhenoCam. Winner agreement and error correlations test whether PhenoCam ranks fusion methods like the satellite-internal reference.</p>
+    <table class="stat-row-table">
+      ${row("Min cloud-free S2 gate", txt(data.min_cloudfree_s2))}
+      ${row("Pooled LOOCV dates", txt(agg.n_loocv_dates))}
+      ${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
+    </table>
+  </div>`;
+
+  panel.innerHTML =
+    `<div class="stat-summary">Is PhenoCam GCC a valid reference for ranking fusion accuracy? · ${totalSites} site(s) · r threshold ${threshold}</div>` +
+    `<div class="stat-grid">${aggregateCard}${reprCard}${concordanceCard}</div>` +
+    gccDroppedSitesList(data);
+  updateStatsMeta();
+}
+
+/**
+ * Fetch data/gcc_suitability/{year}.json for the selected year and render it
+ * into #statsTabGcc, caching the payload in gccSuitabilityData/Year.
+ *
+ * The year is captured before the request. If the selection changes while the
+ * fetch is in flight, the stale response (or failure) is dropped so that it is
+ * neither rendered nor cached under the wrong year.
+ * On failure the cache is cleared and a "no file" hint is shown.
+ */
+async function loadGccSuitabilityPanel() {
+  const panel = qs("#statsTabGcc");
+  const year = currentYear;
+  panel.innerHTML = '<div class="stat-nodata">Loading…</div>';
+  try {
+    const data = await fetch(`data/gcc_suitability/${year}.json`)
+      .then(r => { if (!r.ok) throw new Error(`HTTP ${r.status}`); return r.json(); });
+    // Superseded by a year change while awaiting: leave state to the newer load.
+    if (year !== currentYear) return;
+    gccSuitabilityData = data;
+    gccSuitabilityYear = year;
+    renderGccSuitabilityPanel(data);
+  } catch {
+    if (year !== currentYear) return;
+    gccSuitabilityData = null;
+    gccSuitabilityYear = null;
+    panel.innerHTML =
+      '<div class="stat-nodata">No GCC suitability file found — run 7-gcc-suitability.py first.</div>';
+    updateStatsMeta();
+  }
+}
+
 const STATS_TAB_PANELS = {
  comparison: "#statsTabComparison",
  gcc: "#statsTabGcc",
@ -736,6 +872,9 @@ function switchStatsTab(tab, updateHash = true) {
  if (tab === "comparison") {
    if (statsYear !== currentYear || !statsData) loadStatsPanel();
    else updateStatsMeta();
+  } else if (tab === "gcc") {
+    if (gccSuitabilityYear !== currentYear || !gccSuitabilityData) loadGccSuitabilityPanel();
+    else updateStatsMeta();
  } else {
    updateStatsMeta();
  }
@ -783,9 +922,12 @@ async function init() {
    currentYear = +yearSel.value;
    statsData = null;
    statsYear = null;
+    gccSuitabilityData = null;
+    gccSuitabilityYear = null;
    buildSiteList();
    if (worldOverlayOpen) buildWorldMap();
    if (statsOverlayOpen && statsTab === "comparison") loadStatsPanel();
+    if (statsOverlayOpen && statsTab === "gcc") loadGccSuitabilityPanel();
  });

  qs("#worldMapBtn").addEventListener("click", () => openWorldOverlay());
@ -797,6 +939,11 @@ async function init() {
    if (!link) return;
    pickSiteFromStats(link.dataset.site);
  });
+  qs("#statsTabGcc").addEventListener("click", e => {
+    const link = e.target.closest(".stat-site-link");
+    if (!link) return;
+    pickSiteFromStats(link.dataset.site);
+  });
  qs("#worldClose").addEventListener("click", () => closeWorldOverlay());
  qs("#statsClose").addEventListener("click", () => closeStatsOverlay());
  qs("#worldOverlay").addEventListener("click", e => {