Added gcc suitability stats.

This commit is contained in:
Felix Delattre 2026-06-17 12:29:35 +02:00
parent d55ee31e8d
commit a593683314
3 changed files with 907 additions and 15 deletions

742
7-gcc-suitability.py Normal file
View file

@ -0,0 +1,742 @@
"""Step 7: PhenoCam GCC suitability as a fusion-accuracy reference.
Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
- ``metrics/{year}/{site}/gcc_s2.json``, ``gcc_phenocam.json`` Step 5 timeseries
- ``metrics/manifest.json`` site lat/lon
- ``sentinel_data/{year}/{site}/prepared/s2/`` S2 REFL/GCC/DIST_CLOUD (Steps 34)
- ``sentinel_data/{year}/{site}/prepared/gcc_s3/``, ``prepared/s3_rgb/`` Step 4
Outputs (``data/gcc_suitability/``):
- ``{year}.json`` representativeness (Line A), LOOCV concordance (Line B),
per-site and aggregate suitability verdict
CLI:
- ``--evaluation-year`` (default 2025)
- ``--min-cloudfree-s2`` (default 10) minimum cloud-free S2 dates for LOOCV
- ``--alpha`` (default 0.05) reserved for future significance tests
Full-sample aggregate; does not accept ``--site``.
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any
import numpy as np
import rasterio
from pyproj import Transformer
from rasterio.crs import CRS
from rasterio.transform import rowcol
from scipy.stats import linregress, pearsonr, spearmanr
from tqdm import tqdm
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
DATA_DIR = Path("data")
DEFAULT_YEAR = 2025
DEFAULT_ALPHA = 0.05
MIN_CLOUDFREE_S2 = 10
REPR_R_THRESHOLD = 0.7
MATCH_TOLERANCE_DAYS = 5
RESOLUTION_RATIO = 30
MAX_DAYS = 100
MINIMUM_ACQUISITION_IMPORTANCE = 0
SMALL_SAMPLE_SITES = 6
# ---------------------------------------------------------------------------
# efast import
# ---------------------------------------------------------------------------
def _import_efast():
try:
import efast.efast as efast_module
return efast_module
except ImportError as exc:
raise ImportError("efast not found. Install with: uv sync") from exc
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _r4(v: float | None) -> float | None:
return round(v, 4) if v is not None else None
def _window_mean(data: np.ndarray) -> float | None:
valid = data[~np.isnan(data)]
if valid.size == 0:
return None
return float(np.mean(valid))
def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
try:
with rasterio.open(path) as src:
transformer = Transformer.from_crs(
CRS.from_epsg(4326), src.crs, always_xy=True
)
x, y = transformer.transform(lon, lat)
row, col = rowcol(src.transform, x, y)
h, w = src.height, src.width
r0, r1 = max(0, row - 1), min(h, row + 2)
c0, c1 = max(0, col - 1), min(w, col + 2)
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
data = src.read(1, window=window).astype(float)
nodata = src.nodata
if nodata is not None:
data = np.where(data == nodata, np.nan, data)
data[data == 0] = np.nan
return _window_mean(data)
except Exception:
return None
def _date_from_s2_tif(path: Path) -> str | None:
parts = path.stem.split("_")
if len(parts) >= 3:
m = re.match(r"(\d{8})", parts[2])
return m.group(1) if m else None
return None
def _iso_to_yyyymmdd(iso: str) -> str:
return iso.replace("-", "")
def _yyyymmdd_to_iso(d: str) -> str:
return f"{d[:4]}-{d[4:6]}-{d[6:]}"
def _day_gap(a: str, b: str) -> float:
return abs((np.datetime64(a) - np.datetime64(b)) / np.timedelta64(1, "D"))
def _match_series(
ref: list[dict],
ref_key: str,
pred: list[dict],
pred_key: str,
tolerance_days: int = MATCH_TOLERANCE_DAYS,
) -> tuple[list[float], list[float], list[str]]:
"""Return paired (ref_vals, pred_vals, ref_dates) within tolerance."""
ref_lookup: dict[str, float] = {
p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
}
if not ref_lookup:
return [], [], []
ref_dates = sorted(ref_lookup)
obs, sim, matched_dates = [], [], []
for pt in pred:
v = pt.get(pred_key)
if v is None:
continue
nearest = min(ref_dates, key=lambda d: _day_gap(pt["date"], d))
if _day_gap(pt["date"], nearest) <= tolerance_days:
obs.append(ref_lookup[nearest])
sim.append(v)
matched_dates.append(nearest)
return obs, sim, matched_dates
def _accuracy_metrics(obs: list[float], sim: list[float]) -> dict[str, Any] | None:
if len(obs) < 2:
return None
obs_arr = np.array(obs, dtype=float)
sim_arr = np.array(sim, dtype=float)
diff = sim_arr - obs_arr
rmse = float(np.sqrt(np.mean(diff**2)))
mae = float(np.mean(np.abs(diff)))
bias = float(np.mean(diff))
r, _ = pearsonr(obs_arr, sim_arr)
return {
"n": len(obs),
"rmse": _r4(rmse),
"mae": _r4(mae),
"bias": _r4(bias),
"r": _r4(float(r)),
}
def _gcc_from_refl_file(refl_path: Path, gcc_path: Path) -> None:
with rasterio.open(refl_path) as src:
b, g, r = src.read(1), src.read(2), src.read(3)
profile = src.profile
total = b + g + r
invalid = (b < 0) | (g < 0) | (r < 0)
gcc = np.where(invalid, np.nan, g / (total + 1e-10))
gcc[total == 0] = np.nan
profile.update(count=1, dtype="float32")
with rasterio.open(gcc_path, "w", **profile) as dst:
dst.write(gcc[np.newaxis].astype("float32"))
def _load_json_series(path: Path) -> list[dict]:
if not path.is_file():
return []
return json.loads(path.read_text())
def _load_site_coords(year: int) -> dict[str, tuple[float, float]]:
manifest_path = DATA_DIR / "metrics" / "manifest.json"
if not manifest_path.is_file():
return {}
manifest = json.loads(manifest_path.read_text())
sites = manifest.get("sites", {}).get(str(year), {})
coords: dict[str, tuple[float, float]] = {}
for site, meta in sites.items():
lat, lon = meta.get("lat"), meta.get("lon")
if lat is not None and lon is not None:
coords[site] = (float(lat), float(lon))
return coords
def _discover_sites(year: int) -> list[str]:
metrics_dir = DATA_DIR / "metrics" / str(year)
if not metrics_dir.is_dir():
return []
return sorted(
d.name
for d in metrics_dir.iterdir()
if d.is_dir() and (d / "gcc_s2.json").is_file()
)
def _build_hr_symlink_dir(s2_dir: Path, holdout_yyyymmdd: str, dest: Path) -> None:
"""Symlink all S2 hr inputs except the held-out acquisition date."""
dest.mkdir(parents=True, exist_ok=True)
for pattern in ("*_REFL.tif", "*_GCC.tif", "*_DIST_CLOUD.tif"):
for src in sorted(s2_dir.glob(pattern)):
date_token = (
src.stem.split("_")[2][:8] if len(src.stem.split("_")) >= 3 else ""
)
if date_token == holdout_yyyymmdd:
continue
link = dest / src.name
if link.exists() or link.is_symlink():
link.unlink()
link.symlink_to(src.resolve())
# ---------------------------------------------------------------------------
# Line A — representativeness
# ---------------------------------------------------------------------------
def compute_representativeness(phenocam: list[dict], s2: list[dict]) -> dict[str, Any]:
"""PhenoCam gcc_90 vs co-located observed S2 GCC."""
obs, sim, _ = _match_series(phenocam, "gcc_90", s2, "gcc")
result: dict[str, Any] = {
"n": len(obs),
"r": None,
"spearman": None,
"slope": None,
"intercept": None,
"rmse": None,
"bias": None,
"peak_offset_days": None,
"representative": False,
}
if len(obs) < 2:
return result
obs_arr = np.array(obs, dtype=float)
sim_arr = np.array(sim, dtype=float)
r, _ = pearsonr(obs_arr, sim_arr)
sp, _ = spearmanr(obs_arr, sim_arr)
reg = linregress(sim_arr, obs_arr)
diff = sim_arr - obs_arr
result.update(
{
"r": _r4(float(r)),
"spearman": _r4(float(sp)),
"slope": _r4(float(reg.slope)),
"intercept": _r4(float(reg.intercept)),
"rmse": _r4(float(np.sqrt(np.mean(diff**2)))),
"bias": _r4(float(np.mean(diff))),
"representative": float(r) >= REPR_R_THRESHOLD,
}
)
pc_dates = [p["date"] for p in phenocam if p.get("gcc_90") is not None]
s2_dates = [p["date"] for p in s2 if p.get("gcc") is not None]
if pc_dates and s2_dates:
pc_peak = max(
phenocam,
key=lambda p: p["gcc_90"] if p.get("gcc_90") is not None else -1,
)["date"]
s2_peak = max(s2, key=lambda p: p["gcc"] if p.get("gcc") is not None else -1)[
"date"
]
result["peak_offset_days"] = int(_day_gap(pc_peak, s2_peak))
return result
# ---------------------------------------------------------------------------
# Line B — LOOCV
# ---------------------------------------------------------------------------
def _phenocam_lookup(phenocam: list[dict]) -> dict[str, float]:
return {p["date"]: p["gcc_90"] for p in phenocam if p.get("gcc_90") is not None}
def _nearest_phenocam(iso_date: str, lookup: dict[str, float]) -> float | None:
if not lookup:
return None
dates = sorted(lookup)
nearest = min(dates, key=lambda d: _day_gap(iso_date, d))
if _day_gap(iso_date, nearest) <= MATCH_TOLERANCE_DAYS:
return lookup[nearest]
return None
def run_loocv_site(
site: str,
year: int,
lat: float,
lon: float,
s2_series: list[dict],
phenocam: list[dict],
efast,
) -> list[dict[str, Any]]:
"""Leave-one-out EFAST for each cloud-free S2 date; return per-date records."""
s2_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s2"
gcc_s3_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "gcc_s3"
s3_rgb_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s3_rgb"
pc_lookup = _phenocam_lookup(phenocam)
s2_truth = {p["date"]: p["gcc"] for p in s2_series}
fusion_kwargs = dict(
ratio=RESOLUTION_RATIO,
max_days=MAX_DAYS,
minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
)
records: list[dict[str, Any]] = []
dates = [p["date"] for p in s2_series]
with tempfile.TemporaryDirectory(prefix=f"loocv_{site}_") as tmp_root:
tmp = Path(tmp_root)
hr_dir = tmp / "hr"
itb_out = tmp / "itb"
bti_out = tmp / "bti"
bti_gcc = tmp / "bti_gcc"
itb_out.mkdir()
bti_out.mkdir()
bti_gcc.mkdir()
for iso_date in tqdm(dates, desc=f"{site} LOOCV", leave=False):
yyyymmdd = _iso_to_yyyymmdd(iso_date)
truth = s2_truth.get(iso_date)
if truth is None:
continue
if hr_dir.exists():
shutil.rmtree(hr_dir)
_build_hr_symlink_dir(s2_dir, yyyymmdd, hr_dir)
pred_date = datetime.strptime(yyyymmdd, "%Y%m%d")
for f in itb_out.glob("*.tif"):
f.unlink()
for f in bti_out.glob("*.tif"):
f.unlink()
for f in bti_gcc.glob("*.tif"):
f.unlink()
efast.fusion(
pred_date,
gcc_s3_dir,
hr_dir,
itb_out,
product="GCC",
**fusion_kwargs,
)
efast.fusion(
pred_date,
s3_rgb_dir,
hr_dir,
bti_out,
product="REFL",
**fusion_kwargs,
)
itb_path = itb_out / f"GCC_{yyyymmdd}.tif"
refl_path = bti_out / f"REFL_{yyyymmdd}.tif"
bti_path = bti_gcc / f"GCC_{yyyymmdd}.tif"
pred_itb = (
_read_center_pixel(itb_path, lat, lon) if itb_path.is_file() else None
)
pred_bti = None
if refl_path.is_file():
_gcc_from_refl_file(refl_path, bti_path)
if bti_path.is_file():
pred_bti = _read_center_pixel(bti_path, lat, lon)
pc_val = _nearest_phenocam(iso_date, pc_lookup)
records.append(
{
"date": iso_date,
"s2_truth": truth,
"pred_bti": pred_bti,
"pred_itb": pred_itb,
"phenocam": pc_val,
}
)
return records
def _method_accuracy(records: list[dict], pred_key: str, ref_key: str) -> dict | None:
obs, sim = [], []
for rec in records:
pred = rec.get(pred_key)
ref = rec.get(ref_key)
if pred is None or ref is None:
continue
obs.append(ref)
sim.append(pred)
return _accuracy_metrics(obs, sim)
def _winner(rmse_bti: float | None, rmse_itb: float | None) -> str | None:
if rmse_bti is None or rmse_itb is None:
return None
if rmse_bti < rmse_itb:
return "bti"
if rmse_itb < rmse_bti:
return "itb"
return "tie"
def summarize_loocv(records: list[dict]) -> dict[str, Any]:
bti_vs_s2 = _method_accuracy(records, "pred_bti", "s2_truth")
itb_vs_s2 = _method_accuracy(records, "pred_itb", "s2_truth")
bti_vs_pc = _method_accuracy(records, "pred_bti", "phenocam")
itb_vs_pc = _method_accuracy(records, "pred_itb", "phenocam")
winner_s2 = _winner(
bti_vs_s2["rmse"] if bti_vs_s2 else None,
itb_vs_s2["rmse"] if itb_vs_s2 else None,
)
winner_pc = _winner(
bti_vs_pc["rmse"] if bti_vs_pc else None,
itb_vs_pc["rmse"] if itb_vs_pc else None,
)
agreement = (
winner_s2 == winner_pc
if winner_s2 and winner_pc and winner_s2 != "tie" and winner_pc != "tie"
else None
)
return {
"n_dates": len(records),
"bti": {"vs_s2": bti_vs_s2, "vs_phenocam": bti_vs_pc},
"itb": {"vs_s2": itb_vs_s2, "vs_phenocam": itb_vs_pc},
"winner_s2": winner_s2,
"winner_phenocam": winner_pc,
"winner_agreement": agreement,
}
# ---------------------------------------------------------------------------
# Aggregate concordance
# ---------------------------------------------------------------------------
def _pooled_concordance(
all_records: list[dict[str, Any]],
) -> dict[str, Any]:
"""Pooled metrics across all held-out dates."""
residual_pairs: list[tuple[float, float]] = []
vec_s2: list[float] = []
vec_pc: list[float] = []
for site_data in all_records:
for rec in site_data.get("records", []):
truth = rec.get("s2_truth")
pc = rec.get("phenocam")
for key in ("pred_bti", "pred_itb"):
pred = rec.get(key)
if pred is None or truth is None:
continue
err_s2 = abs(pred - truth)
if pc is not None:
err_pc = abs(pred - pc)
vec_s2.append(err_s2)
vec_pc.append(err_pc)
residual_pairs.append((err_s2, err_pc))
pooled_spearman = None
if len(vec_s2) >= 3:
sp, _ = spearmanr(vec_s2, vec_pc)
if not np.isnan(sp):
pooled_spearman = _r4(float(sp))
residual_corr = None
if len(residual_pairs) >= 3:
xs = np.array([p[0] for p in residual_pairs])
ys = np.array([p[1] for p in residual_pairs])
rc, _ = pearsonr(xs, ys)
residual_corr = _r4(float(rc))
agreements = [
s.get("winner_agreement")
for s in all_records
if s.get("eligible") and s.get("winner_agreement") is not None
]
winner_agreement_rate = (
_r4(sum(1 for a in agreements if a) / len(agreements)) if agreements else None
)
n_loocv_dates = sum(len(s.get("records", [])) for s in all_records)
return {
"pooled_spearman": pooled_spearman,
"residual_corr": residual_corr,
"winner_agreement_rate": winner_agreement_rate,
"n_loocv_dates": n_loocv_dates,
}
def _suitability_verdict(
n_repr_pass: int,
n_eligible: int,
n_total: int,
pooled: dict[str, Any],
) -> str:
if n_eligible == 0:
return "insufficient data"
repr_rate = n_repr_pass / n_total if n_total else 0
agree = pooled.get("winner_agreement_rate")
sp = pooled.get("pooled_spearman")
rc = pooled.get("residual_corr")
strong = 0
if repr_rate >= 0.6:
strong += 1
if agree is not None and agree >= 0.7:
strong += 1
if sp is not None and sp >= 0.8:
strong += 1
if rc is not None and rc >= 0.5:
strong += 1
if strong >= 3:
return "suitable"
if strong >= 1 or repr_rate >= 0.4:
return "partially suitable"
return "not suitable"
# ---------------------------------------------------------------------------
# Per-site processing
# ---------------------------------------------------------------------------
def process_site(
site: str,
year: int,
lat: float,
lon: float,
min_cloudfree: int,
efast,
) -> dict[str, Any]:
metrics_dir = DATA_DIR / "metrics" / str(year) / site
phenocam = _load_json_series(metrics_dir / "gcc_phenocam.json")
s2_series = _load_json_series(metrics_dir / "gcc_s2.json")
repr_metrics = compute_representativeness(phenocam, s2_series)
n_cloudfree = len(s2_series)
eligible = n_cloudfree >= min_cloudfree
result: dict[str, Any] = {
"eligible": eligible,
"n_cloudfree_s2": n_cloudfree,
"representativeness": repr_metrics,
"loocv": None,
"winner_s2": None,
"winner_phenocam": None,
"winner_agreement": None,
"records": [],
}
if not eligible:
return result
records = run_loocv_site(site, year, lat, lon, s2_series, phenocam, efast)
loocv = summarize_loocv(records)
result["loocv"] = loocv
result["winner_s2"] = loocv["winner_s2"]
result["winner_phenocam"] = loocv["winner_phenocam"]
result["winner_agreement"] = loocv["winner_agreement"]
result["records"] = records
return result
# ---------------------------------------------------------------------------
# Output / summary
# ---------------------------------------------------------------------------
def _compact_site_payload(site_result: dict[str, Any]) -> dict[str, Any]:
"""Drop raw LOOCV records from JSON output (keep summaries only)."""
out = {
"eligible": site_result["eligible"],
"n_cloudfree_s2": site_result["n_cloudfree_s2"],
"representativeness": site_result["representativeness"],
"winner_s2": site_result.get("winner_s2"),
"winner_phenocam": site_result.get("winner_phenocam"),
"winner_agreement": site_result.get("winner_agreement"),
}
if site_result.get("loocv"):
out["loocv"] = site_result["loocv"]
return out
def _print_summary(payload: dict[str, Any]) -> None:
year = payload["year"]
agg = payload["aggregate"]
print(
f"\nPhenoCam GCC suitability — {year} "
f"({payload['n_sites_total']} site(s), "
f"{payload['n_sites_eligible']} LOOCV-eligible, "
f"{payload['n_sites_repr_pass']} representative)"
)
print(f"Verdict: {agg['suitability_verdict']}")
print(
f" pooled Spearman (method errors): {agg.get('pooled_spearman', '')} "
f"residual corr: {agg.get('residual_corr', '')} "
f"winner agreement: {agg.get('winner_agreement_rate', '')} "
f"LOOCV dates: {agg.get('n_loocv_dates', '')}"
)
print(f"\n{'site':<28} {'repr r':>8} {'pass':>5} {'LOOCV n':>8} {'win agree':>10}")
print("-" * 65)
for site, data in sorted(payload["sites"].items()):
rep = data["representativeness"]
loocv_n = data.get("loocv", {}).get("n_dates") if data.get("loocv") else ""
agree = data.get("winner_agreement")
agree_s = "yes" if agree else ("no" if agree is False else "")
pass_s = "yes" if rep.get("representative") else "no"
print(
f"{site:<28} {rep.get('r') or '':>8} {pass_s:>5} "
f"{loocv_n!s:>8} {agree_s:>10}"
)
if payload["n_sites_total"] < SMALL_SAMPLE_SITES:
print(
f"\nNote: only {payload['n_sites_total']} site(s); "
"interpret cross-site aggregates cautiously."
)
if payload.get("dropped_sites"):
print(f"Dropped/ineligible: {', '.join(payload['dropped_sites'])}")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
parser.add_argument(
"--min-cloudfree-s2",
type=int,
default=MIN_CLOUDFREE_S2,
help="Minimum cloud-free S2 dates for LOOCV (default 10)",
)
parser.add_argument(
"--alpha",
type=float,
default=DEFAULT_ALPHA,
help="Significance threshold (reserved; default 0.05)",
)
args = parser.parse_args()
year = args.evaluation_year
min_cloudfree = args.min_cloudfree_s2
sites = _discover_sites(year)
if not sites:
raise SystemExit(
f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
)
coords = _load_site_coords(year)
efast = _import_efast()
site_results: dict[str, dict[str, Any]] = {}
dropped: list[str] = []
for site in tqdm(sites, desc="Sites"):
if site not in coords:
dropped.append(site)
continue
lat, lon = coords[site]
site_results[site] = process_site(site, year, lat, lon, min_cloudfree, efast)
if not site_results[site]["eligible"]:
dropped.append(site)
n_eligible = sum(1 for s in site_results.values() if s["eligible"])
n_repr_pass = sum(
1
for s in site_results.values()
if s["representativeness"].get("representative")
)
pooled = _pooled_concordance(list(site_results.values()))
verdict = _suitability_verdict(n_repr_pass, n_eligible, len(sites), pooled)
payload = {
"year": year,
"alpha": args.alpha,
"repr_r_threshold": REPR_R_THRESHOLD,
"min_cloudfree_s2": min_cloudfree,
"n_sites_total": len(sites),
"n_sites_eligible": n_eligible,
"n_sites_repr_pass": n_repr_pass,
"aggregate": {
"suitability_verdict": verdict,
**pooled,
},
"sites": {
site: _compact_site_payload(data)
for site, data in sorted(site_results.items())
},
"dropped_sites": sorted(set(dropped)),
}
out_dir = DATA_DIR / "gcc_suitability"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / f"{year}.json"
out_path.write_text(json.dumps(payload, separators=(",", ":")))
_print_summary(payload)
print(f"\nWritten → {out_path}")
if __name__ == "__main__":
main()

View file

@ -14,6 +14,7 @@ End-to-end pipeline from selecting sites from the global [PhenoCam Network](http
| 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site | | 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site |
| 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest | | 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest |
| 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites | | 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites |
| 7 | `7-gcc-suitability.py` | PhenoCam GCC suitability as a fusion-accuracy reference (representativeness + LOOCV concordance) |
--- ---
@ -43,9 +44,10 @@ uv run python 3-sentinel-data.py --evaluation-year 2025
uv run python 4-fusion.py --evaluation-year 2025 uv run python 4-fusion.py --evaluation-year 2025
uv run python 5-metrics.py --evaluation-year 2025 uv run python 5-metrics.py --evaluation-year 2025
uv run python 6-statistics-fusion-order.py --evaluation-year 2025 uv run python 6-statistics-fusion-order.py --evaluation-year 2025
uv run python 7-gcc-suitability.py --evaluation-year 2025
``` ```
Steps 15 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Step 6 is a full-sample aggregate and only accepts `--evaluation-year` and `--alpha` (default `0.05`). Steps 35 are resumable — existing output files are skipped. Steps 15 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Steps 67 are full-sample aggregates and only accept `--evaluation-year` (Step 6 and 7 also accept `--alpha`; Step 7 adds `--min-cloudfree-s2`, default `10`). Steps 35 are resumable — existing output files are skipped.
```bash ```bash
# single site # single site
@ -73,6 +75,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an
| `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON | | `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON |
| `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) | | `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) |
| `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) | | `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) |
| `gcc_suitability/{year}.json` | 7 | PhenoCam GCC suitability summary (representativeness + LOOCV concordance) |
--- ---
@ -80,7 +83,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an
`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`). `python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`). The Statistics overlay GCC suitability tab uses step 7 output (`data/gcc_suitability/{year}.json`).
--- ---

View file

@ -181,6 +181,13 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
.stat-badge.bti { background: #e3f2fd; color: #0d47a1; } .stat-badge.bti { background: #e3f2fd; color: #0d47a1; }
.stat-badge.none { background: #f5f5f5; color: #777; font-weight: 400; } .stat-badge.none { background: #f5f5f5; color: #777; font-weight: 400; }
.stat-badge.insuf { background: #fce4ec; color: #b71c1c; font-weight: 400; } .stat-badge.insuf { background: #fce4ec; color: #b71c1c; font-weight: 400; }
.stat-badge.pass { background: #e8f5e9; color: #1b5e20; }
.stat-badge.partial { background: #fff3e0; color: #e65100; font-weight: 400; }
.stat-badge.fail { background: #fce4ec; color: #b71c1c; font-weight: 400; }
.stat-site-table { width: 100%; border-collapse: collapse; font-size: 12px; margin-top: 8px; }
.stat-site-table th, .stat-site-table td { padding: 4px 8px; text-align: left; border-bottom: 1px solid #f0f0f0; }
.stat-site-table th { color: #888; font-weight: 500; }
.stat-site-table .sval { font-variant-numeric: tabular-nums; }
.stat-row-table { width: 100%; border-collapse: collapse; font-size: 12px; } .stat-row-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.stat-row-table td { padding: 3px 0; vertical-align: top; } .stat-row-table td { padding: 3px 0; vertical-align: top; }
.stat-row-table .slabel { color: #888; width: 46%; } .stat-row-table .slabel { color: #888; width: 46%; }
@ -243,12 +250,7 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
<span class="overlay-meta" id="statsMeta"></span> <span class="overlay-meta" id="statsMeta"></span>
<button type="button" class="overlay-close" id="statsClose">Close</button> <button type="button" class="overlay-close" id="statsClose">Close</button>
</div> </div>
<div id="statsTabGcc" class="stats-tab-panel" style="display:none"> <div id="statsTabGcc" class="stats-tab-panel" style="display:none"></div>
<div class="stat-placeholder">
<p>GCC suitability</p>
Coming soon.
</div>
</div>
<div id="statsTabComparison" class="stats-tab-panel"></div> <div id="statsTabComparison" class="stats-tab-panel"></div>
<div id="statsTabSites" class="stats-tab-panel" style="display:none"> <div id="statsTabSites" class="stats-tab-panel" style="display:none">
<div class="stat-placeholder"> <div class="stat-placeholder">
@ -352,6 +354,8 @@ let statsOverlayOpen = false;
let statsTab = "comparison"; let statsTab = "comparison";
let statsData = null; let statsData = null;
let statsYear = null; let statsYear = null;
let gccSuitabilityData = null;
let gccSuitabilityYear = null;
const maps3 = {}; // { s2, fusion, s3 } Leaflet instances const maps3 = {}; // { s2, fusion, s3 } Leaflet instances
const overlays3 = {}; // current ImageOverlay per map const overlays3 = {}; // current ImageOverlay per map
const markers3 = {}; // site dot markers per map const markers3 = {}; // site dot markers per map
@ -620,15 +624,23 @@ function betterOrderLabel(order) {
} }
function updateStatsMeta() { function updateStatsMeta() {
if (statsTab !== "comparison" || !statsData) { if (statsTab === "comparison" && statsData) {
qs("#statsMeta").textContent = `${currentYear}`;
return;
}
const nPairs = statsData.metrics?.nse?.n_pairs; const nPairs = statsData.metrics?.nse?.n_pairs;
const alpha = statsData.alpha ?? 0.05; const alpha = statsData.alpha ?? 0.05;
qs("#statsMeta").textContent = nPairs != null qs("#statsMeta").textContent = nPairs != null
? `${nPairs} paired site${nPairs === 1 ? "" : "s"} · α=${alpha} · ${currentYear}` ? `${nPairs} paired site${nPairs === 1 ? "" : "s"} · α=${alpha} · ${currentYear}`
: `${currentYear}`; : `${currentYear}`;
return;
}
if (statsTab === "gcc" && gccSuitabilityData) {
const nEl = gccSuitabilityData.n_sites_eligible;
const nRep = gccSuitabilityData.n_sites_repr_pass;
qs("#statsMeta").textContent = nEl != null
? `${nEl} eligible · ${nRep} representative · ${currentYear}`
: `${currentYear}`;
return;
}
qs("#statsMeta").textContent = `${currentYear}`;
} }
function escHtml(s) { function escHtml(s) {
@ -720,6 +732,130 @@ async function loadStatsPanel() {
} }
} }
function suitabilityVerdictLabel(v) {
if (v === "suitable") return "Suitable";
if (v === "partially suitable") return "Partially suitable";
if (v === "not suitable") return "Not suitable";
if (v === "insufficient data") return "Insufficient data";
return v || "—";
}
function suitabilityBadgeClass(v) {
if (v === "suitable") return "pass";
if (v === "partially suitable") return "partial";
if (v === "not suitable") return "fail";
return "insuf";
}
function yesNo(v) {
if (v === true) return "yes";
if (v === false) return "no";
return "—";
}
function winnerLabel(w) {
if (w === "bti") return "BtI";
if (w === "itb") return "ItB";
if (w === "tie") return "tie";
return "—";
}
function gccDroppedSitesList(data) {
const dropped = data.dropped_sites || [];
if (!dropped.length) return "";
const links = dropped.map(site =>
`<button type="button" class="stat-site-link" data-site="${escHtml(site)}">${escHtml(site)}</button>`
).join("");
return `<div class="stat-dropped">
<h4>Excluded sites (${dropped.length})</h4>
<p class="stat-dropped-note">Missing coordinates or fewer cloud-free S2 dates than the LOOCV minimum.</p>
<div class="stat-dropped-list">${links}</div>
</div>`;
}
function renderGccSuitabilityPanel(data) {
const panel = qs("#statsTabGcc");
const agg = data.aggregate || {};
const threshold = data.repr_r_threshold ?? 0.7;
const verdict = agg.suitability_verdict;
const badge = `<span class="stat-badge ${suitabilityBadgeClass(verdict)}">${suitabilityVerdictLabel(verdict)}</span>`;
const row = (label, val) =>
`<tr><td class="slabel">${label}</td><td class="sval">${val}</td></tr>`;
const siteRows = Object.entries(data.sites || {}).sort(([a], [b]) => a.localeCompare(b)).map(([site, s]) => {
const rep = s.representative ?? s.representativeness?.representative;
const repBadge = rep
? '<span class="stat-badge pass">pass</span>'
: '<span class="stat-badge fail">fail</span>';
const loocv = s.loocv || {};
return `<tr>
<td><button type="button" class="stat-site-link" data-site="${escHtml(site)}">${escHtml(site)}</button></td>
<td class="sval">${fmtStat(s.representativeness?.r)}</td>
<td>${repBadge}</td>
<td class="sval">${loocv.n_dates ?? "—"}</td>
<td class="sval">${winnerLabel(s.winner_s2)} / ${winnerLabel(s.winner_phenocam)}</td>
<td class="sval">${yesNo(s.winner_agreement)}</td>
</tr>`;
}).join("");
const aggregateCard = `<div class="stat-card">
<h3>Aggregate verdict ${badge}</h3>
<table class="stat-row-table">
${row("Pooled Spearman (errors)", fmtStat(agg.pooled_spearman))}
${row("Residual correlation", fmtStat(agg.residual_corr))}
${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
${row("LOOCV dates (pooled)", agg.n_loocv_dates ?? "—")}
${row("Representative sites", `${data.n_sites_repr_pass ?? "—"} / ${data.n_sites_total ?? "—"}`)}
${row("LOOCV-eligible sites", `${data.n_sites_eligible ?? "—"} / ${data.n_sites_total ?? "—"}`)}
</table>
</div>`;
const reprCard = `<div class="stat-card">
<h3>Line A — PhenoCam vs S2 representativeness</h3>
<p style="font-size:11px;color:#999;margin:0 0 8px">Pass when Pearson r ≥ ${threshold} (oblique footprint tracks co-located S2 GCC).</p>
<table class="stat-site-table">
<thead><tr>
<th>Site</th><th>r</th><th>Pass</th><th>LOOCV n</th><th>Winner S2 / PC</th><th>Agree</th>
</tr></thead>
<tbody>${siteRows || '<tr><td colspan="6">No sites</td></tr>'}</tbody>
</table>
</div>`;
const concordanceCard = `<div class="stat-card">
<h3>Line B — LOOCV concordance</h3>
<p style="font-size:11px;color:#999;margin:0 0 8px">Same EFAST predictions scored against held-out S2 truth vs PhenoCam. Winner agreement and error correlations test whether PhenoCam ranks fusion methods like the satellite-internal reference.</p>
<table class="stat-row-table">
${row("Min cloud-free S2 gate", data.min_cloudfree_s2 ?? "—")}
${row("Pooled LOOCV dates", agg.n_loocv_dates ?? "—")}
${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
</table>
</div>`;
panel.innerHTML =
`<div class="stat-summary">Is PhenoCam GCC a valid reference for ranking fusion accuracy? · ${data.n_sites_total ?? "—"} site(s) · r threshold ${threshold}</div>` +
`<div class="stat-grid">${aggregateCard}${reprCard}${concordanceCard}</div>` +
gccDroppedSitesList(data);
updateStatsMeta();
}
async function loadGccSuitabilityPanel() {
const panel = qs("#statsTabGcc");
panel.innerHTML = '<div class="stat-nodata">Loading…</div>';
try {
const data = await fetch(`data/gcc_suitability/${currentYear}.json`)
.then(r => { if (!r.ok) throw new Error(); return r.json(); });
gccSuitabilityData = data;
gccSuitabilityYear = currentYear;
renderGccSuitabilityPanel(data);
} catch {
gccSuitabilityData = null;
gccSuitabilityYear = null;
panel.innerHTML =
'<div class="stat-nodata">No GCC suitability file found — run 7-gcc-suitability.py first.</div>';
updateStatsMeta();
}
}
const STATS_TAB_PANELS = { const STATS_TAB_PANELS = {
comparison: "#statsTabComparison", comparison: "#statsTabComparison",
gcc: "#statsTabGcc", gcc: "#statsTabGcc",
@ -736,6 +872,9 @@ function switchStatsTab(tab, updateHash = true) {
if (tab === "comparison") { if (tab === "comparison") {
if (statsYear !== currentYear || !statsData) loadStatsPanel(); if (statsYear !== currentYear || !statsData) loadStatsPanel();
else updateStatsMeta(); else updateStatsMeta();
} else if (tab === "gcc") {
if (gccSuitabilityYear !== currentYear || !gccSuitabilityData) loadGccSuitabilityPanel();
else updateStatsMeta();
} else { } else {
updateStatsMeta(); updateStatsMeta();
} }
@ -783,9 +922,12 @@ async function init() {
currentYear = +yearSel.value; currentYear = +yearSel.value;
statsData = null; statsData = null;
statsYear = null; statsYear = null;
gccSuitabilityData = null;
gccSuitabilityYear = null;
buildSiteList(); buildSiteList();
if (worldOverlayOpen) buildWorldMap(); if (worldOverlayOpen) buildWorldMap();
if (statsOverlayOpen && statsTab === "comparison") loadStatsPanel(); if (statsOverlayOpen && statsTab === "comparison") loadStatsPanel();
if (statsOverlayOpen && statsTab === "gcc") loadGccSuitabilityPanel();
}); });
qs("#worldMapBtn").addEventListener("click", () => openWorldOverlay()); qs("#worldMapBtn").addEventListener("click", () => openWorldOverlay());
@ -797,6 +939,11 @@ async function init() {
if (!link) return; if (!link) return;
pickSiteFromStats(link.dataset.site); pickSiteFromStats(link.dataset.site);
}); });
qs("#statsTabGcc").addEventListener("click", e => {
const link = e.target.closest(".stat-site-link");
if (!link) return;
pickSiteFromStats(link.dataset.site);
});
qs("#worldClose").addEventListener("click", () => closeWorldOverlay()); qs("#worldClose").addEventListener("click", () => closeWorldOverlay());
qs("#statsClose").addEventListener("click", () => closeStatsOverlay()); qs("#statsClose").addEventListener("click", () => closeStatsOverlay());
qs("#worldOverlay").addEventListener("click", e => { qs("#worldOverlay").addEventListener("click", e => {