Added gcc suitability stats.
This commit is contained in:
parent
d55ee31e8d
commit
a593683314
3 changed files with 907 additions and 15 deletions
742
7-gcc-suitability.py
Normal file
742
7-gcc-suitability.py
Normal file
|
|
@ -0,0 +1,742 @@
|
|||
"""Step 7: PhenoCam GCC suitability as a fusion-accuracy reference.
|
||||
|
||||
Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
|
||||
|
||||
- ``metrics/{year}/{site}/gcc_s2.json``, ``gcc_phenocam.json`` — Step 5 timeseries
|
||||
- ``metrics/manifest.json`` — site lat/lon
|
||||
- ``sentinel_data/{year}/{site}/prepared/s2/`` — S2 REFL/GCC/DIST_CLOUD (Steps 3–4)
|
||||
- ``sentinel_data/{year}/{site}/prepared/gcc_s3/``, ``prepared/s3_rgb/`` — Step 4
|
||||
|
||||
Outputs (``data/gcc_suitability/``):
|
||||
|
||||
- ``{year}.json`` — representativeness (Line A), LOOCV concordance (Line B),
|
||||
per-site and aggregate suitability verdict
|
||||
|
||||
CLI:
|
||||
|
||||
- ``--evaluation-year`` (default 2025)
|
||||
- ``--min-cloudfree-s2`` (default 10) — minimum cloud-free S2 dates for LOOCV
|
||||
- ``--alpha`` (default 0.05) — reserved for future significance tests
|
||||
|
||||
Full-sample aggregate; does not accept ``--site``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import rasterio
|
||||
from pyproj import Transformer
|
||||
from rasterio.crs import CRS
|
||||
from rasterio.transform import rowcol
|
||||
from scipy.stats import linregress, pearsonr, spearmanr
|
||||
from tqdm import tqdm
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DATA_DIR = Path("data")
|
||||
DEFAULT_YEAR = 2025
|
||||
DEFAULT_ALPHA = 0.05
|
||||
MIN_CLOUDFREE_S2 = 10
|
||||
REPR_R_THRESHOLD = 0.7
|
||||
MATCH_TOLERANCE_DAYS = 5
|
||||
|
||||
RESOLUTION_RATIO = 30
|
||||
MAX_DAYS = 100
|
||||
MINIMUM_ACQUISITION_IMPORTANCE = 0
|
||||
|
||||
SMALL_SAMPLE_SITES = 6
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# efast import
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _import_efast():
|
||||
try:
|
||||
import efast.efast as efast_module
|
||||
|
||||
return efast_module
|
||||
except ImportError as exc:
|
||||
raise ImportError("efast not found. Install with: uv sync") from exc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _r4(v: float | None) -> float | None:
|
||||
return round(v, 4) if v is not None else None
|
||||
|
||||
|
||||
def _window_mean(data: np.ndarray) -> float | None:
|
||||
valid = data[~np.isnan(data)]
|
||||
if valid.size == 0:
|
||||
return None
|
||||
return float(np.mean(valid))
|
||||
|
||||
|
||||
def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
|
||||
try:
|
||||
with rasterio.open(path) as src:
|
||||
transformer = Transformer.from_crs(
|
||||
CRS.from_epsg(4326), src.crs, always_xy=True
|
||||
)
|
||||
x, y = transformer.transform(lon, lat)
|
||||
row, col = rowcol(src.transform, x, y)
|
||||
h, w = src.height, src.width
|
||||
r0, r1 = max(0, row - 1), min(h, row + 2)
|
||||
c0, c1 = max(0, col - 1), min(w, col + 2)
|
||||
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
|
||||
data = src.read(1, window=window).astype(float)
|
||||
nodata = src.nodata
|
||||
if nodata is not None:
|
||||
data = np.where(data == nodata, np.nan, data)
|
||||
data[data == 0] = np.nan
|
||||
return _window_mean(data)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _date_from_s2_tif(path: Path) -> str | None:
|
||||
parts = path.stem.split("_")
|
||||
if len(parts) >= 3:
|
||||
m = re.match(r"(\d{8})", parts[2])
|
||||
return m.group(1) if m else None
|
||||
return None
|
||||
|
||||
|
||||
def _iso_to_yyyymmdd(iso: str) -> str:
|
||||
return iso.replace("-", "")
|
||||
|
||||
|
||||
def _yyyymmdd_to_iso(d: str) -> str:
|
||||
return f"{d[:4]}-{d[4:6]}-{d[6:]}"
|
||||
|
||||
|
||||
def _day_gap(a: str, b: str) -> float:
|
||||
return abs((np.datetime64(a) - np.datetime64(b)) / np.timedelta64(1, "D"))
|
||||
|
||||
|
||||
def _match_series(
|
||||
ref: list[dict],
|
||||
ref_key: str,
|
||||
pred: list[dict],
|
||||
pred_key: str,
|
||||
tolerance_days: int = MATCH_TOLERANCE_DAYS,
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Return paired (ref_vals, pred_vals, ref_dates) within tolerance."""
|
||||
ref_lookup: dict[str, float] = {
|
||||
p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
|
||||
}
|
||||
if not ref_lookup:
|
||||
return [], [], []
|
||||
|
||||
ref_dates = sorted(ref_lookup)
|
||||
obs, sim, matched_dates = [], [], []
|
||||
for pt in pred:
|
||||
v = pt.get(pred_key)
|
||||
if v is None:
|
||||
continue
|
||||
nearest = min(ref_dates, key=lambda d: _day_gap(pt["date"], d))
|
||||
if _day_gap(pt["date"], nearest) <= tolerance_days:
|
||||
obs.append(ref_lookup[nearest])
|
||||
sim.append(v)
|
||||
matched_dates.append(nearest)
|
||||
return obs, sim, matched_dates
|
||||
|
||||
|
||||
def _accuracy_metrics(obs: list[float], sim: list[float]) -> dict[str, Any] | None:
|
||||
if len(obs) < 2:
|
||||
return None
|
||||
obs_arr = np.array(obs, dtype=float)
|
||||
sim_arr = np.array(sim, dtype=float)
|
||||
diff = sim_arr - obs_arr
|
||||
rmse = float(np.sqrt(np.mean(diff**2)))
|
||||
mae = float(np.mean(np.abs(diff)))
|
||||
bias = float(np.mean(diff))
|
||||
r, _ = pearsonr(obs_arr, sim_arr)
|
||||
return {
|
||||
"n": len(obs),
|
||||
"rmse": _r4(rmse),
|
||||
"mae": _r4(mae),
|
||||
"bias": _r4(bias),
|
||||
"r": _r4(float(r)),
|
||||
}
|
||||
|
||||
|
||||
def _gcc_from_refl_file(refl_path: Path, gcc_path: Path) -> None:
|
||||
with rasterio.open(refl_path) as src:
|
||||
b, g, r = src.read(1), src.read(2), src.read(3)
|
||||
profile = src.profile
|
||||
total = b + g + r
|
||||
invalid = (b < 0) | (g < 0) | (r < 0)
|
||||
gcc = np.where(invalid, np.nan, g / (total + 1e-10))
|
||||
gcc[total == 0] = np.nan
|
||||
profile.update(count=1, dtype="float32")
|
||||
with rasterio.open(gcc_path, "w", **profile) as dst:
|
||||
dst.write(gcc[np.newaxis].astype("float32"))
|
||||
|
||||
|
||||
def _load_json_series(path: Path) -> list[dict]:
|
||||
if not path.is_file():
|
||||
return []
|
||||
return json.loads(path.read_text())
|
||||
|
||||
|
||||
def _load_site_coords(year: int) -> dict[str, tuple[float, float]]:
|
||||
manifest_path = DATA_DIR / "metrics" / "manifest.json"
|
||||
if not manifest_path.is_file():
|
||||
return {}
|
||||
manifest = json.loads(manifest_path.read_text())
|
||||
sites = manifest.get("sites", {}).get(str(year), {})
|
||||
coords: dict[str, tuple[float, float]] = {}
|
||||
for site, meta in sites.items():
|
||||
lat, lon = meta.get("lat"), meta.get("lon")
|
||||
if lat is not None and lon is not None:
|
||||
coords[site] = (float(lat), float(lon))
|
||||
return coords
|
||||
|
||||
|
||||
def _discover_sites(year: int) -> list[str]:
|
||||
metrics_dir = DATA_DIR / "metrics" / str(year)
|
||||
if not metrics_dir.is_dir():
|
||||
return []
|
||||
return sorted(
|
||||
d.name
|
||||
for d in metrics_dir.iterdir()
|
||||
if d.is_dir() and (d / "gcc_s2.json").is_file()
|
||||
)
|
||||
|
||||
|
||||
def _build_hr_symlink_dir(s2_dir: Path, holdout_yyyymmdd: str, dest: Path) -> None:
|
||||
"""Symlink all S2 hr inputs except the held-out acquisition date."""
|
||||
dest.mkdir(parents=True, exist_ok=True)
|
||||
for pattern in ("*_REFL.tif", "*_GCC.tif", "*_DIST_CLOUD.tif"):
|
||||
for src in sorted(s2_dir.glob(pattern)):
|
||||
date_token = (
|
||||
src.stem.split("_")[2][:8] if len(src.stem.split("_")) >= 3 else ""
|
||||
)
|
||||
if date_token == holdout_yyyymmdd:
|
||||
continue
|
||||
link = dest / src.name
|
||||
if link.exists() or link.is_symlink():
|
||||
link.unlink()
|
||||
link.symlink_to(src.resolve())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Line A — representativeness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_representativeness(phenocam: list[dict], s2: list[dict]) -> dict[str, Any]:
|
||||
"""PhenoCam gcc_90 vs co-located observed S2 GCC."""
|
||||
obs, sim, _ = _match_series(phenocam, "gcc_90", s2, "gcc")
|
||||
result: dict[str, Any] = {
|
||||
"n": len(obs),
|
||||
"r": None,
|
||||
"spearman": None,
|
||||
"slope": None,
|
||||
"intercept": None,
|
||||
"rmse": None,
|
||||
"bias": None,
|
||||
"peak_offset_days": None,
|
||||
"representative": False,
|
||||
}
|
||||
if len(obs) < 2:
|
||||
return result
|
||||
|
||||
obs_arr = np.array(obs, dtype=float)
|
||||
sim_arr = np.array(sim, dtype=float)
|
||||
r, _ = pearsonr(obs_arr, sim_arr)
|
||||
sp, _ = spearmanr(obs_arr, sim_arr)
|
||||
reg = linregress(sim_arr, obs_arr)
|
||||
diff = sim_arr - obs_arr
|
||||
result.update(
|
||||
{
|
||||
"r": _r4(float(r)),
|
||||
"spearman": _r4(float(sp)),
|
||||
"slope": _r4(float(reg.slope)),
|
||||
"intercept": _r4(float(reg.intercept)),
|
||||
"rmse": _r4(float(np.sqrt(np.mean(diff**2)))),
|
||||
"bias": _r4(float(np.mean(diff))),
|
||||
"representative": float(r) >= REPR_R_THRESHOLD,
|
||||
}
|
||||
)
|
||||
|
||||
pc_dates = [p["date"] for p in phenocam if p.get("gcc_90") is not None]
|
||||
s2_dates = [p["date"] for p in s2 if p.get("gcc") is not None]
|
||||
if pc_dates and s2_dates:
|
||||
pc_peak = max(
|
||||
phenocam,
|
||||
key=lambda p: p["gcc_90"] if p.get("gcc_90") is not None else -1,
|
||||
)["date"]
|
||||
s2_peak = max(s2, key=lambda p: p["gcc"] if p.get("gcc") is not None else -1)[
|
||||
"date"
|
||||
]
|
||||
result["peak_offset_days"] = int(_day_gap(pc_peak, s2_peak))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Line B — LOOCV
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _phenocam_lookup(phenocam: list[dict]) -> dict[str, float]:
|
||||
return {p["date"]: p["gcc_90"] for p in phenocam if p.get("gcc_90") is not None}
|
||||
|
||||
|
||||
def _nearest_phenocam(iso_date: str, lookup: dict[str, float]) -> float | None:
|
||||
if not lookup:
|
||||
return None
|
||||
dates = sorted(lookup)
|
||||
nearest = min(dates, key=lambda d: _day_gap(iso_date, d))
|
||||
if _day_gap(iso_date, nearest) <= MATCH_TOLERANCE_DAYS:
|
||||
return lookup[nearest]
|
||||
return None
|
||||
|
||||
|
||||
def run_loocv_site(
|
||||
site: str,
|
||||
year: int,
|
||||
lat: float,
|
||||
lon: float,
|
||||
s2_series: list[dict],
|
||||
phenocam: list[dict],
|
||||
efast,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Leave-one-out EFAST for each cloud-free S2 date; return per-date records."""
|
||||
s2_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s2"
|
||||
gcc_s3_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "gcc_s3"
|
||||
s3_rgb_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s3_rgb"
|
||||
|
||||
pc_lookup = _phenocam_lookup(phenocam)
|
||||
s2_truth = {p["date"]: p["gcc"] for p in s2_series}
|
||||
fusion_kwargs = dict(
|
||||
ratio=RESOLUTION_RATIO,
|
||||
max_days=MAX_DAYS,
|
||||
minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
|
||||
)
|
||||
|
||||
records: list[dict[str, Any]] = []
|
||||
dates = [p["date"] for p in s2_series]
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix=f"loocv_{site}_") as tmp_root:
|
||||
tmp = Path(tmp_root)
|
||||
hr_dir = tmp / "hr"
|
||||
itb_out = tmp / "itb"
|
||||
bti_out = tmp / "bti"
|
||||
bti_gcc = tmp / "bti_gcc"
|
||||
itb_out.mkdir()
|
||||
bti_out.mkdir()
|
||||
bti_gcc.mkdir()
|
||||
|
||||
for iso_date in tqdm(dates, desc=f"{site} LOOCV", leave=False):
|
||||
yyyymmdd = _iso_to_yyyymmdd(iso_date)
|
||||
truth = s2_truth.get(iso_date)
|
||||
if truth is None:
|
||||
continue
|
||||
|
||||
if hr_dir.exists():
|
||||
shutil.rmtree(hr_dir)
|
||||
_build_hr_symlink_dir(s2_dir, yyyymmdd, hr_dir)
|
||||
|
||||
pred_date = datetime.strptime(yyyymmdd, "%Y%m%d")
|
||||
|
||||
for f in itb_out.glob("*.tif"):
|
||||
f.unlink()
|
||||
for f in bti_out.glob("*.tif"):
|
||||
f.unlink()
|
||||
for f in bti_gcc.glob("*.tif"):
|
||||
f.unlink()
|
||||
|
||||
efast.fusion(
|
||||
pred_date,
|
||||
gcc_s3_dir,
|
||||
hr_dir,
|
||||
itb_out,
|
||||
product="GCC",
|
||||
**fusion_kwargs,
|
||||
)
|
||||
efast.fusion(
|
||||
pred_date,
|
||||
s3_rgb_dir,
|
||||
hr_dir,
|
||||
bti_out,
|
||||
product="REFL",
|
||||
**fusion_kwargs,
|
||||
)
|
||||
|
||||
itb_path = itb_out / f"GCC_{yyyymmdd}.tif"
|
||||
refl_path = bti_out / f"REFL_{yyyymmdd}.tif"
|
||||
bti_path = bti_gcc / f"GCC_{yyyymmdd}.tif"
|
||||
|
||||
pred_itb = (
|
||||
_read_center_pixel(itb_path, lat, lon) if itb_path.is_file() else None
|
||||
)
|
||||
pred_bti = None
|
||||
if refl_path.is_file():
|
||||
_gcc_from_refl_file(refl_path, bti_path)
|
||||
if bti_path.is_file():
|
||||
pred_bti = _read_center_pixel(bti_path, lat, lon)
|
||||
|
||||
pc_val = _nearest_phenocam(iso_date, pc_lookup)
|
||||
|
||||
records.append(
|
||||
{
|
||||
"date": iso_date,
|
||||
"s2_truth": truth,
|
||||
"pred_bti": pred_bti,
|
||||
"pred_itb": pred_itb,
|
||||
"phenocam": pc_val,
|
||||
}
|
||||
)
|
||||
|
||||
return records
|
||||
|
||||
|
||||
def _method_accuracy(records: list[dict], pred_key: str, ref_key: str) -> dict | None:
|
||||
obs, sim = [], []
|
||||
for rec in records:
|
||||
pred = rec.get(pred_key)
|
||||
ref = rec.get(ref_key)
|
||||
if pred is None or ref is None:
|
||||
continue
|
||||
obs.append(ref)
|
||||
sim.append(pred)
|
||||
return _accuracy_metrics(obs, sim)
|
||||
|
||||
|
||||
def _winner(rmse_bti: float | None, rmse_itb: float | None) -> str | None:
|
||||
if rmse_bti is None or rmse_itb is None:
|
||||
return None
|
||||
if rmse_bti < rmse_itb:
|
||||
return "bti"
|
||||
if rmse_itb < rmse_bti:
|
||||
return "itb"
|
||||
return "tie"
|
||||
|
||||
|
||||
def summarize_loocv(records: list[dict]) -> dict[str, Any]:
|
||||
bti_vs_s2 = _method_accuracy(records, "pred_bti", "s2_truth")
|
||||
itb_vs_s2 = _method_accuracy(records, "pred_itb", "s2_truth")
|
||||
bti_vs_pc = _method_accuracy(records, "pred_bti", "phenocam")
|
||||
itb_vs_pc = _method_accuracy(records, "pred_itb", "phenocam")
|
||||
|
||||
winner_s2 = _winner(
|
||||
bti_vs_s2["rmse"] if bti_vs_s2 else None,
|
||||
itb_vs_s2["rmse"] if itb_vs_s2 else None,
|
||||
)
|
||||
winner_pc = _winner(
|
||||
bti_vs_pc["rmse"] if bti_vs_pc else None,
|
||||
itb_vs_pc["rmse"] if itb_vs_pc else None,
|
||||
)
|
||||
agreement = (
|
||||
winner_s2 == winner_pc
|
||||
if winner_s2 and winner_pc and winner_s2 != "tie" and winner_pc != "tie"
|
||||
else None
|
||||
)
|
||||
|
||||
return {
|
||||
"n_dates": len(records),
|
||||
"bti": {"vs_s2": bti_vs_s2, "vs_phenocam": bti_vs_pc},
|
||||
"itb": {"vs_s2": itb_vs_s2, "vs_phenocam": itb_vs_pc},
|
||||
"winner_s2": winner_s2,
|
||||
"winner_phenocam": winner_pc,
|
||||
"winner_agreement": agreement,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Aggregate concordance
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _pooled_concordance(
|
||||
all_records: list[dict[str, Any]],
|
||||
) -> dict[str, Any]:
|
||||
"""Pooled metrics across all held-out dates."""
|
||||
residual_pairs: list[tuple[float, float]] = []
|
||||
vec_s2: list[float] = []
|
||||
vec_pc: list[float] = []
|
||||
|
||||
for site_data in all_records:
|
||||
for rec in site_data.get("records", []):
|
||||
truth = rec.get("s2_truth")
|
||||
pc = rec.get("phenocam")
|
||||
for key in ("pred_bti", "pred_itb"):
|
||||
pred = rec.get(key)
|
||||
if pred is None or truth is None:
|
||||
continue
|
||||
err_s2 = abs(pred - truth)
|
||||
if pc is not None:
|
||||
err_pc = abs(pred - pc)
|
||||
vec_s2.append(err_s2)
|
||||
vec_pc.append(err_pc)
|
||||
residual_pairs.append((err_s2, err_pc))
|
||||
|
||||
pooled_spearman = None
|
||||
if len(vec_s2) >= 3:
|
||||
sp, _ = spearmanr(vec_s2, vec_pc)
|
||||
if not np.isnan(sp):
|
||||
pooled_spearman = _r4(float(sp))
|
||||
|
||||
residual_corr = None
|
||||
if len(residual_pairs) >= 3:
|
||||
xs = np.array([p[0] for p in residual_pairs])
|
||||
ys = np.array([p[1] for p in residual_pairs])
|
||||
rc, _ = pearsonr(xs, ys)
|
||||
residual_corr = _r4(float(rc))
|
||||
|
||||
agreements = [
|
||||
s.get("winner_agreement")
|
||||
for s in all_records
|
||||
if s.get("eligible") and s.get("winner_agreement") is not None
|
||||
]
|
||||
winner_agreement_rate = (
|
||||
_r4(sum(1 for a in agreements if a) / len(agreements)) if agreements else None
|
||||
)
|
||||
|
||||
n_loocv_dates = sum(len(s.get("records", [])) for s in all_records)
|
||||
|
||||
return {
|
||||
"pooled_spearman": pooled_spearman,
|
||||
"residual_corr": residual_corr,
|
||||
"winner_agreement_rate": winner_agreement_rate,
|
||||
"n_loocv_dates": n_loocv_dates,
|
||||
}
|
||||
|
||||
|
||||
def _suitability_verdict(
|
||||
n_repr_pass: int,
|
||||
n_eligible: int,
|
||||
n_total: int,
|
||||
pooled: dict[str, Any],
|
||||
) -> str:
|
||||
if n_eligible == 0:
|
||||
return "insufficient data"
|
||||
repr_rate = n_repr_pass / n_total if n_total else 0
|
||||
agree = pooled.get("winner_agreement_rate")
|
||||
sp = pooled.get("pooled_spearman")
|
||||
rc = pooled.get("residual_corr")
|
||||
|
||||
strong = 0
|
||||
if repr_rate >= 0.6:
|
||||
strong += 1
|
||||
if agree is not None and agree >= 0.7:
|
||||
strong += 1
|
||||
if sp is not None and sp >= 0.8:
|
||||
strong += 1
|
||||
if rc is not None and rc >= 0.5:
|
||||
strong += 1
|
||||
|
||||
if strong >= 3:
|
||||
return "suitable"
|
||||
if strong >= 1 or repr_rate >= 0.4:
|
||||
return "partially suitable"
|
||||
return "not suitable"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-site processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_site(
|
||||
site: str,
|
||||
year: int,
|
||||
lat: float,
|
||||
lon: float,
|
||||
min_cloudfree: int,
|
||||
efast,
|
||||
) -> dict[str, Any]:
|
||||
metrics_dir = DATA_DIR / "metrics" / str(year) / site
|
||||
phenocam = _load_json_series(metrics_dir / "gcc_phenocam.json")
|
||||
s2_series = _load_json_series(metrics_dir / "gcc_s2.json")
|
||||
|
||||
repr_metrics = compute_representativeness(phenocam, s2_series)
|
||||
n_cloudfree = len(s2_series)
|
||||
eligible = n_cloudfree >= min_cloudfree
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"eligible": eligible,
|
||||
"n_cloudfree_s2": n_cloudfree,
|
||||
"representativeness": repr_metrics,
|
||||
"loocv": None,
|
||||
"winner_s2": None,
|
||||
"winner_phenocam": None,
|
||||
"winner_agreement": None,
|
||||
"records": [],
|
||||
}
|
||||
|
||||
if not eligible:
|
||||
return result
|
||||
|
||||
records = run_loocv_site(site, year, lat, lon, s2_series, phenocam, efast)
|
||||
loocv = summarize_loocv(records)
|
||||
result["loocv"] = loocv
|
||||
result["winner_s2"] = loocv["winner_s2"]
|
||||
result["winner_phenocam"] = loocv["winner_phenocam"]
|
||||
result["winner_agreement"] = loocv["winner_agreement"]
|
||||
result["records"] = records
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output / summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _compact_site_payload(site_result: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Drop raw LOOCV records from JSON output (keep summaries only)."""
|
||||
out = {
|
||||
"eligible": site_result["eligible"],
|
||||
"n_cloudfree_s2": site_result["n_cloudfree_s2"],
|
||||
"representativeness": site_result["representativeness"],
|
||||
"winner_s2": site_result.get("winner_s2"),
|
||||
"winner_phenocam": site_result.get("winner_phenocam"),
|
||||
"winner_agreement": site_result.get("winner_agreement"),
|
||||
}
|
||||
if site_result.get("loocv"):
|
||||
out["loocv"] = site_result["loocv"]
|
||||
return out
|
||||
|
||||
|
||||
def _print_summary(payload: dict[str, Any]) -> None:
|
||||
year = payload["year"]
|
||||
agg = payload["aggregate"]
|
||||
print(
|
||||
f"\nPhenoCam GCC suitability — {year} "
|
||||
f"({payload['n_sites_total']} site(s), "
|
||||
f"{payload['n_sites_eligible']} LOOCV-eligible, "
|
||||
f"{payload['n_sites_repr_pass']} representative)"
|
||||
)
|
||||
print(f"Verdict: {agg['suitability_verdict']}")
|
||||
print(
|
||||
f" pooled Spearman (method errors): {agg.get('pooled_spearman', '—')} "
|
||||
f"residual corr: {agg.get('residual_corr', '—')} "
|
||||
f"winner agreement: {agg.get('winner_agreement_rate', '—')} "
|
||||
f"LOOCV dates: {agg.get('n_loocv_dates', '—')}"
|
||||
)
|
||||
print(f"\n{'site':<28} {'repr r':>8} {'pass':>5} {'LOOCV n':>8} {'win agree':>10}")
|
||||
print("-" * 65)
|
||||
for site, data in sorted(payload["sites"].items()):
|
||||
rep = data["representativeness"]
|
||||
loocv_n = data.get("loocv", {}).get("n_dates") if data.get("loocv") else "—"
|
||||
agree = data.get("winner_agreement")
|
||||
agree_s = "yes" if agree else ("no" if agree is False else "—")
|
||||
pass_s = "yes" if rep.get("representative") else "no"
|
||||
print(
|
||||
f"{site:<28} {rep.get('r') or '—':>8} {pass_s:>5} "
|
||||
f"{loocv_n!s:>8} {agree_s:>10}"
|
||||
)
|
||||
if payload["n_sites_total"] < SMALL_SAMPLE_SITES:
|
||||
print(
|
||||
f"\nNote: only {payload['n_sites_total']} site(s); "
|
||||
"interpret cross-site aggregates cautiously."
|
||||
)
|
||||
if payload.get("dropped_sites"):
|
||||
print(f"Dropped/ineligible: {', '.join(payload['dropped_sites'])}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
|
||||
parser.add_argument(
|
||||
"--min-cloudfree-s2",
|
||||
type=int,
|
||||
default=MIN_CLOUDFREE_S2,
|
||||
help="Minimum cloud-free S2 dates for LOOCV (default 10)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alpha",
|
||||
type=float,
|
||||
default=DEFAULT_ALPHA,
|
||||
help="Significance threshold (reserved; default 0.05)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
year = args.evaluation_year
|
||||
min_cloudfree = args.min_cloudfree_s2
|
||||
|
||||
sites = _discover_sites(year)
|
||||
if not sites:
|
||||
raise SystemExit(
|
||||
f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
|
||||
)
|
||||
|
||||
coords = _load_site_coords(year)
|
||||
efast = _import_efast()
|
||||
|
||||
site_results: dict[str, dict[str, Any]] = {}
|
||||
dropped: list[str] = []
|
||||
|
||||
for site in tqdm(sites, desc="Sites"):
|
||||
if site not in coords:
|
||||
dropped.append(site)
|
||||
continue
|
||||
lat, lon = coords[site]
|
||||
site_results[site] = process_site(site, year, lat, lon, min_cloudfree, efast)
|
||||
if not site_results[site]["eligible"]:
|
||||
dropped.append(site)
|
||||
|
||||
n_eligible = sum(1 for s in site_results.values() if s["eligible"])
|
||||
n_repr_pass = sum(
|
||||
1
|
||||
for s in site_results.values()
|
||||
if s["representativeness"].get("representative")
|
||||
)
|
||||
|
||||
pooled = _pooled_concordance(list(site_results.values()))
|
||||
verdict = _suitability_verdict(n_repr_pass, n_eligible, len(sites), pooled)
|
||||
|
||||
payload = {
|
||||
"year": year,
|
||||
"alpha": args.alpha,
|
||||
"repr_r_threshold": REPR_R_THRESHOLD,
|
||||
"min_cloudfree_s2": min_cloudfree,
|
||||
"n_sites_total": len(sites),
|
||||
"n_sites_eligible": n_eligible,
|
||||
"n_sites_repr_pass": n_repr_pass,
|
||||
"aggregate": {
|
||||
"suitability_verdict": verdict,
|
||||
**pooled,
|
||||
},
|
||||
"sites": {
|
||||
site: _compact_site_payload(data)
|
||||
for site, data in sorted(site_results.items())
|
||||
},
|
||||
"dropped_sites": sorted(set(dropped)),
|
||||
}
|
||||
|
||||
out_dir = DATA_DIR / "gcc_suitability"
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
out_path = out_dir / f"{year}.json"
|
||||
out_path.write_text(json.dumps(payload, separators=(",", ":")))
|
||||
|
||||
_print_summary(payload)
|
||||
print(f"\nWritten → {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -14,6 +14,7 @@ End-to-end pipeline from selecting sites from the global [PhenoCam Network](http
|
|||
| 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site |
|
||||
| 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest |
|
||||
| 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites |
|
||||
| 7 | `7-gcc-suitability.py` | PhenoCam GCC suitability as a fusion-accuracy reference (representativeness + LOOCV concordance) |
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -43,9 +44,10 @@ uv run python 3-sentinel-data.py --evaluation-year 2025
|
|||
uv run python 4-fusion.py --evaluation-year 2025
|
||||
uv run python 5-metrics.py --evaluation-year 2025
|
||||
uv run python 6-statistics-fusion-order.py --evaluation-year 2025
|
||||
uv run python 7-gcc-suitability.py --evaluation-year 2025
|
||||
```
|
||||
|
||||
Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Step 6 is a full-sample aggregate and only accepts `--evaluation-year` and `--alpha` (default `0.05`). Steps 3–5 are resumable — existing output files are skipped.
|
||||
Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Steps 6–7 are full-sample aggregates and only accept `--evaluation-year` (Step 6 and 7 also accept `--alpha`; Step 7 adds `--min-cloudfree-s2`, default `10`). Steps 3–5 are resumable — existing output files are skipped.
|
||||
|
||||
```bash
|
||||
# single site
|
||||
|
|
@ -73,6 +75,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an
|
|||
| `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON |
|
||||
| `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) |
|
||||
| `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) |
|
||||
| `gcc_suitability/{year}.json` | 7 | PhenoCam GCC suitability summary (representativeness + LOOCV concordance) |
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -80,7 +83,7 @@ Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` an
|
|||
|
||||
|
||||
|
||||
`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`).
|
||||
`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`). The Statistics overlay GCC suitability tab uses step 7 output (`data/gcc_suitability/{year}.json`).
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
167
index.html
167
index.html
|
|
@ -181,6 +181,13 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
|
|||
.stat-badge.bti { background: #e3f2fd; color: #0d47a1; }
|
||||
.stat-badge.none { background: #f5f5f5; color: #777; font-weight: 400; }
|
||||
.stat-badge.insuf { background: #fce4ec; color: #b71c1c; font-weight: 400; }
|
||||
.stat-badge.pass { background: #e8f5e9; color: #1b5e20; }
|
||||
.stat-badge.partial { background: #fff3e0; color: #e65100; font-weight: 400; }
|
||||
.stat-badge.fail { background: #fce4ec; color: #b71c1c; font-weight: 400; }
|
||||
.stat-site-table { width: 100%; border-collapse: collapse; font-size: 12px; margin-top: 8px; }
|
||||
.stat-site-table th, .stat-site-table td { padding: 4px 8px; text-align: left; border-bottom: 1px solid #f0f0f0; }
|
||||
.stat-site-table th { color: #888; font-weight: 500; }
|
||||
.stat-site-table .sval { font-variant-numeric: tabular-nums; }
|
||||
.stat-row-table { width: 100%; border-collapse: collapse; font-size: 12px; }
|
||||
.stat-row-table td { padding: 3px 0; vertical-align: top; }
|
||||
.stat-row-table .slabel { color: #888; width: 46%; }
|
||||
|
|
@ -243,12 +250,7 @@ body { margin: 0; font: 13px/1.4 system-ui, sans-serif; background: #f5f5f5; col
|
|||
<span class="overlay-meta" id="statsMeta"></span>
|
||||
<button type="button" class="overlay-close" id="statsClose">Close</button>
|
||||
</div>
|
||||
<div id="statsTabGcc" class="stats-tab-panel" style="display:none">
|
||||
<div class="stat-placeholder">
|
||||
<p>GCC suitability</p>
|
||||
Coming soon.
|
||||
</div>
|
||||
</div>
|
||||
<div id="statsTabGcc" class="stats-tab-panel" style="display:none"></div>
|
||||
<div id="statsTabComparison" class="stats-tab-panel"></div>
|
||||
<div id="statsTabSites" class="stats-tab-panel" style="display:none">
|
||||
<div class="stat-placeholder">
|
||||
|
|
@ -352,6 +354,8 @@ let statsOverlayOpen = false;
|
|||
let statsTab = "comparison";
|
||||
let statsData = null;
|
||||
let statsYear = null;
|
||||
let gccSuitabilityData = null;
|
||||
let gccSuitabilityYear = null;
|
||||
const maps3 = {}; // { s2, fusion, s3 } Leaflet instances
|
||||
const overlays3 = {}; // current ImageOverlay per map
|
||||
const markers3 = {}; // site dot markers per map
|
||||
|
|
@ -620,15 +624,23 @@ function betterOrderLabel(order) {
|
|||
}
|
||||
|
||||
function updateStatsMeta() {
|
||||
if (statsTab !== "comparison" || !statsData) {
|
||||
qs("#statsMeta").textContent = `${currentYear}`;
|
||||
return;
|
||||
}
|
||||
if (statsTab === "comparison" && statsData) {
|
||||
const nPairs = statsData.metrics?.nse?.n_pairs;
|
||||
const alpha = statsData.alpha ?? 0.05;
|
||||
qs("#statsMeta").textContent = nPairs != null
|
||||
? `${nPairs} paired site${nPairs === 1 ? "" : "s"} · α=${alpha} · ${currentYear}`
|
||||
: `${currentYear}`;
|
||||
return;
|
||||
}
|
||||
if (statsTab === "gcc" && gccSuitabilityData) {
|
||||
const nEl = gccSuitabilityData.n_sites_eligible;
|
||||
const nRep = gccSuitabilityData.n_sites_repr_pass;
|
||||
qs("#statsMeta").textContent = nEl != null
|
||||
? `${nEl} eligible · ${nRep} representative · ${currentYear}`
|
||||
: `${currentYear}`;
|
||||
return;
|
||||
}
|
||||
qs("#statsMeta").textContent = `${currentYear}`;
|
||||
}
|
||||
|
||||
function escHtml(s) {
|
||||
|
|
@ -720,6 +732,130 @@ async function loadStatsPanel() {
|
|||
}
|
||||
}
|
||||
|
||||
function suitabilityVerdictLabel(v) {
|
||||
if (v === "suitable") return "Suitable";
|
||||
if (v === "partially suitable") return "Partially suitable";
|
||||
if (v === "not suitable") return "Not suitable";
|
||||
if (v === "insufficient data") return "Insufficient data";
|
||||
return v || "—";
|
||||
}
|
||||
|
||||
function suitabilityBadgeClass(v) {
|
||||
if (v === "suitable") return "pass";
|
||||
if (v === "partially suitable") return "partial";
|
||||
if (v === "not suitable") return "fail";
|
||||
return "insuf";
|
||||
}
|
||||
|
||||
function yesNo(v) {
|
||||
if (v === true) return "yes";
|
||||
if (v === false) return "no";
|
||||
return "—";
|
||||
}
|
||||
|
||||
function winnerLabel(w) {
|
||||
if (w === "bti") return "BtI";
|
||||
if (w === "itb") return "ItB";
|
||||
if (w === "tie") return "tie";
|
||||
return "—";
|
||||
}
|
||||
|
||||
function gccDroppedSitesList(data) {
|
||||
const dropped = data.dropped_sites || [];
|
||||
if (!dropped.length) return "";
|
||||
const links = dropped.map(site =>
|
||||
`<button type="button" class="stat-site-link" data-site="${escHtml(site)}">${escHtml(site)}</button>`
|
||||
).join("");
|
||||
return `<div class="stat-dropped">
|
||||
<h4>Excluded sites (${dropped.length})</h4>
|
||||
<p class="stat-dropped-note">Missing coordinates or fewer cloud-free S2 dates than the LOOCV minimum.</p>
|
||||
<div class="stat-dropped-list">${links}</div>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
function renderGccSuitabilityPanel(data) {
|
||||
const panel = qs("#statsTabGcc");
|
||||
const agg = data.aggregate || {};
|
||||
const threshold = data.repr_r_threshold ?? 0.7;
|
||||
const verdict = agg.suitability_verdict;
|
||||
const badge = `<span class="stat-badge ${suitabilityBadgeClass(verdict)}">${suitabilityVerdictLabel(verdict)}</span>`;
|
||||
const row = (label, val) =>
|
||||
`<tr><td class="slabel">${label}</td><td class="sval">${val}</td></tr>`;
|
||||
|
||||
const siteRows = Object.entries(data.sites || {}).sort(([a], [b]) => a.localeCompare(b)).map(([site, s]) => {
|
||||
const rep = s.representative ?? s.representativeness?.representative;
|
||||
const repBadge = rep
|
||||
? '<span class="stat-badge pass">pass</span>'
|
||||
: '<span class="stat-badge fail">fail</span>';
|
||||
const loocv = s.loocv || {};
|
||||
return `<tr>
|
||||
<td><button type="button" class="stat-site-link" data-site="${escHtml(site)}">${escHtml(site)}</button></td>
|
||||
<td class="sval">${fmtStat(s.representativeness?.r)}</td>
|
||||
<td>${repBadge}</td>
|
||||
<td class="sval">${loocv.n_dates ?? "—"}</td>
|
||||
<td class="sval">${winnerLabel(s.winner_s2)} / ${winnerLabel(s.winner_phenocam)}</td>
|
||||
<td class="sval">${yesNo(s.winner_agreement)}</td>
|
||||
</tr>`;
|
||||
}).join("");
|
||||
|
||||
const aggregateCard = `<div class="stat-card">
|
||||
<h3>Aggregate verdict ${badge}</h3>
|
||||
<table class="stat-row-table">
|
||||
${row("Pooled Spearman (errors)", fmtStat(agg.pooled_spearman))}
|
||||
${row("Residual correlation", fmtStat(agg.residual_corr))}
|
||||
${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
|
||||
${row("LOOCV dates (pooled)", agg.n_loocv_dates ?? "—")}
|
||||
${row("Representative sites", `${data.n_sites_repr_pass ?? "—"} / ${data.n_sites_total ?? "—"}`)}
|
||||
${row("LOOCV-eligible sites", `${data.n_sites_eligible ?? "—"} / ${data.n_sites_total ?? "—"}`)}
|
||||
</table>
|
||||
</div>`;
|
||||
|
||||
const reprCard = `<div class="stat-card">
|
||||
<h3>Line A — PhenoCam vs S2 representativeness</h3>
|
||||
<p style="font-size:11px;color:#999;margin:0 0 8px">Pass when Pearson r ≥ ${threshold} (oblique footprint tracks co-located S2 GCC).</p>
|
||||
<table class="stat-site-table">
|
||||
<thead><tr>
|
||||
<th>Site</th><th>r</th><th>Pass</th><th>LOOCV n</th><th>Winner S2 / PC</th><th>Agree</th>
|
||||
</tr></thead>
|
||||
<tbody>${siteRows || '<tr><td colspan="6">No sites</td></tr>'}</tbody>
|
||||
</table>
|
||||
</div>`;
|
||||
|
||||
const concordanceCard = `<div class="stat-card">
|
||||
<h3>Line B — LOOCV concordance</h3>
|
||||
<p style="font-size:11px;color:#999;margin:0 0 8px">Same EFAST predictions scored against held-out S2 truth vs PhenoCam. Winner agreement and error correlations test whether PhenoCam ranks fusion methods like the satellite-internal reference.</p>
|
||||
<table class="stat-row-table">
|
||||
${row("Min cloud-free S2 gate", data.min_cloudfree_s2 ?? "—")}
|
||||
${row("Pooled LOOCV dates", agg.n_loocv_dates ?? "—")}
|
||||
${row("Winner agreement rate", fmtStat(agg.winner_agreement_rate))}
|
||||
</table>
|
||||
</div>`;
|
||||
|
||||
panel.innerHTML =
|
||||
`<div class="stat-summary">Is PhenoCam GCC a valid reference for ranking fusion accuracy? · ${data.n_sites_total ?? "—"} site(s) · r threshold ${threshold}</div>` +
|
||||
`<div class="stat-grid">${aggregateCard}${reprCard}${concordanceCard}</div>` +
|
||||
gccDroppedSitesList(data);
|
||||
updateStatsMeta();
|
||||
}
|
||||
|
||||
async function loadGccSuitabilityPanel() {
|
||||
const panel = qs("#statsTabGcc");
|
||||
panel.innerHTML = '<div class="stat-nodata">Loading…</div>';
|
||||
try {
|
||||
const data = await fetch(`data/gcc_suitability/${currentYear}.json`)
|
||||
.then(r => { if (!r.ok) throw new Error(); return r.json(); });
|
||||
gccSuitabilityData = data;
|
||||
gccSuitabilityYear = currentYear;
|
||||
renderGccSuitabilityPanel(data);
|
||||
} catch {
|
||||
gccSuitabilityData = null;
|
||||
gccSuitabilityYear = null;
|
||||
panel.innerHTML =
|
||||
'<div class="stat-nodata">No GCC suitability file found — run 7-gcc-suitability.py first.</div>';
|
||||
updateStatsMeta();
|
||||
}
|
||||
}
|
||||
|
||||
const STATS_TAB_PANELS = {
|
||||
comparison: "#statsTabComparison",
|
||||
gcc: "#statsTabGcc",
|
||||
|
|
@ -736,6 +872,9 @@ function switchStatsTab(tab, updateHash = true) {
|
|||
if (tab === "comparison") {
|
||||
if (statsYear !== currentYear || !statsData) loadStatsPanel();
|
||||
else updateStatsMeta();
|
||||
} else if (tab === "gcc") {
|
||||
if (gccSuitabilityYear !== currentYear || !gccSuitabilityData) loadGccSuitabilityPanel();
|
||||
else updateStatsMeta();
|
||||
} else {
|
||||
updateStatsMeta();
|
||||
}
|
||||
|
|
@ -783,9 +922,12 @@ async function init() {
|
|||
currentYear = +yearSel.value;
|
||||
statsData = null;
|
||||
statsYear = null;
|
||||
gccSuitabilityData = null;
|
||||
gccSuitabilityYear = null;
|
||||
buildSiteList();
|
||||
if (worldOverlayOpen) buildWorldMap();
|
||||
if (statsOverlayOpen && statsTab === "comparison") loadStatsPanel();
|
||||
if (statsOverlayOpen && statsTab === "gcc") loadGccSuitabilityPanel();
|
||||
});
|
||||
|
||||
qs("#worldMapBtn").addEventListener("click", () => openWorldOverlay());
|
||||
|
|
@ -797,6 +939,11 @@ async function init() {
|
|||
if (!link) return;
|
||||
pickSiteFromStats(link.dataset.site);
|
||||
});
|
||||
qs("#statsTabGcc").addEventListener("click", e => {
|
||||
const link = e.target.closest(".stat-site-link");
|
||||
if (!link) return;
|
||||
pickSiteFromStats(link.dataset.site);
|
||||
});
|
||||
qs("#worldClose").addEventListener("click", () => closeWorldOverlay());
|
||||
qs("#statsClose").addEventListener("click", () => closeStatsOverlay());
|
||||
qs("#worldOverlay").addEventListener("click", e => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue