efast-phenocam-validation/5-metrics.py
2026-06-10 14:18:06 +02:00

695 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp.
Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
- ``phenocam_screening/{year}.json`` — qualifying sites + metadata
- ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries
- ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters
- ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters
- ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters
- ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters
Outputs (``data/metrics/``):
- ``manifest.json`` — years + per-site metadata
- ``{year}/{site}/gcc_phenocam.json`` — PhenoCam daily ``gcc_90`` for the evaluation year (every date with a value)
- ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes)
- ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC
- ``{year}/{site}/gcc_s3.json`` — S3 composite GCC
- ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average
- ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC
- ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC
- ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer
- ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view)
- ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view)
- ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view)
- ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view)
- ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths
- ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths
- ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series
- ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene
- ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite
- ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats
CLI:
- ``--evaluation-year`` (default 2025)
- ``--site`` (optional; default: all qualifying sites with sentinel data)
"""
from __future__ import annotations
import argparse
import csv
import json
import re
from pathlib import Path
from typing import Any
import datetime
import numpy as np
import rasterio
from rasterio.crs import CRS
from rasterio.transform import rowcol
from pyproj import Transformer
from scipy.stats import pearsonr
from tqdm import tqdm
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Root directory for every input and output of this step (relative path, so it
# is resolved against the current working directory).
DATA_DIR = Path("data")
# Evaluation year used when ``--evaluation-year`` is not given.
DEFAULT_YEAR = 2025
# Window length for the S3 GCC moving average. ``_moving_average`` slices by
# list position, so this is a number of samples (composites); it equals days
# only when the composites are daily.
S3_SMOOTH_WINDOW = 5
# Whittaker lambda (penalised smoothing strength for S2)
WHITTAKER_LAMBDA = 400.0
# Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel)
SPATIAL_CV_HALF_M = 150
# PhenoCam archive image URL pattern; filled with site, year, two-digit month
# and the midday image filename.
PHENOCAM_IMAGE_URL = "https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}"
# ---------------------------------------------------------------------------
# Helpers: raster pixel extraction
# ---------------------------------------------------------------------------
def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
"""Return the 3×3 mean GCC value at (lat, lon) from a single-band raster.
Returns ``None`` when the pixel is masked/zero/NaN.
"""
try:
with rasterio.open(path) as src:
transformer = Transformer.from_crs(
CRS.from_epsg(4326), src.crs, always_xy=True
)
x, y = transformer.transform(lon, lat)
row, col = rowcol(src.transform, x, y)
h, w = src.height, src.width
r0, r1 = max(0, row - 1), min(h, row + 2)
c0, c1 = max(0, col - 1), min(w, col + 2)
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
data = src.read(1, window=window).astype(float)
nodata = src.nodata
if nodata is not None:
data = np.where(data == nodata, np.nan, data)
data[data == 0] = np.nan
val = np.nanmean(data)
return None if np.isnan(val) else float(val)
except Exception:
return None
# ---------------------------------------------------------------------------
# Helpers: date extraction from filenames
# ---------------------------------------------------------------------------
def _date_from_gcc_tif(path: Path) -> str | None:
"""Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``."""
m = re.search(r"(\d{8})", path.stem)
return m.group(1) if m else None
def _date_from_s2_tif(path: Path) -> str | None:
"""Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``."""
parts = path.stem.split("_")
if len(parts) >= 3:
m = re.match(r"(\d{8})", parts[2])
return m.group(1) if m else None
return None
# ---------------------------------------------------------------------------
# Helpers: Whittaker smoother (2nd-order differences, dense linear solve)
# ---------------------------------------------------------------------------
def _whittaker_smooth(values: list[float | None], lam: float = WHITTAKER_LAMBDA) -> list[float | None]:
    """Penalised least-squares smoother (Whittaker, 2nd-order differences).

    Solves ``(W + lam * D'D) z = W y`` where ``W`` is a 0/1 weight per sample
    (0 for ``None``) and ``D`` is the second-difference operator.  The penalty
    acts on consecutive list positions, so the time spacing between samples is
    not taken into account.

    Args:
        values: Samples in order; ``None`` marks a missing observation.
        lam: Smoothing strength; larger values give a smoother curve.

    Returns:
        A new list of the same length.  Observed positions hold the smoothed
        value; ``None`` positions stay ``None`` so the caller can tell
        observed points from gaps.  The input is returned unchanged (as a
        copy) when it has fewer than four samples, fewer than two observed
        samples, or the linear system cannot be solved.
    """
    n = len(values)
    observed = [v is not None for v in values]
    # With a second-order penalty the system matrix is singular unless at
    # least two samples are observed (any straight line through a single
    # point is unpenalised), so solving would return arbitrary numbers.
    if n < 4 or sum(observed) < 2:
        return list(values)
    y = np.array([v if v is not None else 0.0 for v in values], dtype=float)
    w = np.array(observed, dtype=float)
    D = np.diff(np.eye(n), n=2, axis=0)  # (n-2) x n second-difference matrix
    A = np.diag(w) + lam * D.T @ D
    try:
        z = np.linalg.solve(A, w * y)
    except np.linalg.LinAlgError:
        return list(values)
    return [float(zi) if seen else None for zi, seen in zip(z, observed)]
# ---------------------------------------------------------------------------
# Helpers: PhenoCam CSV parsing
# ---------------------------------------------------------------------------
def _parse_phenocam_csv(
    csv_path: Path, year: int, site: str
) -> tuple[list[dict], list[dict]]:
    """Return (gcc_series, image_list) filtered to ``year``.

    Lines starting with ``#`` are skipped before the header is parsed.  Rows
    are kept when their ``year`` column equals ``str(year)``.

    Args:
        csv_path: PhenoCam ``*_1day.csv`` file; a missing file yields two
            empty lists.
        year: Year to keep.
        site: Site name, used only to build the image URLs.

    Returns:
        ``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}``
        ``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}``
    """
    gcc_series: list[dict] = []
    image_list: list[dict] = []
    year_str = str(year)
    if not csv_path.is_file():
        return gcc_series, image_list
    with csv_path.open() as f:
        reader = csv.DictReader(line for line in f if not line.startswith("#"))
        for row in reader:
            if row.get("year") != year_str:
                continue
            # DictReader fills the missing trailing fields of a short row with
            # None, so every field is normalised to a string before use.
            date = row.get("date") or ""
            gcc_raw = row.get("gcc_90") or ""
            if gcc_raw and gcc_raw != "NA":
                try:
                    gcc_series.append({"date": date, "gcc_90": float(gcc_raw)})
                except ValueError:
                    # Unparseable value: treat it like "NA" and keep going so
                    # the row's image entry is still recorded.
                    pass
            fn = (row.get("midday_filename") or "").strip()
            if fn and fn != "NA" and date:
                month = date[5:7]
                url = PHENOCAM_IMAGE_URL.format(
                    site=site, year=year_str, month=month, filename=fn
                )
                image_list.append({"date": date, "url": url})
    return gcc_series, image_list
# ---------------------------------------------------------------------------
# Helpers: moving average
# ---------------------------------------------------------------------------
def _moving_average(
    series: list[dict], value_key: str, window: int
) -> list[dict]:
    """Return a centred moving average of ``series[...][value_key]``.

    The window spans ``window // 2`` neighbours on each side, counted in list
    positions (not calendar days), and is truncated at both ends.  ``None``
    values are ignored; a window with no values yields ``None``.  Each output
    entry is ``{"date": ..., value_key + "_smooth": ...}``.
    """
    if not series:
        return []
    values = [point[value_key] for point in series]
    reach = window // 2
    out_key = value_key + "_smooth"
    averaged: list[dict] = []
    for idx, point in enumerate(series):
        lo = max(0, idx - reach)
        hi = idx + reach + 1
        present = [v for v in values[lo:hi] if v is not None]
        if present:
            mean = sum(present) / len(present)
        else:
            mean = None
        averaged.append({"date": point["date"], out_key: mean})
    return averaged
# ---------------------------------------------------------------------------
# Helpers: validation metrics
# ---------------------------------------------------------------------------
MATCH_TOLERANCE_DAYS = 5
def compute_metrics(
ref: list[dict], ref_key: str,
pred: list[dict], pred_key: str,
) -> dict | None:
"""Compute NSE, RMSE, nRMSE, Pearson r between pred and ref.
Each pred point is matched to the nearest ref date within
``MATCH_TOLERANCE_DAYS``. Returns a dict or ``None`` if fewer than
2 matched pairs exist.
"""
ref_lookup: dict[str, float] = {p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None}
if not ref_lookup:
return None
ref_dates = sorted(ref_lookup)
obs, sim = [], []
for pt in pred:
v = pt.get(pred_key)
if v is None:
continue
nearest = min(ref_dates, key=lambda d: abs((
np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D")))
gap = abs((np.datetime64(pt["date"]) - np.datetime64(nearest)) / np.timedelta64(1, "D"))
if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup:
obs.append(ref_lookup[nearest])
sim.append(v)
if len(obs) < 2:
return None
obs_arr = np.array(obs)
sim_arr = np.array(sim)
obs_mean = obs_arr.mean()
rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2)))
nrmse = rmse / obs_mean if obs_mean else None
ss_res = float(np.sum((obs_arr - sim_arr) ** 2))
ss_tot = float(np.sum((obs_arr - obs_mean) ** 2))
nse = (1.0 - ss_res / ss_tot) if ss_tot else None
r, _ = pearsonr(obs_arr, sim_arr)
def _r4(v: float | None) -> float | None:
return round(v, 4) if v is not None else None
return {"n": len(obs), "rmse": _r4(rmse), "nrmse": _r4(nrmse), "nse": _r4(nse), "r": _r4(float(r))}
S2_BAND_NAMES = ["B02", "B03", "B04"]
S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"]
def _read_multiband_center(
path: Path, lat: float, lon: float, band_names: list[str]
) -> dict[str, float | None]:
"""Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None."""
try:
with rasterio.open(path) as src:
transformer = Transformer.from_crs(CRS.from_epsg(4326), src.crs, always_xy=True)
x, y = transformer.transform(lon, lat)
row, col = rowcol(src.transform, x, y)
h, w = src.height, src.width
r0, r1 = max(0, row - 1), min(h, row + 2)
c0, c1 = max(0, col - 1), min(w, col + 2)
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
nodata = src.nodata
result = {}
for i, name in enumerate(band_names, 1):
if i > src.count:
result[name] = None
continue
data = src.read(i, window=window).astype(float)
if nodata is not None:
data = np.where(data == nodata, np.nan, data)
data[data == 0] = np.nan
val = np.nanmean(data)
result[name] = None if np.isnan(val) else round(float(val), 6)
return result
except Exception:
return {name: None for name in band_names}
def _multiband_series(
    tif_paths: list[Path],
    date_fn,
    lat: float,
    lon: float,
    band_names: list[str],
    desc: str,
) -> list[dict]:
    """Build a per-band center-pixel series: ``[{date, band1, band2, …}]``.

    ``date_fn`` maps a path to ``YYYYMMDD`` (or ``None`` to skip the file).
    Files for which every band is ``None`` are dropped.  Dates are emitted as
    ``YYYY-MM-DD`` and the result is sorted by date.  ``desc`` labels the
    progress bar.
    """
    rows: list[dict] = []
    for tif in tqdm(tif_paths, desc=desc, leave=False):
        stamp = date_fn(tif)
        if stamp is None:
            continue
        bands = _read_multiband_center(tif, lat, lon, band_names)
        if all(value is None for value in bands.values()):
            continue
        iso_date = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
        rows.append({"date": iso_date, **bands})
    rows.sort(key=lambda row: row["date"])
    return rows
# ---------------------------------------------------------------------------
# Helpers: spatial heterogeneity + observation density
# ---------------------------------------------------------------------------
def _read_footprint_stats(
    path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M
) -> tuple[float, float] | tuple[None, None]:
    """Return (mean, std) of the valid band-1 pixels in a square around (lat, lon).

    The square extends ``half_m`` on each side of the point, converted to
    pixels with the raster's x pixel size (at least one pixel) and clipped at
    the raster edges.  This treats the pixel size as metres, which presumably
    holds for the projected rasters used here — confirm for other CRSs.
    Pixels equal to nodata, non-positive, or NaN are discarded.

    Returns ``(None, None)`` on any error or when fewer than 4 valid pixels
    exist.
    """
    try:
        with rasterio.open(path) as src:
            to_raster = Transformer.from_crs(CRS.from_epsg(4326), src.crs, always_xy=True)
            x, y = to_raster.transform(lon, lat)
            pixel_size = abs(src.transform.a)  # pixel size in CRS units (metres for UTM)
            reach = max(1, int(round(half_m / pixel_size)))
            row, col = rowcol(src.transform, x, y)
            top = max(0, row - reach)
            bottom = min(src.height, row + reach + 1)
            left = max(0, col - reach)
            right = min(src.width, col + reach + 1)
            window = rasterio.windows.Window(left, top, right - left, bottom - top)
            pixels = src.read(1, window=window).astype(float)
            fill = src.nodata
            if fill is not None:
                pixels = np.where(pixels == fill, np.nan, pixels)
            pixels[pixels <= 0] = np.nan
            usable = pixels[~np.isnan(pixels)]
            if len(usable) >= 4:
                return float(np.mean(usable)), float(np.std(usable))
            return None, None
    except Exception:
        return None, None
def compute_covariates(
s2_gcc_paths: list[Path],
s2_series: list[dict],
s3_series: list[dict],
n_gcc_points: int | None,
lat: float,
lon: float,
) -> dict:
"""Compute spatial heterogeneity and temporal observation density covariates."""
# Spatial GCC statistics over ~300 m footprint
means, stds = [], []
for p in s2_gcc_paths:
m, s = _read_footprint_stats(p, lat, lon)
if m is not None and m > 0:
means.append(m)
stds.append(s)
spatial_gcc_cv = round(float(np.mean([s / m for s, m in zip(stds, means)])), 4) if means else None
spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None
# S2 temporal gap statistics
s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series]
if len(s2_dates) >= 2:
gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)]
s2_mean_gap = round(float(np.mean(gaps)), 1)
s2_max_gap = int(max(gaps))
else:
s2_mean_gap = None
s2_max_gap = None
return {
"spatial_gcc_cv": spatial_gcc_cv,
"spatial_gcc_std": spatial_gcc_std,
"s2_scene_count": len(s2_series),
"s2_mean_gap_days": s2_mean_gap,
"s2_max_gap_days": s2_max_gap,
"s3_composite_count": len(s3_series),
"n_gcc_points": n_gcc_points,
}
# ---------------------------------------------------------------------------
# Per-site export
# ---------------------------------------------------------------------------
def _write_json(path: Path, data: Any) -> None:
    """Serialise ``data`` as compact JSON (no spaces) and write it to ``path``."""
    compact = json.dumps(data, separators=(",", ":"))
    path.write_text(compact)
def _raster_series(
    tif_paths: list[Path],
    date_fn,
    lat: float,
    lon: float,
    desc: str,
) -> list[dict]:
    """Build a center-pixel GCC series ``[{date, gcc}]`` sorted by date.

    ``date_fn`` maps a path to ``YYYYMMDD`` (or ``None`` to skip the file).
    Files whose center-pixel read returns ``None`` are dropped.  Dates are
    emitted as ``YYYY-MM-DD``.  ``desc`` labels the progress bar.
    """
    points: list[dict] = []
    for tif in tqdm(tif_paths, desc=desc, leave=False):
        stamp = date_fn(tif)
        if stamp is None:
            continue
        gcc = _read_center_pixel(tif, lat, lon)
        if gcc is None:
            continue
        iso_date = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
        points.append({"date": iso_date, "gcc": gcc})
    points.sort(key=lambda point: point["date"])
    return points
def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]:
    """Build the raster index ``[{date, path}]`` sorted by date.

    ``date_fn`` maps a path to its date string (files mapping to ``None`` are
    skipped).  ``path`` is given relative to ``rel_root`` when the file lies
    under it, otherwise as passed in.
    """
    entries: list[dict] = []
    for tif in tif_paths:
        stamp = date_fn(tif)
        if stamp is None:
            continue
        try:
            shown = tif.relative_to(rel_root)
        except ValueError:
            # Not located under rel_root: keep the path unchanged.
            shown = tif
        entries.append({"date": stamp, "path": str(shown)})
    entries.sort(key=lambda entry: entry["date"])
    return entries
def export_site(
    site: str,
    year: int,
    lat: float,
    lon: float,
    out_dir: Path,
    n_gcc_points: int | None = None,
) -> bool:
    """Write every per-site JSON product for ``site`` into ``out_dir``.

    Reads the prepared Sentinel-2/-3 rasters, the BtI/ItB fusion rasters and
    the PhenoCam CSV below ``DATA_DIR`` and writes the ``gcc_*.json`` series,
    ``phenocam_images.json``, ``bands_s2.json``/``bands_s3.json``, the six
    ``rasters_*.json`` slider indexes, ``covariates.json`` and
    ``metrics.json``.  Legacy bundled ``timeseries.json``/``rasters.json``
    files left in ``out_dir`` are deleted.

    Args:
        site: PhenoCam site name (also the directory name under ``DATA_DIR``).
        year: Evaluation year.
        lat: Site latitude in decimal degrees (EPSG:4326).
        lon: Site longitude in decimal degrees (EPSG:4326).
        out_dir: Output directory; created if needed.
        n_gcc_points: Passed through unchanged into ``covariates.json``.

    Returns:
        ``False`` (nothing written) when no ``GCC_*.tif`` exists under
        ``fusion/{year}/{site}/bti/gcc``; ``True`` otherwise.
    """
    sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared"
    fusion_base = DATA_DIR / "fusion" / str(year) / site
    s2_gcc_dir = sentinel_base / "s2"
    s3_gcc_dir = sentinel_base / "gcc_s3"
    bti_gcc_dir = fusion_base / "bti" / "gcc"
    itb_gcc_dir = fusion_base / "itb" / "fusion"
    # Raster slider sources
    s2_refl_dir = sentinel_base / "s2"
    s3_comp_dir = sentinel_base / "s3"
    bti_refl_dir = fusion_base / "bti" / "fusion"

    # BtI fused GCC is the minimum requirement for a site to be exported.
    bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif")) if bti_gcc_dir.is_dir() else []
    if not bti_paths:
        return False
    out_dir.mkdir(parents=True, exist_ok=True)

    # Every directory is scanned once; the sorted lists are reused for the
    # timeseries, the band series and the raster indexes below.
    s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif"))
    s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif"))
    itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif"))
    s2_refl_paths = sorted(s2_refl_dir.glob("*_REFL.tif"))
    s3_comp_paths = sorted(s3_comp_dir.glob("composite_*.tif"))
    bti_refl_paths = sorted(bti_refl_dir.glob("REFL_*.tif"))

    # --- GCC timeseries from rasters ---
    s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2")
    s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3")
    bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI")
    itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB")

    # Whittaker on S2
    s2_smooth_vals = _whittaker_smooth([p["gcc"] for p in s2_series])
    s2_whittaker = [
        {"date": p["date"], "gcc": v}
        for p, v in zip(s2_series, s2_smooth_vals)
        if v is not None
    ]

    # S3 moving average (window of S3_SMOOTH_WINDOW samples)
    s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW)

    # PhenoCam CSV
    csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv"
    phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site)

    s3_smooth_series = [
        {"date": p["date"], "gcc": p["gcc_smooth"]}
        for p in s3_smooth
        if p.get("gcc_smooth") is not None
    ]

    # Band reflectance timeseries (multi-band center-pixel)
    bands_s2 = _multiband_series(s2_refl_paths, _date_from_s2_tif, lat, lon, S2_BAND_NAMES, f"{site} S2 bands")
    bands_s3 = _multiband_series(s3_comp_paths, _date_from_gcc_tif, lat, lon, S3_BAND_NAMES, f"{site} S3 bands")

    # --- Per-metric JSON outputs ---
    _write_json(out_dir / "gcc_phenocam.json", phenocam_series)
    _write_json(out_dir / "gcc_s2.json", s2_series)
    _write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker)
    _write_json(out_dir / "gcc_s3.json", s3_series)
    _write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series)
    _write_json(out_dir / "gcc_fusion_bti.json", bti_series)
    _write_json(out_dir / "gcc_fusion_itb.json", itb_series)
    _write_json(out_dir / "phenocam_images.json", image_list)
    _write_json(out_dir / "bands_s2.json", bands_s2)
    _write_json(out_dir / "bands_s3.json", bands_s3)

    # --- Raster index for slider ---
    rel_root = DATA_DIR.parent  # paths relative to project root
    # Valid-pixel sets: only show S2/S3 rasters where the center pixel had
    # usable data (non-zero GCC). This excludes cloud-masked / snow-covered
    # scenes that would render as black or visually nonsensical.
    s2_valid_dates = {p["date"].replace("-", "") for p in s2_series}
    s3_valid_dates = {p["date"].replace("-", "") for p in s3_series}

    s2_refl = [r for r in _raster_index(s2_refl_paths, _date_from_s2_tif, rel_root)
               if r["date"] in s2_valid_dates]
    s3_comp = [r for r in _raster_index(s3_comp_paths, _date_from_gcc_tif, rel_root)
               if r["date"] in s3_valid_dates]
    s2_gcc = [r for r in _raster_index(s2_gcc_paths, _date_from_s2_tif, rel_root)
              if r["date"] in s2_valid_dates]
    s3_gcc = [r for r in _raster_index(s3_gcc_paths, _date_from_gcc_tif, rel_root)
              if r["date"] in s3_valid_dates]
    bti_refl = _raster_index(bti_refl_paths, _date_from_gcc_tif, rel_root)
    itb_gcc = _raster_index(itb_paths, _date_from_gcc_tif, rel_root)

    _write_json(out_dir / "rasters_s2_refl.json", s2_refl)
    _write_json(out_dir / "rasters_s3_composite.json", s3_comp)
    _write_json(out_dir / "rasters_s2_gcc.json", s2_gcc)
    _write_json(out_dir / "rasters_s3_gcc.json", s3_gcc)
    _write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl)
    _write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc)

    # --- Site covariates (heterogeneity + observation density) ---
    _write_json(out_dir / "covariates.json", compute_covariates(
        s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon
    ))

    # --- Validation metrics vs PhenoCam gcc_90 ---
    _write_json(out_dir / "metrics.json", {
        "bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"),
        "itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"),
        "s2_whittaker": compute_metrics(phenocam_series, "gcc_90", s2_whittaker, "gcc"),
        "s3_smooth": compute_metrics(phenocam_series, "gcc_90", s3_smooth_series, "gcc"),
        "s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"),
        "s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"),
    })

    # Remove legacy bundled outputs if present
    for legacy in ("timeseries.json", "rasters.json"):
        (out_dir / legacy).unlink(missing_ok=True)
    return True
# ---------------------------------------------------------------------------
# Manifest
# ---------------------------------------------------------------------------
VEG_TYPE_LABELS = {
"AG": "Agriculture",
"DB": "Deciduous broadleaf",
"DN": "Deciduous needleleaf",
"EB": "Evergreen broadleaf",
"EN": "Evergreen needleleaf",
"GR": "Grassland",
"MX": "Mixed",
"SH": "Shrubland",
"TN": "Tundra",
"UN": "Unknown",
"WL": "Wetland",
"RF": "Reference",
}
def build_manifest(years: list[int], filter_site: str | None = None) -> dict:
manifest: dict[str, Any] = {"years": years, "sites": {}}
for year in years:
screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json"
if not screening_path.is_file():
continue
data = json.loads(screening_path.read_text())
sites_meta: dict[str, Any] = {}
for entry in data.get("sites", []):
if entry.get("calculations", {}).get("status") != "PASS":
continue
cam = entry.get("response", {}).get("camera", {})
roi = entry.get("response", {}).get("roi", {})
calc = entry.get("calculations", {})
site = cam.get("Sitename", "")
if not site:
continue
if filter_site and site != filter_site:
continue
sm = cam.get("sitemetadata", {})
veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN"
fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc"
has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif"))
sites_meta[site] = {
"lat": cam.get("Lat"),
"lon": cam.get("Lon"),
"veg_type": veg_raw,
"veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw),
"description": sm.get("site_description", ""),
"dominant_species": sm.get("dominant_species", ""),
"group": sm.get("group", ""),
"snr": calc.get("snr"),
"n_gcc_points": calc.get("n_gcc_points"),
"has_fusion": has_fusion,
}
manifest["sites"][str(year)] = sites_meta
return manifest
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: export per-site metrics and the manifest.

    Builds the manifest for every year that has a screening file (or just the
    requested year when none exist), exports all sites of
    ``--evaluation-year`` that have fusion data (only ``--site`` when given),
    and writes ``data/metrics/manifest.json``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
    parser.add_argument("--site", type=str, default=None)
    args = parser.parse_args()
    year = args.evaluation_year
    filter_site = args.site
    out_base = DATA_DIR / "metrics"
    out_base.mkdir(parents=True, exist_ok=True)

    # Determine years with screening data
    screening_dir = DATA_DIR / "phenocam_screening"
    years = sorted(
        int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit()
    )
    if not years:
        years = [year]
    print(f"Building manifest for years: {years}")
    # NOTE(review): with --site the manifest holds only that site, and it
    # replaces any existing manifest.json below — confirm this is intended.
    manifest = build_manifest(years, filter_site)

    # Export per-site data for the requested year
    year_sites = manifest["sites"].get(str(year), {})
    fusion_sites = {
        s: m
        for s, m in year_sites.items()
        if m["has_fusion"] and (not filter_site or s == filter_site)
    }
    print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}")
    for site, meta in tqdm(fusion_sites.items(), desc="Sites"):
        out_dir = out_base / str(year) / site
        ok = export_site(site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points"))
        # tqdm.write prints above the progress bar; a plain print() inside
        # the loop would break the bar's line.
        if ok:
            tqdm.write(f"{site}")
        else:
            tqdm.write(f"{site} — no fusion data found")

    manifest_path = out_base / "manifest.json"
    _write_json(manifest_path, manifest)
    print(f"Manifest written → {manifest_path}")


if __name__ == "__main__":
    main()