788 lines
27 KiB
Python
788 lines
27 KiB
Python
"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp.
|
||
|
||
Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
|
||
|
||
- ``phenocam_screening/{year}.json`` — qualifying sites + metadata
|
||
- ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries
|
||
- ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters
|
||
- ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters
|
||
- ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters
|
||
- ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters
|
||
|
||
Outputs (``data/metrics/``):
|
||
|
||
- ``manifest.json`` — years + per-site metadata
|
||
- ``{year}/{site}/gcc_phenocam.json`` — PhenoCam ``gcc_90`` at matched dates
|
||
- ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes)
|
||
- ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC
|
||
- ``{year}/{site}/gcc_s3.json`` — S3 composite GCC
|
||
- ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average
|
||
- ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC
|
||
- ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC
|
||
- ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer
|
||
- ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view)
|
||
- ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view)
|
||
- ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view)
|
||
- ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view)
|
||
- ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths
|
||
- ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths
|
||
- ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series
|
||
- ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene
|
||
- ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite
|
||
- ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats
|
||
|
||
CLI:
|
||
|
||
- ``--evaluation-year`` (default 2025)
|
||
- ``--site`` (optional; default: all qualifying sites with sentinel data)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import csv
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import datetime
|
||
import numpy as np
|
||
import rasterio
|
||
from rasterio.crs import CRS
|
||
from rasterio.transform import rowcol
|
||
from pyproj import Transformer
|
||
from scipy.stats import pearsonr
|
||
from tqdm import tqdm
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Constants
|
||
# ---------------------------------------------------------------------------
|
||
|
||
DATA_DIR = Path("data")
|
||
DEFAULT_YEAR = 2025
|
||
|
||
# GCC smoothing window for S3 moving average (days)
|
||
S3_SMOOTH_WINDOW = 5
|
||
|
||
# Whittaker lambda (penalised smoothing strength for S2)
|
||
WHITTAKER_LAMBDA = 400.0
|
||
|
||
# Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel)
|
||
SPATIAL_CV_HALF_M = 150
|
||
|
||
# PhenoCam archive image URL pattern
|
||
PHENOCAM_IMAGE_URL = (
|
||
"https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}"
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: raster pixel extraction
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _window_mean(data: np.ndarray) -> float | None:
|
||
"""Mean of a pixel window; ``None`` when every value is NaN."""
|
||
valid = data[~np.isnan(data)]
|
||
if valid.size == 0:
|
||
return None
|
||
return float(np.mean(valid))
|
||
|
||
|
||
def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
|
||
"""Return the 3×3 mean GCC value at (lat, lon) from a single-band raster.
|
||
|
||
Returns ``None`` when the pixel is masked/zero/NaN.
|
||
"""
|
||
try:
|
||
with rasterio.open(path) as src:
|
||
transformer = Transformer.from_crs(
|
||
CRS.from_epsg(4326), src.crs, always_xy=True
|
||
)
|
||
x, y = transformer.transform(lon, lat)
|
||
row, col = rowcol(src.transform, x, y)
|
||
h, w = src.height, src.width
|
||
r0, r1 = max(0, row - 1), min(h, row + 2)
|
||
c0, c1 = max(0, col - 1), min(w, col + 2)
|
||
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
|
||
data = src.read(1, window=window).astype(float)
|
||
nodata = src.nodata
|
||
if nodata is not None:
|
||
data = np.where(data == nodata, np.nan, data)
|
||
data[data == 0] = np.nan
|
||
return _window_mean(data)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: date extraction from filenames
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _date_from_gcc_tif(path: Path) -> str | None:
|
||
"""Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``."""
|
||
m = re.search(r"(\d{8})", path.stem)
|
||
return m.group(1) if m else None
|
||
|
||
|
||
def _date_from_s2_tif(path: Path) -> str | None:
|
||
"""Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``."""
|
||
parts = path.stem.split("_")
|
||
if len(parts) >= 3:
|
||
m = re.match(r"(\d{8})", parts[2])
|
||
return m.group(1) if m else None
|
||
return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: Whittaker smoother (2nd-order differences, tridiagonal solver)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _whittaker_smooth(
|
||
values: list[float | None], lam: float = WHITTAKER_LAMBDA
|
||
) -> list[float | None]:
|
||
"""Penalised least-squares smoother (Whittaker, 2nd-order differences).
|
||
|
||
Masked (None) values are filled via the smooth and then re-set to None in
|
||
the output so the caller can distinguish observed from gap-filled points.
|
||
"""
|
||
n = len(values)
|
||
if n < 4:
|
||
return values[:]
|
||
|
||
obs_mask = [v is not None for v in values]
|
||
y = np.array([v if v is not None else 0.0 for v in values], dtype=float)
|
||
w = np.array([1.0 if m else 0.0 for m in obs_mask], dtype=float)
|
||
|
||
W = np.diag(w)
|
||
D = np.diff(np.eye(n), n=2, axis=0) # (n-2) x n second-difference matrix
|
||
A = W + lam * D.T @ D
|
||
try:
|
||
z = np.linalg.solve(A, w * y)
|
||
except np.linalg.LinAlgError:
|
||
return values[:]
|
||
|
||
result: list[float | None] = []
|
||
for i, m in enumerate(obs_mask):
|
||
result.append(float(z[i]) if m else None)
|
||
return result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: PhenoCam CSV parsing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _parse_phenocam_csv(
|
||
csv_path: Path, year: int, site: str
|
||
) -> tuple[list[dict], list[dict]]:
|
||
"""Return (gcc_series, image_list) filtered to ``year``.
|
||
|
||
``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}``
|
||
``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}``
|
||
"""
|
||
gcc_series: list[dict] = []
|
||
image_list: list[dict] = []
|
||
year_str = str(year)
|
||
|
||
if not csv_path.is_file():
|
||
return gcc_series, image_list
|
||
|
||
with csv_path.open() as f:
|
||
lines = [line for line in f if not line.startswith("#")]
|
||
|
||
reader = csv.DictReader(lines)
|
||
for row in reader:
|
||
if row.get("year") != year_str:
|
||
continue
|
||
date = row.get("date", "")
|
||
gcc_raw = row.get("gcc_90")
|
||
if gcc_raw and gcc_raw not in ("NA", ""):
|
||
try:
|
||
gcc_series.append({"date": date, "gcc_90": float(gcc_raw)})
|
||
except ValueError:
|
||
pass
|
||
fn = row.get("midday_filename", "").strip()
|
||
if fn and fn != "NA" and date:
|
||
month = date[5:7]
|
||
url = PHENOCAM_IMAGE_URL.format(
|
||
site=site, year=year_str, month=month, filename=fn
|
||
)
|
||
image_list.append({"date": date, "url": url})
|
||
|
||
return gcc_series, image_list
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: moving average
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _moving_average(series: list[dict], value_key: str, window: int) -> list[dict]:
|
||
"""Compute centred moving average; returns new list with ``_smooth`` suffix key."""
|
||
if not series:
|
||
return []
|
||
vals = [p[value_key] for p in series]
|
||
half = window // 2
|
||
smoothed = []
|
||
for i, pt in enumerate(series):
|
||
chunk = [v for v in vals[max(0, i - half) : i + half + 1] if v is not None]
|
||
smoothed.append(
|
||
{
|
||
"date": pt["date"],
|
||
value_key + "_smooth": (sum(chunk) / len(chunk)) if chunk else None,
|
||
}
|
||
)
|
||
return smoothed
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: validation metrics
|
||
# ---------------------------------------------------------------------------
|
||
|
||
MATCH_TOLERANCE_DAYS = 5
|
||
|
||
|
||
def compute_metrics(
|
||
ref: list[dict],
|
||
ref_key: str,
|
||
pred: list[dict],
|
||
pred_key: str,
|
||
) -> dict | None:
|
||
"""Compute NSE, RMSE, nRMSE, Pearson r between pred and ref.
|
||
|
||
Each pred point is matched to the nearest ref date within
|
||
``MATCH_TOLERANCE_DAYS``. Returns a dict or ``None`` if fewer than
|
||
2 matched pairs exist.
|
||
"""
|
||
ref_lookup: dict[str, float] = {
|
||
p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
|
||
}
|
||
if not ref_lookup:
|
||
return None
|
||
|
||
ref_dates = sorted(ref_lookup)
|
||
|
||
obs, sim = [], []
|
||
for pt in pred:
|
||
v = pt.get(pred_key)
|
||
if v is None:
|
||
continue
|
||
nearest = min(
|
||
ref_dates,
|
||
key=lambda d: abs(
|
||
(np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D")
|
||
),
|
||
)
|
||
gap = abs(
|
||
(np.datetime64(pt["date"]) - np.datetime64(nearest))
|
||
/ np.timedelta64(1, "D")
|
||
)
|
||
if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup:
|
||
obs.append(ref_lookup[nearest])
|
||
sim.append(v)
|
||
|
||
if len(obs) < 2:
|
||
return None
|
||
|
||
obs_arr = np.array(obs)
|
||
sim_arr = np.array(sim)
|
||
obs_mean = obs_arr.mean()
|
||
|
||
rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2)))
|
||
nrmse = rmse / obs_mean if obs_mean else None
|
||
ss_res = float(np.sum((obs_arr - sim_arr) ** 2))
|
||
ss_tot = float(np.sum((obs_arr - obs_mean) ** 2))
|
||
nse = (1.0 - ss_res / ss_tot) if ss_tot else None
|
||
r, _ = pearsonr(obs_arr, sim_arr)
|
||
|
||
def _r4(v: float | None) -> float | None:
|
||
return round(v, 4) if v is not None else None
|
||
|
||
return {
|
||
"n": len(obs),
|
||
"rmse": _r4(rmse),
|
||
"nrmse": _r4(nrmse),
|
||
"nse": _r4(nse),
|
||
"r": _r4(float(r)),
|
||
}
|
||
|
||
|
||
S2_BAND_NAMES = ["B02", "B03", "B04"]
|
||
S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"]
|
||
|
||
|
||
def _read_multiband_center(
|
||
path: Path, lat: float, lon: float, band_names: list[str]
|
||
) -> dict[str, float | None]:
|
||
"""Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None."""
|
||
try:
|
||
with rasterio.open(path) as src:
|
||
transformer = Transformer.from_crs(
|
||
CRS.from_epsg(4326), src.crs, always_xy=True
|
||
)
|
||
x, y = transformer.transform(lon, lat)
|
||
row, col = rowcol(src.transform, x, y)
|
||
h, w = src.height, src.width
|
||
r0, r1 = max(0, row - 1), min(h, row + 2)
|
||
c0, c1 = max(0, col - 1), min(w, col + 2)
|
||
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
|
||
nodata = src.nodata
|
||
result = {}
|
||
for i, name in enumerate(band_names, 1):
|
||
if i > src.count:
|
||
result[name] = None
|
||
continue
|
||
data = src.read(i, window=window).astype(float)
|
||
if nodata is not None:
|
||
data = np.where(data == nodata, np.nan, data)
|
||
data[data == 0] = np.nan
|
||
val = _window_mean(data)
|
||
result[name] = None if val is None else round(val, 6)
|
||
return result
|
||
except Exception:
|
||
return {name: None for name in band_names}
|
||
|
||
|
||
def _multiband_series(
|
||
tif_paths: list[Path],
|
||
date_fn,
|
||
lat: float,
|
||
lon: float,
|
||
band_names: list[str],
|
||
desc: str,
|
||
) -> list[dict]:
|
||
"""Extract center-pixel values for all bands; return ``[{date, band1, band2, …}]``."""
|
||
result = []
|
||
for p in tqdm(tif_paths, desc=desc, leave=False):
|
||
date = date_fn(p)
|
||
if date is None:
|
||
continue
|
||
vals = _read_multiband_center(p, lat, lon, band_names)
|
||
if any(v is not None for v in vals.values()):
|
||
result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", **vals})
|
||
return sorted(result, key=lambda x: x["date"])
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers: spatial heterogeneity + observation density
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _read_footprint_stats(
|
||
path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M
|
||
) -> tuple[float, float] | tuple[None, None]:
|
||
"""Return (mean, std) of valid GCC pixels within a ±half_m metre square window.
|
||
|
||
Returns ``(None, None)`` on any error or when fewer than 4 valid pixels exist.
|
||
"""
|
||
try:
|
||
with rasterio.open(path) as src:
|
||
transformer = Transformer.from_crs(
|
||
CRS.from_epsg(4326), src.crs, always_xy=True
|
||
)
|
||
x, y = transformer.transform(lon, lat)
|
||
res = abs(src.transform.a) # pixel size in CRS units (metres for UTM)
|
||
half_px = max(1, int(round(half_m / res)))
|
||
row, col = rowcol(src.transform, x, y)
|
||
h, w = src.height, src.width
|
||
r0, r1 = max(0, row - half_px), min(h, row + half_px + 1)
|
||
c0, c1 = max(0, col - half_px), min(w, col + half_px + 1)
|
||
window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
|
||
data = src.read(1, window=window).astype(float)
|
||
nodata = src.nodata
|
||
if nodata is not None:
|
||
data = np.where(data == nodata, np.nan, data)
|
||
data[data <= 0] = np.nan
|
||
valid = data[~np.isnan(data)]
|
||
if len(valid) < 4:
|
||
return None, None
|
||
return float(np.mean(valid)), float(np.std(valid))
|
||
except Exception:
|
||
return None, None
|
||
|
||
|
||
def compute_covariates(
|
||
s2_gcc_paths: list[Path],
|
||
s2_series: list[dict],
|
||
s3_series: list[dict],
|
||
n_gcc_points: int | None,
|
||
lat: float,
|
||
lon: float,
|
||
) -> dict:
|
||
"""Compute spatial heterogeneity and temporal observation density covariates."""
|
||
# Spatial GCC statistics over ~300 m footprint
|
||
means, stds = [], []
|
||
for p in s2_gcc_paths:
|
||
m, s = _read_footprint_stats(p, lat, lon)
|
||
if m is not None and m > 0:
|
||
means.append(m)
|
||
stds.append(s)
|
||
|
||
spatial_gcc_cv = (
|
||
round(float(np.mean([s / m for s, m in zip(stds, means)])), 4)
|
||
if means
|
||
else None
|
||
)
|
||
spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None
|
||
|
||
# S2 temporal gap statistics
|
||
s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series]
|
||
if len(s2_dates) >= 2:
|
||
gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)]
|
||
s2_mean_gap = round(float(np.mean(gaps)), 1)
|
||
s2_max_gap = int(max(gaps))
|
||
else:
|
||
s2_mean_gap = None
|
||
s2_max_gap = None
|
||
|
||
return {
|
||
"spatial_gcc_cv": spatial_gcc_cv,
|
||
"spatial_gcc_std": spatial_gcc_std,
|
||
"s2_scene_count": len(s2_series),
|
||
"s2_mean_gap_days": s2_mean_gap,
|
||
"s2_max_gap_days": s2_max_gap,
|
||
"s3_composite_count": len(s3_series),
|
||
"n_gcc_points": n_gcc_points,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Per-site export
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _write_json(path: Path, data: Any) -> None:
|
||
path.write_text(json.dumps(data, separators=(",", ":")))
|
||
|
||
|
||
def _raster_series(
|
||
tif_paths: list[Path],
|
||
date_fn,
|
||
lat: float,
|
||
lon: float,
|
||
desc: str,
|
||
) -> list[dict]:
|
||
"""Extract center-pixel GCC from each tif, return ``[{date, gcc}]`` sorted."""
|
||
result = []
|
||
for p in tqdm(tif_paths, desc=desc, leave=False):
|
||
date = date_fn(p)
|
||
if date is None:
|
||
continue
|
||
val = _read_center_pixel(p, lat, lon)
|
||
if val is not None:
|
||
result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", "gcc": val})
|
||
return sorted(result, key=lambda x: x["date"])
|
||
|
||
|
||
def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]:
|
||
"""Build raster index: ``[{date, path}]`` sorted by date."""
|
||
result = []
|
||
for p in tif_paths:
|
||
date = date_fn(p)
|
||
if date is None:
|
||
continue
|
||
try:
|
||
rel = str(p.relative_to(rel_root))
|
||
except ValueError:
|
||
rel = str(p)
|
||
result.append({"date": date, "path": rel})
|
||
return sorted(result, key=lambda x: x["date"])
|
||
|
||
|
||
def export_site(
|
||
site: str,
|
||
year: int,
|
||
lat: float,
|
||
lon: float,
|
||
out_dir: Path,
|
||
n_gcc_points: int | None = None,
|
||
) -> bool:
|
||
"""Export timeseries.json and rasters.json for one site. Returns True on success."""
|
||
sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared"
|
||
fusion_base = DATA_DIR / "fusion" / str(year) / site
|
||
|
||
s2_gcc_dir = sentinel_base / "s2"
|
||
s3_gcc_dir = sentinel_base / "gcc_s3"
|
||
bti_gcc_dir = fusion_base / "bti" / "gcc"
|
||
itb_gcc_dir = fusion_base / "itb" / "fusion"
|
||
|
||
# Raster slider sources
|
||
s2_refl_dir = sentinel_base / "s2"
|
||
s3_comp_dir = sentinel_base / "s3"
|
||
bti_refl_dir = fusion_base / "bti" / "fusion"
|
||
|
||
has_fusion = bti_gcc_dir.is_dir() and any(bti_gcc_dir.glob("GCC_*.tif"))
|
||
if not has_fusion:
|
||
return False
|
||
|
||
out_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
# --- GCC timeseries from rasters ---
|
||
s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif"))
|
||
s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif"))
|
||
bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif"))
|
||
itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif"))
|
||
|
||
s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2")
|
||
s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3")
|
||
bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI")
|
||
itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB")
|
||
|
||
# Whittaker on S2
|
||
s2_vals = [p["gcc"] for p in s2_series]
|
||
s2_smooth_vals = _whittaker_smooth(s2_vals)
|
||
s2_whittaker = [
|
||
{"date": p["date"], "gcc": v}
|
||
for p, v in zip(s2_series, s2_smooth_vals)
|
||
if v is not None
|
||
]
|
||
|
||
# S3 5-day moving average
|
||
s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW)
|
||
|
||
# PhenoCam CSV
|
||
csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv"
|
||
phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site)
|
||
|
||
s3_smooth_series = [
|
||
{"date": p["date"], "gcc": p["gcc_smooth"]}
|
||
for p in s3_smooth
|
||
if p.get("gcc_smooth") is not None
|
||
]
|
||
|
||
# Band reflectance timeseries (multi-band center-pixel)
|
||
bands_s2 = _multiband_series(
|
||
sorted(s2_refl_dir.glob("*_REFL.tif")),
|
||
_date_from_s2_tif,
|
||
lat,
|
||
lon,
|
||
S2_BAND_NAMES,
|
||
f"{site} S2 bands",
|
||
)
|
||
bands_s3 = _multiband_series(
|
||
sorted(s3_comp_dir.glob("composite_*.tif")),
|
||
_date_from_gcc_tif,
|
||
lat,
|
||
lon,
|
||
S3_BAND_NAMES,
|
||
f"{site} S3 bands",
|
||
)
|
||
|
||
# --- Per-metric JSON outputs ---
|
||
_write_json(out_dir / "gcc_phenocam.json", phenocam_series)
|
||
_write_json(out_dir / "gcc_s2.json", s2_series)
|
||
_write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker)
|
||
_write_json(out_dir / "gcc_s3.json", s3_series)
|
||
_write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series)
|
||
_write_json(out_dir / "gcc_fusion_bti.json", bti_series)
|
||
_write_json(out_dir / "gcc_fusion_itb.json", itb_series)
|
||
_write_json(out_dir / "phenocam_images.json", image_list)
|
||
_write_json(out_dir / "bands_s2.json", bands_s2)
|
||
_write_json(out_dir / "bands_s3.json", bands_s3)
|
||
|
||
# --- Raster index for slider ---
|
||
rel_root = DATA_DIR.parent # paths relative to project root
|
||
|
||
# Valid-pixel sets: only show S2/S3 rasters where the center pixel had
|
||
# usable data (non-zero GCC). This excludes cloud-masked / snow-covered
|
||
# scenes that would render as black or visually nonsensical.
|
||
s2_valid_dates = {p["date"].replace("-", "") for p in s2_series}
|
||
s3_valid_dates = {p["date"].replace("-", "") for p in s3_series}
|
||
|
||
s2_refl = [
|
||
r
|
||
for r in _raster_index(
|
||
sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, rel_root
|
||
)
|
||
if r["date"] in s2_valid_dates
|
||
]
|
||
s3_comp = [
|
||
r
|
||
for r in _raster_index(
|
||
sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
|
||
)
|
||
if r["date"] in s3_valid_dates
|
||
]
|
||
s2_gcc = [
|
||
r
|
||
for r in _raster_index(
|
||
sorted(s2_gcc_dir.glob("*_GCC.tif")), _date_from_s2_tif, rel_root
|
||
)
|
||
if r["date"] in s2_valid_dates
|
||
]
|
||
s3_gcc = [
|
||
r
|
||
for r in _raster_index(
|
||
sorted(s3_gcc_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
|
||
)
|
||
if r["date"] in s3_valid_dates
|
||
]
|
||
bti_refl = _raster_index(
|
||
sorted(bti_refl_dir.glob("REFL_*.tif")), _date_from_gcc_tif, rel_root
|
||
)
|
||
itb_gcc = _raster_index(
|
||
sorted(itb_gcc_dir.glob("GCC_*.tif")), _date_from_gcc_tif, rel_root
|
||
)
|
||
|
||
_write_json(out_dir / "rasters_s2_refl.json", s2_refl)
|
||
_write_json(out_dir / "rasters_s3_composite.json", s3_comp)
|
||
_write_json(out_dir / "rasters_s2_gcc.json", s2_gcc)
|
||
_write_json(out_dir / "rasters_s3_gcc.json", s3_gcc)
|
||
_write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl)
|
||
_write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc)
|
||
|
||
# --- Site covariates (heterogeneity + observation density) ---
|
||
_write_json(
|
||
out_dir / "covariates.json",
|
||
compute_covariates(s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon),
|
||
)
|
||
|
||
# --- Validation metrics vs PhenoCam gcc_90 ---
|
||
_write_json(
|
||
out_dir / "metrics.json",
|
||
{
|
||
"bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"),
|
||
"itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"),
|
||
"s2_whittaker": compute_metrics(
|
||
phenocam_series, "gcc_90", s2_whittaker, "gcc"
|
||
),
|
||
"s3_smooth": compute_metrics(
|
||
phenocam_series, "gcc_90", s3_smooth_series, "gcc"
|
||
),
|
||
"s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"),
|
||
"s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"),
|
||
},
|
||
)
|
||
|
||
# Remove legacy bundled outputs if present
|
||
for legacy in ("timeseries.json", "rasters.json"):
|
||
(out_dir / legacy).unlink(missing_ok=True)
|
||
return True
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Manifest
|
||
# ---------------------------------------------------------------------------
|
||
|
||
VEG_TYPE_LABELS = {
|
||
"AG": "Agriculture",
|
||
"DB": "Deciduous broadleaf",
|
||
"DN": "Deciduous needleleaf",
|
||
"EB": "Evergreen broadleaf",
|
||
"EN": "Evergreen needleleaf",
|
||
"GR": "Grassland",
|
||
"MX": "Mixed",
|
||
"SH": "Shrubland",
|
||
"TN": "Tundra",
|
||
"UN": "Unknown",
|
||
"WL": "Wetland",
|
||
"RF": "Reference",
|
||
}
|
||
|
||
|
||
def build_manifest(years: list[int], filter_site: str | None = None) -> dict:
|
||
manifest: dict[str, Any] = {"years": years, "sites": {}}
|
||
|
||
for year in years:
|
||
screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json"
|
||
if not screening_path.is_file():
|
||
continue
|
||
data = json.loads(screening_path.read_text())
|
||
sites_meta: dict[str, Any] = {}
|
||
for entry in data.get("sites", []):
|
||
if entry.get("calculations", {}).get("status") != "PASS":
|
||
continue
|
||
cam = entry.get("response", {}).get("camera", {})
|
||
roi = entry.get("response", {}).get("roi", {})
|
||
calc = entry.get("calculations", {})
|
||
site = cam.get("Sitename", "")
|
||
if not site:
|
||
continue
|
||
if filter_site and site != filter_site:
|
||
continue
|
||
sm = cam.get("sitemetadata", {})
|
||
veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN"
|
||
fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc"
|
||
has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif"))
|
||
sites_meta[site] = {
|
||
"lat": cam.get("Lat"),
|
||
"lon": cam.get("Lon"),
|
||
"veg_type": veg_raw,
|
||
"veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw),
|
||
"description": sm.get("site_description", ""),
|
||
"dominant_species": sm.get("dominant_species", ""),
|
||
"group": sm.get("group", ""),
|
||
"snr": calc.get("snr"),
|
||
"n_gcc_points": calc.get("n_gcc_points"),
|
||
"has_fusion": has_fusion,
|
||
}
|
||
manifest["sites"][str(year)] = sites_meta
|
||
|
||
return manifest
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def main() -> None:
|
||
parser = argparse.ArgumentParser(description=__doc__)
|
||
parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
|
||
parser.add_argument("--site", type=str, default=None)
|
||
args = parser.parse_args()
|
||
|
||
year = args.evaluation_year
|
||
filter_site = args.site
|
||
|
||
out_base = DATA_DIR / "metrics"
|
||
out_base.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Determine years with screening data
|
||
screening_dir = DATA_DIR / "phenocam_screening"
|
||
years = sorted(
|
||
int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit()
|
||
)
|
||
if not years:
|
||
years = [year]
|
||
|
||
print(f"Building manifest for years: {years}")
|
||
manifest = build_manifest(years, filter_site)
|
||
|
||
# Export per-site data for the requested year
|
||
year_sites = manifest["sites"].get(str(year), {})
|
||
fusion_sites = {s: m for s, m in year_sites.items() if m["has_fusion"]}
|
||
|
||
print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}")
|
||
for site, meta in tqdm(fusion_sites.items(), desc="Sites"):
|
||
out_dir = out_base / str(year) / site
|
||
ok = export_site(
|
||
site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points")
|
||
)
|
||
if ok:
|
||
print(f" ✓ {site}")
|
||
else:
|
||
print(f" ✗ {site} — no fusion data found")
|
||
|
||
manifest_path = out_base / "manifest.json"
|
||
if filter_site and manifest_path.is_file():
|
||
# Merge: update only the filtered site(s); preserve all other entries.
|
||
existing: dict = json.loads(manifest_path.read_text())
|
||
for year_key, year_sites_new in manifest["sites"].items():
|
||
existing.setdefault("sites", {}).setdefault(year_key, {}).update(
|
||
year_sites_new
|
||
)
|
||
all_years = sorted(set(existing.get("years", [])) | set(manifest["years"]))
|
||
existing["years"] = all_years
|
||
manifest_path.write_text(json.dumps(existing, separators=(",", ":")))
|
||
else:
|
||
manifest_path.write_text(json.dumps(manifest, separators=(",", ":")))
|
||
print(f"Manifest written → {manifest_path}")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|