"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp. Inputs (``data/``, ``{year}`` = ``--evaluation-year``): - ``phenocam_screening/{year}.json`` — qualifying sites + metadata - ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries - ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters - ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters - ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters - ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters Outputs (``data/metrics/``): - ``manifest.json`` — years + per-site metadata - ``{year}/{site}/gcc_phenocam.json`` — PhenoCam ``gcc_90`` at matched dates - ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes) - ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC - ``{year}/{site}/gcc_s3.json`` — S3 composite GCC - ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average - ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC - ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC - ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer - ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view) - ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view) - ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view) - ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view) - ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths - ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths - ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series - ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene - ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite - ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats CLI: - ``--evaluation-year`` (default 2025) - ``--site`` (optional; default: all qualifying sites with sentinel data) """ from __future__ import annotations import argparse import csv import json import re from pathlib import Path from typing import Any import datetime import numpy as np import rasterio from rasterio.crs import CRS from rasterio.transform import rowcol from pyproj import Transformer from scipy.stats import pearsonr from tqdm import tqdm # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- DATA_DIR = Path("data") DEFAULT_YEAR = 2025 # GCC smoothing window for S3 moving average (days) S3_SMOOTH_WINDOW = 5 # Whittaker lambda (penalised smoothing strength for S2) WHITTAKER_LAMBDA = 400.0 # Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel) SPATIAL_CV_HALF_M = 150 # PhenoCam archive image URL pattern PHENOCAM_IMAGE_URL = ( "https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}" ) # --------------------------------------------------------------------------- # Helpers: raster pixel extraction # --------------------------------------------------------------------------- def _window_mean(data: np.ndarray) -> float | None: """Mean of a pixel window; ``None`` when every value is NaN.""" valid = data[~np.isnan(data)] if valid.size == 0: return None return float(np.mean(valid)) def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None: """Return the 3×3 mean GCC value at (lat, lon) from a single-band raster. Returns ``None`` when the pixel is masked/zero/NaN. """ try: with rasterio.open(path) as src: transformer = Transformer.from_crs( CRS.from_epsg(4326), src.crs, always_xy=True ) x, y = transformer.transform(lon, lat) row, col = rowcol(src.transform, x, y) h, w = src.height, src.width r0, r1 = max(0, row - 1), min(h, row + 2) c0, c1 = max(0, col - 1), min(w, col + 2) window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0) data = src.read(1, window=window).astype(float) nodata = src.nodata if nodata is not None: data = np.where(data == nodata, np.nan, data) data[data == 0] = np.nan return _window_mean(data) except Exception: return None # --------------------------------------------------------------------------- # Helpers: date extraction from filenames # --------------------------------------------------------------------------- def _date_from_gcc_tif(path: Path) -> str | None: """Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``.""" m = re.search(r"(\d{8})", path.stem) return m.group(1) if m else None def _date_from_s2_tif(path: Path) -> str | None: """Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``.""" parts = path.stem.split("_") if len(parts) >= 3: m = re.match(r"(\d{8})", parts[2]) return m.group(1) if m else None return None # --------------------------------------------------------------------------- # Helpers: Whittaker smoother (2nd-order differences, tridiagonal solver) # --------------------------------------------------------------------------- def _whittaker_smooth( values: list[float | None], lam: float = WHITTAKER_LAMBDA ) -> list[float | None]: """Penalised least-squares smoother (Whittaker, 2nd-order differences). Masked (None) values are filled via the smooth and then re-set to None in the output so the caller can distinguish observed from gap-filled points. """ n = len(values) if n < 4: return values[:] obs_mask = [v is not None for v in values] y = np.array([v if v is not None else 0.0 for v in values], dtype=float) w = np.array([1.0 if m else 0.0 for m in obs_mask], dtype=float) W = np.diag(w) D = np.diff(np.eye(n), n=2, axis=0) # (n-2) x n second-difference matrix A = W + lam * D.T @ D try: z = np.linalg.solve(A, w * y) except np.linalg.LinAlgError: return values[:] result: list[float | None] = [] for i, m in enumerate(obs_mask): result.append(float(z[i]) if m else None) return result # --------------------------------------------------------------------------- # Helpers: PhenoCam CSV parsing # --------------------------------------------------------------------------- def _parse_phenocam_csv( csv_path: Path, year: int, site: str ) -> tuple[list[dict], list[dict]]: """Return (gcc_series, image_list) filtered to ``year``. ``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}`` ``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}`` """ gcc_series: list[dict] = [] image_list: list[dict] = [] year_str = str(year) if not csv_path.is_file(): return gcc_series, image_list with csv_path.open() as f: lines = [line for line in f if not line.startswith("#")] reader = csv.DictReader(lines) for row in reader: if row.get("year") != year_str: continue date = row.get("date", "") gcc_raw = row.get("gcc_90") if gcc_raw and gcc_raw not in ("NA", ""): try: gcc_series.append({"date": date, "gcc_90": float(gcc_raw)}) except ValueError: pass fn = row.get("midday_filename", "").strip() if fn and fn != "NA" and date: month = date[5:7] url = PHENOCAM_IMAGE_URL.format( site=site, year=year_str, month=month, filename=fn ) image_list.append({"date": date, "url": url}) return gcc_series, image_list # --------------------------------------------------------------------------- # Helpers: moving average # --------------------------------------------------------------------------- def _moving_average(series: list[dict], value_key: str, window: int) -> list[dict]: """Compute centred moving average; returns new list with ``_smooth`` suffix key.""" if not series: return [] vals = [p[value_key] for p in series] half = window // 2 smoothed = [] for i, pt in enumerate(series): chunk = [v for v in vals[max(0, i - half) : i + half + 1] if v is not None] smoothed.append( { "date": pt["date"], value_key + "_smooth": (sum(chunk) / len(chunk)) if chunk else None, } ) return smoothed # --------------------------------------------------------------------------- # Helpers: validation metrics # --------------------------------------------------------------------------- MATCH_TOLERANCE_DAYS = 5 def compute_metrics( ref: list[dict], ref_key: str, pred: list[dict], pred_key: str, ) -> dict | None: """Compute NSE, RMSE, nRMSE, Pearson r between pred and ref. Each pred point is matched to the nearest ref date within ``MATCH_TOLERANCE_DAYS``. Returns a dict or ``None`` if fewer than 2 matched pairs exist. """ ref_lookup: dict[str, float] = { p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None } if not ref_lookup: return None ref_dates = sorted(ref_lookup) obs, sim = [], [] for pt in pred: v = pt.get(pred_key) if v is None: continue nearest = min( ref_dates, key=lambda d: abs( (np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D") ), ) gap = abs( (np.datetime64(pt["date"]) - np.datetime64(nearest)) / np.timedelta64(1, "D") ) if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup: obs.append(ref_lookup[nearest]) sim.append(v) if len(obs) < 2: return None obs_arr = np.array(obs) sim_arr = np.array(sim) obs_mean = obs_arr.mean() rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2))) nrmse = rmse / obs_mean if obs_mean else None ss_res = float(np.sum((obs_arr - sim_arr) ** 2)) ss_tot = float(np.sum((obs_arr - obs_mean) ** 2)) nse = (1.0 - ss_res / ss_tot) if ss_tot else None r, _ = pearsonr(obs_arr, sim_arr) def _r4(v: float | None) -> float | None: return round(v, 4) if v is not None else None return { "n": len(obs), "rmse": _r4(rmse), "nrmse": _r4(nrmse), "nse": _r4(nse), "r": _r4(float(r)), } S2_BAND_NAMES = ["B02", "B03", "B04"] S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"] def _read_multiband_center( path: Path, lat: float, lon: float, band_names: list[str] ) -> dict[str, float | None]: """Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None.""" try: with rasterio.open(path) as src: transformer = Transformer.from_crs( CRS.from_epsg(4326), src.crs, always_xy=True ) x, y = transformer.transform(lon, lat) row, col = rowcol(src.transform, x, y) h, w = src.height, src.width r0, r1 = max(0, row - 1), min(h, row + 2) c0, c1 = max(0, col - 1), min(w, col + 2) window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0) nodata = src.nodata result = {} for i, name in enumerate(band_names, 1): if i > src.count: result[name] = None continue data = src.read(i, window=window).astype(float) if nodata is not None: data = np.where(data == nodata, np.nan, data) data[data == 0] = np.nan val = _window_mean(data) result[name] = None if val is None else round(val, 6) return result except Exception: return {name: None for name in band_names} def _multiband_series( tif_paths: list[Path], date_fn, lat: float, lon: float, band_names: list[str], desc: str, ) -> list[dict]: """Extract center-pixel values for all bands; return ``[{date, band1, band2, …}]``.""" result = [] for p in tqdm(tif_paths, desc=desc, leave=False): date = date_fn(p) if date is None: continue vals = _read_multiband_center(p, lat, lon, band_names) if any(v is not None for v in vals.values()): result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", **vals}) return sorted(result, key=lambda x: x["date"]) # --------------------------------------------------------------------------- # Helpers: spatial heterogeneity + observation density # --------------------------------------------------------------------------- def _read_footprint_stats( path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M ) -> tuple[float, float] | tuple[None, None]: """Return (mean, std) of valid GCC pixels within a ±half_m metre square window. Returns ``(None, None)`` on any error or when fewer than 4 valid pixels exist. """ try: with rasterio.open(path) as src: transformer = Transformer.from_crs( CRS.from_epsg(4326), src.crs, always_xy=True ) x, y = transformer.transform(lon, lat) res = abs(src.transform.a) # pixel size in CRS units (metres for UTM) half_px = max(1, int(round(half_m / res))) row, col = rowcol(src.transform, x, y) h, w = src.height, src.width r0, r1 = max(0, row - half_px), min(h, row + half_px + 1) c0, c1 = max(0, col - half_px), min(w, col + half_px + 1) window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0) data = src.read(1, window=window).astype(float) nodata = src.nodata if nodata is not None: data = np.where(data == nodata, np.nan, data) data[data <= 0] = np.nan valid = data[~np.isnan(data)] if len(valid) < 4: return None, None return float(np.mean(valid)), float(np.std(valid)) except Exception: return None, None def compute_covariates( s2_gcc_paths: list[Path], s2_series: list[dict], s3_series: list[dict], n_gcc_points: int | None, lat: float, lon: float, ) -> dict: """Compute spatial heterogeneity and temporal observation density covariates.""" # Spatial GCC statistics over ~300 m footprint means, stds = [], [] for p in s2_gcc_paths: m, s = _read_footprint_stats(p, lat, lon) if m is not None and m > 0: means.append(m) stds.append(s) spatial_gcc_cv = ( round(float(np.mean([s / m for s, m in zip(stds, means)])), 4) if means else None ) spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None # S2 temporal gap statistics s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series] if len(s2_dates) >= 2: gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)] s2_mean_gap = round(float(np.mean(gaps)), 1) s2_max_gap = int(max(gaps)) else: s2_mean_gap = None s2_max_gap = None return { "spatial_gcc_cv": spatial_gcc_cv, "spatial_gcc_std": spatial_gcc_std, "s2_scene_count": len(s2_series), "s2_mean_gap_days": s2_mean_gap, "s2_max_gap_days": s2_max_gap, "s3_composite_count": len(s3_series), "n_gcc_points": n_gcc_points, } # --------------------------------------------------------------------------- # Per-site export # --------------------------------------------------------------------------- def _write_json(path: Path, data: Any) -> None: path.write_text(json.dumps(data, separators=(",", ":"))) def _raster_series( tif_paths: list[Path], date_fn, lat: float, lon: float, desc: str, ) -> list[dict]: """Extract center-pixel GCC from each tif, return ``[{date, gcc}]`` sorted.""" result = [] for p in tqdm(tif_paths, desc=desc, leave=False): date = date_fn(p) if date is None: continue val = _read_center_pixel(p, lat, lon) if val is not None: result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", "gcc": val}) return sorted(result, key=lambda x: x["date"]) def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]: """Build raster index: ``[{date, path}]`` sorted by date.""" result = [] for p in tif_paths: date = date_fn(p) if date is None: continue try: rel = str(p.relative_to(rel_root)) except ValueError: rel = str(p) result.append({"date": date, "path": rel}) return sorted(result, key=lambda x: x["date"]) def export_site( site: str, year: int, lat: float, lon: float, out_dir: Path, n_gcc_points: int | None = None, ) -> bool: """Export timeseries.json and rasters.json for one site. Returns True on success.""" sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" fusion_base = DATA_DIR / "fusion" / str(year) / site s2_gcc_dir = sentinel_base / "s2" s3_gcc_dir = sentinel_base / "gcc_s3" bti_gcc_dir = fusion_base / "bti" / "gcc" itb_gcc_dir = fusion_base / "itb" / "fusion" # Raster slider sources s2_refl_dir = sentinel_base / "s2" s3_comp_dir = sentinel_base / "s3" bti_refl_dir = fusion_base / "bti" / "fusion" has_fusion = bti_gcc_dir.is_dir() and any(bti_gcc_dir.glob("GCC_*.tif")) if not has_fusion: return False out_dir.mkdir(parents=True, exist_ok=True) # --- GCC timeseries from rasters --- s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif")) s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif")) bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif")) itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif")) s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2") s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3") bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI") itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB") # Whittaker on S2 s2_vals = [p["gcc"] for p in s2_series] s2_smooth_vals = _whittaker_smooth(s2_vals) s2_whittaker = [ {"date": p["date"], "gcc": v} for p, v in zip(s2_series, s2_smooth_vals) if v is not None ] # S3 5-day moving average s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW) # PhenoCam CSV csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv" phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site) s3_smooth_series = [ {"date": p["date"], "gcc": p["gcc_smooth"]} for p in s3_smooth if p.get("gcc_smooth") is not None ] # Band reflectance timeseries (multi-band center-pixel) bands_s2 = _multiband_series( sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, lat, lon, S2_BAND_NAMES, f"{site} S2 bands", ) bands_s3 = _multiband_series( sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, lat, lon, S3_BAND_NAMES, f"{site} S3 bands", ) # --- Per-metric JSON outputs --- _write_json(out_dir / "gcc_phenocam.json", phenocam_series) _write_json(out_dir / "gcc_s2.json", s2_series) _write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker) _write_json(out_dir / "gcc_s3.json", s3_series) _write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series) _write_json(out_dir / "gcc_fusion_bti.json", bti_series) _write_json(out_dir / "gcc_fusion_itb.json", itb_series) _write_json(out_dir / "phenocam_images.json", image_list) _write_json(out_dir / "bands_s2.json", bands_s2) _write_json(out_dir / "bands_s3.json", bands_s3) # --- Raster index for slider --- rel_root = DATA_DIR.parent # paths relative to project root # Valid-pixel sets: only show S2/S3 rasters where the center pixel had # usable data (non-zero GCC). This excludes cloud-masked / snow-covered # scenes that would render as black or visually nonsensical. s2_valid_dates = {p["date"].replace("-", "") for p in s2_series} s3_valid_dates = {p["date"].replace("-", "") for p in s3_series} s2_refl = [ r for r in _raster_index( sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, rel_root ) if r["date"] in s2_valid_dates ] s3_comp = [ r for r in _raster_index( sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root ) if r["date"] in s3_valid_dates ] s2_gcc = [ r for r in _raster_index( sorted(s2_gcc_dir.glob("*_GCC.tif")), _date_from_s2_tif, rel_root ) if r["date"] in s2_valid_dates ] s3_gcc = [ r for r in _raster_index( sorted(s3_gcc_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root ) if r["date"] in s3_valid_dates ] bti_refl = _raster_index( sorted(bti_refl_dir.glob("REFL_*.tif")), _date_from_gcc_tif, rel_root ) itb_gcc = _raster_index( sorted(itb_gcc_dir.glob("GCC_*.tif")), _date_from_gcc_tif, rel_root ) _write_json(out_dir / "rasters_s2_refl.json", s2_refl) _write_json(out_dir / "rasters_s3_composite.json", s3_comp) _write_json(out_dir / "rasters_s2_gcc.json", s2_gcc) _write_json(out_dir / "rasters_s3_gcc.json", s3_gcc) _write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl) _write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc) # --- Site covariates (heterogeneity + observation density) --- _write_json( out_dir / "covariates.json", compute_covariates(s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon), ) # --- Validation metrics vs PhenoCam gcc_90 --- _write_json( out_dir / "metrics.json", { "bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"), "itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"), "s2_whittaker": compute_metrics( phenocam_series, "gcc_90", s2_whittaker, "gcc" ), "s3_smooth": compute_metrics( phenocam_series, "gcc_90", s3_smooth_series, "gcc" ), "s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"), "s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"), }, ) # Remove legacy bundled outputs if present for legacy in ("timeseries.json", "rasters.json"): (out_dir / legacy).unlink(missing_ok=True) return True # --------------------------------------------------------------------------- # Manifest # --------------------------------------------------------------------------- VEG_TYPE_LABELS = { "AG": "Agriculture", "DB": "Deciduous broadleaf", "DN": "Deciduous needleleaf", "EB": "Evergreen broadleaf", "EN": "Evergreen needleleaf", "GR": "Grassland", "MX": "Mixed", "SH": "Shrubland", "TN": "Tundra", "UN": "Unknown", "WL": "Wetland", "RF": "Reference", } def build_manifest(years: list[int], filter_site: str | None = None) -> dict: manifest: dict[str, Any] = {"years": years, "sites": {}} for year in years: screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json" if not screening_path.is_file(): continue data = json.loads(screening_path.read_text()) sites_meta: dict[str, Any] = {} for entry in data.get("sites", []): if entry.get("calculations", {}).get("status") != "PASS": continue cam = entry.get("response", {}).get("camera", {}) roi = entry.get("response", {}).get("roi", {}) calc = entry.get("calculations", {}) site = cam.get("Sitename", "") if not site: continue if filter_site and site != filter_site: continue sm = cam.get("sitemetadata", {}) veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN" fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc" has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif")) sites_meta[site] = { "lat": cam.get("Lat"), "lon": cam.get("Lon"), "veg_type": veg_raw, "veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw), "description": sm.get("site_description", ""), "dominant_species": sm.get("dominant_species", ""), "group": sm.get("group", ""), "snr": calc.get("snr"), "n_gcc_points": calc.get("n_gcc_points"), "has_fusion": has_fusion, } manifest["sites"][str(year)] = sites_meta return manifest # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR) parser.add_argument("--site", type=str, default=None) args = parser.parse_args() year = args.evaluation_year filter_site = args.site out_base = DATA_DIR / "metrics" out_base.mkdir(parents=True, exist_ok=True) # Determine years with screening data screening_dir = DATA_DIR / "phenocam_screening" years = sorted( int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit() ) if not years: years = [year] print(f"Building manifest for years: {years}") manifest = build_manifest(years, filter_site) # Export per-site data for the requested year year_sites = manifest["sites"].get(str(year), {}) fusion_sites = {s: m for s, m in year_sites.items() if m["has_fusion"]} if filter_site: fusion_sites = {s: m for s, m in fusion_sites.items() if s == filter_site} print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}") for site, meta in tqdm(fusion_sites.items(), desc="Sites"): out_dir = out_base / str(year) / site ok = export_site( site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points") ) if ok: print(f" ✓ {site}") else: print(f" ✗ {site} — no fusion data found") manifest_path = out_base / "manifest.json" manifest_path.write_text(json.dumps(manifest, separators=(",", ":"))) print(f"Manifest written → {manifest_path}") if __name__ == "__main__": main()