diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..369ceb2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,75 @@
+# Generated caches and downloads (regenerate via pipeline steps)
+data/
+
+# Environment and secrets
+.env
+.env.*
+!.env.example
+.venv/
+venv/
+env/
+ENV/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.pyd
+.Python
+
+# uv
+.uv/
+
+# Testing & coverage
+.pytest_cache/
+.coverage
+.coverage.*
+coverage.xml
+nosetests.xml
+htmlcov/
+.tox/
+
+# Linting & type checking
+.ruff_cache/
+.mypy_cache/
+.dmypy.json
+dmypy.json
+.pytype/
+
+# Distribution / packaging
+build/
+dist/
+*.egg-info/
+*.egg
+MANIFEST
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Logs
+*.log
+nohup.out
+
+# Jupyter
+.ipynb_checkpoints/
+profile_default/
+ipython_config.py
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+desktop.ini
+
+# Merge artifacts
+*.orig
+*.rej
+
+# Agents
+AGENTS.md
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..6ba9060
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,8 @@
+repos:
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.8.4
+ hooks:
+ - id: ruff
+ args: [--fix]
+ - id: ruff-format
+
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..3e72aa6
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.11.10
diff --git a/1-phenocam.py b/1-phenocam.py
new file mode 100644
index 0000000..89ca797
--- /dev/null
+++ b/1-phenocam.py
@@ -0,0 +1,284 @@
+"""Step 1: download worldwide PhenoCam sites for a calendar year.
+
+Inputs (``data/``): none — queries the PhenoCam API.
+
+Outputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``phenocam/{year}.json`` — site list manifest
+- ``phenocam/{year}/{sitename}.json`` — camera + ROI metadata
+- ``phenocam/{year}/{sitename}_1day.csv`` — ``one_day_summary`` GCC CSV
+
+CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional comma-separated filter).
+
+Next step: :mod:`2-phenocam-screening`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+import requests
+
+PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
+if str(PROCESSING_DIR) not in sys.path:
+ sys.path.insert(0, str(PROCESSING_DIR))
+
+from acquisition_phenocam import PHENOCAM_API # noqa: E402
+from acquisition_phenocam_all_europe import _paginate_cameras, _parse_iso_date # noqa: E402
+
+EVALUATION_YEAR = 2025
+HOST_PROBE = "https://phenocam.nau.edu/api/cameras/?limit=1"
+ONE_DAY_CSV_SUFFIX = "_1day.csv"
+
+
+def check_phenocam_host() -> None:
+ try:
+ response = requests.get(HOST_PROBE, timeout=30)
+ response.raise_for_status()
+ except requests.RequestException as exc:
+ raise RuntimeError(
+ f"PhenoCam API unreachable (phenocam.nau.edu): "
+ f"{exc.__class__.__name__}: {exc}"
+ ) from exc
+
+
+def _overlaps_year(first: str | None, last: str | None, season: int) -> bool:
+ start = _parse_iso_date(first)
+ end = _parse_iso_date(last)
+ if start is None or end is None:
+ return False
+ return start <= date(season, 12, 31) and end >= date(season, 1, 1)
+
+
+def sites_dir(cache_dir: Path, evaluation_year: int) -> Path:
+ return cache_dir / "phenocam" / str(evaluation_year)
+
+
+def site_json_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
+ return sites_dir(cache_dir, evaluation_year) / f"{sitename}.json"
+
+
+def site_csv_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
+ return sites_dir(cache_dir, evaluation_year) / f"{sitename}{ONE_DAY_CSV_SUFFIX}"
+
+
+def load_candidate_cameras(
+ evaluation_year: int,
+ *,
+ site_filter: set[str] | None = None,
+ active_only: bool = False,
+ limit: int | None = None,
+) -> list[dict[str, Any]]:
+ cameras: list[dict[str, Any]] = []
+ for camera in _paginate_cameras():
+ if active_only and not camera.get("active"):
+ continue
+ sitename = str(camera["Sitename"])
+ if site_filter is not None and sitename not in site_filter:
+ continue
+ if not _overlaps_year(
+ camera.get("date_first"), camera.get("date_last"), evaluation_year
+ ):
+ continue
+ cameras.append(dict(camera))
+ cameras.sort(key=lambda item: str(item["Sitename"]))
+ if limit is not None:
+ cameras = cameras[:limit]
+ return cameras
+
+
+def fetch_roi_record(site_name: str) -> dict[str, Any] | None:
+ rois: list[dict[str, Any]] = []
+ url = f"{PHENOCAM_API}/roilists/"
+ params: dict[str, Any] | None = {"site": site_name}
+ while url:
+ response = requests.get(url, params=params, timeout=60)
+ response.raise_for_status()
+ payload = response.json()
+ rois.extend(
+ item for item in payload.get("results", []) if item.get("site") == site_name
+ )
+ url = payload.get("next")
+ params = None
+ if rois:
+ break
+ return dict(rois[0]) if rois else None
+
+
+def download_one_day_csv(csv_url: str, output_path: Path) -> None:
+ response = requests.get(csv_url, timeout=60)
+ response.raise_for_status()
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text(response.text, encoding="utf-8")
+
+
+def download_site(
+ camera: dict[str, Any],
+ evaluation_year: int,
+ cache_dir: Path,
+) -> str:
+ sitename = str(camera["Sitename"])
+ roi = fetch_roi_record(sitename)
+ payload = {"response": {"camera": camera, "roi": roi}}
+ json_path = site_json_path(cache_dir, evaluation_year, sitename)
+ json_path.parent.mkdir(parents=True, exist_ok=True)
+ json_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+
+ csv_url = roi.get("one_day_summary") if roi else None
+ if csv_url:
+ download_one_day_csv(
+ csv_url, site_csv_path(cache_dir, evaluation_year, sitename)
+ )
+ return sitename
+
+
+def load_or_download_site(
+ camera: dict[str, Any],
+ evaluation_year: int,
+ cache_dir: Path,
+ *,
+ refresh: bool,
+) -> str:
+ sitename = str(camera["Sitename"])
+ json_path = site_json_path(cache_dir, evaluation_year, sitename)
+ csv_path = site_csv_path(cache_dir, evaluation_year, sitename)
+ if not refresh and json_path.is_file():
+ if not csv_path.is_file():
+ payload = json.loads(json_path.read_text(encoding="utf-8"))
+ roi = payload.get("response", {}).get("roi") or {}
+ csv_url = roi.get("one_day_summary")
+ if csv_url:
+ download_one_day_csv(csv_url, csv_path)
+ return sitename
+ return download_site(camera, evaluation_year, cache_dir)
+
+
+def run_download(
+ *,
+ cache_dir: Path,
+ evaluation_year: int,
+ active_only: bool = False,
+ site_filter: set[str] | None = None,
+ limit: int | None = None,
+ refresh: bool = False,
+) -> list[str]:
+ check_phenocam_host()
+ candidates = load_candidate_cameras(
+ evaluation_year,
+ site_filter=site_filter,
+ active_only=active_only,
+ limit=limit,
+ )
+ print(
+ f"[PhenoCam-1] {len(candidates)} candidate(s) with archive overlap for "
+ f"{evaluation_year}"
+ )
+
+ sitenames: list[str] = []
+ for index, camera in enumerate(candidates, start=1):
+ sitename = str(camera["Sitename"])
+ print(
+ f"[PhenoCam-1] ({index}/{len(candidates)}) {sitename} "
+ f"({float(camera['Lat']):.4f}, {float(camera['Lon']):.4f})"
+ )
+ sitenames.append(
+ load_or_download_site(
+ camera,
+ evaluation_year,
+ cache_dir,
+ refresh=refresh,
+ )
+ )
+ return sorted(sitenames)
+
+
+def write_manifest(
+ sitenames: list[str],
+ output_path: Path,
+ cache_dir: Path,
+ evaluation_year: int,
+) -> None:
+ rel_sites_dir = sites_dir(cache_dir, evaluation_year).relative_to(
+ output_path.parent
+ )
+ payload = {
+ "evaluation_year": evaluation_year,
+ "count": len(sitenames),
+ "sites_dir": rel_sites_dir.as_posix(),
+ "sites": sitenames,
+ }
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+ print(f"[PhenoCam-1] Wrote {output_path}")
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--cache-dir",
+ type=Path,
+ default=Path("data"),
+ help="Base directory for per-site files and manifest",
+ )
+ parser.add_argument(
+ "--evaluation-year",
+ type=int,
+ default=EVALUATION_YEAR,
+ help=f"Calendar year to download (default: {EVALUATION_YEAR})",
+ )
+ parser.add_argument(
+ "--active-only",
+ action="store_true",
+ help="Restrict candidates to cameras marked active in the API",
+ )
+ parser.add_argument(
+ "--limit",
+ type=int,
+ default=None,
+ help="Process only the first N candidate sites (testing)",
+ )
+ parser.add_argument(
+ "--sites",
+ type=str,
+ default=None,
+ help="Comma-separated sitenames to download (testing)",
+ )
+ parser.add_argument(
+ "--refresh",
+ action="store_true",
+ help="Re-download sites even when cache files exist",
+ )
+ parser.add_argument(
+ "--output-json",
+ type=Path,
+ default=None,
+ help="Manifest output path (default: data/phenocam/{year}.json)",
+ )
+ args = parser.parse_args(argv)
+
+ site_filter = None
+ if args.sites:
+ site_filter = {name.strip() for name in args.sites.split(",") if name.strip()}
+
+ sitenames = run_download(
+ cache_dir=args.cache_dir,
+ evaluation_year=args.evaluation_year,
+ active_only=args.active_only,
+ site_filter=site_filter,
+ limit=args.limit,
+ refresh=args.refresh,
+ )
+ manifest_path = args.output_json or (
+ args.cache_dir / "phenocam" / f"{args.evaluation_year}.json"
+ )
+ write_manifest(sitenames, manifest_path, args.cache_dir, args.evaluation_year)
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/2-phenocam-screening.py b/2-phenocam-screening.py
new file mode 100644
index 0000000..5848ae9
--- /dev/null
+++ b/2-phenocam-screening.py
@@ -0,0 +1,494 @@
+"""Step 2: PhenoCam GCC + SNR screening on step-1 cache.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``phenocam/{year}.json`` — step-1 manifest
+- ``phenocam/{year}/{sitename}.json`` — per-site metadata
+- ``phenocam/{year}/{sitename}_1day.csv`` — GCC timeseries
+
+Outputs (``data/phenocam_screening/``):
+
+- ``{year}.json`` — full per-site results
+- ``{year}.csv`` — flat summary table
+
+CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional; default: all manifest sites).
+
+Next step: :mod:`3-sentinel-clouds`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import math
+import sys
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from scipy.interpolate import UnivariateSpline
+
+PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
+if str(PROCESSING_DIR) not in sys.path:
+ sys.path.insert(0, str(PROCESSING_DIR))
+
+from acquisition_phenocam import _phenocam_summary_gcc_value # noqa: E402
+
+MIN_GCC_POINTS = 30
+SNR_THRESHOLD = 2.0
+CLUSTER_RADIUS_M = 500.0
+GATE_ORDER = ("phenocam", "snr", "cluster")
+ONE_DAY_CSV_SUFFIX = "_1day.csv"
+_EARTH_RADIUS_M = 6371000.0
+
+
+def load_manifest(path: Path) -> dict[str, Any]:
+ payload = json.loads(path.read_text(encoding="utf-8"))
+ for key in ("evaluation_year", "sites_dir", "sites"):
+ if key not in payload:
+ raise ValueError(f"Expected '{key}' in manifest {path}")
+ return payload
+
+
+def resolve_sites_dir(manifest_path: Path, manifest: dict[str, Any]) -> Path:
+ return (manifest_path.parent / manifest["sites_dir"]).resolve()
+
+
+def load_site_entry(sites_dir: Path, sitename: str) -> dict[str, Any]:
+ json_path = sites_dir / f"{sitename}.json"
+ payload = json.loads(json_path.read_text(encoding="utf-8"))
+ csv_path = sites_dir / f"{sitename}{ONE_DAY_CSV_SUFFIX}"
+ payload["_one_day_csv"] = csv_path if csv_path.is_file() else None
+ return payload
+
+
+def parse_gcc90_series(csv_path: Path, evaluation_year: int) -> list[tuple[str, float]]:
+ lines = [
+ line
+ for line in csv_path.read_text(encoding="utf-8").split("\n")
+ if line and not line.startswith("#")
+ ]
+ reader = csv.DictReader(lines)
+ fieldnames = reader.fieldnames or ()
+ use_mean_fallback = "gcc_90" not in fieldnames
+
+ year_start = date(evaluation_year, 1, 1)
+ year_end = date(evaluation_year, 12, 31)
+ series: list[tuple[str, float]] = []
+ for row in reader:
+ date_str = row.get("date")
+ if not date_str:
+ continue
+ try:
+ row_date = datetime.strptime(date_str, "%Y-%m-%d").date()
+ except ValueError:
+ continue
+ if not (year_start <= row_date <= year_end):
+ continue
+ gcc = _phenocam_summary_gcc_value(row, use_mean_fallback)
+ if gcc is None:
+ continue
+ series.append((row_date.isoformat(), float(gcc)))
+ series.sort(key=lambda item: item[0])
+ return series
+
+
+def _months_covered(day_strings: list[str]) -> int:
+ months: set[int] = set()
+ for day in day_strings:
+ months.add(datetime.strptime(day, "%Y-%m-%d").month)
+ return len(months)
+
+
+def _aic_for_spline(x: np.ndarray, y: np.ndarray, spline: UnivariateSpline) -> float:
+ residuals = y - spline(x)
+ rss = float(np.sum(residuals**2))
+ n = len(y)
+ if rss <= 0 or n < 4:
+ return math.inf
+ edf = float(spline.get_knots().shape[0] + spline.get_coeffs().shape[0])
+ return n * math.log(rss / n) + 2.0 * edf
+
+
+def compute_snr_aic_spline(series: list[tuple[str, float]]) -> float | None:
+ if len(series) < MIN_GCC_POINTS:
+ return None
+
+ dates = [datetime.strptime(day, "%Y-%m-%d").date() for day, _ in series]
+ x = np.array([(d - dates[0]).days for d in dates], dtype=float)
+ y = np.array([value for _, value in series], dtype=float)
+ if len(np.unique(x)) < 5:
+ return None
+
+ y_var = float(np.var(y))
+ if y_var <= 0:
+ return None
+
+ candidates = np.logspace(-4, 2, 40) * y_var * len(y)
+ best_spline: UnivariateSpline | None = None
+ best_aic = math.inf
+ for smoothing in candidates:
+ try:
+ spline = UnivariateSpline(x, y, k=3, s=float(smoothing))
+ except Exception:
+ continue
+ aic = _aic_for_spline(x, y, spline)
+ if aic < best_aic:
+ best_aic = aic
+ best_spline = spline
+
+ if best_spline is None:
+ return None
+
+ residuals = y - best_spline(x)
+ rmse = float(np.sqrt(np.mean(residuals**2)))
+ amplitude = float(np.max(y) - np.min(y))
+ if rmse <= 0:
+ return None
+ return amplitude / rmse
+
+
+def screen_site(
+ site_entry: dict[str, Any],
+ *,
+ evaluation_year: int,
+ min_gcc_points: int,
+ snr_threshold: float,
+) -> dict[str, Any]:
+ response = site_entry["response"]
+ roi = response.get("roi")
+ csv_path = site_entry.get("_one_day_csv")
+ calculations: dict[str, Any] = {
+ "evaluation_year": evaluation_year,
+ "n_gcc_points": 0,
+ "first_gcc_date": None,
+ "last_gcc_date": None,
+ "months_with_gcc": 0,
+ "snr": None,
+ "min_gcc_points": min_gcc_points,
+ "snr_threshold": snr_threshold,
+ "status": "FAIL",
+ "failing_gate": None,
+ "passed_gates": [],
+ "reason": None,
+ }
+
+ if roi is None or not roi.get("one_day_summary") or csv_path is None:
+ calculations["failing_gate"] = "phenocam"
+ calculations["reason"] = "no_roi"
+ return {"response": response, "calculations": calculations}
+
+ series = parse_gcc90_series(csv_path, evaluation_year)
+ calculations["n_gcc_points"] = len(series)
+ if calculations["n_gcc_points"] == 0:
+ calculations["failing_gate"] = "phenocam"
+ calculations["reason"] = "no_gcc_in_year"
+ return {"response": response, "calculations": calculations}
+
+ day_strings = [day for day, _ in series]
+ calculations["first_gcc_date"] = day_strings[0]
+ calculations["last_gcc_date"] = day_strings[-1]
+ calculations["months_with_gcc"] = _months_covered(day_strings)
+
+ if calculations["n_gcc_points"] < min_gcc_points:
+ calculations["failing_gate"] = "phenocam"
+ calculations["reason"] = "insufficient_gcc_points"
+ return {"response": response, "calculations": calculations}
+
+ calculations["passed_gates"].append("phenocam")
+
+ snr = compute_snr_aic_spline(series)
+ calculations["snr"] = snr
+ if snr is None or snr < snr_threshold:
+ calculations["failing_gate"] = "snr"
+ calculations["reason"] = (
+ "insufficient_snr" if snr is not None else "snr_undefined"
+ )
+ return {"response": response, "calculations": calculations}
+
+ calculations["passed_gates"].append("snr")
+ calculations["status"] = "PASS"
+ calculations["failing_gate"] = None
+ calculations["reason"] = None
+ return {"response": response, "calculations": calculations}
+
+
+def _haversine_m(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
+ p1, p2 = math.radians(lat1), math.radians(lat2)
+ dlat = math.radians(lat2 - lat1)
+ dlon = math.radians(lon2 - lon1)
+ a = math.sin(dlat / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlon / 2) ** 2
+ return 2 * _EARTH_RADIUS_M * math.asin(math.sqrt(a))
+
+
+def _site_coords(row: dict[str, Any]) -> tuple[float, float] | None:
+ camera = row["response"]["camera"]
+ lat, lon = camera.get("Lat"), camera.get("Lon")
+ if lat is None or lon is None:
+ return None
+ return float(lat), float(lon)
+
+
+def _cluster_rank(row: dict[str, Any]) -> tuple[int, float]:
+ calc = row["calculations"]
+ return calc["n_gcc_points"], float(calc.get("snr") or 0.0)
+
+
+def apply_cluster_gate(results: list[dict[str, Any]], *, radius_m: float) -> int:
+ pool: list[tuple[int, float, float]] = []
+ for idx, row in enumerate(results):
+ if "snr" not in row["calculations"]["passed_gates"]:
+ continue
+ coords = _site_coords(row)
+ if coords is None:
+ row["calculations"]["passed_gates"].append("cluster")
+ continue
+ pool.append((idx, coords[0], coords[1]))
+
+ n = len(pool)
+ parent = list(range(n))
+
+ def find(x: int) -> int:
+ while parent[x] != x:
+ parent[x] = parent[parent[x]]
+ x = parent[x]
+ return x
+
+ def union(a: int, b: int) -> None:
+ ra, rb = find(a), find(b)
+ if ra != rb:
+ parent[rb] = ra
+
+ for i in range(n):
+ _, lat1, lon1 = pool[i]
+ for j in range(i + 1, n):
+ _, lat2, lon2 = pool[j]
+ if _haversine_m(lat1, lon1, lat2, lon2) <= radius_m:
+ union(i, j)
+
+ clusters: dict[int, list[int]] = {}
+ for i in range(n):
+ clusters.setdefault(find(i), []).append(i)
+
+ demoted = 0
+ for members in clusters.values():
+ result_indices = [pool[i][0] for i in members]
+ cluster_size = len(result_indices)
+ winner_idx = max(result_indices, key=lambda idx: _cluster_rank(results[idx]))
+ winner_name = str(results[winner_idx]["response"]["camera"]["Sitename"])
+ for idx in result_indices:
+ calc = results[idx]["calculations"]
+ calc["cluster_size"] = cluster_size
+ if idx == winner_idx:
+ calc["passed_gates"].append("cluster")
+ else:
+ calc["status"] = "FAIL"
+ calc["failing_gate"] = "cluster"
+ calc["reason"] = "nearby_duplicate"
+ calc["cluster_winner"] = winner_name
+ demoted += 1
+ return demoted
+
+
+def run_screening(
+ manifest: dict[str, Any],
+ sites_dir: Path,
+ *,
+ evaluation_year: int,
+ min_gcc_points: int,
+ snr_threshold: float,
+ site_filter: set[str] | None = None,
+) -> list[dict[str, Any]]:
+ results: list[dict[str, Any]] = []
+ sitenames = manifest["sites"]
+ if site_filter is not None:
+ sitenames = [name for name in sitenames if name in site_filter]
+ for index, sitename in enumerate(sitenames, start=1):
+ print(f"[PhenoCam-2] ({index}/{len(sitenames)}) {sitename}")
+ site_entry = load_site_entry(sites_dir, sitename)
+ results.append(
+ screen_site(
+ site_entry,
+ evaluation_year=evaluation_year,
+ min_gcc_points=min_gcc_points,
+ snr_threshold=snr_threshold,
+ )
+ )
+ return results
+
+
+def print_summary(results: list[dict[str, Any]], evaluation_year: int) -> None:
+ passing = [row for row in results if row["calculations"]["status"] == "PASS"]
+ gates_label = " + ".join(GATE_ORDER)
+ print(
+ f"\n[PhenoCam-2] Screening for {evaluation_year}: "
+ f"{len(passing)}/{len(results)} pass ({gates_label})"
+ )
+
+ for gate in GATE_ORDER:
+ fails = sum(1 for row in results if row["calculations"]["failing_gate"] == gate)
+ after = sum(1 for row in results if gate in row["calculations"]["passed_gates"])
+ print(f" after_{gate}: {after}, fail_at_{gate}: {fails}")
+
+ print("\nPer-site table")
+ print(f"{'site':<24} {'n':>4} {'mon':>3} {'snr':>6} {'status':>6} gate reason")
+ print("-" * 72)
+ for row in sorted(
+ results,
+ key=lambda item: str(item["response"]["camera"]["Sitename"]),
+ ):
+ camera = row["response"]["camera"]
+ calc = row["calculations"]
+ snr_text = f"{calc['snr']:.2f}" if calc["snr"] is not None else ""
+ print(
+ f"{camera['Sitename']:<24} {calc['n_gcc_points']:4d} "
+ f"{calc['months_with_gcc']:3d} {snr_text:>6} "
+ f"{calc['status']:>6} {(calc['failing_gate'] or '-'):<8} "
+ f"{calc['reason'] or '-'}"
+ )
+
+
+def write_screening_json(
+ results: list[dict[str, Any]],
+ output_path: Path,
+ evaluation_year: int,
+) -> None:
+ passing = [row for row in results if row["calculations"]["status"] == "PASS"]
+ payload = {
+ "evaluation_year": evaluation_year,
+ "count": len(results),
+ "qualifying_count": len(passing),
+ "sites": sorted(
+ results,
+ key=lambda item: str(item["response"]["camera"]["Sitename"]),
+ ),
+ }
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+ print(f"[PhenoCam-2] Wrote {output_path}")
+
+
+def write_screening_csv(results: list[dict[str, Any]], output_path: Path) -> None:
+ rows: list[dict[str, Any]] = []
+ for row in results:
+ camera = row["response"]["camera"]
+ metadata = camera.get("sitemetadata") or {}
+ roi = row["response"].get("roi") or {}
+ calc = row["calculations"]
+ rows.append(
+ {
+ "Sitename": camera.get("Sitename"),
+ "Lat": camera.get("Lat"),
+ "Lon": camera.get("Lon"),
+ "site_description": metadata.get("site_description"),
+ "primary_veg_type": metadata.get("primary_veg_type"),
+ "site_type": metadata.get("site_type"),
+ "one_day_summary": roi.get("one_day_summary"),
+ **calc,
+ }
+ )
+ fieldnames = list(rows[0].keys()) if rows else ["Sitename", "status"]
+ if rows:
+ extra = [k for row in rows for k in row if k not in fieldnames]
+ fieldnames.extend(dict.fromkeys(extra))
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with output_path.open("w", encoding="utf-8", newline="") as handle:
+ writer = csv.DictWriter(handle, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+ print(f"[PhenoCam-2] Wrote {output_path}")
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--evaluation-year",
+ type=int,
+ default=2025,
+ help="Evaluation year (default: 2025)",
+ )
+ parser.add_argument(
+ "--sites",
+ type=str,
+ default=None,
+ help="Comma-separated sitenames (default: all sites in step-1 manifest)",
+ )
+ parser.add_argument(
+ "--min-gcc-points",
+ type=int,
+ default=MIN_GCC_POINTS,
+ help=f"Minimum valid gcc_90 observations in-year (default: {MIN_GCC_POINTS})",
+ )
+ parser.add_argument(
+ "--snr-threshold",
+ type=float,
+ default=SNR_THRESHOLD,
+ help=f"Minimum AIC-spline SNR (default: {SNR_THRESHOLD})",
+ )
+ parser.add_argument(
+ "--output-json",
+ type=Path,
+ default=None,
+ help="Screening output (default: data/phenocam_screening/{year}.json)",
+ )
+ parser.add_argument(
+ "--output-csv",
+ type=Path,
+ default=None,
+ help="Flat CSV summary path",
+ )
+ parser.add_argument(
+ "--cluster-radius-m",
+ type=float,
+ default=CLUSTER_RADIUS_M,
+ help=f"Deduplicate SNR-passed sites within this radius (default: {CLUSTER_RADIUS_M})",
+ )
+ parser.add_argument(
+ "--no-cluster",
+ action="store_true",
+ help="Skip nearby-site deduplication gate",
+ )
+ args = parser.parse_args(argv)
+
+ evaluation_year = args.evaluation_year
+ manifest_path = Path("data") / "phenocam" / f"{evaluation_year}.json"
+ if not manifest_path.is_file():
+ raise SystemExit(f"Step-1 manifest not found: {manifest_path}")
+
+ site_filter = None
+ if args.sites:
+ site_filter = {name.strip() for name in args.sites.split(",") if name.strip()}
+
+ manifest = load_manifest(manifest_path)
+ sites_dir_path = resolve_sites_dir(manifest_path, manifest)
+
+ results = run_screening(
+ manifest,
+ sites_dir_path,
+ evaluation_year=evaluation_year,
+ min_gcc_points=args.min_gcc_points,
+ snr_threshold=args.snr_threshold,
+ site_filter=site_filter,
+ )
+ if not args.no_cluster:
+ demoted = apply_cluster_gate(results, radius_m=args.cluster_radius_m)
+ if demoted:
+ print(f"[PhenoCam-2] Cluster dedup: demoted {demoted} nearby duplicate(s)")
+ print_summary(results, evaluation_year)
+
+ default_dir = Path("data") / "phenocam_screening"
+ json_name = f"{evaluation_year}.json"
+ csv_name = f"{evaluation_year}.csv"
+ write_screening_json(
+ results,
+ args.output_json or (default_dir / json_name),
+ evaluation_year,
+ )
+ write_screening_csv(results, args.output_csv or (default_dir / csv_name))
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/3-sentinel-data.py b/3-sentinel-data.py
new file mode 100644
index 0000000..75877bf
--- /dev/null
+++ b/3-sentinel-data.py
@@ -0,0 +1,931 @@
+"""Step 3: Download S2 and S3 rasters and prepare EFAST inputs.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``phenocam_screening/{year}.json`` — step-2 PASS sites (coordinates included)
+
+Outputs (``data/``):
+
+- ``sentinel_data/{year}/{sitename}/raw/s3/*.tif`` — S3 SYN L2 per-date GeoTIFFs
+- ``sentinel_data/{year}/{sitename}/prepared/s2/`` — S2 REFL + DIST_CLOUD GeoTIFFs
+- ``sentinel_data/{year}/{sitename}/prepared/s3/`` — S3 composite GeoTIFFs
+- ``sentinel_data/{year}/{sitename}/data.json`` — run summary
+
+Requires ``CDSE_USER`` / ``CDSE_PASSWORD`` in ``../.env`` (workspace root; ``uv sync`` installs efast).
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--site`` (optional; default: all step-2 PASS sites)
+
+Prior step: :mod:`2-phenocam-screening`.
+Next step: :mod:`4-fusion`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import netCDF4
+import numpy as np
+import openeo
+import rasterio
+import requests
+from dotenv import load_dotenv
+from pystac_client import Client
+from rasterio import shutil as rio_shutil
+from rasterio.enums import Resampling
+from rasterio.errors import WindowError
+from rasterio.transform import from_bounds
+from rasterio.vrt import WarpedVRT
+from rasterio.warp import transform_geom
+from rasterio.windows import Window
+from rasterio.windows import from_bounds as window_from_bounds
+from rasterio.windows import transform as window_transform
+from shapely import wkt as shapely_wkt
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# Public constants — edit here to change pipeline behaviour
+# ---------------------------------------------------------------------------
+
+S2_BANDS = ["B02", "B03", "B04"]
+
+S3_BANDS = [
+ "Syn_Oa04_reflectance",
+ "Syn_Oa06_reflectance",
+ "Syn_Oa08_reflectance",
+ "Syn_Oa17_reflectance",
+]
+S3_BAND_NAMES = ["SDR_Oa04", "SDR_Oa06", "SDR_Oa08", "SDR_Oa17"]
+
+RESOLUTION_RATIO = 30
+S3_MOSAIC_DAYS = 100
+S3_COMPOSITE_STEP = 2
+S3_COMPOSITE_SIGMA_DOY = 10
+S3_COMPOSITE_D = 20
+S3_SMOOTHING_STD = 1
+S3_REFLECTANCE_SCALE = 10_000 # OpenEO SYN L2 SDR → 0–1 (EFAST expects < 5)
+
+# ---------------------------------------------------------------------------
+# Internal S2 constants
+# ---------------------------------------------------------------------------
+
+EARTH_SEARCH_URL = "https://earth-search.aws.element84.com/v1"
+
+_BAND_ASSETS: dict[str, str] = {
+ "B02": "blue",
+ "B03": "green",
+ "B04": "red",
+ "B05": "rededge1",
+ "B06": "rededge2",
+ "B07": "rededge3",
+ "B08": "nir",
+ "B8A": "nir08",
+ "B11": "swir16",
+ "B12": "swir22",
+}
+_SCL_ASSET = "scl"
+_MIN_BBOX_HALF_DEG = 0.008
+
+_GDAL_COG_ENV = {
+ # HTTP/1.1 avoids HTTP/2 multiplexing connection-reset cascades on S3.
+ "GDAL_HTTP_VERSION": "1.1",
+ "GDAL_HTTP_MERGE_CONSECUTIVE_RANGES": "YES",
+ "GDAL_HTTP_TCP_KEEPALIVE": "YES",
+ "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR",
+ "CPL_VSIL_CURL_CACHE_SIZE": "200000000",
+ # Built-in GDAL retries for 429/502/503/504 and transient resets.
+ "GDAL_HTTP_MAX_RETRY": "3",
+ "GDAL_HTTP_RETRY_DELAY": "0.5",
+ "AWS_NO_SIGN_REQUEST": "YES",
+}
+
+# ---------------------------------------------------------------------------
+# Internal S3 constants
+# ---------------------------------------------------------------------------
+
+CDSE_TOKEN_URL = (
+ "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/"
+ "protocol/openid-connect/token"
+)
+OPENEO_URL = "openeo.dataspace.copernicus.eu"
+S3_COLLECTION = "SENTINEL3_SYN_L2_SYN"
+
+DATA_DIR = Path("data")
+DEFAULT_YEAR = 2025
+WORKSPACE_ROOT = Path(__file__).resolve().parent.parent
+
+
+# ---------------------------------------------------------------------------
+# Credentials
+# ---------------------------------------------------------------------------
+
+
+def _cdse_credentials() -> dict[str, str | None]:
+ load_dotenv(WORKSPACE_ROOT / ".env")
+ return {
+ "username": os.getenv("CDSE_USER"),
+ "password": os.getenv("CDSE_PASSWORD"),
+ }
+
+
+# ---------------------------------------------------------------------------
+# Screening manifest helpers
+# ---------------------------------------------------------------------------
+
+
+def _load_screening_pass_sites(year: int) -> list[dict[str, Any]]:
+ """Return list of PASS-site dicts from step-2 screening JSON.
+
+ Each entry has ``sitename``, ``lat``, ``lon`` keys.
+ """
+ path = DATA_DIR / "phenocam_screening" / f"{year}.json"
+ if not path.is_file():
+ raise FileNotFoundError(f"Step-2 screening manifest not found: {path}")
+ payload = json.loads(path.read_text(encoding="utf-8"))
+ sites = []
+ for row in payload.get("sites", []):
+ calc = row.get("calculations", {})
+ if calc.get("status") != "PASS":
+ continue
+ camera = row.get("response", {}).get("camera", {})
+ name = camera.get("Sitename")
+ lat = camera.get("Lat")
+ lon = camera.get("Lon")
+ if name and lat is not None and lon is not None:
+ sites.append({"sitename": str(name), "lat": float(lat), "lon": float(lon)})
+ return sites
+
+
+# ---------------------------------------------------------------------------
+# S2: geometry helpers (from s2_cloud_native.py)
+# ---------------------------------------------------------------------------
+
+
+def wkt_to_bbox(geometry_wkt: str) -> list[float]:
+ """Convert a WKT geometry to a ``[west, south, east, north]`` bbox."""
+ geom = shapely_wkt.loads(geometry_wkt)
+ minx, miny, maxx, maxy = geom.bounds
+ if minx == maxx and miny == maxy:
+ minx -= _MIN_BBOX_HALF_DEG
+ maxx += _MIN_BBOX_HALF_DEG
+ miny -= _MIN_BBOX_HALF_DEG
+ maxy += _MIN_BBOX_HALF_DEG
+ return [minx, miny, maxx, maxy]
+
+
+def _boa_offset(item: Any) -> int:
+ """Return the BOA additive offset for a STAC item.
+
+ Processing baseline >= 04.00 applies a -1000 offset; earlier baselines use 0.
+ """
+ if item.properties.get("earthsearch:boa_offset_applied"):
+ return 0
+ baseline_str = str(
+ item.properties.get("processing:baseline")
+ or item.properties.get("s2:processing_baseline")
+ or "0"
+ )
+ try:
+ baseline = float(baseline_str)
+ except ValueError:
+ baseline = 0.0
+ return -1000 if baseline >= 4.0 else 0
+
+
+def _window_for_bbox(
+ src: rasterio.io.DatasetReader,
+ bbox_4326: list[float],
+) -> Window | None:
+ """Return the rasterio Window for a EPSG:4326 bbox clipped to src bounds."""
+ bbox_geom = {
+ "type": "Polygon",
+ "coordinates": [
+ [
+ [bbox_4326[0], bbox_4326[1]],
+ [bbox_4326[2], bbox_4326[1]],
+ [bbox_4326[2], bbox_4326[3]],
+ [bbox_4326[0], bbox_4326[3]],
+ [bbox_4326[0], bbox_4326[1]],
+ ]
+ ],
+ }
+ src_geom = transform_geom("EPSG:4326", src.crs.to_wkt(), bbox_geom)
+ xs = [c[0] for c in src_geom["coordinates"][0][:4]]
+ ys = [c[1] for c in src_geom["coordinates"][0][:4]]
+ win = window_from_bounds(min(xs), min(ys), max(xs), max(ys), src.transform)
+ try:
+ return win.intersection(Window(0, 0, src.width, src.height))
+ except WindowError:
+ return None
+
+
+def _read_window(
+ href: str,
+ bbox_4326: list[float],
+ out_shape: tuple[int, int] | None = None,
+ resampling: Resampling = Resampling.bilinear,
+) -> tuple[np.ndarray, dict[str, Any]] | None:
+ """Range-read a single-band array for the bbox window from a COG URL."""
+ with rasterio.open(href) as src:
+ win = _window_for_bbox(src, bbox_4326)
+ if win is None:
+ return None
+ data = src.read(1, window=win, out_shape=out_shape, resampling=resampling)
+ profile: dict[str, Any] = {
+ "crs": src.crs,
+ "transform": window_transform(win, src.transform),
+ "height": data.shape[0],
+ "width": data.shape[1],
+ "dtype": src.dtypes[0],
+ }
+ return data, profile
+
+
+def _read_bands(
+ item: Any,
+ bbox: list[float],
+ bands: list[str],
+) -> tuple[list[np.ndarray], dict[str, Any]] | None:
+ """Range-read all requested bands for one STAC item."""
+ band_arrays: list[np.ndarray] = []
+ ref_profile: dict[str, Any] | None = None
+
+ for band_name in bands:
+ asset_key = _BAND_ASSETS.get(band_name)
+ if asset_key is None or asset_key not in item.assets:
+ return None
+ ref_shape = (
+ (ref_profile["height"], ref_profile["width"]) if ref_profile else None
+ )
+ result = _read_window(item.assets[asset_key].href, bbox, out_shape=ref_shape)
+ if result is None:
+ return None
+ data, profile = result
+ if ref_profile is None:
+ ref_profile = profile
+ band_arrays.append(data.astype("float32"))
+
+ return (band_arrays, ref_profile) if ref_profile is not None else None
+
+
+def _cloud_mask(item: Any, bbox: list[float], shape: tuple[int, int]) -> np.ndarray:
+ """Return a boolean cloud/shadow mask from the item's SCL band.
+
+ Masks SCL classes 0 (no data), 3 (cloud shadow), and >7 (clouds, cirrus, snow).
+ """
+ scl = item.assets.get(_SCL_ASSET)
+ result = (
+ _read_window(scl.href, bbox, out_shape=shape, resampling=Resampling.nearest)
+ if scl
+ else None
+ )
+ if result is None:
+ return np.zeros(shape, dtype=bool)
+ scl_data, _ = result
+ return (scl_data == 0) | (scl_data == 3) | (scl_data > 7)
+
+
+def _pad_to_multiple(arr: np.ndarray, ratio: int) -> np.ndarray:
+ """Zero-pad (bands, H, W) so H and W are multiples of ``ratio``."""
+ pad_h = (ratio - arr.shape[1] % ratio) % ratio
+ pad_w = (ratio - arr.shape[2] % ratio) % ratio
+ if pad_h or pad_w:
+ arr = np.pad(arr, ((0, 0), (0, pad_h), (0, pad_w)), constant_values=0)
+ return arr
+
+
+# ---------------------------------------------------------------------------
+# S2: STAC search + download (from s2_cloud_native.py)
+# ---------------------------------------------------------------------------
+
+
+def stac_search_s2(
+ bbox: list[float],
+ start_date: datetime,
+ end_date: datetime,
+) -> list[Any]:
+ """Search Earth Search for S2 L2A items intersecting a bbox."""
+ client = Client.open(EARTH_SEARCH_URL)
+ search = client.search(
+ collections=["sentinel-2-l2a"],
+ bbox=bbox,
+ datetime=(
+ f"{start_date.strftime('%Y-%m-%dT%H:%M:%SZ')}/"
+ f"{end_date.strftime('%Y-%m-%dT23:59:59Z')}"
+ ),
+ max_items=10_000,
+ )
+ return list({item.id: item for item in search.items()}.values())
+
+
+def _process_item(
+ item: Any,
+ bbox: list[float],
+ bands: list[str],
+ output_dir: Path,
+ ratio: int,
+) -> str | None:
+ """Range-read one S2 item and write a masked REFL GeoTIFF.
+
+ Returns a skip-message string when the item cannot be processed, else None.
+ """
+ out_path = output_dir / f"{item.id}_REFL.tif"
+ if out_path.is_file():
+ return None
+ bands_result = _read_bands(item, bbox, bands)
+ if bands_result is None:
+ return f"[S2] Skipping {item.id}: missing asset or no bbox overlap"
+ band_arrays, ref_profile = bands_result
+ mask = _cloud_mask(item, bbox, (ref_profile["height"], ref_profile["width"]))
+ stacked = (np.stack(band_arrays) + _boa_offset(item)) / 10_000.0
+ np.clip(stacked, 0, None, out=stacked)
+ stacked[:, mask] = 0.0
+ stacked = _pad_to_multiple(stacked, ratio)
+ out_profile = {
+ "driver": "GTiff",
+ "count": len(bands),
+ "dtype": "float32",
+ "nodata": 0,
+ "crs": ref_profile["crs"],
+ "transform": ref_profile["transform"],
+ "height": stacked.shape[1],
+ "width": stacked.shape[2],
+ "compress": "lzw",
+ }
+ with rasterio.open(out_path, "w", **out_profile) as dst:
+ dst.write(stacked)
+ for i, band_name in enumerate(bands, 1):
+ dst.set_band_description(i, band_name)
+ return None
+
+
+def download_s2_window(
+ items: list[Any],
+ bbox: list[float],
+ output_dir: Path,
+ bands: list[str],
+ ratio: int = RESOLUTION_RATIO,
+ max_workers: int = 12,
+) -> None:
+ """Range-read S2 L2A COG windows and write masked REFL GeoTIFFs.
+
+ Writes ``{item.id}_REFL.tif`` directly — no intermediate raw download.
+ Cloud/shadow pixels (SCL 0, 3, >7) are zeroed. BOA offset is inferred from
+ ``processing:baseline``. Output is zero-padded to multiples of ``ratio``.
+ Items are fetched in parallel using ``max_workers`` threads.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+ with rasterio.Env(**_GDAL_COG_ENV):
+ with ThreadPoolExecutor(max_workers=max_workers) as pool:
+ futures = {
+ pool.submit(
+ _process_item, item, bbox, bands, output_dir, ratio
+ ): item.id
+ for item in items
+ }
+ with tqdm(
+ total=len(futures), unit="granule", desc="S2 COG window read"
+ ) as pbar:
+ for fut in as_completed(futures):
+ msg = fut.result()
+ if msg:
+ tqdm.write(msg)
+ pbar.update(1)
+
+
+# ---------------------------------------------------------------------------
+# S3: download (from s3_openeo.py)
+# ---------------------------------------------------------------------------
+
+
+def _utm_epsg(bbox: list[float]) -> int:
+ """Return the UTM EPSG code for the centre of a ``[W, S, E, N]`` bbox."""
+ lon = (bbox[0] + bbox[2]) / 2
+ lat = (bbox[1] + bbox[3]) / 2
+ zone = int((lon + 180) / 6) + 1
+ return 32600 + zone if lat >= 0 else 32700 + zone
+
+
+def _cdse_token(username: str, password: str) -> str:
+ """Obtain a CDSE bearer token via password grant."""
+ resp = requests.post(
+ CDSE_TOKEN_URL,
+ data={
+ "grant_type": "password",
+ "username": username,
+ "password": password,
+ "client_id": "cdse-public",
+ },
+ timeout=30,
+ )
+ resp.raise_for_status()
+ return resp.json()["access_token"]
+
+
+def _netcdf_to_geotiffs(nc_path: Path, output_dir: Path, epsg: int) -> int:
+ """Split an OpenEO NetCDF into per-date GeoTIFFs.
+
+ Output filenames match the ``S3*__YYYYMMDDTHHMMSS.tif`` pattern that
+ ``s3_processing.produce_median_composite`` expects.
+
+ Handles half-pixel cell-centre coordinates, ascending y-axis (flip_y),
+ and fills NetCDF masked values with NaN.
+ """
+ written = 0
+ with netCDF4.Dataset(str(nc_path), "r") as nc:
+ times = netCDF4.num2date(nc.variables["t"][:], nc.variables["t"].units)
+ x_coords = np.asarray(nc.variables["x"][:], dtype=float)
+ y_coords = np.asarray(nc.variables["y"][:], dtype=float)
+
+ half_x = abs(x_coords[1] - x_coords[0]) / 2 if len(x_coords) > 1 else 0.0
+ half_y = abs(y_coords[1] - y_coords[0]) / 2 if len(y_coords) > 1 else 0.0
+ transform = from_bounds(
+ x_coords.min() - half_x,
+ y_coords.min() - half_y,
+ x_coords.max() + half_x,
+ y_coords.max() + half_y,
+ len(x_coords),
+ len(y_coords),
+ )
+ flip_y = len(y_coords) > 1 and y_coords[0] < y_coords[-1]
+
+ date_counts: dict[str, int] = {}
+ for t_idx, time_val in enumerate(times):
+ date_str = time_val.strftime("%Y%m%d")
+ n = date_counts.get(date_str, 0)
+ date_counts[date_str] = n + 1
+
+ raw = np.stack([nc.variables[b][t_idx, :, :] for b in S3_BANDS], axis=0)
+ stacked = (
+ np.ma.filled(raw, fill_value=np.nan).astype("float32")
+ / S3_REFLECTANCE_SCALE
+ )
+ if flip_y:
+ stacked = stacked[:, ::-1, :]
+
+ filename = f"S3_{date_str}_{n}__{date_str}T120000.tif"
+ with rasterio.open(
+ output_dir / filename,
+ "w",
+ driver="GTiff",
+ height=len(y_coords),
+ width=len(x_coords),
+ count=len(S3_BANDS),
+ dtype="float32",
+ nodata=float("nan"),
+ crs=f"EPSG:{epsg}",
+ transform=transform,
+ compress="lzw",
+ ) as dst:
+ dst.write(stacked)
+ for i, band_name in enumerate(S3_BAND_NAMES, 1):
+ dst.set_band_description(i, band_name)
+ written += 1
+
+ return written
+
+
+_S3_DOWNLOAD_RETRIES = 4
+_S3_DOWNLOAD_BACKOFF = 30 # seconds; doubled on each retry
+
+
+def _download_with_retry(datacube: Any, nc_path: Path) -> None:
+ """Download an OpenEO datacube to *nc_path*, retrying on transient errors.
+
+ Retries up to ``_S3_DOWNLOAD_RETRIES`` times with exponential backoff
+ starting at ``_S3_DOWNLOAD_BACKOFF`` seconds. Re-authenticates on each
+ attempt so an expired token never blocks a retry.
+ """
+ delay = _S3_DOWNLOAD_BACKOFF
+ last_exc: Exception | None = None
+ for attempt in range(1, _S3_DOWNLOAD_RETRIES + 1):
+ try:
+ if nc_path.exists():
+ nc_path.unlink()
+ datacube.download(str(nc_path), format="NetCDF")
+ return
+ except Exception as exc:
+ last_exc = exc
+ if attempt < _S3_DOWNLOAD_RETRIES:
+ print(
+ f"[S3-OEO] Download attempt {attempt} failed ({exc}); "
+ f"retrying in {delay}s..."
+ )
+ time.sleep(delay)
+ delay *= 2
+ else:
+ print(f"[S3-OEO] All {_S3_DOWNLOAD_RETRIES} download attempts failed")
+ raise RuntimeError(
+ f"S3 download failed after {_S3_DOWNLOAD_RETRIES} attempts"
+ ) from last_exc
+
+
+def download_s3_openeo(
+ start_date: datetime,
+ end_date: datetime,
+ aoi_geometry: str,
+ output_dir: Path,
+ credentials: dict[str, str | None],
+) -> None:
+ """Download S3 SYN L2 SDR for an AOI via CDSE OpenEO, server-side clipped.
+
+ Writes per-date ``S3_{YYYYMMDD}_{n}__{YYYYMMDD}T120000.tif`` files to
+ ``output_dir``, ready for ``s3_processing.produce_median_composite``.
+ Skips if any ``S3*.tif`` files already exist.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ if any(output_dir.glob("S3*.tif")):
+ print("[S3-OEO] Skipping — output_dir already contains S3 GeoTIFFs")
+ return
+
+ bbox = wkt_to_bbox(aoi_geometry)
+ epsg = _utm_epsg(bbox)
+ spatial_extent = {
+ "west": bbox[0],
+ "east": bbox[2],
+ "south": bbox[1],
+ "north": bbox[3],
+ }
+
+ print("[S3-OEO] Authenticating with CDSE...")
+ token = _cdse_token(credentials["username"], credentials["password"]) # type: ignore[arg-type]
+ conn = openeo.connect(OPENEO_URL)
+ conn.authenticate_oidc_access_token(token)
+
+ start_str = start_date.strftime("%Y-%m-%d")
+ end_str = end_date.strftime("%Y-%m-%d")
+ print(f"[S3-OEO] Loading {S3_COLLECTION} ({start_str} → {end_str})...")
+ datacube = conn.load_collection(
+ S3_COLLECTION,
+ spatial_extent=spatial_extent,
+ temporal_extent=[start_str, end_str],
+ bands=S3_BANDS,
+ ).resample_spatial(projection=epsg)
+
+ nc_path = output_dir / "_s3_syn_l2.nc"
+ print(f"[S3-OEO] Downloading NetCDF to {nc_path}...")
+ t0 = time.time()
+ _download_with_retry(datacube, nc_path)
+ print(f"[S3-OEO] Download completed in {time.time() - t0:.1f}s")
+
+ print("[S3-OEO] Splitting into per-date GeoTIFFs...")
+ written = _netcdf_to_geotiffs(nc_path, output_dir, epsg)
+ nc_path.unlink(missing_ok=True)
+ print(f"[S3-OEO] {written} GeoTIFFs written to {output_dir}")
+
+
+# ---------------------------------------------------------------------------
+# S2: distance_to_clouds helper
+# ---------------------------------------------------------------------------
+
+
+def _import_distance_to_clouds():
+ try:
+ from efast.s2_processing import distance_to_clouds
+
+ return distance_to_clouds
+ except ImportError as exc:
+ raise ImportError("efast not found. Install with: uv sync") from exc
+
+
+def _normalize_s2_grid(s2_dir: Path) -> None:
+ """Remove REFL (and companion DIST_CLOUD/GCC) files that don't match the dominant shape.
+
+ Sites at MGRS tile boundaries can have S2 downloads from two tiles with different
+ spatial extents. For example, a site at the top edge of tile 16SDA will produce
+ narrow (e.g. 30×180) REFL files from 16SDA and full-extent (180×180) files from
+ 16SDB. EFAST requires all S2 files to share the same grid, so we keep only the
+ shape with the most pixels (largest tile coverage) and discard the rest together
+ with any already-computed companions.
+ """
+ refl_files = sorted(s2_dir.glob("*_REFL.tif"))
+ if not refl_files:
+ return
+
+ shape_to_files: dict[tuple[int, int], list[Path]] = {}
+ for f in refl_files:
+ with rasterio.open(f) as src:
+ shape: tuple[int, int] = src.shape # type: ignore[assignment]
+ shape_to_files.setdefault(shape, []).append(f)
+
+ if len(shape_to_files) <= 1:
+ return
+
+ ref_shape = max(shape_to_files.keys(), key=lambda s: s[0] * s[1])
+ shapes_summary = ", ".join(
+ f"{s[0]}×{s[1]}({len(fs)})" for s, fs in shape_to_files.items()
+ )
+
+ n_removed = 0
+ for shape, files in shape_to_files.items():
+ if shape == ref_shape:
+ continue
+ for refl_path in files:
+ stem = refl_path.stem[: -len("_REFL")] # e.g. "S2A_16SDA_20250106_0_L2A"
+ for companion in s2_dir.glob(f"{stem}_*.tif"):
+ companion.unlink()
+ refl_path.unlink(missing_ok=True)
+ n_removed += 1
+
+ print(
+ f"[S2-NORM] Tile boundary — shapes: {shapes_summary}; "
+ f"removed {n_removed} file-sets, kept {ref_shape[0]}×{ref_shape[1]}"
+ )
+
+
+def _rescale_dist_cloud(s2_dir: Path) -> None:
+ """Ensure DIST_CLOUD values are in pixel units (not normalised to [0,1])."""
+ for dc_path in s2_dir.glob("*DIST_CLOUD.tif"):
+ with rasterio.open(dc_path) as src:
+ d = src.read(1)
+ if float(np.nanmax(d)) <= 1:
+ with rasterio.open(dc_path, "r+") as dst:
+ dst.write(np.where(d > 0, 2.0, d).astype(np.float32), 1)
+
+
+# ---------------------------------------------------------------------------
+# S3: compositing + reprojection helpers (from 4-sentinel-data.py)
+# ---------------------------------------------------------------------------
+
+
+def _import_s3_processing():
+ try:
+ from efast import s3_processing
+
+ return s3_processing
+ except ImportError as exc:
+ raise ImportError("efast not found. Install with: uv sync") from exc
+
+
+def _reproject_s3_composites_to_s2_grid(
+ composite_dir: Path,
+ s2_refl_path: Path,
+ s3_out_dir: Path,
+ *,
+ resolution_ratio: int = RESOLUTION_RATIO,
+) -> None:
+ """Reproject S3 composites to the S2 spatial grid at LR resolution."""
+ s3_out_dir.mkdir(parents=True, exist_ok=True)
+ with rasterio.open(s2_refl_path) as s2_ref:
+ target_bounds = s2_ref.bounds
+ target_crs = s2_ref.crs
+ width = s2_ref.width // resolution_ratio
+ height = s2_ref.height // resolution_ratio
+ s3_transform = rasterio.transform.from_bounds(
+ target_bounds.left,
+ target_bounds.bottom,
+ target_bounds.right,
+ target_bounds.top,
+ width,
+ height,
+ )
+
+ for sen3_path in sorted(composite_dir.glob("composite_*.tif")):
+ date_part = sen3_path.stem.split("_", 1)[1].replace("-", "")
+ outfile = s3_out_dir / f"composite_{date_part}.tif"
+ vrt_options = {
+ "transform": s3_transform,
+ "height": height,
+ "width": width,
+ "crs": target_crs,
+ "resampling": Resampling.cubic,
+ }
+ with rasterio.open(sen3_path) as s3_src:
+ with WarpedVRT(s3_src, **vrt_options) as vrt:
+ profile = vrt.profile.copy()
+ profile.update({"dtype": "float32", "nodata": 0, "driver": "GTiff"})
+ rio_shutil.copy(vrt, outfile, **profile)
+
+
+def _s3_reflectance_scale(raw_s3_dir: Path) -> float:
+ """Return multiplier that maps raw SYN L2 SDR values to 0–1 reflectance."""
+ for path in raw_s3_dir.glob("S3*.tif"):
+ with rasterio.open(path) as src:
+ data = src.read()
+ if not np.any(np.isfinite(data)):
+ continue
+ mx = float(np.nanmax(data))
+ if np.isfinite(mx) and mx > 5:
+ return 1.0 / S3_REFLECTANCE_SCALE
+ return 1.0
+
+
+def _stage_s3_for_efast(raw_s3_dir: Path, staging_dir: Path) -> int:
+ """Copy ``S3_*.tif`` inputs, scaling reflectance when still in DN form."""
+ scale = _s3_reflectance_scale(raw_s3_dir)
+ if staging_dir.exists():
+ shutil.rmtree(staging_dir)
+ staging_dir.mkdir(parents=True)
+
+ count = 0
+ for src_path in sorted(raw_s3_dir.glob("S3*.tif")):
+ dst_path = staging_dir / src_path.name
+ with rasterio.open(src_path) as src:
+ data = src.read().astype("float32") * scale
+ profile = src.profile.copy()
+ profile.update(dtype="float32")
+ descriptions = src.descriptions
+ with rasterio.open(dst_path, "w", **profile) as dst:
+ dst.write(data)
+ for i, desc in enumerate(descriptions, 1):
+ if desc:
+ dst.set_band_description(i, desc)
+ count += 1
+
+ if scale != 1.0:
+ print(f"[S3-PREP] Scaled raw SDR by {scale:g} for EFAST compositing")
+ return count
+
+
+def _prepare_s3(
+ raw_s3_dir: Path,
+ s2_refl_path: Path,
+ s3_out_dir: Path,
+ *,
+ work_dir: Path | None = None,
+) -> None:
+ """Run EFAST S3 compositing pipeline and reproject to S2 grid."""
+ s3 = _import_s3_processing()
+ base = work_dir or (s3_out_dir / "_efast_work")
+ staging = base / "scaled"
+ composites = base / "composites"
+ blurred = base / "blurred"
+ calibrated = base / "calibrated"
+
+ for directory in (staging, composites, blurred, calibrated):
+ if directory.exists():
+ shutil.rmtree(directory)
+ directory.mkdir(parents=True, exist_ok=True)
+
+ staged = _stage_s3_for_efast(raw_s3_dir, staging)
+ if staged == 0:
+ raise ValueError(f"No S3*.tif files found in {raw_s3_dir}")
+
+ print(
+ f"[S3-PREP] produce_median_composite: mosaic_days={S3_MOSAIC_DAYS}, "
+ f"step={S3_COMPOSITE_STEP}, sigma_doy={S3_COMPOSITE_SIGMA_DOY}, "
+ f"D={S3_COMPOSITE_D}"
+ )
+ s3.produce_median_composite(
+ staging,
+ composites,
+ step=S3_COMPOSITE_STEP,
+ mosaic_days=S3_MOSAIC_DAYS,
+ s3_bands=[1, 2, 3, 4],
+ D=S3_COMPOSITE_D,
+ sigma_doy=S3_COMPOSITE_SIGMA_DOY,
+ )
+ s3.smoothing(
+ composites,
+ blurred,
+ product="composite",
+ std=S3_SMOOTHING_STD,
+ preserve_nan=False,
+ )
+ s3.reformat_s3(blurred, calibrated, product="composite", scaling_factor=1)
+
+ for old in s3_out_dir.glob("composite_*.tif"):
+ old.unlink()
+ _reproject_s3_composites_to_s2_grid(calibrated, s2_refl_path, s3_out_dir)
+
+ if work_dir is None and base.exists():
+ shutil.rmtree(base)
+
+ n_out = len(list(s3_out_dir.glob("composite_*.tif")))
+ print(f"[S3-PREP] Wrote {n_out} composites")
+
+
+# ---------------------------------------------------------------------------
+# Per-site pipeline
+# ---------------------------------------------------------------------------
+
+
+def process_site(
+ sitename: str,
+ lat: float,
+ lon: float,
+ year: int,
+) -> dict[str, Any]:
+ """Download S2 + S3 and run EFAST preparation for one site."""
+ site_dir = DATA_DIR / "sentinel_data" / str(year) / sitename
+ s2_out = site_dir / "prepared" / "s2"
+ s3_raw = site_dir / "raw" / "s3"
+ s3_out = site_dir / "prepared" / "s3"
+ aoi_wkt = f"POINT ({lon} {lat})"
+ bbox = wkt_to_bbox(aoi_wkt)
+ creds = _cdse_credentials()
+
+ # S3 download
+ print(f"[{sitename}] Downloading S3...")
+ download_s3_openeo(
+ start_date=datetime(year, 1, 1),
+ end_date=datetime(year, 12, 31),
+ aoi_geometry=aoi_wkt,
+ output_dir=s3_raw,
+ credentials=creds,
+ )
+
+ # S2 download
+ print(f"[{sitename}] Searching S2 on Earth Search...")
+ items = stac_search_s2(bbox, datetime(year, 1, 1), datetime(year, 12, 31))
+ print(f"[{sitename}] {len(items)} S2 items found — downloading windows...")
+ download_s2_window(items, bbox, s2_out, S2_BANDS, RESOLUTION_RATIO)
+ _normalize_s2_grid(s2_out)
+
+ # S2 distance-to-clouds
+ print(f"[{sitename}] Computing distance-to-clouds...")
+ distance_to_clouds = _import_distance_to_clouds()
+ distance_to_clouds(s2_out, ratio=RESOLUTION_RATIO)
+ _rescale_dist_cloud(s2_out)
+
+ # S3 compositing
+ s2_refl_path = next(iter(s2_out.glob("*_REFL.tif")), None)
+ if s2_refl_path is None:
+ raise ValueError(f"No REFL files in {s2_out} — S2 download may have failed")
+ s3_out.mkdir(parents=True, exist_ok=True)
+ print(f"[{sitename}] Running S3 compositing pipeline...")
+ _prepare_s3(s3_raw, s2_refl_path, s3_out)
+
+ summary = {
+ "sitename": sitename,
+ "evaluation_year": year,
+ "lat": lat,
+ "lon": lon,
+ "s2_refl_count": len(list(s2_out.glob("*_REFL.tif"))),
+ "s2_dist_cloud_count": len(list(s2_out.glob("*_DIST_CLOUD.tif"))),
+ "s3_raw_count": len(list(s3_raw.glob("S3*.tif"))),
+ "s3_composite_count": len(list(s3_out.glob("composite_*.tif"))),
+ }
+ site_dir.mkdir(parents=True, exist_ok=True)
+ (site_dir / "data.json").write_text(
+ json.dumps(summary, indent=2) + "\n", encoding="utf-8"
+ )
+ return summary
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+ parser.add_argument(
+ "--site",
+ type=str,
+ default=None,
+ help="Single sitename to process (default: all step-2 PASS sites)",
+ )
+ parser.add_argument(
+ "--skip-downloaded",
+ action="store_true",
+ help="Skip sites whose directory already exists under data/sentinel_data/{year}/",
+ )
+ args = parser.parse_args(argv)
+ year = args.evaluation_year
+
+ pass_sites = _load_screening_pass_sites(year)
+ if not pass_sites:
+ print("[Sentinel data] No PASS sites found in step-2 screening output")
+ return 1
+
+ if args.site:
+ pass_sites = [s for s in pass_sites if s["sitename"] == args.site]
+ if not pass_sites:
+ print(f"[Sentinel data] Site '{args.site}' not found in step-2 PASS sites")
+ return 1
+
+ print(f"[Sentinel data] Processing {len(pass_sites)} site(s)")
+ for i, site in enumerate(pass_sites, 1):
+ sitename = site["sitename"]
+ site_dir = DATA_DIR / "sentinel_data" / str(year) / sitename
+ if args.skip_downloaded and site_dir.exists():
+ print(
+ f"[Sentinel data] ({i}/{len(pass_sites)}) {sitename} — skipping (directory exists)"
+ )
+ continue
+ print(f"[Sentinel data] ({i}/{len(pass_sites)}) {sitename}")
+ summary = process_site(sitename, site["lat"], site["lon"], year)
+ print(
+ f"[Sentinel data] {sitename} done — "
+ f"{summary['s2_refl_count']} REFL, "
+ f"{summary['s3_composite_count']} composites"
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/4-fusion.py b/4-fusion.py
new file mode 100644
index 0000000..5950c44
--- /dev/null
+++ b/4-fusion.py
@@ -0,0 +1,349 @@
+"""Step 4: Compute GCC and run EFAST BtI + ItB fusion for prepared sites.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``sentinel_data/{year}/{sitename}/prepared/s2/`` — ``*_REFL.tif`` + ``*_DIST_CLOUD.tif``
+- ``sentinel_data/{year}/{sitename}/prepared/s3/`` — ``composite_*.tif`` (4-band)
+
+Outputs (``data/``):
+
+- ``sentinel_data/{year}/{sitename}/prepared/s2/*_GCC.tif`` — S2 GCC (in-place)
+- ``sentinel_data/{year}/{sitename}/prepared/gcc_s3/*.tif`` — S3 GCC composites
+- ``fusion/{year}/{sitename}/bti/fusion/REFL_*.tif`` — BtI fused 4-band reflectance
+- ``fusion/{year}/{sitename}/bti/gcc/GCC_*.tif`` — GCC derived from BtI fusion
+- ``fusion/{year}/{sitename}/itb/s2/GCC_*.tif`` — per-acquisition S2 GCC (simplified names)
+- ``fusion/{year}/{sitename}/itb/s3/GCC_*.tif`` — per-composite S3 GCC (simplified names)
+- ``fusion/{year}/{sitename}/itb/fusion/GCC_*.tif`` — ItB fused GCC
+
+Requires ``uv sync`` (efast).
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--site`` (optional; default: all prepared sites under ``sentinel_data/{year}/``)
+
+Prior step: :mod:`3-sentinel-data`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import rasterio
+from dateutil import rrule
+
+# ---------------------------------------------------------------------------
+# Public constants
+# ---------------------------------------------------------------------------
+
+RESOLUTION_RATIO = 30
+MOSAIC_STEP = 2
+MAX_DAYS = 100
+MINIMUM_ACQUISITION_IMPORTANCE = 0
+
+DATA_DIR = Path("data")
+DEFAULT_YEAR = 2025
+
+
+# ---------------------------------------------------------------------------
+# efast import helper
+# ---------------------------------------------------------------------------
+
+
+def _import_efast():
+ try:
+ import efast.efast as efast_module
+
+ return efast_module
+ except ImportError as exc:
+ raise ImportError("efast not found. Install with: uv sync") from exc
+
+
+# ---------------------------------------------------------------------------
+# GCC computation (from s2_cloud_native.py and s3_openeo.py)
+# ---------------------------------------------------------------------------
+
+
+def compute_gcc_s2(s2_dir: Path, output_dir: Path) -> None:
+ """Compute GCC from S2 REFL files and write ``*_GCC.tif`` to ``output_dir``.
+
+ Reads every ``*_REFL.tif`` (band order B02/B03/B04) and writes a co-located
+ single-band GCC file. Cloud-masked pixels (zero in all bands) remain zero.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+ for src_path in sorted(s2_dir.glob("*_REFL.tif")):
+ out_path = output_dir / src_path.name.replace("_REFL.tif", "_GCC.tif")
+ if out_path.is_file():
+ continue
+ with rasterio.open(src_path) as src:
+ b, g, r = src.read(1), src.read(2), src.read(3)
+ profile = src.profile
+ total = b + g + r
+ gcc = g / (total + 1e-10)
+ gcc[total == 0] = 0
+ profile.update(count=1)
+ with rasterio.open(out_path, "w", **profile) as dst:
+ dst.write(gcc[np.newaxis].astype("float32"))
+
+
+def compute_gcc_s3(s3_dir: Path, output_dir: Path) -> None:
+ """Compute GCC from S3 composite files and write single-band GeoTIFFs.
+
+ Reads every ``composite_*.tif`` (band order Oa04/Oa06/Oa08/Oa17) and writes
+ a single-band GCC file. NaN pixels in the input remain NaN.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+ for src_path in sorted(s3_dir.glob("composite_*.tif")):
+ out_path = output_dir / src_path.name
+ if out_path.is_file():
+ continue
+ with rasterio.open(src_path) as src:
+ b, g, r = src.read(1), src.read(2), src.read(3)
+ profile = src.profile
+ total = b + g + r
+ gcc = g / (total + 1e-10)
+ gcc[np.isnan(total)] = np.nan
+ profile.update(count=1, dtype="float32")
+ with rasterio.open(out_path, "w", **profile) as dst:
+ dst.write(gcc[np.newaxis].astype("float32"))
+
+
+def compute_gcc_from_refl(refl_dir: Path, gcc_dir: Path) -> None:
+ """Derive GCC from ``REFL_YYYYMMDD.tif`` files (BtI fusion output).
+
+ Reads every ``REFL_*.tif`` and writes a co-located single-band
+ ``GCC_YYYYMMDD.tif``. Pixels where any reflectance band is negative (which
+ can arise from EFAST's temporal correction) are written as NaN; the GCC
+ ratio is undefined there because the denominator B+G+R is near-zero or
+ negative due to band cancellation. All-zero pixels (cloud mask) remain
+ zero.
+ """
+ gcc_dir.mkdir(parents=True, exist_ok=True)
+ for src_path in sorted(refl_dir.glob("REFL_*.tif")):
+ out_path = gcc_dir / src_path.name.replace("REFL_", "GCC_")
+ if out_path.is_file():
+ continue
+ with rasterio.open(src_path) as src:
+ b, g, r = src.read(1), src.read(2), src.read(3)
+ profile = src.profile
+ total = b + g + r
+ invalid = (b < 0) | (g < 0) | (r < 0)
+ gcc = np.where(invalid, np.nan, g / (total + 1e-10))
+ gcc[total == 0] = np.nan
+ profile.update(count=1, dtype="float32")
+ with rasterio.open(out_path, "w", **profile) as dst:
+ dst.write(gcc[np.newaxis].astype("float32"))
+
+
+# ---------------------------------------------------------------------------
+# Date-range detection
+# ---------------------------------------------------------------------------
+
+
+def _refl_date_range(s2_dir: Path) -> tuple[datetime, datetime] | None:
+ """Return (start, end) datetime from REFL filenames in ``s2_dir``.
+
+ Filenames are expected to follow the S2 product naming convention, where
+ the acquisition date ``YYYYMMDD`` appears at position index 2 when the
+ stem is split by ``_``, e.g.
+ ``S2A_MSIL2A_20230911T114111_N0509_R025_T29PKT_20230911T153131_REFL.tif``.
+ """
+ dates: list[datetime] = []
+ for p in s2_dir.glob("*_REFL.tif"):
+ parts = p.stem.split("_")
+ if len(parts) >= 3:
+ try:
+ dates.append(datetime.strptime(parts[2][:8], "%Y%m%d"))
+ except ValueError:
+ pass
+ if not dates:
+ return None
+ return min(dates), max(dates)
+
+
+# ---------------------------------------------------------------------------
+# Per-site fusion
+# ---------------------------------------------------------------------------
+
+
+def fuse_site(sitename: str, year: int) -> dict[str, Any]:
+ """Run GCC computation and EFAST BtI + ItB fusion for one prepared site."""
+ efast = _import_efast()
+
+ s2_dir = DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s2"
+ s3_dir = DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s3"
+ gcc_s3_dir = (
+ DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "gcc_s3"
+ )
+ base = DATA_DIR / "fusion" / str(year) / sitename
+
+ if not s2_dir.is_dir() or not any(s2_dir.glob("*_REFL.tif")):
+ raise FileNotFoundError(f"No REFL files in {s2_dir}")
+ if not s3_dir.is_dir() or not any(s3_dir.glob("composite_*.tif")):
+ raise FileNotFoundError(f"No composite files in {s3_dir}")
+
+ print(f"[{sitename}] Computing S2 GCC (in-place)...")
+ compute_gcc_s2(s2_dir, s2_dir)
+
+ print(f"[{sitename}] Computing S3 GCC...")
+ compute_gcc_s3(s3_dir, gcc_s3_dir)
+
+ date_range = _refl_date_range(s2_dir)
+ if date_range is None:
+ raise ValueError(f"Could not detect date range from REFL filenames in {s2_dir}")
+ start, end = date_range
+ print(f"[{sitename}] Date range: {start.date()} → {end.date()}")
+
+ fusion_dates = list(
+ rrule.rrule(
+ rrule.DAILY,
+ dtstart=start + timedelta(MOSAIC_STEP),
+ until=end - timedelta(MOSAIC_STEP),
+ interval=MOSAIC_STEP,
+ )
+ )
+
+ _fusion_kwargs = dict(
+ ratio=RESOLUTION_RATIO,
+ max_days=MAX_DAYS,
+ minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
+ )
+
+ # --- ItB: GCC first, then fuse GCC ---
+ itb_s2 = base / "itb" / "s2"
+ itb_s3 = base / "itb" / "s3"
+ itb_fusion = base / "itb" / "fusion"
+ itb_s2.mkdir(parents=True, exist_ok=True)
+ itb_s3.mkdir(parents=True, exist_ok=True)
+ itb_fusion.mkdir(parents=True, exist_ok=True)
+
+ for p in sorted(s2_dir.glob("*_GCC.tif")):
+ dst = itb_s2 / f"GCC_{p.stem.split('_')[2][:8]}.tif"
+ if not dst.exists():
+ shutil.copy2(p, dst)
+ for p in sorted(gcc_s3_dir.glob("composite_*.tif")):
+ dst = itb_s3 / f"GCC_{p.stem.split('_')[1]}.tif"
+ if not dst.exists():
+ shutil.copy2(p, dst)
+
+ print(f"[{sitename}] ItB: fusing GCC over {len(fusion_dates)} dates...")
+ for date in fusion_dates:
+ efast.fusion(
+ date, gcc_s3_dir, s2_dir, itb_fusion, product="GCC", **_fusion_kwargs
+ )
+
+ # --- BtI: fuse reflectance (3-band, matching S2 B02/B03/B04), then derive GCC ---
+ # S3 composites have 4 bands; strip band 4 (Oa17/NIR) so shapes match S2 REFL.
+ s3_rgb_dir = (
+ DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s3_rgb"
+ )
+ s3_rgb_dir.mkdir(parents=True, exist_ok=True)
+ for p in sorted(s3_dir.glob("composite_*.tif")):
+ out = s3_rgb_dir / p.name
+ if not out.exists():
+ with rasterio.open(p) as src:
+ data = src.read([1, 2, 3])
+ profile = src.profile.copy()
+ profile.update(count=3)
+ with rasterio.open(out, "w", **profile) as dst:
+ dst.write(data)
+
+ bti_fusion = base / "bti" / "fusion"
+ bti_gcc = base / "bti" / "gcc"
+ bti_fusion.mkdir(parents=True, exist_ok=True)
+
+ print(f"[{sitename}] BtI: fusing REFL over {len(fusion_dates)} dates...")
+ for date in fusion_dates:
+ efast.fusion(
+ date, s3_rgb_dir, s2_dir, bti_fusion, product="REFL", **_fusion_kwargs
+ )
+
+ print(f"[{sitename}] BtI: deriving GCC from fused REFL...")
+ compute_gcc_from_refl(bti_fusion, bti_gcc)
+
+ return {
+ "sitename": sitename,
+ "evaluation_year": year,
+ "start": start.date().isoformat(),
+ "end": end.date().isoformat(),
+ "fusion_dates": len(fusion_dates),
+ "itb_fusion_files": len(list(itb_fusion.glob("*.tif"))),
+ "bti_fusion_files": len(list(bti_fusion.glob("*.tif"))),
+ "bti_gcc_files": len(list(bti_gcc.glob("*.tif"))),
+ }
+
+
+# ---------------------------------------------------------------------------
+# Site discovery
+# ---------------------------------------------------------------------------
+
+
+def _discover_sites(year: int) -> list[str]:
+ """Return sitenames that have prepared S2 REFL files under sentinel_data."""
+ base = DATA_DIR / "sentinel_data" / str(year)
+ if not base.is_dir():
+ return []
+ return sorted(
+ d.name
+ for d in base.iterdir()
+ if d.is_dir() and any((d / "prepared" / "s2").glob("*_REFL.tif"))
+ )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+ parser.add_argument(
+ "--site",
+ type=str,
+ default=None,
+ help="Single sitename to fuse (default: all prepared sites)",
+ )
+ parser.add_argument(
+ "--skip-blended",
+ action="store_true",
+ help="Skip sites whose fusion directory already exists under data/fusion/{year}/",
+ )
+ args = parser.parse_args(argv)
+ year = args.evaluation_year
+
+ if args.site:
+ sites = [args.site]
+ else:
+ sites = _discover_sites(year)
+ if not sites:
+ print(f"[Fusion] No prepared sites found under data/sentinel_data/{year}/")
+ return 1
+
+ print(f"[Fusion] Processing {len(sites)} site(s)")
+ for i, sitename in enumerate(sites, 1):
+ fusion_dir = DATA_DIR / "fusion" / str(year) / sitename
+ if args.skip_blended and fusion_dir.exists():
+ print(
+ f"[Fusion] ({i}/{len(sites)}) {sitename} — skipping (fusion directory exists)"
+ )
+ continue
+ print(f"[Fusion] ({i}/{len(sites)}) {sitename}")
+ summary = fuse_site(sitename, year)
+ print(
+ f"[Fusion] {sitename} done — "
+ f"{summary['fusion_dates']} dates, "
+ f"itb={summary['itb_fusion_files']} bti={summary['bti_fusion_files']} "
+ f"bti_gcc={summary['bti_gcc_files']}"
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/5-metrics.py b/5-metrics.py
new file mode 100644
index 0000000..9658797
--- /dev/null
+++ b/5-metrics.py
@@ -0,0 +1,788 @@
+"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``phenocam_screening/{year}.json`` — qualifying sites + metadata
+- ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries
+- ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters
+- ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters
+- ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters
+- ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters
+
+Outputs (``data/metrics/``):
+
+- ``manifest.json`` — years + per-site metadata
+- ``{year}/{site}/gcc_phenocam.json`` — PhenoCam ``gcc_90`` at matched dates
+- ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes)
+- ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC
+- ``{year}/{site}/gcc_s3.json`` — S3 composite GCC
+- ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average
+- ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC
+- ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC
+- ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer
+- ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view)
+- ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view)
+- ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view)
+- ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view)
+- ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths
+- ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths
+- ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series
+- ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene
+- ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite
+- ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--site`` (optional; default: all qualifying sites with sentinel data)
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+import datetime
+import numpy as np
+import rasterio
+from rasterio.crs import CRS
+from rasterio.transform import rowcol
+from pyproj import Transformer
+from scipy.stats import pearsonr
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+DATA_DIR = Path("data")
+DEFAULT_YEAR = 2025
+
+# GCC smoothing window for S3 moving average (days)
+S3_SMOOTH_WINDOW = 5
+
+# Whittaker lambda (penalised smoothing strength for S2)
+WHITTAKER_LAMBDA = 400.0
+
+# Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel)
+SPATIAL_CV_HALF_M = 150
+
+# PhenoCam archive image URL pattern
+PHENOCAM_IMAGE_URL = (
+ "https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}"
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers: raster pixel extraction
+# ---------------------------------------------------------------------------
+
+
+def _window_mean(data: np.ndarray) -> float | None:
+ """Mean of a pixel window; ``None`` when every value is NaN."""
+ valid = data[~np.isnan(data)]
+ if valid.size == 0:
+ return None
+ return float(np.mean(valid))
+
+
+def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
+ """Return the 3×3 mean GCC value at (lat, lon) from a single-band raster.
+
+ Returns ``None`` when the pixel is masked/zero/NaN.
+ """
+ try:
+ with rasterio.open(path) as src:
+ transformer = Transformer.from_crs(
+ CRS.from_epsg(4326), src.crs, always_xy=True
+ )
+ x, y = transformer.transform(lon, lat)
+ row, col = rowcol(src.transform, x, y)
+ h, w = src.height, src.width
+ r0, r1 = max(0, row - 1), min(h, row + 2)
+ c0, c1 = max(0, col - 1), min(w, col + 2)
+ window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
+ data = src.read(1, window=window).astype(float)
+ nodata = src.nodata
+ if nodata is not None:
+ data = np.where(data == nodata, np.nan, data)
+ data[data == 0] = np.nan
+ return _window_mean(data)
+ except Exception:
+ return None
+
+
+# ---------------------------------------------------------------------------
+# Helpers: date extraction from filenames
+# ---------------------------------------------------------------------------
+
+
+def _date_from_gcc_tif(path: Path) -> str | None:
+ """Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``."""
+ m = re.search(r"(\d{8})", path.stem)
+ return m.group(1) if m else None
+
+
+def _date_from_s2_tif(path: Path) -> str | None:
+ """Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``."""
+ parts = path.stem.split("_")
+ if len(parts) >= 3:
+ m = re.match(r"(\d{8})", parts[2])
+ return m.group(1) if m else None
+ return None
+
+
+# ---------------------------------------------------------------------------
+# Helpers: Whittaker smoother (2nd-order differences, tridiagonal solver)
+# ---------------------------------------------------------------------------
+
+
+def _whittaker_smooth(
+ values: list[float | None], lam: float = WHITTAKER_LAMBDA
+) -> list[float | None]:
+ """Penalised least-squares smoother (Whittaker, 2nd-order differences).
+
+ Masked (None) values are filled via the smooth and then re-set to None in
+ the output so the caller can distinguish observed from gap-filled points.
+ """
+ n = len(values)
+ if n < 4:
+ return values[:]
+
+ obs_mask = [v is not None for v in values]
+ y = np.array([v if v is not None else 0.0 for v in values], dtype=float)
+ w = np.array([1.0 if m else 0.0 for m in obs_mask], dtype=float)
+
+ W = np.diag(w)
+ D = np.diff(np.eye(n), n=2, axis=0) # (n-2) x n second-difference matrix
+ A = W + lam * D.T @ D
+ try:
+ z = np.linalg.solve(A, w * y)
+ except np.linalg.LinAlgError:
+ return values[:]
+
+ result: list[float | None] = []
+ for i, m in enumerate(obs_mask):
+ result.append(float(z[i]) if m else None)
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Helpers: PhenoCam CSV parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_phenocam_csv(
+ csv_path: Path, year: int, site: str
+) -> tuple[list[dict], list[dict]]:
+ """Return (gcc_series, image_list) filtered to ``year``.
+
+ ``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}``
+ ``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}``
+ """
+ gcc_series: list[dict] = []
+ image_list: list[dict] = []
+ year_str = str(year)
+
+ if not csv_path.is_file():
+ return gcc_series, image_list
+
+ with csv_path.open() as f:
+ lines = [line for line in f if not line.startswith("#")]
+
+ reader = csv.DictReader(lines)
+ for row in reader:
+ if row.get("year") != year_str:
+ continue
+ date = row.get("date", "")
+ gcc_raw = row.get("gcc_90")
+ if gcc_raw and gcc_raw not in ("NA", ""):
+ try:
+ gcc_series.append({"date": date, "gcc_90": float(gcc_raw)})
+ except ValueError:
+ pass
+ fn = row.get("midday_filename", "").strip()
+ if fn and fn != "NA" and date:
+ month = date[5:7]
+ url = PHENOCAM_IMAGE_URL.format(
+ site=site, year=year_str, month=month, filename=fn
+ )
+ image_list.append({"date": date, "url": url})
+
+ return gcc_series, image_list
+
+
+# ---------------------------------------------------------------------------
+# Helpers: moving average
+# ---------------------------------------------------------------------------
+
+
+def _moving_average(series: list[dict], value_key: str, window: int) -> list[dict]:
+ """Compute centred moving average; returns new list with ``_smooth`` suffix key."""
+ if not series:
+ return []
+ vals = [p[value_key] for p in series]
+ half = window // 2
+ smoothed = []
+ for i, pt in enumerate(series):
+ chunk = [v for v in vals[max(0, i - half) : i + half + 1] if v is not None]
+ smoothed.append(
+ {
+ "date": pt["date"],
+ value_key + "_smooth": (sum(chunk) / len(chunk)) if chunk else None,
+ }
+ )
+ return smoothed
+
+
+# ---------------------------------------------------------------------------
+# Helpers: validation metrics
+# ---------------------------------------------------------------------------
+
+MATCH_TOLERANCE_DAYS = 5
+
+
+def compute_metrics(
+ ref: list[dict],
+ ref_key: str,
+ pred: list[dict],
+ pred_key: str,
+) -> dict | None:
+ """Compute NSE, RMSE, nRMSE, Pearson r between pred and ref.
+
+ Each pred point is matched to the nearest ref date within
+ ``MATCH_TOLERANCE_DAYS``. Returns a dict or ``None`` if fewer than
+ 2 matched pairs exist.
+ """
+ ref_lookup: dict[str, float] = {
+ p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
+ }
+ if not ref_lookup:
+ return None
+
+ ref_dates = sorted(ref_lookup)
+
+ obs, sim = [], []
+ for pt in pred:
+ v = pt.get(pred_key)
+ if v is None:
+ continue
+ nearest = min(
+ ref_dates,
+ key=lambda d: abs(
+ (np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D")
+ ),
+ )
+ gap = abs(
+ (np.datetime64(pt["date"]) - np.datetime64(nearest))
+ / np.timedelta64(1, "D")
+ )
+ if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup:
+ obs.append(ref_lookup[nearest])
+ sim.append(v)
+
+ if len(obs) < 2:
+ return None
+
+ obs_arr = np.array(obs)
+ sim_arr = np.array(sim)
+ obs_mean = obs_arr.mean()
+
+ rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2)))
+ nrmse = rmse / obs_mean if obs_mean else None
+ ss_res = float(np.sum((obs_arr - sim_arr) ** 2))
+ ss_tot = float(np.sum((obs_arr - obs_mean) ** 2))
+ nse = (1.0 - ss_res / ss_tot) if ss_tot else None
+ r, _ = pearsonr(obs_arr, sim_arr)
+
+ def _r4(v: float | None) -> float | None:
+ return round(v, 4) if v is not None else None
+
+ return {
+ "n": len(obs),
+ "rmse": _r4(rmse),
+ "nrmse": _r4(nrmse),
+ "nse": _r4(nse),
+ "r": _r4(float(r)),
+ }
+
+
+S2_BAND_NAMES = ["B02", "B03", "B04"]
+S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"]
+
+
+def _read_multiband_center(
+ path: Path, lat: float, lon: float, band_names: list[str]
+) -> dict[str, float | None]:
+ """Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None."""
+ try:
+ with rasterio.open(path) as src:
+ transformer = Transformer.from_crs(
+ CRS.from_epsg(4326), src.crs, always_xy=True
+ )
+ x, y = transformer.transform(lon, lat)
+ row, col = rowcol(src.transform, x, y)
+ h, w = src.height, src.width
+ r0, r1 = max(0, row - 1), min(h, row + 2)
+ c0, c1 = max(0, col - 1), min(w, col + 2)
+ window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
+ nodata = src.nodata
+ result = {}
+ for i, name in enumerate(band_names, 1):
+ if i > src.count:
+ result[name] = None
+ continue
+ data = src.read(i, window=window).astype(float)
+ if nodata is not None:
+ data = np.where(data == nodata, np.nan, data)
+ data[data == 0] = np.nan
+ val = _window_mean(data)
+ result[name] = None if val is None else round(val, 6)
+ return result
+ except Exception:
+ return {name: None for name in band_names}
+
+
+def _multiband_series(
+ tif_paths: list[Path],
+ date_fn,
+ lat: float,
+ lon: float,
+ band_names: list[str],
+ desc: str,
+) -> list[dict]:
+ """Extract center-pixel values for all bands; return ``[{date, band1, band2, …}]``."""
+ result = []
+ for p in tqdm(tif_paths, desc=desc, leave=False):
+ date = date_fn(p)
+ if date is None:
+ continue
+ vals = _read_multiband_center(p, lat, lon, band_names)
+ if any(v is not None for v in vals.values()):
+ result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", **vals})
+ return sorted(result, key=lambda x: x["date"])
+
+
+# ---------------------------------------------------------------------------
+# Helpers: spatial heterogeneity + observation density
+# ---------------------------------------------------------------------------
+
+
+def _read_footprint_stats(
+ path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M
+) -> tuple[float, float] | tuple[None, None]:
+ """Return (mean, std) of valid GCC pixels within a ±half_m metre square window.
+
+ Returns ``(None, None)`` on any error or when fewer than 4 valid pixels exist.
+ """
+ try:
+ with rasterio.open(path) as src:
+ transformer = Transformer.from_crs(
+ CRS.from_epsg(4326), src.crs, always_xy=True
+ )
+ x, y = transformer.transform(lon, lat)
+ res = abs(src.transform.a) # pixel size in CRS units (metres for UTM)
+ half_px = max(1, int(round(half_m / res)))
+ row, col = rowcol(src.transform, x, y)
+ h, w = src.height, src.width
+ r0, r1 = max(0, row - half_px), min(h, row + half_px + 1)
+ c0, c1 = max(0, col - half_px), min(w, col + half_px + 1)
+ window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
+ data = src.read(1, window=window).astype(float)
+ nodata = src.nodata
+ if nodata is not None:
+ data = np.where(data == nodata, np.nan, data)
+ data[data <= 0] = np.nan
+ valid = data[~np.isnan(data)]
+ if len(valid) < 4:
+ return None, None
+ return float(np.mean(valid)), float(np.std(valid))
+ except Exception:
+ return None, None
+
+
+def compute_covariates(
+ s2_gcc_paths: list[Path],
+ s2_series: list[dict],
+ s3_series: list[dict],
+ n_gcc_points: int | None,
+ lat: float,
+ lon: float,
+) -> dict:
+ """Compute spatial heterogeneity and temporal observation density covariates."""
+ # Spatial GCC statistics over ~300 m footprint
+ means, stds = [], []
+ for p in s2_gcc_paths:
+ m, s = _read_footprint_stats(p, lat, lon)
+ if m is not None and m > 0:
+ means.append(m)
+ stds.append(s)
+
+ spatial_gcc_cv = (
+ round(float(np.mean([s / m for s, m in zip(stds, means)])), 4)
+ if means
+ else None
+ )
+ spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None
+
+ # S2 temporal gap statistics
+ s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series]
+ if len(s2_dates) >= 2:
+ gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)]
+ s2_mean_gap = round(float(np.mean(gaps)), 1)
+ s2_max_gap = int(max(gaps))
+ else:
+ s2_mean_gap = None
+ s2_max_gap = None
+
+ return {
+ "spatial_gcc_cv": spatial_gcc_cv,
+ "spatial_gcc_std": spatial_gcc_std,
+ "s2_scene_count": len(s2_series),
+ "s2_mean_gap_days": s2_mean_gap,
+ "s2_max_gap_days": s2_max_gap,
+ "s3_composite_count": len(s3_series),
+ "n_gcc_points": n_gcc_points,
+ }
+
+
+# ---------------------------------------------------------------------------
+# Per-site export
+# ---------------------------------------------------------------------------
+
+
+def _write_json(path: Path, data: Any) -> None:
+ path.write_text(json.dumps(data, separators=(",", ":")))
+
+
+def _raster_series(
+ tif_paths: list[Path],
+ date_fn,
+ lat: float,
+ lon: float,
+ desc: str,
+) -> list[dict]:
+ """Extract center-pixel GCC from each tif, return ``[{date, gcc}]`` sorted."""
+ result = []
+ for p in tqdm(tif_paths, desc=desc, leave=False):
+ date = date_fn(p)
+ if date is None:
+ continue
+ val = _read_center_pixel(p, lat, lon)
+ if val is not None:
+ result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", "gcc": val})
+ return sorted(result, key=lambda x: x["date"])
+
+
+def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]:
+ """Build raster index: ``[{date, path}]`` sorted by date."""
+ result = []
+ for p in tif_paths:
+ date = date_fn(p)
+ if date is None:
+ continue
+ try:
+ rel = str(p.relative_to(rel_root))
+ except ValueError:
+ rel = str(p)
+ result.append({"date": date, "path": rel})
+ return sorted(result, key=lambda x: x["date"])
+
+
+def export_site(
+ site: str,
+ year: int,
+ lat: float,
+ lon: float,
+ out_dir: Path,
+ n_gcc_points: int | None = None,
+) -> bool:
+ """Export timeseries.json and rasters.json for one site. Returns True on success."""
+ sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared"
+ fusion_base = DATA_DIR / "fusion" / str(year) / site
+
+ s2_gcc_dir = sentinel_base / "s2"
+ s3_gcc_dir = sentinel_base / "gcc_s3"
+ bti_gcc_dir = fusion_base / "bti" / "gcc"
+ itb_gcc_dir = fusion_base / "itb" / "fusion"
+
+ # Raster slider sources
+ s2_refl_dir = sentinel_base / "s2"
+ s3_comp_dir = sentinel_base / "s3"
+ bti_refl_dir = fusion_base / "bti" / "fusion"
+
+ has_fusion = bti_gcc_dir.is_dir() and any(bti_gcc_dir.glob("GCC_*.tif"))
+ if not has_fusion:
+ return False
+
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ # --- GCC timeseries from rasters ---
+ s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif"))
+ s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif"))
+ bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif"))
+ itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif"))
+
+ s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2")
+ s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3")
+ bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI")
+ itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB")
+
+ # Whittaker on S2
+ s2_vals = [p["gcc"] for p in s2_series]
+ s2_smooth_vals = _whittaker_smooth(s2_vals)
+ s2_whittaker = [
+ {"date": p["date"], "gcc": v}
+ for p, v in zip(s2_series, s2_smooth_vals)
+ if v is not None
+ ]
+
+ # S3 5-day moving average
+ s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW)
+
+ # PhenoCam CSV
+ csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv"
+ phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site)
+
+ s3_smooth_series = [
+ {"date": p["date"], "gcc": p["gcc_smooth"]}
+ for p in s3_smooth
+ if p.get("gcc_smooth") is not None
+ ]
+
+ # Band reflectance timeseries (multi-band center-pixel)
+ bands_s2 = _multiband_series(
+ sorted(s2_refl_dir.glob("*_REFL.tif")),
+ _date_from_s2_tif,
+ lat,
+ lon,
+ S2_BAND_NAMES,
+ f"{site} S2 bands",
+ )
+ bands_s3 = _multiband_series(
+ sorted(s3_comp_dir.glob("composite_*.tif")),
+ _date_from_gcc_tif,
+ lat,
+ lon,
+ S3_BAND_NAMES,
+ f"{site} S3 bands",
+ )
+
+ # --- Per-metric JSON outputs ---
+ _write_json(out_dir / "gcc_phenocam.json", phenocam_series)
+ _write_json(out_dir / "gcc_s2.json", s2_series)
+ _write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker)
+ _write_json(out_dir / "gcc_s3.json", s3_series)
+ _write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series)
+ _write_json(out_dir / "gcc_fusion_bti.json", bti_series)
+ _write_json(out_dir / "gcc_fusion_itb.json", itb_series)
+ _write_json(out_dir / "phenocam_images.json", image_list)
+ _write_json(out_dir / "bands_s2.json", bands_s2)
+ _write_json(out_dir / "bands_s3.json", bands_s3)
+
+ # --- Raster index for slider ---
+ rel_root = DATA_DIR.parent # paths relative to project root
+
+ # Valid-pixel sets: only show S2/S3 rasters where the center pixel had
+ # usable data (non-zero GCC). This excludes cloud-masked / snow-covered
+ # scenes that would render as black or visually nonsensical.
+ s2_valid_dates = {p["date"].replace("-", "") for p in s2_series}
+ s3_valid_dates = {p["date"].replace("-", "") for p in s3_series}
+
+ s2_refl = [
+ r
+ for r in _raster_index(
+ sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, rel_root
+ )
+ if r["date"] in s2_valid_dates
+ ]
+ s3_comp = [
+ r
+ for r in _raster_index(
+ sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
+ )
+ if r["date"] in s3_valid_dates
+ ]
+ s2_gcc = [
+ r
+ for r in _raster_index(
+ sorted(s2_gcc_dir.glob("*_GCC.tif")), _date_from_s2_tif, rel_root
+ )
+ if r["date"] in s2_valid_dates
+ ]
+ s3_gcc = [
+ r
+ for r in _raster_index(
+ sorted(s3_gcc_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
+ )
+ if r["date"] in s3_valid_dates
+ ]
+ bti_refl = _raster_index(
+ sorted(bti_refl_dir.glob("REFL_*.tif")), _date_from_gcc_tif, rel_root
+ )
+ itb_gcc = _raster_index(
+ sorted(itb_gcc_dir.glob("GCC_*.tif")), _date_from_gcc_tif, rel_root
+ )
+
+ _write_json(out_dir / "rasters_s2_refl.json", s2_refl)
+ _write_json(out_dir / "rasters_s3_composite.json", s3_comp)
+ _write_json(out_dir / "rasters_s2_gcc.json", s2_gcc)
+ _write_json(out_dir / "rasters_s3_gcc.json", s3_gcc)
+ _write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl)
+ _write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc)
+
+ # --- Site covariates (heterogeneity + observation density) ---
+ _write_json(
+ out_dir / "covariates.json",
+ compute_covariates(s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon),
+ )
+
+ # --- Validation metrics vs PhenoCam gcc_90 ---
+ _write_json(
+ out_dir / "metrics.json",
+ {
+ "bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"),
+ "itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"),
+ "s2_whittaker": compute_metrics(
+ phenocam_series, "gcc_90", s2_whittaker, "gcc"
+ ),
+ "s3_smooth": compute_metrics(
+ phenocam_series, "gcc_90", s3_smooth_series, "gcc"
+ ),
+ "s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"),
+ "s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"),
+ },
+ )
+
+ # Remove legacy bundled outputs if present
+ for legacy in ("timeseries.json", "rasters.json"):
+ (out_dir / legacy).unlink(missing_ok=True)
+ return True
+
+
+# ---------------------------------------------------------------------------
+# Manifest
+# ---------------------------------------------------------------------------
+
+VEG_TYPE_LABELS = {
+ "AG": "Agriculture",
+ "DB": "Deciduous broadleaf",
+ "DN": "Deciduous needleleaf",
+ "EB": "Evergreen broadleaf",
+ "EN": "Evergreen needleleaf",
+ "GR": "Grassland",
+ "MX": "Mixed",
+ "SH": "Shrubland",
+ "TN": "Tundra",
+ "UN": "Unknown",
+ "WL": "Wetland",
+ "RF": "Reference",
+}
+
+
+def build_manifest(years: list[int], filter_site: str | None = None) -> dict:
+ manifest: dict[str, Any] = {"years": years, "sites": {}}
+
+ for year in years:
+ screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json"
+ if not screening_path.is_file():
+ continue
+ data = json.loads(screening_path.read_text())
+ sites_meta: dict[str, Any] = {}
+ for entry in data.get("sites", []):
+ if entry.get("calculations", {}).get("status") != "PASS":
+ continue
+ cam = entry.get("response", {}).get("camera", {})
+ roi = entry.get("response", {}).get("roi", {})
+ calc = entry.get("calculations", {})
+ site = cam.get("Sitename", "")
+ if not site:
+ continue
+ if filter_site and site != filter_site:
+ continue
+ sm = cam.get("sitemetadata", {})
+ veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN"
+ fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc"
+ has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif"))
+ sites_meta[site] = {
+ "lat": cam.get("Lat"),
+ "lon": cam.get("Lon"),
+ "veg_type": veg_raw,
+ "veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw),
+ "description": sm.get("site_description", ""),
+ "dominant_species": sm.get("dominant_species", ""),
+ "group": sm.get("group", ""),
+ "snr": calc.get("snr"),
+ "n_gcc_points": calc.get("n_gcc_points"),
+ "has_fusion": has_fusion,
+ }
+ manifest["sites"][str(year)] = sites_meta
+
+ return manifest
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+ parser.add_argument("--site", type=str, default=None)
+ args = parser.parse_args()
+
+ year = args.evaluation_year
+ filter_site = args.site
+
+ out_base = DATA_DIR / "metrics"
+ out_base.mkdir(parents=True, exist_ok=True)
+
+ # Determine years with screening data
+ screening_dir = DATA_DIR / "phenocam_screening"
+ years = sorted(
+ int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit()
+ )
+ if not years:
+ years = [year]
+
+ print(f"Building manifest for years: {years}")
+ manifest = build_manifest(years, filter_site)
+
+ # Export per-site data for the requested year
+ year_sites = manifest["sites"].get(str(year), {})
+ fusion_sites = {s: m for s, m in year_sites.items() if m["has_fusion"]}
+
+ print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}")
+ for site, meta in tqdm(fusion_sites.items(), desc="Sites"):
+ out_dir = out_base / str(year) / site
+ ok = export_site(
+ site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points")
+ )
+ if ok:
+ print(f" ✓ {site}")
+ else:
+ print(f" ✗ {site} — no fusion data found")
+
+ manifest_path = out_base / "manifest.json"
+ if filter_site and manifest_path.is_file():
+ # Merge: update only the filtered site(s); preserve all other entries.
+ existing: dict = json.loads(manifest_path.read_text())
+ for year_key, year_sites_new in manifest["sites"].items():
+ existing.setdefault("sites", {}).setdefault(year_key, {}).update(
+ year_sites_new
+ )
+ all_years = sorted(set(existing.get("years", [])) | set(manifest["years"]))
+ existing["years"] = all_years
+ manifest_path.write_text(json.dumps(existing, separators=(",", ":")))
+ else:
+ manifest_path.write_text(json.dumps(manifest, separators=(",", ":")))
+ print(f"Manifest written → {manifest_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/6-statistics-fusion-order.py b/6-statistics-fusion-order.py
new file mode 100644
index 0000000..0ec0541
--- /dev/null
+++ b/6-statistics-fusion-order.py
@@ -0,0 +1,257 @@
+"""Step 6: Paired ItB-vs-BtI significance test across the full sample.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``metrics/{year}/{site}/metrics.json`` — per-site validation metrics (Step 5)
+
+Outputs (``data/statistics_fusion_order/``):
+
+- ``{year}.json`` — paired Wilcoxon + t-test summary for NSE, RMSE, nRMSE, r;
+ includes ``dropped_sites`` (union) and per-metric ``dropped_sites`` lists
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--alpha`` (default 0.05) — significance threshold for ``better_order``
+
+This step aggregates across all sites with Step 5 output; it does not accept
+``--site`` (a single-site filter would not support a sample-level test).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from scipy.stats import ttest_rel, wilcoxon
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+DATA_DIR = Path("data")
+DEFAULT_YEAR = 2025
+DEFAULT_ALPHA = 0.05
+
+METRICS = ["nse", "rmse", "nrmse", "r"]
+LOWER_IS_BETTER = {"rmse", "nrmse"}
+
+MIN_PAIRS_WARNING = 6
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _r4(v: float | None) -> float | None:
+ return round(v, 4) if v is not None else None
+
+
+def _load_site_metrics(year: int) -> list[tuple[str, dict[str, Any]]]:
+ """Return ``(sitename, metrics.json payload)`` for every site under ``{year}``."""
+ metrics_dir = DATA_DIR / "metrics" / str(year)
+ if not metrics_dir.is_dir():
+ return []
+
+ payloads: list[tuple[str, dict[str, Any]]] = []
+ for site_dir in sorted(metrics_dir.iterdir()):
+ if not site_dir.is_dir():
+ continue
+ path = site_dir / "metrics.json"
+ if not path.is_file():
+ continue
+ payloads.append((site_dir.name, json.loads(path.read_text())))
+ return payloads
+
+
+def collect_pairs(
+ site_metrics: list[tuple[str, dict[str, Any]]], metric: str
+) -> tuple[list[float], list[float], list[str]]:
+ """Return paired BtI / ItB values for ``metric`` and dropped site names."""
+ bti_vals: list[float] = []
+ itb_vals: list[float] = []
+ dropped_sites: list[str] = []
+
+ for site, payload in site_metrics:
+ bti = payload.get("bti")
+ itb = payload.get("itb")
+ if not isinstance(bti, dict) or not isinstance(itb, dict):
+ dropped_sites.append(site)
+ continue
+
+ bti_v = bti.get(metric)
+ itb_v = itb.get(metric)
+ if bti_v is None or itb_v is None:
+ dropped_sites.append(site)
+ continue
+
+ bti_vals.append(float(bti_v))
+ itb_vals.append(float(itb_v))
+
+ return bti_vals, itb_vals, dropped_sites
+
+
+def _better_order(
+ bti_vals: list[float],
+ itb_vals: list[float],
+ metric: str,
+ p_value: float | None,
+ alpha: float,
+) -> str:
+ """Name the better fusion order when Wilcoxon p < alpha, else no difference."""
+ if p_value is None or p_value >= alpha:
+ return "no significant difference"
+
+ mean_diff = float(np.mean(itb_vals) - np.mean(bti_vals))
+ if metric in LOWER_IS_BETTER:
+ return "itb" if mean_diff < 0 else "bti"
+ return "itb" if mean_diff > 0 else "bti"
+
+
+def paired_test(
+ bti_vals: list[float],
+ itb_vals: list[float],
+ metric: str,
+ alpha: float,
+) -> dict[str, Any]:
+ """Run paired Wilcoxon (primary) and t-test; return summary dict."""
+ n_pairs = len(bti_vals)
+ bti_arr = np.array(bti_vals, dtype=float)
+ itb_arr = np.array(itb_vals, dtype=float)
+ diffs = itb_arr - bti_arr
+
+ result: dict[str, Any] = {
+ "n_pairs": n_pairs,
+ "bti_mean": _r4(float(bti_arr.mean())) if n_pairs else None,
+ "bti_median": _r4(float(np.median(bti_arr))) if n_pairs else None,
+ "itb_mean": _r4(float(itb_arr.mean())) if n_pairs else None,
+ "itb_median": _r4(float(np.median(itb_arr))) if n_pairs else None,
+ "mean_diff": _r4(float(diffs.mean())) if n_pairs else None,
+ "median_diff": _r4(float(np.median(diffs))) if n_pairs else None,
+ "wilcoxon": {"statistic": None, "p_value": None},
+ "ttest": {"statistic": None, "p_value": None},
+ "better_order": "insufficient data",
+ }
+
+ if n_pairs < 2:
+ return result
+
+ wilcoxon_stat: float | None = None
+ wilcoxon_p: float | None = None
+ if np.any(diffs != 0):
+ try:
+ w_stat, w_p = wilcoxon(itb_arr, bti_arr)
+ wilcoxon_stat = float(w_stat)
+ wilcoxon_p = float(w_p)
+ except ValueError:
+ pass
+
+ t_stat, t_p = ttest_rel(itb_arr, bti_arr)
+
+ result["wilcoxon"] = {
+ "statistic": _r4(wilcoxon_stat),
+ "p_value": _r4(wilcoxon_p),
+ }
+ result["ttest"] = {
+ "statistic": _r4(float(t_stat)),
+ "p_value": _r4(float(t_p)),
+ }
+ result["better_order"] = _better_order(
+ bti_vals, itb_vals, metric, wilcoxon_p, alpha
+ )
+ return result
+
+
+def _print_summary(
+ year: int, alpha: float, n_sites_total: int, metrics_out: dict
+) -> None:
+ print(f"\nPaired ItB vs BtI test — {year} (alpha={alpha}, sites={n_sites_total})")
+ print(
+ f"{'metric':<8} {'n':>4} {'BtI mean':>10} {'ItB mean':>10} "
+ f"{'diff':>10} {'W p':>8} {'t p':>8} better"
+ )
+ print("-" * 78)
+ for metric in METRICS:
+ m = metrics_out[metric]
+ bti_mean = m["bti_mean"]
+ itb_mean = m["itb_mean"]
+ mean_diff = m["mean_diff"]
+ w_p = m["wilcoxon"]["p_value"]
+ t_p = m["ttest"]["p_value"]
+ better = m["better_order"]
+
+ def _fmt(v: float | None) -> str:
+ return f"{v:10.4f}" if v is not None else f"{'—':>10}"
+
+ print(
+ f"{metric:<8} {m['n_pairs']:>4} {_fmt(bti_mean)} {_fmt(itb_mean)} "
+ f"{_fmt(mean_diff)} "
+ f"{w_p if w_p is not None else '—':>8} "
+ f"{t_p if t_p is not None else '—':>8} {better}"
+ )
+ if 0 < m["n_pairs"] < MIN_PAIRS_WARNING:
+ print(
+ f" warning: only {m['n_pairs']} pair(s) for {metric}; "
+ "interpret p-values cautiously"
+ )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+ parser.add_argument(
+ "--alpha",
+ type=float,
+ default=DEFAULT_ALPHA,
+ help="Significance threshold for better_order (default 0.05)",
+ )
+ args = parser.parse_args()
+
+ year = args.evaluation_year
+ alpha = args.alpha
+
+ site_metrics = _load_site_metrics(year)
+ n_sites_total = len(site_metrics)
+ if n_sites_total == 0:
+ raise SystemExit(
+ f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
+ )
+
+ metrics_out: dict[str, Any] = {}
+ all_dropped: set[str] = set()
+ for metric in METRICS:
+ bti_vals, itb_vals, dropped_sites = collect_pairs(site_metrics, metric)
+ summary = paired_test(bti_vals, itb_vals, metric, alpha)
+ summary["n_dropped"] = len(dropped_sites)
+ summary["dropped_sites"] = dropped_sites
+ all_dropped.update(dropped_sites)
+ metrics_out[metric] = summary
+
+ payload = {
+ "year": year,
+ "alpha": alpha,
+ "n_sites_total": n_sites_total,
+ "dropped_sites": sorted(all_dropped),
+ "metrics": metrics_out,
+ }
+
+ out_dir = DATA_DIR / "statistics_fusion_order"
+ out_dir.mkdir(parents=True, exist_ok=True)
+ out_path = out_dir / f"{year}.json"
+ out_path.write_text(json.dumps(payload, separators=(",", ":")))
+
+ _print_summary(year, alpha, n_sites_total, metrics_out)
+ print(f"\nWritten → {out_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/7-gcc-suitability.py b/7-gcc-suitability.py
new file mode 100644
index 0000000..32cc4e5
--- /dev/null
+++ b/7-gcc-suitability.py
@@ -0,0 +1,742 @@
+"""Step 7: PhenoCam GCC suitability as a fusion-accuracy reference.
+
+Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``metrics/{year}/{site}/gcc_s2.json``, ``gcc_phenocam.json`` — Step 5 timeseries
+- ``metrics/manifest.json`` — site lat/lon
+- ``sentinel_data/{year}/{site}/prepared/s2/`` — S2 REFL/GCC/DIST_CLOUD (Steps 3–4)
+- ``sentinel_data/{year}/{site}/prepared/gcc_s3/``, ``prepared/s3_rgb/`` — Step 4
+
+Outputs (``data/gcc_suitability/``):
+
+- ``{year}.json`` — representativeness (Line A), LOOCV concordance (Line B),
+ per-site and aggregate suitability verdict
+
+CLI:
+
+- ``--evaluation-year`` (default 2025)
+- ``--min-cloudfree-s2`` (default 10) — minimum cloud-free S2 dates for LOOCV
+- ``--alpha`` (default 0.05) — reserved for future significance tests
+
+Full-sample aggregate; does not accept ``--site``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import rasterio
+from pyproj import Transformer
+from rasterio.crs import CRS
+from rasterio.transform import rowcol
+from scipy.stats import linregress, pearsonr, spearmanr
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+DATA_DIR = Path("data")
+DEFAULT_YEAR = 2025
+DEFAULT_ALPHA = 0.05
+MIN_CLOUDFREE_S2 = 10
+REPR_R_THRESHOLD = 0.7
+MATCH_TOLERANCE_DAYS = 5
+
+RESOLUTION_RATIO = 30
+MAX_DAYS = 100
+MINIMUM_ACQUISITION_IMPORTANCE = 0
+
+SMALL_SAMPLE_SITES = 6
+
+
+# ---------------------------------------------------------------------------
+# efast import
+# ---------------------------------------------------------------------------
+
+
+def _import_efast():
+ try:
+ import efast.efast as efast_module
+
+ return efast_module
+ except ImportError as exc:
+ raise ImportError("efast not found. Install with: uv sync") from exc
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _r4(v: float | None) -> float | None:
+ return round(v, 4) if v is not None else None
+
+
+def _window_mean(data: np.ndarray) -> float | None:
+ valid = data[~np.isnan(data)]
+ if valid.size == 0:
+ return None
+ return float(np.mean(valid))
+
+
+def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
+ try:
+ with rasterio.open(path) as src:
+ transformer = Transformer.from_crs(
+ CRS.from_epsg(4326), src.crs, always_xy=True
+ )
+ x, y = transformer.transform(lon, lat)
+ row, col = rowcol(src.transform, x, y)
+ h, w = src.height, src.width
+ r0, r1 = max(0, row - 1), min(h, row + 2)
+ c0, c1 = max(0, col - 1), min(w, col + 2)
+ window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
+ data = src.read(1, window=window).astype(float)
+ nodata = src.nodata
+ if nodata is not None:
+ data = np.where(data == nodata, np.nan, data)
+ data[data == 0] = np.nan
+ return _window_mean(data)
+ except Exception:
+ return None
+
+
+def _date_from_s2_tif(path: Path) -> str | None:
+ parts = path.stem.split("_")
+ if len(parts) >= 3:
+ m = re.match(r"(\d{8})", parts[2])
+ return m.group(1) if m else None
+ return None
+
+
+def _iso_to_yyyymmdd(iso: str) -> str:
+ return iso.replace("-", "")
+
+
+def _yyyymmdd_to_iso(d: str) -> str:
+ return f"{d[:4]}-{d[4:6]}-{d[6:]}"
+
+
+def _day_gap(a: str, b: str) -> float:
+ return abs((np.datetime64(a) - np.datetime64(b)) / np.timedelta64(1, "D"))
+
+
+def _match_series(
+ ref: list[dict],
+ ref_key: str,
+ pred: list[dict],
+ pred_key: str,
+ tolerance_days: int = MATCH_TOLERANCE_DAYS,
+) -> tuple[list[float], list[float], list[str]]:
+ """Return paired (ref_vals, pred_vals, ref_dates) within tolerance."""
+ ref_lookup: dict[str, float] = {
+ p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
+ }
+ if not ref_lookup:
+ return [], [], []
+
+ ref_dates = sorted(ref_lookup)
+ obs, sim, matched_dates = [], [], []
+ for pt in pred:
+ v = pt.get(pred_key)
+ if v is None:
+ continue
+ nearest = min(ref_dates, key=lambda d: _day_gap(pt["date"], d))
+ if _day_gap(pt["date"], nearest) <= tolerance_days:
+ obs.append(ref_lookup[nearest])
+ sim.append(v)
+ matched_dates.append(nearest)
+ return obs, sim, matched_dates
+
+
+def _accuracy_metrics(obs: list[float], sim: list[float]) -> dict[str, Any] | None:
+ if len(obs) < 2:
+ return None
+ obs_arr = np.array(obs, dtype=float)
+ sim_arr = np.array(sim, dtype=float)
+ diff = sim_arr - obs_arr
+ rmse = float(np.sqrt(np.mean(diff**2)))
+ mae = float(np.mean(np.abs(diff)))
+ bias = float(np.mean(diff))
+ r, _ = pearsonr(obs_arr, sim_arr)
+ return {
+ "n": len(obs),
+ "rmse": _r4(rmse),
+ "mae": _r4(mae),
+ "bias": _r4(bias),
+ "r": _r4(float(r)),
+ }
+
+
+def _gcc_from_refl_file(refl_path: Path, gcc_path: Path) -> None:
+ with rasterio.open(refl_path) as src:
+ b, g, r = src.read(1), src.read(2), src.read(3)
+ profile = src.profile
+ total = b + g + r
+ invalid = (b < 0) | (g < 0) | (r < 0)
+ gcc = np.where(invalid, np.nan, g / (total + 1e-10))
+ gcc[total == 0] = np.nan
+ profile.update(count=1, dtype="float32")
+ with rasterio.open(gcc_path, "w", **profile) as dst:
+ dst.write(gcc[np.newaxis].astype("float32"))
+
+
+def _load_json_series(path: Path) -> list[dict]:
+ if not path.is_file():
+ return []
+ return json.loads(path.read_text())
+
+
+def _load_site_coords(year: int) -> dict[str, tuple[float, float]]:
+ manifest_path = DATA_DIR / "metrics" / "manifest.json"
+ if not manifest_path.is_file():
+ return {}
+ manifest = json.loads(manifest_path.read_text())
+ sites = manifest.get("sites", {}).get(str(year), {})
+ coords: dict[str, tuple[float, float]] = {}
+ for site, meta in sites.items():
+ lat, lon = meta.get("lat"), meta.get("lon")
+ if lat is not None and lon is not None:
+ coords[site] = (float(lat), float(lon))
+ return coords
+
+
+def _discover_sites(year: int) -> list[str]:
+ metrics_dir = DATA_DIR / "metrics" / str(year)
+ if not metrics_dir.is_dir():
+ return []
+ return sorted(
+ d.name
+ for d in metrics_dir.iterdir()
+ if d.is_dir() and (d / "gcc_s2.json").is_file()
+ )
+
+
+def _build_hr_symlink_dir(s2_dir: Path, holdout_yyyymmdd: str, dest: Path) -> None:
+ """Symlink all S2 hr inputs except the held-out acquisition date."""
+ dest.mkdir(parents=True, exist_ok=True)
+ for pattern in ("*_REFL.tif", "*_GCC.tif", "*_DIST_CLOUD.tif"):
+ for src in sorted(s2_dir.glob(pattern)):
+ date_token = (
+ src.stem.split("_")[2][:8] if len(src.stem.split("_")) >= 3 else ""
+ )
+ if date_token == holdout_yyyymmdd:
+ continue
+ link = dest / src.name
+ if link.exists() or link.is_symlink():
+ link.unlink()
+ link.symlink_to(src.resolve())
+
+
+# ---------------------------------------------------------------------------
+# Line A — representativeness
+# ---------------------------------------------------------------------------
+
+
+def compute_representativeness(phenocam: list[dict], s2: list[dict]) -> dict[str, Any]:
+ """PhenoCam gcc_90 vs co-located observed S2 GCC."""
+ obs, sim, _ = _match_series(phenocam, "gcc_90", s2, "gcc")
+ result: dict[str, Any] = {
+ "n": len(obs),
+ "r": None,
+ "spearman": None,
+ "slope": None,
+ "intercept": None,
+ "rmse": None,
+ "bias": None,
+ "peak_offset_days": None,
+ "representative": False,
+ }
+ if len(obs) < 2:
+ return result
+
+ obs_arr = np.array(obs, dtype=float)
+ sim_arr = np.array(sim, dtype=float)
+ r, _ = pearsonr(obs_arr, sim_arr)
+ sp, _ = spearmanr(obs_arr, sim_arr)
+ reg = linregress(sim_arr, obs_arr)
+ diff = sim_arr - obs_arr
+ result.update(
+ {
+ "r": _r4(float(r)),
+ "spearman": _r4(float(sp)),
+ "slope": _r4(float(reg.slope)),
+ "intercept": _r4(float(reg.intercept)),
+ "rmse": _r4(float(np.sqrt(np.mean(diff**2)))),
+ "bias": _r4(float(np.mean(diff))),
+ "representative": float(r) >= REPR_R_THRESHOLD,
+ }
+ )
+
+ pc_dates = [p["date"] for p in phenocam if p.get("gcc_90") is not None]
+ s2_dates = [p["date"] for p in s2 if p.get("gcc") is not None]
+ if pc_dates and s2_dates:
+ pc_peak = max(
+ phenocam,
+ key=lambda p: p["gcc_90"] if p.get("gcc_90") is not None else -1,
+ )["date"]
+ s2_peak = max(s2, key=lambda p: p["gcc"] if p.get("gcc") is not None else -1)[
+ "date"
+ ]
+ result["peak_offset_days"] = int(_day_gap(pc_peak, s2_peak))
+
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Line B — LOOCV
+# ---------------------------------------------------------------------------
+
+
+def _phenocam_lookup(phenocam: list[dict]) -> dict[str, float]:
+ return {p["date"]: p["gcc_90"] for p in phenocam if p.get("gcc_90") is not None}
+
+
+def _nearest_phenocam(iso_date: str, lookup: dict[str, float]) -> float | None:
+ if not lookup:
+ return None
+ dates = sorted(lookup)
+ nearest = min(dates, key=lambda d: _day_gap(iso_date, d))
+ if _day_gap(iso_date, nearest) <= MATCH_TOLERANCE_DAYS:
+ return lookup[nearest]
+ return None
+
+
+def run_loocv_site(
+ site: str,
+ year: int,
+ lat: float,
+ lon: float,
+ s2_series: list[dict],
+ phenocam: list[dict],
+ efast,
+) -> list[dict[str, Any]]:
+ """Leave-one-out EFAST for each cloud-free S2 date; return per-date records."""
+ s2_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s2"
+ gcc_s3_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "gcc_s3"
+ s3_rgb_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s3_rgb"
+
+ pc_lookup = _phenocam_lookup(phenocam)
+ s2_truth = {p["date"]: p["gcc"] for p in s2_series}
+ fusion_kwargs = dict(
+ ratio=RESOLUTION_RATIO,
+ max_days=MAX_DAYS,
+ minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
+ )
+
+ records: list[dict[str, Any]] = []
+ dates = [p["date"] for p in s2_series]
+
+ with tempfile.TemporaryDirectory(prefix=f"loocv_{site}_") as tmp_root:
+ tmp = Path(tmp_root)
+ hr_dir = tmp / "hr"
+ itb_out = tmp / "itb"
+ bti_out = tmp / "bti"
+ bti_gcc = tmp / "bti_gcc"
+ itb_out.mkdir()
+ bti_out.mkdir()
+ bti_gcc.mkdir()
+
+ for iso_date in tqdm(dates, desc=f"{site} LOOCV", leave=False):
+ yyyymmdd = _iso_to_yyyymmdd(iso_date)
+ truth = s2_truth.get(iso_date)
+ if truth is None:
+ continue
+
+ if hr_dir.exists():
+ shutil.rmtree(hr_dir)
+ _build_hr_symlink_dir(s2_dir, yyyymmdd, hr_dir)
+
+ pred_date = datetime.strptime(yyyymmdd, "%Y%m%d")
+
+ for f in itb_out.glob("*.tif"):
+ f.unlink()
+ for f in bti_out.glob("*.tif"):
+ f.unlink()
+ for f in bti_gcc.glob("*.tif"):
+ f.unlink()
+
+ efast.fusion(
+ pred_date,
+ gcc_s3_dir,
+ hr_dir,
+ itb_out,
+ product="GCC",
+ **fusion_kwargs,
+ )
+ efast.fusion(
+ pred_date,
+ s3_rgb_dir,
+ hr_dir,
+ bti_out,
+ product="REFL",
+ **fusion_kwargs,
+ )
+
+ itb_path = itb_out / f"GCC_{yyyymmdd}.tif"
+ refl_path = bti_out / f"REFL_{yyyymmdd}.tif"
+ bti_path = bti_gcc / f"GCC_{yyyymmdd}.tif"
+
+ pred_itb = (
+ _read_center_pixel(itb_path, lat, lon) if itb_path.is_file() else None
+ )
+ pred_bti = None
+ if refl_path.is_file():
+ _gcc_from_refl_file(refl_path, bti_path)
+ if bti_path.is_file():
+ pred_bti = _read_center_pixel(bti_path, lat, lon)
+
+ pc_val = _nearest_phenocam(iso_date, pc_lookup)
+
+ records.append(
+ {
+ "date": iso_date,
+ "s2_truth": truth,
+ "pred_bti": pred_bti,
+ "pred_itb": pred_itb,
+ "phenocam": pc_val,
+ }
+ )
+
+ return records
+
+
+def _method_accuracy(records: list[dict], pred_key: str, ref_key: str) -> dict | None:
+ obs, sim = [], []
+ for rec in records:
+ pred = rec.get(pred_key)
+ ref = rec.get(ref_key)
+ if pred is None or ref is None:
+ continue
+ obs.append(ref)
+ sim.append(pred)
+ return _accuracy_metrics(obs, sim)
+
+
+def _winner(rmse_bti: float | None, rmse_itb: float | None) -> str | None:
+ if rmse_bti is None or rmse_itb is None:
+ return None
+ if rmse_bti < rmse_itb:
+ return "bti"
+ if rmse_itb < rmse_bti:
+ return "itb"
+ return "tie"
+
+
+def summarize_loocv(records: list[dict]) -> dict[str, Any]:
+ bti_vs_s2 = _method_accuracy(records, "pred_bti", "s2_truth")
+ itb_vs_s2 = _method_accuracy(records, "pred_itb", "s2_truth")
+ bti_vs_pc = _method_accuracy(records, "pred_bti", "phenocam")
+ itb_vs_pc = _method_accuracy(records, "pred_itb", "phenocam")
+
+ winner_s2 = _winner(
+ bti_vs_s2["rmse"] if bti_vs_s2 else None,
+ itb_vs_s2["rmse"] if itb_vs_s2 else None,
+ )
+ winner_pc = _winner(
+ bti_vs_pc["rmse"] if bti_vs_pc else None,
+ itb_vs_pc["rmse"] if itb_vs_pc else None,
+ )
+ agreement = (
+ winner_s2 == winner_pc
+ if winner_s2 and winner_pc and winner_s2 != "tie" and winner_pc != "tie"
+ else None
+ )
+
+ return {
+ "n_dates": len(records),
+ "bti": {"vs_s2": bti_vs_s2, "vs_phenocam": bti_vs_pc},
+ "itb": {"vs_s2": itb_vs_s2, "vs_phenocam": itb_vs_pc},
+ "winner_s2": winner_s2,
+ "winner_phenocam": winner_pc,
+ "winner_agreement": agreement,
+ }
+
+
+# ---------------------------------------------------------------------------
+# Aggregate concordance
+# ---------------------------------------------------------------------------
+
+
+def _pooled_concordance(
+ all_records: list[dict[str, Any]],
+) -> dict[str, Any]:
+ """Pooled metrics across all held-out dates."""
+ residual_pairs: list[tuple[float, float]] = []
+ vec_s2: list[float] = []
+ vec_pc: list[float] = []
+
+ for site_data in all_records:
+ for rec in site_data.get("records", []):
+ truth = rec.get("s2_truth")
+ pc = rec.get("phenocam")
+ for key in ("pred_bti", "pred_itb"):
+ pred = rec.get(key)
+ if pred is None or truth is None:
+ continue
+ err_s2 = abs(pred - truth)
+ if pc is not None:
+ err_pc = abs(pred - pc)
+ vec_s2.append(err_s2)
+ vec_pc.append(err_pc)
+ residual_pairs.append((err_s2, err_pc))
+
+ pooled_spearman = None
+ if len(vec_s2) >= 3:
+ sp, _ = spearmanr(vec_s2, vec_pc)
+ if not np.isnan(sp):
+ pooled_spearman = _r4(float(sp))
+
+ residual_corr = None
+ if len(residual_pairs) >= 3:
+ xs = np.array([p[0] for p in residual_pairs])
+ ys = np.array([p[1] for p in residual_pairs])
+ rc, _ = pearsonr(xs, ys)
+ residual_corr = _r4(float(rc))
+
+ agreements = [
+ s.get("winner_agreement")
+ for s in all_records
+ if s.get("eligible") and s.get("winner_agreement") is not None
+ ]
+ winner_agreement_rate = (
+ _r4(sum(1 for a in agreements if a) / len(agreements)) if agreements else None
+ )
+
+ n_loocv_dates = sum(len(s.get("records", [])) for s in all_records)
+
+ return {
+ "pooled_spearman": pooled_spearman,
+ "residual_corr": residual_corr,
+ "winner_agreement_rate": winner_agreement_rate,
+ "n_loocv_dates": n_loocv_dates,
+ }
+
+
+def _suitability_verdict(
+ n_repr_pass: int,
+ n_eligible: int,
+ n_total: int,
+ pooled: dict[str, Any],
+) -> str:
+ if n_eligible == 0:
+ return "insufficient data"
+ repr_rate = n_repr_pass / n_total if n_total else 0
+ agree = pooled.get("winner_agreement_rate")
+ sp = pooled.get("pooled_spearman")
+ rc = pooled.get("residual_corr")
+
+ strong = 0
+ if repr_rate >= 0.6:
+ strong += 1
+ if agree is not None and agree >= 0.7:
+ strong += 1
+ if sp is not None and sp >= 0.8:
+ strong += 1
+ if rc is not None and rc >= 0.5:
+ strong += 1
+
+ if strong >= 3:
+ return "suitable"
+ if strong >= 1 or repr_rate >= 0.4:
+ return "partially suitable"
+ return "not suitable"
+
+
+# ---------------------------------------------------------------------------
+# Per-site processing
+# ---------------------------------------------------------------------------
+
+
+def process_site(
+ site: str,
+ year: int,
+ lat: float,
+ lon: float,
+ min_cloudfree: int,
+ efast,
+) -> dict[str, Any]:
+ metrics_dir = DATA_DIR / "metrics" / str(year) / site
+ phenocam = _load_json_series(metrics_dir / "gcc_phenocam.json")
+ s2_series = _load_json_series(metrics_dir / "gcc_s2.json")
+
+ repr_metrics = compute_representativeness(phenocam, s2_series)
+ n_cloudfree = len(s2_series)
+ eligible = n_cloudfree >= min_cloudfree
+
+ result: dict[str, Any] = {
+ "eligible": eligible,
+ "n_cloudfree_s2": n_cloudfree,
+ "representativeness": repr_metrics,
+ "loocv": None,
+ "winner_s2": None,
+ "winner_phenocam": None,
+ "winner_agreement": None,
+ "records": [],
+ }
+
+ if not eligible:
+ return result
+
+ records = run_loocv_site(site, year, lat, lon, s2_series, phenocam, efast)
+ loocv = summarize_loocv(records)
+ result["loocv"] = loocv
+ result["winner_s2"] = loocv["winner_s2"]
+ result["winner_phenocam"] = loocv["winner_phenocam"]
+ result["winner_agreement"] = loocv["winner_agreement"]
+ result["records"] = records
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Output / summary
+# ---------------------------------------------------------------------------
+
+
+def _compact_site_payload(site_result: dict[str, Any]) -> dict[str, Any]:
+ """Drop raw LOOCV records from JSON output (keep summaries only)."""
+ out = {
+ "eligible": site_result["eligible"],
+ "n_cloudfree_s2": site_result["n_cloudfree_s2"],
+ "representativeness": site_result["representativeness"],
+ "winner_s2": site_result.get("winner_s2"),
+ "winner_phenocam": site_result.get("winner_phenocam"),
+ "winner_agreement": site_result.get("winner_agreement"),
+ }
+ if site_result.get("loocv"):
+ out["loocv"] = site_result["loocv"]
+ return out
+
+
+def _print_summary(payload: dict[str, Any]) -> None:
+ year = payload["year"]
+ agg = payload["aggregate"]
+ print(
+ f"\nPhenoCam GCC suitability — {year} "
+ f"({payload['n_sites_total']} site(s), "
+ f"{payload['n_sites_eligible']} LOOCV-eligible, "
+ f"{payload['n_sites_repr_pass']} representative)"
+ )
+ print(f"Verdict: {agg['suitability_verdict']}")
+ print(
+ f" pooled Spearman (method errors): {agg.get('pooled_spearman', '—')} "
+ f"residual corr: {agg.get('residual_corr', '—')} "
+ f"winner agreement: {agg.get('winner_agreement_rate', '—')} "
+ f"LOOCV dates: {agg.get('n_loocv_dates', '—')}"
+ )
+ print(f"\n{'site':<28} {'repr r':>8} {'pass':>5} {'LOOCV n':>8} {'win agree':>10}")
+ print("-" * 65)
+ for site, data in sorted(payload["sites"].items()):
+ rep = data["representativeness"]
+ loocv_n = data.get("loocv", {}).get("n_dates") if data.get("loocv") else "—"
+ agree = data.get("winner_agreement")
+ agree_s = "yes" if agree else ("no" if agree is False else "—")
+ pass_s = "yes" if rep.get("representative") else "no"
+ print(
+ f"{site:<28} {rep.get('r') or '—':>8} {pass_s:>5} "
+ f"{loocv_n!s:>8} {agree_s:>10}"
+ )
+ if payload["n_sites_total"] < SMALL_SAMPLE_SITES:
+ print(
+ f"\nNote: only {payload['n_sites_total']} site(s); "
+ "interpret cross-site aggregates cautiously."
+ )
+ if payload.get("dropped_sites"):
+ print(f"Dropped/ineligible: {', '.join(payload['dropped_sites'])}")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
+ parser.add_argument(
+ "--min-cloudfree-s2",
+ type=int,
+ default=MIN_CLOUDFREE_S2,
+ help="Minimum cloud-free S2 dates for LOOCV (default 10)",
+ )
+ parser.add_argument(
+ "--alpha",
+ type=float,
+ default=DEFAULT_ALPHA,
+ help="Significance threshold (reserved; default 0.05)",
+ )
+ args = parser.parse_args()
+
+ year = args.evaluation_year
+ min_cloudfree = args.min_cloudfree_s2
+
+ sites = _discover_sites(year)
+ if not sites:
+ raise SystemExit(
+ f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
+ )
+
+ coords = _load_site_coords(year)
+ efast = _import_efast()
+
+ site_results: dict[str, dict[str, Any]] = {}
+ dropped: list[str] = []
+
+ for site in tqdm(sites, desc="Sites"):
+ if site not in coords:
+ dropped.append(site)
+ continue
+ lat, lon = coords[site]
+ site_results[site] = process_site(site, year, lat, lon, min_cloudfree, efast)
+ if not site_results[site]["eligible"]:
+ dropped.append(site)
+
+ n_eligible = sum(1 for s in site_results.values() if s["eligible"])
+ n_repr_pass = sum(
+ 1
+ for s in site_results.values()
+ if s["representativeness"].get("representative")
+ )
+
+ pooled = _pooled_concordance(list(site_results.values()))
+ verdict = _suitability_verdict(n_repr_pass, n_eligible, len(sites), pooled)
+
+ payload = {
+ "year": year,
+ "alpha": args.alpha,
+ "repr_r_threshold": REPR_R_THRESHOLD,
+ "min_cloudfree_s2": min_cloudfree,
+ "n_sites_total": len(sites),
+ "n_sites_eligible": n_eligible,
+ "n_sites_repr_pass": n_repr_pass,
+ "aggregate": {
+ "suitability_verdict": verdict,
+ **pooled,
+ },
+ "sites": {
+ site: _compact_site_payload(data)
+ for site, data in sorted(site_results.items())
+ },
+ "dropped_sites": sorted(set(dropped)),
+ }
+
+ out_dir = DATA_DIR / "gcc_suitability"
+ out_dir.mkdir(parents=True, exist_ok=True)
+ out_path = out_dir / f"{year}.json"
+ out_path.write_text(json.dumps(payload, separators=(",", ":")))
+
+ _print_summary(payload)
+ print(f"\nWritten → {out_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8f8092c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Felix Delattre
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..baf3ba7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,92 @@
+# EFAST fusion with phenocam validation.
+
+End-to-end pipeline from selecting sites from the global [PhenoCam Network](https://phenocam.nau.edu/) to run [EFAST](https://github.com/DHI-GRAS/efast) spatio-temporal fusion with Sentinel-2 / Sentinel-3 and validate GCCs accross sensors. The numbered steps cover site selection, Sentinel data acquisition, different fusion orders, accuracy metrics, and sample-level statistics, all feeding a static web QA viewer.
+
+---
+
+## Pipeline overview
+
+| Step | Script | What it does |
+|------|--------|--------------|
+| 1 | `1-phenocam.py` | Download PhenoCam metadata and `one_day_summary` GCC CSVs |
+| 2 | `2-phenocam-screening.py` | Apply PhenoCam count, SNR, and proximity gates to select feasible sites |
+| 3 | `3-sentinel-data.py` | Acquire S2 (Earth Search COG) and S3 OLCI SYN L2 (CDSE OpenEO); prepare REFL, DIST_CLOUD, and composite GeoTIFFs |
+| 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site |
+| 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest |
+| 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites |
+| 7 | `7-gcc-suitability.py` | PhenoCam GCC suitability as a fusion-accuracy reference (representativeness + LOOCV concordance) |
+
+---
+
+## Quick start
+
+### Run pipeline wrapper (recommended)
+
+```bash
+uv sync
+uv run python run-pipeline.py --evaluation-year 2025
+```
+
+Runs all five steps in order. Steps 1 and 2 are skipped when their output already exists. Each site in steps 3–5 is skipped when `data/metrics/{year}/{site}/metrics.json` is present. Any failure stops the run immediately, so one can fix the issue and re-run; completed work is never repeated.
+
+```bash
+# single site (steps 1 and 2 still skip if already done)
+uv run python run-pipeline.py --evaluation-year 2025 --site ICOSFR-Fon1
+```
+
+### Step by step
+
+```bash
+uv sync
+uv run python 1-phenocam.py --evaluation-year 2025
+uv run python 2-phenocam-screening.py --evaluation-year 2025
+uv run python 3-sentinel-data.py --evaluation-year 2025
+uv run python 4-fusion.py --evaluation-year 2025
+uv run python 5-metrics.py --evaluation-year 2025
+uv run python 6-statistics-fusion-order.py --evaluation-year 2025
+uv run python 7-gcc-suitability.py --evaluation-year 2025
+```
+
+Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Steps 6–7 are full-sample aggregates and only accept `--evaluation-year` (Step 6 and 7 also accept `--alpha`; Step 7 adds `--min-cloudfree-s2`, default `10`). Steps 3–5 are resumable — existing output files are skipped.
+
+```bash
+# single site
+uv run python 3-sentinel-data.py --evaluation-year 2025 --site ICOSFR-Fon1
+uv run python 4-fusion.py --evaluation-year 2025 --site ICOSFR-Fon1
+uv run python 5-metrics.py --evaluation-year 2025 --site ICOSFR-Fon1
+```
+
+### Credentials
+
+Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` and `CDSE_PASSWORD` in `../.env` at the workspace root. S2 uses AWS Earth Search COG range reads (no auth required).
+
+---
+
+## Outputs (under `data/`)
+
+| Artifact | Step | Role |
+|----------|------|------|
+| `phenocam/{year}.json` | 1 | Site list + `sites_dir` pointer |
+| `phenocam/{year}/{site}.json`, `{site}_1day.csv` | 1 | Raw API payload and GCC CSV |
+| `phenocam_screening/{year}.json` / `.csv` | 2 | Gate results (pass/fail per site) |
+| `sentinel_data/{year}/{site}/prepared/s2/` | 3 | S2 REFL + DIST_CLOUD GeoTIFFs |
+| `sentinel_data/{year}/{site}/prepared/s3/` | 3 | S3 composite GeoTIFFs |
+| `fusion/{year}/{site}/bti/`, `.../itb/` | 4 | BtI fused reflectance + GCC; ItB fused GCC |
+| `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON |
+| `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) |
+| `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) |
+| `gcc_suitability/{year}.json` | 7 | PhenoCam GCC suitability summary (representativeness + LOOCV concordance) |
+
+---
+
+## Web viewer
+
+
+
+`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`). The Statistics overlay GCC suitability tab uses step 7 output (`data/gcc_suitability/{year}.json`).
+
+---
+
+## License
+
+[MIT](LICENSE)
diff --git a/amoros2011multitemporal.pdf b/amoros2011multitemporal.pdf
deleted file mode 100644
index 5a49dff..0000000
--- a/amoros2011multitemporal.pdf
+++ /dev/null
@@ -1,9635 +0,0 @@
-%PDF-1.5
-%
-1 0 obj
-<<
-/OCProperties <<
-/D <<
-/RBGroups []
-/Order []
->>
-/OCGs [2 0 R]
->>
-/Type /Catalog
-/FICL#3AEnfocus 3 0 R
-/Metadata 4 0 R
-/Pages 5 0 R
->>
-endobj
-6 0 obj
-<<
-/Producer <7064665465582D312E34302E31303B206D6F646966696564207573696E67206954657874AE20372E312E3120A9323030302D323031382069546578742047726F7570204E5620284147504C2D76657273696F6E29>
-/IEEE#20Publication#20ID (5983499)
-/Meeting#20Ending#20Date (14 July 2011)
-/Trapped (False)
-/ModDate (D:20181126164003-05'00')
-/Subject (2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images \(Multi-Temp\);2011; ; ;10.1109/Multi-Temp.2011.6005053)
-/Application ('Certified by IEEE PDFeXpress at 06/17/2011 3:24:51 AM')
-/PTEX.Fullbanner (This is pdfTeX, Version 3.1415926-1.40.10-2.2 \(TeX Live 2009/Debian\) kpathsea version 5.0.0)
-/Creator ('Certified by IEEE PDFeXpress at 06/17/2011 3:24:51 AM')
-/IEEE#20Issue#20ID (6005026)
-/IEEE#20Article#20ID (6005053)
-/Title (Multitemporal fusion of Landsat and MERIS images)
-/Meeting#20Starting#20Date (12 July 2011)
-/CreationDate (D:20110617121830+02'00')
->>
-endobj
-2 0 obj
-<<
-/Usage <<
-/PageElement <<
-/Subtype /HF
->>
->>
-/Type /OCG
-/Name (Headers/Footers)
->>
-endobj
-3 0 obj
-<<
-/CertifiedWF 7 0 R
->>
-endobj
-4 0 obj
-<<
-/Length 3528
-/Type /Metadata
-/Subtype /XML
->>
-stream
-
-
-
-application/pdfIEEE2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images (Multi-Temp);2011; ; ;10.1109/Multi-Temp.2011.6005053Multi-resolutiondata fusionspatial unmixingsub-pixelLandsat TMMERISMultitemporal fusion of Landsat and MERIS imagesJ. Amoros-LopezL. Gomez-ChovaL. GuanterL. AlonsoJ. MorenoG. Camps-Valls
-2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images (Multi-Temp)81 July 201110.1109/Multi-Temp.2011.600505384
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-endstream
-endobj
-5 0 obj
-<<
-/Type /Pages
-/Count 4
-/Kids [8 0 R 9 0 R 10 0 R 11 0 R]
->>
-endobj
-7 0 obj
-<<
-/Stamp [61 158 234 37 46 215 123 102 121 190
-16 110 0 160 172 209 121 174 11 44]
-/Version 1
-/Sessions [12 0 R 13 0 R 14 0 R 15 0 R]
->>
-endobj
-8 0 obj
-<<
-/Parent 5 0 R
-/Type /Page
-/Contents [16 0 R 17 0 R 18 0 R]
-/Resources <<
-/ExtGState <<
-/GS0 19 0 R
->>
-/XObject <<
-/Fm0 20 0 R
-/Fm1 21 0 R
-/Fm2 22 0 R
->>
-/ProcSet [/PDF /Text]
-/Font <<
-/T1_3 23 0 R
-/T1_1 24 0 R
-/T1_4 25 0 R
-/T1_2 26 0 R
-/T1_5 27 0 R
-/T1_6 28 0 R
-/T1_0 29 0 R
-/T1_7 30 0 R
-/F9 31 0 R
->>
->>
-/MediaBox [0 0 612 792]
->>
-endobj
-9 0 obj
-<<
-/Group 32 0 R
-/Parent 5 0 R
-/Type /Page
-/Contents [33 0 R 34 0 R 35 0 R]
-/Resources <<
-/ExtGState <<
-/GS0 36 0 R
->>
-/XObject <<
-/Fm0 37 0 R
-/Fm1 38 0 R
-/Fm2 39 0 R
-/Im3 40 0 R
-/Fm3 41 0 R
-/Im1 42 0 R
-/Im2 43 0 R
-/Im0 44 0 R
->>
-/ProcSet [/PDF /Text /ImageC]
-/Font <<
-/T1_1 23 0 R
-/T1_2 30 0 R
-/T1_0 29 0 R
-/F4 31 0 R
->>
->>
-/MediaBox [0 0 612 792]
->>
-endobj
-10 0 obj
-<<
-/Group 45 0 R
-/Parent 5 0 R
-/Type /Page
-/Contents [46 0 R 47 0 R 48 0 R]
-/Resources <<
-/ExtGState <<
-/GS0 49 0 R
->>
-/XObject <<
-/Fm0 50 0 R
-/Fm1 51 0 R
-/Fm2 52 0 R
-/Fm3 53 0 R
-/Fm4 54 0 R
-/Fm5 55 0 R
-/Fm6 56 0 R
->>
-/ProcSet [/PDF /Text]
-/Font <<
-/T1_3 28 0 R
-/T1_1 23 0 R
-/T1_4 30 0 R
-/T1_2 27 0 R
-/T1_0 29 0 R
-/F6 31 0 R
->>
->>
-/MediaBox [0 0 612 792]
->>
-endobj
-11 0 obj
-<<
-/Group 57 0 R
-/Parent 5 0 R
-/Type /Page
-/Contents [58 0 R 59 0 R 60 0 R]
-/Resources <<
-/ExtGState <<
-/GS0 61 0 R
->>
-/XObject <<
-/Fm0 62 0 R
-/Fm1 63 0 R
-/Fm2 64 0 R
-/Fm3 65 0 R
->>
-/ProcSet [/PDF /Text]
-/Font <<
-/T1_1 23 0 R
-/T1_2 24 0 R
-/T1_0 29 0 R
-/F4 31 0 R
->>
->>
-/MediaBox [0 0 612 792]
->>
-endobj
-12 0 obj
-<<
-/Length 25686
-/Stamp <03C9E20B79A667795E349A0D00A115EE>
-/Version 1
-/Filter /FlateDecode
->>
-stream
-x اg]zZ
-@@@Nc$@ $d@23~Ģ`ۨRE7\Ń
-EqU+߳ ʜSk{}߾\r|q./_e}?WO߿o{ǟwxި雞~z#sYnCJЍO5GN=k7={g1ġG#KxQZd֚?Ъ>r!ӛ4^?ҽO{ltvo7<{W]!zgwo}O=ȭx[Nt䖛wo9sl30ǟ;zӷcs郸2'Nߓ̙
|ٳ\KN߱xpGn%[eţ+vۿ:?~gzQ>욫x]Mӷ9rjԇˎçn.UCM~hVC]ܩ#GW$==%)>O?$Ї8+`b_yꙛW:~mgo5'?}{~ 9vbo9yTV;ʎc.\y>FntVtV{3?rYo>#kNrYTzC.
æ~هݏܽi#8F Fƫ=?&w\~g`g7(jMh~rfy3G7nG͵_xI+.:3{reOtdžvm`o?9R?/{79u'^eyZe":nnl|gOŹ/K
-$NN!~m.h>7I{'e/5u'$LiS;ԏa<
;q柶f+Y-Ux+_qqLX">5ӎnͥ>LDyJ_
i?dZՃgmM?4v}{ WKY~|L}]5_%/懾J:m6zv Gns5L\mtT̛q U*gڨg+9} ,Q33O;~vc_
-cvudG3&1Hgy}n/"~G,ۭ}K}~lO`;Nc˳Wy\B ϺJ&=2ʸeeOnk7yCw__9B:`
-Tgl9Ck?n>_>!]*F30H觨_4ZCШiyfЭOױb̠zSfl/6@h`: /vPA^|߄_.lWv8ufgV'݀dC7?D3h&@qI9qeO~۱|UM;*'#4]&vG#ŝ3P6
-1ac ڪ1N
-bXv2YkݎYu;19Vs
-ouXv_m Hh+/H