diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 369ceb2..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,75 +0,0 @@
-# Generated caches and downloads (regenerate via pipeline steps)
-data/
-
-# Environment and secrets
-.env
-.env.*
-!.env.example
-.venv/
-venv/
-env/
-ENV/
-
-# Python
-__pycache__/
-*.py[cod]
-*$py.class
-*.so
-*.pyd
-.Python
-
-# uv
-.uv/
-
-# Testing & coverage
-.pytest_cache/
-.coverage
-.coverage.*
-coverage.xml
-nosetests.xml
-htmlcov/
-.tox/
-
-# Linting & type checking
-.ruff_cache/
-.mypy_cache/
-.dmypy.json
-dmypy.json
-.pytype/
-
-# Distribution / packaging
-build/
-dist/
-*.egg-info/
-*.egg
-MANIFEST
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Logs
-*.log
-nohup.out
-
-# Jupyter
-.ipynb_checkpoints/
-profile_default/
-ipython_config.py
-
-# IDE
-.vscode/
-.idea/
-*.swp
-*.swo
-*~
-
-# OS
-.DS_Store
-Thumbs.db
-desktop.ini
-
-# Merge artifacts
-*.orig
-*.rej
-
-# Agents
-AGENTS.md
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
deleted file mode 100644
index 6ba9060..0000000
--- a/.pre-commit-config.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-repos:
- - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.8.4
- hooks:
- - id: ruff
- args: [--fix]
- - id: ruff-format
-
diff --git a/.python-version b/.python-version
deleted file mode 100644
index 3e72aa6..0000000
--- a/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.11.10
diff --git a/1-phenocam.py b/1-phenocam.py
deleted file mode 100644
index 89ca797..0000000
--- a/1-phenocam.py
+++ /dev/null
@@ -1,284 +0,0 @@
-"""Step 1: download worldwide PhenoCam sites for a calendar year.
-
-Inputs (``data/``): none — queries the PhenoCam API.
-
-Outputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``phenocam/{year}.json`` — site list manifest
-- ``phenocam/{year}/{sitename}.json`` — camera + ROI metadata
-- ``phenocam/{year}/{sitename}_1day.csv`` — ``one_day_summary`` GCC CSV
-
-CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional comma-separated filter).
-
-Next step: :mod:`2-phenocam-screening`.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from datetime import date
-from pathlib import Path
-from typing import Any
-
-import requests
-
-PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
-if str(PROCESSING_DIR) not in sys.path:
- sys.path.insert(0, str(PROCESSING_DIR))
-
-from acquisition_phenocam import PHENOCAM_API # noqa: E402
-from acquisition_phenocam_all_europe import _paginate_cameras, _parse_iso_date # noqa: E402
-
-EVALUATION_YEAR = 2025
-HOST_PROBE = "https://phenocam.nau.edu/api/cameras/?limit=1"
-ONE_DAY_CSV_SUFFIX = "_1day.csv"
-
-
-def check_phenocam_host() -> None:
- try:
- response = requests.get(HOST_PROBE, timeout=30)
- response.raise_for_status()
- except requests.RequestException as exc:
- raise RuntimeError(
- f"PhenoCam API unreachable (phenocam.nau.edu): "
- f"{exc.__class__.__name__}: {exc}"
- ) from exc
-
-
-def _overlaps_year(first: str | None, last: str | None, season: int) -> bool:
- start = _parse_iso_date(first)
- end = _parse_iso_date(last)
- if start is None or end is None:
- return False
- return start <= date(season, 12, 31) and end >= date(season, 1, 1)
-
-
-def sites_dir(cache_dir: Path, evaluation_year: int) -> Path:
- return cache_dir / "phenocam" / str(evaluation_year)
-
-
-def site_json_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
- return sites_dir(cache_dir, evaluation_year) / f"{sitename}.json"
-
-
-def site_csv_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
- return sites_dir(cache_dir, evaluation_year) / f"{sitename}{ONE_DAY_CSV_SUFFIX}"
-
-
-def load_candidate_cameras(
- evaluation_year: int,
- *,
- site_filter: set[str] | None = None,
- active_only: bool = False,
- limit: int | None = None,
-) -> list[dict[str, Any]]:
- cameras: list[dict[str, Any]] = []
- for camera in _paginate_cameras():
- if active_only and not camera.get("active"):
- continue
- sitename = str(camera["Sitename"])
- if site_filter is not None and sitename not in site_filter:
- continue
- if not _overlaps_year(
- camera.get("date_first"), camera.get("date_last"), evaluation_year
- ):
- continue
- cameras.append(dict(camera))
- cameras.sort(key=lambda item: str(item["Sitename"]))
- if limit is not None:
- cameras = cameras[:limit]
- return cameras
-
-
-def fetch_roi_record(site_name: str) -> dict[str, Any] | None:
- rois: list[dict[str, Any]] = []
- url = f"{PHENOCAM_API}/roilists/"
- params: dict[str, Any] | None = {"site": site_name}
- while url:
- response = requests.get(url, params=params, timeout=60)
- response.raise_for_status()
- payload = response.json()
- rois.extend(
- item for item in payload.get("results", []) if item.get("site") == site_name
- )
- url = payload.get("next")
- params = None
- if rois:
- break
- return dict(rois[0]) if rois else None
-
-
-def download_one_day_csv(csv_url: str, output_path: Path) -> None:
- response = requests.get(csv_url, timeout=60)
- response.raise_for_status()
- output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(response.text, encoding="utf-8")
-
-
-def download_site(
- camera: dict[str, Any],
- evaluation_year: int,
- cache_dir: Path,
-) -> str:
- sitename = str(camera["Sitename"])
- roi = fetch_roi_record(sitename)
- payload = {"response": {"camera": camera, "roi": roi}}
- json_path = site_json_path(cache_dir, evaluation_year, sitename)
- json_path.parent.mkdir(parents=True, exist_ok=True)
- json_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
-
- csv_url = roi.get("one_day_summary") if roi else None
- if csv_url:
- download_one_day_csv(
- csv_url, site_csv_path(cache_dir, evaluation_year, sitename)
- )
- return sitename
-
-
-def load_or_download_site(
- camera: dict[str, Any],
- evaluation_year: int,
- cache_dir: Path,
- *,
- refresh: bool,
-) -> str:
- sitename = str(camera["Sitename"])
- json_path = site_json_path(cache_dir, evaluation_year, sitename)
- csv_path = site_csv_path(cache_dir, evaluation_year, sitename)
- if not refresh and json_path.is_file():
- if not csv_path.is_file():
- payload = json.loads(json_path.read_text(encoding="utf-8"))
- roi = payload.get("response", {}).get("roi") or {}
- csv_url = roi.get("one_day_summary")
- if csv_url:
- download_one_day_csv(csv_url, csv_path)
- return sitename
- return download_site(camera, evaluation_year, cache_dir)
-
-
-def run_download(
- *,
- cache_dir: Path,
- evaluation_year: int,
- active_only: bool = False,
- site_filter: set[str] | None = None,
- limit: int | None = None,
- refresh: bool = False,
-) -> list[str]:
- check_phenocam_host()
- candidates = load_candidate_cameras(
- evaluation_year,
- site_filter=site_filter,
- active_only=active_only,
- limit=limit,
- )
- print(
- f"[PhenoCam-1] {len(candidates)} candidate(s) with archive overlap for "
- f"{evaluation_year}"
- )
-
- sitenames: list[str] = []
- for index, camera in enumerate(candidates, start=1):
- sitename = str(camera["Sitename"])
- print(
- f"[PhenoCam-1] ({index}/{len(candidates)}) {sitename} "
- f"({float(camera['Lat']):.4f}, {float(camera['Lon']):.4f})"
- )
- sitenames.append(
- load_or_download_site(
- camera,
- evaluation_year,
- cache_dir,
- refresh=refresh,
- )
- )
- return sorted(sitenames)
-
-
-def write_manifest(
- sitenames: list[str],
- output_path: Path,
- cache_dir: Path,
- evaluation_year: int,
-) -> None:
- rel_sites_dir = sites_dir(cache_dir, evaluation_year).relative_to(
- output_path.parent
- )
- payload = {
- "evaluation_year": evaluation_year,
- "count": len(sitenames),
- "sites_dir": rel_sites_dir.as_posix(),
- "sites": sitenames,
- }
- output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
- print(f"[PhenoCam-1] Wrote {output_path}")
-
-
-def main(argv: list[str] | None = None) -> int:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "--cache-dir",
- type=Path,
- default=Path("data"),
- help="Base directory for per-site files and manifest",
- )
- parser.add_argument(
- "--evaluation-year",
- type=int,
- default=EVALUATION_YEAR,
- help=f"Calendar year to download (default: {EVALUATION_YEAR})",
- )
- parser.add_argument(
- "--active-only",
- action="store_true",
- help="Restrict candidates to cameras marked active in the API",
- )
- parser.add_argument(
- "--limit",
- type=int,
- default=None,
- help="Process only the first N candidate sites (testing)",
- )
- parser.add_argument(
- "--sites",
- type=str,
- default=None,
- help="Comma-separated sitenames to download (testing)",
- )
- parser.add_argument(
- "--refresh",
- action="store_true",
- help="Re-download sites even when cache files exist",
- )
- parser.add_argument(
- "--output-json",
- type=Path,
- default=None,
- help="Manifest output path (default: data/phenocam/{year}.json)",
- )
- args = parser.parse_args(argv)
-
- site_filter = None
- if args.sites:
- site_filter = {name.strip() for name in args.sites.split(",") if name.strip()}
-
- sitenames = run_download(
- cache_dir=args.cache_dir,
- evaluation_year=args.evaluation_year,
- active_only=args.active_only,
- site_filter=site_filter,
- limit=args.limit,
- refresh=args.refresh,
- )
- manifest_path = args.output_json or (
- args.cache_dir / "phenocam" / f"{args.evaluation_year}.json"
- )
- write_manifest(sitenames, manifest_path, args.cache_dir, args.evaluation_year)
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/2-phenocam-screening.py b/2-phenocam-screening.py
deleted file mode 100644
index 5848ae9..0000000
--- a/2-phenocam-screening.py
+++ /dev/null
@@ -1,494 +0,0 @@
-"""Step 2: PhenoCam GCC + SNR screening on step-1 cache.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``phenocam/{year}.json`` — step-1 manifest
-- ``phenocam/{year}/{sitename}.json`` — per-site metadata
-- ``phenocam/{year}/{sitename}_1day.csv`` — GCC timeseries
-
-Outputs (``data/phenocam_screening/``):
-
-- ``{year}.json`` — full per-site results
-- ``{year}.csv`` — flat summary table
-
-CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional; default: all manifest sites).
-
-Next step: :mod:`3-sentinel-clouds`.
-"""
-
-from __future__ import annotations
-
-import argparse
-import csv
-import json
-import math
-import sys
-from datetime import date, datetime
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-from scipy.interpolate import UnivariateSpline
-
-PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
-if str(PROCESSING_DIR) not in sys.path:
- sys.path.insert(0, str(PROCESSING_DIR))
-
-from acquisition_phenocam import _phenocam_summary_gcc_value # noqa: E402
-
-MIN_GCC_POINTS = 30
-SNR_THRESHOLD = 2.0
-CLUSTER_RADIUS_M = 500.0
-GATE_ORDER = ("phenocam", "snr", "cluster")
-ONE_DAY_CSV_SUFFIX = "_1day.csv"
-_EARTH_RADIUS_M = 6371000.0
-
-
-def load_manifest(path: Path) -> dict[str, Any]:
- payload = json.loads(path.read_text(encoding="utf-8"))
- for key in ("evaluation_year", "sites_dir", "sites"):
- if key not in payload:
- raise ValueError(f"Expected '{key}' in manifest {path}")
- return payload
-
-
-def resolve_sites_dir(manifest_path: Path, manifest: dict[str, Any]) -> Path:
- return (manifest_path.parent / manifest["sites_dir"]).resolve()
-
-
-def load_site_entry(sites_dir: Path, sitename: str) -> dict[str, Any]:
- json_path = sites_dir / f"{sitename}.json"
- payload = json.loads(json_path.read_text(encoding="utf-8"))
- csv_path = sites_dir / f"{sitename}{ONE_DAY_CSV_SUFFIX}"
- payload["_one_day_csv"] = csv_path if csv_path.is_file() else None
- return payload
-
-
-def parse_gcc90_series(csv_path: Path, evaluation_year: int) -> list[tuple[str, float]]:
- lines = [
- line
- for line in csv_path.read_text(encoding="utf-8").split("\n")
- if line and not line.startswith("#")
- ]
- reader = csv.DictReader(lines)
- fieldnames = reader.fieldnames or ()
- use_mean_fallback = "gcc_90" not in fieldnames
-
- year_start = date(evaluation_year, 1, 1)
- year_end = date(evaluation_year, 12, 31)
- series: list[tuple[str, float]] = []
- for row in reader:
- date_str = row.get("date")
- if not date_str:
- continue
- try:
- row_date = datetime.strptime(date_str, "%Y-%m-%d").date()
- except ValueError:
- continue
- if not (year_start <= row_date <= year_end):
- continue
- gcc = _phenocam_summary_gcc_value(row, use_mean_fallback)
- if gcc is None:
- continue
- series.append((row_date.isoformat(), float(gcc)))
- series.sort(key=lambda item: item[0])
- return series
-
-
-def _months_covered(day_strings: list[str]) -> int:
- months: set[int] = set()
- for day in day_strings:
- months.add(datetime.strptime(day, "%Y-%m-%d").month)
- return len(months)
-
-
-def _aic_for_spline(x: np.ndarray, y: np.ndarray, spline: UnivariateSpline) -> float:
- residuals = y - spline(x)
- rss = float(np.sum(residuals**2))
- n = len(y)
- if rss <= 0 or n < 4:
- return math.inf
- edf = float(spline.get_knots().shape[0] + spline.get_coeffs().shape[0])
- return n * math.log(rss / n) + 2.0 * edf
-
-
-def compute_snr_aic_spline(series: list[tuple[str, float]]) -> float | None:
- if len(series) < MIN_GCC_POINTS:
- return None
-
- dates = [datetime.strptime(day, "%Y-%m-%d").date() for day, _ in series]
- x = np.array([(d - dates[0]).days for d in dates], dtype=float)
- y = np.array([value for _, value in series], dtype=float)
- if len(np.unique(x)) < 5:
- return None
-
- y_var = float(np.var(y))
- if y_var <= 0:
- return None
-
- candidates = np.logspace(-4, 2, 40) * y_var * len(y)
- best_spline: UnivariateSpline | None = None
- best_aic = math.inf
- for smoothing in candidates:
- try:
- spline = UnivariateSpline(x, y, k=3, s=float(smoothing))
- except Exception:
- continue
- aic = _aic_for_spline(x, y, spline)
- if aic < best_aic:
- best_aic = aic
- best_spline = spline
-
- if best_spline is None:
- return None
-
- residuals = y - best_spline(x)
- rmse = float(np.sqrt(np.mean(residuals**2)))
- amplitude = float(np.max(y) - np.min(y))
- if rmse <= 0:
- return None
- return amplitude / rmse
-
-
-def screen_site(
- site_entry: dict[str, Any],
- *,
- evaluation_year: int,
- min_gcc_points: int,
- snr_threshold: float,
-) -> dict[str, Any]:
- response = site_entry["response"]
- roi = response.get("roi")
- csv_path = site_entry.get("_one_day_csv")
- calculations: dict[str, Any] = {
- "evaluation_year": evaluation_year,
- "n_gcc_points": 0,
- "first_gcc_date": None,
- "last_gcc_date": None,
- "months_with_gcc": 0,
- "snr": None,
- "min_gcc_points": min_gcc_points,
- "snr_threshold": snr_threshold,
- "status": "FAIL",
- "failing_gate": None,
- "passed_gates": [],
- "reason": None,
- }
-
- if roi is None or not roi.get("one_day_summary") or csv_path is None:
- calculations["failing_gate"] = "phenocam"
- calculations["reason"] = "no_roi"
- return {"response": response, "calculations": calculations}
-
- series = parse_gcc90_series(csv_path, evaluation_year)
- calculations["n_gcc_points"] = len(series)
- if calculations["n_gcc_points"] == 0:
- calculations["failing_gate"] = "phenocam"
- calculations["reason"] = "no_gcc_in_year"
- return {"response": response, "calculations": calculations}
-
- day_strings = [day for day, _ in series]
- calculations["first_gcc_date"] = day_strings[0]
- calculations["last_gcc_date"] = day_strings[-1]
- calculations["months_with_gcc"] = _months_covered(day_strings)
-
- if calculations["n_gcc_points"] < min_gcc_points:
- calculations["failing_gate"] = "phenocam"
- calculations["reason"] = "insufficient_gcc_points"
- return {"response": response, "calculations": calculations}
-
- calculations["passed_gates"].append("phenocam")
-
- snr = compute_snr_aic_spline(series)
- calculations["snr"] = snr
- if snr is None or snr < snr_threshold:
- calculations["failing_gate"] = "snr"
- calculations["reason"] = (
- "insufficient_snr" if snr is not None else "snr_undefined"
- )
- return {"response": response, "calculations": calculations}
-
- calculations["passed_gates"].append("snr")
- calculations["status"] = "PASS"
- calculations["failing_gate"] = None
- calculations["reason"] = None
- return {"response": response, "calculations": calculations}
-
-
-def _haversine_m(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
- p1, p2 = math.radians(lat1), math.radians(lat2)
- dlat = math.radians(lat2 - lat1)
- dlon = math.radians(lon2 - lon1)
- a = math.sin(dlat / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlon / 2) ** 2
- return 2 * _EARTH_RADIUS_M * math.asin(math.sqrt(a))
-
-
-def _site_coords(row: dict[str, Any]) -> tuple[float, float] | None:
- camera = row["response"]["camera"]
- lat, lon = camera.get("Lat"), camera.get("Lon")
- if lat is None or lon is None:
- return None
- return float(lat), float(lon)
-
-
-def _cluster_rank(row: dict[str, Any]) -> tuple[int, float]:
- calc = row["calculations"]
- return calc["n_gcc_points"], float(calc.get("snr") or 0.0)
-
-
-def apply_cluster_gate(results: list[dict[str, Any]], *, radius_m: float) -> int:
- pool: list[tuple[int, float, float]] = []
- for idx, row in enumerate(results):
- if "snr" not in row["calculations"]["passed_gates"]:
- continue
- coords = _site_coords(row)
- if coords is None:
- row["calculations"]["passed_gates"].append("cluster")
- continue
- pool.append((idx, coords[0], coords[1]))
-
- n = len(pool)
- parent = list(range(n))
-
- def find(x: int) -> int:
- while parent[x] != x:
- parent[x] = parent[parent[x]]
- x = parent[x]
- return x
-
- def union(a: int, b: int) -> None:
- ra, rb = find(a), find(b)
- if ra != rb:
- parent[rb] = ra
-
- for i in range(n):
- _, lat1, lon1 = pool[i]
- for j in range(i + 1, n):
- _, lat2, lon2 = pool[j]
- if _haversine_m(lat1, lon1, lat2, lon2) <= radius_m:
- union(i, j)
-
- clusters: dict[int, list[int]] = {}
- for i in range(n):
- clusters.setdefault(find(i), []).append(i)
-
- demoted = 0
- for members in clusters.values():
- result_indices = [pool[i][0] for i in members]
- cluster_size = len(result_indices)
- winner_idx = max(result_indices, key=lambda idx: _cluster_rank(results[idx]))
- winner_name = str(results[winner_idx]["response"]["camera"]["Sitename"])
- for idx in result_indices:
- calc = results[idx]["calculations"]
- calc["cluster_size"] = cluster_size
- if idx == winner_idx:
- calc["passed_gates"].append("cluster")
- else:
- calc["status"] = "FAIL"
- calc["failing_gate"] = "cluster"
- calc["reason"] = "nearby_duplicate"
- calc["cluster_winner"] = winner_name
- demoted += 1
- return demoted
-
-
-def run_screening(
- manifest: dict[str, Any],
- sites_dir: Path,
- *,
- evaluation_year: int,
- min_gcc_points: int,
- snr_threshold: float,
- site_filter: set[str] | None = None,
-) -> list[dict[str, Any]]:
- results: list[dict[str, Any]] = []
- sitenames = manifest["sites"]
- if site_filter is not None:
- sitenames = [name for name in sitenames if name in site_filter]
- for index, sitename in enumerate(sitenames, start=1):
- print(f"[PhenoCam-2] ({index}/{len(sitenames)}) {sitename}")
- site_entry = load_site_entry(sites_dir, sitename)
- results.append(
- screen_site(
- site_entry,
- evaluation_year=evaluation_year,
- min_gcc_points=min_gcc_points,
- snr_threshold=snr_threshold,
- )
- )
- return results
-
-
-def print_summary(results: list[dict[str, Any]], evaluation_year: int) -> None:
- passing = [row for row in results if row["calculations"]["status"] == "PASS"]
- gates_label = " + ".join(GATE_ORDER)
- print(
- f"\n[PhenoCam-2] Screening for {evaluation_year}: "
- f"{len(passing)}/{len(results)} pass ({gates_label})"
- )
-
- for gate in GATE_ORDER:
- fails = sum(1 for row in results if row["calculations"]["failing_gate"] == gate)
- after = sum(1 for row in results if gate in row["calculations"]["passed_gates"])
- print(f" after_{gate}: {after}, fail_at_{gate}: {fails}")
-
- print("\nPer-site table")
- print(f"{'site':<24} {'n':>4} {'mon':>3} {'snr':>6} {'status':>6} gate reason")
- print("-" * 72)
- for row in sorted(
- results,
- key=lambda item: str(item["response"]["camera"]["Sitename"]),
- ):
- camera = row["response"]["camera"]
- calc = row["calculations"]
- snr_text = f"{calc['snr']:.2f}" if calc["snr"] is not None else ""
- print(
- f"{camera['Sitename']:<24} {calc['n_gcc_points']:4d} "
- f"{calc['months_with_gcc']:3d} {snr_text:>6} "
- f"{calc['status']:>6} {(calc['failing_gate'] or '-'):<8} "
- f"{calc['reason'] or '-'}"
- )
-
-
-def write_screening_json(
- results: list[dict[str, Any]],
- output_path: Path,
- evaluation_year: int,
-) -> None:
- passing = [row for row in results if row["calculations"]["status"] == "PASS"]
- payload = {
- "evaluation_year": evaluation_year,
- "count": len(results),
- "qualifying_count": len(passing),
- "sites": sorted(
- results,
- key=lambda item: str(item["response"]["camera"]["Sitename"]),
- ),
- }
- output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
- print(f"[PhenoCam-2] Wrote {output_path}")
-
-
-def write_screening_csv(results: list[dict[str, Any]], output_path: Path) -> None:
- rows: list[dict[str, Any]] = []
- for row in results:
- camera = row["response"]["camera"]
- metadata = camera.get("sitemetadata") or {}
- roi = row["response"].get("roi") or {}
- calc = row["calculations"]
- rows.append(
- {
- "Sitename": camera.get("Sitename"),
- "Lat": camera.get("Lat"),
- "Lon": camera.get("Lon"),
- "site_description": metadata.get("site_description"),
- "primary_veg_type": metadata.get("primary_veg_type"),
- "site_type": metadata.get("site_type"),
- "one_day_summary": roi.get("one_day_summary"),
- **calc,
- }
- )
- fieldnames = list(rows[0].keys()) if rows else ["Sitename", "status"]
- if rows:
- extra = [k for row in rows for k in row if k not in fieldnames]
- fieldnames.extend(dict.fromkeys(extra))
- output_path.parent.mkdir(parents=True, exist_ok=True)
- with output_path.open("w", encoding="utf-8", newline="") as handle:
- writer = csv.DictWriter(handle, fieldnames=fieldnames)
- writer.writeheader()
- writer.writerows(rows)
- print(f"[PhenoCam-2] Wrote {output_path}")
-
-
-def main(argv: list[str] | None = None) -> int:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "--evaluation-year",
- type=int,
- default=2025,
- help="Evaluation year (default: 2025)",
- )
- parser.add_argument(
- "--sites",
- type=str,
- default=None,
- help="Comma-separated sitenames (default: all sites in step-1 manifest)",
- )
- parser.add_argument(
- "--min-gcc-points",
- type=int,
- default=MIN_GCC_POINTS,
- help=f"Minimum valid gcc_90 observations in-year (default: {MIN_GCC_POINTS})",
- )
- parser.add_argument(
- "--snr-threshold",
- type=float,
- default=SNR_THRESHOLD,
- help=f"Minimum AIC-spline SNR (default: {SNR_THRESHOLD})",
- )
- parser.add_argument(
- "--output-json",
- type=Path,
- default=None,
- help="Screening output (default: data/phenocam_screening/{year}.json)",
- )
- parser.add_argument(
- "--output-csv",
- type=Path,
- default=None,
- help="Flat CSV summary path",
- )
- parser.add_argument(
- "--cluster-radius-m",
- type=float,
- default=CLUSTER_RADIUS_M,
- help=f"Deduplicate SNR-passed sites within this radius (default: {CLUSTER_RADIUS_M})",
- )
- parser.add_argument(
- "--no-cluster",
- action="store_true",
- help="Skip nearby-site deduplication gate",
- )
- args = parser.parse_args(argv)
-
- evaluation_year = args.evaluation_year
- manifest_path = Path("data") / "phenocam" / f"{evaluation_year}.json"
- if not manifest_path.is_file():
- raise SystemExit(f"Step-1 manifest not found: {manifest_path}")
-
- site_filter = None
- if args.sites:
- site_filter = {name.strip() for name in args.sites.split(",") if name.strip()}
-
- manifest = load_manifest(manifest_path)
- sites_dir_path = resolve_sites_dir(manifest_path, manifest)
-
- results = run_screening(
- manifest,
- sites_dir_path,
- evaluation_year=evaluation_year,
- min_gcc_points=args.min_gcc_points,
- snr_threshold=args.snr_threshold,
- site_filter=site_filter,
- )
- if not args.no_cluster:
- demoted = apply_cluster_gate(results, radius_m=args.cluster_radius_m)
- if demoted:
- print(f"[PhenoCam-2] Cluster dedup: demoted {demoted} nearby duplicate(s)")
- print_summary(results, evaluation_year)
-
- default_dir = Path("data") / "phenocam_screening"
- json_name = f"{evaluation_year}.json"
- csv_name = f"{evaluation_year}.csv"
- write_screening_json(
- results,
- args.output_json or (default_dir / json_name),
- evaluation_year,
- )
- write_screening_csv(results, args.output_csv or (default_dir / csv_name))
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/3-sentinel-data.py b/3-sentinel-data.py
deleted file mode 100644
index 75877bf..0000000
--- a/3-sentinel-data.py
+++ /dev/null
@@ -1,931 +0,0 @@
-"""Step 3: Download S2 and S3 rasters and prepare EFAST inputs.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``phenocam_screening/{year}.json`` — step-2 PASS sites (coordinates included)
-
-Outputs (``data/``):
-
-- ``sentinel_data/{year}/{sitename}/raw/s3/*.tif`` — S3 SYN L2 per-date GeoTIFFs
-- ``sentinel_data/{year}/{sitename}/prepared/s2/`` — S2 REFL + DIST_CLOUD GeoTIFFs
-- ``sentinel_data/{year}/{sitename}/prepared/s3/`` — S3 composite GeoTIFFs
-- ``sentinel_data/{year}/{sitename}/data.json`` — run summary
-
-Requires ``CDSE_USER`` / ``CDSE_PASSWORD`` in ``../.env`` (workspace root; ``uv sync`` installs efast).
-
-CLI:
-
-- ``--evaluation-year`` (default 2025)
-- ``--site`` (optional; default: all step-2 PASS sites)
-
-Prior step: :mod:`2-phenocam-screening`.
-Next step: :mod:`4-fusion`.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import shutil
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-import netCDF4
-import numpy as np
-import openeo
-import rasterio
-import requests
-from dotenv import load_dotenv
-from pystac_client import Client
-from rasterio import shutil as rio_shutil
-from rasterio.enums import Resampling
-from rasterio.errors import WindowError
-from rasterio.transform import from_bounds
-from rasterio.vrt import WarpedVRT
-from rasterio.warp import transform_geom
-from rasterio.windows import Window
-from rasterio.windows import from_bounds as window_from_bounds
-from rasterio.windows import transform as window_transform
-from shapely import wkt as shapely_wkt
-from tqdm import tqdm
-
-# ---------------------------------------------------------------------------
-# Public constants — edit here to change pipeline behaviour
-# ---------------------------------------------------------------------------
-
-S2_BANDS = ["B02", "B03", "B04"]
-
-S3_BANDS = [
- "Syn_Oa04_reflectance",
- "Syn_Oa06_reflectance",
- "Syn_Oa08_reflectance",
- "Syn_Oa17_reflectance",
-]
-S3_BAND_NAMES = ["SDR_Oa04", "SDR_Oa06", "SDR_Oa08", "SDR_Oa17"]
-
-RESOLUTION_RATIO = 30
-S3_MOSAIC_DAYS = 100
-S3_COMPOSITE_STEP = 2
-S3_COMPOSITE_SIGMA_DOY = 10
-S3_COMPOSITE_D = 20
-S3_SMOOTHING_STD = 1
-S3_REFLECTANCE_SCALE = 10_000 # OpenEO SYN L2 SDR → 0–1 (EFAST expects < 5)
-
-# ---------------------------------------------------------------------------
-# Internal S2 constants
-# ---------------------------------------------------------------------------
-
-EARTH_SEARCH_URL = "https://earth-search.aws.element84.com/v1"
-
-_BAND_ASSETS: dict[str, str] = {
- "B02": "blue",
- "B03": "green",
- "B04": "red",
- "B05": "rededge1",
- "B06": "rededge2",
- "B07": "rededge3",
- "B08": "nir",
- "B8A": "nir08",
- "B11": "swir16",
- "B12": "swir22",
-}
-_SCL_ASSET = "scl"
-_MIN_BBOX_HALF_DEG = 0.008
-
-_GDAL_COG_ENV = {
- # HTTP/1.1 avoids HTTP/2 multiplexing connection-reset cascades on S3.
- "GDAL_HTTP_VERSION": "1.1",
- "GDAL_HTTP_MERGE_CONSECUTIVE_RANGES": "YES",
- "GDAL_HTTP_TCP_KEEPALIVE": "YES",
- "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR",
- "CPL_VSIL_CURL_CACHE_SIZE": "200000000",
- # Built-in GDAL retries for 429/502/503/504 and transient resets.
- "GDAL_HTTP_MAX_RETRY": "3",
- "GDAL_HTTP_RETRY_DELAY": "0.5",
- "AWS_NO_SIGN_REQUEST": "YES",
-}
-
-# ---------------------------------------------------------------------------
-# Internal S3 constants
-# ---------------------------------------------------------------------------
-
-CDSE_TOKEN_URL = (
- "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/"
- "protocol/openid-connect/token"
-)
-OPENEO_URL = "openeo.dataspace.copernicus.eu"
-S3_COLLECTION = "SENTINEL3_SYN_L2_SYN"
-
-DATA_DIR = Path("data")
-DEFAULT_YEAR = 2025
-WORKSPACE_ROOT = Path(__file__).resolve().parent.parent
-
-
-# ---------------------------------------------------------------------------
-# Credentials
-# ---------------------------------------------------------------------------
-
-
-def _cdse_credentials() -> dict[str, str | None]:
- load_dotenv(WORKSPACE_ROOT / ".env")
- return {
- "username": os.getenv("CDSE_USER"),
- "password": os.getenv("CDSE_PASSWORD"),
- }
-
-
-# ---------------------------------------------------------------------------
-# Screening manifest helpers
-# ---------------------------------------------------------------------------
-
-
-def _load_screening_pass_sites(year: int) -> list[dict[str, Any]]:
- """Return list of PASS-site dicts from step-2 screening JSON.
-
- Each entry has ``sitename``, ``lat``, ``lon`` keys.
- """
- path = DATA_DIR / "phenocam_screening" / f"{year}.json"
- if not path.is_file():
- raise FileNotFoundError(f"Step-2 screening manifest not found: {path}")
- payload = json.loads(path.read_text(encoding="utf-8"))
- sites = []
- for row in payload.get("sites", []):
- calc = row.get("calculations", {})
- if calc.get("status") != "PASS":
- continue
- camera = row.get("response", {}).get("camera", {})
- name = camera.get("Sitename")
- lat = camera.get("Lat")
- lon = camera.get("Lon")
- if name and lat is not None and lon is not None:
- sites.append({"sitename": str(name), "lat": float(lat), "lon": float(lon)})
- return sites
-
-
-# ---------------------------------------------------------------------------
-# S2: geometry helpers (from s2_cloud_native.py)
-# ---------------------------------------------------------------------------
-
-
-def wkt_to_bbox(geometry_wkt: str) -> list[float]:
- """Convert a WKT geometry to a ``[west, south, east, north]`` bbox."""
- geom = shapely_wkt.loads(geometry_wkt)
- minx, miny, maxx, maxy = geom.bounds
- if minx == maxx and miny == maxy:
- minx -= _MIN_BBOX_HALF_DEG
- maxx += _MIN_BBOX_HALF_DEG
- miny -= _MIN_BBOX_HALF_DEG
- maxy += _MIN_BBOX_HALF_DEG
- return [minx, miny, maxx, maxy]
-
-
-def _boa_offset(item: Any) -> int:
- """Return the BOA additive offset for a STAC item.
-
- Processing baseline >= 04.00 applies a -1000 offset; earlier baselines use 0.
- """
- if item.properties.get("earthsearch:boa_offset_applied"):
- return 0
- baseline_str = str(
- item.properties.get("processing:baseline")
- or item.properties.get("s2:processing_baseline")
- or "0"
- )
- try:
- baseline = float(baseline_str)
- except ValueError:
- baseline = 0.0
- return -1000 if baseline >= 4.0 else 0
-
-
-def _window_for_bbox(
- src: rasterio.io.DatasetReader,
- bbox_4326: list[float],
-) -> Window | None:
- """Return the rasterio Window for a EPSG:4326 bbox clipped to src bounds."""
- bbox_geom = {
- "type": "Polygon",
- "coordinates": [
- [
- [bbox_4326[0], bbox_4326[1]],
- [bbox_4326[2], bbox_4326[1]],
- [bbox_4326[2], bbox_4326[3]],
- [bbox_4326[0], bbox_4326[3]],
- [bbox_4326[0], bbox_4326[1]],
- ]
- ],
- }
- src_geom = transform_geom("EPSG:4326", src.crs.to_wkt(), bbox_geom)
- xs = [c[0] for c in src_geom["coordinates"][0][:4]]
- ys = [c[1] for c in src_geom["coordinates"][0][:4]]
- win = window_from_bounds(min(xs), min(ys), max(xs), max(ys), src.transform)
- try:
- return win.intersection(Window(0, 0, src.width, src.height))
- except WindowError:
- return None
-
-
-def _read_window(
- href: str,
- bbox_4326: list[float],
- out_shape: tuple[int, int] | None = None,
- resampling: Resampling = Resampling.bilinear,
-) -> tuple[np.ndarray, dict[str, Any]] | None:
- """Range-read a single-band array for the bbox window from a COG URL."""
- with rasterio.open(href) as src:
- win = _window_for_bbox(src, bbox_4326)
- if win is None:
- return None
- data = src.read(1, window=win, out_shape=out_shape, resampling=resampling)
- profile: dict[str, Any] = {
- "crs": src.crs,
- "transform": window_transform(win, src.transform),
- "height": data.shape[0],
- "width": data.shape[1],
- "dtype": src.dtypes[0],
- }
- return data, profile
-
-
-def _read_bands(
- item: Any,
- bbox: list[float],
- bands: list[str],
-) -> tuple[list[np.ndarray], dict[str, Any]] | None:
- """Range-read all requested bands for one STAC item."""
- band_arrays: list[np.ndarray] = []
- ref_profile: dict[str, Any] | None = None
-
- for band_name in bands:
- asset_key = _BAND_ASSETS.get(band_name)
- if asset_key is None or asset_key not in item.assets:
- return None
- ref_shape = (
- (ref_profile["height"], ref_profile["width"]) if ref_profile else None
- )
- result = _read_window(item.assets[asset_key].href, bbox, out_shape=ref_shape)
- if result is None:
- return None
- data, profile = result
- if ref_profile is None:
- ref_profile = profile
- band_arrays.append(data.astype("float32"))
-
- return (band_arrays, ref_profile) if ref_profile is not None else None
-
-
-def _cloud_mask(item: Any, bbox: list[float], shape: tuple[int, int]) -> np.ndarray:
- """Return a boolean cloud/shadow mask from the item's SCL band.
-
- Masks SCL classes 0 (no data), 3 (cloud shadow), and >7 (clouds, cirrus, snow).
- """
- scl = item.assets.get(_SCL_ASSET)
- result = (
- _read_window(scl.href, bbox, out_shape=shape, resampling=Resampling.nearest)
- if scl
- else None
- )
- if result is None:
- return np.zeros(shape, dtype=bool)
- scl_data, _ = result
- return (scl_data == 0) | (scl_data == 3) | (scl_data > 7)
-
-
-def _pad_to_multiple(arr: np.ndarray, ratio: int) -> np.ndarray:
- """Zero-pad (bands, H, W) so H and W are multiples of ``ratio``."""
- pad_h = (ratio - arr.shape[1] % ratio) % ratio
- pad_w = (ratio - arr.shape[2] % ratio) % ratio
- if pad_h or pad_w:
- arr = np.pad(arr, ((0, 0), (0, pad_h), (0, pad_w)), constant_values=0)
- return arr
-
-
-# ---------------------------------------------------------------------------
-# S2: STAC search + download (from s2_cloud_native.py)
-# ---------------------------------------------------------------------------
-
-
-def stac_search_s2(
- bbox: list[float],
- start_date: datetime,
- end_date: datetime,
-) -> list[Any]:
- """Search Earth Search for S2 L2A items intersecting a bbox."""
- client = Client.open(EARTH_SEARCH_URL)
- search = client.search(
- collections=["sentinel-2-l2a"],
- bbox=bbox,
- datetime=(
- f"{start_date.strftime('%Y-%m-%dT%H:%M:%SZ')}/"
- f"{end_date.strftime('%Y-%m-%dT23:59:59Z')}"
- ),
- max_items=10_000,
- )
- return list({item.id: item for item in search.items()}.values())
-
-
-def _process_item(
- item: Any,
- bbox: list[float],
- bands: list[str],
- output_dir: Path,
- ratio: int,
-) -> str | None:
- """Range-read one S2 item and write a masked REFL GeoTIFF.
-
- Returns a skip-message string when the item cannot be processed, else None.
- """
- out_path = output_dir / f"{item.id}_REFL.tif"
- if out_path.is_file():
- return None
- bands_result = _read_bands(item, bbox, bands)
- if bands_result is None:
- return f"[S2] Skipping {item.id}: missing asset or no bbox overlap"
- band_arrays, ref_profile = bands_result
- mask = _cloud_mask(item, bbox, (ref_profile["height"], ref_profile["width"]))
- stacked = (np.stack(band_arrays) + _boa_offset(item)) / 10_000.0
- np.clip(stacked, 0, None, out=stacked)
- stacked[:, mask] = 0.0
- stacked = _pad_to_multiple(stacked, ratio)
- out_profile = {
- "driver": "GTiff",
- "count": len(bands),
- "dtype": "float32",
- "nodata": 0,
- "crs": ref_profile["crs"],
- "transform": ref_profile["transform"],
- "height": stacked.shape[1],
- "width": stacked.shape[2],
- "compress": "lzw",
- }
- with rasterio.open(out_path, "w", **out_profile) as dst:
- dst.write(stacked)
- for i, band_name in enumerate(bands, 1):
- dst.set_band_description(i, band_name)
- return None
-
-
-def download_s2_window(
- items: list[Any],
- bbox: list[float],
- output_dir: Path,
- bands: list[str],
- ratio: int = RESOLUTION_RATIO,
- max_workers: int = 12,
-) -> None:
- """Range-read S2 L2A COG windows and write masked REFL GeoTIFFs.
-
- Writes ``{item.id}_REFL.tif`` directly — no intermediate raw download.
- Cloud/shadow pixels (SCL 0, 3, >7) are zeroed. BOA offset is inferred from
- ``processing:baseline``. Output is zero-padded to multiples of ``ratio``.
- Items are fetched in parallel using ``max_workers`` threads.
- """
- output_dir.mkdir(parents=True, exist_ok=True)
- with rasterio.Env(**_GDAL_COG_ENV):
- with ThreadPoolExecutor(max_workers=max_workers) as pool:
- futures = {
- pool.submit(
- _process_item, item, bbox, bands, output_dir, ratio
- ): item.id
- for item in items
- }
- with tqdm(
- total=len(futures), unit="granule", desc="S2 COG window read"
- ) as pbar:
- for fut in as_completed(futures):
- msg = fut.result()
- if msg:
- tqdm.write(msg)
- pbar.update(1)
-
-
-# ---------------------------------------------------------------------------
-# S3: download (from s3_openeo.py)
-# ---------------------------------------------------------------------------
-
-
-def _utm_epsg(bbox: list[float]) -> int:
- """Return the UTM EPSG code for the centre of a ``[W, S, E, N]`` bbox."""
- lon = (bbox[0] + bbox[2]) / 2
- lat = (bbox[1] + bbox[3]) / 2
- zone = int((lon + 180) / 6) + 1
- return 32600 + zone if lat >= 0 else 32700 + zone
-
-
-def _cdse_token(username: str, password: str) -> str:
- """Obtain a CDSE bearer token via password grant."""
- resp = requests.post(
- CDSE_TOKEN_URL,
- data={
- "grant_type": "password",
- "username": username,
- "password": password,
- "client_id": "cdse-public",
- },
- timeout=30,
- )
- resp.raise_for_status()
- return resp.json()["access_token"]
-
-
-def _netcdf_to_geotiffs(nc_path: Path, output_dir: Path, epsg: int) -> int:
- """Split an OpenEO NetCDF into per-date GeoTIFFs.
-
- Output filenames match the ``S3*__YYYYMMDDTHHMMSS.tif`` pattern that
- ``s3_processing.produce_median_composite`` expects.
-
- Handles half-pixel cell-centre coordinates, ascending y-axis (flip_y),
- and fills NetCDF masked values with NaN.
- """
- written = 0
- with netCDF4.Dataset(str(nc_path), "r") as nc:
- times = netCDF4.num2date(nc.variables["t"][:], nc.variables["t"].units)
- x_coords = np.asarray(nc.variables["x"][:], dtype=float)
- y_coords = np.asarray(nc.variables["y"][:], dtype=float)
-
- half_x = abs(x_coords[1] - x_coords[0]) / 2 if len(x_coords) > 1 else 0.0
- half_y = abs(y_coords[1] - y_coords[0]) / 2 if len(y_coords) > 1 else 0.0
- transform = from_bounds(
- x_coords.min() - half_x,
- y_coords.min() - half_y,
- x_coords.max() + half_x,
- y_coords.max() + half_y,
- len(x_coords),
- len(y_coords),
- )
- flip_y = len(y_coords) > 1 and y_coords[0] < y_coords[-1]
-
- date_counts: dict[str, int] = {}
- for t_idx, time_val in enumerate(times):
- date_str = time_val.strftime("%Y%m%d")
- n = date_counts.get(date_str, 0)
- date_counts[date_str] = n + 1
-
- raw = np.stack([nc.variables[b][t_idx, :, :] for b in S3_BANDS], axis=0)
- stacked = (
- np.ma.filled(raw, fill_value=np.nan).astype("float32")
- / S3_REFLECTANCE_SCALE
- )
- if flip_y:
- stacked = stacked[:, ::-1, :]
-
- filename = f"S3_{date_str}_{n}__{date_str}T120000.tif"
- with rasterio.open(
- output_dir / filename,
- "w",
- driver="GTiff",
- height=len(y_coords),
- width=len(x_coords),
- count=len(S3_BANDS),
- dtype="float32",
- nodata=float("nan"),
- crs=f"EPSG:{epsg}",
- transform=transform,
- compress="lzw",
- ) as dst:
- dst.write(stacked)
- for i, band_name in enumerate(S3_BAND_NAMES, 1):
- dst.set_band_description(i, band_name)
- written += 1
-
- return written
-
-
-_S3_DOWNLOAD_RETRIES = 4
-_S3_DOWNLOAD_BACKOFF = 30 # seconds; doubled on each retry
-
-
-def _download_with_retry(datacube: Any, nc_path: Path) -> None:
- """Download an OpenEO datacube to *nc_path*, retrying on transient errors.
-
- Retries up to ``_S3_DOWNLOAD_RETRIES`` times with exponential backoff
- starting at ``_S3_DOWNLOAD_BACKOFF`` seconds. Re-authenticates on each
- attempt so an expired token never blocks a retry.
- """
- delay = _S3_DOWNLOAD_BACKOFF
- last_exc: Exception | None = None
- for attempt in range(1, _S3_DOWNLOAD_RETRIES + 1):
- try:
- if nc_path.exists():
- nc_path.unlink()
- datacube.download(str(nc_path), format="NetCDF")
- return
- except Exception as exc:
- last_exc = exc
- if attempt < _S3_DOWNLOAD_RETRIES:
- print(
- f"[S3-OEO] Download attempt {attempt} failed ({exc}); "
- f"retrying in {delay}s..."
- )
- time.sleep(delay)
- delay *= 2
- else:
- print(f"[S3-OEO] All {_S3_DOWNLOAD_RETRIES} download attempts failed")
- raise RuntimeError(
- f"S3 download failed after {_S3_DOWNLOAD_RETRIES} attempts"
- ) from last_exc
-
-
-def download_s3_openeo(
- start_date: datetime,
- end_date: datetime,
- aoi_geometry: str,
- output_dir: Path,
- credentials: dict[str, str | None],
-) -> None:
- """Download S3 SYN L2 SDR for an AOI via CDSE OpenEO, server-side clipped.
-
- Writes per-date ``S3_{YYYYMMDD}_{n}__{YYYYMMDD}T120000.tif`` files to
- ``output_dir``, ready for ``s3_processing.produce_median_composite``.
- Skips if any ``S3*.tif`` files already exist.
- """
- output_dir.mkdir(parents=True, exist_ok=True)
-
- if any(output_dir.glob("S3*.tif")):
- print("[S3-OEO] Skipping — output_dir already contains S3 GeoTIFFs")
- return
-
- bbox = wkt_to_bbox(aoi_geometry)
- epsg = _utm_epsg(bbox)
- spatial_extent = {
- "west": bbox[0],
- "east": bbox[2],
- "south": bbox[1],
- "north": bbox[3],
- }
-
- print("[S3-OEO] Authenticating with CDSE...")
- token = _cdse_token(credentials["username"], credentials["password"]) # type: ignore[arg-type]
- conn = openeo.connect(OPENEO_URL)
- conn.authenticate_oidc_access_token(token)
-
- start_str = start_date.strftime("%Y-%m-%d")
- end_str = end_date.strftime("%Y-%m-%d")
- print(f"[S3-OEO] Loading {S3_COLLECTION} ({start_str} → {end_str})...")
- datacube = conn.load_collection(
- S3_COLLECTION,
- spatial_extent=spatial_extent,
- temporal_extent=[start_str, end_str],
- bands=S3_BANDS,
- ).resample_spatial(projection=epsg)
-
- nc_path = output_dir / "_s3_syn_l2.nc"
- print(f"[S3-OEO] Downloading NetCDF to {nc_path}...")
- t0 = time.time()
- _download_with_retry(datacube, nc_path)
- print(f"[S3-OEO] Download completed in {time.time() - t0:.1f}s")
-
- print("[S3-OEO] Splitting into per-date GeoTIFFs...")
- written = _netcdf_to_geotiffs(nc_path, output_dir, epsg)
- nc_path.unlink(missing_ok=True)
- print(f"[S3-OEO] {written} GeoTIFFs written to {output_dir}")
-
-
-# ---------------------------------------------------------------------------
-# S2: distance_to_clouds helper
-# ---------------------------------------------------------------------------
-
-
-def _import_distance_to_clouds():
- try:
- from efast.s2_processing import distance_to_clouds
-
- return distance_to_clouds
- except ImportError as exc:
- raise ImportError("efast not found. Install with: uv sync") from exc
-
-
-def _normalize_s2_grid(s2_dir: Path) -> None:
- """Remove REFL (and companion DIST_CLOUD/GCC) files that don't match the dominant shape.
-
- Sites at MGRS tile boundaries can have S2 downloads from two tiles with different
- spatial extents. For example, a site at the top edge of tile 16SDA will produce
- narrow (e.g. 30×180) REFL files from 16SDA and full-extent (180×180) files from
- 16SDB. EFAST requires all S2 files to share the same grid, so we keep only the
- shape with the most pixels (largest tile coverage) and discard the rest together
- with any already-computed companions.
- """
- refl_files = sorted(s2_dir.glob("*_REFL.tif"))
- if not refl_files:
- return
-
- shape_to_files: dict[tuple[int, int], list[Path]] = {}
- for f in refl_files:
- with rasterio.open(f) as src:
- shape: tuple[int, int] = src.shape # type: ignore[assignment]
- shape_to_files.setdefault(shape, []).append(f)
-
- if len(shape_to_files) <= 1:
- return
-
- ref_shape = max(shape_to_files.keys(), key=lambda s: s[0] * s[1])
- shapes_summary = ", ".join(
- f"{s[0]}×{s[1]}({len(fs)})" for s, fs in shape_to_files.items()
- )
-
- n_removed = 0
- for shape, files in shape_to_files.items():
- if shape == ref_shape:
- continue
- for refl_path in files:
- stem = refl_path.stem[: -len("_REFL")] # e.g. "S2A_16SDA_20250106_0_L2A"
- for companion in s2_dir.glob(f"{stem}_*.tif"):
- companion.unlink()
- refl_path.unlink(missing_ok=True)
- n_removed += 1
-
- print(
- f"[S2-NORM] Tile boundary — shapes: {shapes_summary}; "
- f"removed {n_removed} file-sets, kept {ref_shape[0]}×{ref_shape[1]}"
- )
-
-
-def _rescale_dist_cloud(s2_dir: Path) -> None:
- """Ensure DIST_CLOUD values are in pixel units (not normalised to [0,1])."""
- for dc_path in s2_dir.glob("*DIST_CLOUD.tif"):
- with rasterio.open(dc_path) as src:
- d = src.read(1)
- if float(np.nanmax(d)) <= 1:
- with rasterio.open(dc_path, "r+") as dst:
- dst.write(np.where(d > 0, 2.0, d).astype(np.float32), 1)
-
-
-# ---------------------------------------------------------------------------
-# S3: compositing + reprojection helpers (from 4-sentinel-data.py)
-# ---------------------------------------------------------------------------
-
-
-def _import_s3_processing():
- try:
- from efast import s3_processing
-
- return s3_processing
- except ImportError as exc:
- raise ImportError("efast not found. Install with: uv sync") from exc
-
-
-def _reproject_s3_composites_to_s2_grid(
- composite_dir: Path,
- s2_refl_path: Path,
- s3_out_dir: Path,
- *,
- resolution_ratio: int = RESOLUTION_RATIO,
-) -> None:
- """Reproject S3 composites to the S2 spatial grid at LR resolution."""
- s3_out_dir.mkdir(parents=True, exist_ok=True)
- with rasterio.open(s2_refl_path) as s2_ref:
- target_bounds = s2_ref.bounds
- target_crs = s2_ref.crs
- width = s2_ref.width // resolution_ratio
- height = s2_ref.height // resolution_ratio
- s3_transform = rasterio.transform.from_bounds(
- target_bounds.left,
- target_bounds.bottom,
- target_bounds.right,
- target_bounds.top,
- width,
- height,
- )
-
- for sen3_path in sorted(composite_dir.glob("composite_*.tif")):
- date_part = sen3_path.stem.split("_", 1)[1].replace("-", "")
- outfile = s3_out_dir / f"composite_{date_part}.tif"
- vrt_options = {
- "transform": s3_transform,
- "height": height,
- "width": width,
- "crs": target_crs,
- "resampling": Resampling.cubic,
- }
- with rasterio.open(sen3_path) as s3_src:
- with WarpedVRT(s3_src, **vrt_options) as vrt:
- profile = vrt.profile.copy()
- profile.update({"dtype": "float32", "nodata": 0, "driver": "GTiff"})
- rio_shutil.copy(vrt, outfile, **profile)
-
-
-def _s3_reflectance_scale(raw_s3_dir: Path) -> float:
- """Return multiplier that maps raw SYN L2 SDR values to 0–1 reflectance."""
- for path in raw_s3_dir.glob("S3*.tif"):
- with rasterio.open(path) as src:
- data = src.read()
- if not np.any(np.isfinite(data)):
- continue
- mx = float(np.nanmax(data))
- if np.isfinite(mx) and mx > 5:
- return 1.0 / S3_REFLECTANCE_SCALE
- return 1.0
-
-
-def _stage_s3_for_efast(raw_s3_dir: Path, staging_dir: Path) -> int:
- """Copy ``S3_*.tif`` inputs, scaling reflectance when still in DN form."""
- scale = _s3_reflectance_scale(raw_s3_dir)
- if staging_dir.exists():
- shutil.rmtree(staging_dir)
- staging_dir.mkdir(parents=True)
-
- count = 0
- for src_path in sorted(raw_s3_dir.glob("S3*.tif")):
- dst_path = staging_dir / src_path.name
- with rasterio.open(src_path) as src:
- data = src.read().astype("float32") * scale
- profile = src.profile.copy()
- profile.update(dtype="float32")
- descriptions = src.descriptions
- with rasterio.open(dst_path, "w", **profile) as dst:
- dst.write(data)
- for i, desc in enumerate(descriptions, 1):
- if desc:
- dst.set_band_description(i, desc)
- count += 1
-
- if scale != 1.0:
- print(f"[S3-PREP] Scaled raw SDR by {scale:g} for EFAST compositing")
- return count
-
-
-def _prepare_s3(
- raw_s3_dir: Path,
- s2_refl_path: Path,
- s3_out_dir: Path,
- *,
- work_dir: Path | None = None,
-) -> None:
- """Run EFAST S3 compositing pipeline and reproject to S2 grid."""
- s3 = _import_s3_processing()
- base = work_dir or (s3_out_dir / "_efast_work")
- staging = base / "scaled"
- composites = base / "composites"
- blurred = base / "blurred"
- calibrated = base / "calibrated"
-
- for directory in (staging, composites, blurred, calibrated):
- if directory.exists():
- shutil.rmtree(directory)
- directory.mkdir(parents=True, exist_ok=True)
-
- staged = _stage_s3_for_efast(raw_s3_dir, staging)
- if staged == 0:
- raise ValueError(f"No S3*.tif files found in {raw_s3_dir}")
-
- print(
- f"[S3-PREP] produce_median_composite: mosaic_days={S3_MOSAIC_DAYS}, "
- f"step={S3_COMPOSITE_STEP}, sigma_doy={S3_COMPOSITE_SIGMA_DOY}, "
- f"D={S3_COMPOSITE_D}"
- )
- s3.produce_median_composite(
- staging,
- composites,
- step=S3_COMPOSITE_STEP,
- mosaic_days=S3_MOSAIC_DAYS,
- s3_bands=[1, 2, 3, 4],
- D=S3_COMPOSITE_D,
- sigma_doy=S3_COMPOSITE_SIGMA_DOY,
- )
- s3.smoothing(
- composites,
- blurred,
- product="composite",
- std=S3_SMOOTHING_STD,
- preserve_nan=False,
- )
- s3.reformat_s3(blurred, calibrated, product="composite", scaling_factor=1)
-
- for old in s3_out_dir.glob("composite_*.tif"):
- old.unlink()
- _reproject_s3_composites_to_s2_grid(calibrated, s2_refl_path, s3_out_dir)
-
- if work_dir is None and base.exists():
- shutil.rmtree(base)
-
- n_out = len(list(s3_out_dir.glob("composite_*.tif")))
- print(f"[S3-PREP] Wrote {n_out} composites")
-
-
-# ---------------------------------------------------------------------------
-# Per-site pipeline
-# ---------------------------------------------------------------------------
-
-
-def process_site(
- sitename: str,
- lat: float,
- lon: float,
- year: int,
-) -> dict[str, Any]:
- """Download S2 + S3 and run EFAST preparation for one site."""
- site_dir = DATA_DIR / "sentinel_data" / str(year) / sitename
- s2_out = site_dir / "prepared" / "s2"
- s3_raw = site_dir / "raw" / "s3"
- s3_out = site_dir / "prepared" / "s3"
- aoi_wkt = f"POINT ({lon} {lat})"
- bbox = wkt_to_bbox(aoi_wkt)
- creds = _cdse_credentials()
-
- # S3 download
- print(f"[{sitename}] Downloading S3...")
- download_s3_openeo(
- start_date=datetime(year, 1, 1),
- end_date=datetime(year, 12, 31),
- aoi_geometry=aoi_wkt,
- output_dir=s3_raw,
- credentials=creds,
- )
-
- # S2 download
- print(f"[{sitename}] Searching S2 on Earth Search...")
- items = stac_search_s2(bbox, datetime(year, 1, 1), datetime(year, 12, 31))
- print(f"[{sitename}] {len(items)} S2 items found — downloading windows...")
- download_s2_window(items, bbox, s2_out, S2_BANDS, RESOLUTION_RATIO)
- _normalize_s2_grid(s2_out)
-
- # S2 distance-to-clouds
- print(f"[{sitename}] Computing distance-to-clouds...")
- distance_to_clouds = _import_distance_to_clouds()
- distance_to_clouds(s2_out, ratio=RESOLUTION_RATIO)
- _rescale_dist_cloud(s2_out)
-
- # S3 compositing
- s2_refl_path = next(iter(s2_out.glob("*_REFL.tif")), None)
- if s2_refl_path is None:
- raise ValueError(f"No REFL files in {s2_out} — S2 download may have failed")
- s3_out.mkdir(parents=True, exist_ok=True)
- print(f"[{sitename}] Running S3 compositing pipeline...")
- _prepare_s3(s3_raw, s2_refl_path, s3_out)
-
- summary = {
- "sitename": sitename,
- "evaluation_year": year,
- "lat": lat,
- "lon": lon,
- "s2_refl_count": len(list(s2_out.glob("*_REFL.tif"))),
- "s2_dist_cloud_count": len(list(s2_out.glob("*_DIST_CLOUD.tif"))),
- "s3_raw_count": len(list(s3_raw.glob("S3*.tif"))),
- "s3_composite_count": len(list(s3_out.glob("composite_*.tif"))),
- }
- site_dir.mkdir(parents=True, exist_ok=True)
- (site_dir / "data.json").write_text(
- json.dumps(summary, indent=2) + "\n", encoding="utf-8"
- )
- return summary
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main(argv: list[str] | None = None) -> int:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
- parser.add_argument(
- "--site",
- type=str,
- default=None,
- help="Single sitename to process (default: all step-2 PASS sites)",
- )
- parser.add_argument(
- "--skip-downloaded",
- action="store_true",
- help="Skip sites whose directory already exists under data/sentinel_data/{year}/",
- )
- args = parser.parse_args(argv)
- year = args.evaluation_year
-
- pass_sites = _load_screening_pass_sites(year)
- if not pass_sites:
- print("[Sentinel data] No PASS sites found in step-2 screening output")
- return 1
-
- if args.site:
- pass_sites = [s for s in pass_sites if s["sitename"] == args.site]
- if not pass_sites:
- print(f"[Sentinel data] Site '{args.site}' not found in step-2 PASS sites")
- return 1
-
- print(f"[Sentinel data] Processing {len(pass_sites)} site(s)")
- for i, site in enumerate(pass_sites, 1):
- sitename = site["sitename"]
- site_dir = DATA_DIR / "sentinel_data" / str(year) / sitename
- if args.skip_downloaded and site_dir.exists():
- print(
- f"[Sentinel data] ({i}/{len(pass_sites)}) {sitename} — skipping (directory exists)"
- )
- continue
- print(f"[Sentinel data] ({i}/{len(pass_sites)}) {sitename}")
- summary = process_site(sitename, site["lat"], site["lon"], year)
- print(
- f"[Sentinel data] {sitename} done — "
- f"{summary['s2_refl_count']} REFL, "
- f"{summary['s3_composite_count']} composites"
- )
-
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/4-fusion.py b/4-fusion.py
deleted file mode 100644
index 5950c44..0000000
--- a/4-fusion.py
+++ /dev/null
@@ -1,349 +0,0 @@
-"""Step 4: Compute GCC and run EFAST BtI + ItB fusion for prepared sites.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``sentinel_data/{year}/{sitename}/prepared/s2/`` — ``*_REFL.tif`` + ``*_DIST_CLOUD.tif``
-- ``sentinel_data/{year}/{sitename}/prepared/s3/`` — ``composite_*.tif`` (4-band)
-
-Outputs (``data/``):
-
-- ``sentinel_data/{year}/{sitename}/prepared/s2/*_GCC.tif`` — S2 GCC (in-place)
-- ``sentinel_data/{year}/{sitename}/prepared/gcc_s3/*.tif`` — S3 GCC composites
-- ``fusion/{year}/{sitename}/bti/fusion/REFL_*.tif`` — BtI fused 4-band reflectance
-- ``fusion/{year}/{sitename}/bti/gcc/GCC_*.tif`` — GCC derived from BtI fusion
-- ``fusion/{year}/{sitename}/itb/s2/GCC_*.tif`` — per-acquisition S2 GCC (simplified names)
-- ``fusion/{year}/{sitename}/itb/s3/GCC_*.tif`` — per-composite S3 GCC (simplified names)
-- ``fusion/{year}/{sitename}/itb/fusion/GCC_*.tif`` — ItB fused GCC
-
-Requires ``uv sync`` (efast).
-
-CLI:
-
-- ``--evaluation-year`` (default 2025)
-- ``--site`` (optional; default: all prepared sites under ``sentinel_data/{year}/``)
-
-Prior step: :mod:`3-sentinel-data`.
-"""
-
-from __future__ import annotations
-
-import argparse
-import shutil
-from datetime import datetime, timedelta
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import rasterio
-from dateutil import rrule
-
-# ---------------------------------------------------------------------------
-# Public constants
-# ---------------------------------------------------------------------------
-
-RESOLUTION_RATIO = 30
-MOSAIC_STEP = 2
-MAX_DAYS = 100
-MINIMUM_ACQUISITION_IMPORTANCE = 0
-
-DATA_DIR = Path("data")
-DEFAULT_YEAR = 2025
-
-
-# ---------------------------------------------------------------------------
-# efast import helper
-# ---------------------------------------------------------------------------
-
-
-def _import_efast():
- try:
- import efast.efast as efast_module
-
- return efast_module
- except ImportError as exc:
- raise ImportError("efast not found. Install with: uv sync") from exc
-
-
-# ---------------------------------------------------------------------------
-# GCC computation (from s2_cloud_native.py and s3_openeo.py)
-# ---------------------------------------------------------------------------
-
-
-def compute_gcc_s2(s2_dir: Path, output_dir: Path) -> None:
- """Compute GCC from S2 REFL files and write ``*_GCC.tif`` to ``output_dir``.
-
- Reads every ``*_REFL.tif`` (band order B02/B03/B04) and writes a co-located
- single-band GCC file. Cloud-masked pixels (zero in all bands) remain zero.
- """
- output_dir.mkdir(parents=True, exist_ok=True)
- for src_path in sorted(s2_dir.glob("*_REFL.tif")):
- out_path = output_dir / src_path.name.replace("_REFL.tif", "_GCC.tif")
- if out_path.is_file():
- continue
- with rasterio.open(src_path) as src:
- b, g, r = src.read(1), src.read(2), src.read(3)
- profile = src.profile
- total = b + g + r
- gcc = g / (total + 1e-10)
- gcc[total == 0] = 0
- profile.update(count=1)
- with rasterio.open(out_path, "w", **profile) as dst:
- dst.write(gcc[np.newaxis].astype("float32"))
-
-
-def compute_gcc_s3(s3_dir: Path, output_dir: Path) -> None:
- """Compute GCC from S3 composite files and write single-band GeoTIFFs.
-
- Reads every ``composite_*.tif`` (band order Oa04/Oa06/Oa08/Oa17) and writes
- a single-band GCC file. NaN pixels in the input remain NaN.
- """
- output_dir.mkdir(parents=True, exist_ok=True)
- for src_path in sorted(s3_dir.glob("composite_*.tif")):
- out_path = output_dir / src_path.name
- if out_path.is_file():
- continue
- with rasterio.open(src_path) as src:
- b, g, r = src.read(1), src.read(2), src.read(3)
- profile = src.profile
- total = b + g + r
- gcc = g / (total + 1e-10)
- gcc[np.isnan(total)] = np.nan
- profile.update(count=1, dtype="float32")
- with rasterio.open(out_path, "w", **profile) as dst:
- dst.write(gcc[np.newaxis].astype("float32"))
-
-
-def compute_gcc_from_refl(refl_dir: Path, gcc_dir: Path) -> None:
- """Derive GCC from ``REFL_YYYYMMDD.tif`` files (BtI fusion output).
-
- Reads every ``REFL_*.tif`` and writes a co-located single-band
- ``GCC_YYYYMMDD.tif``. Pixels where any reflectance band is negative (which
- can arise from EFAST's temporal correction) are written as NaN; the GCC
- ratio is undefined there because the denominator B+G+R is near-zero or
- negative due to band cancellation. All-zero pixels (cloud mask) remain
- zero.
- """
- gcc_dir.mkdir(parents=True, exist_ok=True)
- for src_path in sorted(refl_dir.glob("REFL_*.tif")):
- out_path = gcc_dir / src_path.name.replace("REFL_", "GCC_")
- if out_path.is_file():
- continue
- with rasterio.open(src_path) as src:
- b, g, r = src.read(1), src.read(2), src.read(3)
- profile = src.profile
- total = b + g + r
- invalid = (b < 0) | (g < 0) | (r < 0)
- gcc = np.where(invalid, np.nan, g / (total + 1e-10))
- gcc[total == 0] = np.nan
- profile.update(count=1, dtype="float32")
- with rasterio.open(out_path, "w", **profile) as dst:
- dst.write(gcc[np.newaxis].astype("float32"))
-
-
-# ---------------------------------------------------------------------------
-# Date-range detection
-# ---------------------------------------------------------------------------
-
-
-def _refl_date_range(s2_dir: Path) -> tuple[datetime, datetime] | None:
- """Return (start, end) datetime from REFL filenames in ``s2_dir``.
-
- Filenames are expected to follow the S2 product naming convention, where
- the acquisition date ``YYYYMMDD`` appears at position index 2 when the
- stem is split by ``_``, e.g.
- ``S2A_MSIL2A_20230911T114111_N0509_R025_T29PKT_20230911T153131_REFL.tif``.
- """
- dates: list[datetime] = []
- for p in s2_dir.glob("*_REFL.tif"):
- parts = p.stem.split("_")
- if len(parts) >= 3:
- try:
- dates.append(datetime.strptime(parts[2][:8], "%Y%m%d"))
- except ValueError:
- pass
- if not dates:
- return None
- return min(dates), max(dates)
-
-
-# ---------------------------------------------------------------------------
-# Per-site fusion
-# ---------------------------------------------------------------------------
-
-
-def fuse_site(sitename: str, year: int) -> dict[str, Any]:
- """Run GCC computation and EFAST BtI + ItB fusion for one prepared site."""
- efast = _import_efast()
-
- s2_dir = DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s2"
- s3_dir = DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s3"
- gcc_s3_dir = (
- DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "gcc_s3"
- )
- base = DATA_DIR / "fusion" / str(year) / sitename
-
- if not s2_dir.is_dir() or not any(s2_dir.glob("*_REFL.tif")):
- raise FileNotFoundError(f"No REFL files in {s2_dir}")
- if not s3_dir.is_dir() or not any(s3_dir.glob("composite_*.tif")):
- raise FileNotFoundError(f"No composite files in {s3_dir}")
-
- print(f"[{sitename}] Computing S2 GCC (in-place)...")
- compute_gcc_s2(s2_dir, s2_dir)
-
- print(f"[{sitename}] Computing S3 GCC...")
- compute_gcc_s3(s3_dir, gcc_s3_dir)
-
- date_range = _refl_date_range(s2_dir)
- if date_range is None:
- raise ValueError(f"Could not detect date range from REFL filenames in {s2_dir}")
- start, end = date_range
- print(f"[{sitename}] Date range: {start.date()} → {end.date()}")
-
- fusion_dates = list(
- rrule.rrule(
- rrule.DAILY,
- dtstart=start + timedelta(MOSAIC_STEP),
- until=end - timedelta(MOSAIC_STEP),
- interval=MOSAIC_STEP,
- )
- )
-
- _fusion_kwargs = dict(
- ratio=RESOLUTION_RATIO,
- max_days=MAX_DAYS,
- minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
- )
-
- # --- ItB: GCC first, then fuse GCC ---
- itb_s2 = base / "itb" / "s2"
- itb_s3 = base / "itb" / "s3"
- itb_fusion = base / "itb" / "fusion"
- itb_s2.mkdir(parents=True, exist_ok=True)
- itb_s3.mkdir(parents=True, exist_ok=True)
- itb_fusion.mkdir(parents=True, exist_ok=True)
-
- for p in sorted(s2_dir.glob("*_GCC.tif")):
- dst = itb_s2 / f"GCC_{p.stem.split('_')[2][:8]}.tif"
- if not dst.exists():
- shutil.copy2(p, dst)
- for p in sorted(gcc_s3_dir.glob("composite_*.tif")):
- dst = itb_s3 / f"GCC_{p.stem.split('_')[1]}.tif"
- if not dst.exists():
- shutil.copy2(p, dst)
-
- print(f"[{sitename}] ItB: fusing GCC over {len(fusion_dates)} dates...")
- for date in fusion_dates:
- efast.fusion(
- date, gcc_s3_dir, s2_dir, itb_fusion, product="GCC", **_fusion_kwargs
- )
-
- # --- BtI: fuse reflectance (3-band, matching S2 B02/B03/B04), then derive GCC ---
- # S3 composites have 4 bands; strip band 4 (Oa17/NIR) so shapes match S2 REFL.
- s3_rgb_dir = (
- DATA_DIR / "sentinel_data" / str(year) / sitename / "prepared" / "s3_rgb"
- )
- s3_rgb_dir.mkdir(parents=True, exist_ok=True)
- for p in sorted(s3_dir.glob("composite_*.tif")):
- out = s3_rgb_dir / p.name
- if not out.exists():
- with rasterio.open(p) as src:
- data = src.read([1, 2, 3])
- profile = src.profile.copy()
- profile.update(count=3)
- with rasterio.open(out, "w", **profile) as dst:
- dst.write(data)
-
- bti_fusion = base / "bti" / "fusion"
- bti_gcc = base / "bti" / "gcc"
- bti_fusion.mkdir(parents=True, exist_ok=True)
-
- print(f"[{sitename}] BtI: fusing REFL over {len(fusion_dates)} dates...")
- for date in fusion_dates:
- efast.fusion(
- date, s3_rgb_dir, s2_dir, bti_fusion, product="REFL", **_fusion_kwargs
- )
-
- print(f"[{sitename}] BtI: deriving GCC from fused REFL...")
- compute_gcc_from_refl(bti_fusion, bti_gcc)
-
- return {
- "sitename": sitename,
- "evaluation_year": year,
- "start": start.date().isoformat(),
- "end": end.date().isoformat(),
- "fusion_dates": len(fusion_dates),
- "itb_fusion_files": len(list(itb_fusion.glob("*.tif"))),
- "bti_fusion_files": len(list(bti_fusion.glob("*.tif"))),
- "bti_gcc_files": len(list(bti_gcc.glob("*.tif"))),
- }
-
-
-# ---------------------------------------------------------------------------
-# Site discovery
-# ---------------------------------------------------------------------------
-
-
-def _discover_sites(year: int) -> list[str]:
- """Return sitenames that have prepared S2 REFL files under sentinel_data."""
- base = DATA_DIR / "sentinel_data" / str(year)
- if not base.is_dir():
- return []
- return sorted(
- d.name
- for d in base.iterdir()
- if d.is_dir() and any((d / "prepared" / "s2").glob("*_REFL.tif"))
- )
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main(argv: list[str] | None = None) -> int:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
- parser.add_argument(
- "--site",
- type=str,
- default=None,
- help="Single sitename to fuse (default: all prepared sites)",
- )
- parser.add_argument(
- "--skip-blended",
- action="store_true",
- help="Skip sites whose fusion directory already exists under data/fusion/{year}/",
- )
- args = parser.parse_args(argv)
- year = args.evaluation_year
-
- if args.site:
- sites = [args.site]
- else:
- sites = _discover_sites(year)
- if not sites:
- print(f"[Fusion] No prepared sites found under data/sentinel_data/{year}/")
- return 1
-
- print(f"[Fusion] Processing {len(sites)} site(s)")
- for i, sitename in enumerate(sites, 1):
- fusion_dir = DATA_DIR / "fusion" / str(year) / sitename
- if args.skip_blended and fusion_dir.exists():
- print(
- f"[Fusion] ({i}/{len(sites)}) {sitename} — skipping (fusion directory exists)"
- )
- continue
- print(f"[Fusion] ({i}/{len(sites)}) {sitename}")
- summary = fuse_site(sitename, year)
- print(
- f"[Fusion] {sitename} done — "
- f"{summary['fusion_dates']} dates, "
- f"itb={summary['itb_fusion_files']} bti={summary['bti_fusion_files']} "
- f"bti_gcc={summary['bti_gcc_files']}"
- )
-
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/5-metrics.py b/5-metrics.py
deleted file mode 100644
index 9658797..0000000
--- a/5-metrics.py
+++ /dev/null
@@ -1,788 +0,0 @@
-"""Step 5: Pre-compute per-site GCC timeseries + raster index for the webapp.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``phenocam_screening/{year}.json`` — qualifying sites + metadata
-- ``phenocam/{year}/{site}_1day.csv`` — daily GCC timeseries
-- ``sentinel_data/{year}/{site}/prepared/s2/*_GCC.tif`` — S2 GCC rasters
-- ``sentinel_data/{year}/{site}/prepared/gcc_s3/composite_*.tif`` — S3 GCC rasters
-- ``fusion/{year}/{site}/bti/gcc/GCC_*.tif`` — BtI GCC rasters
-- ``fusion/{year}/{site}/itb/fusion/GCC_*.tif`` — ItB GCC rasters
-
-Outputs (``data/metrics/``):
-
-- ``manifest.json`` — years + per-site metadata
-- ``{year}/{site}/gcc_phenocam.json`` — PhenoCam ``gcc_90`` at matched dates
-- ``{year}/{site}/gcc_s2.json`` — S2 GCC (center pixel, cloud-free scenes)
-- ``{year}/{site}/gcc_s2_whittaker.json`` — Whittaker-smoothed S2 GCC
-- ``{year}/{site}/gcc_s3.json`` — S3 composite GCC
-- ``{year}/{site}/gcc_s3_smooth.json`` — S3 5-day moving average
-- ``{year}/{site}/gcc_fusion_bti.json`` — BtI fused GCC
-- ``{year}/{site}/gcc_fusion_itb.json`` — ItB fused GCC
-- ``{year}/{site}/phenocam_images.json`` — midday photo URLs for the viewer
-- ``{year}/{site}/rasters_s2_refl.json`` — S2 REFL paths (BtI view)
-- ``{year}/{site}/rasters_s3_composite.json`` — S3 composite paths (BtI view)
-- ``{year}/{site}/rasters_s2_gcc.json`` — S2 GCC paths (ItB view)
-- ``{year}/{site}/rasters_s3_gcc.json`` — S3 GCC paths (ItB view)
-- ``{year}/{site}/rasters_fusion_bti_refl.json`` — BtI fused REFL paths
-- ``{year}/{site}/rasters_fusion_itb_gcc.json`` — ItB fused GCC paths
-- ``{year}/{site}/metrics.json`` — NSE, RMSE, nRMSE, Pearson r vs PhenoCam per series
-- ``{year}/{site}/bands_s2.json`` — S2 center-pixel reflectance (B02, B03, B04) per scene
-- ``{year}/{site}/bands_s3.json`` — S3 center-pixel reflectance (Oa04, Oa06, Oa08, Oa17) per composite
-- ``{year}/{site}/covariates.json`` — spatial CV/std, S2/S3 counts, gap stats
-
-CLI:
-
-- ``--evaluation-year`` (default 2025)
-- ``--site`` (optional; default: all qualifying sites with sentinel data)
-"""
-
-from __future__ import annotations
-
-import argparse
-import csv
-import json
-import re
-from pathlib import Path
-from typing import Any
-
-import datetime
-import numpy as np
-import rasterio
-from rasterio.crs import CRS
-from rasterio.transform import rowcol
-from pyproj import Transformer
-from scipy.stats import pearsonr
-from tqdm import tqdm
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-DATA_DIR = Path("data")
-DEFAULT_YEAR = 2025
-
-# GCC smoothing window for S3 moving average (days)
-S3_SMOOTH_WINDOW = 5
-
-# Whittaker lambda (penalised smoothing strength for S2)
-WHITTAKER_LAMBDA = 400.0
-
-# Half-width in metres for the spatial heterogeneity footprint (~300 m = 1 S3 pixel)
-SPATIAL_CV_HALF_M = 150
-
-# PhenoCam archive image URL pattern
-PHENOCAM_IMAGE_URL = (
- "https://phenocam.nau.edu/data/archive/{site}/{year}/{month}/{filename}"
-)
-
-
-# ---------------------------------------------------------------------------
-# Helpers: raster pixel extraction
-# ---------------------------------------------------------------------------
-
-
-def _window_mean(data: np.ndarray) -> float | None:
- """Mean of a pixel window; ``None`` when every value is NaN."""
- valid = data[~np.isnan(data)]
- if valid.size == 0:
- return None
- return float(np.mean(valid))
-
-
-def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
- """Return the 3×3 mean GCC value at (lat, lon) from a single-band raster.
-
- Returns ``None`` when the pixel is masked/zero/NaN.
- """
- try:
- with rasterio.open(path) as src:
- transformer = Transformer.from_crs(
- CRS.from_epsg(4326), src.crs, always_xy=True
- )
- x, y = transformer.transform(lon, lat)
- row, col = rowcol(src.transform, x, y)
- h, w = src.height, src.width
- r0, r1 = max(0, row - 1), min(h, row + 2)
- c0, c1 = max(0, col - 1), min(w, col + 2)
- window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
- data = src.read(1, window=window).astype(float)
- nodata = src.nodata
- if nodata is not None:
- data = np.where(data == nodata, np.nan, data)
- data[data == 0] = np.nan
- return _window_mean(data)
- except Exception:
- return None
-
-
-# ---------------------------------------------------------------------------
-# Helpers: date extraction from filenames
-# ---------------------------------------------------------------------------
-
-
-def _date_from_gcc_tif(path: Path) -> str | None:
- """Extract YYYYMMDD from ``GCC_YYYYMMDD.tif`` or ``composite_YYYYMMDD.tif``."""
- m = re.search(r"(\d{8})", path.stem)
- return m.group(1) if m else None
-
-
-def _date_from_s2_tif(path: Path) -> str | None:
- """Extract YYYYMMDD from S2 product name ``S2X_TTTT_YYYYMMDD_…``."""
- parts = path.stem.split("_")
- if len(parts) >= 3:
- m = re.match(r"(\d{8})", parts[2])
- return m.group(1) if m else None
- return None
-
-
-# ---------------------------------------------------------------------------
-# Helpers: Whittaker smoother (2nd-order differences, tridiagonal solver)
-# ---------------------------------------------------------------------------
-
-
-def _whittaker_smooth(
- values: list[float | None], lam: float = WHITTAKER_LAMBDA
-) -> list[float | None]:
- """Penalised least-squares smoother (Whittaker, 2nd-order differences).
-
- Masked (None) values are filled via the smooth and then re-set to None in
- the output so the caller can distinguish observed from gap-filled points.
- """
- n = len(values)
- if n < 4:
- return values[:]
-
- obs_mask = [v is not None for v in values]
- y = np.array([v if v is not None else 0.0 for v in values], dtype=float)
- w = np.array([1.0 if m else 0.0 for m in obs_mask], dtype=float)
-
- W = np.diag(w)
- D = np.diff(np.eye(n), n=2, axis=0) # (n-2) x n second-difference matrix
- A = W + lam * D.T @ D
- try:
- z = np.linalg.solve(A, w * y)
- except np.linalg.LinAlgError:
- return values[:]
-
- result: list[float | None] = []
- for i, m in enumerate(obs_mask):
- result.append(float(z[i]) if m else None)
- return result
-
-
-# ---------------------------------------------------------------------------
-# Helpers: PhenoCam CSV parsing
-# ---------------------------------------------------------------------------
-
-
-def _parse_phenocam_csv(
- csv_path: Path, year: int, site: str
-) -> tuple[list[dict], list[dict]]:
- """Return (gcc_series, image_list) filtered to ``year``.
-
- ``gcc_series`` entries: ``{"date": "YYYY-MM-DD", "gcc_90": float}``
- ``image_list`` entries: ``{"date": "YYYY-MM-DD", "url": str}``
- """
- gcc_series: list[dict] = []
- image_list: list[dict] = []
- year_str = str(year)
-
- if not csv_path.is_file():
- return gcc_series, image_list
-
- with csv_path.open() as f:
- lines = [line for line in f if not line.startswith("#")]
-
- reader = csv.DictReader(lines)
- for row in reader:
- if row.get("year") != year_str:
- continue
- date = row.get("date", "")
- gcc_raw = row.get("gcc_90")
- if gcc_raw and gcc_raw not in ("NA", ""):
- try:
- gcc_series.append({"date": date, "gcc_90": float(gcc_raw)})
- except ValueError:
- pass
- fn = row.get("midday_filename", "").strip()
- if fn and fn != "NA" and date:
- month = date[5:7]
- url = PHENOCAM_IMAGE_URL.format(
- site=site, year=year_str, month=month, filename=fn
- )
- image_list.append({"date": date, "url": url})
-
- return gcc_series, image_list
-
-
-# ---------------------------------------------------------------------------
-# Helpers: moving average
-# ---------------------------------------------------------------------------
-
-
-def _moving_average(series: list[dict], value_key: str, window: int) -> list[dict]:
- """Compute centred moving average; returns new list with ``_smooth`` suffix key."""
- if not series:
- return []
- vals = [p[value_key] for p in series]
- half = window // 2
- smoothed = []
- for i, pt in enumerate(series):
- chunk = [v for v in vals[max(0, i - half) : i + half + 1] if v is not None]
- smoothed.append(
- {
- "date": pt["date"],
- value_key + "_smooth": (sum(chunk) / len(chunk)) if chunk else None,
- }
- )
- return smoothed
-
-
-# ---------------------------------------------------------------------------
-# Helpers: validation metrics
-# ---------------------------------------------------------------------------
-
-MATCH_TOLERANCE_DAYS = 5
-
-
-def compute_metrics(
- ref: list[dict],
- ref_key: str,
- pred: list[dict],
- pred_key: str,
-) -> dict | None:
- """Compute NSE, RMSE, nRMSE, Pearson r between pred and ref.
-
- Each pred point is matched to the nearest ref date within
- ``MATCH_TOLERANCE_DAYS``. Returns a dict or ``None`` if fewer than
- 2 matched pairs exist.
- """
- ref_lookup: dict[str, float] = {
- p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
- }
- if not ref_lookup:
- return None
-
- ref_dates = sorted(ref_lookup)
-
- obs, sim = [], []
- for pt in pred:
- v = pt.get(pred_key)
- if v is None:
- continue
- nearest = min(
- ref_dates,
- key=lambda d: abs(
- (np.datetime64(pt["date"]) - np.datetime64(d)) / np.timedelta64(1, "D")
- ),
- )
- gap = abs(
- (np.datetime64(pt["date"]) - np.datetime64(nearest))
- / np.timedelta64(1, "D")
- )
- if gap <= MATCH_TOLERANCE_DAYS and nearest in ref_lookup:
- obs.append(ref_lookup[nearest])
- sim.append(v)
-
- if len(obs) < 2:
- return None
-
- obs_arr = np.array(obs)
- sim_arr = np.array(sim)
- obs_mean = obs_arr.mean()
-
- rmse = float(np.sqrt(np.mean((sim_arr - obs_arr) ** 2)))
- nrmse = rmse / obs_mean if obs_mean else None
- ss_res = float(np.sum((obs_arr - sim_arr) ** 2))
- ss_tot = float(np.sum((obs_arr - obs_mean) ** 2))
- nse = (1.0 - ss_res / ss_tot) if ss_tot else None
- r, _ = pearsonr(obs_arr, sim_arr)
-
- def _r4(v: float | None) -> float | None:
- return round(v, 4) if v is not None else None
-
- return {
- "n": len(obs),
- "rmse": _r4(rmse),
- "nrmse": _r4(nrmse),
- "nse": _r4(nse),
- "r": _r4(float(r)),
- }
-
-
-S2_BAND_NAMES = ["B02", "B03", "B04"]
-S3_BAND_NAMES = ["Oa04", "Oa06", "Oa08", "Oa17"]
-
-
-def _read_multiband_center(
- path: Path, lat: float, lon: float, band_names: list[str]
-) -> dict[str, float | None]:
- """Return 3×3 mean per band at (lat, lon). Keys are ``band_names``, values float or None."""
- try:
- with rasterio.open(path) as src:
- transformer = Transformer.from_crs(
- CRS.from_epsg(4326), src.crs, always_xy=True
- )
- x, y = transformer.transform(lon, lat)
- row, col = rowcol(src.transform, x, y)
- h, w = src.height, src.width
- r0, r1 = max(0, row - 1), min(h, row + 2)
- c0, c1 = max(0, col - 1), min(w, col + 2)
- window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
- nodata = src.nodata
- result = {}
- for i, name in enumerate(band_names, 1):
- if i > src.count:
- result[name] = None
- continue
- data = src.read(i, window=window).astype(float)
- if nodata is not None:
- data = np.where(data == nodata, np.nan, data)
- data[data == 0] = np.nan
- val = _window_mean(data)
- result[name] = None if val is None else round(val, 6)
- return result
- except Exception:
- return {name: None for name in band_names}
-
-
-def _multiband_series(
- tif_paths: list[Path],
- date_fn,
- lat: float,
- lon: float,
- band_names: list[str],
- desc: str,
-) -> list[dict]:
- """Extract center-pixel values for all bands; return ``[{date, band1, band2, …}]``."""
- result = []
- for p in tqdm(tif_paths, desc=desc, leave=False):
- date = date_fn(p)
- if date is None:
- continue
- vals = _read_multiband_center(p, lat, lon, band_names)
- if any(v is not None for v in vals.values()):
- result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", **vals})
- return sorted(result, key=lambda x: x["date"])
-
-
-# ---------------------------------------------------------------------------
-# Helpers: spatial heterogeneity + observation density
-# ---------------------------------------------------------------------------
-
-
-def _read_footprint_stats(
- path: Path, lat: float, lon: float, half_m: float = SPATIAL_CV_HALF_M
-) -> tuple[float, float] | tuple[None, None]:
- """Return (mean, std) of valid GCC pixels within a ±half_m metre square window.
-
- Returns ``(None, None)`` on any error or when fewer than 4 valid pixels exist.
- """
- try:
- with rasterio.open(path) as src:
- transformer = Transformer.from_crs(
- CRS.from_epsg(4326), src.crs, always_xy=True
- )
- x, y = transformer.transform(lon, lat)
- res = abs(src.transform.a) # pixel size in CRS units (metres for UTM)
- half_px = max(1, int(round(half_m / res)))
- row, col = rowcol(src.transform, x, y)
- h, w = src.height, src.width
- r0, r1 = max(0, row - half_px), min(h, row + half_px + 1)
- c0, c1 = max(0, col - half_px), min(w, col + half_px + 1)
- window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
- data = src.read(1, window=window).astype(float)
- nodata = src.nodata
- if nodata is not None:
- data = np.where(data == nodata, np.nan, data)
- data[data <= 0] = np.nan
- valid = data[~np.isnan(data)]
- if len(valid) < 4:
- return None, None
- return float(np.mean(valid)), float(np.std(valid))
- except Exception:
- return None, None
-
-
-def compute_covariates(
- s2_gcc_paths: list[Path],
- s2_series: list[dict],
- s3_series: list[dict],
- n_gcc_points: int | None,
- lat: float,
- lon: float,
-) -> dict:
- """Compute spatial heterogeneity and temporal observation density covariates."""
- # Spatial GCC statistics over ~300 m footprint
- means, stds = [], []
- for p in s2_gcc_paths:
- m, s = _read_footprint_stats(p, lat, lon)
- if m is not None and m > 0:
- means.append(m)
- stds.append(s)
-
- spatial_gcc_cv = (
- round(float(np.mean([s / m for s, m in zip(stds, means)])), 4)
- if means
- else None
- )
- spatial_gcc_std = round(float(np.mean(stds)), 4) if stds else None
-
- # S2 temporal gap statistics
- s2_dates = [datetime.date.fromisoformat(p["date"]) for p in s2_series]
- if len(s2_dates) >= 2:
- gaps = [(s2_dates[i + 1] - s2_dates[i]).days for i in range(len(s2_dates) - 1)]
- s2_mean_gap = round(float(np.mean(gaps)), 1)
- s2_max_gap = int(max(gaps))
- else:
- s2_mean_gap = None
- s2_max_gap = None
-
- return {
- "spatial_gcc_cv": spatial_gcc_cv,
- "spatial_gcc_std": spatial_gcc_std,
- "s2_scene_count": len(s2_series),
- "s2_mean_gap_days": s2_mean_gap,
- "s2_max_gap_days": s2_max_gap,
- "s3_composite_count": len(s3_series),
- "n_gcc_points": n_gcc_points,
- }
-
-
-# ---------------------------------------------------------------------------
-# Per-site export
-# ---------------------------------------------------------------------------
-
-
-def _write_json(path: Path, data: Any) -> None:
- path.write_text(json.dumps(data, separators=(",", ":")))
-
-
-def _raster_series(
- tif_paths: list[Path],
- date_fn,
- lat: float,
- lon: float,
- desc: str,
-) -> list[dict]:
- """Extract center-pixel GCC from each tif, return ``[{date, gcc}]`` sorted."""
- result = []
- for p in tqdm(tif_paths, desc=desc, leave=False):
- date = date_fn(p)
- if date is None:
- continue
- val = _read_center_pixel(p, lat, lon)
- if val is not None:
- result.append({"date": f"{date[:4]}-{date[4:6]}-{date[6:]}", "gcc": val})
- return sorted(result, key=lambda x: x["date"])
-
-
-def _raster_index(tif_paths: list[Path], date_fn, rel_root: Path) -> list[dict]:
- """Build raster index: ``[{date, path}]`` sorted by date."""
- result = []
- for p in tif_paths:
- date = date_fn(p)
- if date is None:
- continue
- try:
- rel = str(p.relative_to(rel_root))
- except ValueError:
- rel = str(p)
- result.append({"date": date, "path": rel})
- return sorted(result, key=lambda x: x["date"])
-
-
-def export_site(
- site: str,
- year: int,
- lat: float,
- lon: float,
- out_dir: Path,
- n_gcc_points: int | None = None,
-) -> bool:
- """Export timeseries.json and rasters.json for one site. Returns True on success."""
- sentinel_base = DATA_DIR / "sentinel_data" / str(year) / site / "prepared"
- fusion_base = DATA_DIR / "fusion" / str(year) / site
-
- s2_gcc_dir = sentinel_base / "s2"
- s3_gcc_dir = sentinel_base / "gcc_s3"
- bti_gcc_dir = fusion_base / "bti" / "gcc"
- itb_gcc_dir = fusion_base / "itb" / "fusion"
-
- # Raster slider sources
- s2_refl_dir = sentinel_base / "s2"
- s3_comp_dir = sentinel_base / "s3"
- bti_refl_dir = fusion_base / "bti" / "fusion"
-
- has_fusion = bti_gcc_dir.is_dir() and any(bti_gcc_dir.glob("GCC_*.tif"))
- if not has_fusion:
- return False
-
- out_dir.mkdir(parents=True, exist_ok=True)
-
- # --- GCC timeseries from rasters ---
- s2_gcc_paths = sorted(s2_gcc_dir.glob("*_GCC.tif"))
- s3_gcc_paths = sorted(s3_gcc_dir.glob("composite_*.tif"))
- bti_paths = sorted(bti_gcc_dir.glob("GCC_*.tif"))
- itb_paths = sorted(itb_gcc_dir.glob("GCC_*.tif"))
-
- s2_series = _raster_series(s2_gcc_paths, _date_from_s2_tif, lat, lon, f"{site} S2")
- s3_series = _raster_series(s3_gcc_paths, _date_from_gcc_tif, lat, lon, f"{site} S3")
- bti_series = _raster_series(bti_paths, _date_from_gcc_tif, lat, lon, f"{site} BtI")
- itb_series = _raster_series(itb_paths, _date_from_gcc_tif, lat, lon, f"{site} ItB")
-
- # Whittaker on S2
- s2_vals = [p["gcc"] for p in s2_series]
- s2_smooth_vals = _whittaker_smooth(s2_vals)
- s2_whittaker = [
- {"date": p["date"], "gcc": v}
- for p, v in zip(s2_series, s2_smooth_vals)
- if v is not None
- ]
-
- # S3 5-day moving average
- s3_smooth = _moving_average(s3_series, "gcc", S3_SMOOTH_WINDOW)
-
- # PhenoCam CSV
- csv_path = DATA_DIR / "phenocam" / str(year) / f"{site}_1day.csv"
- phenocam_series, image_list = _parse_phenocam_csv(csv_path, year, site)
-
- s3_smooth_series = [
- {"date": p["date"], "gcc": p["gcc_smooth"]}
- for p in s3_smooth
- if p.get("gcc_smooth") is not None
- ]
-
- # Band reflectance timeseries (multi-band center-pixel)
- bands_s2 = _multiband_series(
- sorted(s2_refl_dir.glob("*_REFL.tif")),
- _date_from_s2_tif,
- lat,
- lon,
- S2_BAND_NAMES,
- f"{site} S2 bands",
- )
- bands_s3 = _multiband_series(
- sorted(s3_comp_dir.glob("composite_*.tif")),
- _date_from_gcc_tif,
- lat,
- lon,
- S3_BAND_NAMES,
- f"{site} S3 bands",
- )
-
- # --- Per-metric JSON outputs ---
- _write_json(out_dir / "gcc_phenocam.json", phenocam_series)
- _write_json(out_dir / "gcc_s2.json", s2_series)
- _write_json(out_dir / "gcc_s2_whittaker.json", s2_whittaker)
- _write_json(out_dir / "gcc_s3.json", s3_series)
- _write_json(out_dir / "gcc_s3_smooth.json", s3_smooth_series)
- _write_json(out_dir / "gcc_fusion_bti.json", bti_series)
- _write_json(out_dir / "gcc_fusion_itb.json", itb_series)
- _write_json(out_dir / "phenocam_images.json", image_list)
- _write_json(out_dir / "bands_s2.json", bands_s2)
- _write_json(out_dir / "bands_s3.json", bands_s3)
-
- # --- Raster index for slider ---
- rel_root = DATA_DIR.parent # paths relative to project root
-
- # Valid-pixel sets: only show S2/S3 rasters where the center pixel had
- # usable data (non-zero GCC). This excludes cloud-masked / snow-covered
- # scenes that would render as black or visually nonsensical.
- s2_valid_dates = {p["date"].replace("-", "") for p in s2_series}
- s3_valid_dates = {p["date"].replace("-", "") for p in s3_series}
-
- s2_refl = [
- r
- for r in _raster_index(
- sorted(s2_refl_dir.glob("*_REFL.tif")), _date_from_s2_tif, rel_root
- )
- if r["date"] in s2_valid_dates
- ]
- s3_comp = [
- r
- for r in _raster_index(
- sorted(s3_comp_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
- )
- if r["date"] in s3_valid_dates
- ]
- s2_gcc = [
- r
- for r in _raster_index(
- sorted(s2_gcc_dir.glob("*_GCC.tif")), _date_from_s2_tif, rel_root
- )
- if r["date"] in s2_valid_dates
- ]
- s3_gcc = [
- r
- for r in _raster_index(
- sorted(s3_gcc_dir.glob("composite_*.tif")), _date_from_gcc_tif, rel_root
- )
- if r["date"] in s3_valid_dates
- ]
- bti_refl = _raster_index(
- sorted(bti_refl_dir.glob("REFL_*.tif")), _date_from_gcc_tif, rel_root
- )
- itb_gcc = _raster_index(
- sorted(itb_gcc_dir.glob("GCC_*.tif")), _date_from_gcc_tif, rel_root
- )
-
- _write_json(out_dir / "rasters_s2_refl.json", s2_refl)
- _write_json(out_dir / "rasters_s3_composite.json", s3_comp)
- _write_json(out_dir / "rasters_s2_gcc.json", s2_gcc)
- _write_json(out_dir / "rasters_s3_gcc.json", s3_gcc)
- _write_json(out_dir / "rasters_fusion_bti_refl.json", bti_refl)
- _write_json(out_dir / "rasters_fusion_itb_gcc.json", itb_gcc)
-
- # --- Site covariates (heterogeneity + observation density) ---
- _write_json(
- out_dir / "covariates.json",
- compute_covariates(s2_gcc_paths, s2_series, s3_series, n_gcc_points, lat, lon),
- )
-
- # --- Validation metrics vs PhenoCam gcc_90 ---
- _write_json(
- out_dir / "metrics.json",
- {
- "bti": compute_metrics(phenocam_series, "gcc_90", bti_series, "gcc"),
- "itb": compute_metrics(phenocam_series, "gcc_90", itb_series, "gcc"),
- "s2_whittaker": compute_metrics(
- phenocam_series, "gcc_90", s2_whittaker, "gcc"
- ),
- "s3_smooth": compute_metrics(
- phenocam_series, "gcc_90", s3_smooth_series, "gcc"
- ),
- "s2": compute_metrics(phenocam_series, "gcc_90", s2_series, "gcc"),
- "s3": compute_metrics(phenocam_series, "gcc_90", s3_series, "gcc"),
- },
- )
-
- # Remove legacy bundled outputs if present
- for legacy in ("timeseries.json", "rasters.json"):
- (out_dir / legacy).unlink(missing_ok=True)
- return True
-
-
-# ---------------------------------------------------------------------------
-# Manifest
-# ---------------------------------------------------------------------------
-
-VEG_TYPE_LABELS = {
- "AG": "Agriculture",
- "DB": "Deciduous broadleaf",
- "DN": "Deciduous needleleaf",
- "EB": "Evergreen broadleaf",
- "EN": "Evergreen needleleaf",
- "GR": "Grassland",
- "MX": "Mixed",
- "SH": "Shrubland",
- "TN": "Tundra",
- "UN": "Unknown",
- "WL": "Wetland",
- "RF": "Reference",
-}
-
-
-def build_manifest(years: list[int], filter_site: str | None = None) -> dict:
- manifest: dict[str, Any] = {"years": years, "sites": {}}
-
- for year in years:
- screening_path = DATA_DIR / "phenocam_screening" / f"{year}.json"
- if not screening_path.is_file():
- continue
- data = json.loads(screening_path.read_text())
- sites_meta: dict[str, Any] = {}
- for entry in data.get("sites", []):
- if entry.get("calculations", {}).get("status") != "PASS":
- continue
- cam = entry.get("response", {}).get("camera", {})
- roi = entry.get("response", {}).get("roi", {})
- calc = entry.get("calculations", {})
- site = cam.get("Sitename", "")
- if not site:
- continue
- if filter_site and site != filter_site:
- continue
- sm = cam.get("sitemetadata", {})
- veg_raw = sm.get("primary_veg_type") or roi.get("roitype") or "UN"
- fusion_dir = DATA_DIR / "fusion" / str(year) / site / "bti" / "gcc"
- has_fusion = fusion_dir.is_dir() and any(fusion_dir.glob("GCC_*.tif"))
- sites_meta[site] = {
- "lat": cam.get("Lat"),
- "lon": cam.get("Lon"),
- "veg_type": veg_raw,
- "veg_label": VEG_TYPE_LABELS.get(veg_raw, veg_raw),
- "description": sm.get("site_description", ""),
- "dominant_species": sm.get("dominant_species", ""),
- "group": sm.get("group", ""),
- "snr": calc.get("snr"),
- "n_gcc_points": calc.get("n_gcc_points"),
- "has_fusion": has_fusion,
- }
- manifest["sites"][str(year)] = sites_meta
-
- return manifest
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
- parser.add_argument("--site", type=str, default=None)
- args = parser.parse_args()
-
- year = args.evaluation_year
- filter_site = args.site
-
- out_base = DATA_DIR / "metrics"
- out_base.mkdir(parents=True, exist_ok=True)
-
- # Determine years with screening data
- screening_dir = DATA_DIR / "phenocam_screening"
- years = sorted(
- int(p.stem) for p in screening_dir.glob("*.json") if p.stem.isdigit()
- )
- if not years:
- years = [year]
-
- print(f"Building manifest for years: {years}")
- manifest = build_manifest(years, filter_site)
-
- # Export per-site data for the requested year
- year_sites = manifest["sites"].get(str(year), {})
- fusion_sites = {s: m for s, m in year_sites.items() if m["has_fusion"]}
-
- print(f"Exporting {len(fusion_sites)} site(s) with fusion data for {year}")
- for site, meta in tqdm(fusion_sites.items(), desc="Sites"):
- out_dir = out_base / str(year) / site
- ok = export_site(
- site, year, meta["lat"], meta["lon"], out_dir, meta.get("n_gcc_points")
- )
- if ok:
- print(f" ✓ {site}")
- else:
- print(f" ✗ {site} — no fusion data found")
-
- manifest_path = out_base / "manifest.json"
- if filter_site and manifest_path.is_file():
- # Merge: update only the filtered site(s); preserve all other entries.
- existing: dict = json.loads(manifest_path.read_text())
- for year_key, year_sites_new in manifest["sites"].items():
- existing.setdefault("sites", {}).setdefault(year_key, {}).update(
- year_sites_new
- )
- all_years = sorted(set(existing.get("years", [])) | set(manifest["years"]))
- existing["years"] = all_years
- manifest_path.write_text(json.dumps(existing, separators=(",", ":")))
- else:
- manifest_path.write_text(json.dumps(manifest, separators=(",", ":")))
- print(f"Manifest written → {manifest_path}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/6-statistics-fusion-order.py b/6-statistics-fusion-order.py
deleted file mode 100644
index 0ec0541..0000000
--- a/6-statistics-fusion-order.py
+++ /dev/null
@@ -1,257 +0,0 @@
-"""Step 6: Paired ItB-vs-BtI significance test across the full sample.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``metrics/{year}/{site}/metrics.json`` — per-site validation metrics (Step 5)
-
-Outputs (``data/statistics_fusion_order/``):
-
-- ``{year}.json`` — paired Wilcoxon + t-test summary for NSE, RMSE, nRMSE, r;
- includes ``dropped_sites`` (union) and per-metric ``dropped_sites`` lists
-
-CLI:
-
-- ``--evaluation-year`` (default 2025)
-- ``--alpha`` (default 0.05) — significance threshold for ``better_order``
-
-This step aggregates across all sites with Step 5 output; it does not accept
-``--site`` (a single-site filter would not support a sample-level test).
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-from scipy.stats import ttest_rel, wilcoxon
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-DATA_DIR = Path("data")
-DEFAULT_YEAR = 2025
-DEFAULT_ALPHA = 0.05
-
-METRICS = ["nse", "rmse", "nrmse", "r"]
-LOWER_IS_BETTER = {"rmse", "nrmse"}
-
-MIN_PAIRS_WARNING = 6
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _r4(v: float | None) -> float | None:
- return round(v, 4) if v is not None else None
-
-
-def _load_site_metrics(year: int) -> list[tuple[str, dict[str, Any]]]:
- """Return ``(sitename, metrics.json payload)`` for every site under ``{year}``."""
- metrics_dir = DATA_DIR / "metrics" / str(year)
- if not metrics_dir.is_dir():
- return []
-
- payloads: list[tuple[str, dict[str, Any]]] = []
- for site_dir in sorted(metrics_dir.iterdir()):
- if not site_dir.is_dir():
- continue
- path = site_dir / "metrics.json"
- if not path.is_file():
- continue
- payloads.append((site_dir.name, json.loads(path.read_text())))
- return payloads
-
-
-def collect_pairs(
- site_metrics: list[tuple[str, dict[str, Any]]], metric: str
-) -> tuple[list[float], list[float], list[str]]:
- """Return paired BtI / ItB values for ``metric`` and dropped site names."""
- bti_vals: list[float] = []
- itb_vals: list[float] = []
- dropped_sites: list[str] = []
-
- for site, payload in site_metrics:
- bti = payload.get("bti")
- itb = payload.get("itb")
- if not isinstance(bti, dict) or not isinstance(itb, dict):
- dropped_sites.append(site)
- continue
-
- bti_v = bti.get(metric)
- itb_v = itb.get(metric)
- if bti_v is None or itb_v is None:
- dropped_sites.append(site)
- continue
-
- bti_vals.append(float(bti_v))
- itb_vals.append(float(itb_v))
-
- return bti_vals, itb_vals, dropped_sites
-
-
-def _better_order(
- bti_vals: list[float],
- itb_vals: list[float],
- metric: str,
- p_value: float | None,
- alpha: float,
-) -> str:
- """Name the better fusion order when Wilcoxon p < alpha, else no difference."""
- if p_value is None or p_value >= alpha:
- return "no significant difference"
-
- mean_diff = float(np.mean(itb_vals) - np.mean(bti_vals))
- if metric in LOWER_IS_BETTER:
- return "itb" if mean_diff < 0 else "bti"
- return "itb" if mean_diff > 0 else "bti"
-
-
-def paired_test(
- bti_vals: list[float],
- itb_vals: list[float],
- metric: str,
- alpha: float,
-) -> dict[str, Any]:
- """Run paired Wilcoxon (primary) and t-test; return summary dict."""
- n_pairs = len(bti_vals)
- bti_arr = np.array(bti_vals, dtype=float)
- itb_arr = np.array(itb_vals, dtype=float)
- diffs = itb_arr - bti_arr
-
- result: dict[str, Any] = {
- "n_pairs": n_pairs,
- "bti_mean": _r4(float(bti_arr.mean())) if n_pairs else None,
- "bti_median": _r4(float(np.median(bti_arr))) if n_pairs else None,
- "itb_mean": _r4(float(itb_arr.mean())) if n_pairs else None,
- "itb_median": _r4(float(np.median(itb_arr))) if n_pairs else None,
- "mean_diff": _r4(float(diffs.mean())) if n_pairs else None,
- "median_diff": _r4(float(np.median(diffs))) if n_pairs else None,
- "wilcoxon": {"statistic": None, "p_value": None},
- "ttest": {"statistic": None, "p_value": None},
- "better_order": "insufficient data",
- }
-
- if n_pairs < 2:
- return result
-
- wilcoxon_stat: float | None = None
- wilcoxon_p: float | None = None
- if np.any(diffs != 0):
- try:
- w_stat, w_p = wilcoxon(itb_arr, bti_arr)
- wilcoxon_stat = float(w_stat)
- wilcoxon_p = float(w_p)
- except ValueError:
- pass
-
- t_stat, t_p = ttest_rel(itb_arr, bti_arr)
-
- result["wilcoxon"] = {
- "statistic": _r4(wilcoxon_stat),
- "p_value": _r4(wilcoxon_p),
- }
- result["ttest"] = {
- "statistic": _r4(float(t_stat)),
- "p_value": _r4(float(t_p)),
- }
- result["better_order"] = _better_order(
- bti_vals, itb_vals, metric, wilcoxon_p, alpha
- )
- return result
-
-
-def _print_summary(
- year: int, alpha: float, n_sites_total: int, metrics_out: dict
-) -> None:
- print(f"\nPaired ItB vs BtI test — {year} (alpha={alpha}, sites={n_sites_total})")
- print(
- f"{'metric':<8} {'n':>4} {'BtI mean':>10} {'ItB mean':>10} "
- f"{'diff':>10} {'W p':>8} {'t p':>8} better"
- )
- print("-" * 78)
- for metric in METRICS:
- m = metrics_out[metric]
- bti_mean = m["bti_mean"]
- itb_mean = m["itb_mean"]
- mean_diff = m["mean_diff"]
- w_p = m["wilcoxon"]["p_value"]
- t_p = m["ttest"]["p_value"]
- better = m["better_order"]
-
- def _fmt(v: float | None) -> str:
- return f"{v:10.4f}" if v is not None else f"{'—':>10}"
-
- print(
- f"{metric:<8} {m['n_pairs']:>4} {_fmt(bti_mean)} {_fmt(itb_mean)} "
- f"{_fmt(mean_diff)} "
- f"{w_p if w_p is not None else '—':>8} "
- f"{t_p if t_p is not None else '—':>8} {better}"
- )
- if 0 < m["n_pairs"] < MIN_PAIRS_WARNING:
- print(
- f" warning: only {m['n_pairs']} pair(s) for {metric}; "
- "interpret p-values cautiously"
- )
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
- parser.add_argument(
- "--alpha",
- type=float,
- default=DEFAULT_ALPHA,
- help="Significance threshold for better_order (default 0.05)",
- )
- args = parser.parse_args()
-
- year = args.evaluation_year
- alpha = args.alpha
-
- site_metrics = _load_site_metrics(year)
- n_sites_total = len(site_metrics)
- if n_sites_total == 0:
- raise SystemExit(
- f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
- )
-
- metrics_out: dict[str, Any] = {}
- all_dropped: set[str] = set()
- for metric in METRICS:
- bti_vals, itb_vals, dropped_sites = collect_pairs(site_metrics, metric)
- summary = paired_test(bti_vals, itb_vals, metric, alpha)
- summary["n_dropped"] = len(dropped_sites)
- summary["dropped_sites"] = dropped_sites
- all_dropped.update(dropped_sites)
- metrics_out[metric] = summary
-
- payload = {
- "year": year,
- "alpha": alpha,
- "n_sites_total": n_sites_total,
- "dropped_sites": sorted(all_dropped),
- "metrics": metrics_out,
- }
-
- out_dir = DATA_DIR / "statistics_fusion_order"
- out_dir.mkdir(parents=True, exist_ok=True)
- out_path = out_dir / f"{year}.json"
- out_path.write_text(json.dumps(payload, separators=(",", ":")))
-
- _print_summary(year, alpha, n_sites_total, metrics_out)
- print(f"\nWritten → {out_path}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/7-gcc-suitability.py b/7-gcc-suitability.py
deleted file mode 100644
index 32cc4e5..0000000
--- a/7-gcc-suitability.py
+++ /dev/null
@@ -1,742 +0,0 @@
-"""Step 7: PhenoCam GCC suitability as a fusion-accuracy reference.
-
-Inputs (``data/``, ``{year}`` = ``--evaluation-year``):
-
-- ``metrics/{year}/{site}/gcc_s2.json``, ``gcc_phenocam.json`` — Step 5 timeseries
-- ``metrics/manifest.json`` — site lat/lon
-- ``sentinel_data/{year}/{site}/prepared/s2/`` — S2 REFL/GCC/DIST_CLOUD (Steps 3–4)
-- ``sentinel_data/{year}/{site}/prepared/gcc_s3/``, ``prepared/s3_rgb/`` — Step 4
-
-Outputs (``data/gcc_suitability/``):
-
-- ``{year}.json`` — representativeness (Line A), LOOCV concordance (Line B),
- per-site and aggregate suitability verdict
-
-CLI:
-
-- ``--evaluation-year`` (default 2025)
-- ``--min-cloudfree-s2`` (default 10) — minimum cloud-free S2 dates for LOOCV
-- ``--alpha`` (default 0.05) — reserved for future significance tests
-
-Full-sample aggregate; does not accept ``--site``.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import re
-import shutil
-import tempfile
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import rasterio
-from pyproj import Transformer
-from rasterio.crs import CRS
-from rasterio.transform import rowcol
-from scipy.stats import linregress, pearsonr, spearmanr
-from tqdm import tqdm
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-DATA_DIR = Path("data")
-DEFAULT_YEAR = 2025
-DEFAULT_ALPHA = 0.05
-MIN_CLOUDFREE_S2 = 10
-REPR_R_THRESHOLD = 0.7
-MATCH_TOLERANCE_DAYS = 5
-
-RESOLUTION_RATIO = 30
-MAX_DAYS = 100
-MINIMUM_ACQUISITION_IMPORTANCE = 0
-
-SMALL_SAMPLE_SITES = 6
-
-
-# ---------------------------------------------------------------------------
-# efast import
-# ---------------------------------------------------------------------------
-
-
-def _import_efast():
- try:
- import efast.efast as efast_module
-
- return efast_module
- except ImportError as exc:
- raise ImportError("efast not found. Install with: uv sync") from exc
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _r4(v: float | None) -> float | None:
- return round(v, 4) if v is not None else None
-
-
-def _window_mean(data: np.ndarray) -> float | None:
- valid = data[~np.isnan(data)]
- if valid.size == 0:
- return None
- return float(np.mean(valid))
-
-
-def _read_center_pixel(path: Path, lat: float, lon: float) -> float | None:
- try:
- with rasterio.open(path) as src:
- transformer = Transformer.from_crs(
- CRS.from_epsg(4326), src.crs, always_xy=True
- )
- x, y = transformer.transform(lon, lat)
- row, col = rowcol(src.transform, x, y)
- h, w = src.height, src.width
- r0, r1 = max(0, row - 1), min(h, row + 2)
- c0, c1 = max(0, col - 1), min(w, col + 2)
- window = rasterio.windows.Window(c0, r0, c1 - c0, r1 - r0)
- data = src.read(1, window=window).astype(float)
- nodata = src.nodata
- if nodata is not None:
- data = np.where(data == nodata, np.nan, data)
- data[data == 0] = np.nan
- return _window_mean(data)
- except Exception:
- return None
-
-
-def _date_from_s2_tif(path: Path) -> str | None:
- parts = path.stem.split("_")
- if len(parts) >= 3:
- m = re.match(r"(\d{8})", parts[2])
- return m.group(1) if m else None
- return None
-
-
-def _iso_to_yyyymmdd(iso: str) -> str:
- return iso.replace("-", "")
-
-
-def _yyyymmdd_to_iso(d: str) -> str:
- return f"{d[:4]}-{d[4:6]}-{d[6:]}"
-
-
-def _day_gap(a: str, b: str) -> float:
- return abs((np.datetime64(a) - np.datetime64(b)) / np.timedelta64(1, "D"))
-
-
-def _match_series(
- ref: list[dict],
- ref_key: str,
- pred: list[dict],
- pred_key: str,
- tolerance_days: int = MATCH_TOLERANCE_DAYS,
-) -> tuple[list[float], list[float], list[str]]:
- """Return paired (ref_vals, pred_vals, ref_dates) within tolerance."""
- ref_lookup: dict[str, float] = {
- p["date"]: p[ref_key] for p in ref if p.get(ref_key) is not None
- }
- if not ref_lookup:
- return [], [], []
-
- ref_dates = sorted(ref_lookup)
- obs, sim, matched_dates = [], [], []
- for pt in pred:
- v = pt.get(pred_key)
- if v is None:
- continue
- nearest = min(ref_dates, key=lambda d: _day_gap(pt["date"], d))
- if _day_gap(pt["date"], nearest) <= tolerance_days:
- obs.append(ref_lookup[nearest])
- sim.append(v)
- matched_dates.append(nearest)
- return obs, sim, matched_dates
-
-
-def _accuracy_metrics(obs: list[float], sim: list[float]) -> dict[str, Any] | None:
- if len(obs) < 2:
- return None
- obs_arr = np.array(obs, dtype=float)
- sim_arr = np.array(sim, dtype=float)
- diff = sim_arr - obs_arr
- rmse = float(np.sqrt(np.mean(diff**2)))
- mae = float(np.mean(np.abs(diff)))
- bias = float(np.mean(diff))
- r, _ = pearsonr(obs_arr, sim_arr)
- return {
- "n": len(obs),
- "rmse": _r4(rmse),
- "mae": _r4(mae),
- "bias": _r4(bias),
- "r": _r4(float(r)),
- }
-
-
-def _gcc_from_refl_file(refl_path: Path, gcc_path: Path) -> None:
- with rasterio.open(refl_path) as src:
- b, g, r = src.read(1), src.read(2), src.read(3)
- profile = src.profile
- total = b + g + r
- invalid = (b < 0) | (g < 0) | (r < 0)
- gcc = np.where(invalid, np.nan, g / (total + 1e-10))
- gcc[total == 0] = np.nan
- profile.update(count=1, dtype="float32")
- with rasterio.open(gcc_path, "w", **profile) as dst:
- dst.write(gcc[np.newaxis].astype("float32"))
-
-
-def _load_json_series(path: Path) -> list[dict]:
- if not path.is_file():
- return []
- return json.loads(path.read_text())
-
-
-def _load_site_coords(year: int) -> dict[str, tuple[float, float]]:
- manifest_path = DATA_DIR / "metrics" / "manifest.json"
- if not manifest_path.is_file():
- return {}
- manifest = json.loads(manifest_path.read_text())
- sites = manifest.get("sites", {}).get(str(year), {})
- coords: dict[str, tuple[float, float]] = {}
- for site, meta in sites.items():
- lat, lon = meta.get("lat"), meta.get("lon")
- if lat is not None and lon is not None:
- coords[site] = (float(lat), float(lon))
- return coords
-
-
-def _discover_sites(year: int) -> list[str]:
- metrics_dir = DATA_DIR / "metrics" / str(year)
- if not metrics_dir.is_dir():
- return []
- return sorted(
- d.name
- for d in metrics_dir.iterdir()
- if d.is_dir() and (d / "gcc_s2.json").is_file()
- )
-
-
-def _build_hr_symlink_dir(s2_dir: Path, holdout_yyyymmdd: str, dest: Path) -> None:
- """Symlink all S2 hr inputs except the held-out acquisition date."""
- dest.mkdir(parents=True, exist_ok=True)
- for pattern in ("*_REFL.tif", "*_GCC.tif", "*_DIST_CLOUD.tif"):
- for src in sorted(s2_dir.glob(pattern)):
- date_token = (
- src.stem.split("_")[2][:8] if len(src.stem.split("_")) >= 3 else ""
- )
- if date_token == holdout_yyyymmdd:
- continue
- link = dest / src.name
- if link.exists() or link.is_symlink():
- link.unlink()
- link.symlink_to(src.resolve())
-
-
-# ---------------------------------------------------------------------------
-# Line A — representativeness
-# ---------------------------------------------------------------------------
-
-
-def compute_representativeness(phenocam: list[dict], s2: list[dict]) -> dict[str, Any]:
- """PhenoCam gcc_90 vs co-located observed S2 GCC."""
- obs, sim, _ = _match_series(phenocam, "gcc_90", s2, "gcc")
- result: dict[str, Any] = {
- "n": len(obs),
- "r": None,
- "spearman": None,
- "slope": None,
- "intercept": None,
- "rmse": None,
- "bias": None,
- "peak_offset_days": None,
- "representative": False,
- }
- if len(obs) < 2:
- return result
-
- obs_arr = np.array(obs, dtype=float)
- sim_arr = np.array(sim, dtype=float)
- r, _ = pearsonr(obs_arr, sim_arr)
- sp, _ = spearmanr(obs_arr, sim_arr)
- reg = linregress(sim_arr, obs_arr)
- diff = sim_arr - obs_arr
- result.update(
- {
- "r": _r4(float(r)),
- "spearman": _r4(float(sp)),
- "slope": _r4(float(reg.slope)),
- "intercept": _r4(float(reg.intercept)),
- "rmse": _r4(float(np.sqrt(np.mean(diff**2)))),
- "bias": _r4(float(np.mean(diff))),
- "representative": float(r) >= REPR_R_THRESHOLD,
- }
- )
-
- pc_dates = [p["date"] for p in phenocam if p.get("gcc_90") is not None]
- s2_dates = [p["date"] for p in s2 if p.get("gcc") is not None]
- if pc_dates and s2_dates:
- pc_peak = max(
- phenocam,
- key=lambda p: p["gcc_90"] if p.get("gcc_90") is not None else -1,
- )["date"]
- s2_peak = max(s2, key=lambda p: p["gcc"] if p.get("gcc") is not None else -1)[
- "date"
- ]
- result["peak_offset_days"] = int(_day_gap(pc_peak, s2_peak))
-
- return result
-
-
-# ---------------------------------------------------------------------------
-# Line B — LOOCV
-# ---------------------------------------------------------------------------
-
-
-def _phenocam_lookup(phenocam: list[dict]) -> dict[str, float]:
- return {p["date"]: p["gcc_90"] for p in phenocam if p.get("gcc_90") is not None}
-
-
-def _nearest_phenocam(iso_date: str, lookup: dict[str, float]) -> float | None:
- if not lookup:
- return None
- dates = sorted(lookup)
- nearest = min(dates, key=lambda d: _day_gap(iso_date, d))
- if _day_gap(iso_date, nearest) <= MATCH_TOLERANCE_DAYS:
- return lookup[nearest]
- return None
-
-
-def run_loocv_site(
- site: str,
- year: int,
- lat: float,
- lon: float,
- s2_series: list[dict],
- phenocam: list[dict],
- efast,
-) -> list[dict[str, Any]]:
- """Leave-one-out EFAST for each cloud-free S2 date; return per-date records."""
- s2_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s2"
- gcc_s3_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "gcc_s3"
- s3_rgb_dir = DATA_DIR / "sentinel_data" / str(year) / site / "prepared" / "s3_rgb"
-
- pc_lookup = _phenocam_lookup(phenocam)
- s2_truth = {p["date"]: p["gcc"] for p in s2_series}
- fusion_kwargs = dict(
- ratio=RESOLUTION_RATIO,
- max_days=MAX_DAYS,
- minimum_acquisition_importance=MINIMUM_ACQUISITION_IMPORTANCE,
- )
-
- records: list[dict[str, Any]] = []
- dates = [p["date"] for p in s2_series]
-
- with tempfile.TemporaryDirectory(prefix=f"loocv_{site}_") as tmp_root:
- tmp = Path(tmp_root)
- hr_dir = tmp / "hr"
- itb_out = tmp / "itb"
- bti_out = tmp / "bti"
- bti_gcc = tmp / "bti_gcc"
- itb_out.mkdir()
- bti_out.mkdir()
- bti_gcc.mkdir()
-
- for iso_date in tqdm(dates, desc=f"{site} LOOCV", leave=False):
- yyyymmdd = _iso_to_yyyymmdd(iso_date)
- truth = s2_truth.get(iso_date)
- if truth is None:
- continue
-
- if hr_dir.exists():
- shutil.rmtree(hr_dir)
- _build_hr_symlink_dir(s2_dir, yyyymmdd, hr_dir)
-
- pred_date = datetime.strptime(yyyymmdd, "%Y%m%d")
-
- for f in itb_out.glob("*.tif"):
- f.unlink()
- for f in bti_out.glob("*.tif"):
- f.unlink()
- for f in bti_gcc.glob("*.tif"):
- f.unlink()
-
- efast.fusion(
- pred_date,
- gcc_s3_dir,
- hr_dir,
- itb_out,
- product="GCC",
- **fusion_kwargs,
- )
- efast.fusion(
- pred_date,
- s3_rgb_dir,
- hr_dir,
- bti_out,
- product="REFL",
- **fusion_kwargs,
- )
-
- itb_path = itb_out / f"GCC_{yyyymmdd}.tif"
- refl_path = bti_out / f"REFL_{yyyymmdd}.tif"
- bti_path = bti_gcc / f"GCC_{yyyymmdd}.tif"
-
- pred_itb = (
- _read_center_pixel(itb_path, lat, lon) if itb_path.is_file() else None
- )
- pred_bti = None
- if refl_path.is_file():
- _gcc_from_refl_file(refl_path, bti_path)
- if bti_path.is_file():
- pred_bti = _read_center_pixel(bti_path, lat, lon)
-
- pc_val = _nearest_phenocam(iso_date, pc_lookup)
-
- records.append(
- {
- "date": iso_date,
- "s2_truth": truth,
- "pred_bti": pred_bti,
- "pred_itb": pred_itb,
- "phenocam": pc_val,
- }
- )
-
- return records
-
-
-def _method_accuracy(records: list[dict], pred_key: str, ref_key: str) -> dict | None:
- obs, sim = [], []
- for rec in records:
- pred = rec.get(pred_key)
- ref = rec.get(ref_key)
- if pred is None or ref is None:
- continue
- obs.append(ref)
- sim.append(pred)
- return _accuracy_metrics(obs, sim)
-
-
-def _winner(rmse_bti: float | None, rmse_itb: float | None) -> str | None:
- if rmse_bti is None or rmse_itb is None:
- return None
- if rmse_bti < rmse_itb:
- return "bti"
- if rmse_itb < rmse_bti:
- return "itb"
- return "tie"
-
-
-def summarize_loocv(records: list[dict]) -> dict[str, Any]:
- bti_vs_s2 = _method_accuracy(records, "pred_bti", "s2_truth")
- itb_vs_s2 = _method_accuracy(records, "pred_itb", "s2_truth")
- bti_vs_pc = _method_accuracy(records, "pred_bti", "phenocam")
- itb_vs_pc = _method_accuracy(records, "pred_itb", "phenocam")
-
- winner_s2 = _winner(
- bti_vs_s2["rmse"] if bti_vs_s2 else None,
- itb_vs_s2["rmse"] if itb_vs_s2 else None,
- )
- winner_pc = _winner(
- bti_vs_pc["rmse"] if bti_vs_pc else None,
- itb_vs_pc["rmse"] if itb_vs_pc else None,
- )
- agreement = (
- winner_s2 == winner_pc
- if winner_s2 and winner_pc and winner_s2 != "tie" and winner_pc != "tie"
- else None
- )
-
- return {
- "n_dates": len(records),
- "bti": {"vs_s2": bti_vs_s2, "vs_phenocam": bti_vs_pc},
- "itb": {"vs_s2": itb_vs_s2, "vs_phenocam": itb_vs_pc},
- "winner_s2": winner_s2,
- "winner_phenocam": winner_pc,
- "winner_agreement": agreement,
- }
-
-
-# ---------------------------------------------------------------------------
-# Aggregate concordance
-# ---------------------------------------------------------------------------
-
-
-def _pooled_concordance(
- all_records: list[dict[str, Any]],
-) -> dict[str, Any]:
- """Pooled metrics across all held-out dates."""
- residual_pairs: list[tuple[float, float]] = []
- vec_s2: list[float] = []
- vec_pc: list[float] = []
-
- for site_data in all_records:
- for rec in site_data.get("records", []):
- truth = rec.get("s2_truth")
- pc = rec.get("phenocam")
- for key in ("pred_bti", "pred_itb"):
- pred = rec.get(key)
- if pred is None or truth is None:
- continue
- err_s2 = abs(pred - truth)
- if pc is not None:
- err_pc = abs(pred - pc)
- vec_s2.append(err_s2)
- vec_pc.append(err_pc)
- residual_pairs.append((err_s2, err_pc))
-
- pooled_spearman = None
- if len(vec_s2) >= 3:
- sp, _ = spearmanr(vec_s2, vec_pc)
- if not np.isnan(sp):
- pooled_spearman = _r4(float(sp))
-
- residual_corr = None
- if len(residual_pairs) >= 3:
- xs = np.array([p[0] for p in residual_pairs])
- ys = np.array([p[1] for p in residual_pairs])
- rc, _ = pearsonr(xs, ys)
- residual_corr = _r4(float(rc))
-
- agreements = [
- s.get("winner_agreement")
- for s in all_records
- if s.get("eligible") and s.get("winner_agreement") is not None
- ]
- winner_agreement_rate = (
- _r4(sum(1 for a in agreements if a) / len(agreements)) if agreements else None
- )
-
- n_loocv_dates = sum(len(s.get("records", [])) for s in all_records)
-
- return {
- "pooled_spearman": pooled_spearman,
- "residual_corr": residual_corr,
- "winner_agreement_rate": winner_agreement_rate,
- "n_loocv_dates": n_loocv_dates,
- }
-
-
-def _suitability_verdict(
- n_repr_pass: int,
- n_eligible: int,
- n_total: int,
- pooled: dict[str, Any],
-) -> str:
- if n_eligible == 0:
- return "insufficient data"
- repr_rate = n_repr_pass / n_total if n_total else 0
- agree = pooled.get("winner_agreement_rate")
- sp = pooled.get("pooled_spearman")
- rc = pooled.get("residual_corr")
-
- strong = 0
- if repr_rate >= 0.6:
- strong += 1
- if agree is not None and agree >= 0.7:
- strong += 1
- if sp is not None and sp >= 0.8:
- strong += 1
- if rc is not None and rc >= 0.5:
- strong += 1
-
- if strong >= 3:
- return "suitable"
- if strong >= 1 or repr_rate >= 0.4:
- return "partially suitable"
- return "not suitable"
-
-
-# ---------------------------------------------------------------------------
-# Per-site processing
-# ---------------------------------------------------------------------------
-
-
-def process_site(
- site: str,
- year: int,
- lat: float,
- lon: float,
- min_cloudfree: int,
- efast,
-) -> dict[str, Any]:
- metrics_dir = DATA_DIR / "metrics" / str(year) / site
- phenocam = _load_json_series(metrics_dir / "gcc_phenocam.json")
- s2_series = _load_json_series(metrics_dir / "gcc_s2.json")
-
- repr_metrics = compute_representativeness(phenocam, s2_series)
- n_cloudfree = len(s2_series)
- eligible = n_cloudfree >= min_cloudfree
-
- result: dict[str, Any] = {
- "eligible": eligible,
- "n_cloudfree_s2": n_cloudfree,
- "representativeness": repr_metrics,
- "loocv": None,
- "winner_s2": None,
- "winner_phenocam": None,
- "winner_agreement": None,
- "records": [],
- }
-
- if not eligible:
- return result
-
- records = run_loocv_site(site, year, lat, lon, s2_series, phenocam, efast)
- loocv = summarize_loocv(records)
- result["loocv"] = loocv
- result["winner_s2"] = loocv["winner_s2"]
- result["winner_phenocam"] = loocv["winner_phenocam"]
- result["winner_agreement"] = loocv["winner_agreement"]
- result["records"] = records
- return result
-
-
-# ---------------------------------------------------------------------------
-# Output / summary
-# ---------------------------------------------------------------------------
-
-
-def _compact_site_payload(site_result: dict[str, Any]) -> dict[str, Any]:
- """Drop raw LOOCV records from JSON output (keep summaries only)."""
- out = {
- "eligible": site_result["eligible"],
- "n_cloudfree_s2": site_result["n_cloudfree_s2"],
- "representativeness": site_result["representativeness"],
- "winner_s2": site_result.get("winner_s2"),
- "winner_phenocam": site_result.get("winner_phenocam"),
- "winner_agreement": site_result.get("winner_agreement"),
- }
- if site_result.get("loocv"):
- out["loocv"] = site_result["loocv"]
- return out
-
-
-def _print_summary(payload: dict[str, Any]) -> None:
- year = payload["year"]
- agg = payload["aggregate"]
- print(
- f"\nPhenoCam GCC suitability — {year} "
- f"({payload['n_sites_total']} site(s), "
- f"{payload['n_sites_eligible']} LOOCV-eligible, "
- f"{payload['n_sites_repr_pass']} representative)"
- )
- print(f"Verdict: {agg['suitability_verdict']}")
- print(
- f" pooled Spearman (method errors): {agg.get('pooled_spearman', '—')} "
- f"residual corr: {agg.get('residual_corr', '—')} "
- f"winner agreement: {agg.get('winner_agreement_rate', '—')} "
- f"LOOCV dates: {agg.get('n_loocv_dates', '—')}"
- )
- print(f"\n{'site':<28} {'repr r':>8} {'pass':>5} {'LOOCV n':>8} {'win agree':>10}")
- print("-" * 65)
- for site, data in sorted(payload["sites"].items()):
- rep = data["representativeness"]
- loocv_n = data.get("loocv", {}).get("n_dates") if data.get("loocv") else "—"
- agree = data.get("winner_agreement")
- agree_s = "yes" if agree else ("no" if agree is False else "—")
- pass_s = "yes" if rep.get("representative") else "no"
- print(
- f"{site:<28} {rep.get('r') or '—':>8} {pass_s:>5} "
- f"{loocv_n!s:>8} {agree_s:>10}"
- )
- if payload["n_sites_total"] < SMALL_SAMPLE_SITES:
- print(
- f"\nNote: only {payload['n_sites_total']} site(s); "
- "interpret cross-site aggregates cautiously."
- )
- if payload.get("dropped_sites"):
- print(f"Dropped/ineligible: {', '.join(payload['dropped_sites'])}")
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
- parser.add_argument(
- "--min-cloudfree-s2",
- type=int,
- default=MIN_CLOUDFREE_S2,
- help="Minimum cloud-free S2 dates for LOOCV (default 10)",
- )
- parser.add_argument(
- "--alpha",
- type=float,
- default=DEFAULT_ALPHA,
- help="Significance threshold (reserved; default 0.05)",
- )
- args = parser.parse_args()
-
- year = args.evaluation_year
- min_cloudfree = args.min_cloudfree_s2
-
- sites = _discover_sites(year)
- if not sites:
- raise SystemExit(
- f"No Step 5 metrics found under {DATA_DIR / 'metrics' / str(year)}"
- )
-
- coords = _load_site_coords(year)
- efast = _import_efast()
-
- site_results: dict[str, dict[str, Any]] = {}
- dropped: list[str] = []
-
- for site in tqdm(sites, desc="Sites"):
- if site not in coords:
- dropped.append(site)
- continue
- lat, lon = coords[site]
- site_results[site] = process_site(site, year, lat, lon, min_cloudfree, efast)
- if not site_results[site]["eligible"]:
- dropped.append(site)
-
- n_eligible = sum(1 for s in site_results.values() if s["eligible"])
- n_repr_pass = sum(
- 1
- for s in site_results.values()
- if s["representativeness"].get("representative")
- )
-
- pooled = _pooled_concordance(list(site_results.values()))
- verdict = _suitability_verdict(n_repr_pass, n_eligible, len(sites), pooled)
-
- payload = {
- "year": year,
- "alpha": args.alpha,
- "repr_r_threshold": REPR_R_THRESHOLD,
- "min_cloudfree_s2": min_cloudfree,
- "n_sites_total": len(sites),
- "n_sites_eligible": n_eligible,
- "n_sites_repr_pass": n_repr_pass,
- "aggregate": {
- "suitability_verdict": verdict,
- **pooled,
- },
- "sites": {
- site: _compact_site_payload(data)
- for site, data in sorted(site_results.items())
- },
- "dropped_sites": sorted(set(dropped)),
- }
-
- out_dir = DATA_DIR / "gcc_suitability"
- out_dir.mkdir(parents=True, exist_ok=True)
- out_path = out_dir / f"{year}.json"
- out_path.write_text(json.dumps(payload, separators=(",", ":")))
-
- _print_summary(payload)
- print(f"\nWritten → {out_path}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 8f8092c..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2026 Felix Delattre
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
deleted file mode 100644
index baf3ba7..0000000
--- a/README.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# EFAST fusion with phenocam validation.
-
-End-to-end pipeline from selecting sites from the global [PhenoCam Network](https://phenocam.nau.edu/) to run [EFAST](https://github.com/DHI-GRAS/efast) spatio-temporal fusion with Sentinel-2 / Sentinel-3 and validate GCCs accross sensors. The numbered steps cover site selection, Sentinel data acquisition, different fusion orders, accuracy metrics, and sample-level statistics, all feeding a static web QA viewer.
-
----
-
-## Pipeline overview
-
-| Step | Script | What it does |
-|------|--------|--------------|
-| 1 | `1-phenocam.py` | Download PhenoCam metadata and `one_day_summary` GCC CSVs |
-| 2 | `2-phenocam-screening.py` | Apply PhenoCam count, SNR, and proximity gates to select feasible sites |
-| 3 | `3-sentinel-data.py` | Acquire S2 (Earth Search COG) and S3 OLCI SYN L2 (CDSE OpenEO); prepare REFL, DIST_CLOUD, and composite GeoTIFFs |
-| 4 | `4-fusion.py` | Run EFAST BtI (fuse reflectance → GCC) and ItB (fuse GCC directly) for each screened site |
-| 5 | `5-metrics.py` | Extract PhenoCam-matched timeseries, compute NSE/RMSE/r baselines and fusion metrics, emit per-site JSON and webapp manifest |
-| 6 | `6-statistics-fusion-order.py` | Paired ItB-vs-BtI significance test (Wilcoxon + t-test) across all sites |
-| 7 | `7-gcc-suitability.py` | PhenoCam GCC suitability as a fusion-accuracy reference (representativeness + LOOCV concordance) |
-
----
-
-## Quick start
-
-### Run pipeline wrapper (recommended)
-
-```bash
-uv sync
-uv run python run-pipeline.py --evaluation-year 2025
-```
-
-Runs all five steps in order. Steps 1 and 2 are skipped when their output already exists. Each site in steps 3–5 is skipped when `data/metrics/{year}/{site}/metrics.json` is present. Any failure stops the run immediately, so one can fix the issue and re-run; completed work is never repeated.
-
-```bash
-# single site (steps 1 and 2 still skip if already done)
-uv run python run-pipeline.py --evaluation-year 2025 --site ICOSFR-Fon1
-```
-
-### Step by step
-
-```bash
-uv sync
-uv run python 1-phenocam.py --evaluation-year 2025
-uv run python 2-phenocam-screening.py --evaluation-year 2025
-uv run python 3-sentinel-data.py --evaluation-year 2025
-uv run python 4-fusion.py --evaluation-year 2025
-uv run python 5-metrics.py --evaluation-year 2025
-uv run python 6-statistics-fusion-order.py --evaluation-year 2025
-uv run python 7-gcc-suitability.py --evaluation-year 2025
-```
-
-Steps 1–5 accept `--evaluation-year` (default `2025`) and `--site` (optional, for single-site runs). Steps 6–7 are full-sample aggregates and only accept `--evaluation-year` (Step 6 and 7 also accept `--alpha`; Step 7 adds `--min-cloudfree-s2`, default `10`). Steps 3–5 are resumable — existing output files are skipped.
-
-```bash
-# single site
-uv run python 3-sentinel-data.py --evaluation-year 2025 --site ICOSFR-Fon1
-uv run python 4-fusion.py --evaluation-year 2025 --site ICOSFR-Fon1
-uv run python 5-metrics.py --evaluation-year 2025 --site ICOSFR-Fon1
-```
-
-### Credentials
-
-Step 3 S3 download uses CDSE OpenEO (`SENTINEL3_SYN_L2_SYN`). Set `CDSE_USER` and `CDSE_PASSWORD` in `../.env` at the workspace root. S2 uses AWS Earth Search COG range reads (no auth required).
-
----
-
-## Outputs (under `data/`)
-
-| Artifact | Step | Role |
-|----------|------|------|
-| `phenocam/{year}.json` | 1 | Site list + `sites_dir` pointer |
-| `phenocam/{year}/{site}.json`, `{site}_1day.csv` | 1 | Raw API payload and GCC CSV |
-| `phenocam_screening/{year}.json` / `.csv` | 2 | Gate results (pass/fail per site) |
-| `sentinel_data/{year}/{site}/prepared/s2/` | 3 | S2 REFL + DIST_CLOUD GeoTIFFs |
-| `sentinel_data/{year}/{site}/prepared/s3/` | 3 | S3 composite GeoTIFFs |
-| `fusion/{year}/{site}/bti/`, `.../itb/` | 4 | BtI fused reflectance + GCC; ItB fused GCC |
-| `metrics/{year}/{site}/` | 5 | Per-site timeseries, metrics, covariates JSON |
-| `metrics/manifest.json` | 5 | Webapp manifest (years + site metadata) |
-| `statistics_fusion_order/{year}.json` | 6 | Paired ItB-vs-BtI test summary (NSE, RMSE, nRMSE, r) |
-| `gcc_suitability/{year}.json` | 7 | PhenoCam GCC suitability summary (representativeness + LOOCV concordance) |
-
----
-
-## Web viewer
-
-
-
-`python3 -m http.server 8080` runs the webapp on [http://localhost:8000/index.html](http://localhost:8000/index.html). Requires step 5 output (`data/metrics/manifest.json`). The Statistics overlay GCC suitability tab uses step 7 output (`data/gcc_suitability/{year}.json`).
-
----
-
-## License
-
-[MIT](LICENSE)
diff --git a/amoros2011multitemporal.pdf b/amoros2011multitemporal.pdf
new file mode 100644
index 0000000..5a49dff
--- /dev/null
+++ b/amoros2011multitemporal.pdf
@@ -0,0 +1,9635 @@
+%PDF-1.5
+%
+1 0 obj
+<<
+/OCProperties <<
+/D <<
+/RBGroups []
+/Order []
+>>
+/OCGs [2 0 R]
+>>
+/Type /Catalog
+/FICL#3AEnfocus 3 0 R
+/Metadata 4 0 R
+/Pages 5 0 R
+>>
+endobj
+6 0 obj
+<<
+/Producer <7064665465582D312E34302E31303B206D6F646966696564207573696E67206954657874AE20372E312E3120A9323030302D323031382069546578742047726F7570204E5620284147504C2D76657273696F6E29>
+/IEEE#20Publication#20ID (5983499)
+/Meeting#20Ending#20Date (14 July 2011)
+/Trapped (False)
+/ModDate (D:20181126164003-05'00')
+/Subject (2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images \(Multi-Temp\);2011; ; ;10.1109/Multi-Temp.2011.6005053)
+/Application ('Certified by IEEE PDFeXpress at 06/17/2011 3:24:51 AM')
+/PTEX.Fullbanner (This is pdfTeX, Version 3.1415926-1.40.10-2.2 \(TeX Live 2009/Debian\) kpathsea version 5.0.0)
+/Creator ('Certified by IEEE PDFeXpress at 06/17/2011 3:24:51 AM')
+/IEEE#20Issue#20ID (6005026)
+/IEEE#20Article#20ID (6005053)
+/Title (Multitemporal fusion of Landsat and MERIS images)
+/Meeting#20Starting#20Date (12 July 2011)
+/CreationDate (D:20110617121830+02'00')
+>>
+endobj
+2 0 obj
+<<
+/Usage <<
+/PageElement <<
+/Subtype /HF
+>>
+>>
+/Type /OCG
+/Name (Headers/Footers)
+>>
+endobj
+3 0 obj
+<<
+/CertifiedWF 7 0 R
+>>
+endobj
+4 0 obj
+<<
+/Length 3528
+/Type /Metadata
+/Subtype /XML
+>>
+stream
+
+
+
+application/pdfIEEE2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images (Multi-Temp);2011; ; ;10.1109/Multi-Temp.2011.6005053Multi-resolutiondata fusionspatial unmixingsub-pixelLandsat TMMERISMultitemporal fusion of Landsat and MERIS imagesJ. Amoros-LopezL. Gomez-ChovaL. GuanterL. AlonsoJ. MorenoG. Camps-Valls
+2011 6th International Workshop on the Analysis of Multi-temporal Remote Sensing Images (Multi-Temp)81 July 201110.1109/Multi-Temp.2011.600505384
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+endstream
+endobj
+5 0 obj
+<<
+/Type /Pages
+/Count 4
+/Kids [8 0 R 9 0 R 10 0 R 11 0 R]
+>>
+endobj
+7 0 obj
+<<
+/Stamp [61 158 234 37 46 215 123 102 121 190
+16 110 0 160 172 209 121 174 11 44]
+/Version 1
+/Sessions [12 0 R 13 0 R 14 0 R 15 0 R]
+>>
+endobj
+8 0 obj
+<<
+/Parent 5 0 R
+/Type /Page
+/Contents [16 0 R 17 0 R 18 0 R]
+/Resources <<
+/ExtGState <<
+/GS0 19 0 R
+>>
+/XObject <<
+/Fm0 20 0 R
+/Fm1 21 0 R
+/Fm2 22 0 R
+>>
+/ProcSet [/PDF /Text]
+/Font <<
+/T1_3 23 0 R
+/T1_1 24 0 R
+/T1_4 25 0 R
+/T1_2 26 0 R
+/T1_5 27 0 R
+/T1_6 28 0 R
+/T1_0 29 0 R
+/T1_7 30 0 R
+/F9 31 0 R
+>>
+>>
+/MediaBox [0 0 612 792]
+>>
+endobj
+9 0 obj
+<<
+/Group 32 0 R
+/Parent 5 0 R
+/Type /Page
+/Contents [33 0 R 34 0 R 35 0 R]
+/Resources <<
+/ExtGState <<
+/GS0 36 0 R
+>>
+/XObject <<
+/Fm0 37 0 R
+/Fm1 38 0 R
+/Fm2 39 0 R
+/Im3 40 0 R
+/Fm3 41 0 R
+/Im1 42 0 R
+/Im2 43 0 R
+/Im0 44 0 R
+>>
+/ProcSet [/PDF /Text /ImageC]
+/Font <<
+/T1_1 23 0 R
+/T1_2 30 0 R
+/T1_0 29 0 R
+/F4 31 0 R
+>>
+>>
+/MediaBox [0 0 612 792]
+>>
+endobj
+10 0 obj
+<<
+/Group 45 0 R
+/Parent 5 0 R
+/Type /Page
+/Contents [46 0 R 47 0 R 48 0 R]
+/Resources <<
+/ExtGState <<
+/GS0 49 0 R
+>>
+/XObject <<
+/Fm0 50 0 R
+/Fm1 51 0 R
+/Fm2 52 0 R
+/Fm3 53 0 R
+/Fm4 54 0 R
+/Fm5 55 0 R
+/Fm6 56 0 R
+>>
+/ProcSet [/PDF /Text]
+/Font <<
+/T1_3 28 0 R
+/T1_1 23 0 R
+/T1_4 30 0 R
+/T1_2 27 0 R
+/T1_0 29 0 R
+/F6 31 0 R
+>>
+>>
+/MediaBox [0 0 612 792]
+>>
+endobj
+11 0 obj
+<<
+/Group 57 0 R
+/Parent 5 0 R
+/Type /Page
+/Contents [58 0 R 59 0 R 60 0 R]
+/Resources <<
+/ExtGState <<
+/GS0 61 0 R
+>>
+/XObject <<
+/Fm0 62 0 R
+/Fm1 63 0 R
+/Fm2 64 0 R
+/Fm3 65 0 R
+>>
+/ProcSet [/PDF /Text]
+/Font <<
+/T1_1 23 0 R
+/T1_2 24 0 R
+/T1_0 29 0 R
+/F4 31 0 R
+>>
+>>
+/MediaBox [0 0 612 792]
+>>
+endobj
+12 0 obj
+<<
+/Length 25686
+/Stamp <03C9E20B79A667795E349A0D00A115EE>
+/Version 1
+/Filter /FlateDecode
+>>
+stream
+x اg]zZ
+@@@Nc$@ $d@23~Ģ`ۨRE7\Ń
+EqU+߳ ʜSk{}߾\r|q./_e}?WO߿o{ǟwxި雞~z#sYnCJЍO5GN=k7={g1ġG#KxQZd֚?Ъ>r!ӛ4^?ҽO{ltvo7<{W]!zgwo}O=ȭx[Nt䖛wo9sl30ǟ;zӷcs郸2'Nߓ̙
|ٳ\KN߱xpGn%[eţ+vۿ:?~gzQ>욫x]Mӷ9rjԇˎçn.UCM~hVC]ܩ#GW$==%)>O?$Ї8+`b_yꙛW:~mgo5'?}{~ 9vbo9yTV;ʎc.\y>FntVtV{3?rYo>#kNrYTzC.
æ~هݏܽi#8F Fƫ=?&w\~g`g7(jMh~rfy3G7nG͵_xI+.:3{reOtdžvm`o?9R?/{79u'^eyZe":nnl|gOŹ/K
+$NN!~m.h>7I{'e/5u'$LiS;ԏa<
;q柶f+Y-Ux+_qqLX">5ӎnͥ>LDyJ_
i?dZՃgmM?4v}{ WKY~|L}]5_%/懾J:m6zv Gns5L\mtT̛q U*gڨg+9} ,Q33O;~vc_
+cvudG3&1Hgy}n/"~G,ۭ}K}~lO`;Nc˳Wy\B ϺJ&=2ʸeeOnk7yCw__9B:`
+Tgl9Ck?n>_>!]*F30H觨_4ZCШiyfЭOױb̠zSfl/6@h`: /vPA^|߄_.lWv8ufgV'݀dC7?D3h&@qI9qeO~۱|UM;*'#4]&vG#ŝ3P6
+1ac ڪ1N
+bXv2YkݎYu;19Vs
+ouXv_m Hh+/H