Switching horses.

This commit is contained in:
Felix Delattre 2026-06-10 14:18:06 +02:00
parent 25cbd97662
commit e3e14027fc
51 changed files with 5078 additions and 11678 deletions

278
1-phenocam.py Normal file
View file

@ -0,0 +1,278 @@
"""Step 1: download worldwide PhenoCam sites for a calendar year.
Inputs (``data/``): none; the script queries the PhenoCam API directly.
Outputs (``data/``, ``{year}`` = ``--evaluation-year``):
- ``phenocam/{year}.json`` site list manifest
- ``phenocam/{year}/{sitename}.json`` camera + ROI metadata
- ``phenocam/{year}/{sitename}_1day.csv`` ``one_day_summary`` GCC CSV
CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional comma-separated filter).
Next step: :mod:`2-phenocam-screening`.
"""
from __future__ import annotations
import argparse
import json
import sys
from datetime import date
from pathlib import Path
from typing import Any
import requests
PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
if str(PROCESSING_DIR) not in sys.path:
sys.path.insert(0, str(PROCESSING_DIR))
from acquisition_phenocam import PHENOCAM_API # noqa: E402
from acquisition_phenocam_all_europe import _paginate_cameras, _parse_iso_date # noqa: E402
# Default calendar year used for the ``--evaluation-year`` CLI option.
EVALUATION_YEAR = 2025
# URL requested by ``check_phenocam_host`` to confirm the API responds.
HOST_PROBE = "https://phenocam.nau.edu/api/cameras/?limit=1"
# Suffix appended to the sitename to form the downloaded CSV's filename.
ONE_DAY_CSV_SUFFIX = "_1day.csv"
def check_phenocam_host() -> None:
    """Confirm that the PhenoCam API answers before any download starts.

    Sends a single GET request to ``HOST_PROBE`` with a 30-second timeout
    and treats an HTTP error status as a failure.

    Raises:
        RuntimeError: If the request fails or returns an error status. The
            underlying ``requests`` exception is chained as the cause.
    """
    try:
        requests.get(HOST_PROBE, timeout=30).raise_for_status()
    except requests.RequestException as exc:
        message = (
            f"PhenoCam API unreachable (phenocam.nau.edu): "
            f"{exc.__class__.__name__}: {exc}"
        )
        raise RuntimeError(message) from exc
def _overlaps_year(first: str | None, last: str | None, season: int) -> bool:
    """Tell whether the span ``first``..``last`` touches calendar year ``season``.

    Both bounds are parsed with ``_parse_iso_date``. If either one parses to
    ``None`` the span is treated as non-overlapping.

    Returns:
        ``True`` when the parsed span starts on or before 31 December and
        ends on or after 1 January of ``season``; ``False`` otherwise.
    """
    archive_start = _parse_iso_date(first)
    archive_end = _parse_iso_date(last)
    if archive_start is None or archive_end is None:
        return False
    year_start = date(season, 1, 1)
    year_end = date(season, 12, 31)
    return archive_start <= year_end and archive_end >= year_start
def sites_dir(cache_dir: Path, evaluation_year: int) -> Path:
    """Return the per-year site directory ``<cache_dir>/phenocam/<year>``."""
    year_folder = str(evaluation_year)
    return cache_dir.joinpath("phenocam", year_folder)
def site_json_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
    """Return the path of the metadata JSON file for ``sitename``.

    The file sits inside the directory returned by ``sites_dir`` and is
    named ``<sitename>.json``.
    """
    year_dir = sites_dir(cache_dir, evaluation_year)
    return year_dir / f"{sitename}.json"
def site_csv_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
    """Return the path of the downloaded CSV file for ``sitename``.

    The file sits inside the directory returned by ``sites_dir`` and is
    named with the sitename followed by ``ONE_DAY_CSV_SUFFIX``.
    """
    year_dir = sites_dir(cache_dir, evaluation_year)
    return year_dir / f"{sitename}{ONE_DAY_CSV_SUFFIX}"
def load_candidate_cameras(
    evaluation_year: int,
    *,
    site_filter: set[str] | None = None,
    active_only: bool = False,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Collect the cameras whose archive overlaps ``evaluation_year``.

    Every record yielded by ``_paginate_cameras`` is checked, and the
    survivors are copied into plain dicts and ordered by ``Sitename``.

    Args:
        evaluation_year: Calendar year the archive dates must overlap.
        site_filter: When given, only sitenames contained in it are kept.
        active_only: When true, records with a falsy ``active`` value are
            dropped.
        limit: When given, the sorted result is sliced to ``[:limit]``.

    Returns:
        The selected camera records, sorted by sitename.
    """

    def _is_candidate(entry: dict[str, Any]) -> bool:
        # The checks run in a fixed order: activity, site filter, date overlap.
        if active_only and not entry.get("active"):
            return False
        name = str(entry["Sitename"])
        if site_filter is not None and name not in site_filter:
            return False
        return _overlaps_year(
            entry.get("date_first"), entry.get("date_last"), evaluation_year
        )

    selected = sorted(
        (dict(entry) for entry in _paginate_cameras() if _is_candidate(entry)),
        key=lambda entry: str(entry["Sitename"]),
    )
    return selected if limit is None else selected[:limit]
def fetch_roi_record(site_name: str) -> dict[str, Any] | None:
    """Return the first ROI list entry whose ``site`` equals ``site_name``.

    Requests ``{PHENOCAM_API}/roilists/`` with a ``site`` query parameter and
    follows each page's ``next`` link until a page contains a matching entry
    or no further page is advertised.

    Returns:
        A copy of the first matching entry, or ``None`` when no page
        contains one.

    Raises:
        requests.HTTPError: If any page responds with an error status.
    """
    next_url = f"{PHENOCAM_API}/roilists/"
    # The query is sent with the first request only; later requests use the
    # ``next`` URL as returned by the API.
    query: dict[str, Any] | None = {"site": site_name}
    while next_url:
        reply = requests.get(next_url, params=query, timeout=60)
        reply.raise_for_status()
        page = reply.json()
        matches = [
            entry for entry in page.get("results", []) if entry.get("site") == site_name
        ]
        if matches:
            return dict(matches[0])
        next_url = page.get("next")
        query = None
    return None
def download_one_day_csv(csv_url: str, output_path: Path) -> None:
    """Fetch ``csv_url`` and store the response text at ``output_path``.

    Missing parent directories of ``output_path`` are created, and the text
    is written as UTF-8.

    Raises:
        requests.HTTPError: If the server responds with an error status.
            Nothing is written in that case.
    """
    reply = requests.get(csv_url, timeout=60)
    reply.raise_for_status()
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    output_path.write_text(reply.text, encoding="utf-8")
def download_site(
    camera: dict[str, Any],
    evaluation_year: int,
    cache_dir: Path,
) -> str:
    """Download and store the metadata and CSV for one camera.

    Looks up the site's ROI record and writes the camera record and ROI
    together as JSON at ``site_json_path``. When the ROI carries a
    ``one_day_summary`` URL, the CSV is also downloaded to ``site_csv_path``.

    Returns:
        The camera's sitename.
    """
    sitename = str(camera["Sitename"])
    roi = fetch_roi_record(sitename)

    metadata_file = site_json_path(cache_dir, evaluation_year, sitename)
    metadata_file.parent.mkdir(parents=True, exist_ok=True)
    document = json.dumps({"response": {"camera": camera, "roi": roi}}, indent=2)
    metadata_file.write_text(document + "\n", encoding="utf-8")

    # No ROI record means there is no CSV URL to follow.
    if not roi:
        return sitename
    summary_url = roi.get("one_day_summary")
    if summary_url:
        csv_file = site_csv_path(cache_dir, evaluation_year, sitename)
        download_one_day_csv(summary_url, csv_file)
    return sitename
def load_or_download_site(
    camera: dict[str, Any],
    evaluation_year: int,
    cache_dir: Path,
    *,
    refresh: bool,
) -> str:
    """Reuse the cached files for a site, or download them when needed.

    A full download via ``download_site`` happens when ``refresh`` is true or
    the site's JSON file does not exist. Otherwise the cached JSON is kept.
    If only the CSV is missing, it is fetched from the ``one_day_summary``
    URL stored in that JSON, when one is present.

    Returns:
        The camera's sitename.
    """
    sitename = str(camera["Sitename"])
    metadata_file = site_json_path(cache_dir, evaluation_year, sitename)
    csv_file = site_csv_path(cache_dir, evaluation_year, sitename)

    if refresh or not metadata_file.is_file():
        return download_site(camera, evaluation_year, cache_dir)
    if csv_file.is_file():
        return sitename

    # The JSON is cached but the CSV is not: recover the URL from the cache.
    cached = json.loads(metadata_file.read_text(encoding="utf-8"))
    roi = cached.get("response", {}).get("roi") or {}
    summary_url = roi.get("one_day_summary")
    if summary_url:
        download_one_day_csv(summary_url, csv_file)
    return sitename
def run_download(
    *,
    cache_dir: Path,
    evaluation_year: int,
    active_only: bool = False,
    site_filter: set[str] | None = None,
    limit: int | None = None,
    refresh: bool = False,
) -> list[str]:
    """Download every candidate site for ``evaluation_year``.

    Probes the API host first, then selects candidates with
    ``load_candidate_cameras`` and processes each one with
    ``load_or_download_site``, printing a progress line per site.

    Args:
        cache_dir: Base directory that receives the per-site files.
        evaluation_year: Calendar year to download.
        active_only: Forwarded to ``load_candidate_cameras``.
        site_filter: Forwarded to ``load_candidate_cameras``.
        limit: Forwarded to ``load_candidate_cameras``.
        refresh: Forwarded to ``load_or_download_site``.

    Returns:
        The processed sitenames in sorted order.
    """
    check_phenocam_host()
    candidates = load_candidate_cameras(
        evaluation_year,
        site_filter=site_filter,
        active_only=active_only,
        limit=limit,
    )
    print(
        f"[PhenoCam-1] {len(candidates)} candidate(s) with archive overlap for "
        f"{evaluation_year}"
    )

    def _process(index: int, camera: dict[str, Any]) -> str:
        # Report progress for this site, then fetch it or reuse its cache.
        sitename = str(camera["Sitename"])
        print(
            f"[PhenoCam-1] ({index}/{len(candidates)}) {sitename} "
            f"({float(camera['Lat']):.4f}, {float(camera['Lon']):.4f})"
        )
        return load_or_download_site(
            camera,
            evaluation_year,
            cache_dir,
            refresh=refresh,
        )

    sitenames = [
        _process(index, camera) for index, camera in enumerate(candidates, start=1)
    ]
    sitenames.sort()
    return sitenames
def write_manifest(
    sitenames: list[str],
    output_path: Path,
    cache_dir: Path,
    evaluation_year: int,
) -> None:
    """Write the site-list manifest JSON for one evaluation year.

    The manifest records the evaluation year, the number of sites, the
    per-site directory expressed relative to the manifest's own directory,
    and the site names as given.

    Args:
        sitenames: Site names to list in the manifest.
        output_path: Destination file. Missing parent directories are
            created.
        cache_dir: Base directory passed to ``sites_dir``.
        evaluation_year: Calendar year the manifest describes.
    """
    import os  # function-scope import: only this helper needs os.path.relpath

    manifest_dir = output_path.parent
    site_root = sites_dir(cache_dir, evaluation_year)
    try:
        # ``Path.relative_to`` raises ValueError unless the manifest directory
        # is an ancestor of the site directory, which made a custom manifest
        # location crash after all downloads had finished. ``relpath`` can
        # walk upwards with ".." and so handles any manifest location.
        rel_sites_dir = Path(os.path.relpath(site_root, start=manifest_dir)).as_posix()
    except ValueError:
        # Raised on Windows when the two paths live on different drives: no
        # relative form exists, so record the absolute location instead.
        rel_sites_dir = site_root.resolve().as_posix()

    payload = {
        "evaluation_year": evaluation_year,
        "count": len(sitenames),
        "sites_dir": rel_sites_dir,
        "sites": sitenames,
    }
    manifest_dir.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    print(f"[PhenoCam-1] Wrote {output_path}")
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, download the sites, and write the manifest.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read the process arguments.

    Returns:
        ``0`` once the manifest has been written.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--cache-dir",
        type=Path,
        default=Path("data"),
        help="Base directory for per-site files and manifest",
    )
    cli.add_argument(
        "--evaluation-year",
        type=int,
        default=EVALUATION_YEAR,
        help=f"Calendar year to download (default: {EVALUATION_YEAR})",
    )
    cli.add_argument(
        "--active-only",
        action="store_true",
        help="Restrict candidates to cameras marked active in the API",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Process only the first N candidate sites (testing)",
    )
    cli.add_argument(
        "--sites",
        type=str,
        default=None,
        help="Comma-separated sitenames to download (testing)",
    )
    cli.add_argument(
        "--refresh",
        action="store_true",
        help="Re-download sites even when cache files exist",
    )
    cli.add_argument(
        "--output-json",
        type=Path,
        default=None,
        help="Manifest output path (default: data/phenocam/{year}.json)",
    )
    args = cli.parse_args(argv)

    # Turn the comma-separated option into a set of non-empty, trimmed names.
    if args.sites:
        trimmed = (part.strip() for part in args.sites.split(","))
        site_filter = {token for token in trimmed if token}
    else:
        site_filter = None

    sitenames = run_download(
        cache_dir=args.cache_dir,
        evaluation_year=args.evaluation_year,
        active_only=args.active_only,
        site_filter=site_filter,
        limit=args.limit,
        refresh=args.refresh,
    )

    # Fall back to the default manifest location when none was requested.
    manifest_path = args.output_json
    if not manifest_path:
        manifest_path = args.cache_dir / "phenocam" / f"{args.evaluation_year}.json"
    write_manifest(sitenames, manifest_path, args.cache_dir, args.evaluation_year)
    return 0
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())