Switching horses.

2026-06-10 14:18:06 +02:00 · 2026-06-10 14:18:06 +02:00 · e3e14027fc
commit e3e14027fc
parent 25cbd97662
51 changed files with 5078 additions and 11678 deletions
--- a/1-phenocam.py
+++ b/1-phenocam.py
@ -0,0 +1,278 @@
+"""Step 1: download worldwide PhenoCam sites for a calendar year.
+
+Inputs (``data/``): none — queries the PhenoCam API.
+
+Outputs (``data/``, ``{year}`` = ``--evaluation-year``):
+
+- ``phenocam/{year}.json`` — site list manifest
+- ``phenocam/{year}/{sitename}.json`` — camera + ROI metadata
+- ``phenocam/{year}/{sitename}_1day.csv`` — ``one_day_summary`` GCC CSV
+
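+CLI: ``--evaluation-year`` (default 2025), ``--sites`` (optional comma-separated
+filter), ``--cache-dir`` (default ``data``), ``--active-only``, ``--limit``,
+``--refresh``, ``--output-json`` (manifest path override).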
+
+Next step: :mod:`2-phenocam-screening`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+import requests
+
+# ``parents[1]`` is the directory above the one containing this file; its
+# ``processing`` subdirectory is put first on ``sys.path`` (once) so that the
+# two project imports below can be resolved from there.
+PROCESSING_DIR = Path(__file__).resolve().parents[1] / "processing"
+if str(PROCESSING_DIR) not in sys.path:
+    sys.path.insert(0, str(PROCESSING_DIR))
+
+# These imports must come after the ``sys.path`` tweak, hence ``noqa: E402``.
+from acquisition_phenocam import PHENOCAM_API  # noqa: E402
+from acquisition_phenocam_all_europe import _paginate_cameras, _parse_iso_date  # noqa: E402
+
+# Default value of ``--evaluation-year``.
+EVALUATION_YEAR = 2025
+# URL requested by ``check_phenocam_host`` to confirm the API answers.
+# NOTE(review): hard-coded host; presumably the same service as PHENOCAM_API — confirm.
+HOST_PROBE = "https://phenocam.nau.edu/api/cameras/?limit=1"
+# Suffix appended to the sitename to form the per-site one-day summary CSV name.
+ONE_DAY_CSV_SUFFIX = "_1day.csv"
+
+
+def check_phenocam_host() -> None:
+    """Fail fast when the PhenoCam API cannot be reached.
+
+    Sends one GET to ``HOST_PROBE`` with a 30 second timeout and checks the
+    response status with ``raise_for_status``.
+
+    Raises:
+        RuntimeError: chained from the underlying
+            ``requests.RequestException`` when the request fails.
+    """
+    try:
+        requests.get(HOST_PROBE, timeout=30).raise_for_status()
+    except requests.RequestException as exc:
+        reason = f"{exc.__class__.__name__}: {exc}"
+        raise RuntimeError(
+            f"PhenoCam API unreachable (phenocam.nau.edu): {reason}"
+        ) from exc
+
+
+def _overlaps_year(first: str | None, last: str | None, season: int) -> bool:
+    """Tell whether the span ``first``..``last`` touches calendar year ``season``.
+
+    Both bounds are parsed with ``_parse_iso_date``; when either one parses to
+    ``None`` the span is reported as not overlapping.
+    """
+    span_start, span_end = _parse_iso_date(first), _parse_iso_date(last)
+    if span_start is None or span_end is None:
+        return False
+    year_start, year_end = date(season, 1, 1), date(season, 12, 31)
+    # Overlap is the negation of "starts after the year ends or ends before it begins".
+    return not (span_start > year_end or span_end < year_start)
+
+
+def sites_dir(cache_dir: Path, evaluation_year: int) -> Path:
+    """Return ``<cache_dir>/phenocam/<evaluation_year>``, the per-site file folder."""
+    return cache_dir.joinpath("phenocam", str(evaluation_year))
+
+
+def site_json_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
+    """Return the path of the camera + ROI metadata JSON for ``sitename``."""
+    filename = f"{sitename}.json"
+    return sites_dir(cache_dir, evaluation_year) / filename
+
+
+def site_csv_path(cache_dir: Path, evaluation_year: int, sitename: str) -> Path:
+    """Return the path of the one-day summary CSV for ``sitename``."""
+    filename = f"{sitename}{ONE_DAY_CSV_SUFFIX}"
+    return sites_dir(cache_dir, evaluation_year) / filename
+
+
+def load_candidate_cameras(
+    evaluation_year: int,
+    *,
+    site_filter: set[str] | None = None,
+    active_only: bool = False,
+    limit: int | None = None,
+) -> list[dict[str, Any]]:
+    """Collect API camera records whose archive overlaps ``evaluation_year``.
+
+    Args:
+        evaluation_year: calendar year that the record's
+            ``date_first``..``date_last`` span must overlap.
+        site_filter: when given, only records whose ``Sitename`` is in the
+            set are kept.
+        active_only: when true, records with a falsy ``active`` field are
+            dropped.
+        limit: when given, the sorted result is cut to ``[:limit]``.
+
+    Returns:
+        Shallow copies of the matching records, sorted by ``Sitename``.
+    """
+
+    def _keep(record: dict[str, Any]) -> bool:
+        # Same check order as the filters are documented above.
+        if active_only and not record.get("active"):
+            return False
+        name = str(record["Sitename"])
+        if site_filter is not None and name not in site_filter:
+            return False
+        return _overlaps_year(
+            record.get("date_first"), record.get("date_last"), evaluation_year
+        )
+
+    selected = sorted(
+        (dict(record) for record in _paginate_cameras() if _keep(record)),
+        key=lambda record: str(record["Sitename"]),
+    )
+    return selected if limit is None else selected[:limit]
+
+
+def fetch_roi_record(site_name: str) -> dict[str, Any] | None:
+    """Return the first ROI-list record whose ``site`` equals ``site_name``.
+
+    Pages through ``{PHENOCAM_API}/roilists/`` (the ``site`` query parameter
+    is sent on the first request only; later requests follow the ``next`` URL
+    as-is) and stops at the first page that contains a matching record.
+
+    Returns:
+        A copy of the first matching record, or ``None`` when no page has one.
+
+    Raises:
+        requests.HTTPError: propagated from ``raise_for_status``.
+    """
+    next_url = f"{PHENOCAM_API}/roilists/"
+    query: dict[str, Any] | None = {"site": site_name}
+    while next_url:
+        response = requests.get(next_url, params=query, timeout=60)
+        response.raise_for_status()
+        page = response.json()
+        matches = [
+            entry for entry in page.get("results", []) if entry.get("site") == site_name
+        ]
+        if matches:
+            return dict(matches[0])
+        next_url = page.get("next")
+        query = None
+    return None
+
+
+def download_one_day_csv(csv_url: str, output_path: Path) -> None:
+    """Download ``csv_url`` and store the body unchanged at ``output_path``.
+
+    The payload is written byte-for-byte from ``response.content``. Going
+    through ``response.text`` would re-decode the body with whatever charset
+    Requests infers (ISO-8859-1 for ``text/*`` responses that declare none),
+    which can corrupt non-ASCII characters, and text-mode writing would also
+    translate line endings on Windows.
+
+    Args:
+        csv_url: URL of the CSV to fetch (60 second timeout).
+        output_path: destination file; missing parent directories are created.
+
+    Raises:
+        requests.HTTPError: propagated from ``raise_for_status``.
+    """
+    response = requests.get(csv_url, timeout=60)
+    response.raise_for_status()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_bytes(response.content)
+
+
+def download_site(
+    camera: dict[str, Any],
+    evaluation_year: int,
+    cache_dir: Path,
+) -> str:
+    """Fetch one site's ROI record, write its metadata JSON, then its CSV.
+
+    The JSON file holds ``{"response": {"camera": ..., "roi": ...}}`` and is
+    written before the CSV download is attempted. The CSV is fetched only when
+    an ROI record was found and it has a truthy ``one_day_summary`` URL.
+
+    Returns:
+        The site's name (``str(camera["Sitename"])``).
+    """
+    sitename = str(camera["Sitename"])
+    roi = fetch_roi_record(sitename)
+
+    metadata_path = site_json_path(cache_dir, evaluation_year, sitename)
+    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+    document = json.dumps({"response": {"camera": camera, "roi": roi}}, indent=2)
+    metadata_path.write_text(document + "\n", encoding="utf-8")
+
+    if roi:
+        summary_url = roi.get("one_day_summary")
+        if summary_url:
+            download_one_day_csv(
+                summary_url, site_csv_path(cache_dir, evaluation_year, sitename)
+            )
+    return sitename
+
+
+def load_or_download_site(
+    camera: dict[str, Any],
+    evaluation_year: int,
+    cache_dir: Path,
+    *,
+    refresh: bool,
+) -> str:
+    """Reuse a site's cached files when possible, otherwise download them.
+
+    A full download happens when ``refresh`` is true or the metadata JSON is
+    missing. When the JSON exists but the CSV does not, the CSV URL is read
+    from the cached JSON (``response.roi.one_day_summary``) and only the CSV
+    is fetched.
+
+    Returns:
+        The site's name (``str(camera["Sitename"])``).
+    """
+    sitename = str(camera["Sitename"])
+    json_path = site_json_path(cache_dir, evaluation_year, sitename)
+    if refresh or not json_path.is_file():
+        return download_site(camera, evaluation_year, cache_dir)
+
+    csv_path = site_csv_path(cache_dir, evaluation_year, sitename)
+    if csv_path.is_file():
+        return sitename
+
+    # Metadata is cached but the CSV is not: recover the URL from the cache.
+    cached = json.loads(json_path.read_text(encoding="utf-8"))
+    cached_roi = cached.get("response", {}).get("roi") or {}
+    summary_url = cached_roi.get("one_day_summary")
+    if summary_url:
+        download_one_day_csv(summary_url, csv_path)
+    return sitename
+
+
+def run_download(
+    *,
+    cache_dir: Path,
+    evaluation_year: int,
+    active_only: bool = False,
+    site_filter: set[str] | None = None,
+    limit: int | None = None,
+    refresh: bool = False,
+) -> list[str]:
+    """Check the API, select candidate cameras and fetch each site's files.
+
+    Progress is printed to stdout, one line per site, including the camera's
+    ``Lat`` / ``Lon`` formatted to four decimals.
+
+    Returns:
+        The processed sitenames in sorted order.
+    """
+    check_phenocam_host()
+    candidates = load_candidate_cameras(
+        evaluation_year,
+        site_filter=site_filter,
+        active_only=active_only,
+        limit=limit,
+    )
+    total = len(candidates)
+    print(
+        f"[PhenoCam-1] {total} candidate(s) with archive overlap for {evaluation_year}"
+    )
+
+    completed: list[str] = []
+    for position, camera in enumerate(candidates, start=1):
+        sitename = str(camera["Sitename"])
+        latitude, longitude = float(camera["Lat"]), float(camera["Lon"])
+        print(
+            f"[PhenoCam-1] ({position}/{total}) {sitename} "
+            f"({latitude:.4f}, {longitude:.4f})"
+        )
+        completed.append(
+            load_or_download_site(camera, evaluation_year, cache_dir, refresh=refresh)
+        )
+    completed.sort()
+    return completed
+
+
+def write_manifest(
+    sitenames: list[str],
+    output_path: Path,
+    cache_dir: Path,
+    evaluation_year: int,
+) -> None:
+    """Write the site-list manifest JSON to ``output_path``.
+
+    The manifest records the evaluation year, the number of sites, the site
+    names, and ``sites_dir``: the per-site folder expressed relative to the
+    manifest's own directory.
+
+    ``Path.relative_to`` only works when the per-site folder lies below the
+    manifest's directory, which holds for the default manifest location but
+    not for an arbitrary ``--output-json``. In that case the path is computed
+    with ``os.path.relpath`` (which may contain ``..``), and if even that is
+    impossible the absolute path is stored, so the run no longer fails
+    after all downloads have completed.
+
+    Args:
+        sitenames: site names to list in the manifest.
+        output_path: manifest file to write; parent directories are created.
+        cache_dir: base directory passed to ``sites_dir``.
+        evaluation_year: year passed to ``sites_dir`` and stored in the manifest.
+    """
+    import os  # function-scope import: this module does not import ``os`` at the top
+
+    target_dir = sites_dir(cache_dir, evaluation_year)
+    manifest_dir = output_path.parent
+    try:
+        rel_sites_dir = target_dir.relative_to(manifest_dir)
+    except ValueError:
+        # Manifest lives outside the per-site tree: walk up with ``..`` instead.
+        try:
+            rel_sites_dir = Path(os.path.relpath(target_dir, manifest_dir))
+        except ValueError:
+            # No relative path exists (e.g. different Windows drives).
+            rel_sites_dir = target_dir.resolve()
+
+    payload = {
+        "evaluation_year": evaluation_year,
+        "count": len(sitenames),
+        "sites_dir": rel_sites_dir.as_posix(),
+        "sites": sitenames,
+    }
+    manifest_dir.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+    print(f"[PhenoCam-1] Wrote {output_path}")
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse the command line, download the sites and write the manifest.
+
+    Args:
+        argv: argument list to parse; ``None`` lets ``argparse`` read
+            ``sys.argv``.
+
+    Returns:
+        ``0`` after the manifest has been written.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        # The module docstring contains a bullet list; the default formatter
+        # would reflow it into a single paragraph in ``--help`` output.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=Path("data"),
+        help="Base directory for per-site files and manifest",
+    )
+    parser.add_argument(
+        "--evaluation-year",
+        type=int,
+        default=EVALUATION_YEAR,
+        help=f"Calendar year to download (default: {EVALUATION_YEAR})",
+    )
+    parser.add_argument(
+        "--active-only",
+        action="store_true",
+        help="Restrict candidates to cameras marked active in the API",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Process only the first N candidate sites (testing)",
+    )
+    parser.add_argument(
+        "--sites",
+        type=str,
+        default=None,
+        help="Comma-separated sitenames to download (testing)",
+    )
+    parser.add_argument(
+        "--refresh",
+        action="store_true",
+        help="Re-download sites even when cache files exist",
+    )
+    parser.add_argument(
+        "--output-json",
+        type=Path,
+        default=None,
+        # The default follows --cache-dir rather than a fixed ``data/`` folder.
+        help="Manifest output path (default: <cache-dir>/phenocam/{year}.json)",
+    )
+    args = parser.parse_args(argv)
+
+    # Turn "a, b,,c" into {"a", "b", "c"}; no --sites means no filtering.
+    site_filter = None
+    if args.sites:
+        site_filter = {name.strip() for name in args.sites.split(",") if name.strip()}
+
+    sitenames = run_download(
+        cache_dir=args.cache_dir,
+        evaluation_year=args.evaluation_year,
+        active_only=args.active_only,
+        site_filter=site_filter,
+        limit=args.limit,
+        refresh=args.refresh,
+    )
+    manifest_path = args.output_json or (
+        args.cache_dir / "phenocam" / f"{args.evaluation_year}.json"
+    )
+    write_manifest(sitenames, manifest_path, args.cache_dir, args.evaluation_year)
+    return 0
+
+
+# Run the CLI only when executed as a script; the value returned by ``main``
+# is handed to ``SystemExit`` and so becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())