efast-phenocam-validation/fix-tile-boundary.py
Felix Delattre d29754e4a5 Ran linter.
2026-06-11 16:33:14 +02:00

196 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Detect and repair sites where S2 downloads span multiple MGRS tile extents.
Sites on MGRS tile boundaries produce REFL files from two tiles with different
spatial extents (e.g. 16SDA and 16SDB). This breaks EFAST, which requires all
S2 files to share the same grid. This script:
1. Scans all downloaded sites for the given year.
2. Reports any site where ``prepared/s2`` contains REFL files of mixed shapes.
3. With ``--fix``:
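- Removes every REFL file whose shape differs from the largest-extent shape, together with its companion files (e.g. DIST_CLOUD), and all GCC files in ``prepared/s2``.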
- Deletes the stale ``prepared/s3`` and ``prepared/gcc_s3`` composites.
- Regenerates ``prepared/s3`` composites from the existing raw S3 data using
the largest-extent S2 tile as the reference grid.
``prepared/gcc_s3`` is intentionally left empty — step 4 (``4-fusion.py``)
regenerates it on its next run.
Usage::
uv run python fix-tile-boundary.py # detect only
uv run python fix-tile-boundary.py --fix # detect + repair
uv run python fix-tile-boundary.py --fix --evaluation-year 2024
Prior step: :mod:`3-sentinel-data`.
Next step after fixing: :mod:`4-fusion`.
"""
from __future__ import annotations
import argparse
import importlib.util
import shutil
import sys
from collections import Counter
from pathlib import Path
import rasterio
# Root of the data tree that detect() scans (``DATA_DIR/sentinel_data/<year>``).
# The path is relative, so it is resolved against the current working directory.
DATA_DIR = Path("data")
# Year used when ``--evaluation-year`` is not given on the command line.
DEFAULT_YEAR = 2025
# ---------------------------------------------------------------------------
# Detection
# ---------------------------------------------------------------------------
def _refl_shapes(s2_dir: Path) -> dict[tuple[int, int], list[Path]]:
    """Group the ``*_REFL.tif`` files found in *s2_dir* by raster shape.

    Files are visited in sorted path order and each one is opened with
    rasterio just long enough to read its ``shape``.

    Returns a dict keyed by shape whose values list the files having that
    shape, in visiting order. The dict is empty when no file matches.
    """
    grouped: dict[tuple[int, int], list[Path]] = {}
    refl_files = sorted(s2_dir.glob("*_REFL.tif"))
    for refl_file in refl_files:
        with rasterio.open(refl_file) as dataset:
            dims: tuple[int, int] = dataset.shape  # type: ignore[assignment]
        # First file of a given shape opens a new bucket.
        if dims not in grouped:
            grouped[dims] = []
        grouped[dims].append(refl_file)
    return grouped
def detect(year: int) -> list[Path]:
    """Find the sites of *year* whose ``prepared/s2`` holds REFL files of several shapes.

    Every entry of ``DATA_DIR/sentinel_data/<year>`` is checked in sorted
    order; entries without a ``prepared/s2`` sub-directory are skipped.
    One line is printed per offending site, or a single "all OK" line when
    none is found.

    Returns the offending site directories (empty when the year directory
    does not exist or every site is consistent).
    """
    sentinel_dir = DATA_DIR / "sentinel_data" / str(year)
    if not sentinel_dir.exists():
        print(f"[detect] No sentinel data found at {sentinel_dir}")
        return []

    bad_sites: list[Path] = []
    for site_dir in sorted(sentinel_dir.iterdir()):
        prepared_s2 = site_dir.joinpath("prepared", "s2")
        if not prepared_s2.exists():
            continue
        shapes = _refl_shapes(prepared_s2)
        if len(shapes) <= 1:
            # Zero or one distinct shape: the site is consistent.
            continue
        parts = [f"{s[0]}×{s[1]} ({len(fs)} files)" for s, fs in shapes.items()]
        summary = ", ".join(parts)
        print(f"[detect] {site_dir.name}: mixed shapes — {summary}")
        bad_sites.append(site_dir)

    if bad_sites:
        return bad_sites
    print("[detect] All sites OK — no mixed tile shapes found.")
    return bad_sites
# ---------------------------------------------------------------------------
# Repair
# ---------------------------------------------------------------------------
def _load_step3():
    """Load ``3-sentinel-data.py`` as the module ``step3`` and return it.

    The file name is not a valid Python identifier, so it is loaded from its
    path instead of through a plain ``import``. The path is relative and is
    therefore resolved against the current working directory. Because the
    module is executed under the name ``step3``, an
    ``if __name__ == "__main__"`` guard in that file does not fire.

    The module is registered in ``sys.modules`` before it is executed (as the
    importlib recipe for importing a source file recommends) and is reused on
    later calls, so repairing several sites executes the file only once.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
        Exception: anything raised while executing the file (for example
            ``FileNotFoundError`` when it is missing) propagates unchanged.
    """
    module_name = "step3"
    cached = sys.modules.get(module_name)
    if cached is not None:
        return cached

    spec = importlib.util.spec_from_file_location(module_name, "3-sentinel-data.py")
    if spec is None or spec.loader is None:
        raise ImportError("Cannot build an import spec for 3-sentinel-data.py")

    mod = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind for the next call.
        sys.modules.pop(module_name, None)
        raise
    return mod
def repair(site_dir: Path) -> None:
    """Make one site's ``prepared/s2`` single-shape and rebuild its S3 composites.

    Steps, in order:

    1. Pick the reference shape: the REFL shape with the largest
       rows × columns product. Return early if the site has at most one shape.
    2. For every REFL file of any other shape, delete all ``<stem>_*.tif``
       files sharing its prefix (the REFL file itself included).
    3. Delete every ``*_GCC.tif`` in ``prepared/s2``.
    4. Delete the ``prepared/s3`` and ``prepared/gcc_s3`` directories.
    5. If raw S3 files (``raw/s3/S3*.tif``) exist, call ``_prepare_s3`` from
       ``3-sentinel-data.py`` with the first remaining REFL file (sorted
       order) as reference to repopulate ``prepared/s3``.

    Progress and warnings are printed; nothing is returned.
    """
    name = site_dir.name
    prepared = site_dir / "prepared"
    s2_dir = prepared / "s2"
    s3_raw = site_dir / "raw" / "s3"
    s3_out = prepared / "s3"
    composite_dirs = (s3_out, prepared / "gcc_s3")

    # --- 1. Identify reference shape (largest extent) -------------------------
    by_shape = _refl_shapes(s2_dir)
    if len(by_shape) <= 1:
        print(f"[repair] {name}: already consistent — nothing to do.")
        return
    ref_shape = max(by_shape, key=lambda dims: dims[0] * dims[1])

    # --- 2. Remove non-reference REFL + companions ----------------------------
    off_grid = [
        refl_file
        for dims, refl_files in by_shape.items()
        if dims != ref_shape
        for refl_file in refl_files
    ]
    n_removed = 0
    for refl_file in off_grid:
        stem = refl_file.stem[: -len("_REFL")]
        for sibling in s2_dir.glob(f"{stem}_*.tif"):
            sibling.unlink()
        # The glob above normally removes the REFL file already.
        refl_file.unlink(missing_ok=True)
        n_removed += 1
    print(
        f"[repair] {name}: removed {n_removed} minority-shape file-sets (kept {ref_shape[0]}×{ref_shape[1]})"
    )

    # --- 3. Remove stale GCC files from prepared/s2 ---------------------------
    gcc_removed = 0
    for gcc_file in s2_dir.glob("*_GCC.tif"):
        gcc_file.unlink()
        gcc_removed += 1
    if gcc_removed:
        print(
            f"[repair] {name}: removed {gcc_removed} stale GCC files from prepared/s2"
        )

    # --- 4. Wipe stale S3 composites ------------------------------------------
    for d in composite_dirs:
        if not d.exists():
            continue
        shutil.rmtree(d)
        print(f"[repair] {name}: removed {d.relative_to(site_dir)}/")

    # --- 5. Regenerate S3 composites with the correct reference ---------------
    has_raw_s3 = s3_raw.exists() and any(s3_raw.glob("S3*.tif"))
    if not has_raw_s3:
        print(
            f"[repair] {name}: WARNING — no raw S3 data in {s3_raw}; skipping S3 regeneration."
        )
        return

    remaining_refl = sorted(s2_dir.glob("*_REFL.tif"))
    s2_refl_path = remaining_refl[0] if remaining_refl else None
    if s2_refl_path is None:
        print(
            f"[repair] {name}: WARNING — no REFL files left; cannot regenerate S3 composites."
        )
        return

    print(
        f"[repair] {name}: regenerating S3 composites (reference: {s2_refl_path.name})..."
    )
    sentinel_step = _load_step3()
    s3_out.mkdir(parents=True, exist_ok=True)
    # NOTE(review): _prepare_s3 lives in 3-sentinel-data.py; presumably it
    # writes composite_*.tif files into s3_out — confirm against that file.
    sentinel_step._prepare_s3(s3_raw, s2_refl_path, s3_out)

    n_composites = sum(1 for _ in s3_out.glob("composite_*.tif"))
    print(f"[repair] {name}: wrote {n_composites} composites → ready for 4-fusion.py")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: detect mixed-shape sites and optionally repair them.

    *argv* is handed to ``argparse`` (``None`` makes it read ``sys.argv``).
    Detection always runs; sites are repaired only when ``--fix`` is given,
    otherwise a hint about ``--fix`` is printed.

    Returns 0 in every case.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
    cli.add_argument(
        "--fix",
        action="store_true",
        help="Actually repair detected sites (default: detect only)",
    )
    options = cli.parse_args(argv)

    bad_sites = detect(options.evaluation_year)
    if bad_sites and not options.fix:
        print(f"\nRun with --fix to repair {len(bad_sites)} site(s).")
    elif bad_sites:
        # Blank line separates the detection report from the repair log.
        print()
        for site_dir in bad_sites:
            repair(site_dir)
    return 0
# Script entry point: run the CLI and use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())