"""Detect and repair sites where S2 downloads span multiple MGRS tile extents. Sites on MGRS tile boundaries produce REFL files from two tiles with different spatial extents (e.g. 16SDA and 16SDB). This breaks EFAST, which requires all S2 files to share the same grid. This script: 1. Scans all downloaded sites for the given year. 2. Reports any site where ``prepared/s2`` contains REFL files of mixed shapes. 3. With ``--fix``: - Removes the minority-shape REFL / DIST_CLOUD / GCC files. - Deletes the stale ``prepared/s3`` and ``prepared/gcc_s3`` composites. - Regenerates ``prepared/s3`` composites from the existing raw S3 data using the largest-extent S2 tile as the reference grid. ``prepared/gcc_s3`` is intentionally left empty — step 4 (``4-fusion.py``) regenerates it on its next run. Usage:: uv run python fix-tile-boundary.py # detect only uv run python fix-tile-boundary.py --fix # detect + repair uv run python fix-tile-boundary.py --fix --evaluation-year 2024 Prior step: :mod:`3-sentinel-data`. Next step after fixing: :mod:`4-fusion`. """ from __future__ import annotations import argparse import importlib.util import shutil import sys from collections import Counter from pathlib import Path import rasterio DATA_DIR = Path("data") DEFAULT_YEAR = 2025 # --------------------------------------------------------------------------- # Detection # --------------------------------------------------------------------------- def _refl_shapes(s2_dir: Path) -> dict[tuple[int, int], list[Path]]: """Return a mapping of shape → REFL file paths for a prepared/s2 directory.""" shape_to_files: dict[tuple[int, int], list[Path]] = {} for f in sorted(s2_dir.glob("*_REFL.tif")): with rasterio.open(f) as src: shape: tuple[int, int] = src.shape # type: ignore[assignment] shape_to_files.setdefault(shape, []).append(f) return shape_to_files def detect(year: int) -> list[Path]: """Return site directories whose prepared/s2 has mixed REFL shapes.""" sentinel_dir = DATA_DIR / "sentinel_data" / str(year) if not sentinel_dir.exists(): print(f"[detect] No sentinel data found at {sentinel_dir}") return [] bad_sites: list[Path] = [] for site_dir in sorted(sentinel_dir.iterdir()): s2_dir = site_dir / "prepared" / "s2" if not s2_dir.exists(): continue shapes = _refl_shapes(s2_dir) if len(shapes) > 1: summary = ", ".join( f"{s[0]}×{s[1]} ({len(fs)} files)" for s, fs in shapes.items() ) print(f"[detect] {site_dir.name}: mixed shapes — {summary}") bad_sites.append(site_dir) if not bad_sites: print("[detect] All sites OK — no mixed tile shapes found.") return bad_sites # --------------------------------------------------------------------------- # Repair # --------------------------------------------------------------------------- def _load_step3(): """Import helpers from 3-sentinel-data.py without executing its main().""" spec = importlib.util.spec_from_file_location("step3", "3-sentinel-data.py") mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type] spec.loader.exec_module(mod) # type: ignore[union-attr] return mod def repair(site_dir: Path) -> None: """Remove minority-shape S2 files and regenerate S3 composites for one site.""" s2_dir = site_dir / "prepared" / "s2" s3_raw = site_dir / "raw" / "s3" s3_out = site_dir / "prepared" / "s3" gcc_s3_out = site_dir / "prepared" / "gcc_s3" name = site_dir.name # --- 1. Identify reference shape (largest extent) ------------------------- shapes = _refl_shapes(s2_dir) if len(shapes) <= 1: print(f"[repair] {name}: already consistent — nothing to do.") return ref_shape = max(shapes.keys(), key=lambda s: s[0] * s[1]) # --- 2. Remove non-reference REFL + companions ---------------------------- n_removed = 0 for shape, files in shapes.items(): if shape == ref_shape: continue for refl_path in files: stem = refl_path.stem[: -len("_REFL")] for companion in s2_dir.glob(f"{stem}_*.tif"): companion.unlink() refl_path.unlink(missing_ok=True) n_removed += 1 print( f"[repair] {name}: removed {n_removed} minority-shape file-sets (kept {ref_shape[0]}×{ref_shape[1]})" ) # --- 3. Remove stale GCC files from prepared/s2 --------------------------- gcc_removed = sum(1 for f in s2_dir.glob("*_GCC.tif") if f.unlink() or True) if gcc_removed: print( f"[repair] {name}: removed {gcc_removed} stale GCC files from prepared/s2" ) # --- 4. Wipe stale S3 composites ------------------------------------------ for d in (s3_out, gcc_s3_out): if d.exists(): shutil.rmtree(d) print(f"[repair] {name}: removed {d.relative_to(site_dir)}/") # --- 5. Regenerate S3 composites with the correct reference --------------- if not s3_raw.exists() or not any(s3_raw.glob("S3*.tif")): print( f"[repair] {name}: WARNING — no raw S3 data in {s3_raw}; skipping S3 regeneration." ) return s2_refl_path = next(iter(sorted(s2_dir.glob("*_REFL.tif"))), None) if s2_refl_path is None: print( f"[repair] {name}: WARNING — no REFL files left; cannot regenerate S3 composites." ) return print( f"[repair] {name}: regenerating S3 composites (reference: {s2_refl_path.name})..." ) step3 = _load_step3() s3_out.mkdir(parents=True, exist_ok=True) step3._prepare_s3(s3_raw, s2_refl_path, s3_out) n_composites = len(list(s3_out.glob("composite_*.tif"))) print(f"[repair] {name}: wrote {n_composites} composites → ready for 4-fusion.py") # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR) parser.add_argument( "--fix", action="store_true", help="Actually repair detected sites (default: detect only)", ) args = parser.parse_args(argv) bad_sites = detect(args.evaluation_year) if not bad_sites: return 0 if not args.fix: print(f"\nRun with --fix to repair {len(bad_sites)} site(s).") return 0 print() for site_dir in bad_sites: repair(site_dir) return 0 if __name__ == "__main__": sys.exit(main())