From e33256d43d1e97f46f113d5eb9565d94fce9f7eb Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Thu, 11 Jun 2026 11:35:35 +0200 Subject: [PATCH] Fix two scene input. --- 3-sentinel-data.py | 46 +++++++++++ fix-tile-boundary.py | 186 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 fix-tile-boundary.py diff --git a/3-sentinel-data.py b/3-sentinel-data.py index f310fe1..a5571da 100644 --- a/3-sentinel-data.py +++ b/3-sentinel-data.py @@ -597,6 +597,51 @@ def _import_distance_to_clouds(): ) from exc +def _normalize_s2_grid(s2_dir: Path) -> None: + """Remove REFL (and companion DIST_CLOUD/GCC) files that don't match the dominant shape. + + Sites at MGRS tile boundaries can have S2 downloads from two tiles with different + spatial extents. For example, a site at the top edge of tile 16SDA will produce + narrow (e.g. 30×180) REFL files from 16SDA and full-extent (180×180) files from + 16SDB. EFAST requires all S2 files to share the same grid, so we keep only the + shape with the most pixels (largest tile coverage) and discard the rest together + with any already-computed companions. + """ + refl_files = sorted(s2_dir.glob("*_REFL.tif")) + if not refl_files: + return + + shape_to_files: dict[tuple[int, int], list[Path]] = {} + for f in refl_files: + with rasterio.open(f) as src: + shape: tuple[int, int] = src.shape # type: ignore[assignment] + shape_to_files.setdefault(shape, []).append(f) + + if len(shape_to_files) <= 1: + return + + ref_shape = max(shape_to_files.keys(), key=lambda s: s[0] * s[1]) + shapes_summary = ", ".join( + f"{s[0]}×{s[1]}({len(fs)})" for s, fs in shape_to_files.items() + ) + + n_removed = 0 + for shape, files in shape_to_files.items(): + if shape == ref_shape: + continue + for refl_path in files: + stem = refl_path.stem[: -len("_REFL")] # e.g. "S2A_16SDA_20250106_0_L2A" + for companion in s2_dir.glob(f"{stem}_*.tif"): + companion.unlink() + refl_path.unlink(missing_ok=True) + n_removed += 1 + + print( + f"[S2-NORM] Tile boundary — shapes: {shapes_summary}; " + f"removed {n_removed} file-sets, kept {ref_shape[0]}×{ref_shape[1]}" + ) + + def _rescale_dist_cloud(s2_dir: Path) -> None: """Ensure DIST_CLOUD values are in pixel units (not normalised to [0,1]).""" for dc_path in s2_dir.glob("*DIST_CLOUD.tif"): @@ -796,6 +841,7 @@ def process_site( items = stac_search_s2(bbox, datetime(year, 1, 1), datetime(year, 12, 31)) print(f"[{sitename}] {len(items)} S2 items found — downloading windows...") download_s2_window(items, bbox, s2_out, S2_BANDS, RESOLUTION_RATIO) + _normalize_s2_grid(s2_out) # S2 distance-to-clouds print(f"[{sitename}] Computing distance-to-clouds...") diff --git a/fix-tile-boundary.py b/fix-tile-boundary.py new file mode 100644 index 0000000..dff958e --- /dev/null +++ b/fix-tile-boundary.py @@ -0,0 +1,186 @@ +"""Detect and repair sites where S2 downloads span multiple MGRS tile extents. + +Sites on MGRS tile boundaries produce REFL files from two tiles with different +spatial extents (e.g. 16SDA and 16SDB). This breaks EFAST, which requires all +S2 files to share the same grid. This script: + + 1. Scans all downloaded sites for the given year. + 2. Reports any site where ``prepared/s2`` contains REFL files of mixed shapes. + 3. With ``--fix``: + - Removes the minority-shape REFL / DIST_CLOUD / GCC files. + - Deletes the stale ``prepared/s3`` and ``prepared/gcc_s3`` composites. + - Regenerates ``prepared/s3`` composites from the existing raw S3 data using + the largest-extent S2 tile as the reference grid. + +``prepared/gcc_s3`` is intentionally left empty — step 4 (``4-fusion.py``) +regenerates it on its next run. + +Usage:: + + uv run python fix-tile-boundary.py # detect only + uv run python fix-tile-boundary.py --fix # detect + repair + uv run python fix-tile-boundary.py --fix --evaluation-year 2024 + +Prior step: :mod:`3-sentinel-data`. +Next step after fixing: :mod:`4-fusion`. +""" + +from __future__ import annotations + +import argparse +import importlib.util +import shutil +import sys +from collections import Counter +from pathlib import Path + +import rasterio + +DATA_DIR = Path("data") +DEFAULT_YEAR = 2025 + + +# --------------------------------------------------------------------------- +# Detection +# --------------------------------------------------------------------------- + + +def _refl_shapes(s2_dir: Path) -> dict[tuple[int, int], list[Path]]: + """Return a mapping of shape → REFL file paths for a prepared/s2 directory.""" + shape_to_files: dict[tuple[int, int], list[Path]] = {} + for f in sorted(s2_dir.glob("*_REFL.tif")): + with rasterio.open(f) as src: + shape: tuple[int, int] = src.shape # type: ignore[assignment] + shape_to_files.setdefault(shape, []).append(f) + return shape_to_files + + +def detect(year: int) -> list[Path]: + """Return site directories whose prepared/s2 has mixed REFL shapes.""" + sentinel_dir = DATA_DIR / "sentinel_data" / str(year) + if not sentinel_dir.exists(): + print(f"[detect] No sentinel data found at {sentinel_dir}") + return [] + + bad_sites: list[Path] = [] + for site_dir in sorted(sentinel_dir.iterdir()): + s2_dir = site_dir / "prepared" / "s2" + if not s2_dir.exists(): + continue + shapes = _refl_shapes(s2_dir) + if len(shapes) > 1: + summary = ", ".join( + f"{s[0]}×{s[1]} ({len(fs)} files)" for s, fs in shapes.items() + ) + print(f"[detect] {site_dir.name}: mixed shapes — {summary}") + bad_sites.append(site_dir) + + if not bad_sites: + print("[detect] All sites OK — no mixed tile shapes found.") + return bad_sites + + +# --------------------------------------------------------------------------- +# Repair +# --------------------------------------------------------------------------- + + +def _load_step3(): + """Import helpers from 3-sentinel-data.py without executing its main().""" + spec = importlib.util.spec_from_file_location("step3", "3-sentinel-data.py") + mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type] + spec.loader.exec_module(mod) # type: ignore[union-attr] + return mod + + +def repair(site_dir: Path) -> None: + """Remove minority-shape S2 files and regenerate S3 composites for one site.""" + s2_dir = site_dir / "prepared" / "s2" + s3_raw = site_dir / "raw" / "s3" + s3_out = site_dir / "prepared" / "s3" + gcc_s3_out = site_dir / "prepared" / "gcc_s3" + name = site_dir.name + + # --- 1. Identify reference shape (largest extent) ------------------------- + shapes = _refl_shapes(s2_dir) + if len(shapes) <= 1: + print(f"[repair] {name}: already consistent — nothing to do.") + return + + ref_shape = max(shapes.keys(), key=lambda s: s[0] * s[1]) + + # --- 2. Remove non-reference REFL + companions ---------------------------- + n_removed = 0 + for shape, files in shapes.items(): + if shape == ref_shape: + continue + for refl_path in files: + stem = refl_path.stem[: -len("_REFL")] + for companion in s2_dir.glob(f"{stem}_*.tif"): + companion.unlink() + refl_path.unlink(missing_ok=True) + n_removed += 1 + + print(f"[repair] {name}: removed {n_removed} minority-shape file-sets (kept {ref_shape[0]}×{ref_shape[1]})") + + # --- 3. Remove stale GCC files from prepared/s2 --------------------------- + gcc_removed = sum(1 for f in s2_dir.glob("*_GCC.tif") if f.unlink() or True) + if gcc_removed: + print(f"[repair] {name}: removed {gcc_removed} stale GCC files from prepared/s2") + + # --- 4. Wipe stale S3 composites ------------------------------------------ + for d in (s3_out, gcc_s3_out): + if d.exists(): + shutil.rmtree(d) + print(f"[repair] {name}: removed {d.relative_to(site_dir)}/") + + # --- 5. Regenerate S3 composites with the correct reference --------------- + if not s3_raw.exists() or not any(s3_raw.glob("S3*.tif")): + print(f"[repair] {name}: WARNING — no raw S3 data in {s3_raw}; skipping S3 regeneration.") + return + + s2_refl_path = next(iter(sorted(s2_dir.glob("*_REFL.tif"))), None) + if s2_refl_path is None: + print(f"[repair] {name}: WARNING — no REFL files left; cannot regenerate S3 composites.") + return + + print(f"[repair] {name}: regenerating S3 composites (reference: {s2_refl_path.name})...") + step3 = _load_step3() + s3_out.mkdir(parents=True, exist_ok=True) + step3._prepare_s3(s3_raw, s2_refl_path, s3_out) + n_composites = len(list(s3_out.glob("composite_*.tif"))) + print(f"[repair] {name}: wrote {n_composites} composites → ready for 4-fusion.py") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR) + parser.add_argument( + "--fix", + action="store_true", + help="Actually repair detected sites (default: detect only)", + ) + args = parser.parse_args(argv) + + bad_sites = detect(args.evaluation_year) + if not bad_sites: + return 0 + + if not args.fix: + print(f"\nRun with --fix to repair {len(bad_sites)} site(s).") + return 0 + + print() + for site_dir in bad_sites: + repair(site_dir) + + return 0 + + +if __name__ == "__main__": + sys.exit(main())