Fix two scene input.

This commit is contained in:
Felix Delattre 2026-06-11 11:35:35 +02:00
parent aa4535a600
commit e33256d43d
2 changed files with 232 additions and 0 deletions

View file

@ -597,6 +597,51 @@ def _import_distance_to_clouds():
) from exc ) from exc
def _normalize_s2_grid(s2_dir: Path) -> None:
"""Remove REFL (and companion DIST_CLOUD/GCC) files that don't match the dominant shape.
Sites at MGRS tile boundaries can have S2 downloads from two tiles with different
spatial extents. For example, a site at the top edge of tile 16SDA will produce
narrow (e.g. 30×180) REFL files from 16SDA and full-extent (180×180) files from
16SDB. EFAST requires all S2 files to share the same grid, so we keep only the
shape with the most pixels (largest tile coverage) and discard the rest together
with any already-computed companions.
"""
refl_files = sorted(s2_dir.glob("*_REFL.tif"))
if not refl_files:
return
shape_to_files: dict[tuple[int, int], list[Path]] = {}
for f in refl_files:
with rasterio.open(f) as src:
shape: tuple[int, int] = src.shape # type: ignore[assignment]
shape_to_files.setdefault(shape, []).append(f)
if len(shape_to_files) <= 1:
return
ref_shape = max(shape_to_files.keys(), key=lambda s: s[0] * s[1])
shapes_summary = ", ".join(
f"{s[0]}×{s[1]}({len(fs)})" for s, fs in shape_to_files.items()
)
n_removed = 0
for shape, files in shape_to_files.items():
if shape == ref_shape:
continue
for refl_path in files:
stem = refl_path.stem[: -len("_REFL")] # e.g. "S2A_16SDA_20250106_0_L2A"
for companion in s2_dir.glob(f"{stem}_*.tif"):
companion.unlink()
refl_path.unlink(missing_ok=True)
n_removed += 1
print(
f"[S2-NORM] Tile boundary — shapes: {shapes_summary}; "
f"removed {n_removed} file-sets, kept {ref_shape[0]}×{ref_shape[1]}"
)
def _rescale_dist_cloud(s2_dir: Path) -> None: def _rescale_dist_cloud(s2_dir: Path) -> None:
"""Ensure DIST_CLOUD values are in pixel units (not normalised to [0,1]).""" """Ensure DIST_CLOUD values are in pixel units (not normalised to [0,1])."""
for dc_path in s2_dir.glob("*DIST_CLOUD.tif"): for dc_path in s2_dir.glob("*DIST_CLOUD.tif"):
@ -796,6 +841,7 @@ def process_site(
items = stac_search_s2(bbox, datetime(year, 1, 1), datetime(year, 12, 31)) items = stac_search_s2(bbox, datetime(year, 1, 1), datetime(year, 12, 31))
print(f"[{sitename}] {len(items)} S2 items found — downloading windows...") print(f"[{sitename}] {len(items)} S2 items found — downloading windows...")
download_s2_window(items, bbox, s2_out, S2_BANDS, RESOLUTION_RATIO) download_s2_window(items, bbox, s2_out, S2_BANDS, RESOLUTION_RATIO)
_normalize_s2_grid(s2_out)
# S2 distance-to-clouds # S2 distance-to-clouds
print(f"[{sitename}] Computing distance-to-clouds...") print(f"[{sitename}] Computing distance-to-clouds...")

186
fix-tile-boundary.py Normal file
View file

@ -0,0 +1,186 @@
"""Detect and repair sites where S2 downloads span multiple MGRS tile extents.
Sites on MGRS tile boundaries produce REFL files from two tiles with different
spatial extents (e.g. 16SDA and 16SDB). This breaks EFAST, which requires all
S2 files to share the same grid. This script:
1. Scans all downloaded sites for the given year.
2. Reports any site where ``prepared/s2`` contains REFL files of mixed shapes.
3. With ``--fix``:
- Removes the minority-shape REFL / DIST_CLOUD / GCC files.
- Deletes the stale ``prepared/s3`` and ``prepared/gcc_s3`` composites.
- Regenerates ``prepared/s3`` composites from the existing raw S3 data using
the largest-extent S2 tile as the reference grid.
``prepared/gcc_s3`` is intentionally left empty step 4 (``4-fusion.py``)
regenerates it on its next run.
Usage::
uv run python fix-tile-boundary.py # detect only
uv run python fix-tile-boundary.py --fix # detect + repair
uv run python fix-tile-boundary.py --fix --evaluation-year 2024
Prior step: :mod:`3-sentinel-data`.
Next step after fixing: :mod:`4-fusion`.
"""
from __future__ import annotations
import argparse
import importlib.util
import shutil
import sys
from collections import Counter
from pathlib import Path
import rasterio
DATA_DIR = Path("data")
DEFAULT_YEAR = 2025
# ---------------------------------------------------------------------------
# Detection
# ---------------------------------------------------------------------------
def _refl_shapes(s2_dir: Path) -> dict[tuple[int, int], list[Path]]:
"""Return a mapping of shape → REFL file paths for a prepared/s2 directory."""
shape_to_files: dict[tuple[int, int], list[Path]] = {}
for f in sorted(s2_dir.glob("*_REFL.tif")):
with rasterio.open(f) as src:
shape: tuple[int, int] = src.shape # type: ignore[assignment]
shape_to_files.setdefault(shape, []).append(f)
return shape_to_files
def detect(year: int) -> list[Path]:
"""Return site directories whose prepared/s2 has mixed REFL shapes."""
sentinel_dir = DATA_DIR / "sentinel_data" / str(year)
if not sentinel_dir.exists():
print(f"[detect] No sentinel data found at {sentinel_dir}")
return []
bad_sites: list[Path] = []
for site_dir in sorted(sentinel_dir.iterdir()):
s2_dir = site_dir / "prepared" / "s2"
if not s2_dir.exists():
continue
shapes = _refl_shapes(s2_dir)
if len(shapes) > 1:
summary = ", ".join(
f"{s[0]}×{s[1]} ({len(fs)} files)" for s, fs in shapes.items()
)
print(f"[detect] {site_dir.name}: mixed shapes — {summary}")
bad_sites.append(site_dir)
if not bad_sites:
print("[detect] All sites OK — no mixed tile shapes found.")
return bad_sites
# ---------------------------------------------------------------------------
# Repair
# ---------------------------------------------------------------------------
def _load_step3():
"""Import helpers from 3-sentinel-data.py without executing its main()."""
spec = importlib.util.spec_from_file_location("step3", "3-sentinel-data.py")
mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
spec.loader.exec_module(mod) # type: ignore[union-attr]
return mod
def repair(site_dir: Path) -> None:
"""Remove minority-shape S2 files and regenerate S3 composites for one site."""
s2_dir = site_dir / "prepared" / "s2"
s3_raw = site_dir / "raw" / "s3"
s3_out = site_dir / "prepared" / "s3"
gcc_s3_out = site_dir / "prepared" / "gcc_s3"
name = site_dir.name
# --- 1. Identify reference shape (largest extent) -------------------------
shapes = _refl_shapes(s2_dir)
if len(shapes) <= 1:
print(f"[repair] {name}: already consistent — nothing to do.")
return
ref_shape = max(shapes.keys(), key=lambda s: s[0] * s[1])
# --- 2. Remove non-reference REFL + companions ----------------------------
n_removed = 0
for shape, files in shapes.items():
if shape == ref_shape:
continue
for refl_path in files:
stem = refl_path.stem[: -len("_REFL")]
for companion in s2_dir.glob(f"{stem}_*.tif"):
companion.unlink()
refl_path.unlink(missing_ok=True)
n_removed += 1
print(f"[repair] {name}: removed {n_removed} minority-shape file-sets (kept {ref_shape[0]}×{ref_shape[1]})")
# --- 3. Remove stale GCC files from prepared/s2 ---------------------------
gcc_removed = sum(1 for f in s2_dir.glob("*_GCC.tif") if f.unlink() or True)
if gcc_removed:
print(f"[repair] {name}: removed {gcc_removed} stale GCC files from prepared/s2")
# --- 4. Wipe stale S3 composites ------------------------------------------
for d in (s3_out, gcc_s3_out):
if d.exists():
shutil.rmtree(d)
print(f"[repair] {name}: removed {d.relative_to(site_dir)}/")
# --- 5. Regenerate S3 composites with the correct reference ---------------
if not s3_raw.exists() or not any(s3_raw.glob("S3*.tif")):
print(f"[repair] {name}: WARNING — no raw S3 data in {s3_raw}; skipping S3 regeneration.")
return
s2_refl_path = next(iter(sorted(s2_dir.glob("*_REFL.tif"))), None)
if s2_refl_path is None:
print(f"[repair] {name}: WARNING — no REFL files left; cannot regenerate S3 composites.")
return
print(f"[repair] {name}: regenerating S3 composites (reference: {s2_refl_path.name})...")
step3 = _load_step3()
s3_out.mkdir(parents=True, exist_ok=True)
step3._prepare_s3(s3_raw, s2_refl_path, s3_out)
n_composites = len(list(s3_out.glob("composite_*.tif")))
print(f"[repair] {name}: wrote {n_composites} composites → ready for 4-fusion.py")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
parser.add_argument(
"--fix",
action="store_true",
help="Actually repair detected sites (default: detect only)",
)
args = parser.parse_args(argv)
bad_sites = detect(args.evaluation_year)
if not bad_sites:
return 0
if not args.fix:
print(f"\nRun with --fix to repair {len(bad_sites)} site(s).")
return 0
print()
for site_dir in bad_sites:
repair(site_dir)
return 0
if __name__ == "__main__":
sys.exit(main())