186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
"""Detect and repair sites where S2 downloads span multiple MGRS tile extents.
|
||
|
||
Sites on MGRS tile boundaries produce REFL files from two tiles with different
|
||
spatial extents (e.g. 16SDA and 16SDB). This breaks EFAST, which requires all
|
||
S2 files to share the same grid. This script:
|
||
|
||
1. Scans all downloaded sites for the given year.
|
||
2. Reports any site where ``prepared/s2`` contains REFL files of mixed shapes.
|
||
3. With ``--fix``:
|
||
- Removes the minority-shape REFL / DIST_CLOUD / GCC files.
|
||
- Deletes the stale ``prepared/s3`` and ``prepared/gcc_s3`` composites.
|
||
- Regenerates ``prepared/s3`` composites from the existing raw S3 data using
|
||
the largest-extent S2 tile as the reference grid.
|
||
|
||
``prepared/gcc_s3`` is intentionally left empty — step 4 (``4-fusion.py``)
|
||
regenerates it on its next run.
|
||
|
||
Usage::
|
||
|
||
uv run python fix-tile-boundary.py # detect only
|
||
uv run python fix-tile-boundary.py --fix # detect + repair
|
||
uv run python fix-tile-boundary.py --fix --evaluation-year 2024
|
||
|
||
Prior step: :mod:`3-sentinel-data`.
|
||
Next step after fixing: :mod:`4-fusion`.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import importlib.util
|
||
import shutil
|
||
import sys
|
||
from collections import Counter
|
||
from pathlib import Path
|
||
|
||
import rasterio
|
||
|
||
DATA_DIR = Path("data")
|
||
DEFAULT_YEAR = 2025
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Detection
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _refl_shapes(s2_dir: Path) -> dict[tuple[int, int], list[Path]]:
|
||
"""Return a mapping of shape → REFL file paths for a prepared/s2 directory."""
|
||
shape_to_files: dict[tuple[int, int], list[Path]] = {}
|
||
for f in sorted(s2_dir.glob("*_REFL.tif")):
|
||
with rasterio.open(f) as src:
|
||
shape: tuple[int, int] = src.shape # type: ignore[assignment]
|
||
shape_to_files.setdefault(shape, []).append(f)
|
||
return shape_to_files
|
||
|
||
|
||
def detect(year: int) -> list[Path]:
|
||
"""Return site directories whose prepared/s2 has mixed REFL shapes."""
|
||
sentinel_dir = DATA_DIR / "sentinel_data" / str(year)
|
||
if not sentinel_dir.exists():
|
||
print(f"[detect] No sentinel data found at {sentinel_dir}")
|
||
return []
|
||
|
||
bad_sites: list[Path] = []
|
||
for site_dir in sorted(sentinel_dir.iterdir()):
|
||
s2_dir = site_dir / "prepared" / "s2"
|
||
if not s2_dir.exists():
|
||
continue
|
||
shapes = _refl_shapes(s2_dir)
|
||
if len(shapes) > 1:
|
||
summary = ", ".join(
|
||
f"{s[0]}×{s[1]} ({len(fs)} files)" for s, fs in shapes.items()
|
||
)
|
||
print(f"[detect] {site_dir.name}: mixed shapes — {summary}")
|
||
bad_sites.append(site_dir)
|
||
|
||
if not bad_sites:
|
||
print("[detect] All sites OK — no mixed tile shapes found.")
|
||
return bad_sites
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Repair
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _load_step3():
|
||
"""Import helpers from 3-sentinel-data.py without executing its main()."""
|
||
spec = importlib.util.spec_from_file_location("step3", "3-sentinel-data.py")
|
||
mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
|
||
spec.loader.exec_module(mod) # type: ignore[union-attr]
|
||
return mod
|
||
|
||
|
||
def repair(site_dir: Path) -> None:
|
||
"""Remove minority-shape S2 files and regenerate S3 composites for one site."""
|
||
s2_dir = site_dir / "prepared" / "s2"
|
||
s3_raw = site_dir / "raw" / "s3"
|
||
s3_out = site_dir / "prepared" / "s3"
|
||
gcc_s3_out = site_dir / "prepared" / "gcc_s3"
|
||
name = site_dir.name
|
||
|
||
# --- 1. Identify reference shape (largest extent) -------------------------
|
||
shapes = _refl_shapes(s2_dir)
|
||
if len(shapes) <= 1:
|
||
print(f"[repair] {name}: already consistent — nothing to do.")
|
||
return
|
||
|
||
ref_shape = max(shapes.keys(), key=lambda s: s[0] * s[1])
|
||
|
||
# --- 2. Remove non-reference REFL + companions ----------------------------
|
||
n_removed = 0
|
||
for shape, files in shapes.items():
|
||
if shape == ref_shape:
|
||
continue
|
||
for refl_path in files:
|
||
stem = refl_path.stem[: -len("_REFL")]
|
||
for companion in s2_dir.glob(f"{stem}_*.tif"):
|
||
companion.unlink()
|
||
refl_path.unlink(missing_ok=True)
|
||
n_removed += 1
|
||
|
||
print(f"[repair] {name}: removed {n_removed} minority-shape file-sets (kept {ref_shape[0]}×{ref_shape[1]})")
|
||
|
||
# --- 3. Remove stale GCC files from prepared/s2 ---------------------------
|
||
gcc_removed = sum(1 for f in s2_dir.glob("*_GCC.tif") if f.unlink() or True)
|
||
if gcc_removed:
|
||
print(f"[repair] {name}: removed {gcc_removed} stale GCC files from prepared/s2")
|
||
|
||
# --- 4. Wipe stale S3 composites ------------------------------------------
|
||
for d in (s3_out, gcc_s3_out):
|
||
if d.exists():
|
||
shutil.rmtree(d)
|
||
print(f"[repair] {name}: removed {d.relative_to(site_dir)}/")
|
||
|
||
# --- 5. Regenerate S3 composites with the correct reference ---------------
|
||
if not s3_raw.exists() or not any(s3_raw.glob("S3*.tif")):
|
||
print(f"[repair] {name}: WARNING — no raw S3 data in {s3_raw}; skipping S3 regeneration.")
|
||
return
|
||
|
||
s2_refl_path = next(iter(sorted(s2_dir.glob("*_REFL.tif"))), None)
|
||
if s2_refl_path is None:
|
||
print(f"[repair] {name}: WARNING — no REFL files left; cannot regenerate S3 composites.")
|
||
return
|
||
|
||
print(f"[repair] {name}: regenerating S3 composites (reference: {s2_refl_path.name})...")
|
||
step3 = _load_step3()
|
||
s3_out.mkdir(parents=True, exist_ok=True)
|
||
step3._prepare_s3(s3_raw, s2_refl_path, s3_out)
|
||
n_composites = len(list(s3_out.glob("composite_*.tif")))
|
||
print(f"[repair] {name}: wrote {n_composites} composites → ready for 4-fusion.py")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
|
||
parser = argparse.ArgumentParser(description=__doc__)
|
||
parser.add_argument("--evaluation-year", type=int, default=DEFAULT_YEAR)
|
||
parser.add_argument(
|
||
"--fix",
|
||
action="store_true",
|
||
help="Actually repair detected sites (default: detect only)",
|
||
)
|
||
args = parser.parse_args(argv)
|
||
|
||
bad_sites = detect(args.evaluation_year)
|
||
if not bad_sites:
|
||
return 0
|
||
|
||
if not args.fix:
|
||
print(f"\nRun with --fix to repair {len(bad_sites)} site(s).")
|
||
return 0
|
||
|
||
print()
|
||
for site_dir in bad_sites:
|
||
repair(site_dir)
|
||
|
||
return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|