added gap validation.

This commit is contained in:
Felix Delattre 2026-05-17 15:55:15 +02:00
parent 374be6865d
commit 740249115b
12 changed files with 997 additions and 116 deletions

View file

@ -1,8 +1,9 @@
"""Symlink prepared S2 into a temp dir, omitting one acquisition (REFL + DIST_CLOUD)."""
"""Symlink prepared S2 into a temp dir, omitting gap-window acquisitions (REFL/GCC + DIST)."""
from __future__ import annotations
import re
from datetime import date, datetime
from pathlib import Path
# Acquisition calendar day in prepared S2 names (BtI REFL/DIST; ItB GCC/DIST).
@ -14,10 +15,34 @@ def yyyymmdd_in_name(name: str) -> str | None:
return m.group(1) if m else None
def yyyymmdd_from_iso(iso_d: str) -> str:
    """Convert an ISO date or timestamp string to a compact ``YYYYMMDD`` day.

    Only the first ten characters (``YYYY-MM-DD``) are parsed, so any
    trailing time-of-day or timezone suffix is ignored.

    Args:
        iso_d: String starting with a ``YYYY-MM-DD`` calendar date.

    Returns:
        The same calendar day formatted as ``YYYYMMDD``.

    Raises:
        ValueError: If the leading ten characters are not a valid
            ``YYYY-MM-DD`` date.
    """
    day = datetime.strptime(iso_d[:10], "%Y-%m-%d")
    return day.strftime("%Y%m%d")
def acquisition_yyyymmdd_in_window(
    prepared_s2: Path, window_start: date, window_end: date
) -> set[str]:
    """Return the S2 acquisition days inside ``[window_start, window_end]``.

    Days are read from REFL filenames directly under ``prepared_s2``. Only
    names ending in ``S2A_MSIL2A_<YYYYMMDD>_REFL.tif`` are considered; any
    other ``*REFL.tif`` file is ignored.

    Args:
        prepared_s2: Directory holding the prepared S2 rasters.
        window_start: First day of the window (inclusive).
        window_end: Last day of the window (inclusive).

    Returns:
        The matching days as ``YYYYMMDD`` strings. The set is empty when
        ``prepared_s2`` is not a directory or no acquisition falls in the
        window.

    Raises:
        ValueError: If a matching filename carries eight digits that do not
            form a valid calendar date.
    """
    out: set[str] = set()
    if not prepared_s2.is_dir():
        return out
    # Compile once; the pattern does not change between files.
    refl_name = re.compile(r"S2A_MSIL2A_(\d{8})_REFL\.tif$")
    for p in prepared_s2.glob("*REFL.tif"):
        m = refl_name.search(p.name)
        if not m:
            continue
        d = datetime.strptime(m.group(1), "%Y%m%d").date()
        # Both window bounds are inclusive.
        if window_start <= d <= window_end:
            out.add(m.group(1))
    return out
def build_masked_s2_dir(
prepared_s2: Path, withheld_yyyymmdd: str, dest: Path, patterns: tuple[str, ...]
prepared_s2: Path,
excluded_yyyymmdd: set[str],
dest: Path,
patterns: tuple[str, ...],
) -> int:
"""Symlink all files matching ``patterns`` except the withheld acquisition day."""
"""Symlink all files matching ``patterns`` except excluded acquisition days."""
dest.mkdir(parents=True, exist_ok=True)
n = 0
for pattern in patterns:
@ -25,7 +50,7 @@ def build_masked_s2_dir(
if not src.is_file() and not src.is_symlink():
continue
y = yyyymmdd_in_name(src.name)
if y == withheld_yyyymmdd:
if y and y in excluded_yyyymmdd:
continue
link = dest / src.name
if link.exists() or link.is_symlink():
@ -35,17 +60,32 @@ def build_masked_s2_dir(
return n
def assert_no_leakage(withheld_yyyymmdd: str, masked_s2_dir: Path) -> None:
    """Fail if the withheld validation acquisition is present in the fusion input dir.

    Every entry directly under ``masked_s2_dir`` is checked: the day that
    ``yyyymmdd_in_name`` extracts from its name is compared with
    ``withheld_yyyymmdd``.

    Args:
        withheld_yyyymmdd: Withheld acquisition day as ``YYYYMMDD``.
        masked_s2_dir: Directory to scan (non-recursive).

    Raises:
        RuntimeError: If any entry's name carries the withheld day.
    """
    for p in masked_s2_dir.iterdir():
        y = yyyymmdd_in_name(p.name)
        if y == withheld_yyyymmdd:
            raise RuntimeError(
                f"Data leakage: withheld acquisition {withheld_yyyymmdd} "
                f"found in masked S2 dir {masked_s2_dir}"
            )
def build_masked_s2_dir_bti(
    prepared_s2: Path,
    excluded_yyyymmdd: set[str],
    dest: Path,
) -> int:
    """Build the masked S2 directory for the BtI variant.

    Delegates to ``build_masked_s2_dir`` with the ``*REFL.tif`` and
    ``*DIST_CLOUD.tif`` filename patterns.

    Args:
        prepared_s2: Directory holding the prepared S2 rasters.
        excluded_yyyymmdd: Acquisition days (``YYYYMMDD``) to leave out.
        dest: Directory that receives the symlinks.

    Returns:
        The integer returned by ``build_masked_s2_dir``.
    """
    return build_masked_s2_dir(
        prepared_s2, excluded_yyyymmdd, dest, ("*REFL.tif", "*DIST_CLOUD.tif")
    )
def build_masked_s2_dir_itb(
    prepared_s2: Path,
    excluded_yyyymmdd: set[str],
    dest: Path,
) -> int:
    """Build the masked S2 directory for the ItB variant.

    Delegates to ``build_masked_s2_dir`` with the ``*GCC.tif`` and
    ``*DIST_CLOUD.tif`` filename patterns.

    Args:
        prepared_s2: Directory holding the prepared S2 rasters.
        excluded_yyyymmdd: Acquisition days (``YYYYMMDD``) to leave out.
        dest: Directory that receives the symlinks.

    Returns:
        The integer returned by ``build_masked_s2_dir``.
    """
    return build_masked_s2_dir(
        prepared_s2, excluded_yyyymmdd, dest, ("*GCC.tif", "*DIST_CLOUD.tif")
    )