Added --skip-downloaded option

This commit is contained in:
Felix Delattre 2026-06-11 00:22:47 +02:00
parent c033f5f527
commit 8683624557

View file

@ -97,13 +97,15 @@ _SCL_ASSET = "scl"
_MIN_BBOX_HALF_DEG = 0.008
# GDAL/CPL configuration options, keyed by option name. All values are
# strings. Each key must appear only once: in a dict literal a repeated key
# silently keeps the last value, which hides the earlier entry.
_GDAL_COG_ENV = {
    # HTTP/1.1 avoids HTTP/2 multiplexing connection-reset cascades on S3.
    "GDAL_HTTP_VERSION": "1.1",
    "GDAL_HTTP_MERGE_CONSECUTIVE_RANGES": "YES",
    # NOTE(review): GDAL documents multiplexing as applying to HTTP/2
    # connections only, so this is presumably a no-op with version 1.1 above.
    # Kept unchanged; confirm and drop if it is indeed unused.
    "GDAL_HTTP_MULTIPLEX": "YES",
    "GDAL_HTTP_TCP_KEEPALIVE": "YES",
    "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR",
    "CPL_VSIL_CURL_CACHE_SIZE": "200000000",
    "GDAL_MAX_CONNECTIONS": "100",
    # Built-in GDAL retries for 429/502/503/504 and transient resets.
    "GDAL_HTTP_MAX_RETRY": "3",
    "GDAL_HTTP_RETRY_DELAY": "0.5",
    "AWS_NO_SIGN_REQUEST": "YES",
}
@ -373,7 +375,7 @@ def download_s2_window(
output_dir: Path,
bands: list[str],
ratio: int = RESOLUTION_RATIO,
max_workers: int = 32,
max_workers: int = 12,
) -> None:
"""Range-read S2 L2A COG windows and write masked REFL GeoTIFFs.
@ -491,6 +493,39 @@ def _netcdf_to_geotiffs(nc_path: Path, output_dir: Path, epsg: int) -> int:
return written
_S3_DOWNLOAD_RETRIES = 4  # total attempts, including the first one
_S3_DOWNLOAD_BACKOFF = 30  # seconds; doubled on each retry


def _download_with_retry(datacube: Any, nc_path: Path) -> None:
    """Download an OpenEO datacube to *nc_path*, retrying on failure.

    Makes up to ``_S3_DOWNLOAD_RETRIES`` attempts. The wait between attempts
    starts at ``_S3_DOWNLOAD_BACKOFF`` seconds and doubles after each failed
    attempt. Any existing file at *nc_path* is removed before each attempt,
    and again after a failed one, so a partial download is never left behind.

    This function does not authenticate or refresh credentials; it only
    calls ``datacube.download`` again on the same object.

    Args:
        datacube: Object exposing ``download(path, format=...)``.
        nc_path: Destination path of the NetCDF file.

    Raises:
        RuntimeError: If every attempt failed. The last underlying exception
            is chained as ``__cause__``.
    """
    delay = _S3_DOWNLOAD_BACKOFF
    last_exc: Exception | None = None
    for attempt in range(1, _S3_DOWNLOAD_RETRIES + 1):
        try:
            # Start from a clean slate so a stale or truncated file from an
            # earlier attempt (or an earlier run) cannot be mistaken for data.
            nc_path.unlink(missing_ok=True)
            datacube.download(str(nc_path), format="NetCDF")
            return
        except Exception as exc:
            # NOTE(review): every Exception is retried, including
            # non-transient ones; narrow this once the client's transient
            # error types are known.
            last_exc = exc
            try:
                nc_path.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the download error matters more
            if attempt < _S3_DOWNLOAD_RETRIES:
                print(
                    f"[S3-OEO] Download attempt {attempt} failed ({exc}); "
                    f"retrying in {delay}s..."
                )
                time.sleep(delay)
                delay *= 2
            else:
                print(f"[S3-OEO] All {_S3_DOWNLOAD_RETRIES} download attempts failed")
    raise RuntimeError(
        f"S3 download failed after {_S3_DOWNLOAD_RETRIES} attempts"
    ) from last_exc
def download_s3_openeo(
start_date: datetime,
end_date: datetime,
@ -537,7 +572,7 @@ def download_s3_openeo(
nc_path = output_dir / "_s3_syn_l2.nc"
print(f"[S3-OEO] Downloading NetCDF to {nc_path}...")
t0 = time.time()
datacube.download(str(nc_path), format="NetCDF")
_download_with_retry(datacube, nc_path)
print(f"[S3-OEO] Download completed in {time.time() - t0:.1f}s")
print("[S3-OEO] Splitting into per-date GeoTIFFs...")
@ -804,6 +839,11 @@ def main(argv: list[str] | None = None) -> int:
default=None,
help="Single sitename to process (default: all step-2 PASS sites)",
)
parser.add_argument(
"--skip-downloaded",
action="store_true",
help="Skip sites whose directory already exists under data/sentinel_data/{year}/",
)
args = parser.parse_args(argv)
year = args.evaluation_year
@ -821,16 +861,17 @@ def main(argv: list[str] | None = None) -> int:
print(f"[Sentinel-3] Processing {len(pass_sites)} site(s)")
for i, site in enumerate(pass_sites, 1):
sitename = site["sitename"]
site_dir = DATA_DIR / "sentinel_data" / str(year) / sitename
if args.skip_downloaded and site_dir.exists():
print(f"[Sentinel-3] ({i}/{len(pass_sites)}) {sitename} — skipping (directory exists)")
continue
print(f"[Sentinel-3] ({i}/{len(pass_sites)}) {sitename}")
try:
summary = process_site(sitename, site["lat"], site["lon"], year)
print(
f"[Sentinel-3] {sitename} done — "
f"{summary['s2_refl_count']} REFL, "
f"{summary['s3_composite_count']} composites"
)
except Exception as exc:
print(f"[Sentinel-3] {sitename} FAILED: {exc}")
return 0