Scale it up.

This commit is contained in:
Felix Delattre 2026-06-10 19:37:33 +02:00
parent ba36dfe914
commit c033f5f527

View file

@ -29,6 +29,7 @@ import json
import os import os
import shutil import shutil
import time import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -95,6 +96,17 @@ _BAND_ASSETS: dict[str, str] = {
_SCL_ASSET = "scl" _SCL_ASSET = "scl"
_MIN_BBOX_HALF_DEG = 0.008 _MIN_BBOX_HALF_DEG = 0.008
# GDAL/CPL configuration options used for remote COG range reads. The mapping
# is unpacked into ``rasterio.Env(**_GDAL_COG_ENV)``, so every value is kept as
# a string exactly as GDAL expects it.
# NOTE(review): confirm ``GDAL_MAX_CONNECTIONS`` is an option name that the
# deployed GDAL version recognises; unknown options are presumably ignored.
_GDAL_COG_ENV: dict[str, str] = dict(
    GDAL_HTTP_VERSION="2",
    GDAL_HTTP_MERGE_CONSECUTIVE_RANGES="YES",
    GDAL_HTTP_MULTIPLEX="YES",
    GDAL_HTTP_TCP_KEEPALIVE="YES",
    GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR",
    CPL_VSIL_CURL_CACHE_SIZE="200000000",
    GDAL_MAX_CONNECTIONS="100",
    AWS_NO_SIGN_REQUEST="YES",
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Internal S3 constants # Internal S3 constants
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -314,39 +326,29 @@ def stac_search_s2(
return list({item.id: item for item in search.items()}.values()) return list({item.id: item for item in search.items()}.values())
def download_s2_window( def _process_item(
items: list[Any], item: Any,
bbox: list[float], bbox: list[float],
output_dir: Path,
bands: list[str], bands: list[str],
ratio: int = RESOLUTION_RATIO, output_dir: Path,
) -> None: ratio: int,
"""Range-read S2 L2A COG windows and write masked REFL GeoTIFFs. ) -> str | None:
"""Range-read one S2 item and write a masked REFL GeoTIFF.
Writes ``{item.id}_REFL.tif`` directly no intermediate raw download. Returns a skip-message string when the item cannot be processed, else None.
Cloud/shadow pixels (SCL 0, 3, >7) are zeroed. BOA offset is inferred from
``processing:baseline``. Output is zero-padded to multiples of ``ratio``.
""" """
output_dir.mkdir(parents=True, exist_ok=True)
for item in tqdm(items, unit="granule", desc="S2 COG window read"):
out_path = output_dir / f"{item.id}_REFL.tif" out_path = output_dir / f"{item.id}_REFL.tif"
if out_path.is_file(): if out_path.is_file():
continue return None
bands_result = _read_bands(item, bbox, bands) bands_result = _read_bands(item, bbox, bands)
if bands_result is None: if bands_result is None:
tqdm.write(f"[S2] Skipping {item.id}: missing asset or no bbox overlap") return f"[S2] Skipping {item.id}: missing asset or no bbox overlap"
continue
band_arrays, ref_profile = bands_result band_arrays, ref_profile = bands_result
target_shape = (ref_profile["height"], ref_profile["width"]) mask = _cloud_mask(item, bbox, (ref_profile["height"], ref_profile["width"]))
mask = _cloud_mask(item, bbox, target_shape)
stacked = (np.stack(band_arrays) + _boa_offset(item)) / 10_000.0 stacked = (np.stack(band_arrays) + _boa_offset(item)) / 10_000.0
np.clip(stacked, 0, None, out=stacked) np.clip(stacked, 0, None, out=stacked)
stacked[:, mask] = 0.0 stacked[:, mask] = 0.0
stacked = _pad_to_multiple(stacked, ratio) stacked = _pad_to_multiple(stacked, ratio)
out_profile = { out_profile = {
"driver": "GTiff", "driver": "GTiff",
"count": len(bands), "count": len(bands),
@ -362,6 +364,37 @@ def download_s2_window(
dst.write(stacked) dst.write(stacked)
for i, band_name in enumerate(bands, 1): for i, band_name in enumerate(bands, 1):
dst.set_band_description(i, band_name) dst.set_band_description(i, band_name)
return None
def download_s2_window(
    items: list[Any],
    bbox: list[float],
    output_dir: Path,
    bands: list[str],
    ratio: int = RESOLUTION_RATIO,
    max_workers: int = 32,
) -> None:
    """Range-read S2 L2A COG windows and write masked REFL GeoTIFFs.

    Writes ``{item.id}_REFL.tif`` directly; no intermediate raw download.
    Cloud/shadow pixels (SCL 0, 3, >7) are zeroed. BOA offset is inferred from
    ``processing:baseline``. Output is zero-padded to multiples of ``ratio``.

    Items are processed in parallel by ``_process_item`` on a pool of
    ``max_workers`` threads. Skip messages returned by the workers are printed
    through ``tqdm.write`` so they do not break the progress bar.

    Args:
        items: STAC items to process; each must expose an ``id`` attribute.
        bbox: Bounding box forwarded unchanged to ``_process_item``.
        output_dir: Directory for the output GeoTIFFs; created if missing.
        bands: Band names forwarded unchanged to ``_process_item``.
        ratio: Padding multiple forwarded to ``_process_item``.
        max_workers: Size of the thread pool.

    Raises:
        Exception: Whatever ``_process_item`` raises for an item is re-raised
            here. Work that has not started yet is cancelled first, so the
            error surfaces without processing the rest of the queue.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the Env is entered on the calling thread only; confirm
    # that the worker threads actually observe these GDAL options.
    with rasterio.Env(**_GDAL_COG_ENV):
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [
                pool.submit(_process_item, item, bbox, bands, output_dir, ratio)
                for item in items
            ]
            with tqdm(total=len(futures), unit="granule", desc="S2 COG window read") as pbar:
                try:
                    for fut in as_completed(futures):
                        msg = fut.result()
                        if msg:
                            tqdm.write(msg)
                        pbar.update(1)
                except BaseException:
                    # Fail fast: leaving the executor's ``with`` block waits
                    # for every submitted task, so drop the ones that have not
                    # started. Already-running tasks still finish normally.
                    for pending in futures:
                        pending.cancel()
                    raise
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------