From 22d493bc2d22204e88f2b963992552f52dbc1eba Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Thu, 18 Dec 2025 10:54:22 +0100 Subject: [PATCH] Allow arguments for download. --- .pre-commit-config.yaml | 8 ++ download.py | 11 +++ download_s2.py | 180 +++++++++++++++++++++++---------------- download_s3.py | 182 +++++++++++++++++++++++++--------------- requirements.txt | 3 +- 5 files changed, 242 insertions(+), 142 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 download.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..6ba9060 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.8.4 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + diff --git a/download.py b/download.py new file mode 100644 index 0000000..6ba1614 --- /dev/null +++ b/download.py @@ -0,0 +1,11 @@ +from download_s2 import download_s2 +from download_s3 import download_s3 + +year = 2024 +site_position = (47.116171, 11.320308) +site_name = "innsbruck" + +print(f"Downloading data for {site_name}, {year}") +download_s2(year, site_position, site_name) +download_s3(year, site_position, site_name) +print("All downloads completed") diff --git a/download_s2.py b/download_s2.py index 1313133..ebea4df 100644 --- a/download_s2.py +++ b/download_s2.py @@ -4,79 +4,115 @@ from rasterio.warp import transform_geom from rasterio.windows import from_bounds, transform as window_transform from pystac_client import Client -datetime_range = "2024-01-01/2024-01-03" -lon, lat = 11.320308, 47.116171 -bbox_size = 0.009 -bbox = [lon - bbox_size/2, lat - bbox_size/2, lon + bbox_size/2, lat + bbox_size/2] -bands = {"B02": "blue", "B03": "green", "B04": "red", "B8A": "nir"} -output_dir = "data/innsbruck/2024/s2/" -os.makedirs(output_dir, exist_ok=True) -client = Client.open("https://earth-search.aws.element84.com/v1") -search = client.search( - collections=["sentinel-2-l2a"], - intersects={"type": "Point", "coordinates": [lon, lat]}, - datetime=datetime_range, - max_items=1000, -) +def download_s2(year, site_position, site_name): + lat, lon = site_position + datetime_range = f"{year}-01-01/{year}-12-31" + output_dir = f"data/{site_name}/{year}/s2/" -items_by_key = {} -for item in search.items(): - date = item.datetime.strftime("%Y%m%d") - parts = item.id.split("_") - increment = parts[3] if len(parts) > 3 else "0" - key = (date, increment) - if key not in items_by_key: - items_by_key[key] = item + print(f"[S2] Starting download: {site_name} ({lat:.6f}, {lon:.6f}), {year}") -for (date, increment), item in items_by_key.items(): - filepath = os.path.join(output_dir, f"{date}_{increment}.geotiff") - if os.path.exists(filepath): - continue - - band_data = {} - profile = None - - for band_name, asset_name in bands.items(): - if asset_name in item.assets: - asset = item.assets[asset_name] - with rasterio.open(asset.href) as src: - bbox_geom = { - "type": "Polygon", - "coordinates": [[ - [bbox[0], bbox[1]], [bbox[2], bbox[1]], - [bbox[2], bbox[3]], [bbox[0], bbox[3]], [bbox[0], bbox[1]] - ]] - } - bbox_transformed = transform_geom("EPSG:4326", src.crs, bbox_geom) - coords = bbox_transformed["coordinates"][0] - x_coords = [c[0] for c in coords[:4]] - y_coords = [c[1] for c in coords[:4]] - bbox_crs = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)] - src_bounds = src.bounds - intersect_bbox = [ - max(bbox_crs[0], src_bounds.left), max(bbox_crs[1], src_bounds.bottom), - min(bbox_crs[2], src_bounds.right), min(bbox_crs[3], src_bounds.top), - ] - window = from_bounds(*intersect_bbox, src.transform) - if window.height > 0 and window.width > 0: - data = src.read(window=window) - new_transform = window_transform(window, src.transform) - if profile is None: - profile = { - "driver": "GTiff", "height": window.height, "width": window.width, - "count": len(bands), "dtype": data.dtype, "crs": src.crs, - "transform": new_transform, "compress": "lzw" - } - band_idx = list(bands.keys()).index(band_name) - band_data[band_idx] = data[0] - - if profile and len(band_data) == len(bands): - stacked = [band_data[i] for i in sorted(band_data.keys())] - band_names = [list(bands.keys())[i] for i in sorted(band_data.keys())] - with rasterio.open(filepath, "w", **profile) as dst: - for i, data in enumerate(stacked, 1): - dst.write(data, i) - dst.set_band_description(i, band_names[i-1]) - print(f"Saved: {filepath}") + bbox_size = 0.011 + bbox = [ + lon - bbox_size / 2, + lat - bbox_size / 2, + lon + bbox_size / 2, + lat + bbox_size / 2, + ] + bands = {"B02": "blue", "B03": "green", "B04": "red", "B8A": "nir"} + os.makedirs(output_dir, exist_ok=True) + print("[S2] Connecting to STAC catalog...") + client = Client.open("https://earth-search.aws.element84.com/v1") + search = client.search( + collections=["sentinel-2-l2a"], + intersects={"type": "Point", "coordinates": [lon, lat]}, + datetime=datetime_range, + max_items=1000, + ) + + print("[S2] Searching items...") + items_by_key = {} + for item in search.items(): + date = item.datetime.strftime("%Y%m%d") + parts = item.id.split("_") + increment = parts[3] if len(parts) > 3 else "0" + key = (date, increment) + if key not in items_by_key: + items_by_key[key] = item + + print(f"[S2] Found {len(items_by_key)} unique items") + + for (date, increment), item in items_by_key.items(): + filepath = os.path.join(output_dir, f"{date}_{increment}.geotiff") + if os.path.exists(filepath): + print(f"[S2] Skipping {date}_{increment}.geotiff (exists)") + continue + + print(f"[S2] Processing {date}_{increment}...") + band_data = {} + profile = None + + for band_name, asset_name in bands.items(): + if asset_name in item.assets: + asset = item.assets[asset_name] + with rasterio.open(asset.href) as src: + bbox_geom = { + "type": "Polygon", + "coordinates": [ + [ + [bbox[0], bbox[1]], + [bbox[2], bbox[1]], + [bbox[2], bbox[3]], + [bbox[0], bbox[3]], + [bbox[0], bbox[1]], + ] + ], + } + bbox_transformed = transform_geom("EPSG:4326", src.crs, bbox_geom) + coords = bbox_transformed["coordinates"][0] + x_coords = [c[0] for c in coords[:4]] + y_coords = [c[1] for c in coords[:4]] + bbox_crs = [ + min(x_coords), + min(y_coords), + max(x_coords), + max(y_coords), + ] + src_bounds = src.bounds + intersect_bbox = [ + max(bbox_crs[0], src_bounds.left), + max(bbox_crs[1], src_bounds.bottom), + min(bbox_crs[2], src_bounds.right), + min(bbox_crs[3], src_bounds.top), + ] + window = from_bounds(*intersect_bbox, src.transform) + if window.height > 0 and window.width > 0: + data = src.read(window=window) + new_transform = window_transform(window, src.transform) + if profile is None: + profile = { + "driver": "GTiff", + "height": window.height, + "width": window.width, + "count": len(bands), + "dtype": data.dtype, + "crs": src.crs, + "transform": new_transform, + "compress": "lzw", + } + band_idx = list(bands.keys()).index(band_name) + band_data[band_idx] = data[0] + + if profile and len(band_data) == len(bands): + stacked = [band_data[i] for i in sorted(band_data.keys())] + band_names = [list(bands.keys())[i] for i in sorted(band_data.keys())] + with rasterio.open(filepath, "w", **profile) as dst: + for i, data in enumerate(stacked, 1): + dst.write(data, i) + dst.set_band_description(i, band_names[i - 1]) + print(f"[S2] Saved: {filepath}") + else: + print(f"[S2] Skipping {date}_{increment} (missing bands)") + + print("[S2] Completed") diff --git a/download_s3.py b/download_s3.py index ea3f1c1..e3619eb 100644 --- a/download_s3.py +++ b/download_s3.py @@ -11,83 +11,127 @@ from rasterio.transform import from_bounds load_dotenv() -datetime_range = "2024-01-01/2024-01-03" -lon, lat = 11.320308, 47.116171 -bbox_size = 0.009 -bbox = [lon - bbox_size/2, lat - bbox_size/2, lon + bbox_size/2, lat + bbox_size/2] -bands = {"SDR_Oa04": "blue", "SDR_Oa06": "green", "SDR_Oa08": "red", "SDR_Oa17": "nir"} -output_dir = Path("data/innsbruck/2024/s3/") -output_dir.mkdir(parents=True, exist_ok=True) -band_map = {"SDR_Oa04": "B04", "SDR_Oa06": "B06", "SDR_Oa08": "B08", "SDR_Oa17": "B17"} -openeo_bands = [band_map.get(b, b) for b in bands.keys()] +def download_s3(year, site_position, site_name): + lat, lon = site_position + datetime_range = f"{year}-01-01/{year}-12-31" + output_dir = Path(f"data/{site_name}/{year}/s3/") -start_date, end_date = datetime_range.split("/") -spatial_extent = { - "west": bbox[0], "east": bbox[2], - "south": bbox[1], "north": bbox[3] -} + print(f"[S3] Starting download: {site_name} ({lat:.6f}, {lon:.6f}), {year}") -token_response = requests.post( - "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token", - data={ - "grant_type": "password", - "username": os.getenv("CDSE_USER"), - "password": os.getenv("CDSE_PASSWORD"), - "client_id": "cdse-public" + bbox_size = 0.011 + bbox = [ + lon - bbox_size / 2, + lat - bbox_size / 2, + lon + bbox_size / 2, + lat + bbox_size / 2, + ] + bands = { + "SDR_Oa04": "blue", + "SDR_Oa06": "green", + "SDR_Oa08": "red", + "SDR_Oa17": "nir", } -) -token_response.raise_for_status() -tokens = token_response.json() -access_token = tokens["access_token"] + output_dir.mkdir(parents=True, exist_ok=True) -conn = openeo.connect("openeo.dataspace.copernicus.eu") -conn.authenticate_oidc_access_token(access_token) + band_map = { + "SDR_Oa04": "B04", + "SDR_Oa06": "B06", + "SDR_Oa08": "B08", + "SDR_Oa17": "B17", + } + openeo_bands = [band_map.get(b, b) for b in bands.keys()] -datacube = conn.load_collection( - "SENTINEL3_OLCI_L1B", - spatial_extent=spatial_extent, - temporal_extent=[start_date, end_date], - bands=openeo_bands, -).resample_spatial(projection=32632) + start_date, end_date = datetime_range.split("/") + spatial_extent = { + "west": bbox[0], + "east": bbox[2], + "south": bbox[1], + "north": bbox[3], + } -output_file = output_dir / "s3_data.nc" -datacube.download(str(output_file), format="NetCDF") + print("[S3] Authenticating...") + token_response = requests.post( + "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token", + data={ + "grant_type": "password", + "username": os.getenv("CDSE_USER"), + "password": os.getenv("CDSE_PASSWORD"), + "client_id": "cdse-public", + }, + ) + token_response.raise_for_status() + tokens = token_response.json() + access_token = tokens["access_token"] -nc = netCDF4.Dataset(str(output_file), 'r') -times = netCDF4.num2date(nc.variables['t'][:], nc.variables['t'].units) -x_coords = nc.variables['x'][:] -y_coords = nc.variables['y'][:] -band_vars = sorted([v for v in nc.variables.keys() if v.startswith('B') and v[1:].isdigit()]) -band_names = [list(bands.keys())[openeo_bands.index(b)] for b in band_vars] + print("[S3] Connecting to OpenEO...") + conn = openeo.connect("openeo.dataspace.copernicus.eu") + conn.authenticate_oidc_access_token(access_token) -transform = from_bounds( - float(x_coords.min()), float(y_coords.min()), - float(x_coords.max()), float(y_coords.max()), - len(x_coords), len(y_coords) -) + print("[S3] Loading collection...") + datacube = conn.load_collection( + "SENTINEL3_OLCI_L1B", + spatial_extent=spatial_extent, + temporal_extent=[start_date, end_date], + bands=openeo_bands, + ).resample_spatial(projection=32632) -date_counts = {} -for t_idx, time_val in enumerate(times): - dt = time_val if isinstance(time_val, datetime) else netCDF4.num2date(nc.variables['t'][t_idx], nc.variables['t'].units) - date_str = dt.strftime("%Y%m%d") - increment = date_counts.get(date_str, 0) - date_counts[date_str] = increment + 1 - - band_data = [nc.variables[b][t_idx, :, :] for b in band_vars] - stacked = np.stack(band_data, axis=0) - - output_path = output_dir / f"{date_str}_{increment}.geotiff" - with rasterio.open( - output_path, 'w', - driver='GTiff', height=len(y_coords), width=len(x_coords), - count=len(band_data), dtype=stacked.dtype, crs='EPSG:32632', - transform=transform, compress='lzw' - ) as dst: - dst.write(stacked) - for i, band_name in enumerate(band_names, 1): - dst.set_band_description(i, band_name) - print(f"Saved: {output_path}") + output_file = output_dir / "s3_data.nc" + print("[S3] Downloading NetCDF...") + datacube.download(str(output_file), format="NetCDF") -nc.close() -os.remove(output_file) + print("[S3] Processing NetCDF...") + nc = netCDF4.Dataset(str(output_file), "r") + times = netCDF4.num2date(nc.variables["t"][:], nc.variables["t"].units) + x_coords = nc.variables["x"][:] + y_coords = nc.variables["y"][:] + band_vars = sorted( + [v for v in nc.variables.keys() if v.startswith("B") and v[1:].isdigit()] + ) + band_names = [list(bands.keys())[openeo_bands.index(b)] for b in band_vars] + + transform = from_bounds( + float(x_coords.min()), + float(y_coords.min()), + float(x_coords.max()), + float(y_coords.max()), + len(x_coords), + len(y_coords), + ) + + print(f"[S3] Found {len(times)} time steps") + date_counts = {} + for t_idx, time_val in enumerate(times): + dt = ( + time_val + if isinstance(time_val, datetime) + else netCDF4.num2date(nc.variables["t"][t_idx], nc.variables["t"].units) + ) + date_str = dt.strftime("%Y%m%d") + increment = date_counts.get(date_str, 0) + date_counts[date_str] = increment + 1 + + band_data = [nc.variables[b][t_idx, :, :] for b in band_vars] + stacked = np.stack(band_data, axis=0) + + output_path = output_dir / f"{date_str}_{increment}.geotiff" + with rasterio.open( + output_path, + "w", + driver="GTiff", + height=len(y_coords), + width=len(x_coords), + count=len(band_data), + dtype=stacked.dtype, + crs="EPSG:32632", + transform=transform, + compress="lzw", + ) as dst: + dst.write(stacked) + for i, band_name in enumerate(band_names, 1): + dst.set_band_description(i, band_name) + print(f"[S3] Saved: {output_path}") + + nc.close() + os.remove(output_file) + print("[S3] Completed") diff --git a/requirements.txt b/requirements.txt index fe8374e..46a3b7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ python-dotenv netCDF4 numpy requests - +ruff +pre-commit