From ac0e6879560ae463b9c9e470855d10b9688790aa Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Sat, 21 Feb 2026 00:09:34 +0100 Subject: [PATCH] refactored download and preselection. --- acquisition_phenocam.py | 26 +++++-- acquisition_s2.py | 2 +- fusion.py | 4 +- metrics_indices.py | 61 ++------------- preparation.py | 20 ++--- preselection.py | 167 ++++++++++++++++++++++++++++------------ run.py | 74 +++++++++--------- webapp/cloudy.html | 16 ++-- 8 files changed, 206 insertions(+), 164 deletions(-) diff --git a/acquisition_phenocam.py b/acquisition_phenocam.py index 7fed9d7..200f944 100644 --- a/acquisition_phenocam.py +++ b/acquisition_phenocam.py @@ -44,6 +44,12 @@ def _find_start_offset(site_name, start_dt, total_count): def download_phenocam(season, site_position, site_name, date_range=None): + """Wrapper that downloads both phenocam images and GCC time series.""" + _download_phenocam_images(season, site_position, site_name, date_range) + _download_phenocam_gcc(season, site_position, site_name, date_range) + + +def _download_phenocam_images(season, site_position, site_name, date_range=None): lat, lon = site_position datetime_range = date_range or f"{season}-01-01/{season}-12-31" output_dir = Path(f"data/{site_name}/{season}/raw/phenocam/") @@ -149,10 +155,10 @@ def download_phenocam(season, site_position, site_name, date_range=None): print("[PhenoCam] Completed") -def download_phenocam_greenness(season, site_position, site_name, date_range=None): - """Fetch greenness-index time series from PhenoCam API.""" +def _download_phenocam_gcc(season, site_position, site_name, date_range=None): + """Fetch greenness-index time series from PhenoCam API. Saves JSON and CSV.""" datetime_range = date_range or f"{season}-01-01/{season}-12-31" - output_file = Path(f"data/{site_name}/{season}/raw/phenocam/timeseries.json") + output_file = Path(f"data/{site_name}/{season}/raw/phenocam/phenocam_gcc.json") output_file.parent.mkdir(parents=True, exist_ok=True) start_date, end_date = datetime_range.split("/") @@ -210,7 +216,17 @@ def download_phenocam_greenness(season, site_position, site_name, date_range=Non return timeseries.sort(key=lambda x: x["date"]) - with open(output_file, "w") as f: + + output_dir = output_file.parent + json_path = output_dir / "phenocam_gcc.json" + csv_path = output_dir / "phenocam_gcc.csv" + + with open(json_path, "w") as f: json.dump(timeseries, f, indent=2) - print(f"[PhenoCam-GI] Saved: {output_file} ({len(timeseries)} entries)") + with open(csv_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=["date", "greenness_index"]) + writer.writeheader() + writer.writerows(timeseries) + + print(f"[PhenoCam-GI] Saved: {json_path} and {csv_path} ({len(timeseries)} entries)") diff --git a/acquisition_s2.py b/acquisition_s2.py index 59aae0a..9aae613 100644 --- a/acquisition_s2.py +++ b/acquisition_s2.py @@ -9,7 +9,7 @@ from rasterio.warp import Resampling, calculate_default_transform, reproject, tr from rasterio.windows import from_bounds, transform as window_transform from pystac_client import Client -BBOX_SIZE = 0.016 +BBOX_SIZE = 0.011 TARGET_CRS = CRS.from_epsg(32632) diff --git a/fusion.py b/fusion.py index ce69513..0dd1b40 100644 --- a/fusion.py +++ b/fusion.py @@ -2,7 +2,7 @@ from pathlib import Path from datetime import datetime, timedelta -from preselection import detect_clouds +from preselection import create_timeseries from preparation import ( prepare_s2, prepare_s3, @@ -68,8 +68,8 @@ def run_efast(season, site_position, site_name, cleaning_strategy="aggressive", def run_all_efast_scenarios(season, site_position, site_name, sigma_value=30, date_range=None): + create_timeseries(season, site_position, site_name) for strategy in ["aggressive", "nonaggressive"]: - detect_clouds(season, site_name, cleaning_strategy=strategy) prepare_s2(season, site_position, site_name, cleaning_strategy=strategy, date_range=date_range) prepare_s3(season, site_position, site_name, cleaning_strategy=strategy, date_range=date_range) run_efast(season, site_position, site_name, cleaning_strategy=strategy, sigma=None, date_range=date_range) diff --git a/metrics_indices.py b/metrics_indices.py index d9c4d68..d0c00b3 100644 --- a/metrics_indices.py +++ b/metrics_indices.py @@ -6,6 +6,8 @@ from rasterio.warp import transform as transform_coords from pathlib import Path from datetime import datetime +from preselection import _sample_3x3 + RED_BAND = 3 NIR_BAND = 4 BLUE_BAND = 1 @@ -65,50 +67,6 @@ def _get_ndvi_value(ndvi_file, site_position): return None -def _get_ndvi_from_original(input_file, site_position): - """Calculate NDVI directly from original file without creating GeoTIFF.""" - try: - with rasterio.open(input_file) as src: - if src.count < 4: - return None - - red = src.read(RED_BAND).astype(np.float32) - nir = src.read(NIR_BAND).astype(np.float32) - - lon, lat = site_position[1], site_position[0] - x, y = transform_coords("EPSG:4326", src.crs, [lon], [lat]) - - if not ( - src.bounds.left <= x[0] <= src.bounds.right - and src.bounds.bottom <= y[0] <= src.bounds.top - ): - return None - - row, col = src.index(x[0], y[0]) - if row < 0 or row >= src.height or col < 0 or col >= src.width: - return None - - # Extract 3x3 window with boundary handling - r0, r1 = max(0, row - 1), min(src.height, row + 2) - c0, c1 = max(0, col - 1), min(src.width, col + 2) - red_window = red[r0:r1, c0:c1] - nir_window = nir[r0:r1, c0:c1] - - # Calculate NDVI for each pixel in window - mask = (red_window > 0) & (nir_window > 0) & ~np.isnan(red_window) & ~np.isnan(nir_window) - if not np.any(mask): - return None - - ndvi_window = np.zeros_like(red_window, dtype=np.float32) - ndvi_window[mask] = (nir_window[mask] - red_window[mask]) / (nir_window[mask] + red_window[mask]) - - # Return mean of valid NDVI values - valid_ndvi = ndvi_window[mask] - return float(np.mean(valid_ndvi)) if len(valid_ndvi) > 0 else None - except Exception as e: - return None - - def _create_timeseries_for_dir(input_dir, output_dir, site_position, source_name, pattern="*.geotiff"): print(f"[NDVI-{source_name}] Creating timeseries.json...") timeseries = [] @@ -138,13 +96,17 @@ def _create_timeseries_for_dir(input_dir, output_dir, site_position, source_name f"[NDVI-{source_name}] Warning: Could not extract date from {filename}, using '{date_str}'" ) - ndvi_value = _get_ndvi_from_original(input_file, site_position) + ndvi_value, band_means = _sample_3x3(input_file, site_position) + blue_mean = band_means.get("b02") if band_means else None if ndvi_value is None: print( f"[NDVI-{source_name}] Warning: Could not sample {filename} (outside bounds or nodata)" ) - timeseries.append({"date": date, "filename": filename, "ndvi": ndvi_value}) + entry = {"date": date, "filename": filename, "ndvi": ndvi_value} + if blue_mean is not None: + entry["blue"] = blue_mean + timeseries.append(entry) timeseries.sort(key=lambda x: x["date"]) output_dir.mkdir(parents=True, exist_ok=True) @@ -198,13 +160,6 @@ def generate_ndvi_raw(season, site_position, site_name): pass -def create_ndvi_timeseries_raw(season, site_position, site_name): - for source in ["s2", "s3"]: - input_dir = Path(f"data/{site_name}/{season}/raw/{source}/") - output_dir = Path(f"data/{site_name}/{season}/raw/ndvi/{source}/") - _create_timeseries_for_dir(input_dir, output_dir, site_position, source.upper()) - - def _get_output_name_prepared(geotiff_file): if geotiff_file.suffix == ".tif": if "REFL" in geotiff_file.stem: diff --git a/preparation.py b/preparation.py index ee698cc..02b99e6 100644 --- a/preparation.py +++ b/preparation.py @@ -23,12 +23,16 @@ def _import_distance_to_clouds(): ) -def _load_clouds(clouds_file): +def _load_excluded(season, site_name, cleaning_strategy): + """Load excluded filenames from NDVI timeseries (excluded_aggressive / excluded_nonaggressive).""" + base = Path(f"data/{site_name}/{season}/raw/preselection") + key = f"excluded_{cleaning_strategy}" clouds = {"s2": set(), "s3": set()} - if clouds_file.exists(): - clouds_data = json.loads(clouds_file.read_text()) - clouds["s2"] = set(clouds_data.get("s2", [])) - clouds["s3"] = set(clouds_data.get("s3", [])) + for source in ["s2", "s3"]: + ts_file = base / f"{source}_preselection.json" + if ts_file.exists(): + data = json.loads(ts_file.read_text()) + clouds[source] = {e["filename"] for e in data if e.get(key)} return clouds @@ -71,9 +75,8 @@ def prepare_s2(season, site_position, site_name, cleaning_strategy="aggressive", s2_dir = Path(f"data/{site_name}/{season}/raw/s2/") s3_dir = Path(f"data/{site_name}/{season}/raw/s3/") s2_output_dir = _get_base_dir(season, site_name, cleaning_strategy) / "s2" - clouds_file = Path(f"data/{site_name}/{season}/clouds_{cleaning_strategy}.json") - clouds = _load_clouds(clouds_file) + clouds = _load_excluded(season, site_name, cleaning_strategy) s2_output_dir.mkdir(parents=True, exist_ok=True) s3_files = [f for f in s3_dir.glob("*.geotiff") if f.name not in clouds["s3"]] @@ -116,9 +119,8 @@ def prepare_s3(season, site_position, site_name, cleaning_strategy="aggressive", base_dir = _get_base_dir(season, site_name, cleaning_strategy) s2_prepared_dir = base_dir / "s2" s3_preprocessed_dir = base_dir / "s3" - clouds_file = Path(f"data/{site_name}/{season}/clouds_{cleaning_strategy}.json") - clouds = _load_clouds(clouds_file) + clouds = _load_excluded(season, site_name, cleaning_strategy) s3_preprocessed_dir.mkdir(parents=True, exist_ok=True) s3_by_date = defaultdict(list) diff --git a/preselection.py b/preselection.py index 9de5f4d..6aced47 100644 --- a/preselection.py +++ b/preselection.py @@ -1,69 +1,140 @@ -"""Pre-selection: NDVI-based cloud/flaw filtering for S2 and S3 data.""" +"""Pre-selection: self-contained NDVI timeseries with cloud/dark-imagery exclusion markers.""" +import csv import json +import numpy as np +import rasterio +from rasterio.warp import transform as transform_coords from pathlib import Path from datetime import datetime WINDOW_DAYS = 14 MIN_WINDOW_SIZE = 3 THRESHOLDS = {"aggressive": {"threshold": 0.3, "delta": 0.15}, "nonaggressive": {"threshold": 0.2, "delta": 0.25}} +BLUE_MIN = 0.01 + +GREEN_BAND = 2 +RED_BAND = 3 +NIR_BAND = 4 +BLUE_BAND = 1 +BAND_KEYS = ["b02", "b03", "b04", "b8a"] -def detect_clouds(season, site_name, cleaning_strategy="aggressive"): - """Filter cloud-covered/flawed S2 and S3 files using NDVI thresholds.""" - output_file = Path(f"data/{site_name}/{season}/clouds_{cleaning_strategy}.json") - clouds = {"s2": [], "s3": []} - thresholds = THRESHOLDS[cleaning_strategy] +def _sample_3x3(input_file, site_position): + """Sample mean NDVI and all four bands (3x3 window) at site. Returns (ndvi, {b02,b03,b04,b8a}) or (None, None).""" + try: + with rasterio.open(input_file) as src: + if src.count < 4: + return None, None + bands = [src.read(i).astype(np.float32) for i in range(1, 5)] + lon, lat = site_position[1], site_position[0] + x, y = transform_coords("EPSG:4326", src.crs, [lon], [lat]) + if not ( + src.bounds.left <= x[0] <= src.bounds.right + and src.bounds.bottom <= y[0] <= src.bounds.top + ): + return None, None + row, col = src.index(x[0], y[0]) + if row < 0 or row >= src.height or col < 0 or col >= src.width: + return None, None + r0, r1 = max(0, row - 1), min(src.height, row + 2) + c0, c1 = max(0, col - 1), min(src.width, col + 2) + windows = [b[r0:r1, c0:c1] for b in bands] + red_w, nir_w = windows[RED_BAND - 1], windows[NIR_BAND - 1] + mask = (red_w > 0) & (nir_w > 0) & ~np.isnan(red_w) & ~np.isnan(nir_w) + if not np.any(mask): + return None, None + ndvi = float(np.mean((nir_w[mask] - red_w[mask]) / (nir_w[mask] + red_w[mask]))) + band_means = {k: round(float(np.mean(w[mask])), 6) for k, w in zip(BAND_KEYS, windows)} + return ndvi, band_means + except Exception: + return None, None + + +def _extract_date(filename): + for part in filename.replace(".geotiff", "").split("_"): + if len(part) == 8 and part.isdigit(): + return part, datetime.strptime(part, "%Y%m%d").isoformat() + return None, None + + +def _is_excluded(entry, entries, strategy): + """True if entry is excluded by strategy (NDVI threshold/delta or dark blue).""" + th = THRESHOLDS[strategy] + if entry.get("ndvi") is None: + return True + if entry.get("b02") is not None and entry["b02"] < BLUE_MIN: + return True + entry_date = datetime.fromisoformat(entry["date"].replace("Z", "+00:00")) + window_ndvi = [] + for e in entries: + if e.get("ndvi") is None: + continue + d = datetime.fromisoformat(e["date"].replace("Z", "+00:00")) + if abs((d - entry_date).days) <= WINDOW_DAYS: + window_ndvi.append(e["ndvi"]) + if len(window_ndvi) < MIN_WINDOW_SIZE: + return False + threshold = max(window_ndvi) - th["delta"] + return entry["ndvi"] < threshold and entry["ndvi"] < th["threshold"] + + +def create_timeseries(season, site_position, site_name): + """Build NDVI timeseries (3x3 window) for raw S2/S3, with exclusion markers for both strategies.""" + lat, lon = site_position + base = Path(f"data/{site_name}/{season}") + + print(f"[PRESELECT] Creating NDVI timeseries: {site_name} ({lat:.6f}, {lon:.6f}), {season}") for source in ["s2", "s3"]: - timeseries_file = Path( - f"data/{site_name}/{season}/raw/ndvi/{source}/timeseries.json" - ) - if not timeseries_file.exists(): - print(f"[CLOUDS-{source.upper()}] No timeseries.json found") + input_dir = base / "raw" / source + out_dir = base / "raw" / "preselection" + out_dir.mkdir(parents=True, exist_ok=True) + output_file = out_dir / f"{source}_preselection.json" + + if not input_dir.exists(): + print(f"[PRESELECT] Skipping {source}: {input_dir} not found") continue - print(f"[CLOUDS-{source.upper()}] Processing {timeseries_file}...") - - with open(timeseries_file) as f: - timeseries = json.load(f) - - # Flag entries with ndvi: None as outliers (bad/invalid data) - for e in timeseries: - if e.get("ndvi") is None: - clouds[source].append(e["filename"]) - - entries = [ - (e, datetime.fromisoformat(e["date"].replace("Z", "+00:00"))) - for e in timeseries - if e.get("ndvi") is not None - ] - - for entry, entry_date in entries: - window_ndvi = [ - e["ndvi"] - for e, d in entries - if abs((d - entry_date).days) <= WINDOW_DAYS - ] - - if len(window_ndvi) < MIN_WINDOW_SIZE: + timeseries = [] + for f in sorted(input_dir.glob("*.geotiff")): + if "DIST_CLOUD" in f.name: continue + date_str, date_iso = _extract_date(f.name) + if not date_str: + continue + ndvi, band_means = _sample_3x3(f, site_position) + entry = {"filename": f.name, "date": date_iso, "ndvi": ndvi} + if band_means: + entry.update(band_means) + timeseries.append(entry) - max_ndvi = max(window_ndvi) - threshold = max_ndvi - thresholds["delta"] + timeseries.sort(key=lambda e: e["date"]) + for e in timeseries: + e["excluded_aggressive"] = _is_excluded(e, timeseries, "aggressive") + e["excluded_nonaggressive"] = _is_excluded(e, timeseries, "nonaggressive") - if entry["ndvi"] < threshold and entry["ndvi"] < thresholds["threshold"]: - clouds[source].append(entry["filename"]) + with open(output_file, "w") as out: + json.dump(timeseries, out, indent=2) - print( - f"[CLOUDS-{source.upper()}] Found {len(clouds[source])} cloud-covered files" - ) + csv_file = out_dir / f"{source}_preselection.csv" + fieldnames = ["filename", "date", "ndvi"] + BAND_KEYS + ["excluded_aggressive", "excluded_nonaggressive"] + with open(csv_file, "w", newline="") as out: + w = csv.DictWriter(out, fieldnames=fieldnames, extrasaction="ignore") + w.writeheader() + for e in timeseries: + w.writerow({k: e.get(k) for k in fieldnames}) - output_file.parent.mkdir(parents=True, exist_ok=True) - with open(output_file, "w") as f: - json.dump(clouds, f, indent=2) + n_excl_agg = sum(1 for e in timeseries if e["excluded_aggressive"]) + n_excl_non = sum(1 for e in timeseries if e["excluded_nonaggressive"]) + print(f"[PRESELECT] Saved {output_file} + {csv_file.name}: {len(timeseries)} entries ({n_excl_agg} aggressive, {n_excl_non} nonaggressive excluded)") - print(f"[CLOUDS] Saved: {output_file}") + print("[PRESELECT] Completed") -# Alias for backward compatibility -preselect = detect_clouds +# Backward compatibility +def detect_clouds(season, site_position, site_name, cleaning_strategy="aggressive"): + """Create timeseries with exclusion markers. Strategy is read from timeseries when preparing.""" + create_timeseries(season, site_position, site_name) + + +preselect = create_timeseries diff --git a/run.py b/run.py index 7f645c7..bf54b6f 100644 --- a/run.py +++ b/run.py @@ -1,46 +1,39 @@ -from fusion import run_all_efast_scenarios -from postprocessing import process_all_scenarios -from metrics_indices import ( - create_ndvi_timeseries_raw, - create_ndvi_timeseries_post_process, - create_gcc_timeseries_post_process, - create_s2_bands_timeseries_post_process, -) +# from fusion import run_all_efast_scenarios +# from postprocessing import process_all_scenarios +# from metrics_indices import ( +# create_ndvi_timeseries_post_process, +# create_gcc_timeseries_post_process, +# create_s2_bands_timeseries_post_process, +# ) from acquisition_s2 import download_s2 from acquisition_s3 import download_s3 -from acquisition_phenocam import download_phenocam, download_phenocam_greenness -from metrics_stats import calculate_all_metrics +from acquisition_phenocam import download_phenocam +from preselection import create_timeseries +# from metrics_stats import calculate_all_metrics def run_pipeline(season, site_position, site_name): - """Run pipeline (downloads + EFAST fusion + post-process + metrics).""" + """Run pipeline (downloads + preselection).""" try: - # Download steps (needed for new site/season) - #download_s2(season, site_position, site_name) - #download_s3(season, site_position, site_name) - #download_phenocam(season, site_position, site_name) - #download_phenocam_greenness(season, site_position, site_name) + download_s2(season, site_position, site_name) + download_s3(season, site_position, site_name) + download_phenocam(season, site_position, site_name) - #print(f"Generating NDVI for raw data: {site_name}, {season}") - #create_ndvi_timeseries_raw(season, site_position, site_name) + print(f"Creating preselection timeseries: {site_name}, {season}") + create_timeseries(season, site_position, site_name) - print(f"Running EFAST fusion for all scenarios: {site_name}, {season}") - run_all_efast_scenarios(season, site_position, site_name) - - print(f"Post-processing data: {site_name}, {season}") - process_all_scenarios(season, site_position, site_name) - - print(f"Generating NDVI for final outputs: {site_name}, {season}") - create_ndvi_timeseries_post_process(season, site_position, site_name) - - print(f"Generating GCC for final outputs: {site_name}, {season}") - create_gcc_timeseries_post_process(season, site_position, site_name) - - print(f"Generating S2 band timeseries: {site_name}, {season}") - create_s2_bands_timeseries_post_process(season, site_position, site_name) - - print(f"Calculating metrics: {site_name}, {season}") - calculate_all_metrics(season, site_name, site_position) + # print(f"Running EFAST fusion for all scenarios: {site_name}, {season}") + # run_all_efast_scenarios(season, site_position, site_name) + # print(f"Post-processing data: {site_name}, {season}") + # process_all_scenarios(season, site_position, site_name) + # print(f"Generating NDVI for final outputs: {site_name}, {season}") + # create_ndvi_timeseries_post_process(season, site_position, site_name) + # print(f"Generating GCC for final outputs: {site_name}, {season}") + # create_gcc_timeseries_post_process(season, site_position, site_name) + # print(f"Generating S2 band timeseries: {site_name}, {season}") + # create_s2_bands_timeseries_post_process(season, site_position, site_name) + # print(f"Calculating metrics: {site_name}, {season}") + # calculate_all_metrics(season, site_name, site_position) except Exception as e: print(f"Error: {e}") @@ -48,8 +41,11 @@ def run_pipeline(season, site_position, site_name): if __name__ == "__main__": - - run_pipeline(2024, (47.116171, 11.320308), "innsbruck") - # forthgr - FORTH Heraklion Greece, Agriculture, 2024 - # sites.geojson: lon=25.0743, lat=35.3045 #run_pipeline(2024, (35.3045, 25.0743), "forthgr") + run_pipeline(2024, (47.116171, 11.320308), "innsbruck") + #run_pipeline(2020, (47.116171, 11.320308), "innsbruck") + #run_pipeline(2024, (58.5633, 24.3688), "pitsalu") + #run_pipeline(2023, (64.2437, 19.7673), "vindeln2") + #run_pipeline(2024, (36.7455, -6.0033), "sunflowerjerez1") + #run_pipeline(2024, (42.6558, 26.9837), "institutekarnobat") + diff --git a/webapp/cloudy.html b/webapp/cloudy.html index b45e660..b979797 100644 --- a/webapp/cloudy.html +++ b/webapp/cloudy.html @@ -84,13 +84,15 @@ const showCloudsCheckbox = document.getElementById("showClouds"); async function loadTimeseries() { - const [s2, s3, cloudData] = await Promise.all([ - fetch("../data/innsbruck/2024/raw/ndvi/s2/timeseries.json").then(r => r.json()), - fetch("../data/innsbruck/2024/raw/ndvi/s3/timeseries.json").then(r => r.json()), - fetch("../data/innsbruck/2024/clouds.json").then(r => r.json()).catch(() => ({ s2: [], s3: [] })) + const [s2, s3] = await Promise.all([ + fetch("../data/innsbruck/2024/raw/preselection/s2_preselection.json").then(r => r.json()), + fetch("../data/innsbruck/2024/raw/preselection/s3_preselection.json").then(r => r.json()) ]); timeseries = { s2, s3 }; - clouds = { s2: new Set(cloudData.s2 || []), s3: new Set(cloudData.s3 || []) }; + clouds = { + s2: new Set((s2 || []).filter(e => e.excluded_aggressive).map(e => e.filename)), + s3: new Set((s3 || []).filter(e => e.excluded_aggressive).map(e => e.filename)) + }; drawTimeseries(); } @@ -244,7 +246,7 @@ } async function loadNDVI(source, filename) { - const tiff = await GeoTIFF.fromArrayBuffer(await (await fetch(`../data/innsbruck/2024/raw/ndvi/${source}/${filename}`)).arrayBuffer()); + const tiff = await GeoTIFF.fromArrayBuffer(await (await fetch(`../data/innsbruck/2024/raw/${source}/${filename}`)).arrayBuffer()); const image = await tiff.getImage(); const data = Array.from((await image.readRasters())[0]); const width = image.getWidth(); @@ -303,7 +305,7 @@ const filename = `${date}_${i}.geotiff`; if (!showCloudsCheckbox.checked && clouds[source] && clouds[source].has(filename)) continue; try { - const res = await fetch(`../data/innsbruck/2024/raw/ndvi/${source}/${filename}`); + const res = await fetch(`../data/innsbruck/2024/raw/${source}/${filename}`); if (res.ok) return filename; } catch {} }