Process benchmarking inputs¶
Read in the raw benchmarking inputs and process them to make plottable data.
Note: we split this step out from the benchmarking-make_inputs.ipynb script because it requires significantly less memory (and therefore a less expensive machine), but it still takes some time, so we only want to run it once. Operations should be achievable with < 35 GB of memory.
In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
In [2]:
# --- read in data
s3_path = 's3://carbonplan-ocr/evaluation/'
savename = 'benchmarking-input-dat.zarr'
ds = xr.open_zarr(os.path.join(s3_path, savename))
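As a quick sanity check on the memory note above, it can help to look at what the lazily-opened dataset contains before slicing it. A minimal sketch (assuming the Zarr store opens with Dask-backed arrays, so nothing is loaded here):
# inspect the lazily-opened dataset: variable names, dimension sizes, and total (uncompressed) size
print(list(ds.data_vars))
print(dict(ds.sizes))
print(f'total size: {ds.nbytes / 1e9:.1f} GB')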
In [3]:
# --- filter to four slices
# (CONUS, west, east, testbox) and later (all data, non-burnable)
slicenames = {
'testbox': {'minlat': 42.7, 'maxlat': 46.3, 'minlon': -116.8, 'maxlon': -112.8},
'CONUS': None,
'West of -98': {'maxlon': -98},
'East of -98': {'minlon': -98},
}
outdict = {} # to hold outputs
for slc_key, slc_val in slicenames.items():
if slc_key == 'testbox':
outdict[slc_key] = ds.sel(
latitude=slice(slc_val['maxlat'], slc_val['minlat']),
longitude=slice(slc_val['minlon'], slc_val['maxlon']),
)
elif slc_key == 'CONUS':
outdict[slc_key] = ds.copy()
elif slc_key == 'West of -98':
outdict[slc_key] = ds.where(ds['longitude'] < slc_val['maxlon'], drop=True)
elif slc_key == 'East of -98':
outdict[slc_key] = ds.where(ds['longitude'] >= slc_val['minlon'], drop=True)
Create N bins of data (for plotting distributions)¶
In [4]:
# --- set bin bounds
n_bins = 10000 # this number controls distribution resolution
bp_range = (0, 0.14)  # to confirm the max, run: dsslice['burn_probability_2011'].max(skipna=True).compute()
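With n_bins = 10000 over bp_range = (0, 0.14), each bin is 0.14 / 10000 = 1.4e-5 wide. The edges and centers that da.histogram produces below can be reproduced directly; a small sketch:
# reproduce the bin edges / centers that da.histogram will use below
edges = np.linspace(bp_range[0], bp_range[1], n_bins + 1)
centers = (edges[:-1] + edges[1:]) / 2
print(edges[1] - edges[0])  # bin width = 0.14 / 10000 = 1.4e-05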
Bins across burn mask conditions¶
In [5]:
import dask.array as da
df_dict = {} # to hold results
for dsname, dsslice in outdict.items():
# track progress
print(f'Now solving: {dsname}')
# define variables
bp = dsslice['burn_probability_2011']
mask = dsslice['burn_mask']
riley_b_mask = dsslice['riley_burnable_mask']
burned = mask == 1
unburned = mask == 0
riley_burnable = riley_b_mask == 1
riley_unburnable = riley_b_mask == 0
# --- compute histograms lazily
# [ ALL DATA ]
burned_hist, bins = da.histogram(bp.where(burned).data, bins=n_bins, range=bp_range)
unburned_hist, _ = da.histogram(bp.where(unburned).data, bins=n_bins, range=bp_range)
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, bins = da.histogram(
bp.where((burned) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
unburned_hist_unburnable, _ = da.histogram(
bp.where((unburned) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
# bring just the histograms into memory
bin_centers = (bins[:-1] + bins[1:]) / 2
# [ ALL DATA ]
burned_hist, unburned_hist = da.compute(burned_hist, unburned_hist)
# normalize counts to density
burned_density = burned_hist / burned_hist.sum()
unburned_density = unburned_hist / unburned_hist.sum()
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, unburned_hist_unburnable = da.compute(
burned_hist_unburnable, unburned_hist_unburnable
)
# normalize counts to density
burned_density_unburnable = burned_hist_unburnable / burned_hist_unburnable.sum()
unburned_density_unburnable = unburned_hist_unburnable / unburned_hist_unburnable.sum()
# --- combine into single dataframe
tmpdict = {
'bin_centers': bin_centers,
'burned_BPdensity': burned_density,
'unburned_BPdensity': unburned_density,
'burned_BPdensity_NB': burned_density_unburnable,
'unburned_BPdensity_NB': unburned_density_unburnable,
}
df_dict[dsname] = pd.DataFrame(tmpdict)
Now solving: testbox
Now solving: CONUS
Now solving: West of -98
Now solving: East of -98
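These per-slice dataframes are what get plotted downstream. A minimal plotting sketch (assuming matplotlib is available in the environment; column names as built above):
import matplotlib.pyplot as plt

# compare burned vs. unburned burn-probability distributions for one slice
df = df_dict['CONUS']
fig, ax = plt.subplots()
ax.plot(df['bin_centers'], df['burned_BPdensity'], label='burned')
ax.plot(df['bin_centers'], df['unburned_BPdensity'], label='unburned')
ax.set_xlabel('burn probability (2011)')
ax.set_ylabel('density')
ax.legend()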
Save mask dict¶
In [6]:
import s3fs
s3 = s3fs.S3FileSystem()
for name, df in df_dict.items():
path = f's3://carbonplan-ocr/evaluation/benchmarking-processed/{name}_maskdf.parquet'
with s3.open(path, 'wb') as f:
df.to_parquet(f)
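To read one of these dataframes back later (e.g. in a plotting notebook), pandas can open the parquet directly from S3 when s3fs is installed; a small sketch using the same path pattern as above:
# read one processed dataframe back from S3
df_conus = pd.read_parquet(
    's3://carbonplan-ocr/evaluation/benchmarking-processed/CONUS_maskdf.parquet'
)
df_conus.head()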
Bins across burn sum conditions¶
In [7]:
# --- RUN ONCE: get the max number of burns
# **** N = 156 (!!) ****
# nburn_max = ds['burn_sum'].max(skipna=True).compute()
# ---
# (earlier testing indicates that pixels where nburn > 20 are very rare,
# so we'll split into 0-20 for now)
nburn_arr = np.arange(0, 21, 1)
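The 0-20 cutoff rests on the observation that pixels with more than 20 burns are very rare; one way to double-check that claim (a one-off, eager computation) would be:
# one-off check: fraction of valid pixels with more than 20 recorded burns
n_gt20 = (ds['burn_sum'] > 20).sum()
n_valid = ds['burn_sum'].notnull().sum()
print(float(n_gt20.compute()) / float(n_valid.compute()))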
In [8]:
import dask.array as da
ds_dict_burnsum = {} # to hold results
for dsname, dsslice in outdict.items():
# track progress
print(f'Now solving: {dsname}')
# define variables
bp = dsslice['burn_probability_2011']
burnsum = dsslice['burn_sum']
riley_b_mask = dsslice['riley_burnable_mask']
riley_burnable = riley_b_mask == 1
riley_unburnable = riley_b_mask == 0
dfs_nburn = {} # empty dict to hold one dataframe per nburn_arr
for nfire in nburn_arr:
print(f'n fires: {nfire}', end='\r')
# --- create a mask for just these burns
this_burnsum_mask = burnsum == nfire
# --- compute histograms lazily
# [ ALL DATA ]
burned_hist, bins = da.histogram(
bp.where(this_burnsum_mask).data, bins=n_bins, range=bp_range
)
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, bins = da.histogram(
bp.where((this_burnsum_mask) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
# bring just the histograms into memory
bin_centers = (bins[:-1] + bins[1:]) / 2
# [ ALL DATA ]
burned_hist = da.compute(burned_hist)
# normalize counts to density
with np.errstate(
invalid='ignore', divide='ignore'
): # to handle cases where no pixel has this number of burns
burned_density = burned_hist[0] / burned_hist[0].sum()
# [ UNBURNABLE ONLY ]
burned_hist_unburnable = da.compute(burned_hist_unburnable)
# normalize counts to density
with np.errstate(
invalid='ignore', divide='ignore'
): # to handle cases where no pixel has this number of burns
burned_density_unburnable = burned_hist_unburnable[0] / burned_hist_unburnable[0].sum()
# --- combine into single dataframe
tmpdict = {
'n_burns': nfire,
'bin': bin_centers,
'burned_BPdensity': burned_density,
'burned_BPdensity_NB': burned_density_unburnable,
}
dfs_nburn[str(nfire)] = pd.DataFrame(tmpdict)
# --- create a processed dataset
# with dims = n_burns, bin
# dict to list
dfs_list = list(dfs_nburn.values())
# stack each data column into a 2D array
data_vars = {}
for col in dfs_list[0].columns:
if col in ['n_burns', 'bin']:
continue
data_vars[col] = (('n_burns', 'bin'), [df[col].values for df in dfs_list])
# build the dataset
tmpds = xr.Dataset(
data_vars=data_vars,
coords={
'n_burns': nburn_arr,
'bin': bin_centers,
},
)
ds_dict_burnsum[dsname] = tmpds
Now solving: testbox
Now solving: CONUS
Now solving: West of -98
Now solving: East of -98
n fires: 20
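Each resulting dataset is indexed by n_burns and bin, so a single distribution can be pulled out by selection; a minimal sketch:
# pull the distribution for pixels that burned exactly 5 times in one slice
one_curve = ds_dict_burnsum['CONUS'].sel(n_burns=5)['burned_BPdensity']
print(one_curve.dims, float(one_curve.sum()))  # density sums to ~1 (or NaN if no such pixels)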
In [17]:
import s3fs
s3 = s3fs.S3FileSystem()
for name, ds in ds_dict_burnsum.items():
path = f's3://carbonplan-ocr/evaluation/benchmarking-processed/{name}_sumds.zarr'
ds.to_zarr(store=s3.get_mapper(path), mode='w')
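Downstream, these processed datasets can be opened straight from S3 with xarray (assuming s3fs/fsspec is available, as it is in this environment); a small sketch mirroring the path pattern above:
# read one processed dataset back from S3
sumds = xr.open_zarr('s3://carbonplan-ocr/evaluation/benchmarking-processed/CONUS_sumds.zarr')
print(sumds)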