Process benchmarking inputs¶
Read in the raw benchmarking inputs and process them to make plottable data.
Note: we split this step out from the benchmarking-make_inputs.ipynb script because it requires significantly less memory (and therefore a less expensive machine), but it still takes some time, so we only want to run it once. Operations should be achievable with < 35 GB of memory.
In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
In [2]:
# --- read in data
s3_path = 's3://carbonplan-ocr/evaluation/'
savename = 'benchmarking-input-dat.zarr'
ds = xr.open_zarr(os.path.join(s3_path, savename))
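As a quick sanity check on the memory note above, it can help to look at what the lazily-opened dataset contains before slicing it. A minimal sketch (assuming the Zarr store opens with Dask-backed arrays, so nothing is loaded here):
# inspect the lazily-opened dataset: variable names, dimension sizes, and total (uncompressed) size
print(list(ds.data_vars))
print(dict(ds.sizes))
print(f'total size: {ds.nbytes / 1e9:.1f} GB')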
In [3]:
# --- filter to four slices
# (CONUS, west, east, testbox) and later (all data, non-burnable)
slicenames = {
'testbox': {'minlat': 42.7, 'maxlat': 46.3, 'minlon': -116.8, 'maxlon': -112.8},
'CONUS': None,
'West of -98': {'maxlon': -98},
'East of -98': {'minlon': -98},
}
outdict = {} # to hold outputs
for slc_key, slc_val in slicenames.items():
if slc_key == 'testbox':
outdict[slc_key] = ds.sel(
latitude=slice(slc_val['maxlat'], slc_val['minlat']),
longitude=slice(slc_val['minlon'], slc_val['maxlon']),
)
elif slc_key == 'CONUS':
outdict[slc_key] = ds.copy()
elif slc_key == 'West of -98':
outdict[slc_key] = ds.where(ds['longitude'] < slc_val['maxlon'], drop=True)
elif slc_key == 'East of -98':
outdict[slc_key] = ds.where(ds['longitude'] >= slc_val['minlon'], drop=True)
Create N bins of data (for plotting distributions)¶
In [4]:
# --- set bin bounds
n_bins = 10000 # this number controls distribution resolution
bp_range = (0, 0.14)  # to confirm the max, run: dsslice['burn_probability_2011'].max(skipna=True).compute()
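With n_bins = 10000 over bp_range = (0, 0.14), each bin is 0.14 / 10000 = 1.4e-5 wide. The edges and centers that da.histogram produces below can be reproduced directly; a small sketch:
# reproduce the bin edges / centers that da.histogram will use below
edges = np.linspace(bp_range[0], bp_range[1], n_bins + 1)
centers = (edges[:-1] + edges[1:]) / 2
print(edges[1] - edges[0])  # bin width = 0.14 / 10000 = 1.4e-05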
Bins across burn mask conditions¶
In [5]:
import dask.array as da
df_dict = {} # to hold results
for dsname, dsslice in outdict.items():
# track progress
print(f'Now solving: {dsname}')
# define variables
bp = dsslice['burn_probability_2011']
mask = dsslice['burn_mask']
riley_b_mask = dsslice['riley_burnable_mask']
burned = mask == 1
unburned = mask == 0
riley_burnable = riley_b_mask == 1
riley_unburnable = riley_b_mask == 0
# --- compute histograms lazily
# [ ALL DATA ]
burned_hist, bins = da.histogram(bp.where(burned).data, bins=n_bins, range=bp_range)
unburned_hist, _ = da.histogram(bp.where(unburned).data, bins=n_bins, range=bp_range)
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, bins = da.histogram(
bp.where((burned) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
unburned_hist_unburnable, _ = da.histogram(
bp.where((unburned) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
# bring just the histograms into memory
bin_centers = (bins[:-1] + bins[1:]) / 2
# [ ALL DATA ]
burned_hist, unburned_hist = da.compute(burned_hist, unburned_hist)
# normalize counts to density
burned_density = burned_hist / burned_hist.sum()
unburned_density = unburned_hist / unburned_hist.sum()
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, unburned_hist_unburnable = da.compute(
burned_hist_unburnable, unburned_hist_unburnable
)
# normalize counts to density
burned_density_unburnable = burned_hist_unburnable / burned_hist_unburnable.sum()
unburned_density_unburnable = unburned_hist_unburnable / unburned_hist_unburnable.sum()
# --- combine into single dataframe
tmpdict = {
'bin_centers': bin_centers,
'burned_BPdensity': burned_density,
'unburned_BPdensity': unburned_density,
'burned_BPdensity_NB': burned_density_unburnable,
'unburned_BPdensity_NB': unburned_density_unburnable,
}
df_dict[dsname] = pd.DataFrame(tmpdict)
Now solving: testbox
Now solving: CONUS
Now solving: West of -98
Now solving: East of -98
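These per-slice dataframes are what get plotted downstream. A minimal plotting sketch (assuming matplotlib is available in the environment; column names as built above):
import matplotlib.pyplot as plt

# compare burned vs. unburned burn-probability distributions for one slice
df = df_dict['CONUS']
fig, ax = plt.subplots()
ax.plot(df['bin_centers'], df['burned_BPdensity'], label='burned')
ax.plot(df['bin_centers'], df['unburned_BPdensity'], label='unburned')
ax.set_xlabel('burn probability (2011)')
ax.set_ylabel('density')
ax.legend()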
Save mask dict¶
In [6]:
import s3fs
s3 = s3fs.S3FileSystem()
for name, df in df_dict.items():
path = f's3://carbonplan-ocr/evaluation/benchmarking-processed/{name}_maskdf.parquet'
with s3.open(path, 'wb') as f:
df.to_parquet(f)
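To read one of these dataframes back later (e.g. in a plotting notebook), pandas can open the parquet directly from S3 when s3fs is installed; a small sketch using the same path pattern as above:
# read one processed dataframe back from S3
df_conus = pd.read_parquet(
    's3://carbonplan-ocr/evaluation/benchmarking-processed/CONUS_maskdf.parquet'
)
df_conus.head()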
Bins across burn sum conditions¶
In [7]:
# --- RUN ONCE: get the max number of burns
# **** N = 156 (!!) ****
# nburn_max = ds['burn_sum'].max(skipna=True).compute()
# ---
# (earlier testing indicates that pixels where nburn > 20 are very rare,
# so we'll split into 0-20 for now)
nburn_arr = np.arange(0, 21, 1)
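The 0-20 cutoff rests on the observation that pixels with more than 20 burns are very rare; one way to double-check that claim (a one-off, eager computation) would be:
# one-off check: fraction of valid pixels with more than 20 recorded burns
n_gt20 = (ds['burn_sum'] > 20).sum()
n_valid = ds['burn_sum'].notnull().sum()
print(float(n_gt20.compute()) / float(n_valid.compute()))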
In [8]:
import dask.array as da
ds_dict_burnsum = {} # to hold results
for dsname, dsslice in outdict.items():
# track progress
print(f'Now solving: {dsname}')
# define variables
bp = dsslice['burn_probability_2011']
burnsum = dsslice['burn_sum']
riley_b_mask = dsslice['riley_burnable_mask']
riley_burnable = riley_b_mask == 1
riley_unburnable = riley_b_mask == 0
dfs_nburn = {} # empty dict to hold one dataframe per nburn_arr
for nfire in nburn_arr:
print(f'n fires: {nfire}', end='\r')
# --- create a mask for just these burns
this_burnsum_mask = burnsum == nfire
# --- compute histograms lazily
# [ ALL DATA ]
burned_hist, bins = da.histogram(
bp.where(this_burnsum_mask).data, bins=n_bins, range=bp_range
)
# [ UNBURNABLE ONLY ]
burned_hist_unburnable, bins = da.histogram(
bp.where((this_burnsum_mask) & (riley_unburnable)).data, bins=n_bins, range=bp_range
)
# bring just the histograms into memory
bin_centers = (bins[:-1] + bins[1:]) / 2
# [ ALL DATA ]
burned_hist = da.compute(burned_hist)
# normalize counts to density
with np.errstate(
invalid='ignore', divide='ignore'
): # to handle cases where no pixel has this number of burns
burned_density = burned_hist[0] / burned_hist[0].sum()
# [ UNBURNABLE ONLY ]
burned_hist_unburnable = da.compute(burned_hist_unburnable)
# normalize counts to density
with np.errstate(
invalid='ignore', divide='ignore'
): # to handle cases where no pixel has this number of burns
burned_density_unburnable = burned_hist_unburnable[0] / burned_hist_unburnable[0].sum()
# --- combine into single dataframe
tmpdict = {
'n_burns': nfire,
'bin': bin_centers,
'burned_BPdensity': burned_density,
'burned_BPdensity_NB': burned_density_unburnable,
}
dfs_nburn[str(nfire)] = pd.DataFrame(tmpdict)
# --- create a processed dataset
# with dims = n_burns, bin
# dict to list
dfs_list = list(dfs_nburn.values())
# stack each data column into a 2D array
data_vars = {}
for col in dfs_list[0].columns:
if col in ['n_burns', 'bin']:
continue
data_vars[col] = (('n_burns', 'bin'), [df[col].values for df in dfs_list])
# build the dataset
tmpds = xr.Dataset(
data_vars=data_vars,
coords={
'n_burns': nburn_arr,
'bin': bin_centers,
},
)
ds_dict_burnsum[dsname] = tmpds
Now solving: testbox
Now solving: CONUS
Now solving: West of -98
Now solving: East of -98
n fires: 20
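Each resulting dataset is indexed by n_burns and bin, so a single distribution can be pulled out by selection; a minimal sketch:
# pull the distribution for pixels that burned exactly 5 times in one slice
one_curve = ds_dict_burnsum['CONUS'].sel(n_burns=5)['burned_BPdensity']
print(one_curve.dims, float(one_curve.sum()))  # density sums to ~1 (or NaN if no such pixels)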
In [17]:
import s3fs
s3 = s3fs.S3FileSystem()
for name, ds in ds_dict_burnsum.items():
path = f's3://carbonplan-ocr/evaluation/benchmarking-processed/{name}_sumds.zarr'
ds.to_zarr(store=s3.get_mapper(path), mode='w')
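Downstream, these processed datasets can be opened straight from S3 with xarray (assuming s3fs/fsspec is available, as it is in this environment); a small sketch mirroring the path pattern above:
# read one processed dataset back from S3
sumds = xr.open_zarr('s3://carbonplan-ocr/evaluation/benchmarking-processed/CONUS_sumds.zarr')
print(sumds)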