Comparing address-level risk estimates for CONUS¶
This notebook compares address-level risk estimates from our dataset with those from the Wildfire Risk to Communities dataset (WRC, Scott et al., 2024). We calculate a set of comparison criteria to understand how our datasets are similar and different.
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import kendalltau
from ocr import catalog
def apply_kendall_tau(x, y):
# we use the c variant here because the comparisons here have non-square contingency tables
tau, p_value = kendalltau(x, y, variant='c')
return pd.Series({'tau': tau, 'p_value': p_value})
def apply_bias(x, y):
bias = np.mean(x - y)
return pd.Series({'bias': bias})
def apply_median(x, label):
median = np.median(x)
return pd.Series({f'median_{label}': median})
def apply_mean(x, label):
mean = np.mean(x)
return pd.Series({f'mean_{label}': mean})
def fraction_zeros(x, y):
x_fract_zero = (x == 0).sum() / len(x)
y_fract_zero = (y == 0).sum() / len(y)
return pd.Series({'x_fract_zero': x_fract_zero, 'y_fract_zero': y_fract_zero})
def apply_corr(x, y):
corr = np.corrcoef(x, y)
return pd.Series({'corr': corr[0, 1]})
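As a quick, self-contained sanity check, the snippet below applies the helpers to a small synthetic sample (the numbers are made up, and the column names simply mirror the ones used later). The coarse, heavily tied 'USFS_RPS'-style column illustrates why we use the tau-c variant, which handles non-square contingency tables, rather than the default tau-b.
# sanity-check the helper functions on made-up data (illustrative only)
rng = np.random.default_rng(0)
toy = pd.DataFrame(
    {
        # continuous risk values with many distinct levels
        'wind_risk_2011': rng.uniform(0, 0.05, size=50),
        # coarser values with only a few distinct levels (and some zeros),
        # giving a non-square contingency table against the column above
        'USFS_RPS': rng.integers(0, 4, size=50) * 0.01,
    }
)
print(apply_kendall_tau(toy['wind_risk_2011'], toy['USFS_RPS']))
print(apply_bias(toy['wind_risk_2011'], toy['USFS_RPS']))
print(apply_corr(toy['wind_risk_2011'], toy['USFS_RPS']))
print(fraction_zeros(toy['wind_risk_2011'], toy['USFS_RPS']))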
states = gpd.read_file('s3://carbonplan-risks/shapefiles/cb_2018_us_state_20m.zip')
states = states[~(states['STUSPS'].isin(['AK', 'PR', 'HI']))]
census_tracts_df = (
catalog.get_dataset('us-census-tracts')
.query_geoparquet(
"SELECT GEOID, NAME, ST_AsText(geometry) as geometry, bbox FROM read_parquet('{s3_path}')"
)
.df()
)
census_tracts = gpd.GeoDataFrame(
census_tracts_df, geometry=gpd.GeoSeries.from_wkt(census_tracts_df['geometry'])
)
census_tracts.head()
| | GEOID | NAME | geometry | bbox |
|---|---|---|---|---|
| 0 | 01063060102 | 601.02 | POLYGON ((-88.03501 32.83534, -88.03501 32.835... | {'xmin': -88.035013, 'ymin': 32.744738, 'xmax'... |
| 1 | 01063060101 | 601.01 | POLYGON ((-87.99829 32.76851, -87.99326 32.771... | {'xmin': -87.998289, 'ymin': 32.739319, 'xmax'... |
| 2 | 01069040802 | 408.02 | POLYGON ((-85.44941 31.15536, -85.44937 31.155... | {'xmin': -85.449475, 'ymin': 31.10551, 'xmax':... |
| 3 | 01069040204 | 402.04 | POLYGON ((-85.46063 31.25039, -85.4606 31.2506... | {'xmin': -85.46063, 'ymin': 31.225301, 'xmax':... |
| 4 | 01069041902 | 419.02 | POLYGON ((-85.30625 31.14305, -85.30624 31.143... | {'xmin': -85.306265, 'ymin': 31.058496, 'xmax'... |
Load in a parquet file describing risk for the ~150 million buildings in the dataset. Each building carries an approximately present-day risk value from our dataset and from the WRC dataset. We calculate Kendall's Tau, bias, median, mean, and correlation to better understand how the datasets compare. While the dataset exists at 30 m resolution for the entirety of CONUS, the focus of our comparisons is developed areas, so we restrict this comparison to address-level buildings.
%%time
# NOTE: if either of these is True you need to be on a machine with ~200GB memory
remake_building_tract_file = True
remake_tract_stats_file = True
version = '0.12.0'
if remake_building_tract_file:
# NOTE: anything reading this file requires being on a machine with ~200GB memory
building_file = gpd.read_parquet(
f's3://carbonplan-ocr/output/fire-risk/vector/production/v{version}/geoparquet/consolidated-buildings.parquet'
)
building_file = building_file.drop(['bbox'], axis=1)
# intersects will duplicate buildings that straddle multiple census tracts. we could clean this up
# later by de-duplicating buildings
buildings_in_census_tracts = gpd.sjoin(building_file, census_tracts[['GEOID', 'geometry']])
buildings_in_census_tracts.to_parquet(
f's3://carbonplan-ocr/evaluation/comparisons/buildings-tracts-geo-{version}.parquet'
)
if remake_tract_stats_file:
# NOTE: anything reading this file requires being on a machine with ~200GB memory
if not remake_building_tract_file:
buildings_in_census_tracts = gpd.read_parquet(
f's3://carbonplan-ocr/evaluation/comparisons/buildings-tracts-geo-{version}.parquet'
)
census_tracts = census_tracts.rename({'GEOID': 'GEOID_right'}, axis=1)
df_buildings_in_census_tracts = buildings_in_census_tracts[
['USFS_RPS', 'wind_risk_2011', 'wind_risk_2047', 'GEOID_right']
]
tract_tau = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: apply_kendall_tau(g['wind_risk_2011'], g['USFS_RPS'])
)
tract_performance_stats = census_tracts[['GEOID_right', 'geometry']].merge(
tract_tau, on='GEOID_right'
)
# add what fraction of the buildings have zero risk because those tracts should be removed since tau is NaN
new_df = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: fraction_zeros(g['wind_risk_2011'], g['USFS_RPS'])
)
tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID_right')
# add correlation calculation
new_df = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: apply_corr(g['wind_risk_2011'], g['USFS_RPS'])
)
tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID_right')
# add bias calculation
new_df = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: apply_bias(g['wind_risk_2011'], g['USFS_RPS'])
)
tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID_right')
# add mean calculation for both x and y
for setup in ['wind_risk_2011', 'USFS_RPS']:
new_df = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: apply_mean(g[setup], setup)
)
tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID_right')
# add median calculation for both x and y
for setup in ['wind_risk_2011', 'USFS_RPS']:
new_df = df_buildings_in_census_tracts.groupby('GEOID_right').apply(
lambda g: apply_median(g[setup], setup)
)
tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID_right')
tract_performance_stats['normalized_bias'] = (
tract_performance_stats['bias'] / tract_performance_stats['mean_USFS_RPS']
)
# filter out any tracts that are all zeros in either CP or USFS because they produce a NaN Kendall's tau or an infinite normalized bias
tract_performance_stats = tract_performance_stats[
~(tract_performance_stats['x_fract_zero'] == 1)
]
tract_performance_stats = tract_performance_stats[
~(tract_performance_stats['y_fract_zero'] == 1)
]
tract_performance_stats = tract_performance_stats.rename({'GEOID_right': 'GEOID'}, axis=1)
tract_performance_stats.to_parquet(
f's3://carbonplan-ocr/evaluation/comparisons/tracts-stats-geo-{version}.parquet'
)
else:
tract_performance_stats = gpd.read_parquet(
f's3://carbonplan-ocr/evaluation/comparisons/tracts-stats-geo-{version}.parquet'
)
(warnings truncated: the spatial join reported a CRS mismatch between the building and census-tract geometries, kendalltau and numpy emitted SmallSampleWarning and RuntimeWarning messages for tracts with too few or constant-valued buildings, and pandas emitted repeated FutureWarnings about DataFrameGroupBy.apply operating on the grouping columns)
CPU times: user 49min, sys: 1min 18s, total: 50min 18s Wall time: 49min 22s
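The full workflow above needs a large-memory machine mainly because it loads geometries and attributes for every building in CONUS. For a rough, building-level spot check that skips the per-tract breakdown, one lighter-weight option (a sketch, not part of the evaluation itself) is to read only the two risk columns used above, which is on the order of a few GB rather than hundreds.
# sketch: read only the risk columns for a quick building-level spot check
risk_only = pd.read_parquet(
    f's3://carbonplan-ocr/output/fire-risk/vector/production/v{version}/geoparquet/consolidated-buildings.parquet',
    columns=['USFS_RPS', 'wind_risk_2011'],
).dropna()
# building-level (not tract-level) correlation across all of CONUS
print(apply_corr(risk_only['wind_risk_2011'], risk_only['USFS_RPS']))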
variable_name_dict = {
'tau': "Kendall's Tau of RPS",
'bias': 'RPS bias (CP - Scott)',
'corr': 'Correlation (CP vs. Scott)',
'median_wind_risk_2011': 'CP median RPS',
'median_USFS_RPS': 'Scott median RPS',
'mean_wind_risk_2011': 'CP mean RPS',
'mean_USFS_RPS': 'Scott mean RPS',
'normalized_bias': 'RPS normalized bias\n((CP - Scott)/Scott)',
}
# limits to use for plotting the different performance metrics
var_lims = {'tau': [-1, 1], 'bias': [-0.1, 0.1], 'normalized_bias': [-1, 1], 'corr': [-1, 1]}
cmaps = {'tau': 'PRGn', 'bias': 'RdBu_r', 'normalized_bias': 'RdBu_r', 'corr': 'PRGn'}
Visualizing risk from different datasets¶
Plotting the absolute risk values helps us understand the distribution of risk within each dataset.
for variable in [
'median_wind_risk_2011',
'median_USFS_RPS',
'mean_wind_risk_2011',
'mean_USFS_RPS',
]:
fig, axarr = plt.subplots(figsize=(14, 10))
states.plot(ax=axarr, color='white', edgecolor='black')
ax = tract_performance_stats.plot(
ax=axarr,
column=variable,
vmin=0,
vmax=1,
legend=True,
cmap='viridis',
legend_kwds={'shrink': 0.65, 'label': variable_name_dict[variable]},
)
Calculating comparison statistics¶
Calculating CONUS-wide comparison statistics. We report the average correlation but the median of the bias and normalized bias, because small differences in areas of very low risk can produce very large bias values.
print(f'Average census-tract level correlation: {tract_performance_stats["corr"].mean()}')
print(f'Median census-tract level bias: {tract_performance_stats["bias"].median()}')
print(f'Median census-tract level normalized bias: {tract_performance_stats["normalized_bias"].median()}')
Average census-tract level correlation: 0.7585414428564825 Median census-tract level bias: 0.0001532985916128382 Median census-tract level normalized bias: 0.2829974889755249
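For intuition on why we report the median rather than the mean of the bias statistics: in a tract where the WRC risk is essentially zero, even a tiny absolute difference produces an enormous normalized bias, and a handful of such tracts can dominate a mean. The toy numbers below (made up purely for illustration) mirror the bias and normalized-bias definitions used above.
# made-up per-tract means illustrating the robustness of the median
toy_tracts = pd.DataFrame(
    {
        'mean_wind_risk_2011': [0.0105, 0.0210, 0.00011],
        'mean_USFS_RPS': [0.0100, 0.0200, 0.00001],
    }
)
toy_tracts['bias'] = toy_tracts['mean_wind_risk_2011'] - toy_tracts['mean_USFS_RPS']
toy_tracts['normalized_bias'] = toy_tracts['bias'] / toy_tracts['mean_USFS_RPS']
print(toy_tracts['normalized_bias'].mean())  # ~3.4, dominated by the near-zero-risk tract
print(toy_tracts['normalized_bias'].median())  # 0.05, representative of the typical tract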
We can map the performance statistics across all census tracts within CONUS. Areas in white were places where at least one of the two datasets labeled every building as having zero fire risk.
for variable in ['tau', 'bias', 'normalized_bias', 'corr']:
fig, axarr = plt.subplots(figsize=(14, 10))
states.plot(ax=axarr, color='white', edgecolor='black')
ax = tract_performance_stats.plot(
ax=axarr,
column=variable,
vmin=var_lims[variable][0],
vmax=var_lims[variable][1],
legend=True,
cmap=cmaps[variable],
legend_kwds={'shrink': 0.65, 'label': variable_name_dict[variable]},
)
The maps above show what the performance looks like spatially, but a choropleth can be misleading: each census tract contains roughly the same number of buildings, so larger census tracts draw a disproportionate share of visual attention. We can instead look at the entire distribution of performance statistics, shown below. Note that the upper limit for the normalized bias is truncated.
for statistic in ['corr', 'bias', 'normalized_bias']:
plt.figure()
plt.hist(
tract_performance_stats[statistic].values,
bins=np.arange(var_lims[statistic][0], var_lims[statistic][1], var_lims[statistic][1] / 10),
)
# plt.legend()
plt.xlabel(variable_name_dict[statistic])
plt.ylabel('Count')
plt.title('CONUS-wide census-tract level comparison with Scott et al. (2024)')
Identifying hotspots of bias¶
Below we identify hotspots where our values are markedly different from those of the WRC project, so that we can plot individual regions and investigate why they differ. To find the places where the datasets differ most substantially, we need to control for the fact that fire risk varies by orders of magnitude across CONUS, which can produce dramatically high (or low) absolute or relative biases in areas of very high (or low) risk. So we identify tracts where both the relative and absolute bias are high (or low).
def find_hotspots(quantile, tract_df, direction):
hot_spot_thresholds = tract_df[['bias', 'normalized_bias']].quantile(quantile, axis=0)
if direction == 'high':
bias_filter = tract_df['bias'] > hot_spot_thresholds['bias']
normalized_bias_filter = (
tract_df['normalized_bias'] > hot_spot_thresholds['normalized_bias']
)
elif direction == 'low':
bias_filter = tract_df['bias'] < hot_spot_thresholds['bias']
normalized_bias_filter = (
tract_df['normalized_bias'] < hot_spot_thresholds['normalized_bias']
)
return tract_df[bias_filter & normalized_bias_filter]
hotspots = find_hotspots(0.98, tract_performance_stats, 'high')
fig, axarr = plt.subplots()
states.plot(ax=axarr, color='white', edgecolor='black')
hotspots.centroid.plot(ax=axarr)
<Axes: >
# buildings_in_census_tracts = gpd.read_parquet(
# 's3://carbonplan-risks/shapefiles/buildings_tracts_0.4.0_geo.parquet'
# )
# projected_cts = census_tracts.to_crs('epsg:5070')
Top 3 hotspot tracts where we overpredict relative to WRC are all around LA¶
high_bias_hotspots = find_hotspots(0.99, tract_performance_stats, 'high')
fig, axarr = plt.subplots()
states.plot(ax=axarr, color='white', edgecolor='black')
high_bias_hotspots.centroid.plot(ax=axarr)
<Axes: >
Save out some of the hotspots for further detailed raster-level inspection.
# high_bias_hotspots.to_file(f'high_bias_tracts_{version}.shp')
Now look at low hotspots - places where we estimate lower risk than WRC¶
low_bias_hotspots = find_hotspots(0.005, tract_performance_stats, 'low')
# low_bias_hotspots.to_file(f'low_bias_tracts_{version}.shp')
fig, axarr = plt.subplots()
states.plot(ax=axarr, color='white', edgecolor='black')
low_bias_hotspots.centroid.plot(ax=axarr)
/tmp/ipykernel_10075/2824415169.py:3: UserWarning: The GeoSeries you are attempting to plot is empty. Nothing has been displayed. low_bias_hotspots.centroid.plot(ax=axarr)
<Axes: >
Hotspots of low correlation¶
fig, axarr = plt.subplots()
states.plot(ax=axarr, color='white', edgecolor='black')
tract_performance_stats.sort_values('corr').head(10).centroid.plot(ax=axarr)
<Axes: >
# tract_performance_stats.sort_values('corr').head(10).to_file(
# f'corr_low_tracts_{version}.shp'
# )