Comparing datasets over California¶
Within the state of California, we can compare our risk estimates with estimates from two other datasets: (1) the Wildfire Risk to Communities project (as in the other comparison notebook attached to this project) and (2) the California Fire Hazard Severity Zones from CAL FIRE. This notebook contains those comparisons.
from itertools import combinations
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import odc.geo.xr # noqa
import pandas as pd
import xarray as xr
from scipy.stats import kendalltau
from ocr import catalog
states = gpd.read_file('s3://carbonplan-risks/shapefiles/cb_2018_us_state_20m.zip')
ca = states[states['STUSPS'].isin(['CA'])]
census_tracts_df = (
    catalog.get_dataset('us-census-tracts')
    .query_geoparquet(
        "SELECT GEOID, NAME, ST_AsText(geometry) as geometry, bbox FROM read_parquet('{s3_path}')"
    )
    .df()
)
census_tracts = gpd.GeoDataFrame(
    census_tracts_df, geometry=gpd.GeoSeries.from_wkt(census_tracts_df['geometry'])
)
ca_census_tracts = census_tracts.loc[census_tracts.geometry.intersects(ca.union_all())]
ca_census_tracts.plot();
import duckdb
duckdb.sql("""INSTALL SPATIAL; LOAD SPATIAL; INSTALL HTTPFS; LOAD HTTPFS""")
version = 'v0.13.1'
dataset_uri = f's3://carbonplan-ocr/output/fire-risk/vector/production/{version}/geoparquet/buildings.parquet/**'
CA_FIPS_CODE = '06'
%%time
df = duckdb.sql(f"""
SELECT
* EXCLUDE (geometry),
ST_AsText(geometry) as geometry
FROM
read_parquet('{dataset_uri}', hive_partitioning = TRUE)
WHERE
state_fips = '{CA_FIPS_CODE}'
""").df()
gdf = gpd.GeoDataFrame(df, geometry=gpd.GeoSeries.from_wkt(df['geometry']))
gdf.head()
CPU times: user 1min 45s, sys: 8.74 s, total: 1min 54s Wall time: 1min 10s
| | USFS_RPS | wind_risk_2011 | wind_risk_2047 | burn_probability_2011 | burn_probability_2047 | conditional_risk_usfs | burn_probability_usfs_2011 | burn_probability_usfs_2047 | GEOID | state | county | bbox | county_fips | state_fips | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 0.000000 | 9.659727e-07 | 0.000050 | 0.000000 | 0.00000 | 0.00000 | 060014283021001 | CA | Alameda County | {'xmin': -122.2584475, 'ymin': 37.7427196, 'xm... | 001 | 06 | POLYGON ((-122.25835 37.74289, -122.25845 37.7... |
| 1 | 0.000000 | 0.000000 | 0.000000 | 6.095061e-06 | 0.000020 | 0.000000 | 0.00000 | 0.00000 | 060014003003006 | CA | Alameda County | {'xmin': -122.2584397, 'ymin': 37.8415002, 'xm... | 001 | 06 | POLYGON ((-122.25824 37.84156, -122.25839 37.8... |
| 2 | 0.000287 | 0.006351 | 0.008264 | 2.441366e-04 | 0.000318 | 26.012377 | 0.00015 | 0.00015 | 060014215003004 | CA | Alameda County | {'xmin': -122.258393, 'ymin': 37.8874763, 'xma... | 001 | 06 | POLYGON ((-122.2583 37.88748, -122.25825 37.88... |
| 3 | 0.000000 | 0.000000 | 0.000000 | 2.245132e-05 | 0.000043 | 0.000000 | 0.00000 | 0.00000 | 060014003002020 | CA | Alameda County | {'xmin': -122.2583875, 'ymin': 37.8371585, 'xm... | 001 | 06 | POLYGON ((-122.25827 37.83727, -122.25831 37.8... |
| 4 | 0.000000 | 0.000000 | 0.000000 | 4.574156e-05 | 0.000091 | 0.000000 | 0.00000 | 0.00000 | 060014012001017 | CA | Alameda County | {'xmin': -122.2583776, 'ymin': 37.8325442, 'xm... | 001 | 06 | POLYGON ((-122.25828 37.83261, -122.25838 37.8... |
ca_buildings = gdf.loc[gdf.geometry.intersects(ca.union_all())].copy()
ca_buildings.shape, gdf.shape
((13686013, 15), (13711017, 15))
Load in CAL FIRE hazard layer¶
%%time
ds_reprojected = catalog.get_dataset('calfire-fhsz-4326').to_xarray()
# get building coords
coords = np.column_stack([ca_buildings.geometry.centroid.x, ca_buildings.geometry.centroid.y])
# extract hazard for buildings
building_hazard = ds_reprojected.sel(
    latitude=xr.DataArray(coords[:, 1], dims='building'),
    longitude=xr.DataArray(coords[:, 0], dims='building'),
    method='nearest',
)
CPU times: user 16.2 s, sys: 975 ms, total: 17.2 s Wall time: 17.5 s
%%time
# Cast all nulls to zeros. Null areas are considered to have no hazard in the CAL FIRE
# approach, and casting them to zero gives them a lower value than all non-zero values,
# which prepares the data for a rank test like Kendall's Tau.
building_hazard = xr.where(building_hazard.band_data.isnull(), 0, building_hazard.band_data)
ca_buildings['cal-fire-hazard-zone'] = building_hazard.values[0]
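As a quick illustration of why this cast matters (a minimal sketch with made-up values, not the building data): scipy's kendalltau propagates NaNs under its default nan_policy, so leaving the no-hazard buildings as nulls would make the statistic undefined, while casting them to zero keeps those buildings in the ranking at the bottom of the hazard scale.
# Minimal sketch with made-up values (not the real building data), reusing the
# numpy and kendalltau imports from above. With NaNs left in place the default
# nan_policy ('propagate') makes tau undefined; casting NaN to 0 keeps the
# no-hazard buildings in the ranking at the low end of the scale.
toy_hazard = np.array([np.nan, np.nan, 1.0, 2.0, 3.0])  # CAL FIRE-style hazard, NaN = no mapped hazard
toy_risk = np.array([0.0, 0.01, 0.02, 0.05, 0.2])  # continuous risk-like values
print(kendalltau(toy_hazard, toy_risk))  # tau is nan
print(kendalltau(np.nan_to_num(toy_hazard), toy_risk))  # tau ~ 0.95 (ties among the zeros)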
ca_buildings_in_census_tracts = gpd.sjoin(ca_buildings, ca_census_tracts[['GEOID', 'geometry']])
ca_buildings_in_census_tracts.head()
%%time
path = f's3://carbonplan-ocr/evaluation/comparisons/buildings-tracts-california-{version}.parquet'
ca_buildings_in_census_tracts.to_parquet(path)
Note: The CAL FIRE hazard data is very clear that it does not estimate risk, which assesses the level of damage that a fire could cause. The main statistic we use in this analysis is a rank correlation called Kendall's Tau, which tests for concordance of data, asking: how similarly do two datasets rank areas on a scale from low to high? In other words, given two locations, do the two datasets agree about which one is higher and which is lower on a given scale? To make this comparison we assume that the fire hazard scale from CAL FIRE and the fire risk to potential structures scale from our dataset can be meaningfully compared, and that high values on either scale arise from similar causes. We think this is a reasonable assumption given the similar attributes that each modeling system ingests: high-resolution vegetation maps, a dynamic fire model, and wind.
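As a concrete illustration (a sketch with made-up numbers, not our data): Kendall's Tau only asks whether two sources order a set of locations the same way, not whether their values agree, so two datasets on entirely different scales can still be perfectly concordant.
# Sketch with made-up scores for five hypothetical locations, reusing the
# kendalltau import from above. The two sources use different scales but
# rank the locations identically, so tau is 1 even though the values differ.
source_a = [0.002, 0.01, 0.05, 0.4, 0.9]  # e.g. a continuous risk-like score
source_b = [1, 2, 3, 4, 5]  # e.g. an ordinal hazard class
tau, p_value = kendalltau(source_a, source_b)
print(tau)  # 1.0: perfect concordance despite different scales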
def apply_kendall_tau(x, y, variant):
    # variant 'c' (Stuart's tau-c) adjusts for the two variables taking different
    # numbers of possible values (e.g. ordinal hazard classes vs. continuous risk);
    # variant 'b' is scipy's default
    tau, p_value = kendalltau(x, y, variant=variant)
    return pd.Series({f'tau_{variant}': tau, f'p_value_{variant}': p_value})
ca_buildings_in_census_tracts = ca_buildings_in_census_tracts.rename(
    {'GEOID_right': 'GEOID'}, axis=1
)
ds_name_dict = {
    'CarbonPlan': 'wind_risk_2011',
    'Scott (2024)': 'USFS_RPS',
    'CAL FIRE': 'cal-fire-hazard-zone',
}
tract_performance_stats = ca_census_tracts[['GEOID', 'geometry']]
for ds1_name, ds2_name in combinations(ds_name_dict.keys(), 2):
    new_df = ca_buildings_in_census_tracts.groupby('GEOID').apply(
        lambda g: apply_kendall_tau(g[ds_name_dict[ds1_name]], g[ds_name_dict[ds2_name]], 'c')
    )
    new_df.columns = [f'{ds1_name} - {ds2_name} {column_name}' for column_name in new_df.columns]
    tract_performance_stats = tract_performance_stats.merge(new_df, on='GEOID')
tract_performance_stats.to_parquet(
    f's3://carbonplan-ocr/evaluation/comparisons/california-tract-stats-{version}.parquet'
)
variable_name_dict = {
    'tau_c': "Kendall's Tau (c-variant) of RPS",
    'bias': 'RPS bias (CP - WRC)',
    'corr': 'Correlation',
    'median_wind_risk_2011': 'CP median RPS',
    'median_USFS_RPS': 'WRC median RPS',
    'mean_wind_risk_2011': 'CP mean RPS',
    'mean_USFS_RPS': 'WRC mean RPS',
    'normalized_bias': 'RPS normalized bias\n((CP - WRC)/WRC)',
}
var_lims = {
    'tau_c': [-1, 1],
    'bias': [-0.1, 0.1],
    'normalized_bias': [-1, 1],
    'corr': [-1, 1],
}
cmaps = {
    'tau_c': 'PRGn',
    'bias': 'RdBu_r',
    'normalized_bias': 'RdBu_r',
    'corr': 'PRGn',
}
When we calculate the concordance between each pair of datasets for all census tracts in California, we see that Kendall's Tau is typically above zero, indicating some concordance.
plt.hist(
    tract_performance_stats['CarbonPlan - Scott (2024) tau_c'].values,
    bins=np.arange(-1, 1, 0.1),
    label='OCR - Scott',
    alpha=0.5,
)
plt.hist(
    tract_performance_stats['Scott (2024) - CAL FIRE tau_c'].values,
    bins=np.arange(-1, 1, 0.1),
    label='Scott - CAL FIRE',
    alpha=0.5,
)
plt.hist(
    tract_performance_stats['CarbonPlan - CAL FIRE tau_c'].values,
    bins=np.arange(-1, 1, 0.1),
    label='OCR - CAL FIRE',
    alpha=0.5,
)
plt.legend()
plt.xlabel("Kendall's Tau")
plt.ylabel('Count')
plt.title("Kendall's Tau of three fire datasets")
variable = 'tau_c'
for ds1_name, ds2_name in combinations(ds_name_dict.keys(), 2):
    print(
        f'{ds1_name} - {ds2_name} {variable} : {tract_performance_stats[f"{ds1_name} - {ds2_name} {variable}"].mean()}'
    )
Plotting the data across the state, we see that areas of higher concordance tend to cluster in smaller tracts in more developed areas.
for variable in ['tau_c']:
    fig, axarr = plt.subplots(ncols=3, figsize=(20, 8))
    for i, (ds1_name, ds2_name) in enumerate(combinations(ds_name_dict.keys(), 2)):
        ca.plot(ax=axarr[i], color='white', edgecolor='black')
        ax = tract_performance_stats.plot(
            ax=axarr[i],
            column=f'{ds1_name} - {ds2_name} {variable}',
            vmin=var_lims[variable][0],
            vmax=var_lims[variable][1],
            legend=True,
            cmap=cmaps[variable],
            legend_kwds={'shrink': 0.65, 'label': variable_name_dict[variable]},
        )
        tract_performance_stats[
            tract_performance_stats[f'{ds1_name} - {ds2_name} {variable}'].isnull()
        ].plot(ax=axarr[i], color='grey')
        axarr[i].set_title(f'{ds1_name} compared to {ds2_name}')