import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# %pip install geopandas
import geopandas as gpd

import warnings 

# Silence every warning category (Warning is the base class of all of them)
# for the remainder of the session.
warnings.filterwarnings(action='ignore', category=Warning)

# Load the wildfire structure records from the CSV export that sits one
# directory above the working directory.
csv_path = '../california_wildfire.csv'
df = pd.read_csv(csv_path)
df.describe()

df.head()

# Column labels first, then the (rows, columns) shape, one per line.
print(df.columns, df.shape, sep='\n')

Index(['_id', 'OBJECTID', '* Damage', '* Street Number', '* Street Name',
       '* Street Type (e.g. road, drive, lane, etc.)',
       'Street Suffix (e.g. apt. 23, blding C)', '* City', 'State', 'Zip Code',
       '* CAL FIRE Unit', 'County', 'Community', 'Battalion',
       '* Incident Name', 'Incident Number (e.g. CAAEU 123456)',
       'Incident Start Date', 'Hazard Type',
       'If Affected 1-9% - Where did fire start?',
       'If Affected 1-9% - What started fire?',
       'Structure Defense Actions Taken', '* Structure Type',
       'Structure Category', '# Units in Structure (if multi unit)',
       '# of Damaged Outbuildings < 120 SQFT',
       '# of Non Damaged Outbuildings < 120 SQFT', '* Roof Construction',
       '* Eaves', '* Vent Screen', '* Exterior Siding', '* Window Pane',
       '* Deck/Porch On Grade', '* Deck/Porch Elevated',
       '* Patio Cover/Carport Attached to Structure',
       '* Fence Attached to Structure', 'Distance - Propane Tank to Structure',
       'Distance - Residence to Utility/Misc Structure &gt; 120 SQFT',
       'Fire Name (Secondary)', 'APN (parcel)',
       'Assessed Improved Value (parcel)', 'Year Built (parcel)',
       'Site Address (parcel)', 'GLOBALID', 'Latitude', 'Longitude', 'x', 'y'],
      dtype='object')
(100230, 47)

# Columns removed from the CSV frame before any analysis is done.
columns_to_drop = [
    'OBJECTID',
    '_id',
    'Street Suffix (e.g. apt. 23, blding C)',
    'State',
    'Zip Code',
    'Community',
    'Battalion',
    'Incident Number (e.g. CAAEU 123456)',
    'Hazard Type',
    'Fire Name (Secondary)',
    'Distance - Residence to Utility/Misc Structure &gt; 120 SQFT',
    'APN (parcel)',
    'Distance - Propane Tank to Structure',
    'GLOBALID',
    'Site Address (parcel)',
    '# Units in Structure (if multi unit)',
    '* Structure Type',
    '* Fence Attached to Structure',
    'x',
    'y',
    '* City',
    '* Incident Name',
    'If Affected 1-9% - Where did fire start?',
    'If Affected 1-9% - What started fire?',
    '# of Damaged Outbuildings < 120 SQFT',
    '# of Non Damaged Outbuildings < 120 SQFT',
]
df = df.drop(columns=columns_to_drop)

# Fill missing defense actions with the literal string 'None' *before* the
# blanket dropna(), so a blank in this column alone does not discard the row.
defense_col = 'Structure Defense Actions Taken'
df[defense_col] = df[defense_col].fillna('None')

# Every row that still has a missing value in any remaining column is dropped.
df = df.dropna()
print(df.isnull().values.any())

# Keep the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')
print(df.duplicated().sum())
df.head()

False
0

# Load the GeoJSON export of the dataset into a GeoDataFrame.
geojson_path = '../geo_california_wildfire.geojson'
geo_df = gpd.read_file(geojson_path)
# NOTE(review): this summarizes the cleaned CSV frame `df`, not the freshly
# loaded `geo_df` -- confirm that is the intended table to display here.
df.describe()

# Column labels first, then the (rows, columns) shape, one per line.
print(geo_df.columns, geo_df.shape, sep='\n')

Index(['OBJECTID', 'DAMAGE', 'STREETNUMBER', 'STREETNAME', 'STREETTYPE',
       'STREETSUFFIX', 'CITY', 'STATE', 'ZIPCODE', 'CALFIREUNIT', 'COUNTY',
       'COMMUNITY', 'BATTALION', 'INCIDENTNAME', 'INCIDENTNUM',
       'INCIDENTSTARTDATE', 'HAZARDTYPE', 'WHEREFIRESTARTEDONSTRUCTURE',
       'WHATDIDFIRESTARTFROM', 'DEFENSIVEACTIONS', 'STRUCTURETYPE',
       'STRUCTURECATEGORY', 'NUMBEROFUNITPERSTRUCTURE',
       'NOOUTBUILDINGSDAMAGED', 'NOOUTBUILDINGSNOTDAMAGED', 'ROOFCONSTRUCTION',
       'EAVES', 'VENTSCREEN', 'EXTERIORSIDING', 'WINDOWPANE',
       'DECKPORCHONGRADE', 'DECKPORCHELEVATED', 'PATIOCOVERCARPORT',
       'FENCEATTACHEDTOSTRUCTURE', 'PROPANETANKDISTANCE',
       'UTILITYMISCSTRUCTUREDISTANCE', 'FIRENAME', 'APN',
       'ASSESSEDIMPROVEDVALUE', 'YEARBUILT', 'SITEADDRESS', 'GLOBALID',
       'Latitude', 'Longitude', 'geometry'],
      dtype='object')
(100230, 45)

# Columns removed from the GeoJSON frame (similar to the .csv clean-up).
geo_columns_to_drop = [
    "OBJECTID",
    "STATE",
    "ZIPCODE",
    "COMMUNITY",
    "BATTALION",
    "INCIDENTNUM",
    "HAZARDTYPE",
    "FIRENAME",
    "APN",
    "SITEADDRESS",
    "GLOBALID",
    "PROPANETANKDISTANCE",
    "UTILITYMISCSTRUCTUREDISTANCE",
    "NOOUTBUILDINGSDAMAGED",
    "NOOUTBUILDINGSNOTDAMAGED",
    "NUMBEROFUNITPERSTRUCTURE",
]

# Rows with a missing value in ANY column are discarded first; this happens
# before the column drop, so gaps in soon-to-be-dropped columns count too.
geo_df = geo_df.dropna()

# Convert the build year to a numeric dtype.
geo_df["YEARBUILT"] = pd.to_numeric(geo_df["YEARBUILT"])

geo_df = geo_df.drop(columns=geo_columns_to_drop)
geo_df.head()

# Bar chart of how many records fall into each damage category.
damages = df['* Damage'].value_counts()
ax = sns.barplot(x=damages.index, y=damages.values, color="orange")
ax.set_xlabel('Damage Types')
ax.set_ylabel('Counts')
ax.set_title('Bar Plot of Damage Type Counts')
plt.xticks(rotation=45)
plt.show()

# Parse the incident start date; values that cannot be parsed become NaT.
parsed_start = pd.to_datetime(df['Incident Start Date'], errors='coerce')
df['Incident Start Date'] = parsed_start

# Discard the rows whose date failed to parse.
df = df.dropna(subset=['Incident Start Date'])

# Calendar year of each incident start.
df['Year'] = df['Incident Start Date'].dt.year
df['Year'].value_counts()

# Records per year, ordered chronologically for the line plot.
incident_counts_by_year = df['Year'].value_counts().sort_index()

ax = sns.lineplot(x=incident_counts_by_year.index, y=incident_counts_by_year.values)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Count Over Time')
plt.xticks(rotation=45)  # Rotate x-axis labels if needed
plt.show()

# Month name of each incident start date.
df['Month'] = df['Incident Start Date'].dt.month_name()
df['Month'].value_counts()

# Calendar order, used so the bars are not sorted alphabetically.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# One row per month with its record count.
incident_counts_by_month = df.groupby('Month').size().reset_index(name='count')
incident_counts_by_month['Month'] = pd.Categorical(
    incident_counts_by_month['Month'],
    categories=month_order,
    ordered=True,
)
incident_counts_by_month = incident_counts_by_month.sort_values('Month')

ax = sns.barplot(data=incident_counts_by_month, x='Month', y='count', color="green")
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.set_title('Fire Count per Month')
plt.xticks(rotation=45)  # Rotate x-axis labels if needed
plt.show()

# Reference year used to turn a build year into a building age.
current_year = 2025

# Keep only records whose build year is above 100; the column's summary
# statistics show a minimum of 0, so this removes such placeholder values.
# .copy() makes Year_Built an independent frame, so the columns added below
# are not written onto a filtered slice of df (chained assignment).
Year_Built = df[df['Year Built (parcel)'] > 100].copy()
Year_Built['Year_Built'] = Year_Built['Year Built (parcel)']

# Building age in years relative to current_year.
Year_Built_value_counts = current_year - Year_Built['Year_Built']

plt.figure(figsize=(10,6))
sns.histplot(Year_Built_value_counts, kde=True, bins=100)
plt.title("Distribution of Building Age")
plt.xlabel("Building Age (Years)")
plt.ylabel("Count")
plt.xlim(0, 300)  # only ages between 0 and 300 years are shown
plt.show()

# Treat empty / whitespace-only roof entries as missing and drop them.
# np.nan is used as the replacement value instead of None: older pandas
# releases interpret value=None as "no value given" and fall back to
# pad-filling, so None does not mean "missing" on every version.
Year_Built['Roof'] = Year_Built['* Roof Construction'].replace(r'^\s*$', np.nan, regex=True)
Year_Built = Year_Built.dropna(subset=['Roof'])

Year_Built['Roof'].value_counts()

# Count of records per roof construction type, most frequent first.
plt.figure(figsize=(10,6))
sns.countplot(x='Roof', data=Year_Built,
              order=Year_Built['Roof'].value_counts().index,
              color="teal")
plt.title("Distribution of Building Materials")
plt.xlabel("Building Material")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Same roof-type counts as a grouped chart, split by damage category;
# bars are ordered by overall roof-type frequency.
roof_order = Year_Built['Roof'].value_counts().index
plt.figure(figsize=(10,6))
ax = sns.countplot(data=Year_Built, x='Roof', hue='* Damage', order=roof_order)
ax.set_title("Damage Type suffered for each Building Material")
ax.set_xlabel("Building Material")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Count of each damage category per build year, one column per category.
# fill_value=0 records "no such records that year" as a zero count rather
# than NaN, so the lines are not broken by missing combinations.
damage_counts = (
    Year_Built.groupby('Year_Built')['* Damage']
    .value_counts()
    .unstack(fill_value=0)
)

# DataFrame.plot creates its own figure when given figsize, so no separate
# plt.figure() call is made (one would only leave an empty extra figure).
ax = damage_counts.plot(kind='line', figsize=(12, 6), marker='o')
ax.set_xlabel('Year Built')
ax.set_ylabel('Count of Damage Types')
ax.set_title('Damage Types Over the Years')
ax.legend(title="Damage Type")
plt.show()

<Figure size 1200x600 with 0 Axes>

# df2 is another name for the same frame as df (no copy is made).
df2 = df

# Build year and damage category only, without missing values, under
# shorter column names.
df2_filtered = (
    df2[['Year Built (parcel)', '* Damage']]
    .dropna()
    .rename(columns={'Year Built (parcel)': 'Year Built', '* Damage': 'Damage'})
)
df2_filtered['Year Built'] = df2_filtered['Year Built'].astype(int)

# Round each build year down to the start of its decade.
df2_filtered['Decade Built'] = df2_filtered['Year Built'].floordiv(10).mul(10)

# Filter after 1870
built_1870_or_later = df2_filtered['Decade Built'] >= 1870
df2_filtered_1870 = df2_filtered[built_1870_or_later]

# Denominator: total records per decade.
houses_per_decade = df2_filtered_1870.groupby('Decade Built').size()

# Numerator: records per (decade, damage category); absent pairs count as 0.
damage_distribution = (
    df2_filtered_1870.groupby(['Decade Built', 'Damage'])
    .size()
    .unstack(fill_value=0)
)

# Row-wise division, so each decade's fractions add up to 1.
fraction_damage = damage_distribution.div(houses_per_decade, axis='index')

custom_colors = ['#e6194B',  '#3cb44b',  '#ffe119',  '#4363d8', '#f58231', '#911eb4']

ax = fraction_damage.plot(kind='bar', stacked=True, figsize=(12, 6), color=custom_colors)
ax.set_title('Fraction of Damage Types per Decade Built (1870 and Onwards)')
ax.set_xlabel('Decade Built')
ax.set_ylabel('Ratio of Houses')
ax.legend(title='Damage Type')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

import folium
from folium.plugins import HeatMap

# Base map centred on [37.5, -119.5] at zoom level 6.
m = folium.Map(location=[37.5, -119.5], zoom_start=6)

# Colour stops for the heat layer, keyed by relative intensity.
gradient = {0.2: "yellow", 0.5: "orange", 0.8: "red", 1.0: "#800000"}

# [latitude, longitude] pairs for every record that has both coordinates.
coordinates = geo_df[["Latitude", "Longitude"]].dropna()
heat_data = coordinates.values.tolist()

HeatMap(heat_data, radius=10, gradient=gradient).add_to(m)
# static image attached since heatmap cannot be rendered on GitHub

<folium.plugins.heat_map.HeatMap at 0x17880ef10>

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random Forest Model
# Predict the damage category of a structure from its construction
# attributes, assessed value, age and location.

# Create feature set
features = ['Building Age', '* Roof Construction', '* Eaves', '* Vent Screen',
            '* Exterior Siding', '* Window Pane', '* Deck/Porch On Grade',
            '* Deck/Porch Elevated', '* Patio Cover/Carport Attached to Structure',
            'Assessed Improved Value (parcel)', 'Latitude', 'Longitude']
target = '* Damage'

# Preprocess data
# Keep records with a positive build year. .copy() makes df an independent
# frame so the 'Building Age' column below is not assigned onto a filtered
# slice (chained assignment).
df = df[df['Year Built (parcel)'] > 0].copy()
df['Building Age'] = 2025 - df['Year Built (parcel)'] # Using present year as a reference
df_model = df[features + [target]].dropna()

le = LabelEncoder() # Convert to numerical values for modeling
categorical_features = ['* Roof Construction', '* Eaves', '* Vent Screen', '* Exterior Siding', '* Window Pane',
                        '* Deck/Porch On Grade', '* Deck/Porch Elevated', '* Patio Cover/Carport Attached to Structure']
# The same encoder object is re-fitted for every column, so after this
# section `le` only holds the mapping of the last column it was fitted on
# (the target).
for feature in categorical_features:
    df_model[feature] = le.fit_transform(df_model[feature])

df_model[target] = le.fit_transform(df_model[target])

# 80/20 train/test split with a fixed seed for reproducibility.
X = df_model[features]
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Overall accuracy is stored; the per-class report is what gets printed.
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.18      0.26       408
           1       0.94      0.96      0.95      6168
           2       0.73      0.42      0.54        26
           3       0.25      0.05      0.09        56
           4       0.16      0.04      0.06       100
           5       0.90      0.97      0.93      3301

    accuracy                           0.92     10059
   macro avg       0.57      0.44      0.47     10059
weighted avg       0.90      0.92      0.90     10059

# Annotated confusion matrix of the test-set predictions
# (confusion_matrix is called with the true labels first).
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Importance score of each model input, paired with its column name.
feature_names = X.columns
feature_importance = rf_model.feature_importances_

plt.figure(figsize=(8,6))
ax = sns.barplot(x=feature_importance, y=feature_names, color='#042E4C')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Random Forest Feature Importance')
plt.show()

	_id	OBJECTID	* Street Number	Zip Code	# Units in Structure (if multi unit)	# of Damaged Outbuildings < 120 SQFT	# of Non Damaged Outbuildings < 120 SQFT	Assessed Improved Value (parcel)	Year Built (parcel)	Latitude	Longitude	x	y
count	100230.000000	100230.000000	9.581000e+04	47429.000000	31184.000000	31085.000000	31073.00000	9.419500e+04	69812.000000	100230.000000	100230.000000	1.002300e+05	1.002300e+05
mean	50115.500000	50227.779717	3.886722e+04	46309.699973	0.433299	0.087566	0.12152	7.337022e+05	1672.283862	38.322953	-121.179297	-1.348962e+07	4.629002e+06
std	28934.053078	29107.678335	5.271695e+06	47467.653484	34.608767	0.462729	0.52558	8.603013e+06	708.451814	2.019086	1.538342	1.712474e+05	2.825063e+05
min	1.000000	1.000000	0.000000e+00	0.000000	0.000000	0.000000	0.00000	0.000000e+00	0.000000	32.592548	-123.774580	-1.377852e+07	3.841346e+06
25%	25058.250000	25058.250000	7.232500e+02	0.000000	0.000000	0.000000	0.00000	5.937000e+04	1944.000000	37.350926	-122.316162	-1.361617e+07	4.488135e+06
50%	50115.500000	50115.500000	4.308500e+03	0.000000	0.000000	0.000000	0.00000	1.455510e+05	1972.000000	38.692955	-121.600277	-1.353648e+07	4.677785e+06
75%	75172.750000	75172.750000	1.000300e+04	95667.000000	0.000000	0.000000	0.00000	3.109355e+05	1987.000000	39.763874	-120.509278	-1.341503e+07	4.831688e+06
max	100230.000000	101221.000000	1.410065e+09	96311.000000	6101.000000	40.000000	20.00000	1.220403e+09	2022.000000	41.991195	-116.418163	-1.295961e+07	5.159661e+06

	_id	OBJECTID	* Damage	* Street Number	* Street Name	* Street Type (e.g. road, drive, lane, etc.)	Street Suffix (e.g. apt. 23, blding C)	* City	State	Zip Code	...	Fire Name (Secondary)	APN (parcel)	Assessed Improved Value (parcel)	Year Built (parcel)	Site Address (parcel)	GLOBALID	Latitude	Longitude	x	y
0	1	1	No Damage	8376.0	Quail Canyon	Road	NaN	Winters	CA	NaN	...	Quail	0101090290	510000.0	1997.0	8376 QUAIL CANYON RD VACAVILLE CA 95688	e1919a06-b4c6-476d-99e5-f0b45b070de8	38.474960	-122.044465	-1.358593e+07	4.646741e+06
1	2	2	Affected (1-9%)	8402.0	Quail Canyon	Road	NaN	Winters	CA	NaN	...	Quail	0101090270	573052.0	1980.0	8402 QUAIL CANYON RD VACAVILLE CA 95688	b090eeb6-5b18-421e-9723-af7c9144587c	38.477442	-122.043252	-1.358579e+07	4.647094e+06
2	3	3	No Damage	8430.0	Quail Canyon	Road	NaN	Winters	CA	NaN	...	Quail	0101090310	350151.0	2004.0	8430 QUAIL CANYON RD VACAVILLE CA 95688	268da70b-753f-46aa-8fb1-327099337395	38.479358	-122.044585	-1.358594e+07	4.647366e+06
3	4	4	No Damage	3838.0	Putah Creek	Road	NaN	Winters	CA	NaN	...	Quail	0103010240	134880.0	1981.0	3838 PUTAH CREEK RD WINTERS CA 95694	64d4a278-5ee9-414a-8bf4-247c5b5c60f9	38.487313	-122.015115	-1.358266e+07	4.648497e+06
4	5	5	No Damage	3830.0	Putah Creek	Road	NaN	Winters	CA	NaN	...	Quail	0103010220	346648.0	1980.0	3830 PUTAH CREEK RD WINTERS CA 95694	1b44b214-01fd-4f06-b764-eb42a1ec93d7	38.485636	-122.016122	-1.358277e+07	4.648259e+06

	* Damage	* Street Number	* Street Name	* Street Type (e.g. road, drive, lane, etc.)	* CAL FIRE Unit	County	Incident Start Date	Structure Defense Actions Taken	Structure Category	* Roof Construction	...	* Vent Screen	* Exterior Siding	* Window Pane	* Deck/Porch On Grade	* Deck/Porch Elevated	* Patio Cover/Carport Attached to Structure	Assessed Improved Value (parcel)	Year Built (parcel)	Latitude	Longitude
0	No Damage	8376.0	Quail Canyon	Road	LNU	Solano	6/6/2020 12:00:00 AM	None	Single Residence	Asphalt	...	Mesh Screen <= 1/8""	Wood	Single Pane	Wood	Wood	No Patio Cover/Carport	510000.0	1997.0	38.474960	-122.044465
1	Affected (1-9%)	8402.0	Quail Canyon	Road	LNU	Solano	6/6/2020 12:00:00 AM	Hand Crew Fuel Break	Single Residence	Asphalt	...	Mesh Screen <= 1/8""	Wood	Multi Pane	Masonry/Concrete	No Deck/Porch	No Patio Cover/Carport	573052.0	1980.0	38.477442	-122.043252
2	No Damage	8430.0	Quail Canyon	Road	LNU	Solano	6/6/2020 12:00:00 AM	None	Single Residence	Asphalt	...	Mesh Screen > 1/8""	Wood	Single Pane	No Deck/Porch	No Deck/Porch	No Patio Cover/Carport	350151.0	2004.0	38.479358	-122.044585
3	No Damage	3838.0	Putah Creek	Road	LNU	Solano	6/6/2020 12:00:00 AM	None	Single Residence	Asphalt	...	Mesh Screen > 1/8""	Wood	Single Pane	No Deck/Porch	No Deck/Porch	Combustible	134880.0	1981.0	38.487313	-122.015115
4	No Damage	3830.0	Putah Creek	Road	LNU	Solano	6/6/2020 12:00:00 AM	None	Single Residence	Tile	...	Mesh Screen > 1/8""	Wood	Multi Pane	Wood	Wood	Combustible	346648.0	1980.0	38.485636	-122.016122

	* Street Number	Assessed Improved Value (parcel)	Year Built (parcel)	Latitude	Longitude
count	5.914000e+04	5.914000e+04	59140.000000	59140.000000	59140.000000
mean	5.406353e+04	8.358812e+05	1676.543236	38.189823	-121.077250
std	6.709125e+06	1.023007e+07	703.690732	2.120131	1.551404
min	0.000000e+00	0.000000e+00	0.000000	32.592548	-123.683036
25%	9.590000e+02	6.000000e+04	1945.000000	37.138162	-122.143732
50%	5.110000e+03	1.445120e+05	1971.000000	38.665997	-121.598170
75%	9.865000e+03	3.111675e+05	1986.000000	39.763468	-119.968393
max	1.410065e+09	1.220403e+09	2022.000000	41.935088	-116.623886

	DAMAGE	STREETNUMBER	STREETNAME	STREETTYPE	CITY	CALFIREUNIT	COUNTY	INCIDENTNAME	INCIDENTSTARTDATE	...	WINDOWPANE	DECKPORCHONGRADE	DECKPORCHELEVATED	PATIOCOVERCARPORT	FENCEATTACHEDTOSTRUCTURE	ASSESSEDIMPROVEDVALUE	YEARBUILT	Latitude	Longitude	geometry
74513	Destroyed (>50%)	4300.0	High	Road	Montague	SKU	Siskiyou	High	Fri, 24 May 2019 00:00:00 GMT	...	Single Pane	Wood	No Deck/Porch	No Patio Cover/Carport	No Fence	15000.0	1977.0	41.669654	-122.544020	POINT (-13641537.957 5111622.953)
74514	Destroyed (>50%)	2980.0	16 State Highway	None	Ramsey	LNU	Yolo	Sand	Sat, 08 Jun 2019 00:00:00 GMT	...	No Windows	No Deck/Porch	No Deck/Porch	No Patio Cover/Carport	No Fence	31486.0	1923.0	38.894719	-122.249752	POINT (-13608780.135 4706602.181)
74515	Destroyed (>50%)	2559.0	Rumsey Canyon	Road	Rumsey	LNU	Yolo	Sand	Sat, 08 Jun 2019 00:00:00 GMT	...	Single Pane	No Deck/Porch	No Deck/Porch	No Patio Cover/Carport	No Fence	242760.0	1975.0	38.902134	-122.246907	POINT (-13608463.4 4707662.808)
74516	Destroyed (>50%)	2756.0	Rumsey Canyon	Road	Rumsey	LNU	Yolo	Sand	Sat, 08 Jun 2019 00:00:00 GMT	...	Unknown	No Deck/Porch	No Deck/Porch	Non Combustible	No Fence	46552.0	0.0	38.900564	-122.249926	POINT (-13608799.488 4707438.274)
74517	Destroyed (>50%)	2756.0	Rumsey Canyon	Road	Rumsey	LNU	Yolo	Sand	Sat, 08 Jun 2019 00:00:00 GMT	...	Unknown	No Deck/Porch	No Deck/Porch	Unknown	No Fence	46552.0	0.0	38.900753	-122.250288	POINT (-13608839.798 4707465.236)

COGS 108 - A Structural Characteristic: California Wildfire Damage

Permissions

Names

Abstract

Research Question

Background and Prior Work

References

Hypothesis

Data

Data overview

California Wildfire Dataset

Setup Evaluation and Data Cleaning

Results

Exploratory Data Analysis

Section 1 of EDA - Fire Analysis

Section 2 of EDA - Structural Analysis

Section 3 of EDA - Geospatial Analysis

Data Analysis and Results

Ethics & Privacy

Discussion and Conclusion

Team Contributions