Map pdfm features
Mapping PDFM Features and Predicted Housing Prices
Useful Resources¶
- Google's Population Dynamics Foundation Model (PDFM)
- Request access to PDFM embeddings here
- Zillow data can be accessed here
Installation¶
Uncomment and run the following cell to install the required libraries.
In [ ]:
Copied!
# %pip install "leafmap[maplibre]" scikit-learn
# %pip install "leafmap[maplibre]" scikit-learn
Import Libraries¶
In [ ]:
Copied!
import os
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
import leafmap.maplibregl as leafmap
import os
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
import leafmap.maplibregl as leafmap
Download Zillow Data¶
Download the Zillow home value data at the county level.
In [ ]:
Copied!
# Local cache path and remote source of the county-level Zillow home value
# index table used throughout the notebook.
zhvi_file = "data/zillow_home_value_index_by_county.csv"
zhvi_url = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_county.csv"
In [ ]:
Copied!
# Fetch the Zillow table only when there is no local copy yet, so re-running
# the notebook does not download it again.
if not os.path.exists(zhvi_file):
    download_file(zhvi_url, zhvi_file)
Process Zillow Data¶
In [ ]:
Copied!
# Read the two FIPS code columns as strings rather than letting pandas infer
# a numeric dtype for them.
fips_dtypes = {"StateCodeFIPS": "string", "MunicipalCodeFIPS": "string"}
zhvi_df = pd.read_csv(zhvi_file, dtype=fips_dtypes)

# Build a "geoId/<state code><municipal code>" key for every row and use it
# as the index; the later joins align on this index.
state_fips = zhvi_df["StateCodeFIPS"]
municipal_fips = zhvi_df["MunicipalCodeFIPS"]
zhvi_df.index = "geoId/" + state_fips + municipal_fips
zhvi_df.head()
In [ ]:
Copied!
# The county boundary file is not downloaded by this notebook. Stop early with
# a hint when it is missing (the message suggests it comes with the PDFM
# embeddings requested from Google).
county_geojson = "data/county.geojson"
if not os.path.exists(county_geojson):
    raise FileNotFoundError("Please request the embeddings from Google")
Load county boundaries¶
In [ ]:
Copied!
# Load the county boundaries and index them by their "place" column so they
# can be joined with the other tables by index.
county_gdf = gpd.read_file(county_geojson).set_index("place")
county_gdf.head()
Join home value data and county boundaries¶
In [ ]:
Copied!
# Index-aligned left join: every Zillow row is kept and gains the county
# columns (including "geometry") wherever the index values match.
df = zhvi_df.join(county_gdf, how="left")
# Promote the joined table to a GeoDataFrame so it can be mapped.
zhvi_gdf = gpd.GeoDataFrame(df, geometry="geometry")
zhvi_gdf.head()
In [ ]:
Copied!
# Column of the Zillow table to map (named by date), plus the identifying
# columns and geometry needed to draw it.
column = "2024-10-31"
home_value_columns = ["RegionName", "State", column, "geometry"]
gdf = zhvi_gdf[home_value_columns]
gdf.head()
Visualize home values in 2D¶
In [ ]:
Copied!
# 2D choropleth of the selected Zillow home-value column.
m = leafmap.Map(style="liberty")
# Keep the id of the style's first symbol layer: it is passed as `before_id`
# here and reused by the later map cells.
first_symbol_id = m.find_first_symbol_layer()["id"]
choropleth_options = {
    "column": column,
    "cmap": "Blues",
    "legend_title": "Median Home Value",
    "name": "Median Home Value",
    "before_id": first_symbol_id,
}
m.add_data(gdf, **choropleth_options)
m.add_layer_control()
m
Visualize home values in 3D¶
In [ ]:
Copied!
# 3D version of the home-value map: a pitched camera plus extruded polygons.
# `first_symbol_id` is the value obtained in the 2D map cell.
m = leafmap.Map(style="liberty", pitch=60)
extrusion_options = {
    "column": column,
    "cmap": "Blues",
    "legend_title": "Median Home Value",
    "name": "Median Home Value",
    "before_id": first_symbol_id,
    "extrude": True,
    "scale_factor": 3,
}
m.add_data(gdf, **extrusion_options)
m.add_layer_control()
m
Load PDFM county embeddings¶
In [ ]:
Copied!
# Location of the county-level PDFM embeddings CSV. It is read from disk as-is;
# nothing in this notebook downloads it.
embeddings_file_path = "data/county_embeddings.csv"
In [ ]:
Copied!
# Load the embeddings table, then key it by the "place" column so it lines up
# with the other place-indexed tables.
embeddings_df = pd.read_csv(embeddings_file_path)
embeddings_df = embeddings_df.set_index("place")
embeddings_df.head()
In [ ]:
Copied!
# Index-aligned left join of the embeddings with the county table, which
# supplies the "geometry" column used below.
df = embeddings_df.join(county_gdf, how="left")
# Wrap the result as a GeoDataFrame so individual features can be mapped.
embeddings_gdf = gpd.GeoDataFrame(df, geometry="geometry")
embeddings_gdf.head()
Visualize PDFM features¶
Select any of the 330 PDFM features (feature0 through feature329) to visualize.
In [ ]:
Copied!
# NOTE: this rebinds `column` and `gdf`, which previously referred to the
# Zillow selection; later cells see the PDFM feature selection instead.
column = "feature329"  # Change this to the feature you want to use
feature_view_columns = [column, "state", "county", "geometry"]
gdf = embeddings_gdf[feature_view_columns]
gdf.head()
In [ ]:
Copied!
# 2D choropleth of the selected PDFM feature; the feature name doubles as the
# legend title and the layer name.
m = leafmap.Map(style="liberty")
feature_layer_options = {
    "column": column,
    "cmap": "Blues",
    "legend_title": column,
    "name": column,
    "before_id": first_symbol_id,
}
m.add_data(gdf, **feature_layer_options)
m.add_layer_control()
m
In [ ]:
Copied!
# 3D (extruded) view of the selected PDFM feature. Note the scale factor is
# far smaller than the one used for the home-value maps.
m = leafmap.Map(style="liberty", pitch=60)
feature_extrusion_options = {
    "column": column,
    "cmap": "Blues",
    "legend_title": column,
    "name": column,
    "before_id": first_symbol_id,
    "extrude": True,
    "scale_factor": 0.00005,
}
m.add_data(gdf, **feature_extrusion_options)
m.add_layer_control()
m
Join Zillow and PDFM Data¶
In [ ]:
Copied!
# Inner join on the index: keep only the rows present in both the Zillow table
# and the embeddings table.
data = zhvi_df.join(other=embeddings_df, how="inner")
data.head()
In [ ]:
Copied!
# Target column (a Zillow date column) and the 330 predictor column names,
# "feature0" through "feature329".
label = "2024-10-31"  # Change this to the date you want to predict
embedding_features = ["feature" + str(i) for i in range(330)]
In [ ]:
Copied!
# Keep only the rows that have a value in the target column.
has_label = data[label].notna()
data = data[has_label]
Split Train and Test Data¶
In [ ]:
Copied!
# Reduce the table to the predictor columns plus the target, then separate
# the two.
data = data[[*embedding_features, label]]
X, y = data[embedding_features], data[label]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
Fit Linear Regression Model¶
In [ ]:
Copied!
# Fit a linear regression on the training rows (`fit` returns the estimator
# itself), then predict the target for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
Evaluate Linear Regression Model¶
In [ ]:
Copied!
# Put the held-out actual values ("y") next to the model's predictions
# ("y_pred"). The frame takes its index from `y_test`, which is what the later
# join with the county geometry relies on.
evaluation_df = pd.DataFrame({"y": y_test, "y_pred": y_pred})
# Compute and print the evaluation metrics once.
metrics = evaluate_model(evaluation_df)
print(metrics)
In [ ]:
Copied!
# Use the same 0 to 1,000,000 range on both axes so actual and predicted
# values are drawn on an identical scale. The plot is drawn once.
xy_lim = (0, 1_000_000)
plot_actual_vs_predicted(
    evaluation_df,
    xlim=xy_lim,
    ylim=xy_lim,
    title="Actual vs Predicted Home Values",
    x_label="Actual Home Value",
    y_label="Predicted Home Value",
)
Join predicted values with county boundaries¶
In [ ]:
Copied!
# NOTE: `gdf` and `column` are whatever the most recent selection cell left
# behind (the PDFM feature selection), so this join brings in that feature
# column together with state, county and geometry.
df = evaluation_df.join(gdf)
# Positive values mean the model over-predicted; negative, under-predicted.
df["difference"] = df["y_pred"] - df["y"]
evaluation_gdf = gpd.GeoDataFrame(df, geometry="geometry")
# Drop helper columns that are not needed on the result maps. "category" and
# "color" are presumably added to `gdf` by the earlier `add_data` calls (TODO
# confirm); errors="ignore" keeps this cell working when they are absent,
# e.g. if the map cells were skipped.
evaluation_gdf.drop(
    columns=["category", "color", column], inplace=True, errors="ignore"
)
evaluation_gdf.head()
Visualize actual home values¶
In [ ]:
Copied!
# Extruded map of the actual home values ("y") for the held-out rows.
m = leafmap.Map(style="liberty", pitch=60)
actual_layer_options = {
    "column": "y",
    "cmap": "Blues",
    "legend_title": "Actual Home Value",
    "name": "Actual Home Value",
    "before_id": first_symbol_id,
    "extrude": True,
    "scale_factor": 3,
}
m.add_data(evaluation_gdf, **actual_layer_options)
m.add_layer_control()
m
Visualize predicted home values¶
In [ ]:
Copied!
# Extruded map of the predicted home values ("y_pred"), using the same styling
# as the actual-value map so the two can be compared.
m = leafmap.Map(style="liberty", pitch=60)
predicted_layer_options = {
    "column": "y_pred",
    "cmap": "Blues",
    "legend_title": "Predicted Home Value",
    "name": "Predicted Home Value",
    "before_id": first_symbol_id,
    "extrude": True,
    "scale_factor": 3,
}
m.add_data(evaluation_gdf, **predicted_layer_options)
m.add_layer_control()
m
Visualize difference between predicted and actual home values¶
In [ ]:
Copied!
# Extruded map of the prediction error ("difference" = y_pred - y), drawn with
# the "coolwarm" colormap instead of "Blues".
m = leafmap.Map(style="liberty", pitch=60)
difference_layer_options = {
    "column": "difference",
    "cmap": "coolwarm",
    "legend_title": "y_pred-y",
    "name": "Difference",
    "before_id": first_symbol_id,
    "extrude": True,
    "scale_factor": 3,
}
m.add_data(evaluation_gdf, **difference_layer_options)
m.add_layer_control()
m