Zillow home value
Predicting US Housing Prices at the Zip Code Level Using Google's Population Dynamics Foundation Model and Zillow Data
Useful Resources
- Google's Population Dynamics Foundation Model (PDFM)
- Request access to PDFM embeddings here
- Zillow data can be accessed here
Acknowledgements
This notebook is adapted from the PDFM tutorial. Credit goes to the authors of the PDFM tutorial.
Installation
Uncomment and run the following cell to install the required libraries.
In [ ]:
Copied!
# %pip install leafmap scikit-learn
# %pip install leafmap scikit-learn
Import Libraries
In [ ]:
Copied!
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
Download Zillow Data
In [ ]:
Copied!
# Local path where the Zillow Home Value Index CSV is cached.
zhvi_file = "data/zillow_home_value_index_by_zipcode.csv"
# Remote location of the same CSV (GitHub release asset).
zhvi_url = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_zipcode.csv"
In [ ]:
Copied!
# Fetch the Zillow CSV only when there is no local copy yet, so re-running
# this cell does not download the file again.
if not os.path.exists(zhvi_file):
    # Make sure the destination folder exists before writing into it.
    # NOTE(review): download_file may already create it; this is a harmless
    # safeguard either way.
    zhvi_dir = os.path.dirname(zhvi_file)
    if zhvi_dir:
        os.makedirs(zhvi_dir, exist_ok=True)
    download_file(zhvi_url, zhvi_file)
Process Zillow Data
In [ ]:
Copied!
# Load the Zillow table. RegionName is read as a string so that it is never
# parsed as an integer.
zhvi_df = pd.read_csv(zhvi_file, dtype={"RegionName": "string"})
# Re-index the rows as "zip/<code>" so the table can later be joined on its
# index. Codes are left-padded to five digits in case the CSV stored them
# without leading zeros (e.g. "1001" instead of "01001"); values that are
# already five characters long are unchanged.
# NOTE(review): presumably the embeddings use 5-digit "zip/NNNNN" keys - confirm.
zhvi_df.index = (
    zhvi_df["RegionName"].str.zfill(5).apply(lambda code: f"zip/{code}")
)
zhvi_df.head()
Request access to PDFM Embeddings
In [ ]:
Copied!
# Expected local location of the PDFM embeddings CSV; the file is not
# downloaded by this notebook and must be placed here manually.
embeddings_file_path = "data/zcta_embeddings.csv"
To request access to PDFM embeddings, please follow the instructions here.
In [ ]:
Copied!
# Stop early with an explicit error when the embeddings CSV is missing,
# rather than failing later inside the CSV reader.
if not os.path.exists(embeddings_file_path):
    raise FileNotFoundError("Please request the embeddings from Google")
Load PDFM Embeddings
In [ ]:
Copied!
# Read the embeddings CSV, then use its "place" column as the row index so
# the table can be joined on the index.
raw_embeddings = pd.read_csv(embeddings_file_path)
zipcode_embeddings = raw_embeddings.set_index("place")
zipcode_embeddings.head()
Join Zillow and PDFM Data
In [ ]:
Copied!
# Inner join on the row index: only index labels that are present in both the
# Zillow table and the embeddings table are kept.
data = zhvi_df.join(
    zipcode_embeddings,
    how="inner",
)
data.head()
In [ ]:
Copied!
# Column holding the regression target (selected by its date-string name).
label = "2024-10-31"
# Names of the embedding columns used as model inputs: feature0 .. feature329.
num_embedding_dims = 330
embedding_features = [f"feature{i}" for i in range(num_embedding_dims)]
In [ ]:
Copied!
# Keep only the rows that have a value in the target column.
has_label = data[label].notna()
data = data[has_label]
Split Train and Test Data
In [ ]:
Copied!
# Narrow the table to the model inputs plus the target column.
model_columns = [*embedding_features, label]
data = data[model_columns]

# Separate predictors from the target.
X, y = data[embedding_features], data[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
Fit Linear Regression Model
In [ ]:
Copied!
# Train a plain linear regression on the training split (fit returns the
# fitted estimator, so construction and training can be chained).
model = LinearRegression().fit(X_train, y_train)
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)
Evaluate Linear Regression Model
In [ ]:
Copied!
# Put actual and predicted values side by side in the "y" / "y_pred" columns
# that are handed to evaluate_model.
actual_vs_predicted = {"y": y_test, "y_pred": y_pred}
evaluation_df = pd.DataFrame(actual_vs_predicted)
# Compute and show the evaluation metrics for the linear model.
metrics = evaluate_model(evaluation_df)
print(metrics)
In [ ]:
Copied!
# Shared axis range for both axes, so actual and predicted values are drawn
# on the same scale.
xy_lim = (0, 3_000_000)
# Collect the plot settings first, then pass them as keyword arguments.
plot_options = {
    "xlim": xy_lim,
    "ylim": xy_lim,
    "title": "Actual vs Predicted Home Values",
    "x_label": "Actual Home Value",
    "y_label": "Predicted Home Value",
}
plot_actual_vs_predicted(evaluation_df, **plot_options)
Fit K-Nearest Neighbors Model
In [ ]:
Copied!
# Number of neighbours used for each prediction.
k = 5
# Train the k-nearest-neighbours regressor on the training split; this
# replaces the previously fitted model and its predictions.
model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
y_pred = model.predict(X_test)
Evaluate K-Nearest Neighbors Model
In [ ]:
Copied!
# Pair the held-out actual values with the current predictions in the
# "y" / "y_pred" columns that are handed to evaluate_model.
actual_vs_predicted = {"y": y_test, "y_pred": y_pred}
evaluation_df = pd.DataFrame(actual_vs_predicted)
# Compute and show the evaluation metrics for this model.
metrics = evaluate_model(evaluation_df)
print(metrics)
In [ ]:
Copied!
# Plot actual vs. predicted values, reusing the axis range (xy_lim) defined
# for the earlier plot.
plot_options = {
    "xlim": xy_lim,
    "ylim": xy_lim,
    "title": "Actual vs Predicted Home Values",
    "x_label": "Actual Home Value",
    "y_label": "Predicted Home Value",
}
plot_actual_vs_predicted(evaluation_df, **plot_options)