Data Preparation¶
In [ ]:
import math
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn import metrics as skmetrics
import lightgbm as lgbm
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
import seaborn as sns
In [ ]:
BASE_PATH = "./" # Set this to the path where your data files are located
county_embeddings = pd.read_csv(BASE_PATH + "county_embeddings.csv").set_index("place")
zip_embeddings = pd.read_csv(BASE_PATH + "zcta_embeddings.csv").set_index("place")
embeddings = pd.concat([county_embeddings, zip_embeddings])
In [ ]:
embedding_features = [f"feature{x}" for x in range(330)]
embeddings.head(2)
Step 2: Download and load a few variables from GitHub¶
In [ ]:
zhvi_county_url = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_county.csv"
zhvi_zipcode_url = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_zipcode.csv"
In [ ]:
zhvi_county_df = pd.read_csv(
    zhvi_county_url, dtype={"StateCodeFIPS": "string", "MunicipalCodeFIPS": "string"}
)
zhvi_county_df["place"] = (
    "geoId/" + zhvi_county_df["StateCodeFIPS"] + zhvi_county_df["MunicipalCodeFIPS"]
)
zhvi_county_df = zhvi_county_df.set_index("place")
zhvi_county_df.head()
In [ ]:
zhvi_zipcode_df = pd.read_csv(zhvi_zipcode_url, dtype={"RegionName": "string"})
zhvi_zipcode_df["place"] = zhvi_zipcode_df["RegionName"].apply(lambda x: f"zip/{x}")
zhvi_zipcode_df = zhvi_zipcode_df.set_index("place")
zhvi_zipcode_df.head()
In [ ]:
zhvi_df = pd.concat([zhvi_county_df, zhvi_zipcode_df])
zhvi_df.head()
In [ ]:
zhvi_df[-5:]
In [ ]:
len(zhvi_df)
In [ ]:
df = embeddings.join(zhvi_df, how="inner")
df.head()
In [ ]:
df[-3:]
Data Visualizations¶
Download the county and ZCTA (ZIP Code Tabulation Area) level GeoJSON files¶
The county and ZCTA level GeoJSON files are available in the same folder as the embeddings. Download the files and upload them to Google Colab.
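Before reading them, a quick sanity check that both files are where the notebook expects can save a confusing error later. This is a minimal sketch, assuming the BASE_PATH set earlier and the file names used in the next cell:

import os

# Confirm the two GeoJSON files were uploaded next to the embedding CSVs.
for fname in ["county.geojson", "zcta.geojson"]:
    path = os.path.join(BASE_PATH, fname)
    status = "found" if os.path.exists(path) else "missing - upload it to " + BASE_PATH
    print(f"{fname}: {status}")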
In [ ]:
county_geo = gpd.read_file(BASE_PATH + "county.geojson").set_index("place")
zip_geo = gpd.read_file(BASE_PATH + "zcta.geojson").set_index("place")
In [ ]:
geo = pd.concat([county_geo, zip_geo])
embeddings = gpd.GeoDataFrame(embeddings, geometry=geo.geometry)
embeddings.shape
In [ ]:
df = embeddings.join(zhvi_df).set_geometry("geometry")
df.head(1)
In [ ]:
df["county_id"] = df["county"] + df["state"]
df["county_id"] = df["county"] + df["state"]
Map out an embedding dimension spatially¶
In [ ]:
def get_locale(df, index, states=None, counties=None):
    """Filters df to the given index, optionally restricted to states and/or counties."""
    df = df[df.index.isin(index)]
    if not states and not counties:
        return df
    mask = df.state.isin(states) if states else pd.Series(True, index=df.index)
    if counties:
        mask &= df.county.isin(counties)
    return df[mask]
In [ ]:
# @title Map out an embedding dimension spatially across the US
feature = embedding_features[300]
ax = get_locale(embeddings, embeddings.index).plot(feature)
_ = ax.set_title(feature + " across counties and zip codes")
In [ ]:
# @title Map out the same embedding dimension spatially across counties and zip codes in NY state
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
state = "NY"
get_locale(embeddings, county_embeddings.index, states=[state]).plot(feature, ax=ax[0])
get_locale(embeddings, zip_embeddings.index, states=[state]).plot(feature, ax=ax[1])
fig.suptitle(f"{feature} in {state}")
ax[0].set(title="counties")
ax[1].set(title="zip codes")
plt.setp(ax, xticks=[], yticks=[])
fig.tight_layout()
Applying the embeddings in a prediction task¶
In [ ]:
def evaluate(df: pd.DataFrame) -> dict:
    """Evaluates the model performance on the given dataframe.

    Args:
        df: A pandas DataFrame with columns 'y' and 'y_pred'.

    Returns:
        A dictionary of performance metrics.
    """
    # Ensure necessary columns exist and drop rows with NaN or zero in 'y'
    if not {"y", "y_pred"}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'y' and 'y_pred' columns")
    df = df.dropna(subset=["y", "y_pred"])
    df = df[df["y"] != 0]

    r2 = skmetrics.r2_score(df["y"], df["y_pred"])
    correlation = float(df["y"].corr(df["y_pred"]))
    rmse = math.sqrt(skmetrics.mean_squared_error(df["y"], df["y_pred"]))
    mae = float(skmetrics.mean_absolute_error(df["y"], df["y_pred"]))
    mape = float(skmetrics.mean_absolute_percentage_error(df["y"], df["y_pred"]))
    return {
        "r2": r2,
        "rmse": rmse,
        "mae": mae,
        "mape": mape,
        "correlation": correlation,
    }


def subset_eval(
    label: str,
    county_name: str,
    state: str,
    gpred: gpd.GeoDataFrame,
    visualize: bool = True,
    cmap: str = "Greys",
) -> dict:
    """Runs intra-county or intra-state evaluation and visualizes the results.

    Args:
        label: The label for the title of the visualization.
        county_name: The specific county name to filter.
        state: The specific state name to filter.
        gpred: GeoDataFrame containing 'y', 'y_pred', 'state', and 'county' columns.
        visualize: Whether to display visualizations.
        cmap: Colormap for visualizations.

    Returns:
        A dictionary of performance metrics.
    """
    # Apply filters based on state and county name
    subset = gpred.copy()
    if state:
        subset = subset[subset["state"] == state]
    if county_name:
        subset = subset[subset["county"] == county_name]

    # Drop rows where 'y' is NaN
    subset = subset.dropna(subset=["y", "y_pred"])
    eval_metrics = evaluate(subset)

    if visualize:
        _, ax = plt.subplots(1, 3, figsize=(12, 4))

        # Scatter plot of predicted vs actual
        subset.plot.scatter("y", "y_pred", alpha=0.8, ax=ax[2], color="darkgray")
        x0, x1 = (
            subset[["y", "y_pred"]].min().min(),
            subset[["y", "y_pred"]].max().max(),
        )
        ax[2].plot([x0, x1], [x0, x1], ls="--", color="black")
        ax[2].set_title(
            f'r={eval_metrics["correlation"]:.2f}, mae={eval_metrics["mae"]:.2f}'
        )

        # Maps of actual and predicted values
        subset.plot(
            "y",
            legend=True,
            ax=ax[0],
            vmin=x0,
            vmax=x1,
            cmap=cmap,
            legend_kwds={"fraction": 0.02, "pad": 0.05},
        )
        ax[0].set_title("Actual")
        subset.plot("y_pred", legend=False, ax=ax[1], vmin=x0, vmax=x1, cmap=cmap)
        ax[1].set_title("Predicted")
        plt.setp(ax[:2], xticks=[], yticks=[])
        plt.suptitle(f"{label} - {county_name}, {state}")
        plt.tight_layout()

    return eval_metrics


def make_predictions_df(
    predictions: np.ndarray, test_df: gpd.GeoDataFrame, label: str
) -> gpd.GeoDataFrame:
    """Creates a GeoDataFrame with predictions, true labels, and geographic info.

    Args:
        predictions: A sequence of predictions.
        test_df: The original test GeoDataFrame that the predictions are based on.
        label: The column name for the true label in `test_df`.

    Returns:
        A GeoDataFrame for evaluation and visualizations.
    """
    if label not in test_df.columns:
        raise ValueError(
            f"The specified label '{label}' does not exist in test_df columns."
        )
    df_predictions = pd.DataFrame(
        {"y": test_df[label], "y_pred": predictions}, index=test_df.index
    )
    return test_df[["geometry", "state", "county"]].join(df_predictions)
Superresolution - Train the model on counties and make predictions for zip codes¶
In [ ]:
# @title Train on counties and predict for zip codes
label = "2025-01-31"
data = df[df[label].notna()]
train = data[data.index.isin(county_geo.index)]
test = data[data.index.isin(zip_geo.index)]
In [ ]:
len(train), len(test)
In [ ]:
train.head()
In [ ]:
test.head()
In [ ]:
model = Ridge()
model.fit(train[embedding_features], train[label])
predictions = model.predict(test[embedding_features])
gdf_predictions = make_predictions_df(predictions, test, label)
evaluate(gdf_predictions)
In [ ]:
# @title Visualize some test set predictions
_ = subset_eval(label, "Harris County", "TX", gdf_predictions, cmap="Blues")
_ = subset_eval(label, "Greenville County", "SC", gdf_predictions, cmap="Blues")
In [ ]:
# @title Evaluate over a state by setting the county to an empty string.
_ = subset_eval(label, "", "NY", gdf_predictions, cmap="Blues")
Imputation - zip -> zip¶
Train on zip codes in a subset of counties.
In [ ]:
# @title train on zip codes in 20% of the counties, test on the remaining 80%.
def get_train_test_split(training_fraction=0.8):
    data = df[df.index.isin(zip_embeddings.index)].copy()
    # Split the zip codes by county into train/test sets.
    train_counties = (
        data.drop_duplicates("county_id").sample(frac=training_fraction).county_id
    )
    train = data[data.county_id.isin(train_counties)]
    test = data[~data.index.isin(train.index)]
    print(
        "# training counties:",
        len(train_counties),
        "\n# training zip codes:",
        train.shape[0],
        "\n# test zip codes:",
        test.shape[0],
    )
    return train, test


def run_imputation_model(
    train, test, label, min_population=500, model_class=Ridge, model_kwargs={}
):
    train = train[(train.population >= min_population) & train[label].notna()]
    test = test[(test.population >= min_population) & test[label].notna()]
    model = make_pipeline(preprocessing.MinMaxScaler(), model_class(**model_kwargs))
    model.fit(train[embedding_features], train[label])
    predictions = model.predict(test[embedding_features])
    gdf_predictions = make_predictions_df(predictions, test, label)
    results = evaluate(gdf_predictions)
    return model, results


# Increasing this value generally improves performance.
training_fraction = 0.2
label = "2025-01-31"
train, test = get_train_test_split(training_fraction)
model, results = run_imputation_model(train, test, label)
results
In [ ]:
# @title Visualize a few counties from the test set.
test_counties = test.county_id.unique()
large_counties = (
    df[df.county_id.isin(test_counties)]
    .sort_values("population", ascending=False)[["state", "county", "population"]]
    .head(4)
)
for _, row in large_counties.iterrows():
    _ = subset_eval(label, row.county, row.state, gdf_predictions, cmap="Blues")
In [ ]:
# @title Try other labels.
labels = [
    "2024-01-31",
    "2024-02-29",
    "2024-03-31",
    "2024-04-30",
    "2024-05-31",
    "2024-06-30",
    "2024-07-31",
    "2024-08-31",
    "2024-09-30",
    "2024-10-31",
    "2024-11-30",
    "2024-12-31",
]

train, test = get_train_test_split(0.8)
models_by_label = {}
metrics_df = pd.DataFrame(
    columns=["label", "correlation", "r2", "rmse", "mae", "mape", "model"]
)
for label in labels:
    models_by_label[label], results = run_imputation_model(train, test, label)
    results["label"] = label
    results["model"] = "linear"
    metrics_df.loc[len(metrics_df)] = results
metrics_df.round(3)
In [ ]:
# @title Try LightGBM models instead of linear.
# This will take a few minutes to run.
models_by_label_lgbm = {}
metrics_df_lgbm = pd.DataFrame(
    columns=["label", "r2", "rmse", "mae", "mape", "correlation", "model"]
)
for label in labels:
    models_by_label_lgbm[label], results = run_imputation_model(
        train,
        test,
        label,
        model_class=lgbm.LGBMRegressor,
        model_kwargs={
            "min_child_samples": 40,
            "importance_type": "gain",
            "n_estimators": 400,
            "learning_rate": 0.04,
            "force_col_wise": True,
        },
    )
    results["label"] = label
    results["model"] = "lgbm"
    metrics_df_lgbm.loc[len(metrics_df_lgbm)] = results
metrics_df_lgbm.round(3)
The LightGBM results are broadly comparable to the linear model. They can be improved with more iterations (n_estimators) and a lower learning rate. You can also try setting feature_fraction=0.5.
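As a rough sketch of that tuning, you can pass the adjusted parameters through run_imputation_model the same way as above (the specific values here are illustrative, not tuned):

# Hypothetical tuning pass: more trees, lower learning rate, feature subsampling.
_, tuned_results = run_imputation_model(
    train,
    test,
    label,
    model_class=lgbm.LGBMRegressor,
    model_kwargs={
        "min_child_samples": 40,
        "n_estimators": 800,  # more iterations than the 400 used above
        "learning_rate": 0.02,  # lower learning rate
        "feature_fraction": 0.5,  # consider half the features per iteration
        "force_col_wise": True,
    },
)
tuned_results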
In [ ]:
# @title LightGBM feature importance
features = {
    "trends": (128, embedding_features[:128]),
    "maps": (128, embedding_features[128:256]),
    "weather": (74, embedding_features[256:]),
}

all_importance = []
for label, model in models_by_label_lgbm.items():
    importance = pd.DataFrame(
        model[1].feature_importances_, index=embedding_features, columns=["importance"]
    )
    importance["importance"] = importance["importance"].abs()
    for feature, dims in features.items():
        importance.loc[dims[1], "feature"] = feature
    importance = importance.groupby("feature").importance.sum().reset_index()
    importance["importance"] = importance.importance / importance.importance.sum() * 100
    importance["label"] = label
    all_importance.append(importance)
all_importance = pd.concat(all_importance)

_, ax = plt.subplots(figsize=(10, 3))
sns.barplot(
    data=all_importance,
    x="label",
    y="importance",
    hue="feature",
    hue_order=features.keys(),
    ax=ax,
)
_ = plt.xticks(rotation=30)