### RUN ONLY ON JUPYTER LITE ###
import micropip
await micropip.install(['pandas', 'shapely', 'folium', 'mapclassify'])
await micropip.install('geopandas', deps=False)
What is the lab about?
This lab provides a practical overview of how satellite image embeddings can be used as a general-purpose spatial representation.
The lab begins with unsupervised analysis, where participants use embeddings to explore and structure geographic space without predefined labels. This demonstrates how embeddings support exploratory analysis and place-based typologies. The same embeddings are then reused in a predictive context, showing how a single representation can support multiple analytical tasks without rebuilding the feature pipeline.
The emphasis throughout is on reusability and judgement: the lab is not about training complex models, but about understanding how and when embeddings add value as spatial evidence.
By the end of the lab, participants will be able to use satellite image embeddings to explore and cluster places without predefined labels, and to reuse the same embeddings as predictors in simple supervised models.
uk_lsoa_london_embeds_2024.gpkg
Before working with the data, a number of Python libraries are required to support data handling, spatial analysis, and modelling tasks in this lab.
These libraries provide functionality for tabular and spatial data handling, numerical computing, static and interactive visualisation, unsupervised clustering, and supervised modelling and evaluation.
All code in this lab assumes that these libraries are available in the working environment and correctly imported before proceeding to the analytical steps.
"""
Import for a geospatial data science workflow combining spatial analysis,
machine learning, and visualisation.
"""
# Spatial data handling
import pandas as pd # Tabular data manipulation and analysis
import geopandas as gpd # Vector GIS data, geometry operations, CRS handling
# Numerical computing
import numpy as np # Numerical operations and array-based computing
# Visualisation
import matplotlib.pyplot as plt # Static plots and figures
# Unsupervised learning
from sklearn.cluster import KMeans # Clustering into place typologies
# Supervised learning utilities
from sklearn.model_selection import train_test_split # Train/test data splitting
# Supervised models
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# Tree-based ensemble models for regression and classification
# Model evaluation
from sklearn.metrics import mean_squared_error, accuracy_score
# Performance metrics for regression and classification
# Mapping and basemaps
import folium # Interactive web-based maps
from folium.features import DivIcon # Custom text/HTML map markers
# Utilities
import re # Regular expressions for string processing
from IPython.display import HTML, display # Rich HTML output in notebooks
import uuid # Unique identifiers for map elements or outputs
import json # Parsing the GeoJSON file when loading the data
These are reusable blocks of code that perform a specific task throughout the lab.
# @title
"""
Utility functions for spatial clustering and visualisation of LSOA-level
satellite image embeddings.
Functions include:
- Filtering to identifier/embedding/geometry fields
- K-means clustering in embedding space
- Static and interactive mapping
- Selecting and mapping LSOAs closest to a cluster centroid
"""
def filter_table(gdf):
"""
Filter a GeoDataFrame to identifier fields, embedding variables, and geometry.
Parameters
----------
gdf : geopandas.GeoDataFrame
Input GeoDataFrame containing identifiers, embedding variables, and geometry.
Returns
-------
geopandas.GeoDataFrame
Filtered GeoDataFrame containing predefined columns (where present).
"""
columns_to_keep = [
"data_zone_code", "country", "LSOA21CD", "LSOA21NM",
"A00_mean", "A01_mean", "A02_mean", "A03_mean", "A04_mean",
"A05_mean", "A06_mean", "A07_mean", "A08_mean", "A09_mean",
"A10_mean", "A11_mean", "A12_mean", "A13_mean", "A14_mean",
"A15_mean", "A16_mean", "A17_mean", "A18_mean", "A19_mean",
"A20_mean", "A21_mean", "A22_mean", "A23_mean", "A24_mean",
"A25_mean", "A26_mean", "A27_mean", "A28_mean", "A29_mean",
"A30_mean", "A31_mean", "A32_mean", "A33_mean", "A34_mean",
"A35_mean", "A36_mean", "A37_mean", "A38_mean", "A39_mean",
"A40_mean", "A41_mean", "A42_mean", "A43_mean", "A44_mean",
"A45_mean", "A46_mean", "A47_mean", "A48_mean", "A49_mean",
"A50_mean", "A51_mean", "A52_mean", "A53_mean", "A54_mean",
"A55_mean", "A56_mean", "A57_mean", "A58_mean", "A59_mean",
"A60_mean", "A61_mean", "A62_mean", "A63_mean", "geometry",
]
cols = [c for c in columns_to_keep if c in gdf.columns]
return gdf[cols]
def kmeans_clustering(gdf, k, feature_suffix="_mean", random_state=42):
"""
Cluster embedding variables using k-means.
Parameters
----------
gdf : geopandas.GeoDataFrame
Input GeoDataFrame containing embedding variables.
k : int
Number of clusters.
feature_suffix : str, optional
Suffix used to identify feature columns. Default is ``"_mean"``.
random_state : int, optional
Random seed for reproducibility. Default is 42.
Returns
-------
geopandas.GeoDataFrame
Copy of `gdf` with an added ``"cluster"`` column.
Raises
------
ValueError
If no feature columns are found that match `feature_suffix`.
"""
feature_cols = [c for c in gdf.columns if c.endswith(feature_suffix)]
if not feature_cols:
raise ValueError(f"No feature columns found ending with '{feature_suffix}'.")
X = gdf[feature_cols].to_numpy(dtype=float)
kmeans = KMeans(n_clusters=int(k), random_state=random_state)
gdf_out = gdf.copy()
gdf_out["cluster"] = kmeans.fit_predict(X)
return gdf_out
def show_cluster_labels(gdf, cluster_col="cluster"):
"""
Print unique cluster labels.
Parameters
----------
gdf : geopandas.GeoDataFrame
GeoDataFrame containing cluster labels.
cluster_col : str, optional
Name of the cluster label column. Default is ``"cluster"``.
Returns
-------
None
Prints labels to standard output.
Raises
------
ValueError
If `cluster_col` is not present in `gdf`.
"""
if cluster_col not in gdf.columns:
raise ValueError(f"'{cluster_col}' column not found.")
labels = sorted(pd.unique(gdf[cluster_col]))
print("Your clusters are:", ", ".join(map(str, labels)))
def plot_simple_map(
gdf,
cluster_col="cluster",
title="Unsupervised Classification of Places in London",
figsize=(8, 8),
legend=True,
):
"""
Plot spatial clusters with a basemap.
Parameters
----------
gdf : geopandas.GeoDataFrame
GeoDataFrame containing geometries and cluster labels.
cluster_col : str, optional
Name of the cluster label column. Default is ``"cluster"``.
title : str, optional
Plot title. Default is ``"Unsupervised Classification of Places in London"``.
figsize : tuple of int, optional
Figure size in inches. Default is ``(8, 8)``.
legend : bool, optional
Whether to show a legend. Default is True.
Returns
-------
matplotlib.axes.Axes
Axes containing the map.
"""
if cluster_col not in gdf.columns:
raise ValueError(f"'{cluster_col}' column not found.")
fig, ax = plt.subplots(1, 1, figsize=figsize)
gdf.plot(column=cluster_col, categorical=True, legend=legend, ax=ax)
ax.set_title(title)
ax.axis("off")
    return ax
# @title
def parse_reference_points(text):
"""
Parse reference points from a simple text format.
Parameters
----------
text : str or None
Multiline string where each line is formatted as:
``"Name", latitude, longitude``.
Returns
-------
list of dict
List of dictionaries with keys ``"name"``, ``"lat"``, and ``"lon"``.
Raises
------
ValueError
If any line does not contain exactly three comma-separated values.
"""
reference_points = []
if not text:
return reference_points
for line in text.strip().splitlines():
line = line.strip()
if not line:
continue
parts = [p.strip() for p in line.split(",")]
if len(parts) != 3:
raise ValueError(f"Invalid line format: {line}")
reference_points.append(
{
"name": parts[0].strip('"').strip("'"),
"lat": float(parts[1]),
"lon": float(parts[2]),
}
)
return reference_points
def make_webmap_general(
focus_gdf,
focus_col=None,
focus_name="Focus",
focus_tooltip_cols=(),
focus_categorical=True,
focus_legend=True,
focus_cmap="Set1",
focus_style_kwds=None,
context_gdf=None,
context_name="Context",
context_style_kwds=None,
POIs=None,
layer_control_collapsed=False,
fit_to="focus", # "focus" or "all"
zoom_start=10,
):
"""
Create a general interactive folium map with an optional context layer and
an optional focus layer.
Parameters
----------
focus_gdf : geopandas.GeoDataFrame
GeoDataFrame to display as the main (top) layer.
focus_col : str or None, optional
Column used to colour polygons in the focus layer. If None, polygons are
drawn with a single style. Default is None.
focus_name : str, optional
Layer name for the focus layer. Default is "Focus".
focus_tooltip_cols : tuple of str, optional
Columns shown in tooltip for the focus layer. Default is ().
focus_categorical : bool, optional
Whether the focus_col is categorical (passed to explore). Default is True.
focus_legend : bool, optional
Whether to show a legend for the focus layer. Default is True.
focus_cmap : str, optional
Matplotlib/GeoPandas colour map name. Default is "Set1".
focus_style_kwds : dict or None, optional
Style kwargs for the focus layer (fillOpacity, weight, color, etc.).
context_gdf : geopandas.GeoDataFrame or None, optional
Optional background context layer displayed underneath. Default is None.
context_name : str, optional
Layer name for the context layer. Default is "Context".
context_style_kwds : dict or None, optional
Style kwargs for the context layer.
POIs : str or None, optional
Multiline string of reference points parsed by `parse_reference_points`.
layer_control_collapsed : bool, optional
Whether the layer control is collapsed. Default is False.
fit_to : {"focus", "all"}, optional
Fit bounds to the focus layer only, or to the union of context+focus.
Default is "focus".
zoom_start : int, optional
Initial zoom level (will be overridden by fit_bounds). Default is 10.
Returns
-------
folium.Map
Interactive map.
"""
if focus_gdf is None or len(focus_gdf) == 0:
raise ValueError("focus_gdf must be a non-empty GeoDataFrame.")
# Defaults
if focus_style_kwds is None:
focus_style_kwds = {"fillOpacity": 0.6, "weight": 0.8, "color": "black"}
if context_style_kwds is None:
context_style_kwds = {"fillOpacity": 0.05, "opacity": 0.6, "weight": 0.6, "color": "grey"}
    # Folium expects WGS84 (EPSG:4326) coordinates; the inputs are assumed to
    # already be in lon/lat, so no reprojection is applied here.
focus_wgs84 = focus_gdf
if context_gdf is not None:
context_wgs84 = context_gdf
else:
context_wgs84 = None
# Centre map using focus bounds
minx, miny, maxx, maxy = focus_wgs84.total_bounds
centre = [(miny + maxy) / 2, (minx + maxx) / 2]
m = folium.Map(location=centre, zoom_start=zoom_start, tiles=None)
# Basemaps
folium.TileLayer("CartoDB positron", name="CartoDB Positron", overlay=False, control=True, show=True).add_to(m)
folium.TileLayer("Esri.WorldImagery", name="Satellite (Esri)", overlay=False, control=True, show=False).add_to(m)
# Context layer (underneath)
if context_wgs84 is not None and len(context_wgs84) > 0:
context_wgs84.explore(
m=m,
name=context_name,
tooltip=False,
legend=False,
show=True,
style_kwds=context_style_kwds,
)
# Focus layer (on top)
tooltip = [c for c in focus_tooltip_cols if c in focus_wgs84.columns]
if focus_col is None:
# Single-style polygons (no colouring)
focus_wgs84.explore(
m=m,
name=focus_name,
tooltip=tooltip if tooltip else False,
legend=False,
style_kwds=focus_style_kwds,
)
else:
if focus_col not in focus_wgs84.columns:
raise ValueError(f"'{focus_col}' column not found in focus_gdf.")
focus_wgs84.explore(
m=m,
column=focus_col,
categorical=bool(focus_categorical),
legend=bool(focus_legend),
tooltip=tooltip,
cmap=focus_cmap,
name=focus_name,
style_kwds=focus_style_kwds,
)
# Reference points (optional)
reference_points = parse_reference_points(POIs)
if reference_points:
ref_layer = folium.FeatureGroup(name="Reference points", show=False)
label_layer = folium.FeatureGroup(name="Reference point names", show=False)
for pt in reference_points:
folium.Marker(
[pt["lat"], pt["lon"]],
popup=pt["name"],
tooltip=pt["name"],
icon=folium.Icon(icon="info-sign"),
).add_to(ref_layer)
folium.Marker(
[pt["lat"], pt["lon"]],
icon=DivIcon(
html=f"<div style='font-size:12px;font-weight:600;'>{pt['name']}</div>"
),
).add_to(label_layer)
ref_layer.add_to(m)
label_layer.add_to(m)
folium.LayerControl(collapsed=layer_control_collapsed).add_to(m)
# Fit bounds
if fit_to == "all" and context_wgs84 is not None and len(context_wgs84) > 0:
minx, miny, maxx, maxy = context_wgs84.total_bounds
else:
minx, miny, maxx, maxy = focus_wgs84.total_bounds
m.fit_bounds([[miny, minx], [maxy, maxx]], padding=(10, 10))
return m
def get_embedding_cols(gdf, pattern=r"^A\d+_mean$"):
"""
Return embedding column names sorted by numeric index.
Parameters
----------
gdf : geopandas.GeoDataFrame
GeoDataFrame containing embedding variables.
pattern : str, optional
Regular expression pattern used to select embedding columns.
Default matches ``A##_mean`` style columns.
Returns
-------
list of str
Sorted embedding column names.
Raises
------
ValueError
If no columns match `pattern`.
"""
cols = [c for c in gdf.columns if re.match(pattern, c)]
if not cols:
raise ValueError("No embedding columns found matching pattern like A##_mean.")
return sorted(cols, key=lambda x: int(re.findall(r"\d+", x)[0]))
def closest_lsoas_to_cluster(
gdf,
cluster_number,
n_closest=5,
cluster_col="cluster",
id_cols=("LSOA21CD", "LSOA21NM"),
):
"""
Find LSOAs closest to a cluster centroid in embedding space.
Distance is computed as Euclidean distance between each LSOA's embedding
vector and the centroid of the specified cluster. Members of the cluster
are excluded from the returned set.
Parameters
----------
gdf : geopandas.GeoDataFrame
GeoDataFrame containing embedding variables and cluster labels.
cluster_number : int
Cluster label used to define the centroid.
n_closest : int, optional
Number of closest LSOAs to return. Default is 5.
cluster_col : str, optional
Name of the cluster label column. Default is ``"cluster"``.
id_cols : tuple of str, optional
Identifier columns to keep if present. Default is
``("LSOA21CD", "LSOA21NM")``.
Returns
-------
geopandas.GeoDataFrame
GeoDataFrame containing the closest LSOAs with an added ``"distance"``
column and geometry.
Raises
------
ValueError
If `cluster_col` is missing, `n_closest` is invalid, or the cluster
has no members in `gdf`.
"""
if cluster_col not in gdf.columns:
raise ValueError(f"'{cluster_col}' column not found.")
if not isinstance(n_closest, int) or n_closest <= 0:
raise ValueError("n_closest must be a positive integer.")
emb_cols = get_embedding_cols(gdf)
mask = gdf[cluster_col] == cluster_number
if int(mask.sum()) == 0:
raise ValueError(f"No rows found for {cluster_col} == {cluster_number}.")
centroid = np.nanmean(gdf.loc[mask, emb_cols].to_numpy(dtype=float), axis=0)
X = gdf[emb_cols].to_numpy(dtype=float)
distances = np.linalg.norm(X - centroid, axis=1)
distances[mask.to_numpy()] = np.inf
available = int((~mask).sum())
n_use = min(n_closest, available)
idx = np.argsort(distances)[:n_use]
out = gdf.iloc[idx].copy()
out["distance"] = distances[idx]
out = out.sort_values("distance")
keep = [c for c in id_cols if c in out.columns] + [cluster_col, "distance", "geometry"]
return out[[c for c in keep if c in out.columns]]
def map_closest_lsoas(
gdf,
cluster_number,
n_closest,
context_gdf=None,
cluster_col="cluster",
tooltip_cols=("LSOA21CD", "LSOA21NM", "distance"),
fill_opacity=0.6,
weight=2.0,
edge_color="black",
POIs=None,
):
"""
Compute the n closest LSOAs to a cluster centroid (in embedding space)
and map them, optionally over a faint context layer.
"""
closest_gdf = closest_lsoas_to_cluster(
gdf=gdf,
cluster_number=cluster_number,
n_closest=n_closest,
cluster_col=cluster_col,
)
# Rank for colouring/legend
closest_gdf = closest_gdf.copy()
closest_gdf["rank"] = range(1, len(closest_gdf) + 1)
m = make_webmap_general(
focus_gdf=closest_gdf,
focus_col="rank",
focus_name=f"{len(closest_gdf)} closest LSOAs",
focus_tooltip_cols=tooltip_cols,
focus_categorical=True,
focus_legend=True,
focus_cmap="Set1",
focus_style_kwds={
"fillOpacity": float(fill_opacity),
"opacity": 1.0,
"weight": float(weight),
"color": edge_color,
},
context_gdf=context_gdf,
context_name="Context (all LSOAs)",
context_style_kwds={
"fillOpacity": 0.05,
"opacity": 0.6,
"weight": 0.6,
"color": "grey",
},
POIs=POIs,
fit_to="focus",
layer_control_collapsed=False,
zoom_start=12,
)
return m, closest_gdf
def plot_embedding_distances(
df,
distance_col="distance",
label_col="LSOA21NM",
title="Distances of Closest LSOAs in Embedding Space",
figsize=(9, 4),
):
"""
Plot embedding-space distances in ascending order with attribute labels shown
on a secondary x-axis.
The primary x-axis shows rank (closest to farthest), while the secondary
x-axis displays values from `label_col` rotated at 45 degrees.
Parameters
----------
df : pandas.DataFrame or geopandas.GeoDataFrame
DataFrame containing distance values and labels.
distance_col : str, optional
Column containing embedding-space distances. Default is ``"distance"``.
label_col : str, optional
Column used for top x-axis labels. Default is ``"LSOA21NM"``.
title : str, optional
Plot title. Default is ``"Distances of Closest LSOAs in Embedding Space"``.
figsize : tuple of int, optional
Figure size in inches. Default is ``(9, 4)``.
Returns
-------
matplotlib.axes.Axes
Axes object containing the main plot.
Raises
------
ValueError
If `distance_col` or `label_col` is not present in `df`.
"""
if distance_col not in df.columns:
raise ValueError(f"'{distance_col}' column not found.")
if label_col not in df.columns:
raise ValueError(f"'{label_col}' column not found.")
# Sort by distance (ascending)
df_sorted = df.sort_values(distance_col).reset_index(drop=True)
x = range(1, len(df_sorted) + 1)
distances = df_sorted[distance_col]
labels = df_sorted[label_col]
fig, ax = plt.subplots(figsize=figsize)
# Main plot
ax.plot(x, distances, marker="o")
ax.set_xlabel("Rank (closest to farthest)")
ax.set_ylabel("Distance to cluster centroid")
ax.set_title(title)
ax.grid(True)
# Secondary x-axis (top) for labels
ax_top = ax.secondary_xaxis("top")
ax_top.set_xticks(list(x))
ax_top.set_xticklabels(labels, rotation=45, ha="left", fontsize=8)
ax_top.set_xlabel(label_col)
plt.tight_layout()
    return ax
# @title
def run_rf_classifier(
data,
y_col,
x_cols,
test_size=0.2,
random_state=42,
n_estimators=500,
n_jobs=-1,
class_weight="balanced",
dropna=True,
treat_y_as_ordered=True,
verbose=True,
):
"""
Train and evaluate a Random Forest classifier with user-specified target and predictors.
Parameters
----------
data : pandas.DataFrame or geopandas.GeoDataFrame
Input dataset containing `y_col` and `x_cols`.
y_col : str
Dependent (target) variable column name.
x_cols : list of str
Independent (predictor) column names.
test_size : float, optional
Proportion of data used for testing. Default is 0.2.
random_state : int, optional
Random seed for reproducibility. Default is 42.
n_estimators : int, optional
Number of trees in the forest. Default is 500.
n_jobs : int, optional
Number of CPU cores to use. Default is -1 (all cores).
class_weight : str or dict or None, optional
Class weighting strategy for imbalanced data. Default is "balanced".
dropna : bool, optional
If True, drop rows with missing values in y or X. Default is True.
treat_y_as_ordered : bool, optional
If True, convert y to an ordered categorical then to integer codes
for modelling. Useful for ordinal targets (e.g., deciles). Default is True.
verbose : bool, optional
If True, print evaluation outputs. Default is True.
Returns
-------
results : dict
Dictionary containing:
- "model": fitted RandomForestClassifier
- "X_train", "X_test", "y_train", "y_test"
- "y_pred": predicted labels for test set
- "pred_probs": DataFrame of predicted probabilities (test set)
- "accuracy": test accuracy
- "mae_ordinal": mean absolute error in label space (if y is numeric-coded)
- "confusion": confusion matrix (crosstab)
- "importances": Series of feature importances (sorted desc)
Raises
------
ValueError
If required columns are missing or if no rows remain after filtering.
"""
# --- Column checks
missing = [c for c in ([y_col] + list(x_cols)) if c not in data.columns]
if missing:
raise ValueError(f"Missing required columns: {missing}")
df = data[[y_col] + list(x_cols)].copy()
if dropna:
df = df.dropna()
if df.empty:
raise ValueError("No rows available after filtering/dropping missing values.")
# --- Prepare y and X
y_raw = df[y_col]
if treat_y_as_ordered:
y_cat = pd.Categorical(y_raw, ordered=True)
y = pd.Series(y_cat.codes, index=df.index) + 1 # codes start at 0
y_categories = y_cat.categories
else:
y = y_raw
y_categories = None
X = df[list(x_cols)]
# --- Train/test split (stratify if possible)
stratify = y if y.nunique() > 1 else None
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=float(test_size),
random_state=random_state,
stratify=stratify,
)
# --- Model
rf = RandomForestClassifier(
n_estimators=int(n_estimators),
random_state=random_state,
n_jobs=n_jobs,
class_weight=class_weight,
)
rf.fit(X_train, y_train)
# --- Predict
y_pred = rf.predict(X_test)
pred_probs = rf.predict_proba(X_test)
pred_probs_df = pd.DataFrame(pred_probs, columns=rf.classes_, index=X_test.index)
# --- Metrics
acc = accuracy_score(y_test, y_pred)
# Ordinal-ish MAE (only meaningful if y is numeric-coded)
try:
mae_ordinal = (pd.Series(y_pred, index=X_test.index) - pd.Series(y_test, index=X_test.index)).abs().mean()
except Exception:
mae_ordinal = None
confusion = pd.crosstab(
pd.Series(y_test, name=f"Actual {y_col}"),
pd.Series(y_pred, name=f"Predicted {y_col}"),
)
importances = pd.Series(rf.feature_importances_, index=list(x_cols)).sort_values(ascending=False)
if verbose:
print(f"Test accuracy: {acc:.3f}")
if mae_ordinal is not None:
print(f"Mean absolute label error: {mae_ordinal:.2f}")
print("\nConfusion matrix:")
print(confusion)
print("\nTop 15 feature importances:")
print(importances.head(15))
return {
"model": rf,
"X_train": X_train,
"X_test": X_test,
"y_train": y_train,
"y_test": y_test,
"y_pred": y_pred,
"pred_probs": pred_probs_df,
"accuracy": acc,
"mae_ordinal": mae_ordinal,
"confusion": confusion,
"importances": importances,
"y_categories": y_categories,
}
def plot_feature_importance(
importances,
top_n=15,
title="Top Feature Importances",
figsize=(6, 5),
):
"""
Plot the top N feature importances from a fitted model.
Parameters
----------
importances : pandas.Series
Feature importances indexed by feature name
(e.g. results["importances"]).
top_n : int, optional
Number of top features to display. Default is 15.
title : str, optional
Plot title. Default is "Top Feature Importances".
figsize : tuple of int, optional
Figure size in inches. Default is (6, 5).
Returns
-------
matplotlib.axes.Axes
Axes object containing the plot.
"""
if not isinstance(importances, pd.Series):
raise ValueError("importances must be a pandas Series.")
top = importances.head(top_n).sort_values(ascending=True)
fig, ax = plt.subplots(figsize=figsize)
ax.barh(top.index, top.values)
ax.set_xlabel("Feature importance")
ax.set_title(title)
plt.tight_layout()
    return ax
# ----------------------------
# 1) Classification helper
# ----------------------------
def classify_for_mapping(
gdf,
col,
scheme="quantile", # "quantile" (default), "equal", "natural"
k=10,
label_fmt="{lo:.2f}–{hi:.2f}",
min_unique_to_bin=10,
):
"""
Prepare a column for mapping. If the column is numeric and has more than
`min_unique_to_bin` unique values, bin into `k` classes using the chosen scheme.
Parameters
----------
gdf : geopandas.GeoDataFrame
Input GeoDataFrame.
col : str
Column to prepare.
scheme : {"quantile", "equal", "natural"}, optional
Classification scheme for numeric variables. Default is "quantile".
k : int, optional
Number of classes for numeric variables. Default is 10.
label_fmt : str, optional
Label format for numeric class ranges.
min_unique_to_bin : int, optional
Only bin numeric variables when unique values exceed this threshold.
Returns
-------
gdf_out : geopandas.GeoDataFrame
Copy of input with an added categorical column (if binned).
mapped_col : str
Name of the column to map (original or derived).
is_categorical : bool
Whether the returned mapped column should be treated as categorical.
"""
if col not in gdf.columns:
raise ValueError(f"'{col}' not found in GeoDataFrame.")
s = gdf[col]
gdf_out = gdf.copy()
# Non-numeric -> treat as categorical as-is
if not pd.api.types.is_numeric_dtype(s):
return gdf_out, col, True
# Numeric: bin only if sufficiently many unique values
n_unique = s.dropna().nunique()
if n_unique <= min_unique_to_bin:
return gdf_out, col, True
s_nonnull = s.dropna()
if scheme == "quantile":
bins = pd.qcut(s_nonnull, q=k, duplicates="drop")
cats = bins.cat.categories # ordered intervals
elif scheme == "equal":
bins = pd.cut(s_nonnull, bins=k)
cats = bins.cat.categories # ordered intervals
elif scheme == "natural":
try:
import mapclassify as mc
except ImportError as e:
raise ImportError(
"Natural breaks requires `mapclassify`. Install with: pip install mapclassify"
) from e
classifier = mc.NaturalBreaks(s_nonnull.to_numpy(), k=k)
yb = pd.Series(classifier.yb, index=s_nonnull.index)
edges = np.r_[-np.inf, classifier.bins]
edges[0] = float(s_nonnull.min())
intervals = [pd.Interval(edges[i], edges[i + 1], closed="right")
for i in range(len(edges) - 1)]
cats = pd.Index(intervals)
bins = pd.Categorical.from_codes(yb.to_numpy(), categories=cats, ordered=True)
bins = pd.Series(bins, index=s_nonnull.index)
else:
raise ValueError("scheme must be one of: 'quantile', 'equal', 'natural'.")
# Ordered labels derived from interval order (low -> high)
def _label_interval(iv):
return label_fmt.format(lo=iv.left, hi=iv.right)
labels = [_label_interval(iv) for iv in cats]
# Assign labels using category codes to preserve order reliably
mapped = pd.Series(pd.NA, index=s.index, dtype="object")
if scheme in {"quantile", "equal"}:
codes = bins.cat.codes.to_numpy() # 0..n_bins-1
mapped.loc[s_nonnull.index] = [labels[c] if c >= 0 else pd.NA for c in codes]
else:
codes = pd.Categorical(bins, categories=cats, ordered=True).codes
mapped.loc[s_nonnull.index] = [labels[c] if c >= 0 else pd.NA for c in codes]
mapped_col = f"{col}__{scheme}_k{k}"
# CRITICAL: lock legend/category order explicitly
gdf_out[mapped_col] = pd.Categorical(mapped, categories=labels, ordered=True)
return gdf_out, mapped_col, True
# ----------------------------
# 2) Two maps side-by-side
# ----------------------------
def show_two_maps_side_by_side(
gdf,
left_var,
right_var,
tooltip_cols=(),
scheme="quantile", # default for numeric >10 unique
k=10, # always 10 categories by default
min_unique_to_bin=10, # trigger binning only when >10 unique values
cmap_left="YlOrRd_r",
cmap_right="viridis",
POIs=None,
zoom_start=10,
):
"""
Display two Folium maps side by side. For numeric variables with more than
`min_unique_to_bin` unique values, bin into `k` ordered categories using `scheme`.
Notes
-----
This function assumes `make_webmap_general` and `parse_reference_points`
are already defined in the notebook.
"""
# Left map: classify if needed
g_left, left_mapped, _ = classify_for_mapping(
gdf, left_var, scheme=scheme, k=k, min_unique_to_bin=min_unique_to_bin
)
# Right map: classify if needed
g_right, right_mapped, _ = classify_for_mapping(
gdf, right_var, scheme=scheme, k=k, min_unique_to_bin=min_unique_to_bin
)
# Create maps
m_left = make_webmap_general(
focus_gdf=g_left,
focus_col=left_mapped,
focus_name=left_var,
focus_tooltip_cols=tooltip_cols,
focus_categorical=True,
focus_legend=True,
focus_cmap=cmap_left,
POIs=POIs,
zoom_start=zoom_start,
)
m_right = make_webmap_general(
focus_gdf=g_right,
focus_col=right_mapped,
focus_name=right_var,
focus_tooltip_cols=tooltip_cols,
focus_categorical=True,
focus_legend=True,
focus_cmap=cmap_right,
POIs=POIs,
zoom_start=zoom_start,
)
html = f"""
<div style="display:flex; gap:10px;">
<div style="width:50%;">{m_left._repr_html_()}</div>
<div style="width:50%;">{m_right._repr_html_()}</div>
</div>
"""
    display(HTML(html))
A method of grouping data into categories without using pre-defined labels, based solely on similarity in the data.
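To make this idea concrete, here is a minimal sketch (reusing the KMeans and numpy imports from above) that groups a handful of made-up, unlabelled 2-D points purely by how close they are to each other; the points and the choice of two clusters are illustrative only and are not part of the lab data.
# Minimal illustration: grouping unlabelled points purely by similarity.
# The coordinates below are invented for demonstration; no labels are supplied.
toy_points = np.array([
    [0.1, 0.2], [0.0, 0.1], [0.2, 0.0],   # one loose group near the origin
    [5.0, 5.1], [5.2, 4.9], [4.8, 5.0],   # another loose group further away
])
toy_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
toy_labels = toy_kmeans.fit_predict(toy_points)
print(toy_labels)  # e.g. [0 0 0 1 1 1] -- group membership discovered from the data alone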
Before we can use the data it must first be loaded into our working environment.
# Load the embedding data
file_path = "data/uk_lsoa_london_embeds_2024.geojson"
# Save the data into a GeoDataFrame
gdf = gpd.GeoDataFrame.from_features(json.loads(open(file_path).read()))
Let’s make sure that data is loaded by trying to display some of its information.
# Display coordinate reference system information
print(f'The CRS for this dataset is: {gdf.crs}')
# Display the first few records of the data
gdf.head()
Looking at the table above, not all variables are useful for this lab. In this step, the dataset is simplified by retaining only the embedding attributes, along with the information needed to identify each LSOA. This focuses subsequent analysis on the embedding information while preserving the spatial geometry for later mapping and interpretation.
# Creates a new dataset with attributes that we want to keep
gdf = filter_table(gdf)
# Display new datasets
gdf.head()
In the above we can see that attributes like ‘dzcode’ are no longer present in the filtered dataset.
An unsupervised algorithm that groups data into a specified number of clusters by minimising the distance between data points and the centre of their assigned cluster.
NOTE: For k-means, you have to tell the algorithm how many clusters/groups you want.
# USER: Enter the number of clusters that you want
k = 3
# Cluster LSOAs
gdf = kmeans_clustering(gdf, k, feature_suffix="_mean", random_state=42)
Let’s now have a look at our data again to see if the new cluster labels were added.
# Display to top 20 rows of the data
gdf.head(20)
show_cluster_labels(gdf, cluster_col="cluster")
Your clusters are: 0, 1, 2
The above is great but displaying the clusters on a map would add a lot more context.
#------------------------------
# Plot a map of clusters
#------------------------------
plot_simple_map(
gdf,
cluster_col="cluster",
title="Unsupervised clusters for London LSOAs",
figsize=(8, 8),
legend=True
)
YOUR TURN: Try re-running the code again for a different number of clusters.
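If it helps, here is one optional way to compare several values of k before settling on one; it is a sketch only, using the feature columns and KMeans already introduced. Inertia (within-cluster sum of squares) always falls as k grows, so look for the point where the improvement starts to level off.
# Optional: compare a few values of k using inertia (lower = tighter clusters).
feature_cols = [c for c in gdf.columns if c.endswith("_mean")]
X = gdf[feature_cols].to_numpy(dtype=float)
for trial_k in [2, 3, 4, 5, 6]:
    km = KMeans(n_clusters=trial_k, random_state=42, n_init=10).fit(X)
    print(f"k={trial_k}: inertia={km.inertia_:,.1f}")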
Let’s make the map a bit more usable and interactive. We’ll add some reference points to situate ourselves and provide a satellite imagery layer.
# USER: Add reference points in the format of: "<Name>", latitude, longitude
POIs = """
"Westminster", 51.4975, -0.1357
"City of London", 51.5155, -0.0922
"WE ARE HERE :)", 51.4962, -0.1298
"London Euston Station", 51.5282, -0.1337
"""
# Draws the maps
m = make_webmap_general(
focus_gdf=gdf,
focus_col="cluster",
focus_name="Clusters",
focus_tooltip_cols=("LSOA21NM", "cluster"),
focus_categorical=True,
focus_legend=True,
focus_cmap="Set1",
focus_style_kwds={
"fillOpacity": 0.6,
"weight": 0.2,
"color": "black",
},
context_gdf=None, # no background layer
POIs=POIs,
fit_to="focus",
)
m
One of the advantages of embedding representations is that observations that are closer in the embedding space are more similar in their underlying characteristics.
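As a quick illustration of what “closer in embedding space” means, the sketch below (using the helpers already defined and the gdf from the previous steps) computes the Euclidean distance between the embedding vectors of the first two LSOAs in the table; smaller distances indicate more similar places. The helper used next performs the same calculation against a cluster centroid.
# Illustration: distance between two embedding vectors (smaller = more similar).
emb_cols = get_embedding_cols(gdf)
vec_a = gdf.iloc[0][emb_cols].to_numpy(dtype=float)
vec_b = gdf.iloc[1][emb_cols].to_numpy(dtype=float)
print(f"Embedding distance between the first two LSOAs: {np.linalg.norm(vec_a - vec_b):.3f}")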
# USER: Enter cluster number to explore
cluster_number = 2
# USER: Enter the number of closest LSOAs to use
n_closest_to_show = 16
# Map clusters
m, closestN = map_closest_lsoas(
gdf=gdf,
cluster_number=cluster_number,
n_closest=n_closest_to_show,
context_gdf=gdf,
POIs=None, # optional
)
m
# Display table with embedding distances
closestN
#-------------------------------------------------
# Plot embedding distance of closest n embeddings
#-------------------------------------------------
plot_embedding_distances(
closestN,
distance_col="distance",
label_col="LSOA21NM",
)
YOUR TURN
Experiment with different clustering configurations:
- Change the number of clusters (k) used in the unsupervised clustering.
- Change the number of closest LSOAs (n_closest) and observe how the set of similar LSOAs expands or contracts.
A method that uses existing data to train a model that estimates or classifies an outcome for new locations, allowing patterns learned from embeddings and other features to be applied beyond the observed data.
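The pattern is: fit on rows where the outcome is already known, then predict it for rows where it is not. A minimal toy sketch (invented numbers, purely illustrative, using the RandomForestClassifier imported above):
# Toy sketch of the supervised pattern: train on labelled rows, predict for new ones.
X_known = np.array([[0.1], [0.2], [0.9], [1.0]])    # feature values with known outcomes
y_known = np.array(["low", "low", "high", "high"])  # labels used for training
toy_model = RandomForestClassifier(n_estimators=10, random_state=42)
toy_model.fit(X_known, y_known)
X_new = np.array([[0.15], [0.95]])                  # rows without labels
print(toy_model.predict(X_new))                     # e.g. ['low' 'high']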
This data contains the Index of Multiple Deprivation (IMD) in deciles, along with some additional variables that have been reported in the literature to influence IMD. IMD deciles divide areas into ten equal groups, with lower deciles indicating higher levels of deprivation and higher deciles indicating lower levels of deprivation.
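If deciles are unfamiliar, the toy sketch below shows the general idea with pd.qcut, which splits a continuous measure into ten equal-sized groups; the IMD deciles in the CSV are already computed, so this is for intuition only and the scores here are invented.
# Toy illustration of deciles: split a made-up continuous score into ten equal-sized groups.
toy_scores = pd.Series(np.random.default_rng(42).normal(size=1000))
toy_deciles = pd.qcut(toy_scores, q=10, labels=range(1, 11))
print(toy_deciles.value_counts().sort_index())  # ten groups of 100 observations each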
# Load data
df = pd.read_csv('data/Socioeconomic.csv')
df
We will be using the common attribute ‘LSOA21CD’ to link both sets of data, that is, our data with the embedding variables and the socioeconomic variables.
# Join both datasets
gdf2 = gdf.merge(
df,
on="LSOA21CD",
how="left"
)
Check that both sets of data are now together.
## Display updated data
gdf2
Let’s see how the data looks on a map.
# USER: You can add more reference point here in the format
# <"Name of place">, latitude, longitude
POIs = """
"Westminster", 51.4975, -0.1357
"City of London", 51.5155, -0.0922
"Canary Wharf", 51.5054, -0.0235
"King's Cross", 51.5308, -0.1238
"Heathrow Airport", 51.4700, -0.4543
"WE ARE HERE :)", 51.4962, -0.1298
"""
# Draws the map
m = make_webmap_general(
focus_gdf=gdf2, # your GeoDataFrame with IMD merged in
focus_col="IMD_decile", # colour polygons by IMD decile
focus_name="LSOA IMD Deciles",
focus_tooltip_cols=("LSOA21CD", "LSOA21NM", "IMD_decile"),
focus_categorical=True, # deciles are categorical/ordinal
focus_legend=True,
focus_cmap="YlOrRd_r", # good discrete palette
focus_style_kwds={
"fillOpacity": 0.6,
"weight": 0.2,
"color": "black",
},
context_gdf=None, # or set to gdf2 for a faint background (usually unnecessary)
POIs=POIs,
fit_to="focus",
zoom_start=10,
)
m
"""
Alternative colour palettes for focus_cmap:
Warm / sequential (similar to YlOrRd_r):
- "OrRd"
- "Reds"
- "YlGnBu_r"
- "PuRd"
- "YlOrBr"
Cool / neutral sequential:
- "Blues"
- "Greens"
- "BuGn"
- "GnBu"
- "viridis"
Perceptually uniform (publication & accessibility friendly):
- "plasma"
- "inferno"
- "magma"
- "cividis"
Diverging (use only if a meaningful midpoint exists):
- "RdBu_r"
- "BrBG"
- "PuOr"
- "coolwarm"
"""We’ll used variables that have been identified in previous studies as predictors of IMD, specifically: - Percent no qualifications 16 and over - Percent bad and very band health - Population density per km - Percent lone family household
We’ll also be using a random forest model, which is an ensemble of decision trees. It can model non-linear relationships and interactions between predictors without requiring the proportional-odds assumption used by ordinal logistic regression.
# Run the random forest classifiers and return results
results = run_rf_classifier(
data=gdf2,
y_col="IMD_decile",
x_cols=["Percent no qualifications 16 and over",
"Percent bad and very band health",
"Population density per km",
"Percent lone family household"],
)
# Extract predictor variables
feature_cols = [f"A{i:02d}_mean" for i in range(64)]
# Run the random forest classifiers and return results
results = run_rf_classifier(
data=gdf2,
y_col="IMD_decile",
x_cols=feature_cols,
test_size=0.2,
random_state=42,
n_estimators=500,
class_weight="balanced",
)
# Extract predictor variables
feature_cols = [f"A{i:02d}_mean" for i in range(64)] + ["Percent no qualifications 16 and over",
"Percent bad and very band health",
"Population density per km",
"Percent lone family household"]
# Run the random forest classifiers and return results
results = run_rf_classifier(
data=gdf2,
y_col="IMD_decile",
x_cols=feature_cols,
test_size=0.2,
random_state=42,
n_estimators=500,
class_weight="balanced",
)
# Plot variable importance
plot_feature_importance(
results["importances"],
top_n=15,
title="Top 15 Embedding Features Predicting IMD Decile",
)
# Plot two maps side by side using 10 quantile levels for comparison.
show_two_maps_side_by_side(
gdf2,
left_var="IMD_decile",
right_var="Percent no qualifications 16 and over",
tooltip_cols=("LSOA21CD", "LSOA21NM"),
scheme="quantile",
k=10,
)
# Plot two maps side by side using 10 equal-interval levels for comparison.
show_two_maps_side_by_side(
gdf2,
left_var="IMD_decile",
right_var="Percent no qualifications 16 and over",
scheme="equal",
k=10,
tooltip_cols=("LSOA21NM",),
)
# Plot two maps side by side using 10 natural break levels for comparison.
show_two_maps_side_by_side(
gdf2,
left_var="IMD_decile",
right_var="Percent no qualifications 16 and over",
scheme="natural",
k=10,
tooltip_cols=("LSOA21NM",),
)
YOUR TURN: Choose the most influential embedding variable and visually compare it to the IMD variable to explore their spatial relationship, and briefly interpret any patterns or contrasts observed.
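One possible way to start (a sketch, assuming results comes from one of the models above that included the embedding columns): pick the embedding feature with the highest importance and reuse show_two_maps_side_by_side to compare it with IMD_decile.
# Pick the most important embedding feature and map it next to IMD_decile.
embedding_importances = results["importances"].filter(regex=r"^A\d+_mean$")
top_embedding = embedding_importances.idxmax()
print(f"Most influential embedding variable: {top_embedding}")
show_two_maps_side_by_side(
    gdf2,
    left_var="IMD_decile",
    right_var=top_embedding,
    tooltip_cols=("LSOA21NM",),
    scheme="quantile",
    k=10,
)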