Skip to content

matching

facilities_to_gdf(facilities)

Converts a Polars DataFrame of facilities to a GeoPandas GeoDataFrame.

Parameters:

Name Type Description Default
facilities DataFrame

Polars DataFrame with 'latitude' and 'longitude' columns.

required

Returns:

Type Description
GeoDataFrame

GeoDataFrame with Point geometry.

Source code in src/nemdb/geodata/matching.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def facilities_to_gdf(facilities: pl.DataFrame) -> gpd.GeoDataFrame:
    """
    Build a point GeoDataFrame from a Polars table of facilities.

    Rows whose 'latitude' or 'longitude' is null are discarded first; every
    remaining row receives a Point geometry tagged as EPSG:4326.

    Args:
        facilities: Polars DataFrame with 'latitude' and 'longitude' columns.

    Returns:
        GeoDataFrame with Point geometry.
    """
    # A row is usable only when both coordinates are present.
    has_latitude = pl.col("latitude").is_not_null()
    has_longitude = pl.col("longitude").is_not_null()
    located = facilities.filter(has_latitude & has_longitude)

    # points_from_xy takes x (longitude) first, then y (latitude).
    points = gpd.points_from_xy(located["longitude"], located["latitude"])
    return gpd.GeoDataFrame(located.to_pandas(), geometry=points, crs="EPSG:4326")

find_nearest_gis_feature(facilities_gdf, targets, distance_col='distance_m')

For each facility, finds the nearest target GIS feature using sjoin_nearest.

Both inputs are reprojected to a metric CRS for accurate distance calculation.

Parameters:

Name Type Description Default
facilities_gdf GeoDataFrame

GeoDataFrame of facilities (in any CRS).

required
targets GeoDataFrame

GeoDataFrame of target features (in any CRS).

required
distance_col str

Name of the output distance column.

'distance_m'

Returns:

Type Description
GeoDataFrame

GeoDataFrame with facility rows joined to their nearest target and a distance column in meters. Preserves the original facility index order.

Source code in src/nemdb/geodata/matching.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def find_nearest_gis_feature(
    facilities_gdf: gpd.GeoDataFrame,
    targets: gpd.GeoDataFrame,
    distance_col: str = "distance_m",
) -> gpd.GeoDataFrame:
    """
    For each facility, finds the nearest target GIS feature using sjoin_nearest.

    Both inputs are reprojected to METRIC_CRS before the join so the distance
    is computed in that CRS's units, then the result is projected back to the
    CRS of ``facilities_gdf``.

    Args:
        facilities_gdf: GeoDataFrame of facilities (in any CRS). Its index
            should be unique: rows sharing an index label are treated as the
            same facility and collapsed to the first one.
        targets: GeoDataFrame of target features (in any CRS).
        distance_col: Name of the output distance column.

    Returns:
        GeoDataFrame with one row per facility index label, joined to its
        nearest target, plus a distance column (meters, assuming METRIC_CRS
        is a meter-based projection). The facility index, including its name
        and any MultiIndex levels, is kept unchanged and in its original order.
    """
    facilities_metric = facilities_gdf.to_crs(METRIC_CRS)
    targets_metric = targets.to_crs(METRIC_CRS)

    result = gpd.sjoin_nearest(
        facilities_metric,
        targets_metric,
        how="left",
        distance_col=distance_col,
        lsuffix="facility",
        rsuffix="target",
    )

    # When several targets are equidistant, sjoin_nearest emits one row per
    # tie, all carrying the same left (facility) index label. Keep the first
    # row per label. Masking on the index, rather than round-tripping through
    # reset_index/set_index, leaves the index name and MultiIndex levels
    # untouched and does not depend on the truthiness of a column label.
    result = result[~result.index.duplicated(keep="first")]

    return result.to_crs(facilities_gdf.crs)

find_nearest_powerstation(facilities, powerstations)

For each facility, finds the nearest powerstation.

Parameters:

Name Type Description Default
facilities DataFrame

Polars DataFrame of facilities.

required
powerstations GeoDataFrame

GeoPandas GeoDataFrame of powerstations.

required

Returns:

Type Description
DataFrame

Polars DataFrame containing facilities joined with their nearest powerstation and the distance in meters.

Source code in src/nemdb/geodata/matching.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def find_nearest_powerstation(
    facilities: pl.DataFrame, powerstations: gpd.GeoDataFrame
) -> pl.DataFrame:
    """
    Attach the nearest powerstation to every facility.

    Facilities are converted to points, both layers are projected to
    METRIC_CRS, and a left nearest-neighbour spatial join is run. Overlapping
    column names receive the suffixes "facility" and "station".

    Args:
        facilities: Polars DataFrame of facilities.
        powerstations: GeoPandas GeoDataFrame of powerstations.

    Returns:
        Polars DataFrame containing facilities joined with their nearest
        powerstation and a 'distance_to_station_m' column; the geometry
        column is removed.
    """
    # Project both layers to the same metric CRS before measuring distance.
    projected_facilities = facilities_to_gdf(facilities).to_crs(METRIC_CRS)
    projected_stations = powerstations.to_crs(METRIC_CRS)

    joined = gpd.sjoin_nearest(
        projected_facilities,
        projected_stations,
        how="left",
        distance_col="distance_to_station_m",
        lsuffix="facility",
        rsuffix="station",
    )

    # Strip the geometry column before handing the table to Polars.
    without_geometry = pd.DataFrame(joined).drop(columns=["geometry"])
    return pl.from_pandas(without_geometry)

match_facilities_to_gis(facilities=None, powerstations=None, substations=None, source='pooch')

Match OpenNEM facilities to the nearest GIS power station or substation.

Uses a two-pass approach: first matches against powerstations, then against substations, and keeps whichever match is closer for each facility.

Parameters:

Name Type Description Default
facilities DataFrame | None

OpenNEM facilities DataFrame. When None the data is fetched automatically according to source.

None
powerstations GeoDataFrame | None

GA powerstations GeoDataFrame (fetched if None).

None
substations GeoDataFrame | None

GA substations GeoDataFrame (fetched if None).

None
source Literal['pooch', 'api']

How to obtain facilities when facilities is None. "pooch" (default) downloads the pre-built parquet from the GitHub release — no account required. "api" calls the OpenElectricity API via read_facilities() (requires an API key).

'pooch'

Returns:

Type Description
GeoDataFrame

GeoDataFrame with facility info plus gis_name, match_type ("powerstation" or "substation"), and distance_m.

Source code in src/nemdb/geodata/matching.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def match_facilities_to_gis(
    facilities: pl.DataFrame | None = None,
    powerstations: gpd.GeoDataFrame | None = None,
    substations: gpd.GeoDataFrame | None = None,
    source: Literal["pooch", "api"] = "pooch",
) -> gpd.GeoDataFrame:
    """Match OpenNEM facilities to the nearest GIS power station or substation.

    Two nearest-neighbour passes are run, one against powerstations and one
    against substations, and for every facility the closer of the two hits is
    kept. When both distances are equal the powerstation wins.

    Args:
        facilities: OpenNEM facilities DataFrame.  When *None* the data is
            fetched automatically according to ``source``.
        powerstations: GA powerstations GeoDataFrame (fetched if None).
        substations: GA substations GeoDataFrame (fetched if None).
        source: How to obtain facilities when ``facilities`` is *None*.
            ``"pooch"`` (default) downloads the pre-built parquet from the
            GitHub release — no account required.  ``"api"`` calls the
            OpenElectricity API via :func:`read_facilities` (requires an API
            key).

    Returns:
        GeoDataFrame with facility info plus ``gis_name``, ``match_type``
        ("powerstation" or "substation"), and ``distance_m``.
    """
    # Fill in whichever inputs the caller left out.
    if facilities is None:
        facilities = (
            asyncio.run(read_facilities(network_id=["NEM"]))
            if source == "api"
            else read_facilities_cached()
        )
    if powerstations is None:
        powerstations = read_major_powerstations().query("state in @NEM_STATES")
    if substations is None:
        substations = read_substations().query("state in @NEM_STATES")

    fac_gdf = facilities_to_gdf(_aggregate_facilities(facilities))

    # Pass 1: nearest powerstation per facility.
    station_targets = powerstations[["name", "geometry"]].rename(
        columns={"name": "gis_name"}
    )
    nearest_station = (
        find_nearest_gis_feature(fac_gdf, station_targets, distance_col="distance_m")
        .rename(columns={"gis_name": "ps_gis_name", "distance_m": "ps_distance_m"})
        .reset_index(drop=True)
    )

    # Pass 2: nearest substation per facility.
    substation_targets = substations[["name", "geometry"]].rename(
        columns={"name": "gis_name"}
    )
    nearest_substation = (
        find_nearest_gis_feature(fac_gdf, substation_targets, distance_col="distance_m")
        .rename(columns={"gis_name": "sub_gis_name", "distance_m": "sub_distance_m"})
        .reset_index(drop=True)
    )

    # Both passes are copied across positionally (raw arrays), so the output
    # frame gets a fresh positional index as well.
    combined = fac_gdf.copy().reset_index(drop=True)
    scratch = {
        "ps_gis_name": nearest_station["ps_gis_name"].values,
        "ps_distance_m": nearest_station["ps_distance_m"].values,
        "sub_gis_name": nearest_substation["sub_gis_name"].values,
        "sub_distance_m": nearest_substation["sub_distance_m"].values,
    }
    for column, values in scratch.items():
        combined[column] = values

    # Pick the closer candidate per facility; ties go to the powerstation.
    station_wins = combined["ps_distance_m"] <= combined["sub_distance_m"]

    combined["gis_name"] = combined["ps_gis_name"].where(
        station_wins, combined["sub_gis_name"]
    )
    combined["match_type"] = "powerstation"
    combined.loc[~station_wins, "match_type"] = "substation"
    combined["distance_m"] = combined["ps_distance_m"].where(
        station_wins, combined["sub_distance_m"]
    )

    # The per-pass scratch columns are not part of the returned frame.
    return combined.drop(columns=list(scratch))