-
Notifications
You must be signed in to change notification settings - Fork 158
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
closes #759 This PR adds `geoseries.distance`, computing distances between two geoseries. Benchmarking distance API is a complicated task. Below I present the benchmark of a simplest case: distance between a pair of point geoseries. geopandas is also quite fast when computing simple point distances, througput peaks at 1e5 and gets surpassed by cuspatial for larger data sizes. Both geopandas and cuspatial sees performance drop when dealing with index alignments.  TODO: - [x] Support distance to a single shapely object. - [x] Benchmark against geopandas. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Mark Harris (https://github.com/harrism) - H. Thomson Comer (https://github.com/thomcom) URL: #1231
- Loading branch information
Showing
5 changed files
with
474 additions
and
1 deletion.
There are no files selected for viewing
202 changes: 202 additions & 0 deletions
202
python/cuspatial/cuspatial/core/binops/distance_dispatch.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
import cudf | ||
from cudf.core.column import arange, full | ||
|
||
from cuspatial._lib.distance import ( | ||
pairwise_linestring_distance, | ||
pairwise_linestring_polygon_distance, | ||
pairwise_point_distance, | ||
pairwise_point_linestring_distance, | ||
pairwise_point_polygon_distance, | ||
pairwise_polygon_distance, | ||
) | ||
from cuspatial._lib.types import CollectionType | ||
from cuspatial.core._column.geometa import Feature_Enum | ||
from cuspatial.utils.column_utils import ( | ||
contains_only_linestrings, | ||
contains_only_multipoints, | ||
contains_only_points, | ||
contains_only_polygons, | ||
) | ||
|
||
# Dispatch table keyed by (lhs feature type, rhs feature type). Each value is
# a tuple of (function, reverse, point_collection_types).
#
# reverse
#     When True, the two column arguments must be swapped before calling
#     `function`. Some functions only accept their operands in one order: for
#     example, `point_linestring_distance` requires the point column first
#     and the linestring column second, so a (linestring, point) pair has to
#     be swapped. The result is unaffected because Cartesian distance is
#     symmetric.
#
# point_collection_types
#     The collection type of each point-like operand. For example, if the
#     first argument is a `MultiPoint` and the second is a `Point`, the value
#     is (`CollectionType.MULTI`, `CollectionType.SINGLE`). It is only needed
#     for point/multipoint columns, because the cython APIs handle both point
#     and multipoint columns based on their collection types; it is empty
#     when neither operand is point-like.

type_to_func = {
    # --- lhs: POINT ---
    (Feature_Enum.POINT, Feature_Enum.POINT): (
        pairwise_point_distance,
        False,
        (CollectionType.SINGLE, CollectionType.SINGLE),
    ),
    (Feature_Enum.POINT, Feature_Enum.MULTIPOINT): (
        pairwise_point_distance,
        False,
        (CollectionType.SINGLE, CollectionType.MULTI),
    ),
    (Feature_Enum.POINT, Feature_Enum.LINESTRING): (
        pairwise_point_linestring_distance,
        False,
        (CollectionType.SINGLE,),
    ),
    (Feature_Enum.POINT, Feature_Enum.POLYGON): (
        pairwise_point_polygon_distance,
        False,
        (CollectionType.SINGLE,),
    ),
    # --- lhs: LINESTRING ---
    (Feature_Enum.LINESTRING, Feature_Enum.POINT): (
        pairwise_point_linestring_distance,
        True,
        (CollectionType.SINGLE,),
    ),
    (Feature_Enum.LINESTRING, Feature_Enum.MULTIPOINT): (
        pairwise_point_linestring_distance,
        True,
        (CollectionType.MULTI,),
    ),
    (Feature_Enum.LINESTRING, Feature_Enum.LINESTRING): (
        pairwise_linestring_distance,
        False,
        (),
    ),
    (Feature_Enum.LINESTRING, Feature_Enum.POLYGON): (
        pairwise_linestring_polygon_distance,
        False,
        (),
    ),
    # --- lhs: POLYGON ---
    (Feature_Enum.POLYGON, Feature_Enum.POINT): (
        pairwise_point_polygon_distance,
        True,
        (CollectionType.SINGLE,),
    ),
    (Feature_Enum.POLYGON, Feature_Enum.MULTIPOINT): (
        pairwise_point_polygon_distance,
        True,
        (CollectionType.MULTI,),
    ),
    (Feature_Enum.POLYGON, Feature_Enum.LINESTRING): (
        pairwise_linestring_polygon_distance,
        True,
        (),
    ),
    (Feature_Enum.POLYGON, Feature_Enum.POLYGON): (
        pairwise_polygon_distance,
        False,
        (),
    ),
    # --- lhs: MULTIPOINT ---
    (Feature_Enum.MULTIPOINT, Feature_Enum.POINT): (
        pairwise_point_distance,
        False,
        (CollectionType.MULTI, CollectionType.SINGLE),
    ),
    (Feature_Enum.MULTIPOINT, Feature_Enum.MULTIPOINT): (
        pairwise_point_distance,
        False,
        (CollectionType.MULTI, CollectionType.MULTI),
    ),
    (Feature_Enum.MULTIPOINT, Feature_Enum.LINESTRING): (
        pairwise_point_linestring_distance,
        False,
        (CollectionType.MULTI,),
    ),
    (Feature_Enum.MULTIPOINT, Feature_Enum.POLYGON): (
        pairwise_point_polygon_distance,
        False,
        (CollectionType.MULTI,),
    ),
}
|
||
|
||
class DistanceDispatch:
    """Dispatch a pairwise distance computation between two GeoSeries.

    The constructor optionally aligns the operands, drops every row where
    either side is null, and determines the single geometry type of each
    side. Calling the instance looks up the matching function in
    `type_to_func`, runs it on the non-null rows, and scatters the distances
    back into a full-length result in which the dropped rows hold NaN.

    Parameters
    ----------
    lhs, rhs
        The two GeoSeries to measure between. Each must contain a single
        geometry type (point, multipoint, linestring or polygon).
    align : bool
        If True, `lhs.align(rhs)` is applied before computing distances.
        If False, the operands are used as given.

    Raises
    ------
    NotImplementedError
        If either operand contains mixed geometry types.
    """

    def __init__(self, lhs, rhs, align):
        if align:
            self._lhs, self._rhs = lhs.align(rhs)
        else:
            self._lhs, self._rhs = lhs, rhs

        self._align = align
        # Capture the index *after* the optional alignment and *before* the
        # null rows are dropped, so that its length always matches
        # `_non_null_mask` (both describe the same rows). Using the
        # pre-alignment `lhs.index` breaks the scatter in `__call__` whenever
        # alignment yields more rows than `lhs` has.
        # NOTE(review): relies on `GeoSeries.align` being an outer join that
        # can lengthen the operands -- confirm against cuspatial docs.
        self._res_index = self._lhs.index
        self._non_null_mask = self._lhs.notna() & self._rhs.notna()
        self._lhs = self._lhs[self._non_null_mask]
        self._rhs = self._rhs[self._non_null_mask]

        # TODO: This test is expensive, so would be nice if we can cache it
        self._lhs_type = self._determine_series_type(self._lhs)
        self._rhs_type = self._determine_series_type(self._rhs)

    def _determine_series_type(self, s):
        """Return the `Feature_Enum` member for the single geometry type of `s`.

        Raises `NotImplementedError` if none of the "contains only" checks
        passes, i.e. `s` is treated as having mixed geometry types.
        """
        if contains_only_multipoints(s):
            return Feature_Enum.MULTIPOINT
        if contains_only_points(s):
            return Feature_Enum.POINT
        if contains_only_linestrings(s):
            return Feature_Enum.LINESTRING
        if contains_only_polygons(s):
            return Feature_Enum.POLYGON
        raise NotImplementedError(
            "Geoseries with mixed geometry types are not supported"
        )

    def _column(self, s, typ):
        """Return the geometry column of `s` that corresponds to `typ`.

        Returns None if `typ` is not one of the four handled feature types.
        """
        if typ == Feature_Enum.POINT:
            return s.points.column()
        elif typ == Feature_Enum.MULTIPOINT:
            return s.multipoints.column()
        elif typ == Feature_Enum.LINESTRING:
            return s.lines.column()
        elif typ == Feature_Enum.POLYGON:
            return s.polygons.column()

    @property
    def _lhs_column(self):
        """Geometry column of the (non-null) left operand."""
        return self._column(self._lhs, self._lhs_type)

    @property
    def _rhs_column(self):
        """Geometry column of the (non-null) right operand."""
        return self._column(self._rhs, self._rhs_type)

    def __call__(self):
        """Compute the distances and return them as a float64 `cudf.Series`.

        Rows where either operand was null hold NaN. With `align=False` the
        result carries the index of `lhs`; with `align=True` no explicit
        index is passed to the `cudf.Series` constructor.
        """
        func, reverse, collection_types = type_to_func[
            (self._lhs_type, self._rhs_type)
        ]
        first, second = self._lhs_column, self._rhs_column
        if reverse:
            # The selected function expects its operands in the opposite
            # order from how they were supplied.
            first, second = second, first
        dist = func(*collection_types, first, second)

        # Rows with misaligned indices contain nan. Here we scatter the
        # distance values to the correct indices.
        n_rows = len(self._res_index)
        result = full(
            n_rows,
            float("nan"),
            dtype="float64",
        )
        # Positions of the rows that survived the null filter.
        scatter_map = arange(n_rows, dtype="int32").apply_boolean_mask(
            self._non_null_mask
        )

        result[scatter_map] = dist

        # If `align==False`, geopandas preserves lhs index.
        index = None if self._align else self._res_index

        return cudf.Series(result, index=index, nan_as_null=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.