Skip to content

Commit

Permalink
Add geoseries.distance (#1231)
Browse files Browse the repository at this point in the history
closes #759
This PR adds `geoseries.distance`, computing distances between two geoseries.

Benchmarking distance API is a complicated task. Below I present the benchmark of a simplest case: distance between a pair of point geoseries. geopandas is also quite fast when computing simple point distances, througput peaks at 1e5 and gets surpassed by cuspatial for larger data sizes. Both geopandas and cuspatial sees performance drop when dealing with index alignments.

![image](https://github.com/rapidsai/cuspatial/assets/13521008/dc0772f4-2b35-41da-883c-39ba4e731f6a)


TODO:
- [x] Support distance to a single shapely object.
- [x] Benchmark against geopandas.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - H. Thomson Comer (https://github.com/thomcom)

URL: #1231
  • Loading branch information
isVoid authored Jul 31, 2023
1 parent e4f5ac0 commit 8525f6b
Show file tree
Hide file tree
Showing 5 changed files with 474 additions and 1 deletion.
202 changes: 202 additions & 0 deletions python/cuspatial/cuspatial/core/binops/distance_dispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import cudf
from cudf.core.column import arange, full

from cuspatial._lib.distance import (
pairwise_linestring_distance,
pairwise_linestring_polygon_distance,
pairwise_point_distance,
pairwise_point_linestring_distance,
pairwise_point_polygon_distance,
pairwise_polygon_distance,
)
from cuspatial._lib.types import CollectionType
from cuspatial.core._column.geometa import Feature_Enum
from cuspatial.utils.column_utils import (
contains_only_linestrings,
contains_only_multipoints,
contains_only_points,
contains_only_polygons,
)

# Maps from type combinations to a tuple of (function, reverse,
# point_collection_types).
#
# If reverse is True, the arguments need to be swapped.
# Due to the way the functions are written, certain combinations of types
# requires that the arguments be swapped. For example,
# `point_linestring_distance` requires that the first argument be a point and
# the second argument be a linestring. In this case, when lhs is a linestring
# and rhs is a point, the arguments need to be swapped. The results holds true
# thanks to that cartesian distance is symmetric.
#
# `point_collection_types` is a tuple of the types of the point column type.
# For example, if the first argument is a `MultiPoint` and the second is a
# `Point`, then the `point_collection_types` is (`CollectionType.MULTI`,
# `CollectionType.SINGLE`). They are only needed for point/multipoint columns,
# because the cython APIs are designed to handle both point and multipoint
# columns based on their collection types.

type_to_func = {
(Feature_Enum.POINT, Feature_Enum.POINT): (
pairwise_point_distance,
False,
(CollectionType.SINGLE, CollectionType.SINGLE),
),
(Feature_Enum.POINT, Feature_Enum.MULTIPOINT): (
pairwise_point_distance,
False,
(CollectionType.SINGLE, CollectionType.MULTI),
),
(Feature_Enum.POINT, Feature_Enum.LINESTRING): (
pairwise_point_linestring_distance,
False,
(CollectionType.SINGLE,),
),
(Feature_Enum.POINT, Feature_Enum.POLYGON): (
pairwise_point_polygon_distance,
False,
(CollectionType.SINGLE,),
),
(Feature_Enum.LINESTRING, Feature_Enum.POINT): (
pairwise_point_linestring_distance,
True,
(CollectionType.SINGLE,),
),
(Feature_Enum.LINESTRING, Feature_Enum.MULTIPOINT): (
pairwise_point_linestring_distance,
True,
(CollectionType.MULTI,),
),
(Feature_Enum.LINESTRING, Feature_Enum.LINESTRING): (
pairwise_linestring_distance,
False,
(),
),
(Feature_Enum.LINESTRING, Feature_Enum.POLYGON): (
pairwise_linestring_polygon_distance,
False,
(),
),
(Feature_Enum.POLYGON, Feature_Enum.POINT): (
pairwise_point_polygon_distance,
True,
(CollectionType.SINGLE,),
),
(Feature_Enum.POLYGON, Feature_Enum.MULTIPOINT): (
pairwise_point_polygon_distance,
True,
(CollectionType.MULTI,),
),
(Feature_Enum.POLYGON, Feature_Enum.LINESTRING): (
pairwise_linestring_polygon_distance,
True,
(),
),
(Feature_Enum.POLYGON, Feature_Enum.POLYGON): (
pairwise_polygon_distance,
False,
(),
),
(Feature_Enum.MULTIPOINT, Feature_Enum.POINT): (
pairwise_point_distance,
False,
(CollectionType.MULTI, CollectionType.SINGLE),
),
(Feature_Enum.MULTIPOINT, Feature_Enum.MULTIPOINT): (
pairwise_point_distance,
False,
(CollectionType.MULTI, CollectionType.MULTI),
),
(Feature_Enum.MULTIPOINT, Feature_Enum.LINESTRING): (
pairwise_point_linestring_distance,
False,
(CollectionType.MULTI,),
),
(Feature_Enum.MULTIPOINT, Feature_Enum.POLYGON): (
pairwise_point_polygon_distance,
False,
(CollectionType.MULTI,),
),
}


class DistanceDispatch:
"""Dispatches distance operations between two GeoSeries"""

def __init__(self, lhs, rhs, align):
if align:
self._lhs, self._rhs = lhs.align(rhs)
else:
self._lhs, self._rhs = lhs, rhs

self._align = align
self._res_index = lhs.index
self._non_null_mask = self._lhs.notna() & self._rhs.notna()
self._lhs = self._lhs[self._non_null_mask]
self._rhs = self._rhs[self._non_null_mask]

# TODO: This test is expensive, so would be nice if we can cache it
self._lhs_type = self._determine_series_type(self._lhs)
self._rhs_type = self._determine_series_type(self._rhs)

def _determine_series_type(self, s):
"""Check single geometry type of `s`."""
if contains_only_multipoints(s):
typ = Feature_Enum.MULTIPOINT
elif contains_only_points(s):
typ = Feature_Enum.POINT
elif contains_only_linestrings(s):
typ = Feature_Enum.LINESTRING
elif contains_only_polygons(s):
typ = Feature_Enum.POLYGON
else:
raise NotImplementedError(
"Geoseries with mixed geometry types are not supported"
)
return typ

def _column(self, s, typ):
"""Get column of `s` based on `typ`."""
if typ == Feature_Enum.POINT:
return s.points.column()
elif typ == Feature_Enum.MULTIPOINT:
return s.multipoints.column()
elif typ == Feature_Enum.LINESTRING:
return s.lines.column()
elif typ == Feature_Enum.POLYGON:
return s.polygons.column()

@property
def _lhs_column(self):
return self._column(self._lhs, self._lhs_type)

@property
def _rhs_column(self):
return self._column(self._rhs, self._rhs_type)

def __call__(self):
func, reverse, collection_types = type_to_func[
(self._lhs_type, self._rhs_type)
]
if reverse:
dist = func(*collection_types, self._rhs_column, self._lhs_column)
else:
dist = func(*collection_types, self._lhs_column, self._rhs_column)

# Rows with misaligned indices contains nan. Here we scatter the
# distance values to the correct indices.
result = full(
len(self._res_index),
float("nan"),
dtype="float64",
)
scatter_map = arange(
len(self._res_index), dtype="int32"
).apply_boolean_mask(self._non_null_mask)

result[scatter_map] = dist

# If `align==False`, geopandas preserves lhs index.
index = None if self._align else self._res_index

return cudf.Series(result, index=index, nan_as_null=False)
92 changes: 92 additions & 0 deletions python/cuspatial/cuspatial/core/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Point,
Polygon,
)
from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry

import cudf
from cudf._typing import ColumnLike
Expand All @@ -27,6 +28,7 @@
import cuspatial.io.pygeoarrow as pygeoarrow
from cuspatial.core._column.geocolumn import ColumnType, GeoColumn
from cuspatial.core._column.geometa import Feature_Enum, GeoMeta
from cuspatial.core.binops.distance_dispatch import DistanceDispatch
from cuspatial.core.binpreds.binpred_dispatch import (
CONTAINS_DISPATCH,
CONTAINS_PROPERLY_DISPATCH,
Expand Down Expand Up @@ -1384,3 +1386,93 @@ def touches(self, other, align=True):
align=align
)
return predicate(self, other)

def isna(self):
"""Detect missing values."""

c = self._column._meta.input_types == Feature_Enum.NONE.value
return cudf.Series(c, index=self.index)

def notna(self):
"""Detect non-missing values."""

c = self._column._meta.input_types != Feature_Enum.NONE.value
return cudf.Series(c, index=self.index)

def distance(self, other, align=True):
"""Returns a `Series` containing the distance to aligned other.
The operation works on a 1-to-1 row-wise manner. See
`geopandas.GeoSeries.distance` documentation for details.
Parameters
----------
other
The GeoSeries (elementwise) or geometric object to find the
distance to.
align : bool, default True
If True, automatically aligns GeoSeries based on their indices.
If False, the order of the elements is preserved.
Returns
-------
Series (float)
Notes
-----
Unlike GeoPandas, this API currently only supports geoseries that
contain only single type geometries.
Examples
--------
>>> from shapely.geometry import Point
>>> point = GeoSeries([Point(0, 0)])
>>> point2 = GeoSeries([Point(1, 1)])
>>> print(point.distance(point2))
0 1.414214
dtype: float64
By default, geoseries are aligned before computing:
>>> from shapely.geometry import Point
>>> point = GeoSeries([Point(0, 0)])
>>> point2 = GeoSeries([Point(1, 1), Point(2, 2)])
>>> print(point.distance(point2))
0 1.414214
1 NaN
dtype: float64
This can be overridden by setting `align=False`:
>>> lines = GeoSeries([
LineString([(0, 0), (1, 1)]), LineString([(2, 2), (3, 3)])])
>>> polys = GeoSeries([
Polygon([(0, 0), (1, 1), (1, 0)]),
Polygon([(2, 2), (3, 3), (3, 2)])],
index=[1, 0])
>>> lines.distance(polys), align=False)
0 0.0
1 0.0
dtype: float64
>>> lines.distance(polys, align=True)
0 1.414214
1 1.414214
dtype: float64
"""

other_is_scalar = False
if issubclass(type(other), (BaseGeometry, BaseMultipartGeometry)):
other_is_scalar = True
other = GeoSeries([other] * len(self), index=self.index)

if not align:
if len(self) != len(other):
raise ValueError(
f"Lengths of inputs do not match. Left: {len(self)}, "
f"Right: {len(other)}"
)

res = DistanceDispatch(self, other, align)()
if other_is_scalar:
res.index = self.index
return res
2 changes: 1 addition & 1 deletion python/cuspatial/cuspatial/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def generator(n, distance_from_origin, radius=1.0):


@pytest.fixture
def multipolygon_generator():
def multipolygon_generator(polygon_generator):
"""Generator for multi complex polygons.
Usage: multipolygon_generator(n, max_per_multi)
"""
Expand Down
Loading

0 comments on commit 8525f6b

Please sign in to comment.