diff --git a/python/sedona/spark/geopandas/base.py b/python/sedona/spark/geopandas/base.py
index 7db50e7f8f..4fc14f8f02 100644
--- a/python/sedona/spark/geopandas/base.py
+++ b/python/sedona/spark/geopandas/base.py
@@ -3049,6 +3049,80 @@ def dwithin(self, other, distance, align=None):
         """
         return _delegate_to_geometry_column("dwithin", self, other, distance, align)
 
+    def clip_by_rect(self, xmin, ymin, xmax, ymax):
+        """Returns a ``GeoSeries`` of the portions of geometry within the
+        given rectangle.
+
+        The geometry is clipped to the rectangle defined by the given
+        coordinates. Geometries that do not intersect the rectangle are
+        returned as empty polygons (``POLYGON EMPTY``).
+
+        .. note::
+            This implementation uses ``ST_Intersection`` with a rectangle
+            envelope, which may produce slightly different results from
+            geopandas' ``clip_by_rect`` in edge cases:
+
+            - Non-intersecting geometries are returned as ``POLYGON EMPTY``,
+              whereas geopandas returns ``GEOMETRYCOLLECTION EMPTY``.
+            - Points on the boundary of the rectangle are considered
+              intersecting and are returned unchanged, whereas geopandas
+              returns ``GEOMETRYCOLLECTION EMPTY`` for boundary-only
+              intersections.
+
+        Parameters
+        ----------
+        xmin : float
+            Minimum x value of the rectangle.
+        ymin : float
+            Minimum y value of the rectangle.
+        xmax : float
+            Maximum x value of the rectangle.
+        ymax : float
+            Maximum y value of the rectangle.
+
+        Returns
+        -------
+        GeoSeries
+
+        Raises
+        ------
+        TypeError
+            If any of ``xmin``, ``ymin``, ``xmax`` or ``ymax`` is not an
+            ``int``, ``float`` or NumPy integer/floating scalar.
+
+        Examples
+        --------
+        >>> from sedona.spark.geopandas import GeoSeries
+        >>> from shapely.geometry import Polygon, LineString, Point
+        >>> s = GeoSeries(
+        ...     [
+        ...         Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
+        ...         LineString([(0, 0), (2, 2)]),
+        ...         Point(0.5, 0.5),
+        ...     ],
+        ... )
+
+        >>> s.clip_by_rect(0, 0, 1, 1)
+        0    POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))
+        1    LINESTRING (0 0, 1 1)
+        2    POINT (0.5 0.5)
+        dtype: geometry
+
+        Geometries that do not intersect the rectangle are returned as
+        empty:
+
+        >>> GeoSeries([Point(5, 5)]).clip_by_rect(0, 0, 1, 1)
+        0    POLYGON EMPTY
+        dtype: geometry
+
+        See also
+        --------
+        GeoSeries.intersection
+        """
+        return _delegate_to_geometry_column(
+            "clip_by_rect", self, xmin, ymin, xmax, ymax
+        )
+
     def difference(self, other, align=None):
         """Returns a ``GeoSeries`` of the points in each aligned geometry that are not in
         `other`.
diff --git a/python/sedona/spark/geopandas/geodataframe.py b/python/sedona/spark/geopandas/geodataframe.py
index 93d8dee076..68429c93e2 100644
--- a/python/sedona/spark/geopandas/geodataframe.py
+++ b/python/sedona/spark/geopandas/geodataframe.py
@@ -51,6 +51,7 @@
     "_to_geopandas",
     "contains",
     "contains_properly",
+    "clip_by_rect",
     "convex_hull",
     "count_coordinates",
     "count_geometries",
diff --git a/python/sedona/spark/geopandas/geoseries.py b/python/sedona/spark/geopandas/geoseries.py
index 60b9fa29bf..088c48ebee 100644
--- a/python/sedona/spark/geopandas/geoseries.py
+++ b/python/sedona/spark/geopandas/geoseries.py
@@ -66,6 +66,7 @@
     "convex_hull",
     "explode",
     "clip",
+    "clip_by_rect",
     "from_shapely",
     "count_coordinates",
     "count_geometries",
@@ -834,6 +835,27 @@ def dwithin(self, other, distance, align=None):
             default_val=False,
         )
 
+    def clip_by_rect(self, xmin, ymin, xmax, ymax) -> "GeoSeries":
+        # Only plain Python / NumPy real scalars are accepted. ``bool`` is a
+        # subclass of ``int`` and therefore also passes this check.
+        if not all(
+            isinstance(val, (int, float, np.integer, np.floating))
+            for val in [xmin, ymin, xmax, ymax]
+        ):
+            raise TypeError(
+                "clip_by_rect only accepts scalar numeric values for xmin/ymin/xmax/ymax"
+            )
+        # Build the clipping rectangle from the float-cast bounds and
+        # intersect it with the geometry column.
+        rect = stc.ST_PolygonFromEnvelope(
+            float(xmin), float(ymin), float(xmax), float(ymax)
+        )
+        spark_expr = stf.ST_Intersection(self.spark.column, rect)
+        return self._query_geometry_column(
+            spark_expr,
+            returns_geom=True,
+        )
+
     def difference(self, other, align=None) -> "GeoSeries":
         other_series, extended = self._make_series_of_val(other)
         align = False if extended else align
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index b61e8e5056..6e82be52f8 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -616,6 +616,38 @@ def test_to_arrow(self):
     def test_clip(self):
         pass
 
+    def test_clip_by_rect(self):
+        # Inputs cover an overlapping polygon, line and point, a point outside
+        # the rectangle, and a missing (None) geometry.
+        s = GeoSeries(
+            [
+                Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
+                LineString([(0, 0), (2, 2)]),
+                Point(0.5, 0.5),
+                Point(5, 5),
+                None,
+            ],
+        )
+        result = s.clip_by_rect(0, 0, 1, 1)
+        expected = gpd.GeoSeries(
+            [
+                Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)]),
+                LineString([(0, 0), (1, 1)]),
+                Point(0.5, 0.5),
+                Polygon(),  # Sedona returns POLYGON EMPTY for non-intersecting
+                None,
+            ]
+        )
+        self.check_sgpd_equals_gpd(result, expected)
+
+        # Check that GeoDataFrame works too
+        df_result = s.to_geoframe().clip_by_rect(0, 0, 1, 1)
+        self.check_sgpd_equals_gpd(df_result, expected)
+
+        # Test invalid input types
+        with pytest.raises(TypeError):
+            s.clip_by_rect("a", 0, 1, 1)
+
     def test_geom_type(self):
         geoseries = sgpd.GeoSeries(
             [
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index 3138e8c267..d86e3a6bb5 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -495,6 +495,20 @@ def test_to_arrow(self):
     def test_clip(self):
         pass
 
+    def test_clip_by_rect(self):
+        # Use rect (0.3, 0.3, 1.7, 1.7) so no test-geometry vertex or hole
+        # coordinate (0, 0.1, 0.2, 1, 2, …) lands on a rectangle boundary.
+        # This avoids boundary-handling differences between JTS and GEOS.
+        for geom in self.geoms:
+            # JTS throws TopologyException on invalid geometries (e.g.
+            # self-intersecting polygons) during ST_Intersection, while
+            # GEOS handles them gracefully.
+            if not gpd.GeoSeries(geom).is_valid.all():
+                continue
+            sgpd_result = GeoSeries(geom).clip_by_rect(0.3, 0.3, 1.7, 1.7)
+            gpd_result = gpd.GeoSeries(geom).clip_by_rect(0.3, 0.3, 1.7, 1.7)
+            self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
     def test_geom_type(self):
         for geom in self.geoms:
             # Sedona converts it to LineString, so the outputs will be different