@@ -267,6 +267,34 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
         num_commits_after_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
         assert num_commits_after_push - num_commits_before_push > 1

+    def _wait_for_repo_ready(self, repo_id, max_wait=30):
+        """Wait for the repository to be in a consistent state after push operations.
+
+        This helper addresses race conditions where rapid successive push_to_hub calls
+        don't wait for the Hub's LFS object propagation between pushes, causing errors like:
+        "LFS pointer pointed to a file that does not exist"
+
+        Args:
+            repo_id: The repository ID to check.
+            max_wait: Maximum time in seconds to wait for repository readiness.
+
+        Raises:
+            TimeoutError: If the repository is not ready within max_wait seconds.
+        """
+        from huggingface_hub.errors import HfHubHTTPError
+
+        start_time = time.monotonic()
+        while (time.monotonic() - start_time) < max_wait:
+            try:
+                # Verify we can list files (repo is consistent)
+                self._api.list_repo_files(repo_id, repo_type="dataset", token=self._token)
+                # Small delay to ensure LFS objects are fully propagated
+                time.sleep(1)
+                return
+            except HfHubHTTPError:
+                time.sleep(1)
+        raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")
+
     def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
         ds2 = Dataset.from_dict({"x": list(range(100)), "y": list(range(100))})
@@ -278,6 +306,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)

+            # Wait for the Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name)
+
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
                 path = Path(tmp) / "datafile.txt"
@@ -292,6 +323,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                     token=self._token,
                 )

+            # Wait again before the second push
+            self._wait_for_repo_ready(ds_name)
+
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)

             # Ensure that there are two files on the repository that have the correct name
@@ -320,8 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):

         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
+
+            # Wait for the Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name_2)

             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -332,15 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )

-            local_ds.push_to_hub(ds_name, token=self._token)
+            # Wait again before the second push
+            self._wait_for_repo_ready(ds_name_2)
+
+            local_ds.push_to_hub(ds_name_2, token=self._token)

             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -350,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]

             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)

-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")

             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())