@@ -267,6 +267,34 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
         num_commits_after_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
         assert num_commits_after_push - num_commits_before_push > 1

+    def _wait_for_repo_ready(self, repo_id, max_wait=30):
+        """Wait for the repository to be in a consistent state after push operations.
+
+        This helper addresses race conditions where rapid successive push_to_hub calls
+        don't wait for the Hub's LFS object propagation between pushes, causing errors like:
+        "LFS pointer pointed to a file that does not exist"
+
+        Args:
+            repo_id: The repository ID to check.
+            max_wait: Maximum time in seconds to wait for repository readiness.
+
+        Raises:
+            TimeoutError: If the repository is not ready within max_wait seconds.
+        """
+        from huggingface_hub.errors import HfHubHTTPError
+
+        start_time = time.monotonic()
+        while (time.monotonic() - start_time) < max_wait:
+            try:
+                # Verify we can list files (repo is consistent)
+                self._api.list_repo_files(repo_id, repo_type="dataset", token=self._token)
+                # Small delay to ensure LFS objects are fully propagated
+                time.sleep(1)
+                return
+            except HfHubHTTPError:
+                time.sleep(1)
+        raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")
+
     def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
         ds2 = Dataset.from_dict({"x": list(range(100)), "y": list(range(100))})
@@ -278,6 +306,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)

+            # Wait for the Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name)
+
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
                 path = Path(tmp) / "datafile.txt"
@@ -292,6 +323,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                     token=self._token,
                 )

+            # Wait again before the second push
+            self._wait_for_repo_ready(ds_name)
+
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)

             # Ensure that there are two files on the repository that have the correct name
@@ -320,8 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):

         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
+
+            # Wait for the Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name_2)

             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -332,15 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )

-            local_ds.push_to_hub(ds_name, token=self._token)
+            # Wait again before the second push
+            self._wait_for_repo_ready(ds_name_2)
+
+            local_ds.push_to_hub(ds_name_2, token=self._token)

             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -350,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]

             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)

-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")

             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())