 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <cstdio>

+#include <array>
+#include <cstring>
 #include <filesystem>
 #include <fstream>
+#include <mutex>
 #include <string>
+#include <variant>
 #include <vector>

@@ -35,20 +37,55 @@ using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
 using executorch::runtime::BackendExecutionContext;
 using executorch::runtime::BackendInitContext;
+using executorch::runtime::BackendOption;
+using executorch::runtime::BackendOptionContext;
 using executorch::runtime::CompileSpec;
 using executorch::runtime::DelegateHandle;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
+using executorch::runtime::kMaxOptionValueLength;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::NamedDataMap;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;

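+// Backend option key: the value names a single method whose GPU outputs
+// should stay resident on device; execute() skips the device-to-host copy
+// for that method.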
+namespace {
+constexpr char kSkipCopyOutputToCpuForMethod[] =
+    "skip_copy_output_to_cpu_for_method";
+} // namespace
+
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+
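+  // Record the method name supplied via set_option(); guarded by a mutex so
+  // option updates and execute() can run safely on different threads.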
+  void set_skip_copy_method(
+      const std::array<char, kMaxOptionValueLength>& raw) {
+    std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+    skip_copy_method_ = std::string(raw.data());
+  }
+
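+  // Snapshot the stored method name into the fixed-size buffer used by
+  // BackendOption values; std::snprintf truncates names longer than
+  // kMaxOptionValueLength - 1 characters.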
+  std::array<char, kMaxOptionValueLength> get_skip_copy_method_as_option()
+      const {
+    std::array<char, kMaxOptionValueLength> out{};
+    std::string value;
+    {
+      std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+      value = skip_copy_method_;
+    }
+    std::snprintf(out.data(), out.size(), "%s", value.c_str());
+    return out;
+  }
+
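+  // Returns true only when a non-empty method name matches the configured
+  // skip-copy method.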
+  bool should_skip_copy_for_method(const std::string& method_name) const {
+    if (method_name.empty()) {
+      return false;
+    }
+    std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+    return method_name == skip_copy_method_;
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -91,6 +128,38 @@ class ET_EXPERIMENTAL CudaBackend final
     return 1;
   }

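+  // Accepts runtime options. Only kSkipCopyOutputToCpuForMethod is
+  // recognized; other keys are ignored, and a non-string value for this key
+  // fails with InvalidArgument.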
+  Error set_option(
+      ET_UNUSED BackendOptionContext& context,
+      const executorch::runtime::Span<BackendOption>& backend_options)
+      override {
+    for (const auto& option : backend_options) {
+      if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) == 0) {
+        if (auto* val = std::get_if<std::array<char, kMaxOptionValueLength>>(
+                &option.value)) {
+          set_skip_copy_method(*val);
+        } else {
+          ET_LOG(
+              Error,
+              "Option %s must be a method name string.",
+              kSkipCopyOutputToCpuForMethod);
+          return Error::InvalidArgument;
+        }
+      }
+    }
+    return Error::Ok;
+  }
+
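+  // Fills in the current value for any requested option whose key matches
+  // kSkipCopyOutputToCpuForMethod; unrecognized keys are left untouched.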
+  Error get_option(
+      ET_UNUSED BackendOptionContext& context,
+      executorch::runtime::Span<BackendOption>& backend_options) override {
+    for (auto& option : backend_options) {
+      if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) == 0) {
+        option.value = get_skip_copy_method_as_option();
+      }
+    }
+    return Error::Ok;
+  }
+
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
@@ -159,6 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
     handle->so_handle = lib_handle;
     handle->so_path = so_path.string();
+    // Remember which method this handle serves so execute() can check it
+    // against the configured skip-copy method.
+    handle->method_name = method_name;

     // Load function pointers specific to this handle's shared library
     ET_CHECK_OK_OR_RETURN_ERROR(
@@ -303,18 +373,26 @@ class ET_EXPERIMENTAL CudaBackend final
303373 " AOTInductorModelContainerRun failed with error code %d" ,
304374 error);
305375
-    // Copy GPU output results back to CPU output tensors
-    for (int i = 0; i < n_outputs; i++) {
-      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      // For DYNAMIC_BOUND tensors we try to resize
-      ET_CHECK_OK_OR_RETURN_ERROR(
-          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-          "Error resizing tensor at output index %d",
-          i);
-      ET_CHECK_OK_OR_RETURN_ERROR(
-          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-          "Failed to copy GPU output %d back to CPU",
-          i);
+    const bool copy_outputs =
+        !should_skip_copy_for_method(handle->method_name);
+
+    if (copy_outputs) {
+      // Copy GPU output results back to CPU output tensors
+      for (int i = 0; i < n_outputs; i++) {
+        auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+        // For DYNAMIC_BOUND tensors we try to resize
+        ET_CHECK_OK_OR_RETURN_ERROR(
+            resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+            "Error resizing tensor at output index %d",
+            i);
+        ET_CHECK_OK_OR_RETURN_ERROR(
+            aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+            "Failed to copy GPU output %d back to CPU",
+            i);
+      }
+    } else {
+      // Skip the device-to-host copy: alias the GPU-resident tensors into
+      // the output EValues so the caller receives device memory directly.
+      for (int i = 0; i < n_outputs; i++) {
+        args[i + n_inputs]->toTensor() = *gpu_outputs[i];
+      }
     }

     return Error::Ok;
@@ -365,6 +443,10 @@ class ET_EXPERIMENTAL CudaBackend final
     delete handle;
     clear_all_tensors();
   }
+
+ private:
+  mutable std::mutex skip_copy_method_mutex_;
+  std::string skip_copy_method_;
 };

 } // namespace executorch::backends::cuda
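
A minimal sketch of how a client might flip this option at runtime. It assumes `BackendOption::key` is a fixed-size char buffer, that `BackendOptionContext` is default-constructible, and that `backend` points at the registered CudaBackend instance obtained from the backend registry; those access details are illustrative and not part of this diff:

  using executorch::runtime::BackendOption;
  using executorch::runtime::BackendOptionContext;
  using executorch::runtime::Error;
  using executorch::runtime::kMaxOptionValueLength;
  using executorch::runtime::Span;

  // Payload: the method whose outputs should stay GPU-resident.
  std::array<char, kMaxOptionValueLength> method{};
  std::snprintf(method.data(), method.size(), "%s", "forward");

  BackendOption option{};
  // Assumes `key` is a char array; adjust if your version stores it
  // differently.
  std::snprintf(option.key, sizeof(option.key), "%s",
                "skip_copy_output_to_cpu_for_method");
  option.value = method;  // must be the string-typed variant alternative

  BackendOptionContext context;
  Span<BackendOption> options(&option, 1);
  // `backend` is assumed to be the CudaBackend fetched from the registry.
  Error err = backend->set_option(context, options);
  // err == Error::Ok on success; outputs of "forward" now stay on device.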