Skip to content

Commit d1df034

Browse files
committed
Avoid copying output from GPU to CPU
1 parent 33ec615 commit d1df034

File tree

5 files changed

+129
-14
lines changed

5 files changed

+129
-14
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
d03e90c2cd9048e6d9a75285c0355f033cd016fc
1+
de4f3c4978b4d36cc0bb8f87c6877a4a040d7ae7

backends/aoti/aoti_delegate_handle.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <executorch/runtime/core/error.h>
1212
#include <executorch/runtime/core/evalue.h>
13+
#include <string>
1314

1415
namespace executorch {
1516
namespace backends {
@@ -85,6 +86,7 @@ struct AOTIDelegateHandle {
8586
AOTInductorModelContainerHandle container_handle;
8687
void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
8788
// dependency
89+
std::string method_name;
8890

8991
// Function pointers specific to this handle's shared library
9092
AOTInductorModelContainerCreateWithDeviceFunc create_with_device;

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 94 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
1414
#include <cstdio>
1515

16+
#include <array>
1617
#include <filesystem>
1718
#include <fstream>
19+
#include <mutex>
1820
#include <string>
1921
#include <vector>
2022

@@ -35,20 +37,55 @@ using executorch::runtime::ArrayRef;
3537
using executorch::runtime::Backend;
3638
using executorch::runtime::BackendExecutionContext;
3739
using executorch::runtime::BackendInitContext;
40+
using executorch::runtime::BackendOption;
41+
using executorch::runtime::BackendOptionContext;
3842
using executorch::runtime::CompileSpec;
3943
using executorch::runtime::DelegateHandle;
4044
using executorch::runtime::Error;
4145
using executorch::runtime::EValue;
4246
using executorch::runtime::FreeableBuffer;
47+
using executorch::runtime::kMaxOptionValueLength;
4348
using executorch::runtime::MemoryAllocator;
4449
using executorch::runtime::NamedDataMap;
4550
using executorch::runtime::Result;
4651
using executorch::runtime::Span;
4752
using executorch::runtime::etensor::Tensor;
4853

54+
namespace {
55+
constexpr char kSkipCopyOutputToCpuForMethod[] =
56+
"skip_copy_output_to_cpu_for_method";
57+
}
58+
4959
class ET_EXPERIMENTAL CudaBackend final
5060
: public ::executorch::runtime::BackendInterface {
5161
private:
62+
63+
void set_skip_copy_method(
64+
const std::array<char, kMaxOptionValueLength>& raw) {
65+
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
66+
skip_copy_method_ = std::string(raw.data());
67+
}
68+
69+
std::array<char, kMaxOptionValueLength> get_skip_copy_method_as_option()
70+
const {
71+
std::array<char, kMaxOptionValueLength> out{};
72+
std::string value;
73+
{
74+
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
75+
value = skip_copy_method_;
76+
}
77+
std::snprintf(out.data(), out.size(), "%s", value.c_str());
78+
return out;
79+
}
80+
81+
bool should_skip_copy_for_method(const std::string& method_name) const {
82+
if (method_name.empty()) {
83+
return false;
84+
}
85+
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
86+
return method_name == skip_copy_method_;
87+
}
88+
5289
Error load_function_pointers_into_handle(
5390
void* so_handle,
5491
AOTIDelegateHandle* handle) const {
@@ -91,6 +128,38 @@ class ET_EXPERIMENTAL CudaBackend final
91128
return 1;
92129
}
93130

131+
// Applies backend options. The only recognized key is
// kSkipCopyOutputToCpuForMethod, whose value must be the fixed-size
// char-array variant holding a method name; any other value type for
// that key is rejected with InvalidArgument. Unrecognized keys are
// ignored.
Error set_option(
    ET_UNUSED BackendOptionContext& context,
    const executorch::runtime::Span<BackendOption>& backend_options)
    override {
  for (const auto& option : backend_options) {
    if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) != 0) {
      continue;
    }
    const auto* method_name =
        std::get_if<std::array<char, kMaxOptionValueLength>>(&option.value);
    if (method_name == nullptr) {
      ET_LOG(
          Error,
          "Option %s must be a method name string.",
          kSkipCopyOutputToCpuForMethod);
      return Error::InvalidArgument;
    }
    set_skip_copy_method(*method_name);
  }
  return Error::Ok;
}
151+
152+
// Reports backend options: for every entry whose key is
// kSkipCopyOutputToCpuForMethod, fills in the currently configured
// skip-copy method name. Other keys are left untouched.
Error get_option(
    ET_UNUSED BackendOptionContext& context,
    executorch::runtime::Span<BackendOption>& backend_options) override {
  for (auto& option : backend_options) {
    if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) != 0) {
      continue;
    }
    option.value = get_skip_copy_method_as_option();
  }
  return Error::Ok;
}
162+
94163
// Once per loaded binary blob
95164
Result<DelegateHandle*> init(
96165
BackendInitContext& context,
@@ -159,6 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final
159228
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
160229
handle->so_handle = lib_handle;
161230
handle->so_path = so_path.string();
231+
handle->method_name = method_name;
162232

163233
// Load function pointers specific to this handle's shared library
164234
ET_CHECK_OK_OR_RETURN_ERROR(
@@ -303,18 +373,26 @@ class ET_EXPERIMENTAL CudaBackend final
303373
"AOTInductorModelContainerRun failed with error code %d",
304374
error);
305375

306-
// Copy GPU output results back to CPU output tensors
307-
for (int i = 0; i < n_outputs; i++) {
308-
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
309-
// For DYNAMIC_BOUND tensors we try to resize
310-
ET_CHECK_OK_OR_RETURN_ERROR(
311-
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
312-
"Error resizing tensor at output index %d",
313-
i);
314-
ET_CHECK_OK_OR_RETURN_ERROR(
315-
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
316-
"Failed to copy GPU output %d back to CPU",
317-
i);
376+
const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);
377+
378+
if (copy_outputs) {
379+
// Copy GPU output results back to CPU output tensors
380+
for (int i = 0; i < n_outputs; i++) {
381+
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
382+
// For DYNAMIC_BOUND tensors we try to resize
383+
ET_CHECK_OK_OR_RETURN_ERROR(
384+
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
385+
"Error resizing tensor at output index %d",
386+
i);
387+
ET_CHECK_OK_OR_RETURN_ERROR(
388+
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
389+
"Failed to copy GPU output %d back to CPU",
390+
i);
391+
}
392+
} else {
393+
for (int i = 0; i < n_outputs; i++) {
394+
args[i + n_inputs]->toTensor() = *gpu_outputs[i];
395+
}
318396
}
319397

320398
return Error::Ok;
@@ -365,6 +443,10 @@ class ET_EXPERIMENTAL CudaBackend final
365443
delete handle;
366444
clear_all_tensors();
367445
}
446+
447+
private:
448+
mutable std::mutex skip_copy_method_mutex_;
449+
std::string skip_copy_method_;
368450
};
369451

370452
} // namespace executorch::backends::cuda

extension/asr/runner/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,22 @@ set_target_properties(
3535
extension_asr_runner PROPERTIES POSITION_INDEPENDENT_CODE ON
3636
)
3737

38+
# If the project is configured to build with CUDA support, try to find a CUDA
39+
# runtime (prefer the CUDAToolkit package). If found, expose a compile-time
40+
# macro so sources can conditionally compile CUDA-aware code.
41+
if(EXECUTORCH_BUILD_CUDA)
  # CUDA support was requested: probe for a CUDA runtime via the
  # CUDAToolkit package.
  find_package(CUDAToolkit QUIET)
  if(NOT CUDAToolkit_FOUND)
    message(
      STATUS
        "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found"
    )
  else()
    # Expose a compile-time macro so sources can conditionally compile
    # CUDA-aware code paths.
    target_compile_definitions(extension_asr_runner PUBLIC CUDA_AVAILABLE)
    message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE for ASR runner")
  endif()
endif()
53+
3854
install(
3955
TARGETS extension_asr_runner
4056
EXPORT ExecuTorchTargets

extension/asr/runner/runner.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,22 @@ Error AsrRunner::load() {
107107

108108
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
109109
decoder_method_loaded_ = true;
110-
110+
#ifdef CUDA_AVAILABLE
111+
executorch::runtime::BackendOptions<1> backend_options;
112+
// For decoder still copy output from GPU to CPU for sampling.
113+
// TODO: change sampler to use a CUDA kernel to sample and then skip copying
114+
// decoder output as well
115+
ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
116+
"skip_copy_output_to_cpu_for_method", kEncoderMethodName));
117+
const auto opt_err =
118+
executorch::runtime::set_option("CudaBackend", backend_options.view());
119+
if (opt_err != ::executorch::runtime::Error::Ok) {
120+
ET_LOG(
121+
Warning,
122+
"Failed to set CUDA backend options: %d",
123+
static_cast<int>(opt_err));
124+
}
125+
#endif
111126
ET_CHECK_OK_OR_RETURN_ERROR(load_tokenizer());
112127
auto eos_ids = get_eos_ids(tokenizer_.get(), module_.get());
113128
if (!eos_ids.empty()) {

0 commit comments

Comments
 (0)