I have been trying to load a model saved from PyTorch (a Swin UNETR) with LibTorch, but I keep getting a slew of runtime errors from the C++ code.
Here is the checkpoint-saving code:
import os

import torch
from monai.networks.nets import SwinUNETR  # assuming the MONAI implementation


def save_checkpoint(model, epoch, args, filename, best_acc=0, optimizer=None, scheduler=None):
    if torch.distributed.get_rank() == 0:
        # Unwrap the model from its DDP wrapper (if applicable)
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            state_dict = model.module.state_dict()
        else:
            state_dict = model.state_dict()
        save_dict = {"epoch": epoch, "best_acc": best_acc, "state_dict": state_dict}
        if optimizer is not None:
            save_dict["optimizer"] = optimizer.state_dict()
        if scheduler is not None:
            save_dict["scheduler"] = scheduler.state_dict()
        filename = os.path.join(args.pretrained_dir, filename)
        torch.save(save_dict, filename)

        # Reinitialize the model and load the state dictionary
        nextmode = SwinUNETR(
            img_size=(args.roi_x, args.roi_y, args.roi_z),
            in_channels=args.in_channels,
            out_channels=args.out_channels,
            feature_size=args.feature_size,
            drop_rate=0.0,
            attn_drop_rate=0.0,
            dropout_path_rate=args.dropout_path_rate,
            use_checkpoint=False,  # disable activation checkpointing during tracing
        )
        nextmode.load_state_dict(state_dict)
        nextmode.eval()
        nextmode.cuda()

        # Use torch.jit.trace instead of torch.jit.script because the model
        # contains features that TorchScript cannot compile directly
        example_input = torch.randn(1, args.in_channels, args.roi_x, args.roi_y, args.roi_z).cuda()
        traced_model = torch.jit.trace(nextmode, example_input)
        tracename = os.path.join(args.pretrained_dir, "model_trace.tpt")
        torch.jit.save(traced_model, tracename)
        print("Saving traced checkpoint", tracename)
Here is the relevant C++ code. The error occurs just from loading the model, without doing anything else:
#include <torch/script.h>
#include <iostream>
#include <string>

using namespace std;

int main_torch()  // Scan *Scan_Data, int model_num, int merge, int2 rng
{
    cout << "Torch Inference" << endl;
    const char *saved_model_file = "mdl/model.pt";
    string save_mdl_path = saved_model_file;
    torch::jit::Module module;
    try
    {
        // Deserialize the ScriptModule from a file using torch::jit::load().
        module = torch::jit::load(save_mdl_path);
        // I also tried guarding this with torch::cuda::is_available();
        // the load itself fails before this line is reached.
        module.to(torch::kCUDA);
    }
    catch (const c10::Error &e)
    {
        std::cerr << "Error loading the model: " << e.what() << std::endl;
        return -1;
    }
    return 0;
}
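One variant worth mentioning (a minimal sketch, not something I have verified end to end): torch::jit::load() takes an optional device argument that overrides the device tags baked into the traced file, so everything can be mapped to CPU at load time and only moved to the GPU afterwards if CUDA is actually usable. The helper name below is hypothetical, not part of my real code.

#include <torch/torch.h>
#include <torch/script.h>
#include <iostream>
#include <string>

// Hypothetical helper: map all stored tensors to CPU while deserializing,
// then move to CUDA only if the linked libtorch actually supports it.
int load_model_cpu_first(const std::string &path)
{
    torch::jit::Module module;
    try
    {
        // The second argument remaps every stored tensor to the given device,
        // so the unpickler never asks the CUDA backend for an allocation.
        module = torch::jit::load(path, torch::kCPU);
    }
    catch (const c10::Error &e)
    {
        std::cerr << "Error loading the model: " << e.what() << std::endl;
        return -1;
    }
    if (torch::cuda::is_available())
        module.to(torch::kCUDA);
    else
        std::cout << "CUDA not available; staying on CPU." << std::endl;
    return 0;
}

With a CPU-only runtime this should at least load and run on the CPU, which would narrow down whether the problem is in the traced file or in the linked LibTorch.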
Has anyone run into this type of issue before? I have hit a brick wall trying to get help from ChatGPT and Gemini.
ERROR
Error loading the model Could not run 'aten::empty_strided' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::empty_strided' is only available for these backends: [CPU, Meta, QuantizedCPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].
CPU: registered at aten/src/ATen/RegisterCPU.cpp:31420 [kernel]
Meta: registered at aten/src/ATen/RegisterMeta.cpp:26984 [kernel]
QuantizedCPU: registered at aten/src/ATen/RegisterQuantizedCPU.cpp:951 [kernel]
BackendSelect: registered at aten/src/ATen/RegisterBackendSelect.cpp:807 [kernel]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:154 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:497 [backend fallback]
Functionalize: registered at ../aten/src/ATen/FunctionalizeFallbackKernel.cpp:324 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: fallthrough registered at ../aten/src/ATen/ConjugateFallback.cpp:21 [kernel]
Negative: fallthrough registered at ../aten/src/ATen/native/NegateFallback.cpp:22 [kernel]
ZeroTensor: fallthrough registered at ../aten/src/ATen/ZeroTensorFallback.cpp:90 [kernel]
ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:86 [backend fallback]
AutogradOther: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradCPU: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradCUDA: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradHIP: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradXLA: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradMPS: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradIPU: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradXPU: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradHPU: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradVE: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradLazy: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradMTIA: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradPrivateUse1: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradPrivateUse2: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradPrivateUse3: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradMeta: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
AutogradNestedTensor: registered at ../torch/csrc/autograd/generated/VariableType_2.cpp:19078 [autograd kernel]
Tracer: registered at ../torch/csrc/autograd/generated/TraceType_2.cpp:17415 [kernel]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:378 [backend fallback]
AutocastCUDA: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:244 [backend fallback]
FuncTorchBatched: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at ../aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at ../aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at ../aten/src/ATen/functorch/TensorWrapper.cpp:202 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:162 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:493 [backend fallback]
PreDispatch: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:166 [backend fallback]
PythonDispatcher: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:158 [backend fallback]
Exception raised from reportError at ../aten/src/ATen/core/dispatch/OperatorEntry.cpp:547 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6c (0x7fb01de5d2ac in /usr/local/libtorch/lib/libc10.so)
frame #1: <unknown function> + 0x11b21a7 (0x7fb01f0711a7 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x275e51b (0x7fb02061d51b in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #3: at::_ops::empty_strided::redispatch(c10::DispatchKeySet, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>) + 0xa4 (0x7fb0208f3394 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #4: <unknown function> + 0x2e3ae1b (0x7fb020cf9e1b in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #5: at::_ops::empty_strided::call(c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>) + 0x1b9 (0x7fb0209457a9 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x1d310d9 (0x7fb01fbf00d9 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #7: at::native::_to_copy(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) + 0x1a9f (0x7fb01ffc00bf in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x306b66f (0x7fb020f2a66f in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #9: at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) + 0x109 (0x7fb020535989 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x2e3e60a (0x7fb020cfd60a in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #11: at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) + 0x109 (0x7fb020535989 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x49222c4 (0x7fb0227e12c4 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x4922792 (0x7fb0227e1792 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #14: at::_ops::_to_copy::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) + 0x1fe (0x7fb0205d4d5e in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #15: at::native::to(at::Tensor const&, c10::Device, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) + 0xf7 (0x7fb01ffbd027 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x32aa75d (0x7fb02116975d in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #17: at::_ops::to_device::call(at::Tensor const&, c10::Device, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) + 0x1ce (0x7fb020799e8e in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #18: torch::jit::Unpickler::readInstruction() + 0x1d6d (0x7fb023b1252d in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #19: torch::jit::Unpickler::run() + 0xa8 (0x7fb023b137f8 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #20: torch::jit::Unpickler::parse_ivalue() + 0x32 (0x7fb023b15352 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #21: torch::jit::readArchiveAndTensors(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::optional<std::function<c10::StrongTypePtr (c10::QualifiedName const&)> >, std::optional<std::function<c10::intrusive_ptr<c10::ivalue::Object, c10::detail::intrusive_target_default_null_type<c10::ivalue::Object> > (c10::StrongTypePtr const&, c10::IValue)> >, std::optional<c10::Device>, caffe2::serialize::PyTorchStreamReader&, c10::Type::SingletonOrSharedTypePtr<c10::Type> (*)(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&), std::shared_ptr<torch::jit::DeserializationStorageContext>) + 0x569 (0x7fb023ace509 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #22: <unknown function> + 0x5c03fc8 (0x7fb023ac2fc8 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #23: <unknown function> + 0x5c06dd3 (0x7fb023ac5dd3 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #24: torch::jit::import_ir_module(std::shared_ptr<torch::jit::CompilationUnit>, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::optional<c10::Device>, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >&, bool, bool) + 0x3df (0x7fb023acb7cf in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #25: torch::jit::import_ir_module(std::shared_ptr<torch::jit::CompilationUnit>, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::optional<c10::Device>, bool) + 0x92 (0x7fb023acba72 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #26: torch::jit::load(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::optional<c10::Device>, bool) + 0xc0 (0x7fb023acbb80 in /usr/local/libtorch/lib/libtorch_cpu.so)
frame #27: <unknown function> + 0x416d7 (0x5635d377d6d7 in ./air)
frame #28: <unknown function> + 0xbdde0 (0x5635d37f9de0 in ./air)
frame #29: <unknown function> + 0x29d90 (0x7fb01d2f4d90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #30: __libc_start_main + 0x80 (0x7fb01d2f4e40 in /lib/x86_64-linux-gnu/libc.so.6)
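One thing I notice in the dump: the backend list for aten::empty_strided includes CPU but no CUDA entry, and every frame in the stack resolves to libtorch_cpu.so, with libtorch_cuda.so appearing nowhere. My working theory is that the binary is effectively running a CPU-only LibTorch, either because the downloaded distribution is the CPU build or because the linker dropped libtorch_cuda as "unused" (with GNU ld, adding -Wl,--no-as-needed before -ltorch_cuda is the workaround I have seen suggested). A minimal standalone check, assuming a standard LibTorch install:

#include <torch/torch.h>
#include <iostream>

// If this prints 0 on a machine where nvidia-smi sees a GPU, the linked
// libtorch build (or the link line) is CPU-only, so neither
// module.to(torch::kCUDA) nor loading a CUDA-tagged checkpoint can work.
int main()
{
    std::cout << "CUDA available: " << torch::cuda::is_available() << std::endl;
    std::cout << "CUDA devices:   " << torch::cuda::device_count() << std::endl;
    return 0;
}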