Disable native model load when using remote infer
- Split the previous loadNetwork into two parts:
    - createNetwork: loads the generated graph network
    and dumps it as xml and bin
    - loadNetwork: reads the xml and bin and
    creates the infer request

- Fall back to native inference if remote infer fails.

Note: the fallback causes loadNetwork to trigger a load for
native infer, which increases infer time in the fallback scenario;
when only native infer is used (no remote infer), compile_model
is called twice, resulting in longer model load
time.
Sub-Task JIRA: OAM-110562

Tracked-On: OAM-109729
Signed-off-by: Ratnesh Kumar Rai <[email protected]>
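
For context, a minimal sketch (not part of the commit) of the two-phase flow described above, assuming the IIENetwork interface and member names introduced in the diffs below; the surrounding HAL classes, error handling, and the gRPC client are omitted:

// Sketch only: illustrates the createNetwork/loadNetwork split and the
// fallback to native inference. Names follow the diffs below; everything
// else here is assumed for illustration.
#include <memory>
#include <string>
#include <openvino/openvino.hpp>
#include "IENetwork.h"  // assumed project header providing IIENetwork

bool initializeSketch(std::shared_ptr<IIENetwork> plugin,
                      std::shared_ptr<ov::Model> ov_model,
                      const std::string& xmlFile, const std::string& binFile,
                      bool remoteLoadSucceeded) {
    // Phase 1: compile the generated graph and serialize it as xml/bin.
    if (!plugin->createNetwork(ov_model, xmlFile, binFile)) return false;

    if (remoteLoadSucceeded) {
        // Remote (gRPC) path: the native infer request is not created here.
        return true;
    }
    // Native path (remote disabled or failed): read the xml/bin back and
    // create the infer request. compile_model runs a second time here,
    // which is the extra model load time called out in the note above.
    plugin->loadNetwork(xmlFile);
    return true;
}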
rairatne committed Jun 2, 2023
1 parent 32de1bf commit ac80d67
Showing 5 changed files with 115 additions and 71 deletions.
129 changes: 70 additions & 59 deletions BasePreparedModel.cpp
@@ -79,7 +79,7 @@ bool BasePreparedModel::initialize() {
}
try {
mPlugin = std::make_shared<IENetwork>(mTargetDevice);
mPlugin->loadNetwork(ov_model, mXmlFile, mBinFile);
mPlugin->createNetwork(ov_model, mXmlFile, mBinFile);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return false;
@@ -105,9 +105,16 @@ bool BasePreparedModel::initialize() {
if (disableOffload) break;
}
if (!disableOffload) {
ALOGD("%s GRPC load model on remote",__func__);
loadRemoteModel(mXmlFile, mBinFile);
}

if (disableOffload || !(mRemoteCheck)) {
ALOGI("%s load model on native for inference",__func__);
mPlugin->loadNetwork(mXmlFile);
setRemoteEnabled(false);
}

size_t tensorIndex = 0;
for (auto inIndex : mModelInfo->getModelInputIndexes()) {
const std::string& inputNodeName = ngraphNetCreator->getNodeName(inIndex);
@@ -488,9 +495,27 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
if(preparedModel->mRemoteCheck && preparedModel->mDetectionClient) {
auto inOperandType = modelInfo->getOperandType(inIndex);
preparedModel->mDetectionClient->add_input_data(std::to_string(tensorIndex), (uint8_t*)srcPtr, modelInfo->getOperand(inIndex).dimensions, len, inOperandType);
} else {
ALOGI("%s GRPC Remote Infer", __func__);
if (measure == MeasureTiming::YES) deviceStart = now();
ALOGV("%s Run", __func__);
auto reply = preparedModel->mDetectionClient->remote_infer();
if (measure == MeasureTiming::YES) deviceEnd = now();
ALOGI("***********GRPC server response************* %s", reply.c_str());
if (reply != "Success") {
bool is_success = false;
ALOGE("%s GRPC Remote infer failed, Switching to native infer", __func__);
preparedModel->setRemoteEnabled(false);
preparedModel->mDetectionClient->release(is_success);
}
}

if (!preparedModel->mRemoteCheck || !preparedModel->mDetectionClient->get_status()) {
ov::Tensor destTensor;
try {
if(!plugin->queryState()) {
ALOGI("native model not loaded, starting model load");
plugin->loadNetwork(preparedModel->mXmlFile);
}
destTensor = plugin->getInputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
@@ -545,32 +570,17 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
std::memcpy((uint8_t*)dest, (uint8_t*)srcPtr, len);
break;
}
}

}

ALOGV("%s Run", __func__);

if (measure == MeasureTiming::YES) deviceStart = now();
if(preparedModel->mRemoteCheck) {
ALOGI("%s GRPC Remote Infer", __func__);
auto reply = preparedModel->mDetectionClient->remote_infer();
ALOGI("***********GRPC server response************* %s", reply.c_str());
}
if (!preparedModel->mRemoteCheck || !preparedModel->mDetectionClient->get_status()){
//Disable remote inference if a request fails
if(preparedModel->mRemoteCheck) {
preparedModel->setRemoteEnabled(false);
}
try {
ALOGV("%s Client Infer", __func__);
plugin->infer();
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
if (measure == MeasureTiming::YES) deviceStart = now();
try {
ALOGV("%s RUN native infer", __func__);
plugin->infer();
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
if (measure == MeasureTiming::YES) deviceEnd = now();
}
}
if (measure == MeasureTiming::YES) deviceEnd = now();

for (size_t i = 0; i < request.outputs.size(); i++) {
auto outIndex = modelInfo->getModelOutputIndex(i);
@@ -580,44 +590,45 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
continue;
}
ov::Tensor srcTensor;
try {
srcTensor = plugin->getOutputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
auto operandType = modelInfo->getOperandType(outIndex);
uint32_t actualLength = srcTensor.get_byte_size();
uint32_t expectedLength = 0;
void* destPtr = modelInfo->getBlobFromMemoryPoolOut(request, i, expectedLength);
auto outputBlobDims = srcTensor.get_shape();

bool outputSizeMismatch = false;
if (actualLength != expectedLength) {
ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex,
actualLength, expectedLength);
outputSizeMismatch = true;
}

// TODO: bug identified with OV2021.4 where for Pad operation, if the output dimensions is 1
// output dimension is coming as 0
if ((outputBlobDims.size() == 0) && (actualLength != 0)) {
std::vector<size_t> rdims = {1};
modelInfo->updateOutputshapes(i, rdims, outputSizeMismatch ? false : true);
} else
modelInfo->updateOutputshapes(i, outputBlobDims, outputSizeMismatch ? false : true);

if (outputSizeMismatch) {
ALOGE(
"Mismatch in actual and exepcted output sizes. Return with "
"OUTPUT_INSUFFICIENT_SIZE error");
return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming};
}
//copy output from remote infer
//TODO: Add support for other OperandType
if (preparedModel->mRemoteCheck && preparedModel->mDetectionClient && preparedModel->mDetectionClient->get_status()) {
preparedModel->mDetectionClient->get_output_data(std::to_string(i), (uint8_t*)destPtr, expectedLength);
} else {
try {
srcTensor = plugin->getOutputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
auto operandType = modelInfo->getOperandType(outIndex);
uint32_t actualLength = srcTensor.get_byte_size();

auto outputBlobDims = srcTensor.get_shape();

bool outputSizeMismatch = false;
if (actualLength != expectedLength) {
ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex,
actualLength, expectedLength);
outputSizeMismatch = true;
}

// TODO: bug identified with OV2021.4 where for Pad operation, if the output dimensions is 1
// output dimension is coming as 0
if ((outputBlobDims.size() == 0) && (actualLength != 0)) {
std::vector<size_t> rdims = {1};
modelInfo->updateOutputshapes(i, rdims, outputSizeMismatch ? false : true);
} else
modelInfo->updateOutputshapes(i, outputBlobDims, outputSizeMismatch ? false : true);

if (outputSizeMismatch) {
ALOGE(
"Mismatch in actual and exepcted output sizes. Return with "
"OUTPUT_INSUFFICIENT_SIZE error");
return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming};
}
//copy output from remote infer
//TODO: Add support for other OperandType
switch (operandType) {
case OperandType::TENSOR_INT32:
std::memcpy((uint8_t*)destPtr, (uint8_t*)srcTensor.data<int32_t>(),
4 changes: 2 additions & 2 deletions BasePreparedModel.h
@@ -53,6 +53,8 @@ typedef uint8_t* memory;
class BasePreparedModel : public V1_3::IPreparedModel {
public:
bool mRemoteCheck = false;
std::string mXmlFile;
std::string mBinFile;
BasePreparedModel(const IntelDeviceType device, const Model& model) : mTargetDevice(device) {
mModelInfo = std::make_shared<NnapiModelInfo>(model);
mXmlFile = MODEL_DIR + std::to_string(mFileId) + std::string(".xml");
@@ -110,8 +112,6 @@ class BasePreparedModel : public V1_3::IPreparedModel {
std::shared_ptr<IIENetwork> mPlugin;
private:
static uint32_t mFileId;
std::string mXmlFile;
std::string mBinFile;
std::unordered_map<size_t, size_t> mInputsToTensorMap;
std::unordered_map<size_t, size_t> mOutputsToTensorMap;
};
5 changes: 4 additions & 1 deletion DetectionClient.cpp
@@ -192,7 +192,10 @@ std::string DetectionClient::remote_infer() {
request.mutable_token()->set_data(mToken);
status = stub_->getInferResult(&context, request, &reply);
if (status.ok()) {
if (reply.data_tensors_size() == 0) ALOGE("GRPC reply empty, ovms failure ?");
if (reply.data_tensors_size() == 0) {
ALOGE("GRPC reply empty, ovms failure ?");
return "Failure";
}
return "Success";
} else {
ALOGE("GRPC Error code: %d, message: %s", status.error_code(),
37 changes: 32 additions & 5 deletions IENetwork.cpp
@@ -9,7 +9,7 @@

namespace android::hardware::neuralnetworks::nnhal {

bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) {
bool IENetwork::createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) {
ALOGV("%s", __func__);

#if __ANDROID__
@@ -32,14 +32,14 @@ bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::strin
break;
}

ALOGD("Creating infer request for Intel Device Type : %s", deviceStr.c_str());
ALOGD("creating infer request for Intel Device Type : %s", deviceStr.c_str());

if (!network) {
ALOGE("Invalid Network pointer");
return false;
} else {
ov::CompiledModel compiled_model = ie.compile_model(network, deviceStr);
ALOGD("loadNetwork is done....");
ALOGD("createNetwork is done....");
#if __ANDROID__
ov::serialize(network, ir_xml, ir_bin,
ov::pass::Serialize::Version::IR_V11);
@@ -48,13 +48,40 @@ bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::strin
manager.register_pass<ov::pass::Serialize>("/tmp/model.xml", "/tmp/model.bin");
manager.run_passes(network);
#endif
mInferRequest = compiled_model.create_infer_request();
ALOGD("CreateInferRequest is done....");
}

return true;
}

void IENetwork::loadNetwork(const std::string& modelName) {
#if __ANDROID__
ov::Core ie(std::string("/vendor/etc/openvino/plugins.xml"));
#else
ov::Core ie(std::string("/usr/local/lib64/plugins.xml"));
#endif

std::string deviceStr;
switch (mTargetDevice) {
case IntelDeviceType::GNA:
deviceStr = "GNA";
break;
case IntelDeviceType::VPU:
deviceStr = "VPUX";
break;
case IntelDeviceType::CPU:
default:
deviceStr = "CPU";
break;
}

ALOGD("loading infer request for Intel Device Type : %s", deviceStr.c_str());

ov::CompiledModel compiled_model = ie.compile_model(modelName, deviceStr);
mInferRequest = compiled_model.create_infer_request();
isLoaded = true;
ALOGD("Load InferRequest is done....");
}

// Need to be called before loadnetwork.. But not sure whether need to be called for
// all the inputs in case multiple input / output
ov::Tensor IENetwork::getTensor(const std::string& outName) {
Expand Down
11 changes: 7 additions & 4 deletions IENetwork.h
@@ -21,10 +21,11 @@ namespace android::hardware::neuralnetworks::nnhal {
class IIENetwork {
public:
virtual ~IIENetwork() = default;
virtual bool loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) = 0;
virtual void loadNetwork(const std::string& model_name) = 0;
virtual bool createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) = 0;
virtual ov::InferRequest getInferRequest() = 0;
virtual void infer() = 0;
virtual void queryState() = 0;
virtual bool queryState() = 0;
virtual ov::Tensor getTensor(const std::string& outName) = 0;
virtual ov::Tensor getInputTensor(const std::size_t index) = 0;
virtual ov::Tensor getOutputTensor(const std::size_t index) = 0;
@@ -35,17 +36,19 @@ class IENetwork : public IIENetwork {
private:
IntelDeviceType mTargetDevice;
ov::InferRequest mInferRequest;
bool isLoaded = false;

public:
IENetwork(IntelDeviceType device)
: mTargetDevice(device) {}

virtual bool loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin);
virtual void loadNetwork(const std::string& model_name);
virtual bool createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin);
ov::Tensor getTensor(const std::string& outName);
ov::Tensor getInputTensor(const std::size_t index);
ov::Tensor getOutputTensor(const std::size_t index);
ov::InferRequest getInferRequest() { return mInferRequest; }
void queryState() {}
bool queryState() { return isLoaded; }
void infer();
bool getGrpcIpPort(char *ip_port);
};
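
A brief usage sketch (an assumption, not part of the commit) of the new queryState()/loadNetwork() pair: the bool return lets callers load the native network lazily, only when a fallback infer is actually needed. The plugin and xmlFile names mirror the members shown in the diffs above:

// Sketch: lazy native load during remote-infer fallback, mirroring the
// executeSynchronously change in BasePreparedModel.cpp above.
void ensureNativeInfer(const std::shared_ptr<IIENetwork>& plugin, const std::string& xmlFile) {
    if (!plugin->queryState()) {        // false until loadNetwork() has run
        plugin->loadNetwork(xmlFile);   // compile xml/bin and create the infer request
    }
    plugin->infer();                    // native infer request now exists
}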
