Disable native model load when using remote infer
- Split the previous loadNetwork into two parts:
    - createNetwork: loads the generated graph network
    and dumps it as xml and bin
    - loadNetwork: reads the xml and bin and
    creates the infer request

- Fall back to native inference if remote infer fails.

Note: the fallback causes loadNetwork to trigger a load for
native infer, which increases infer time in the fallback scenario;
when only native infer is used (no remote infer), compile_model
is called twice, resulting in longer model load
time.
Sub-Task JIRA: OAM-110562

Tracked-On: OAM-109729
Signed-off-by: Ratnesh Kumar Rai <[email protected]>
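
For context, a minimal sketch (not part of the commit) of the two-phase flow described above, assuming the IIENetwork interface and member names introduced in the diffs below; the surrounding HAL classes, error handling, and the gRPC client are omitted:

// Sketch only: illustrates the createNetwork/loadNetwork split and the
// fallback to native inference. Names follow the diffs below; everything
// else here is assumed for illustration.
#include <memory>
#include <string>
#include <openvino/openvino.hpp>
#include "IENetwork.h"  // assumed project header providing IIENetwork

bool initializeSketch(std::shared_ptr<IIENetwork> plugin,
                      std::shared_ptr<ov::Model> ov_model,
                      const std::string& xmlFile, const std::string& binFile,
                      bool remoteLoadSucceeded) {
    // Phase 1: compile the generated graph and serialize it as xml/bin.
    if (!plugin->createNetwork(ov_model, xmlFile, binFile)) return false;

    if (remoteLoadSucceeded) {
        // Remote (gRPC) path: the native infer request is not created here.
        return true;
    }
    // Native path (remote disabled or failed): read the xml/bin back and
    // create the infer request. compile_model runs a second time here,
    // which is the extra model load time called out in the note above.
    plugin->loadNetwork(xmlFile);
    return true;
}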
rairatne committed Jun 2, 2023
1 parent 32de1bf commit ac80d67
Showing 5 changed files with 115 additions and 71 deletions.
129 changes: 70 additions & 59 deletions BasePreparedModel.cpp
@@ -79,7 +79,7 @@ bool BasePreparedModel::initialize() {
}
try {
mPlugin = std::make_shared<IENetwork>(mTargetDevice);
mPlugin->loadNetwork(ov_model, mXmlFile, mBinFile);
mPlugin->createNetwork(ov_model, mXmlFile, mBinFile);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return false;
@@ -105,9 +105,16 @@ bool BasePreparedModel::initialize() {
if (disableOffload) break;
}
if (!disableOffload) {
ALOGD("%s GRPC load model on remote",__func__);
loadRemoteModel(mXmlFile, mBinFile);
}

if (disableOffload || !(mRemoteCheck)) {
ALOGI("%s load model on native for inference",__func__);
mPlugin->loadNetwork(mXmlFile);
setRemoteEnabled(false);
}

size_t tensorIndex = 0;
for (auto inIndex : mModelInfo->getModelInputIndexes()) {
const std::string& inputNodeName = ngraphNetCreator->getNodeName(inIndex);
@@ -488,9 +495,27 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
if(preparedModel->mRemoteCheck && preparedModel->mDetectionClient) {
auto inOperandType = modelInfo->getOperandType(inIndex);
preparedModel->mDetectionClient->add_input_data(std::to_string(tensorIndex), (uint8_t*)srcPtr, modelInfo->getOperand(inIndex).dimensions, len, inOperandType);
} else {
ALOGI("%s GRPC Remote Infer", __func__);
if (measure == MeasureTiming::YES) deviceStart = now();
ALOGV("%s Run", __func__);
auto reply = preparedModel->mDetectionClient->remote_infer();
if (measure == MeasureTiming::YES) deviceEnd = now();
ALOGI("***********GRPC server response************* %s", reply.c_str());
if (reply != "Success") {
bool is_success = false;
ALOGE("%s GRPC Remote infer failed, Switching to native infer", __func__);
preparedModel->setRemoteEnabled(false);
preparedModel->mDetectionClient->release(is_success);
}
}

if (!preparedModel->mRemoteCheck || !preparedModel->mDetectionClient->get_status()) {
ov::Tensor destTensor;
try {
if(!plugin->queryState()) {
ALOGI("native model not loaded, starting model load");
plugin->loadNetwork(preparedModel->mXmlFile);
}
destTensor = plugin->getInputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
@@ -545,32 +570,17 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
std::memcpy((uint8_t*)dest, (uint8_t*)srcPtr, len);
break;
}
}

}

ALOGV("%s Run", __func__);

if (measure == MeasureTiming::YES) deviceStart = now();
if(preparedModel->mRemoteCheck) {
ALOGI("%s GRPC Remote Infer", __func__);
auto reply = preparedModel->mDetectionClient->remote_infer();
ALOGI("***********GRPC server response************* %s", reply.c_str());
}
if (!preparedModel->mRemoteCheck || !preparedModel->mDetectionClient->get_status()){
//Disable remote inference if a request fails
if(preparedModel->mRemoteCheck) {
preparedModel->setRemoteEnabled(false);
}
try {
ALOGV("%s Client Infer", __func__);
plugin->infer();
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
if (measure == MeasureTiming::YES) deviceStart = now();
try {
ALOGV("%s RUN native infer", __func__);
plugin->infer();
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
if (measure == MeasureTiming::YES) deviceEnd = now();
}
}
if (measure == MeasureTiming::YES) deviceEnd = now();

for (size_t i = 0; i < request.outputs.size(); i++) {
auto outIndex = modelInfo->getModelOutputIndex(i);
@@ -580,44 +590,45 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
continue;
}
ov::Tensor srcTensor;
try {
srcTensor = plugin->getOutputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
auto operandType = modelInfo->getOperandType(outIndex);
uint32_t actualLength = srcTensor.get_byte_size();
uint32_t expectedLength = 0;
void* destPtr = modelInfo->getBlobFromMemoryPoolOut(request, i, expectedLength);
auto outputBlobDims = srcTensor.get_shape();

bool outputSizeMismatch = false;
if (actualLength != expectedLength) {
ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex,
actualLength, expectedLength);
outputSizeMismatch = true;
}

// TODO: bug identified with OV2021.4 where for Pad operation, if the output dimensions is 1
// output dimension is coming as 0
if ((outputBlobDims.size() == 0) && (actualLength != 0)) {
std::vector<size_t> rdims = {1};
modelInfo->updateOutputshapes(i, rdims, outputSizeMismatch ? false : true);
} else
modelInfo->updateOutputshapes(i, outputBlobDims, outputSizeMismatch ? false : true);

if (outputSizeMismatch) {
ALOGE(
"Mismatch in actual and exepcted output sizes. Return with "
"OUTPUT_INSUFFICIENT_SIZE error");
return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming};
}
//copy output from remote infer
//TODO: Add support for other OperandType
if (preparedModel->mRemoteCheck && preparedModel->mDetectionClient && preparedModel->mDetectionClient->get_status()) {
preparedModel->mDetectionClient->get_output_data(std::to_string(i), (uint8_t*)destPtr, expectedLength);
} else {
try {
srcTensor = plugin->getOutputTensor(tensorIndex);
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return {ErrorStatus::GENERAL_FAILURE, {}, kNoTiming};
}
auto operandType = modelInfo->getOperandType(outIndex);
uint32_t actualLength = srcTensor.get_byte_size();

auto outputBlobDims = srcTensor.get_shape();

bool outputSizeMismatch = false;
if (actualLength != expectedLength) {
ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex,
actualLength, expectedLength);
outputSizeMismatch = true;
}

// TODO: bug identified with OV2021.4 where for Pad operation, if the output dimensions is 1
// output dimension is coming as 0
if ((outputBlobDims.size() == 0) && (actualLength != 0)) {
std::vector<size_t> rdims = {1};
modelInfo->updateOutputshapes(i, rdims, outputSizeMismatch ? false : true);
} else
modelInfo->updateOutputshapes(i, outputBlobDims, outputSizeMismatch ? false : true);

if (outputSizeMismatch) {
ALOGE(
"Mismatch in actual and exepcted output sizes. Return with "
"OUTPUT_INSUFFICIENT_SIZE error");
return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming};
}
//copy output from remote infer
//TODO: Add support for other OperandType
switch (operandType) {
case OperandType::TENSOR_INT32:
std::memcpy((uint8_t*)destPtr, (uint8_t*)srcTensor.data<int32_t>(),
4 changes: 2 additions & 2 deletions BasePreparedModel.h
@@ -53,6 +53,8 @@ typedef uint8_t* memory;
class BasePreparedModel : public V1_3::IPreparedModel {
public:
bool mRemoteCheck = false;
std::string mXmlFile;
std::string mBinFile;
BasePreparedModel(const IntelDeviceType device, const Model& model) : mTargetDevice(device) {
mModelInfo = std::make_shared<NnapiModelInfo>(model);
mXmlFile = MODEL_DIR + std::to_string(mFileId) + std::string(".xml");
@@ -110,8 +112,6 @@ class BasePreparedModel : public V1_3::IPreparedModel {
std::shared_ptr<IIENetwork> mPlugin;
private:
static uint32_t mFileId;
std::string mXmlFile;
std::string mBinFile;
std::unordered_map<size_t, size_t> mInputsToTensorMap;
std::unordered_map<size_t, size_t> mOutputsToTensorMap;
};
5 changes: 4 additions & 1 deletion DetectionClient.cpp
@@ -192,7 +192,10 @@ std::string DetectionClient::remote_infer() {
request.mutable_token()->set_data(mToken);
status = stub_->getInferResult(&context, request, &reply);
if (status.ok()) {
if (reply.data_tensors_size() == 0) ALOGE("GRPC reply empty, ovms failure ?");
if (reply.data_tensors_size() == 0) {
ALOGE("GRPC reply empty, ovms failure ?");
return "Failure";
}
return "Success";
} else {
ALOGE("GRPC Error code: %d, message: %s", status.error_code(),
37 changes: 32 additions & 5 deletions IENetwork.cpp
@@ -9,7 +9,7 @@

namespace android::hardware::neuralnetworks::nnhal {

bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) {
bool IENetwork::createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) {
ALOGV("%s", __func__);

#if __ANDROID__
@@ -32,14 +32,14 @@ bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::strin
break;
}

ALOGD("Creating infer request for Intel Device Type : %s", deviceStr.c_str());
ALOGD("creating infer request for Intel Device Type : %s", deviceStr.c_str());

if (!network) {
ALOGE("Invalid Network pointer");
return false;
} else {
ov::CompiledModel compiled_model = ie.compile_model(network, deviceStr);
ALOGD("loadNetwork is done....");
ALOGD("createNetwork is done....");
#if __ANDROID__
ov::serialize(network, ir_xml, ir_bin,
ov::pass::Serialize::Version::IR_V11);
@@ -48,13 +48,40 @@ bool IENetwork::loadNetwork(std::shared_ptr<ov::Model> network, const std::strin
manager.register_pass<ov::pass::Serialize>("/tmp/model.xml", "/tmp/model.bin");
manager.run_passes(network);
#endif
mInferRequest = compiled_model.create_infer_request();
ALOGD("CreateInferRequest is done....");
}

return true;
}

void IENetwork::loadNetwork(const std::string& modelName) {
#if __ANDROID__
ov::Core ie(std::string("/vendor/etc/openvino/plugins.xml"));
#else
ov::Core ie(std::string("/usr/local/lib64/plugins.xml"));
#endif

std::string deviceStr;
switch (mTargetDevice) {
case IntelDeviceType::GNA:
deviceStr = "GNA";
break;
case IntelDeviceType::VPU:
deviceStr = "VPUX";
break;
case IntelDeviceType::CPU:
default:
deviceStr = "CPU";
break;
}

ALOGD("loading infer request for Intel Device Type : %s", deviceStr.c_str());

ov::CompiledModel compiled_model = ie.compile_model(modelName, deviceStr);
mInferRequest = compiled_model.create_infer_request();
isLoaded = true;
ALOGD("Load InferRequest is done....");
}

// Need to be called before loadnetwork.. But not sure whether need to be called for
// all the inputs in case multiple input / output
ov::Tensor IENetwork::getTensor(const std::string& outName) {
Expand Down
11 changes: 7 additions & 4 deletions IENetwork.h
@@ -21,10 +21,11 @@ namespace android::hardware::neuralnetworks::nnhal {
class IIENetwork {
public:
virtual ~IIENetwork() = default;
virtual bool loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) = 0;
virtual void loadNetwork(const std::string& model_name) = 0;
virtual bool createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin) = 0;
virtual ov::InferRequest getInferRequest() = 0;
virtual void infer() = 0;
virtual void queryState() = 0;
virtual bool queryState() = 0;
virtual ov::Tensor getTensor(const std::string& outName) = 0;
virtual ov::Tensor getInputTensor(const std::size_t index) = 0;
virtual ov::Tensor getOutputTensor(const std::size_t index) = 0;
@@ -35,17 +36,19 @@ class IENetwork : public IIENetwork {
private:
IntelDeviceType mTargetDevice;
ov::InferRequest mInferRequest;
bool isLoaded = false;

public:
IENetwork(IntelDeviceType device)
: mTargetDevice(device) {}

virtual bool loadNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin);
virtual void loadNetwork(const std::string& model_name);
virtual bool createNetwork(std::shared_ptr<ov::Model> network, const std::string& ir_xml, const std::string& ir_bin);
ov::Tensor getTensor(const std::string& outName);
ov::Tensor getInputTensor(const std::size_t index);
ov::Tensor getOutputTensor(const std::size_t index);
ov::InferRequest getInferRequest() { return mInferRequest; }
void queryState() {}
bool queryState() { return isLoaded; }
void infer();
bool getGrpcIpPort(char *ip_port);
};
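
A brief usage sketch (an assumption, not part of the commit) of the new queryState()/loadNetwork() pair: the bool return lets callers load the native network lazily, only when a fallback infer is actually needed. The plugin and xmlFile names mirror the members shown in the diffs above:

// Sketch: lazy native load during remote-infer fallback, mirroring the
// executeSynchronously change in BasePreparedModel.cpp above.
void ensureNativeInfer(const std::shared_ptr<IIENetwork>& plugin, const std::string& xmlFile) {
    if (!plugin->queryState()) {        // false until loadNetwork() has run
        plugin->loadNetwork(xmlFile);   // compile xml/bin and create the infer request
    }
    plugin->infer();                    // native infer request now exists
}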
