From 2b3dcffc1f6fb443fa314a184d70acd42f592428 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?=
Date: Wed, 15 May 2024 15:58:38 +0200
Subject: [PATCH] Reset deeplima tokenizer on each text in lima too

This is necessary while issue #172 is not solved
---
 .../RnnTokenizer/RnnTokenizer.cpp             | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp
index d30ca4287..b2216362c 100644
--- a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp
+++ b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp
@@ -100,7 +100,7 @@ class RnnTokenizerPrivate : public DeepTokenizerBase, public ConfigurationHelper
 
   // Parameters
   bool m_ignoreEOL;
-  segmentation::impl::SegmentationImpl m_segm;
+  std::shared_ptr<segmentation::impl::SegmentationImpl> m_segm;
 
   std::function<void()> m_load_fn;
   bool m_loaded;
@@ -111,7 +111,7 @@ RnnTokenizerPrivate::RnnTokenizerPrivate() :
   m_stringsPool(nullptr),
   m_currentVx(0),
   m_ignoreEOL(false),
-  m_segm(),
+  m_segm(nullptr),
   m_loaded(false)
 {
 }
@@ -148,6 +148,7 @@ LimaStatusCode RnnTokenizer::process(AnalysisContent& analysis) const
 
   LOG_MESSAGE_WITH_PROLOG(LINFO, "start tokenizer process");
   TimeUtilsController RnnTokenizerProcessTime("RnnTokenizer");
+
   auto anagraph = std::make_shared<AnalysisGraph>("AnalysisGraph", m_d->m_language, true, true);
   analysis.setData("AnalysisGraph", anagraph);
   auto graph = anagraph->getGraph();
@@ -268,21 +269,21 @@ void RnnTokenizerPrivate::init(GroupConfigurationStructure& unitConfiguration)
 
   m_load_fn = [this, model_file_name]()
   {
-    if (m_loaded)
-    {
-      return;
-    }
+    // if (m_loaded)
+    // {
+    //   return;
+    // }
 
-    m_segm.load(model_file_name.toStdString());
-    m_segm.init(1, 16*1024); // threads, buffer size per thread
+    m_segm->load(model_file_name.toStdString());
+    m_segm->init(1, 16*1024); // threads, buffer size per thread
 
     m_loaded = true;
   };
 
-  if (!isInitLazy())
-  {
-    m_load_fn();
-  }
+  // if (!isInitLazy())
+  // {
+  //   m_load_fn();
+  // }
 }
 
 void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& current_sentence,
@@ -315,6 +316,8 @@ void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& curren
 
 void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<TPrimitiveToken>>& sentences)
 {
+  m_segm = std::make_shared<segmentation::impl::SegmentationImpl>();
+  m_load_fn();
   LOG_MESSAGE_WITH_PROLOG(LDEBUG, "RnnTokenizerPrivate::tokenize" << text.left(100));
 
 
@@ -327,7 +330,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
   std::vector< TPrimitiveToken > current_sentence;
   uint32_t current_token_offset = 0;
 
-  m_segm.register_handler([this, &sentences, &current_sentence, &current_token_offset]
+  m_segm->register_handler([this, &sentences, &current_sentence, &current_token_offset]
                           (const std::vector<segmentation::token_pos>& tokens,
                            uint32_t len)
   {
@@ -349,7 +352,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
   size_t bytes_consumed = 0;
 
-  m_segm.parse_from_stream([&text_utf8, &bytes_consumed]
+  m_segm->parse_from_stream([&text_utf8, &bytes_consumed]
                            (uint8_t* buffer, int32_t& read, size_t max)