From 1285245bef6b61708c824735a8a25e80b8bf5c46 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Mon, 4 Mar 2024 17:10:07 -0600 Subject: [PATCH 01/40] remove deprecated ChronoKeeper/chrono_common/* --- ChronoKeeper/chrono_common/KeeperIdCard.h | 97 ------------------- .../chrono_common/KeeperRegistrationMsg.h | 93 ------------------ ChronoKeeper/chrono_common/KeeperStatsMsg.h | 46 --------- ChronoKeeper/chrono_common/chronolog_types.h | 66 ------------- 4 files changed, 302 deletions(-) delete mode 100644 ChronoKeeper/chrono_common/KeeperIdCard.h delete mode 100644 ChronoKeeper/chrono_common/KeeperRegistrationMsg.h delete mode 100644 ChronoKeeper/chrono_common/KeeperStatsMsg.h delete mode 100644 ChronoKeeper/chrono_common/chronolog_types.h diff --git a/ChronoKeeper/chrono_common/KeeperIdCard.h b/ChronoKeeper/chrono_common/KeeperIdCard.h deleted file mode 100644 index 249113c9..00000000 --- a/ChronoKeeper/chrono_common/KeeperIdCard.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef _KEEPER_ID_CARD_H -#define _KEEPER_ID_CARD_H - -#include - -#include - -// this class wrapps ChronoKeeper Process identification -// that will be used by all the ChronoLog Processes -// to both identofy the Keepr process and create RPC client channels -// to send the data to the Keeper Recording service - -namespace chronolog -{ - -// Keeper Process can be uniquely identified by the combination of -// the host IP address + client_port - -typedef uint32_t in_addr_t; -typedef uint16_t in_port_t; -typedef std::pair service_endpoint; - -// KeeperGroup is the logical grouping of KeeperProcesses -typedef uint64_t KeeperGroupId; - - -class KeeperIdCard -{ - - uint64_t keeper_group_id; - uint32_t ip_addr; //IP address as uint32_t in host byte order - uint16_t port; //port number as uint16_t in host byte order - uint16_t tl_provider_id; // id of thallium service provider - -public: - - - KeeperIdCard(uint64_t group_id = 0, uint32_t addr = 0, uint16_t a_port = 0, uint16_t provider_id = 0) - : keeper_group_id(group_id), ip_addr(addr), port(a_port), tl_provider_id(provider_id) - {} - - KeeperIdCard(KeeperIdCard const &other): keeper_group_id(other.getGroupId()), ip_addr(other.getIPaddr()), port( - other.getPort()), tl_provider_id(other.getProviderId()) - {} - - ~KeeperIdCard() = default; - - uint64_t getGroupId() const - { return keeper_group_id; } - - uint32_t getIPaddr() const - { return ip_addr; } - - uint16_t getPort() const - { return port; } - - uint16_t getProviderId() const - { return tl_provider_id; } - - - // serialization function used by thallium RPC providers - // to serialize/deserialize KeeperIdCard - template - void serialize(SerArchiveT &serT) - { - serT&keeper_group_id; - serT&ip_addr; - serT&port; - serT&tl_provider_id; - } - - std::string &getIPasDottedString(std::string &a_string) const - { - - char buffer[INET_ADDRSTRLEN]; - // convert ip from host to network byte order uint32_t - uint32_t ip_net_order = htonl(ip_addr); - // convert network byte order uint32_t to a dotted string - if(NULL != inet_ntop(AF_INET, &ip_net_order, buffer, INET_ADDRSTRLEN)) - { a_string += std::string(buffer); } - return a_string; - } - -}; - -} //namespace chronolog - -inline std::ostream &operator<<(std::ostream &out, chronolog::KeeperIdCard const &keeper_id_card) -{ - std::string a_string; - out << "KeeperIdCard{" << keeper_id_card.getGroupId() << ":" << keeper_id_card.getIPasDottedString(a_string) << ":" - << keeper_id_card.getPort() << ":" << keeper_id_card.getProviderId() << "}"; - return out; -} - - -#endif diff --git 
a/ChronoKeeper/chrono_common/KeeperRegistrationMsg.h b/ChronoKeeper/chrono_common/KeeperRegistrationMsg.h deleted file mode 100644 index cb0f5a27..00000000 --- a/ChronoKeeper/chrono_common/KeeperRegistrationMsg.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef KEEPER_REGISTRATION_MSG_H -#define KEEPER_REGISTRATION_MSG_H - -#include -#include -#include "KeeperIdCard.h" - - -namespace chronolog -{ - -class ServiceId -{ -public: - ServiceId(uint32_t addr, uint16_t a_port, uint16_t a_provider_id): ip_addr(addr), port(a_port), provider_id( - a_provider_id) - {} - - ~ServiceId() = default; - - uint32_t ip_addr; //32int IP representation in host notation - uint16_t port; //16int port representation in host notation - uint16_t provider_id; //thalium provider id - - template - void serialize(SerArchiveT &serT) - { - serT&ip_addr; - serT&port; - serT&provider_id; - } - - std::string &getIPasDottedString(std::string &a_string) const - { - - char buffer[INET_ADDRSTRLEN]; - // convert ip from host to network byte order uint32_t - uint32_t ip_net_order = htonl(ip_addr); - // convert network byte order uint32_t to a dotted string - if(NULL != inet_ntop(AF_INET, &ip_net_order, buffer, INET_ADDRSTRLEN)) - { a_string += std::string(buffer); } - return a_string; - } -}; - -class KeeperRegistrationMsg -{ - - KeeperIdCard keeperIdCard; - ServiceId adminServiceId; - -public: - - - KeeperRegistrationMsg(KeeperIdCard const &keeper_card = KeeperIdCard{0, 0, 0} - , ServiceId const &admin_service_id = ServiceId{0, 0, 0}): keeperIdCard(keeper_card) - , adminServiceId(admin_service_id) - {} - - ~KeeperRegistrationMsg() = default; - - KeeperIdCard const &getKeeperIdCard() const - { return keeperIdCard; } - - ServiceId const &getAdminServiceId() const - { return adminServiceId; } - - template - void serialize(SerArchiveT &serT) - { - serT&keeperIdCard; - serT&adminServiceId; - } - -}; - -}//namespace - -inline std::ostream &operator<<(std::ostream &out, chronolog::ServiceId const serviceId) -{ - std::string a_string; - out << "{" << serviceId.getIPasDottedString(a_string) << ":" << serviceId.port << ":" << serviceId.provider_id - << "}"; - return out; -} - -inline std::ostream &operator<<(std::ostream &out, chronolog::KeeperRegistrationMsg const &msg) -{ - out << "KeeperRegistrationMsg{" << msg.getKeeperIdCard() << "}{admin:" << msg.getAdminServiceId() << "}"; - return out; -} - -#endif diff --git a/ChronoKeeper/chrono_common/KeeperStatsMsg.h b/ChronoKeeper/chrono_common/KeeperStatsMsg.h deleted file mode 100644 index 6178bd1b..00000000 --- a/ChronoKeeper/chrono_common/KeeperStatsMsg.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef KEEPER_STATS_MSG_H -#define KEEPER_STATS_MSG_H - -#include -#include "KeeperIdCard.h" - - -namespace chronolog -{ - -class KeeperStatsMsg -{ - KeeperIdCard keeperIdCard; - uint32_t active_story_count; - -public: - KeeperStatsMsg(KeeperIdCard const &keeper_card = KeeperIdCard{0, 0, 0}, uint32_t count = 0): keeperIdCard( - keeper_card), active_story_count(count) - {} - - ~KeeperStatsMsg() = default; - - KeeperIdCard const &getKeeperIdCard() const - { return keeperIdCard; } - - uint32_t getActiveStoryCount() const - { return active_story_count; } - - template - void serialize(SerArchiveT &serT) - { - serT&keeperIdCard; - serT&active_story_count; - } - -}; - -} - -inline std::ostream &operator<<(std::ostream &out, chronolog::KeeperStatsMsg const &stats_msg) -{ - out << "KeeperStatsMsg{" << stats_msg.getKeeperIdCard() << "}"; - return out; -} - -#endif diff --git a/ChronoKeeper/chrono_common/chronolog_types.h 
b/ChronoKeeper/chrono_common/chronolog_types.h deleted file mode 100644 index a69b2f49..00000000 --- a/ChronoKeeper/chrono_common/chronolog_types.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef CHRONOLOG_TYPE_DEFINITIONS_H -#define CHRONOLOG_TYPE_DEFINITIONS_H - - -namespace chronolog -{ - -typedef std::string StoryName; -typedef std::string ChronicleName; -typedef uint64_t StoryId; -typedef uint64_t ChronicleId; -typedef uint32_t ClientId; - -typedef uint64_t chrono_time; -typedef uint32_t chrono_index; - -class LogEvent -{ -public: - LogEvent() = default; - - LogEvent(StoryId const &story_id, chrono_time event_time, ClientId const &client_id, chrono_index index - , std::string const &record): storyId(story_id), eventTime(event_time), clientId(client_id), eventIndex( - index), logRecord(record) - {} - - StoryId storyId; - uint64_t eventTime; - ClientId clientId; - uint32_t eventIndex; - std::string logRecord; //INNA: replace with size_t length; & void * data; later on - - uint64_t const &time() const - { return eventTime; } - - uint32_t const &index() const - { return eventIndex; } - - // serialization function used by thallium RPC providers - // to serialize/deserialize KeeperIdCard - - template - void serialize(SerArchiveT &serT) - { - serT(storyId, eventTime, clientId, eventIndex, logRecord); - } - - bool operator==(const LogEvent &other) const - { - return (storyId == other.storyId && eventTime == other.eventTime && clientId == other.clientId && - eventIndex == other.eventIndex && logRecord == other.logRecord); - } - - // convert to string - [[nodiscard]] std::string toString() const - { - std::string str = - "StoryId: " + std::to_string(storyId) + " EventTime: " + std::to_string(eventTime) + " ClientId: " + - std::to_string(clientId) + " EventIndex: " + std::to_string(eventIndex) + " LogRecord: " + logRecord; - return str; - } -}; - - -} -#endif From 460787194f98e24b236dfddfae3a91c5b56a062d Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 7 Mar 2024 12:22:48 -0600 Subject: [PATCH 02/40] corrected include references --- ChronoKeeper/CSVFileChunkExtractor.h | 4 ++-- ChronoKeeper/ChronoKeeperInstance.cpp | 6 +++--- ChronoKeeper/DataStoreAdminService.h | 2 +- ChronoKeeper/KeeperRegClient.h | 6 +++--- ChronoKeeper/StoryChunkExtractionQueue.h | 2 +- ChronoKeeper/StoryChunkExtractor.h | 2 +- ChronoKeeper/StoryPipeline.h | 2 +- ChronoStore/test/hdf5_archiver_test.cpp | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ChronoKeeper/CSVFileChunkExtractor.h b/ChronoKeeper/CSVFileChunkExtractor.h index 76cf3ea4..8a709737 100644 --- a/ChronoKeeper/CSVFileChunkExtractor.h +++ b/ChronoKeeper/CSVFileChunkExtractor.h @@ -1,8 +1,8 @@ #ifndef CSV_FILE_CHUNK_EXTRACTOR_H #define CSV_FILE_CHUNK_EXTRACTOR_H -#include "chrono_common/chronolog_types.h" -#include "chrono_common/KeeperIdCard.h" +#include "chronolog_types.h" +#include "KeeperIdCard.h" #include "StoryChunkExtractor.h" diff --git a/ChronoKeeper/ChronoKeeperInstance.cpp b/ChronoKeeper/ChronoKeeperInstance.cpp index 2b1dc5b5..bed32f3e 100644 --- a/ChronoKeeper/ChronoKeeperInstance.cpp +++ b/ChronoKeeper/ChronoKeeperInstance.cpp @@ -4,8 +4,8 @@ #include -#include "chrono_common/KeeperIdCard.h" -#include "chrono_common/KeeperStatsMsg.h" +//#include "chrono_common/KeeperIdCard.h" +//#include "chrono_common/KeeperStatsMsg.h" #include "KeeperRecordingService.h" #include "KeeperRegClient.h" #include "IngestionQueue.h" @@ -290,4 +290,4 @@ int main(int argc, char**argv) delete dataAdminEngine; LOG_INFO("[ChronoKeeperInstance] Shutdown 
completed. Exiting."); return exit_code; -} \ No newline at end of file +} diff --git a/ChronoKeeper/DataStoreAdminService.h b/ChronoKeeper/DataStoreAdminService.h index 6b03ec89..ee76bb42 100644 --- a/ChronoKeeper/DataStoreAdminService.h +++ b/ChronoKeeper/DataStoreAdminService.h @@ -6,7 +6,7 @@ #include #include -#include "chrono_common/chronolog_types.h" +#include "chronolog_types.h" #include "KeeperDataStore.h" namespace tl = thallium; diff --git a/ChronoKeeper/KeeperRegClient.h b/ChronoKeeper/KeeperRegClient.h index 90320477..bf3b04ce 100644 --- a/ChronoKeeper/KeeperRegClient.h +++ b/ChronoKeeper/KeeperRegClient.h @@ -5,9 +5,9 @@ #include #include -#include "chrono_common/KeeperIdCard.h" -#include "chrono_common/KeeperRegistrationMsg.h" -#include "chrono_common/KeeperStatsMsg.h" +#include "KeeperIdCard.h" +#include "KeeperRegistrationMsg.h" +#include "KeeperStatsMsg.h" #include "chronolog_errcode.h" namespace tl = thallium; diff --git a/ChronoKeeper/StoryChunkExtractionQueue.h b/ChronoKeeper/StoryChunkExtractionQueue.h index 66e7856b..9c4ecc91 100644 --- a/ChronoKeeper/StoryChunkExtractionQueue.h +++ b/ChronoKeeper/StoryChunkExtractionQueue.h @@ -7,7 +7,7 @@ #include #include "log.h" -#include "chrono_common/chronolog_types.h" +#include "chronolog_types.h" #include "StoryChunk.h" namespace chronolog diff --git a/ChronoKeeper/StoryChunkExtractor.h b/ChronoKeeper/StoryChunkExtractor.h index f4b376d4..0ce66499 100644 --- a/ChronoKeeper/StoryChunkExtractor.h +++ b/ChronoKeeper/StoryChunkExtractor.h @@ -8,7 +8,7 @@ #include #include -#include "chrono_common/chronolog_types.h" +#include "chronolog_types.h" #include "StoryChunkExtractionQueue.h" #include "log.h" diff --git a/ChronoKeeper/StoryPipeline.h b/ChronoKeeper/StoryPipeline.h index e7ff744a..29967cf4 100644 --- a/ChronoKeeper/StoryPipeline.h +++ b/ChronoKeeper/StoryPipeline.h @@ -7,7 +7,7 @@ #include #include -#include "chrono_common/chronolog_types.h" +#include "chronolog_types.h" #include "StoryChunk.h" #include "StoryChunkExtractionQueue.h" diff --git a/ChronoStore/test/hdf5_archiver_test.cpp b/ChronoStore/test/hdf5_archiver_test.cpp index 7f293e94..b074fab5 100644 --- a/ChronoStore/test/hdf5_archiver_test.cpp +++ b/ChronoStore/test/hdf5_archiver_test.cpp @@ -10,7 +10,7 @@ #include #include #include -#include "chrono_common/chronolog_types.h" +#include "chronolog_types.h" #define STORY "S1" #define CHRONICLE "C1.h5" From 8c80d4e00013df7da9e543a4a21de39434cce168 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 7 Mar 2024 12:26:40 -0600 Subject: [PATCH 03/40] StoryChunk bound: start --- ChronoKeeper/StoryChunk.cpp | 50 ++++++++++++++++++++++++++++++++++++- ChronoKeeper/StoryChunk.h | 47 +++++++++++++--------------------- 2 files changed, 67 insertions(+), 30 deletions(-) diff --git a/ChronoKeeper/StoryChunk.cpp b/ChronoKeeper/StoryChunk.cpp index eb484986..eca28461 100644 --- a/ChronoKeeper/StoryChunk.cpp +++ b/ChronoKeeper/StoryChunk.cpp @@ -7,7 +7,35 @@ namespace chl = chronolog; ///////////////////////// -uint32_t chronolog::StoryChunk::mergeEvents(std::map &events +chl::StoryChunk::StoryChunk(chl::StoryId const &story_id , uint64_t start_time , uint64_t end_time , uint32_t chunk_size ) + : storyId(story_id) + , startTime(start_time) + , endTime(end_time) + , revisionTime(start_time) + , chunkSize(chunk_size) + { + dataBlob = new char[chunk_size]; + } + +chl::StoryChunk::~StoryChunk() + { + delete [] dataBlob; + } + +int chl::StoryChunk::insertEvent(chl::LogEvent const &event) + { + if((event.time() >= startTime) && 
(event.time() < endTime)) + { + logEvents.insert(std::pair ({event.time(), event.clientId, event.index()}, event)); + return 1; + } + else + { return 0; } + } + +/////////// + +uint32_t chl::StoryChunk::mergeEvents(std::map &events , std::map ::iterator &merge_start) { uint32_t merged_event_count = 0; @@ -49,3 +77,23 @@ uint32_t chronolog::StoryChunk::mergeEvents(std::map &target_map, std::map ::iterator first_pos + , std::map ::iterator last_pos) + { return 0; } + +uint32_t chl::StoryChunk::extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time) + { return 0; } + +uint32_t chl::StoryChunk::extractEvents( chl::StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time) + { return 0; } + +uint32_t chl::StoryChunk::split(chl::StoryChunk & split_chunk, uint64_t time_boundary) + { return 0; } + + +uint32_t chl::StoryChunk::eraseEvents(uint64_t start_time, uint64_t end_time) + { return 0; } + diff --git a/ChronoKeeper/StoryChunk.h b/ChronoKeeper/StoryChunk.h index c2ae7f25..902e65ec 100644 --- a/ChronoKeeper/StoryChunk.h +++ b/ChronoKeeper/StoryChunk.h @@ -25,13 +25,9 @@ class StoryChunk { public: - StoryChunk(StoryId const &story_id = 0, uint64_t start_time = 0, uint64_t end_time = 0): storyId(story_id) - , startTime(start_time) - , endTime(end_time) - , revisionTime(start_time) - {} + StoryChunk(StoryId const &story_id = 0, uint64_t start_time = 0, uint64_t end_time = 0, uint32_t chunk_size = 1024); - ~StoryChunk() = default; + ~StoryChunk(); StoryId const &getStoryId() const { return storyId; } @@ -56,44 +52,37 @@ class StoryChunk uint64_t firstEventTime() const { return (*logEvents.begin()).second.time(); } + + uint64_t lastEventTime() const + { return (*logEvents.begin()).second.time(); } - int insertEvent(LogEvent const &event) - { - if((event.time() >= startTime) && (event.time() < endTime)) - { - logEvents.insert(std::pair ({event.time(), event.clientId, event.index()}, event)); - return 1; - } - else - { return 0; } - } - - //INNA: TODO implement the functions!!! 
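// A minimal sketch (not part of this patch) of how the two-argument mergeEvents()
// overload declared below might be written in StoryChunk.cpp on top of insertEvent()
// and the logEvents map; treating the 0/0 defaults as "use this chunk's own
// [startTime, endTime) bounds" is an assumption of the sketch, not something the
// patch specifies.
uint32_t chl::StoryChunk::mergeEvents(chl::StoryChunk &other_chunk, uint64_t start_time, uint64_t end_time)
{
    uint64_t lower = (start_time != 0 ? start_time : startTime);
    uint64_t upper = (end_time != 0 ? end_time : endTime);
    uint32_t merged_event_count = 0;
    for(auto event_iter = other_chunk.logEvents.begin(); event_iter != other_chunk.logEvents.end(); ++event_iter)
    {
        chl::LogEvent const &event = (*event_iter).second;
        if(event.time() < lower || event.time() >= upper)
        { continue; }
        // insertEvent() re-checks this chunk's own [startTime, endTime) and returns 1 on success
        if(1 == insertEvent(event))
        { merged_event_count++; }
    }
    return merged_event_count;
}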
+ int insertEvent(LogEvent const &); + uint32_t mergeEvents(std::map &events, std::map ::iterator &merge_start); - uint32_t mergeEvents(StoryChunk &other_chunk) - { return 0; } - - uint32_t mergeEvents(StoryChunk &other_chunk, uint64_t start_time, uint64_t end_time) - { return 0; } + uint32_t mergeEvents(StoryChunk &other_chunk, uint64_t start_time =0, uint64_t end_time=0); uint32_t extractEvents(std::map &target_map, std::map ::iterator first_pos - , std::map ::iterator last_pos) - { return 0; } + , std::map ::iterator last_pos); + + uint32_t extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time); + + uint32_t extractEvents( StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time); - uint32_t extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time) - { return 0; } + uint32_t split(StoryChunk & split_chunk, uint64_t time_boundary); - uint32_t eraseEvents(uint64_t start_time, uint64_t end_time) - { return 0; } + uint32_t eraseEvents(uint64_t start_time, uint64_t end_time); private: StoryId storyId; uint64_t startTime; uint64_t endTime; uint64_t revisionTime; + uint32_t chunkSize; + char * dataBlob; + std::map eventOffsetMap; std::map logEvents; }; } From 6cde88bf4a5e2067d187886199a950a795219b1f Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 13 Mar 2024 17:26:47 -0500 Subject: [PATCH 04/40] chrono_grapher POC --- CMakeLists.txt | 1 + ChronoGrapher/CMakeLists.txt | 36 ++ ChronoGrapher/CSVFileChunkExtractor.cpp | 45 +++ ChronoGrapher/CSVFileChunkExtractor.h | 31 ++ ChronoGrapher/ChronoGrapher.cpp | 292 ++++++++++++++++ ChronoGrapher/ChronoKeeperInstance.cpp | 293 ++++++++++++++++ ChronoGrapher/ChunkIngestionQueue.h | 130 ++++++++ ChronoGrapher/DataStoreAdminService.h | 88 +++++ ChronoGrapher/KeeperDataStore.cpp | 278 ++++++++++++++++ ChronoGrapher/KeeperDataStore.h | 80 +++++ ChronoGrapher/KeeperRecordingService.h | 65 ++++ ChronoGrapher/KeeperRegClient.cpp | 9 + ChronoGrapher/KeeperRegClient.h | 115 +++++++ ChronoGrapher/StoryChunk.cpp | 99 ++++++ ChronoGrapher/StoryChunk.h | 89 +++++ ChronoGrapher/StoryChunkExtractionQueue.h | 100 ++++++ ChronoGrapher/StoryChunkExtractor.cpp | 106 ++++++ ChronoGrapher/StoryChunkExtractor.h | 73 ++++ ChronoGrapher/StoryChunkIngestionHandle.h | 66 ++++ ChronoGrapher/StoryPipeline.cpp | 387 ++++++++++++++++++++++ ChronoGrapher/StoryPipeline.h | 90 +++++ ChronoKeeper/StorytellerRecord.cpp | 6 - 22 files changed, 2473 insertions(+), 6 deletions(-) create mode 100644 ChronoGrapher/CMakeLists.txt create mode 100644 ChronoGrapher/CSVFileChunkExtractor.cpp create mode 100644 ChronoGrapher/CSVFileChunkExtractor.h create mode 100644 ChronoGrapher/ChronoGrapher.cpp create mode 100644 ChronoGrapher/ChronoKeeperInstance.cpp create mode 100644 ChronoGrapher/ChunkIngestionQueue.h create mode 100644 ChronoGrapher/DataStoreAdminService.h create mode 100644 ChronoGrapher/KeeperDataStore.cpp create mode 100644 ChronoGrapher/KeeperDataStore.h create mode 100644 ChronoGrapher/KeeperRecordingService.h create mode 100644 ChronoGrapher/KeeperRegClient.cpp create mode 100644 ChronoGrapher/KeeperRegClient.h create mode 100644 ChronoGrapher/StoryChunk.cpp create mode 100644 ChronoGrapher/StoryChunk.h create mode 100644 ChronoGrapher/StoryChunkExtractionQueue.h create mode 100644 ChronoGrapher/StoryChunkExtractor.cpp create mode 100644 ChronoGrapher/StoryChunkExtractor.h create mode 100644 ChronoGrapher/StoryChunkIngestionHandle.h create mode 100644 ChronoGrapher/StoryPipeline.cpp create mode 100644 ChronoGrapher/StoryPipeline.h 
delete mode 100644 ChronoKeeper/StorytellerRecord.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f0c0d05..27900c3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,7 @@ add_subdirectory(ChronoVisor) # ChronoKeeper add_subdirectory(ChronoKeeper) +add_subdirectory(ChronoGrapher) # ChronoStore add_subdirectory(ChronoStore) diff --git a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt new file mode 100644 index 00000000..941c2da4 --- /dev/null +++ b/ChronoGrapher/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.19) + +find_package(Thallium REQUIRED) +find_package(spdlog REQUIRED) + +message("Building CMAKE_CURRENT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}") +message("build target : chrono_grapher") + +add_executable(chrono_grapher) +target_include_directories(chrono_grapher PRIVATE include + ../chrono_common + ../ChronoAPI/ChronoLog/include) + +target_sources(chrono_grapher PRIVATE + ChronoGrapher.cpp + StoryPipeline.cpp + StoryChunk.cpp + KeeperDataStore.cpp + StoryChunkExtractor.cpp + CSVFileChunkExtractor.cpp + ../ChronoAPI/ChronoLog/src/log.cpp) +target_link_libraries(chrono_grapher thallium) +#configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in +# ${CMAKE_CURRENT_BINARY_DIR}/default_conf.json COPYONLY) + +set_target_properties(chrono_grapher PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) + +# install binary +install( + TARGETS chrono_grapher DESTINATION bin +) + +# install configuration file +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in DESTINATION conf RENAME default_conf.json +) diff --git a/ChronoGrapher/CSVFileChunkExtractor.cpp b/ChronoGrapher/CSVFileChunkExtractor.cpp new file mode 100644 index 00000000..8050cdb6 --- /dev/null +++ b/ChronoGrapher/CSVFileChunkExtractor.cpp @@ -0,0 +1,45 @@ +#include +#include +#include + +#include "chronolog_types.h" +#include "KeeperIdCard.h" +#include "CSVFileChunkExtractor.h" + +namespace tl = thallium; + +chronolog::CSVFileStoryChunkExtractor::CSVFileStoryChunkExtractor(chronolog::KeeperIdCard const &keeper_id_card + , std::string const &csv_files_root_dir) + : keeperIdCard(keeper_id_card), rootDirectory(csv_files_root_dir) +{} + +///////////// +chronolog::CSVFileStoryChunkExtractor::~CSVFileStoryChunkExtractor() +{ + LOG_INFO("[CSVFileStoryChunkExtractor] Destructor called. Cleaning up..."); +} + +///////////// +void chronolog::CSVFileStoryChunkExtractor::processStoryChunk(chronolog::StoryChunk*story_chunk) +{ + std::ofstream chunk_fstream; + std::string chunk_filename(rootDirectory); + keeperIdCard.getIPasDottedString(chunk_filename); + chunk_filename += "." + std::to_string(story_chunk->getStoryId()) + "." + + std::to_string(story_chunk->getStartTime() / 1000000000) + ".csv"; + + tl::xstream es = tl::xstream::self(); + LOG_INFO("[CSVFileStoryChunkExtractor] Processing StoryChunk: ES={}, ULT={}, StoryID={}, StartTime={}", es.get_rank() + , tl::thread::self_id(), story_chunk->getStoryId(), story_chunk->getStartTime()); + // current thread if the only one that has this storyChunk and the only one that's writing to this chunk csv file + // thus no additional locking is needed ... + chunk_fstream.open(chunk_filename, std::ofstream::out|std::ofstream::app); + for(auto event_iter = story_chunk->begin(); event_iter != story_chunk->end(); ++event_iter) + { + chronolog::LogEvent const &event = (*event_iter).second; + chunk_fstream << event << std::endl; + } + chunk_fstream.close(); + LOG_INFO("[CSVFileStoryChunkExtractor] Finished processing StoryChunk. 
File={}", chunk_filename); +} + diff --git a/ChronoGrapher/CSVFileChunkExtractor.h b/ChronoGrapher/CSVFileChunkExtractor.h new file mode 100644 index 00000000..8a709737 --- /dev/null +++ b/ChronoGrapher/CSVFileChunkExtractor.h @@ -0,0 +1,31 @@ +#ifndef CSV_FILE_CHUNK_EXTRACTOR_H +#define CSV_FILE_CHUNK_EXTRACTOR_H + +#include "chronolog_types.h" +#include "KeeperIdCard.h" +#include "StoryChunkExtractor.h" + + +namespace chronolog +{ + +class CSVFileStoryChunkExtractor: public StoryChunkExtractorBase +{ + +public: + CSVFileStoryChunkExtractor(KeeperIdCard const &keeper_id_card, std::string const &csv_files_root_dir); + + ~CSVFileStoryChunkExtractor(); + + virtual void processStoryChunk(StoryChunk*); + +private: + KeeperIdCard keeperIdCard; + std::string rootDirectory; + + +}; + + +} +#endif diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp new file mode 100644 index 00000000..29ea465d --- /dev/null +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -0,0 +1,292 @@ +#include +#include +#include + +#include + +//#include "chrono_common/KeeperIdCard.h" +//#include "chrono_common/KeeperStatsMsg.h" +#include "KeeperRecordingService.h" +#include "KeeperRegClient.h" +#include "ChunkIngestionQueue.h" +#include "StoryChunkExtractionQueue.h" +#include "StoryChunkExtractor.h" +#include "KeeperDataStore.h" +#include "DataStoreAdminService.h" +#include "ConfigurationManager.h" +#include "CSVFileChunkExtractor.h" +#include "cmd_arg_parse.h" + +#define KEEPER_GROUP_ID 7 + +// we will be using a combination of the uint32_t representation of the service IP address +// and uint16_t representation of the port number +int +service_endpoint_from_dotted_string(std::string const &ip_string, int port, std::pair &endpoint) +{ + // we will be using a combination of the uint32_t representation of the service IP address + // and uint16_t representation of the port number + // NOTE: both IP and port values in the KeeperCard are in the host byte order, not the network order) + // to identfy the ChronoKeeper process + + struct sockaddr_in sa; + // translate the recording service dotted IP string into 32bit network byte order representation + int inet_pton_return = inet_pton(AF_INET, ip_string.c_str(), &sa.sin_addr.s_addr); //returns 1 on success + if(1 != inet_pton_return) + { + LOG_ERROR("[ChronoGrapher] Invalid IP address provided: {}", ip_string); + return (-1); + } + + // translate 32bit ip from network into the host byte order + uint32_t ntoh_ip_addr = ntohl(sa.sin_addr.s_addr); + uint16_t ntoh_port = port; + endpoint = std::pair (ntoh_ip_addr, ntoh_port); + + LOG_DEBUG("[ChronoKeeperInstance] Service endpoint created: IP={}, Port={}", ip_string, port); + return 1; +} + +volatile sig_atomic_t keep_running = true; + +void sigterm_handler(int) +{ + LOG_INFO("[ChronoKeeperInstance] Received SIGTERM signal. 
Initiating shutdown procedure."); + keep_running = false; + return; +} + +/////////////////////////////////////////////// + +int main(int argc, char**argv) +{ + int exit_code = 0; + signal(SIGTERM, sigterm_handler); + + /// Configure SetUp ________________________________________________________________________________________________ + std::string conf_file_path; + conf_file_path = parse_conf_path_arg(argc, argv); + if(conf_file_path.empty()) + { + std::exit(EXIT_FAILURE); + } + ChronoLog::ConfigurationManager confManager(conf_file_path); + int result = Logger::initialize(confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGTYPE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGLEVEL + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGNAME + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILESIZE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILENUM + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.FLUSHLEVEL); + if(result == 1) + { + exit(EXIT_FAILURE); + } + LOG_INFO("Running Chronokeeper Server."); + + // Instantiate ChronoKeeper MemoryDataStore + // instantiate DataStoreAdminService + uint64_t keeper_group_id = KEEPER_GROUP_ID; + + /// DataStoreAdminService setup ____________________________________________________________________________________ + std::string datastore_service_ip = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.IP; + int datastore_service_port = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.BASE_PORT; + std::string KEEPER_DATASTORE_SERVICE_NA_STRING = + confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + + datastore_service_ip + ":" + std::to_string(datastore_service_port); + + uint16_t datastore_service_provider_id = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + chronolog::service_endpoint datastore_endpoint; + // validate ip address, instantiate DataAdminService and create ServiceId to be included in KeeperRegistrationMsg + + if(-1 == service_endpoint_from_dotted_string(datastore_service_ip, datastore_service_port, datastore_endpoint)) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to start DataStoreAdminService. 
Invalid endpoint provided."); + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] DataStoreAdminService started successfully."); + + /// KeeperRecordingService setup ___________________________________________________________________________________ + // Instantiate KeeperRecordingService + std::string KEEPER_RECORDING_SERVICE_PROTOCOL = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF; + std::string KEEPER_RECORDING_SERVICE_IP = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP; + uint16_t KEEPER_RECORDING_SERVICE_PORT = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.BASE_PORT; + uint16_t recording_service_provider_id = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + std::string KEEPER_RECORDING_SERVICE_NA_STRING = + std::string(KEEPER_RECORDING_SERVICE_PROTOCOL) + "://" + std::string(KEEPER_RECORDING_SERVICE_IP) + ":" + + std::to_string(KEEPER_RECORDING_SERVICE_PORT); + + // validate ip address, instantiate Recording Service and create KeeperIdCard + + chronolog::service_endpoint recording_endpoint; + if(-1 == service_endpoint_from_dotted_string(KEEPER_RECORDING_SERVICE_IP, KEEPER_RECORDING_SERVICE_PORT + , recording_endpoint)) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to start KeeperRecordingService. Invalid endpoint provided."); + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] KeeperRecordingService started successfully."); + + // create KeeperIdCard to identify this Keeper process in ChronoVisor's KeeperRegistry + chronolog::KeeperIdCard keeperIdCard(keeper_group_id, recording_endpoint.first, recording_endpoint.second + , recording_service_provider_id); + + std::stringstream ss; + ss << keeperIdCard; + LOG_INFO("[ChronoKeeperInstance] KeeperIdCard: {}", ss.str()); + + // Instantiate ChronoKeeper MemoryDataStore & ExtractorModule + chronolog::ChunkIngestionQueue ingestionQueue; + std::string keeper_csv_files_directory = confManager.KEEPER_CONF.STORY_FILES_DIR; + chronolog::CSVFileStoryChunkExtractor storyExtractor(keeperIdCard, keeper_csv_files_directory); + chronolog::KeeperDataStore theDataStore(ingestionQueue, storyExtractor.getExtractionQueue()); + + chronolog::ServiceId collectionServiceId(datastore_endpoint.first, datastore_endpoint.second + , datastore_service_provider_id); + tl::engine*dataAdminEngine = nullptr; + + chronolog::DataStoreAdminService*keeperDataAdminService = nullptr; + + try + { + margo_instance_id collection_margo_id = margo_init(KEEPER_DATASTORE_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE + , 1, 1); + + dataAdminEngine = new tl::engine(collection_margo_id); + + std::stringstream s3; + s3 << dataAdminEngine->self(); + LOG_DEBUG("[ChronoKeeperInstance] GroupID={} starting DataStoreAdminService at address {} with ProviderID={}" + , keeper_group_id, s3.str(), datastore_service_provider_id); + keeperDataAdminService = chronolog::DataStoreAdminService::CreateDataStoreAdminService(*dataAdminEngine + , datastore_service_provider_id + , theDataStore); + } + catch(tl::exception const &) + { + LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService"); + } + + if(nullptr == keeperDataAdminService) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService exiting"); + if(dataAdminEngine) + { delete dataAdminEngine; } + return (-1); + } + + // Instantiate KeeperRecordingService + tl::engine*recordingEngine = nullptr; + chronolog::KeeperRecordingService*keeperRecordingService = nullptr; + + try + { + 
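// (annotation, not part of the original patch) margo_init() is invoked below as
// margo_init(listen_address, MARGO_SERVER_MODE, use_progress_thread = 1, rpc_thread_count = 1),
// i.e. one dedicated network-progress thread and one RPC handler thread for the
// recording engine; the returned margo_instance_id is then wrapped in the
// thallium engine constructed on the following line.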
margo_instance_id margo_id = margo_init(KEEPER_RECORDING_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE, 1, 1); + recordingEngine = new tl::engine(margo_id); + + std::stringstream s1; + s1 << recordingEngine->self(); + LOG_INFO("[ChronoKeeperInstance] GroupID={} starting KeeperRecordingService at {} with provider_id {}" + , keeper_group_id, s1.str(), datastore_service_provider_id); + keeperRecordingService = chronolog::KeeperRecordingService::CreateKeeperRecordingService(*recordingEngine + , recording_service_provider_id + , ingestionQueue); + } + catch(tl::exception const &) + { + LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService"); + } + + if(nullptr == keeperRecordingService) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService exiting"); + delete keeperDataAdminService; + return (-1); + } + + /// KeeperRegistryClient SetUp _____________________________________________________________________________________ + // create KeeperRegistryClient and register the new KeeperRecording service with the KeeperRegistry + std::string KEEPER_REGISTRY_SERVICE_NA_STRING = + confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + + confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.IP + ":" + + std::to_string(confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.BASE_PORT); + + uint16_t KEEPER_REGISTRY_SERVICE_PROVIDER_ID = confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + chronolog::KeeperRegistryClient*keeperRegistryClient = chronolog::KeeperRegistryClient::CreateKeeperRegistryClient( + *dataAdminEngine, KEEPER_REGISTRY_SERVICE_NA_STRING, KEEPER_REGISTRY_SERVICE_PROVIDER_ID); + + if(nullptr == keeperRegistryClient) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRegistryClient; exiting"); + delete keeperRecordingService; + delete keeperDataAdminService; + return (-1); + } + + /// Registration with ChronoVisor __________________________________________________________________________________ + // try to register with chronoVisor a few times than log ERROR and exit... + int registration_status = chronolog::CL_ERR_UNKNOWN; + int retries = 5; + while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) + { + registration_status = keeperRegistryClient->send_register_msg( + chronolog::KeeperRegistrationMsg(keeperIdCard, collectionServiceId)); + retries--; + } + + if(chronolog::CL_SUCCESS != registration_status) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to register with ChronoVisor after multiple attempts. Exiting."); + delete keeperRegistryClient; + delete keeperRecordingService; + delete keeperDataAdminService; + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] Successfully registered with ChronoVisor."); + + /// Start data collection and extraction threads ___________________________________________________________________ + // services are successfulley created and keeper process had registered with ChronoVisor + // start all dataColelction and Extraction threads... + tl::abt scope; + theDataStore.startDataCollection(3); + // start extraction streams & threads + storyExtractor.startExtractionThreads(2); + + + /// Main loop for sending stats message until receiving SIGTERM ____________________________________________________ + // now we are ready to ingest records coming from the storyteller clients .... 
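// (sketch, not part of the patch) The heartbeat loop below reuses a single KeeperStatsMsg
// built from the KeeperIdCard alone, so its active_story_count field is reported as the
// default 0 on every cycle. One way to report a live count is sketched here; the
// getActiveStoryCount() accessor on KeeperDataStore is hypothetical and is not defined
// anywhere in this patch.
while(keep_running)
{
    uint32_t active_story_count = theDataStore.getActiveStoryCount();
    keeperRegistryClient->send_stats_msg(chronolog::KeeperStatsMsg(keeperIdCard, active_story_count));
    sleep(30);
}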
+ // main thread would be sending stats message until keeper process receives + // sigterm signal + chronolog::KeeperStatsMsg keeperStatsMsg(keeperIdCard); + while(keep_running) + { + keeperRegistryClient->send_stats_msg(keeperStatsMsg); + sleep(30); + } + + /// Unregister from ChronoVisor ____________________________________________________________________________________ + // Unregister from the chronoVisor so that no new story requests would be coming + keeperRegistryClient->send_unregister_msg(keeperIdCard); + delete keeperRegistryClient; + + /// Stop services and shut down ____________________________________________________________________________________ + LOG_INFO("[ChronoKeeperInstance] Initiating shutdown procedures."); + // Stop recording events + delete keeperRecordingService; + delete keeperDataAdminService; + // Shutdown the Data Collection + theDataStore.shutdownDataCollection(); + // Shutdown extraction module + // drain extractionQueue and stop extraction xStreams + storyExtractor.shutdownExtractionThreads(); + // these are not probably needed as thalium handles the engine finalization... + // recordingEngine.finalize(); + // collectionEngine.finalize(); + delete recordingEngine; + delete dataAdminEngine; + LOG_INFO("[ChronoKeeperInstance] Shutdown completed. Exiting."); + return exit_code; +} diff --git a/ChronoGrapher/ChronoKeeperInstance.cpp b/ChronoGrapher/ChronoKeeperInstance.cpp new file mode 100644 index 00000000..bed32f3e --- /dev/null +++ b/ChronoGrapher/ChronoKeeperInstance.cpp @@ -0,0 +1,293 @@ +#include +#include +#include + +#include + +//#include "chrono_common/KeeperIdCard.h" +//#include "chrono_common/KeeperStatsMsg.h" +#include "KeeperRecordingService.h" +#include "KeeperRegClient.h" +#include "IngestionQueue.h" +#include "StoryChunkExtractionQueue.h" +#include "StoryChunkExtractor.h" +#include "KeeperDataStore.h" +#include "DataStoreAdminService.h" +#include "ConfigurationManager.h" +#include "StoryChunkExtractor.h" +#include "CSVFileChunkExtractor.h" +#include "cmd_arg_parse.h" + +#define KEEPER_GROUP_ID 7 + +// we will be using a combination of the uint32_t representation of the service IP address +// and uint16_t representation of the port number +int +service_endpoint_from_dotted_string(std::string const &ip_string, int port, std::pair &endpoint) +{ + // we will be using a combination of the uint32_t representation of the service IP address + // and uint16_t representation of the port number + // NOTE: both IP and port values in the KeeperCard are in the host byte order, not the network order) + // to identfy the ChronoKeeper process + + struct sockaddr_in sa; + // translate the recording service dotted IP string into 32bit network byte order representation + int inet_pton_return = inet_pton(AF_INET, ip_string.c_str(), &sa.sin_addr.s_addr); //returns 1 on success + if(1 != inet_pton_return) + { + LOG_ERROR("[ChronoKeeperInstance] Invalid IP address provided: {}", ip_string); + return (-1); + } + + // translate 32bit ip from network into the host byte order + uint32_t ntoh_ip_addr = ntohl(sa.sin_addr.s_addr); + uint16_t ntoh_port = port; + endpoint = std::pair (ntoh_ip_addr, ntoh_port); + + LOG_DEBUG("[ChronoKeeperInstance] Service endpoint created: IP={}, Port={}", ip_string, port); + return 1; +} + +volatile sig_atomic_t keep_running = true; + +void sigterm_handler(int) +{ + LOG_INFO("[ChronoKeeperInstance] Received SIGTERM signal. 
Initiating shutdown procedure."); + keep_running = false; + return; +} + +/////////////////////////////////////////////// + +int main(int argc, char**argv) +{ + int exit_code = 0; + signal(SIGTERM, sigterm_handler); + + /// Configure SetUp ________________________________________________________________________________________________ + std::string conf_file_path; + conf_file_path = parse_conf_path_arg(argc, argv); + if(conf_file_path.empty()) + { + std::exit(EXIT_FAILURE); + } + ChronoLog::ConfigurationManager confManager(conf_file_path); + int result = Logger::initialize(confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGTYPE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGLEVEL + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGNAME + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILESIZE + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILENUM + , confManager.KEEPER_CONF.KEEPER_LOG_CONF.FLUSHLEVEL); + if(result == 1) + { + exit(EXIT_FAILURE); + } + LOG_INFO("Running Chronokeeper Server."); + + // Instantiate ChronoKeeper MemoryDataStore + // instantiate DataStoreAdminService + uint64_t keeper_group_id = KEEPER_GROUP_ID; + + /// DataStoreAdminService setup ____________________________________________________________________________________ + std::string datastore_service_ip = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.IP; + int datastore_service_port = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.BASE_PORT; + std::string KEEPER_DATASTORE_SERVICE_NA_STRING = + confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + + datastore_service_ip + ":" + std::to_string(datastore_service_port); + + uint16_t datastore_service_provider_id = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + chronolog::service_endpoint datastore_endpoint; + // validate ip address, instantiate DataAdminService and create ServiceId to be included in KeeperRegistrationMsg + + if(-1 == service_endpoint_from_dotted_string(datastore_service_ip, datastore_service_port, datastore_endpoint)) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to start DataStoreAdminService. 
Invalid endpoint provided."); + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] DataStoreAdminService started successfully."); + + /// KeeperRecordingService setup ___________________________________________________________________________________ + // Instantiate KeeperRecordingService + std::string KEEPER_RECORDING_SERVICE_PROTOCOL = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF; + std::string KEEPER_RECORDING_SERVICE_IP = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP; + uint16_t KEEPER_RECORDING_SERVICE_PORT = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.BASE_PORT; + uint16_t recording_service_provider_id = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + std::string KEEPER_RECORDING_SERVICE_NA_STRING = + std::string(KEEPER_RECORDING_SERVICE_PROTOCOL) + "://" + std::string(KEEPER_RECORDING_SERVICE_IP) + ":" + + std::to_string(KEEPER_RECORDING_SERVICE_PORT); + + // validate ip address, instantiate Recording Service and create KeeperIdCard + + chronolog::service_endpoint recording_endpoint; + if(-1 == service_endpoint_from_dotted_string(KEEPER_RECORDING_SERVICE_IP, KEEPER_RECORDING_SERVICE_PORT + , recording_endpoint)) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to start KeeperRecordingService. Invalid endpoint provided."); + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] KeeperRecordingService started successfully."); + + // create KeeperIdCard to identify this Keeper process in ChronoVisor's KeeperRegistry + chronolog::KeeperIdCard keeperIdCard(keeper_group_id, recording_endpoint.first, recording_endpoint.second + , recording_service_provider_id); + + std::stringstream ss; + ss << keeperIdCard; + LOG_INFO("[ChronoKeeperInstance] KeeperIdCard: {}", ss.str()); + + // Instantiate ChronoKeeper MemoryDataStore & ExtractorModule + chronolog::IngestionQueue ingestionQueue; + std::string keeper_csv_files_directory = confManager.KEEPER_CONF.STORY_FILES_DIR; + chronolog::CSVFileStoryChunkExtractor storyExtractor(keeperIdCard, keeper_csv_files_directory); + chronolog::KeeperDataStore theDataStore(ingestionQueue, storyExtractor.getExtractionQueue()); + + chronolog::ServiceId collectionServiceId(datastore_endpoint.first, datastore_endpoint.second + , datastore_service_provider_id); + tl::engine*dataAdminEngine = nullptr; + + chronolog::DataStoreAdminService*keeperDataAdminService = nullptr; + + try + { + margo_instance_id collection_margo_id = margo_init(KEEPER_DATASTORE_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE + , 1, 1); + + dataAdminEngine = new tl::engine(collection_margo_id); + + std::stringstream s3; + s3 << dataAdminEngine->self(); + LOG_DEBUG("[ChronoKeeperInstance] GroupID={} starting DataStoreAdminService at address {} with ProviderID={}" + , keeper_group_id, s3.str(), datastore_service_provider_id); + keeperDataAdminService = chronolog::DataStoreAdminService::CreateDataStoreAdminService(*dataAdminEngine + , datastore_service_provider_id + , theDataStore); + } + catch(tl::exception const &) + { + LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService"); + } + + if(nullptr == keeperDataAdminService) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService exiting"); + if(dataAdminEngine) + { delete dataAdminEngine; } + return (-1); + } + + // Instantiate KeeperRecordingService + tl::engine*recordingEngine = nullptr; + chronolog::KeeperRecordingService*keeperRecordingService = nullptr; + + try + { + 
margo_instance_id margo_id = margo_init(KEEPER_RECORDING_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE, 1, 1); + recordingEngine = new tl::engine(margo_id); + + std::stringstream s1; + s1 << recordingEngine->self(); + LOG_INFO("[ChronoKeeperInstance] GroupID={} starting KeeperRecordingService at {} with provider_id {}" + , keeper_group_id, s1.str(), datastore_service_provider_id); + keeperRecordingService = chronolog::KeeperRecordingService::CreateKeeperRecordingService(*recordingEngine + , recording_service_provider_id + , ingestionQueue); + } + catch(tl::exception const &) + { + LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService"); + } + + if(nullptr == keeperRecordingService) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService exiting"); + delete keeperDataAdminService; + return (-1); + } + + /// KeeperRegistryClient SetUp _____________________________________________________________________________________ + // create KeeperRegistryClient and register the new KeeperRecording service with the KeeperRegistry + std::string KEEPER_REGISTRY_SERVICE_NA_STRING = + confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + + confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.IP + ":" + + std::to_string(confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.BASE_PORT); + + uint16_t KEEPER_REGISTRY_SERVICE_PROVIDER_ID = confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + + chronolog::KeeperRegistryClient*keeperRegistryClient = chronolog::KeeperRegistryClient::CreateKeeperRegistryClient( + *dataAdminEngine, KEEPER_REGISTRY_SERVICE_NA_STRING, KEEPER_REGISTRY_SERVICE_PROVIDER_ID); + + if(nullptr == keeperRegistryClient) + { + LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRegistryClient; exiting"); + delete keeperRecordingService; + delete keeperDataAdminService; + return (-1); + } + + /// Registration with ChronoVisor __________________________________________________________________________________ + // try to register with chronoVisor a few times than log ERROR and exit... + int registration_status = chronolog::CL_ERR_UNKNOWN; + int retries = 5; + while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) + { + registration_status = keeperRegistryClient->send_register_msg( + chronolog::KeeperRegistrationMsg(keeperIdCard, collectionServiceId)); + retries--; + } + + if(chronolog::CL_SUCCESS != registration_status) + { + LOG_CRITICAL("[ChronoKeeperInstance] Failed to register with ChronoVisor after multiple attempts. Exiting."); + delete keeperRegistryClient; + delete keeperRecordingService; + delete keeperDataAdminService; + return (-1); + } + LOG_INFO("[ChronoKeeperInstance] Successfully registered with ChronoVisor."); + + /// Start data collection and extraction threads ___________________________________________________________________ + // services are successfulley created and keeper process had registered with ChronoVisor + // start all dataColelction and Extraction threads... + tl::abt scope; + theDataStore.startDataCollection(3); + // start extraction streams & threads + storyExtractor.startExtractionThreads(2); + + + /// Main loop for sending stats message until receiving SIGTERM ____________________________________________________ + // now we are ready to ingest records coming from the storyteller clients .... 
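// (sketch, not part of the patch) The registration loop earlier in this file retries
// send_register_msg() up to five times back to back, so if ChronoVisor is not up yet
// all five attempts are spent within a few milliseconds of each other. A variant that
// spaces the attempts out is sketched here for illustration; the 5-second pause is an
// arbitrary choice, not something the patch prescribes.
int registration_status = chronolog::CL_ERR_UNKNOWN;
int retries = 5;
while((chronolog::CL_SUCCESS != registration_status) && (retries > 0))
{
    registration_status = keeperRegistryClient->send_register_msg(
            chronolog::KeeperRegistrationMsg(keeperIdCard, collectionServiceId));
    if((chronolog::CL_SUCCESS != registration_status) && (retries > 1))
    { sleep(5); }
    retries--;
}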
+ // main thread would be sending stats message until keeper process receives + // sigterm signal + chronolog::KeeperStatsMsg keeperStatsMsg(keeperIdCard); + while(keep_running) + { + keeperRegistryClient->send_stats_msg(keeperStatsMsg); + sleep(30); + } + + /// Unregister from ChronoVisor ____________________________________________________________________________________ + // Unregister from the chronoVisor so that no new story requests would be coming + keeperRegistryClient->send_unregister_msg(keeperIdCard); + delete keeperRegistryClient; + + /// Stop services and shut down ____________________________________________________________________________________ + LOG_INFO("[ChronoKeeperInstance] Initiating shutdown procedures."); + // Stop recording events + delete keeperRecordingService; + delete keeperDataAdminService; + // Shutdown the Data Collection + theDataStore.shutdownDataCollection(); + // Shutdown extraction module + // drain extractionQueue and stop extraction xStreams + storyExtractor.shutdownExtractionThreads(); + // these are not probably needed as thalium handles the engine finalization... + // recordingEngine.finalize(); + // collectionEngine.finalize(); + delete recordingEngine; + delete dataAdminEngine; + LOG_INFO("[ChronoKeeperInstance] Shutdown completed. Exiting."); + return exit_code; +} diff --git a/ChronoGrapher/ChunkIngestionQueue.h b/ChronoGrapher/ChunkIngestionQueue.h new file mode 100644 index 00000000..e47e4c79 --- /dev/null +++ b/ChronoGrapher/ChunkIngestionQueue.h @@ -0,0 +1,130 @@ +#ifndef CHUNK_INGESTION_QUEUE_H +#define CHUNK_INGESTION_QUEUE_H + + +#include +#include +#include +#include +#include "log.h" + +#include "chronolog_types.h" +#include "StoryChunk.h" +#include "StoryChunkIngestionHandle.h" + +// +// IngestionQueue is a funnel into the MemoryDataStore +// std::deque guarantees O(1) time for addidng elements and resizing +// (vector of vectors implementation) + +namespace chronolog +{ + +class ChunkIngestionQueue +{ +public: + ChunkIngestionQueue() + {} + + ~ChunkIngestionQueue() + { shutDown(); } + + void addStoryIngestionHandle(StoryId const &story_id, StoryChunkIngestionHandle*ingestion_handle) + { + std::lock_guard lock(ingestionQueueMutex); + storyIngestionHandles.emplace(std::pair (story_id, ingestion_handle)); + LOG_DEBUG("[IngestionQueue] Added handle for StoryID={}: HandleAddress={}, StoryIngestionHandles={}, HandleMapSize={}" + , story_id, static_cast(ingestion_handle), reinterpret_cast(&storyIngestionHandles) + , storyIngestionHandles.size()); + } + + void removeStoryIngestionHandle(StoryId const &story_id) + { + std::lock_guard lock(ingestionQueueMutex); + if(storyIngestionHandles.erase(story_id)) + { + LOG_DEBUG("[IngestionQueue] Removed handle for StoryID={}. Current handle MapSize={}", story_id + , storyIngestionHandles.size()); + } + else + { + LOG_WARNING("[IngestionQueue] Tried to remove non-existent handle for StoryID={}.", story_id); + } + } + + void ingestStoryChunk(StoryChunk* chunk) + { + LOG_DEBUG("[IngestionQueue] Received chunk for StoryID={}: HandleMapSize={}", chunk->getStoryId(), storyIngestionHandles.size()); + auto ingestionHandle_iter = storyIngestionHandles.find(chunk->getStoryId()); + if(ingestionHandle_iter == storyIngestionHandles.end()) + { + LOG_WARNING("[IngestionQueue] Orphan chunk for story {}. 
Storing for later processing.", chunk->getStoryId()); + std::lock_guard lock(ingestionQueueMutex); + orphanQueue.push_back(chunk); + } + else + { + //individual StoryIngestionHandle has its own mutex + (*ingestionHandle_iter).second->ingestChunk(chunk); + } + } + + void drainOrphanChunks() + { + if(orphanQueue.empty()) + { + LOG_DEBUG("[IngestionQueue] Orphan event queue is empty. No actions taken."); + return; + } + std::lock_guard lock(ingestionQueueMutex); + for(StoryChunkDeque::iterator iter = orphanQueue.begin(); iter != orphanQueue.end();) + { + auto ingestionHandle_iter = storyIngestionHandles.find((*iter)->getStoryId()); + if(ingestionHandle_iter != storyIngestionHandles.end()) + { + // Individual StoryIngestionHandle has its own mutex + (*ingestionHandle_iter).second->ingestChunk(*iter); + // Remove the event from the orphan deque and get the iterator to the next element prior to removal + iter = orphanQueue.erase(iter); + } + else + { + ++iter; + } + } + LOG_DEBUG("[IngestionQueue] Drained {} orphan events into known handles.", orphanQueue.size()); + } + + bool is_empty() const + { + return (orphanQueue.empty() && storyIngestionHandles.empty()); + } + + void shutDown() + { + LOG_INFO("[IngestionQueue] Initiating shutdown. HandleMapSize={}, OrphanQueueSize={}" + , storyIngestionHandles.size(), orphanQueue.size()); + // last attempt to drain orphanQueue into known ingestionHandles + drainOrphanChunks(); + // disengage all handles + std::lock_guard lock(ingestionQueueMutex); + storyIngestionHandles.clear(); + LOG_INFO("[IngestionQueue] Shutdown completed. All handles disengaged."); + } + +private: + ChunkIngestionQueue(ChunkIngestionQueue const &) = delete; + + ChunkIngestionQueue &operator=(ChunkIngestionQueue const &) = delete; + + std::mutex ingestionQueueMutex; + std::unordered_map storyIngestionHandles; + + // chunks for unknown stories or late arriving chunks for closed stories will end up + // in orphanQueue that we'll periodically try to drain into the DataStore + std::deque orphanQueue; +}; +} + +#endif + diff --git a/ChronoGrapher/DataStoreAdminService.h b/ChronoGrapher/DataStoreAdminService.h new file mode 100644 index 00000000..ee76bb42 --- /dev/null +++ b/ChronoGrapher/DataStoreAdminService.h @@ -0,0 +1,88 @@ +#ifndef DataStoreAdmin_SERVICE_H +#define DataStoreAdmin_SERVICE_H + +#include +#include +#include +#include + +#include "chronolog_types.h" +#include "KeeperDataStore.h" + +namespace tl = thallium; + +namespace chronolog +{ + +class DataStoreAdminService: public tl::provider +{ +public: + // Service should be created on the heap not the stack thus the constructor is private... + static DataStoreAdminService* + CreateDataStoreAdminService(tl::engine &tl_engine, uint16_t service_provider_id, KeeperDataStore &dataStoreInstance) + { + return new DataStoreAdminService(tl_engine, service_provider_id, dataStoreInstance); + } + + ~DataStoreAdminService() + { + LOG_DEBUG("[DataStoreAdminService] Destructor called. 
Cleaning up..."); + //remove provider finalization callback from the engine's list + get_engine().pop_finalize_callback(this); + } + + void collection_service_available(tl::request const &request) + { + request.respond(1); + } + + void shutdown_data_collection(tl::request const &request) + { + int status = 1; + theDataStore.shutdownDataCollection(); + request.respond(status); + } + + void + StartStoryRecording(tl::request const &request, std::string const &chronicle_name, std::string const &story_name + , StoryId const &story_id, uint64_t start_time) + { + LOG_INFO("[DataStoreAdminService] Starting Story Recording: StoryName={}, StoryID={}", story_name, story_id); + int return_code = theDataStore.startStoryRecording(chronicle_name, story_name, story_id, start_time); + request.respond(return_code); + } + + void StopStoryRecording(tl::request const &request, StoryId const &story_id) + { + LOG_INFO("[DataStoreAdminService] Stopping Story Recording: StoryID={}", story_id); + int return_code = theDataStore.stopStoryRecording(story_id); + request.respond(return_code); + } + +private: + DataStoreAdminService(tl::engine &tl_engine, uint16_t service_provider_id, KeeperDataStore &data_store_instance) + : tl::provider (tl_engine, service_provider_id), theDataStore(data_store_instance) + { + define("collection_service_available", &DataStoreAdminService::collection_service_available); + define("shutdown_data_collection", &DataStoreAdminService::shutdown_data_collection); + define("start_story_recording", &DataStoreAdminService::StartStoryRecording); + define("stop_story_recording", &DataStoreAdminService::StopStoryRecording); + //set up callback for the case when the engine is being finalized while this provider is still alive + get_engine().push_finalize_callback(this, [p = this]() + { delete p; }); + + std::stringstream ss; + ss << get_engine().self(); + LOG_INFO("[DataStoreAdminService] Constructed at {}. ProviderID={}", ss.str(), service_provider_id); + } + + DataStoreAdminService(DataStoreAdminService const &) = delete; + + DataStoreAdminService &operator=(DataStoreAdminService const &) = delete; + + KeeperDataStore &theDataStore; +}; + +}// namespace chronolog + +#endif diff --git a/ChronoGrapher/KeeperDataStore.cpp b/ChronoGrapher/KeeperDataStore.cpp new file mode 100644 index 00000000..298d88c9 --- /dev/null +++ b/ChronoGrapher/KeeperDataStore.cpp @@ -0,0 +1,278 @@ +#include +#include +#include +#include +#include + +#include + +#include "chronolog_errcode.h" +#include "KeeperDataStore.h" +#include "log.h" + +namespace chl = chronolog; +namespace tl = thallium; + +/////////////////////// +class ClocksourceCPPStyle +{ +public: + uint64_t getTimestamp() + { + return std::chrono::steady_clock::now().time_since_epoch().count(); + } +}; + +//////////////////////// + +int chronolog::KeeperDataStore::startStoryRecording(std::string const &chronicle, std::string const &story + , chronolog::StoryId const &story_id, uint64_t start_time + , uint32_t time_chunk_duration, uint32_t access_window) +{ + LOG_INFO("[KeeperDataStore] Start recording story: Chronicle={}, Story={}, StoryID={}", chronicle, story, story_id); + + // Get dataStoreMutex, check for story_id_presense & add new StoryPipeline if needed + std::lock_guard storeLock(dataStoreMutex); + auto pipeline_iter = theMapOfStoryPipelines.find(story_id); + if(pipeline_iter != theMapOfStoryPipelines.end()) + { + LOG_INFO("[KeeperDataStore] Story already being recorded. 
StoryID: {}", story_id); + //check it the pipeline was put on the waitingForExit list by the previous acquisition + // and remove it from there + auto waiting_iter = pipelinesWaitingForExit.find(story_id); + if(waiting_iter != pipelinesWaitingForExit.end()) + { + pipelinesWaitingForExit.erase(waiting_iter); + } + + return chronolog::CL_SUCCESS; + } + + auto result = theMapOfStoryPipelines.emplace( + std::pair (story_id, new chl::StoryPipeline(theExtractionQueue, chronicle + , story, story_id, start_time + , time_chunk_duration))); + + if(result.second) + { + LOG_INFO("[KeeperDataStore] New StoryPipeline created successfully. StoryID: {}", story_id); + pipeline_iter = result.first; + //engage StoryPipeline with the IngestionQueue + chl::StoryChunkIngestionHandle*ingestionHandle = (*pipeline_iter).second->getActiveIngestionHandle(); + theIngestionQueue.addStoryIngestionHandle(story_id, ingestionHandle); + return chronolog::CL_SUCCESS; + } + else + { + LOG_ERROR("[KeeperDataStore] Failed to create StoryPipeline for StoryID: {}. Possible memory or resource issue." + , story_id); + return CL_ERR_UNKNOWN; + } +} +//////////////////////// + +int chronolog::KeeperDataStore::stopStoryRecording(chronolog::StoryId const &story_id) +{ + LOG_DEBUG("[KeeperDataStore] Initiating stop recording for StoryID={}", story_id); + // we do not yet disengage the StoryPipeline from the IngestionQueue right away + // but put it on the WaitingForExit list to be finalized, persisted to disk , and + // removed from memory at exit_time = now+acceptance_window... + // unless there's a new story acqiusition request comes before that moment + std::lock_guard storeLock(dataStoreMutex); + auto pipeline_iter = theMapOfStoryPipelines.find(story_id); + if(pipeline_iter != theMapOfStoryPipelines.end()) + { + uint64_t exit_time = std::chrono::high_resolution_clock::now().time_since_epoch().count() + + (*pipeline_iter).second->getAcceptanceWindow(); + pipelinesWaitingForExit[(*pipeline_iter).first] = (std::pair ( + (*pipeline_iter).second, exit_time)); + LOG_INFO("[KeeperDataStore] Added StoryPipeline to waiting list for finalization. StoryID={}, ExitTime={}", story_id + , exit_time); + } + else + { + LOG_WARNING("[KeeperDataStore] Attempted to stop recording for non-existent StoryID={}", story_id); + } + return chronolog::CL_SUCCESS; +} + +//////////////////////// + +void chronolog::KeeperDataStore::collectIngestedEvents() +{ + LOG_DEBUG("[KeeperDataStore] Initiating collection of ingested events. Current state={}, Active StoryPipelines={}, PipelinesWaitingForExit={}, ThreadID={}" + , state, theMapOfStoryPipelines.size(), pipelinesWaitingForExit.size(), tl::thread::self_id()); + theIngestionQueue.drainOrphanChunks(); + + std::lock_guard storeLock(dataStoreMutex); + for(auto pipeline_iter = theMapOfStoryPipelines.begin(); + pipeline_iter != theMapOfStoryPipelines.end(); ++pipeline_iter) + { + //INNA: this can be delegated to different threads handling individual storylines... + (*pipeline_iter).second->collectIngestedEvents(); + } +} + +//////////////////////// +void chronolog::KeeperDataStore::extractDecayedStoryChunks() +{ + LOG_DEBUG("[KeeperDataStore] Initiating extraction of decayed story chunks. 
Current state={}, Active StoryPipelines={}, PipelinesWaitingForExit={}, ThreadID={}" + , state, theMapOfStoryPipelines.size(), pipelinesWaitingForExit.size(), tl::thread::self_id()); + + uint64_t current_time = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + + std::lock_guard storeLock(dataStoreMutex); + for(auto pipeline_iter = theMapOfStoryPipelines.begin(); + pipeline_iter != theMapOfStoryPipelines.end(); ++pipeline_iter) + { + (*pipeline_iter).second->extractDecayedStoryChunks(current_time); + } +} +//////////////////////// + +void chronolog::KeeperDataStore::retireDecayedPipelines() +{ + LOG_DEBUG("[KeeperDataStore] Initiating retirement of decayed pipelines. Current state={}, Active StoryPipelines={}, PipelinesWaitingForExit={}, ThreadID={}" + , state, theMapOfStoryPipelines.size(), pipelinesWaitingForExit.size(), tl::thread::self_id()); + + if(!theMapOfStoryPipelines.empty()) + { + std::lock_guard storeLock(dataStoreMutex); + + uint64_t current_time = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + for(auto pipeline_iter = pipelinesWaitingForExit.begin(); pipeline_iter != pipelinesWaitingForExit.end();) + { + if(current_time >= (*pipeline_iter).second.second) + { + //current_time >= pipeline exit_time + StoryPipeline*pipeline = (*pipeline_iter).second.first; + theMapOfStoryPipelines.erase(pipeline->getStoryId()); + theIngestionQueue.removeStoryIngestionHandle(pipeline->getStoryId()); + pipeline_iter = pipelinesWaitingForExit.erase(pipeline_iter); //pipeline->getStoryId()); + delete pipeline; + } + else + { pipeline_iter++; } + + } + } + //swipe through pipelineswaiting and remove all those with nullptr + LOG_DEBUG("[KeeperDataStore] Completed retirement of decayed pipelines. Current state={}, Active StoryPipelines={}, PipelinesWaitingForExit={}, ThreadID={}" + , state, theMapOfStoryPipelines.size(), pipelinesWaitingForExit.size(), tl::thread::self_id()); +} + +void chronolog::KeeperDataStore::dataCollectionTask() +{ + //run dataCollectionTask as long as the state == RUNNING + // or there're still events left to collect and + // storyPipelines left to retire... + tl::xstream es = tl::xstream::self(); + LOG_DEBUG("[KeeperDataStore] Initiating DataCollectionTask. ESrank={}, ThreadID={}", es.get_rank() + , tl::thread::self_id()); + + while(!is_shutting_down() || !theIngestionQueue.is_empty() || !theMapOfStoryPipelines.empty()) + { + LOG_DEBUG("[KeeperDataStore] Running DataCollection iteration. ESrank={}, ThreadID={}", es.get_rank() + , tl::thread::self_id()); + for(int i = 0; i < 6; ++i) + { + collectIngestedEvents(); + sleep(10); + } + extractDecayedStoryChunks(); + retireDecayedPipelines(); + } + LOG_DEBUG("[KeeperDataStore] Exiting DataCollectionTask thread {}", tl::thread::self_id()); +} + +//////////////////////// +void chronolog::KeeperDataStore::startDataCollection(int stream_count) +{ + std::lock_guard storeLock(dataStoreStateMutex); + if(is_running() || is_shutting_down()) + { + LOG_INFO("[KeeperDataStore] Data collection is already running or shutting down. Ignoring request."); + return; + } + + LOG_INFO("[KeeperDataStore] Starting data collection. 
StreamCount={}, ThreadID={}", stream_count + , tl::thread::self_id()); + state = RUNNING; + + for(int i = 0; i < stream_count; ++i) + { + tl::managed es = tl::xstream::create(); + dataStoreStreams.push_back(std::move(es)); + } + + for(int i = 0; i < 2 * stream_count; ++i) + { + tl::managed th = dataStoreStreams[i % (dataStoreStreams.size())]->make_thread([p = this]() + { p->dataCollectionTask(); }); + dataStoreThreads.push_back(std::move(th)); + } + LOG_INFO("[KeeperDataStore] Data collection started successfully. Stream count={}, ThreadID={}", stream_count + , tl::thread::self_id()); +} +////////////////////////////// + +void chronolog::KeeperDataStore::shutdownDataCollection() +{ + LOG_INFO("[KeeperDataStore] Initiating shutdown of DataCollection. CurrentState={}, Active StoryPipelines={}, PipelinesWaitingForExit={}" + , state, theMapOfStoryPipelines.size(), pipelinesWaitingForExit.size()); + + // switch the state to shuttingDown + std::lock_guard storeLock(dataStoreStateMutex); + if(is_shutting_down()) + { + LOG_INFO("[KeeperDataStore] Data collection is already shutting down. Ignoring additional shutdown request."); + return; + } + state = SHUTTING_DOWN; + + if(!theMapOfStoryPipelines.empty()) + { + // label all existing Pipelines as waiting to exit + std::lock_guard storeLock(dataStoreMutex); + uint64_t current_time = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + + for(auto pipeline_iter = theMapOfStoryPipelines.begin(); + pipeline_iter != theMapOfStoryPipelines.end(); ++pipeline_iter) + { + if(pipelinesWaitingForExit.find((*pipeline_iter).first) == pipelinesWaitingForExit.end()) + { + uint64_t exit_time = current_time + (*pipeline_iter).second->getAcceptanceWindow(); + pipelinesWaitingForExit[(*pipeline_iter).first] = (std::pair ( + (*pipeline_iter).second, exit_time)); + } + } + } + + // Join threads & execution streams while holding stateMutex + // and just wait until all the events are collected and + // all the storyPipelines decay and retire + for(auto &th: dataStoreThreads) + { + th->join(); + } + LOG_INFO("[KeeperDataStore] All data collection threads have been joined."); + + for(auto &es: dataStoreStreams) + { + es->join(); + } + LOG_INFO("[KeeperDataStore] All data collection streams have been joined."); + LOG_INFO("[KeeperDataStore] DataCollection shutdown completed."); +} + +/////////////////////// + +// +chronolog::KeeperDataStore::~KeeperDataStore() +{ + LOG_INFO("[KeeperDataStore] Destructor called. Initiating shutdown. Active StoryPipelines count={}" + , theMapOfStoryPipelines.size()); + shutdownDataCollection(); + LOG_INFO("[KeeperDataStore] Shutdown completed successfully. 
Active StoryPipelines count={}" + , theMapOfStoryPipelines.size()); +} diff --git a/ChronoGrapher/KeeperDataStore.h b/ChronoGrapher/KeeperDataStore.h new file mode 100644 index 00000000..f8836fe1 --- /dev/null +++ b/ChronoGrapher/KeeperDataStore.h @@ -0,0 +1,80 @@ +#ifndef KEEPER_DATA_STORE_H +#define KEEPER_DATA_STORE_H + +#include +#include +#include +#include + +#include + +#include "ChunkIngestionQueue.h" +#include "StoryPipeline.h" +#include "StoryChunkExtractionQueue.h" + + +namespace chronolog +{ + + +class KeeperDataStore +{ + + enum DataStoreState + { + UNKNOWN = 0, RUNNING = 1, // active stories + SHUTTING_DOWN = 2 // Shutting down services + }; + + +public: + KeeperDataStore(ChunkIngestionQueue &ingestion_queue, StoryChunkExtractionQueue &extraction_queue): state(UNKNOWN) + , theIngestionQueue( + ingestion_queue), theExtractionQueue(extraction_queue) + {} + + ~KeeperDataStore(); + + bool is_running() const + { return (RUNNING == state); } + + bool is_shutting_down() const + { return (SHUTTING_DOWN == state); } + + int startStoryRecording(ChronicleName const &, StoryName const &, StoryId const &, uint64_t start_time + , uint32_t time_chunk_ranularity = 30, uint32_t access_window = 60); + + int stopStoryRecording(StoryId const &); + + void collectIngestedEvents(); + + void extractDecayedStoryChunks(); + + void retireDecayedPipelines(); + + void startDataCollection(int stream_count); + + void shutdownDataCollection(); + + void dataCollectionTask(); + +private: + KeeperDataStore(KeeperDataStore const &) = delete; + + KeeperDataStore &operator=(KeeperDataStore const &) = delete; + + DataStoreState state; + std::mutex dataStoreStateMutex; + ChunkIngestionQueue &theIngestionQueue; + StoryChunkExtractionQueue &theExtractionQueue; + std::vector > dataStoreStreams; + std::vector > dataStoreThreads; + + std::mutex dataStoreMutex; + std::unordered_map theMapOfStoryPipelines; + std::unordered_map > pipelinesWaitingForExit; + +}; + +} +#endif diff --git a/ChronoGrapher/KeeperRecordingService.h b/ChronoGrapher/KeeperRecordingService.h new file mode 100644 index 00000000..f02bac58 --- /dev/null +++ b/ChronoGrapher/KeeperRecordingService.h @@ -0,0 +1,65 @@ +#ifndef KEEPER_RECORDING_SERVICE_H +#define KEEPER_RECORDING_SERVICE_H + +#include +#include +#include +#include + +#include "chronolog_errcode.h" +#include "KeeperIdCard.h" +#include "chronolog_types.h" +#include "ChunkIngestionQueue.h" + +namespace tl = thallium; + +namespace chronolog +{ +class KeeperRecordingService: public tl::provider +{ +public: + // KeeperRecordingService should be created on the heap not the stack thus the constructor is private... + static KeeperRecordingService* + CreateKeeperRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) + { + return new KeeperRecordingService(tl_engine, service_provider_id, ingestion_queue); + } + + ~KeeperRecordingService() + { + LOG_DEBUG("[KeeperRecordingService] Destructor called. 
Cleaning up..."); + get_engine().pop_finalize_callback(this); + } +/* + INN: replace this method with chunk receptor method + void on_chunk_received(tl::request const &request, LogEvent const &log_event) + { + // ClientId teller_id, StoryId story_id, + // ChronoTick const& chrono_tick, std::string const& record) + std::stringstream ss; + ss << log_event; + LOG_DEBUG("[KeeperRecordingService] Recording event: {}", ss.str()); + theIngestionQueue.ingestLogEvent(log_event); + request.respond(chronolog::CL_SUCCESS); + } +*/ +private: + KeeperRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) + : tl::provider (tl_engine, service_provider_id), theIngestionQueue(ingestion_queue) + { + //define("", &KeeperRecordingService::record_event, tl::ignore_return_value()); + //set up callback for the case when the engine is being finalized while this provider is still alive + get_engine().push_finalize_callback(this, [p = this]() + { delete p; }); + } + + KeeperRecordingService(KeeperRecordingService const &) = delete; + + KeeperRecordingService &operator=(KeeperRecordingService const &) = delete; + + ChunkIngestionQueue &theIngestionQueue; +}; + +}// namespace chronolog + +#endif diff --git a/ChronoGrapher/KeeperRegClient.cpp b/ChronoGrapher/KeeperRegClient.cpp new file mode 100644 index 00000000..f74fb97f --- /dev/null +++ b/ChronoGrapher/KeeperRegClient.cpp @@ -0,0 +1,9 @@ +int main(int argc, char**argv) +{ + if(argc != 3) + { + std::cerr << "Usage: " << argv[0] << "
" << std::endl; + exit(0); + } + tl::engine myEngine("ofi+sockets", THALLIUM_CLIENT_MODE); +} \ No newline at end of file diff --git a/ChronoGrapher/KeeperRegClient.h b/ChronoGrapher/KeeperRegClient.h new file mode 100644 index 00000000..bf3b04ce --- /dev/null +++ b/ChronoGrapher/KeeperRegClient.h @@ -0,0 +1,115 @@ +#ifndef KEEPER_REG_CLIENT_H +#define KEEPER_REG_CLIENT_H + +#include +#include +#include + +#include "KeeperIdCard.h" +#include "KeeperRegistrationMsg.h" +#include "KeeperStatsMsg.h" +#include "chronolog_errcode.h" + +namespace tl = thallium; + + +namespace chronolog +{ + +class KeeperRegistryClient +{ + +public: + static KeeperRegistryClient* + CreateKeeperRegistryClient(tl::engine &tl_engine, std::string const ®istry_service_addr + , uint16_t registry_provider_id) + { + try + { + return new KeeperRegistryClient(tl_engine, registry_service_addr, registry_provider_id); + } + catch(tl::exception const &) + { + LOG_ERROR("[KeeperRegistryClient] Failed to create KeeperRegistryClient"); + return nullptr; + } + } + + int send_register_msg(KeeperRegistrationMsg const &keeperMsg) + { + try + { + std::stringstream ss; + ss << keeperMsg; + LOG_DEBUG("[KeeperRegisterClient] Sending Register Message: {}", ss.str()); + return register_keeper.on(reg_service_ph)(keeperMsg); + } + catch(tl::exception const &) + { + LOG_ERROR("[KeeperRegisterClient] Failed Sending Register Message."); + return CL_ERR_UNKNOWN; + } + } + + int send_unregister_msg(KeeperIdCard const &keeperIdCard) + { + try + { + std::stringstream ss; + ss << keeperIdCard; + LOG_DEBUG("[KeeperRegisterClient] Sending Unregister Message: {}", ss.str()); + return unregister_keeper.on(reg_service_ph)(keeperIdCard); + } + catch(tl::exception const &) + { + LOG_ERROR("[KeeperRegisterClient] Failed Sending Unregistered Message."); + return CL_ERR_UNKNOWN; + } + } + + void send_stats_msg(KeeperStatsMsg const &keeperStatsMsg) + { + try + { + std::stringstream ss; + ss << keeperStatsMsg; + LOG_DEBUG("[KeeperRegisterClient] Sending Stats Message: {}", ss.str()); + handle_stats_msg.on(reg_service_ph)(keeperStatsMsg); + } + catch(tl::exception const &) + { + LOG_ERROR("[KeeperRegisterClient] Failed Sending Stats Message."); + } + } + + ~KeeperRegistryClient() + { + LOG_DEBUG("[KeeperRegistryClient] Destructor called. 
Cleaning up resources..."); + register_keeper.deregister(); + unregister_keeper.deregister(); + handle_stats_msg.deregister(); + } + +private: + std::string reg_service_addr; // na address of Keeper Registry Service + uint16_t reg_service_provider_id; // KeeperRegistryService provider id + tl::provider_handle reg_service_ph; //provider_handle for remote registry service + tl::remote_procedure register_keeper; + tl::remote_procedure unregister_keeper; + tl::remote_procedure handle_stats_msg; + + // constructor is private to make sure thalium rpc objects are created on the heap, not stack + KeeperRegistryClient(tl::engine &tl_engine, std::string const ®istry_addr, uint16_t registry_provider_id) + : reg_service_addr(registry_addr), reg_service_provider_id(registry_provider_id), reg_service_ph( + tl_engine.lookup(registry_addr), registry_provider_id) + { + LOG_DEBUG("[KeeperRegistryClient] Initialized for RegistryService at {} with ProviderID={}", registry_addr + , registry_provider_id); + register_keeper = tl_engine.define("register_keeper"); + unregister_keeper = tl_engine.define("unregister_keeper"); + handle_stats_msg = tl_engine.define("handle_stats_msg").disable_response(); + } +}; +} + +#endif diff --git a/ChronoGrapher/StoryChunk.cpp b/ChronoGrapher/StoryChunk.cpp new file mode 100644 index 00000000..eca28461 --- /dev/null +++ b/ChronoGrapher/StoryChunk.cpp @@ -0,0 +1,99 @@ + + +#include "StoryChunk.h" + + +namespace chl = chronolog; + +///////////////////////// + +chl::StoryChunk::StoryChunk(chl::StoryId const &story_id , uint64_t start_time , uint64_t end_time , uint32_t chunk_size ) + : storyId(story_id) + , startTime(start_time) + , endTime(end_time) + , revisionTime(start_time) + , chunkSize(chunk_size) + { + dataBlob = new char[chunk_size]; + } + +chl::StoryChunk::~StoryChunk() + { + delete [] dataBlob; + } + +int chl::StoryChunk::insertEvent(chl::LogEvent const &event) + { + if((event.time() >= startTime) && (event.time() < endTime)) + { + logEvents.insert(std::pair ({event.time(), event.clientId, event.index()}, event)); + return 1; + } + else + { return 0; } + } + +/////////// + +uint32_t chl::StoryChunk::mergeEvents(std::map &events + , std::map ::iterator &merge_start) +{ + uint32_t merged_event_count = 0; + std::map ::iterator first_merged, last_merged; + + if((*merge_start).second.time() < startTime) + { + merge_start = events.lower_bound(chl::EventSequence{startTime, 0, 0}); + LOG_DEBUG("[StoryChunk] Adjusted merge start time to align with StoryChunk's start time: {}", startTime); + } + + for(auto iter = merge_start; iter != events.end(); ++iter) + { + if(insertEvent((*iter).second) > 0) + { + if(merged_event_count == 0) + { first_merged = iter; } + last_merged = iter; + merged_event_count++; + } + else + { + LOG_DEBUG("[StoryChunk] Stopped merging due to a record that couldn't be inserted."); + break; + } //stop at the first record that can't be merged + } + + if(merged_event_count > 0) + { + //remove the merged records from the original map + events.erase(first_merged, last_merged); + LOG_DEBUG("[StoryChunk] Removed {} merged records from the original event map.", merged_event_count); + } + else + { + LOG_DEBUG("[StoryChunk] No events merged during the operation."); + } + + return merged_event_count; +} + +uint32_t chl::StoryChunk::mergeEvents(chl::StoryChunk &other_chunk, uint64_t start_time, uint64_t end_time) + { return 0; } + +uint32_t chl::StoryChunk::extractEvents(std::map &target_map, std::map ::iterator first_pos + , std::map ::iterator last_pos) + { return 0; } 
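// --- Editorial sketch, not part of this patch ---
// Minimal illustration of how a batch of events keyed by
// EventSequence{event_time, client_id, index} can be merged into a StoryChunk
// covering [start, start + granularity). The helper name and variables are
// hypothetical; LogEvent construction is omitted since it is defined in
// chronolog_types.h, not here.
static uint32_t merge_batch_into_chunk(chronolog::StoryId const &story_id, uint64_t start, uint64_t granularity
                                       , std::map<chronolog::EventSequence, chronolog::LogEvent> &batch)
{
    if(batch.empty())
    { return 0; }
    chronolog::StoryChunk chunk(story_id, start, start + granularity);
    auto merge_start = batch.begin();    // mergeEvents() skips entries older than the chunk's start time
    uint32_t merged = chunk.mergeEvents(batch, merge_start);
    // Entries timestamped at or after start + granularity stay in 'batch'; note that the
    // erase(first_merged, last_merged) call above is a half-open range, so the last merged
    // entry also appears to remain in 'batch' as currently written.
    return merged;
}
// --- end of editorial sketch ---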
+ +uint32_t chl::StoryChunk::extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time) + { return 0; } + +uint32_t chl::StoryChunk::extractEvents( chl::StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time) + { return 0; } + +uint32_t chl::StoryChunk::split(chl::StoryChunk & split_chunk, uint64_t time_boundary) + { return 0; } + + +uint32_t chl::StoryChunk::eraseEvents(uint64_t start_time, uint64_t end_time) + { return 0; } + diff --git a/ChronoGrapher/StoryChunk.h b/ChronoGrapher/StoryChunk.h new file mode 100644 index 00000000..902e65ec --- /dev/null +++ b/ChronoGrapher/StoryChunk.h @@ -0,0 +1,89 @@ +#ifndef STORY_CHUNK_H +#define STORY_CHUNK_H + +#include +#include +#include "chronolog_types.h" +#include "log.h" + +namespace chronolog +{ + +typedef uint64_t chrono_time; +typedef uint32_t chrono_index; + +// StoryChunk contains all the events for the single story +// for the duration [startTime, endTime[ +// startTime included, endTime excluded +// startTime/endTime are invariant + +typedef std::tuple ArrivalSequence; + +typedef std::tuple EventSequence; + +class StoryChunk +{ +public: + + StoryChunk(StoryId const &story_id = 0, uint64_t start_time = 0, uint64_t end_time = 0, uint32_t chunk_size = 1024); + + ~StoryChunk(); + + StoryId const &getStoryId() const + { return storyId; } + + uint64_t getStartTime() const + { return startTime; } + + uint64_t getEndTime() const + { return endTime; } + + bool empty() const + { return (logEvents.empty() ? true : false); } + + std::map ::const_iterator begin() const + { return logEvents.begin(); } + + std::map ::const_iterator end() const + { return logEvents.end(); } + + std::map ::const_iterator lower_bound(uint64_t chrono_time) const + { return logEvents.lower_bound(EventSequence{chrono_time, 0, 0}); } + + uint64_t firstEventTime() const + { return (*logEvents.begin()).second.time(); } + + uint64_t lastEventTime() const + { return (*logEvents.begin()).second.time(); } + + int insertEvent(LogEvent const &); + + uint32_t + mergeEvents(std::map &events, std::map ::iterator &merge_start); + + uint32_t mergeEvents(StoryChunk &other_chunk, uint64_t start_time =0, uint64_t end_time=0); + + uint32_t + extractEvents(std::map &target_map, std::map ::iterator first_pos + , std::map ::iterator last_pos); + + uint32_t extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time); + + uint32_t extractEvents( StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time); + + uint32_t split(StoryChunk & split_chunk, uint64_t time_boundary); + + uint32_t eraseEvents(uint64_t start_time, uint64_t end_time); + +private: + StoryId storyId; + uint64_t startTime; + uint64_t endTime; + uint64_t revisionTime; + uint32_t chunkSize; + char * dataBlob; + std::map eventOffsetMap; + std::map logEvents; +}; +} +#endif diff --git a/ChronoGrapher/StoryChunkExtractionQueue.h b/ChronoGrapher/StoryChunkExtractionQueue.h new file mode 100644 index 00000000..9c4ecc91 --- /dev/null +++ b/ChronoGrapher/StoryChunkExtractionQueue.h @@ -0,0 +1,100 @@ +#ifndef CHUNK_EXTRACTION_QUEUE_H +#define CHUNK_EXTRACTION_QUEUE_H + + +#include +#include +#include +#include "log.h" + +#include "chronolog_types.h" +#include "StoryChunk.h" + +namespace chronolog +{ + +class StoryChunkExtractionQueue +{ +public: + StoryChunkExtractionQueue() + {} + + ~StoryChunkExtractionQueue() + { + LOG_DEBUG("[StoryChunkExtractionQueue] Destructor called. 
Initiating queue shutdown."); + shutDown(); + } + + void stashStoryChunk(StoryChunk*story_chunk) + { + if(nullptr == story_chunk) + { + LOG_WARNING("[StoryChunkExtractionQueue] Attempted to stash a null story chunk. Ignoring."); + return; + } + LOG_DEBUG("[StoryChunkExtractionQueue] Stashed story chunk with StoryID={} and StartTime={}" + , story_chunk->getStoryId(), story_chunk->getStartTime()); + { + std::lock_guard lock(extractionQueueMutex); + extractionDeque.push_back(story_chunk); + } + } + + StoryChunk*ejectStoryChunk() + { + std::lock_guard lock(extractionQueueMutex); + if(extractionDeque.empty()) + { + LOG_DEBUG("[StoryChunkExtractionQueue] No story chunks available for ejection."); + return nullptr; + } + StoryChunk*story_chunk = extractionDeque.front(); + extractionDeque.pop_front(); + + return story_chunk; + } + + + int size() + { + std::lock_guard lock(extractionQueueMutex); + return extractionDeque.size(); + } + + bool empty() + { + std::lock_guard lock(extractionQueueMutex); + return extractionDeque.empty(); + } + + void shutDown() + { + LOG_INFO("[StoryChunkExtractionQueue] Initiating queue shutdown. Queue size: {}", extractionDeque.size()); + if(extractionDeque.empty()) + { return; } + + //INNA: LOG a WARNING and attempt to delay shutdown until the queue is drained by the Extraction module + // if this fails , log an ERROR . + // free the remaining storychunks memory... + std::lock_guard lock(extractionQueueMutex); + while(!extractionDeque.empty()) + { + delete extractionDeque.front(); + extractionDeque.pop_front(); + } + LOG_INFO("[StoryChunkExtractionQueue] Queue has been successfully shut down and all story chunks have been freed."); + } + +private: + StoryChunkExtractionQueue(StoryChunkExtractionQueue const &) = delete; + + StoryChunkExtractionQueue &operator=(StoryChunkExtractionQueue const &) = delete; + + std::mutex extractionQueueMutex; + std::deque extractionDeque; +}; + +} + +#endif + diff --git a/ChronoGrapher/StoryChunkExtractor.cpp b/ChronoGrapher/StoryChunkExtractor.cpp new file mode 100644 index 00000000..39d6c207 --- /dev/null +++ b/ChronoGrapher/StoryChunkExtractor.cpp @@ -0,0 +1,106 @@ +#include + +#include +#include + +#include "StoryChunkExtractor.h" + + +namespace tl = thallium; + +////////////////////////////// + +void chronolog::StoryChunkExtractorBase::startExtractionThreads(int stream_count) +{ + std::lock_guard lock(extractorMutex); + + if(extractorState == RUNNING) + { + LOG_INFO("[StoryChunkExtractionBase] ExtractionModule already running. Aborting start request."); + return; + } + + extractorState = RUNNING; + LOG_DEBUG("[StoryChunkExtractionBase] Started extraction threads."); + + for(int i = 0; i < stream_count; ++i) + { + tl::managed es = tl::xstream::create(); + extractionStreams.push_back(std::move(es)); + } + + for(int i = 0; i < 2 * stream_count; ++i) + { + tl::managed th = extractionStreams[i % extractionStreams.size()]->make_thread([p = this]() + { p->drainExtractionQueue(); }); + extractionThreads.push_back(std::move(th)); + } +} +////////////////////////////// + +void chronolog::StoryChunkExtractorBase::shutdownExtractionThreads() +{ + std::lock_guard lock(extractorMutex); + + if(extractorState == SHUTTING_DOWN) + { + LOG_INFO("[StoryChunkExtractionBase] ExtractionModule already shutting down. Skipping shutdown request."); + return; + } + + extractorState = SHUTTING_DOWN; + LOG_DEBUG("[StoryChunkExtractionBase] Initiating shutdown. 
Queue size: {}", chunkExtractionQueue.size()); + + // join threads & executionstreams while holding stateMutex + for(auto ð: extractionThreads) + { + eth->join(); + } + LOG_DEBUG("[StoryChunkExtractionBase] Extraction threads successfully shut down."); + for(auto &es: extractionStreams) + { + es->join(); + } + LOG_DEBUG("[StoryChunkExtractionBase] Streams have been successfully closed."); +} + +////////////////////// +chronolog::StoryChunkExtractorBase::~StoryChunkExtractorBase() +{ + LOG_DEBUG("[StoryChunkExtractionBase] Destructor called. Initiating shutdown sequence."); + + shutdownExtractionThreads(); + + extractionThreads.clear(); + extractionStreams.clear(); +} + +////////////////////// + +void chronolog::StoryChunkExtractorBase::drainExtractionQueue() +{ + thallium::xstream es = thallium::xstream::self(); + // extraction threads will be running as long as the state doesn't change + // and untill the extractionQueue is drained in shutdown mode + while((extractorState == RUNNING) || !chunkExtractionQueue.empty()) + { + LOG_DEBUG("[StoryChunkExtractionBase] Draining queue. ES Rank: {}, ULT ID: {}, Queue Size: {}", es.get_rank() + , thallium::thread::self_id(), chunkExtractionQueue.size()); + + while(!chunkExtractionQueue.empty()) + { + StoryChunk*storyChunk = chunkExtractionQueue.ejectStoryChunk(); + if(storyChunk == nullptr) + //the queue might have been drained by another thread before the current thread acquired extractionQueue mutex + { + LOG_WARNING("[StoryChunkExtractionBase] Failed to acquire a story chunk from the queue."); + break; + } + processStoryChunk(storyChunk); // INNA: should add return type and handle the failure properly + // free the memory or reset the startTime and return to the pool of prealocated chunks + delete storyChunk; + } + sleep(30); + } +} + diff --git a/ChronoGrapher/StoryChunkExtractor.h b/ChronoGrapher/StoryChunkExtractor.h new file mode 100644 index 00000000..0ce66499 --- /dev/null +++ b/ChronoGrapher/StoryChunkExtractor.h @@ -0,0 +1,73 @@ +#ifndef STORY_CHUNK_EXTRACTOR_H +#define STORY_CHUNK_EXTRACTOR_H + + +#include +#include +#include +#include +#include + +#include "chronolog_types.h" +#include "StoryChunkExtractionQueue.h" +#include "log.h" + + +namespace tl = thallium; + +namespace chronolog +{ + +class StoryChunkExtractorBase +{ + enum ExtractorState + { + UNKNOWN = 0, RUNNING = 1, // active extraction threads + SHUTTING_DOWN = 2 // Shutting down extraction threads + }; + +public: + StoryChunkExtractorBase(): extractorState(UNKNOWN) + {} + + ~StoryChunkExtractorBase(); + + StoryChunkExtractionQueue &getExtractionQueue() + { + LOG_DEBUG("[StoryChunkExtraction] Current size of extraction queue: {}", chunkExtractionQueue.size()); + return chunkExtractionQueue; + } + + bool is_running() const + { return (extractorState == RUNNING); } + + bool is_shutting_down() const + { return (extractorState == SHUTTING_DOWN); } + + void drainExtractionQueue(); + + virtual void processStoryChunk(StoryChunk*) //=0 + { + LOG_WARNING("[StoryChunkExtraction] Base processStoryChunk method called. 
Derived class should implement specific logic."); + } + + void startExtractionThreads(int); + + void shutdownExtractionThreads(); + +private: + StoryChunkExtractorBase(StoryChunkExtractorBase const &) = delete; + + StoryChunkExtractorBase &operator=(StoryChunkExtractorBase const &) = delete; + + ExtractorState extractorState; + std::mutex extractorMutex; + StoryChunkExtractionQueue chunkExtractionQueue; + + std::vector > extractionStreams; + std::vector > extractionThreads; +}; +} + +#endif + diff --git a/ChronoGrapher/StoryChunkIngestionHandle.h b/ChronoGrapher/StoryChunkIngestionHandle.h new file mode 100644 index 00000000..7f54ddb7 --- /dev/null +++ b/ChronoGrapher/StoryChunkIngestionHandle.h @@ -0,0 +1,66 @@ +#ifndef STORY_CHUNK_INGESTION_HANDLE_H +#define STORY_CHUNK_INGESTION_HANDLE_H + +#include +#include +#include "StoryChunk.h" +// +// IngestionQueue is a funnel into the KeeperDataStore +// std::deque guarantees O(1) time for addidng elements and resizing +// (vector of vectors implementation) + +namespace chronolog +{ + +typedef std::deque StoryChunkDeque; + +class StoryChunkIngestionHandle +{ + +public: + StoryChunkIngestionHandle(std::mutex &a_mutex, StoryChunkDeque*active, StoryChunkDeque*passive): ingestionMutex(a_mutex) + , activeDeque(active) + , passiveDeque(passive) + {} + + ~StoryChunkIngestionHandle() = default; + + StoryChunkDeque &getActiveDeque() const + { return *activeDeque; } + + StoryChunkDeque &getPassiveDeque() const + { return *passiveDeque; } + + void ingestChunk(StoryChunk * chunk) + { // assume multiple service threads pushing chunks onto ingestionQueue + std::lock_guard lock(ingestionMutex); + activeDeque->push_back(chunk); + } + + void swapActiveDeque() + { + if(!passiveDeque->empty() || activeDeque->empty()) + { return; } + +//INNA: check if atomic compare_and_swap will work here + + std::lock_guard lock_guard(ingestionMutex); + if(!passiveDeque->empty() || activeDeque->empty()) + { return; } + + StoryChunkDeque*full_deque = activeDeque; + activeDeque = passiveDeque; + passiveDeque = full_deque; + } + + +private: + std::mutex &ingestionMutex; + StoryChunkDeque * activeDeque; + StoryChunkDeque * passiveDeque; +}; + +} + +#endif + diff --git a/ChronoGrapher/StoryPipeline.cpp b/ChronoGrapher/StoryPipeline.cpp new file mode 100644 index 00000000..3b5fb3e4 --- /dev/null +++ b/ChronoGrapher/StoryPipeline.cpp @@ -0,0 +1,387 @@ +#include +#include +#include + + +#include "StoryChunk.h" +#include "StoryPipeline.h" +#include "StoryChunkIngestionHandle.h" +#include "StoryChunkExtractionQueue.h" +#include "log.h" + +//#define TRACE_CHUNKING +#define TRACE_CHUNK_EXTRACTION + +namespace chl = chronolog; + +//////////////////////// + +chronolog::StoryPipeline::StoryPipeline(StoryChunkExtractionQueue &extractionQueue, std::string const &chronicle_name + , std::string const &story_name, chronolog::StoryId const &story_id + , uint64_t story_start_time, uint16_t chunk_granularity + , uint16_t acceptance_window): theExtractionQueue(extractionQueue), storyId( + story_id), chronicleName(chronicle_name), storyName(story_name), timelineStart(story_start_time), timelineEnd( + story_start_time), chunkGranularity(chunk_granularity), acceptanceWindow(acceptance_window) + , activeIngestionHandle(nullptr) +{ + activeIngestionHandle = new chl::StoryChunkIngestionHandle(ingestionMutex, &chunkQueue1, &chunkQueue2); + + //pre-initialize the pipeline map with the StoryChunks of chunkGranulary + // with the total timelength of at least 2 chunks (merging logic assumes at least 2 chunks in 
the active pipeline) + + auto story_start_point = std::chrono::time_point {} + + std::chrono::nanoseconds(timelineStart); + std::time_t time_t_story_start = std::chrono::high_resolution_clock::to_time_t(story_start_point); + LOG_INFO("[StoryPipeline] Initialized with StoryID={}, StoryStartTime={}, Chronology={}, ChunkGranularity={} seconds, AcceptanceWindow={} seconds" + , storyId, std::ctime(&time_t_story_start), chronicleName, chunkGranularity / 1000000000, + acceptanceWindow / 1000000000); + + chunkGranularity *= 1000000000; // seconds =>nanoseconds + acceptanceWindow *= 1000000000; // seconds =>nanoseconds + + //adjust the timelineStart to the closest prior boundary of chunkGranularity + timelineStart -= (timelineStart % chunkGranularity); + timelineEnd = timelineStart; + + for(uint64_t start = timelineStart; timelineEnd < (timelineStart + chunkGranularity * 3);) + { + appendStoryChunk(); + } + +#ifdef TRACE_CHUNKING + auto chunk_start_point = std::chrono::time_point{} // epoch_time_point{}; + + std::chrono::nanoseconds(timelineStart); + std::time_t time_t_chunk_start = std::chrono::high_resolution_clock::to_time_t(chunk_start_point); + auto chunk_end_point = std::chrono::time_point{} + + std::chrono::nanoseconds(timelineEnd); + std::time_t time_t_chunk_end = std::chrono::high_resolution_clock::to_time_t(chunk_end_point); + LOG_DEBUG("[StoryPipeline] Created StoryPipeline with StoryID={}, StartChunk={}, EndChunk={}", + storyId, std::ctime(&time_t_chunk_start), std::ctime(&time_t_chunk_end)); +#endif + +} +/////////////////////// + +chl::StoryChunkIngestionHandle*chl::StoryPipeline::getActiveIngestionHandle() +{ + return activeIngestionHandle; +} + +/////////////////////// +chronolog::StoryPipeline::~StoryPipeline() +{ + LOG_DEBUG("[StoryPipeline] Destructor called for StoryID={}", storyId); + finalize(); +} +/////////////////////// + +void chronolog::StoryPipeline::finalize() +{ + //by this time activeIngestionHandle is disengaged from the IngestionQueue + // as part of KeeperDataStore::shutdown + if(activeIngestionHandle != nullptr) + { + while(!activeIngestionHandle->getPassiveDeque().empty()) + { + //INNA: consider adding mergeEvents(StoryChunk*) + mergeEvents(*(activeIngestionHandle->getPassiveDeque().front())); + activeIngestionHandle->getPassiveDeque().pop_front(); + } + while(!activeIngestionHandle->getPassiveDeque().empty()) + { + //INNA: consider adding mergeEvents(StoryChunk*) + mergeEvents(*(activeIngestionHandle->getPassiveDeque().front())); + activeIngestionHandle->getPassiveDeque().pop_front(); + } + delete activeIngestionHandle; + LOG_INFO("[StoryPipeline] Finalized ingestion handle for storyId: {}", storyId); + } + + //extract any remianing non-empty StoryChunks regardless of decay_time + // an active pipeline is guaranteed to have at least 2 chunks at any moment... + { + std::lock_guard lock(sequencingMutex); + while(!storyTimelineMap.empty()) + { + StoryChunk*extractedChunk = nullptr; + + extractedChunk = (*storyTimelineMap.begin()).second; + storyTimelineMap.erase(storyTimelineMap.begin()); + +#ifdef TRACE_CHUNK_EXTRACTION + LOG_TRACE("[StoryPipeline] Finalized chunk for StoryID={}. Is empty: {}", storyId, extractedChunk->empty() + ? "Yes" : "No"); +#endif + if(extractedChunk->empty()) + { // no need to carry an empty chunk any further... 
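// (Editorial note, illustrative only) finalize() deletes empty chunks here rather than
// stashing them, so theExtractionQueue only ever receives StoryChunks that hold events;
// extractDecayedStoryChunks() below applies the same rule.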
+ delete extractedChunk; + } + else + { + theExtractionQueue.stashStoryChunk(extractedChunk); + } + } + } +} + + +///////////////////// + +std::map ::iterator chronolog::StoryPipeline::prependStoryChunk() +{ + // prepend a storyChunk at the begining of storyTimeline and return the iterator to the new node +#ifdef TRACE_CHUNKING + std::chrono::time_point epoch_time_point{}; + auto chunk_start_point = epoch_time_point + std::chrono::nanoseconds(timelineStart); + std::time_t time_t_chunk_start = std::chrono::high_resolution_clock::to_time_t(chunk_start_point); + auto chunk_end_point = epoch_time_point + std::chrono::nanoseconds(timelineStart-chunkGranularity); + std::time_t time_t_chunk_end = std::chrono::high_resolution_clock::to_time_t(chunk_end_point); + LOG_TRACE("[StoryPipeline] Prepending new chunk for StoryID={} starting at: {}", + storyId, timelineStart); +#endif + auto result = storyTimelineMap.insert( + std::pair (timelineStart - chunkGranularity, new chronolog::StoryChunk( + storyId, timelineStart - chunkGranularity, timelineStart))); + if(!result.second) + { + return storyTimelineMap.end(); + } + else + { + timelineStart -= chunkGranularity; + return result.first; + } +} +///////////////////////////// + +std::map ::iterator chronolog::StoryPipeline::appendStoryChunk() +{ + // append the next storyChunk at the end of storyTimeline and return the iterator to the new node +#ifdef TRACE_CHUNKING + std::chrono::time_point epoch_time_point{}; + auto chunk_start_point = epoch_time_point + std::chrono::nanoseconds(timelineEnd); + std::time_t time_t_chunk_start = std::chrono::high_resolution_clock::to_time_t(chunk_start_point); + auto chunk_end_point = epoch_time_point + std::chrono::nanoseconds(timelineEnd+chunkGranularity); + std::time_t time_t_chunk_end = std::chrono::high_resolution_clock::to_time_t(chunk_end_point); + LOG_TRACE("[StoryPipeline] Appending new chunk for StoryID={} starting at: {}", + storyId, timelineEnd); +#endif + auto result = storyTimelineMap.insert( + std::pair (timelineEnd, new chronolog::StoryChunk(storyId, timelineEnd, + timelineEnd + chunkGranularity))); + if(!result.second) + { + return storyTimelineMap.end(); + } + else + { + timelineEnd += chunkGranularity; + return result.first; + } +} +////////////////////// + +void chronolog::StoryPipeline::collectIngestedEvents() +{ + activeIngestionHandle->swapActiveDeque(); + while(!activeIngestionHandle->getPassiveDeque().empty()) + { + //INNA: consider adding mergeEvents(StoryChunk*) + mergeEvents(*(activeIngestionHandle->getPassiveDeque().front())); + activeIngestionHandle->getPassiveDeque().pop_front(); + } + LOG_DEBUG("[StoryPipeline] Collected ingested events for StoryID={}", storyId); +} + +void chronolog::StoryPipeline::extractDecayedStoryChunks(uint64_t current_time) +{ +#ifdef TRACE_CHUNK_EXTRACTION + auto current_point = + std::chrono::time_point {} // epoch_time_point{}; + + std::chrono::nanoseconds(current_time); + std::time_t time_t_current_time = std::chrono::high_resolution_clock::to_time_t(current_point); + uint64_t head_chunk_end_time = (*storyTimelineMap.begin()).second->getEndTime(); + auto decay_point = + std::chrono::time_point {} // epoch_time_point{}; + + std::chrono::nanoseconds(head_chunk_end_time + acceptanceWindow); + std::time_t time_t_decay = std::chrono::high_resolution_clock::to_time_t(decay_point); + LOG_TRACE("[StoryPipeline] StoryID: {} - Current time: {} - Timeline size: {} - Head chunk decay time: {}", storyId + , std::ctime(&time_t_current_time), storyTimelineMap.size(), 
std::ctime(&time_t_decay)); +#endif + + while(current_time >= acceptanceWindow + (*storyTimelineMap.begin()).second->getEndTime()) + { + StoryChunk*extractedChunk = nullptr; + + { + // lock the TimelineMap & check that the decayed storychunk is still there + std::lock_guard lock(sequencingMutex); + if(current_time > acceptanceWindow + (*storyTimelineMap.begin()).second->getEndTime()) + { + extractedChunk = (*storyTimelineMap.begin()).second; + storyTimelineMap.erase(storyTimelineMap.begin()); + if(storyTimelineMap.size() < 2) + //keep at least 2 chunks in the map of active pipeline as merging relies on it ... + { appendStoryChunk(); } + } + } + + if(extractedChunk != nullptr) + { +#ifdef TRACE_CHUNK_EXTRACTION + LOG_TRACE("[StoryPipeline] StoryID: {} - Extracted chunk with start time {} is empty: {}", storyId + , extractedChunk->getStartTime(), (extractedChunk->empty() ? "Yes" : "No")); +#endif + if(extractedChunk->empty()) + { // there's no need to carry an empty chunk any further... + delete extractedChunk; + } + else + { + theExtractionQueue.stashStoryChunk(extractedChunk); + } + } + } +#ifdef TRACE_CHUNK_EXTRACTION + LOG_TRACE("[StoryPipeline] Extracting decayed chunks for StoryID={}. Queue size: {}", storyId + , theExtractionQueue.size()); +#endif +} + +//////////////////// +/* +void chronolog::StoryPipeline::mergeEvents(chronolog::EventDeque &event_deque) +{ + if(event_deque.empty()) + { return; } + + std::lock_guard lock(sequencingMutex); + chl::LogEvent event; + // the last chunk is most likely the one that would get the events, so we'd start with the last + // chunk and do the lookup only if it's not the one + // NOTE: we should never have less than 2 chunks in the active storyTimelineMap !!! + std::map ::iterator chunk_to_merge_iter = --storyTimelineMap.end(); + while(!event_deque.empty()) + { + event = event_deque.front(); + LOG_DEBUG("[StoryPipeline] StoryID: {} [Start: {}, End: {}]: Merging event time: {}", storyId, timelineStart + , timelineEnd, event.time()); + if(timelineStart <= event.time() && event.time() < timelineEnd) + { + // we expect the events in the deque to be mostly monotonous + // so we'd try the most recently used chunk first and only look for the new chunk + // if the event does not belong to the recently used chunk + if(!(*chunk_to_merge_iter).second->insertEvent(event)) + { + // find the new chunk_to_merge the event into : we are lookingt for + // the chunk preceeding the first chunk with the startTime > event.time() + chunk_to_merge_iter = storyTimelineMap.upper_bound(event.time()); + //merge into the preceeding chunk + if(!(*(--chunk_to_merge_iter)).second->insertEvent(event)) + { + LOG_ERROR("[StoryPipeline] StoryID: {} - Discarded event with timestamp: {}", storyId, event.time()); + } + } + } + else if(event.time() >= timelineEnd) + { //extend timeline forward + while(event.time() >= timelineEnd) + { + chunk_to_merge_iter = appendStoryChunk(); + if(chunk_to_merge_iter == storyTimelineMap.end()) + { break; } + } + if(chunk_to_merge_iter != storyTimelineMap.end()) + { (*chunk_to_merge_iter).second->insertEvent(event); } + else + { + LOG_ERROR("[StoryPipeline] StoryID: {} - Discarding event with timestamp: {}", storyId, event.time()); + } + } + else + { //extend timeline backward + while(event.time() < timelineStart) + { + chunk_to_merge_iter = chl::StoryPipeline::prependStoryChunk(); + if(chunk_to_merge_iter == storyTimelineMap.end()) + { break; } + } + if(chunk_to_merge_iter != storyTimelineMap.end()) + { 
(*chunk_to_merge_iter).second->insertEvent(event); } + else + { + LOG_ERROR("[StoryPipeline] StoryID: {} - Discarding event with timestamp: {}", storyId, event.time()); + } + } + event_deque.pop_front(); + } +} +*/ +////////////////////// +// Merge the StoryChunk obtained from external source into the StoryPipeline +// Note that the granularity of the StoryChunk being merged may be +// different from that of the StoryPipeline +// +void chronolog::StoryPipeline::mergeEvents(chronolog::StoryChunk &other_chunk) +{ + // we make no assumptions about the startTime or the granularity of the other_chunk + + if(other_chunk.empty()) + { return; } + + std::lock_guard lock(sequencingMutex); + + LOG_DEBUG("[StoryPipeline] StoryID: {} - Merging StoryChunk from {} to {}", storyId, other_chunk.getStartTime() + , other_chunk.getEndTime()); + + // locate the storyChunk in the StoryPipeline with the time Key not less than + // other_chunk.startTime and start merging + std::map ::iterator chunk_to_merge_iter; + + if(timelineStart <= other_chunk.getStartTime()) + { + // find the chunk_to_merge into : we are lookingt for + // the chunk preceeding the one with the startTime > other_chunk.getStartTime() + chunk_to_merge_iter = --storyTimelineMap.upper_bound(other_chunk.getStartTime()); + } + else + { + // unlikely but possible that we get some delayed events and need to prepend some chunks + // extending the timeline back to the past + LOG_DEBUG("[StoryPipeline] StoryID: {} - Prepending merge starting at timestamp {}", storyId + , other_chunk.getStartTime()); + while(timelineStart > other_chunk.getStartTime()) + { + chunk_to_merge_iter = chl::StoryPipeline::prependStoryChunk(); + if(chunk_to_merge_iter == storyTimelineMap.end()) + { + //INNA:: if prepend fails we have no choice but to discard the events we can't merge !! 
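// (Editorial walk-through, numbers assumed for illustration) With chunkGranularity = 60s
// and timelineStart = T, the pipeline holds chunks [T, T+60), [T+60, T+120), ...
// An incoming other_chunk spanning [T-30, T+90) triggers one prependStoryChunk() so the
// timeline begins at T-60; only if a prepend fails are the too-early events erased here,
// and the chunk-by-chunk drain below then resumes from the earliest existing chunk.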
+ LOG_ERROR("[StoryPipeline] StoryID: {} - Merge operation discards events between timestamps: {} and {}" + , storyId, other_chunk.getStartTime(), timelineStart); + other_chunk.eraseEvents(other_chunk.getStartTime(), timelineStart); + chunk_to_merge_iter = storyTimelineMap.begin(); + } + } + } + + //iterate through the storyTimelineMap draining the other_chunk events + while(chunk_to_merge_iter != storyTimelineMap.end() && !other_chunk.empty()) + { + (*chunk_to_merge_iter).second->mergeEvents(other_chunk); + chunk_to_merge_iter++; + } + + // if there are still records in the other_chunk with the timestamps beyond the current timelineEnd + // we extend the timeline forward by appending new chunks + + while(!other_chunk.empty()) + { + chunk_to_merge_iter = appendStoryChunk(); + if(chunk_to_merge_iter == storyTimelineMap.end()) + { break; } + + (*chunk_to_merge_iter).second->mergeEvents(other_chunk); + } + + return; +} diff --git a/ChronoGrapher/StoryPipeline.h b/ChronoGrapher/StoryPipeline.h new file mode 100644 index 00000000..be198573 --- /dev/null +++ b/ChronoGrapher/StoryPipeline.h @@ -0,0 +1,90 @@ +#ifndef STORY_PIPELINE_H +#define STORY_PIPELINE_H + +#include +#include +#include +#include +#include + +#include "chronolog_types.h" +#include "StoryChunk.h" +#include "StoryChunkExtractionQueue.h" + +namespace chronolog +{ + + +class StoryChunkIngestionHandle; + +class StoryPipeline +{ + +public: + StoryPipeline(StoryChunkExtractionQueue &, std::string const &chronicle_name, std::string const &story_name + , StoryId const &story_id, uint64_t start_time, uint16_t chunk_granularity = 60 // seconds + , uint16_t acceptance_window = 300 // seconds + ); + + StoryPipeline(StoryPipeline const &) = delete; + + StoryPipeline &operator=(StoryPipeline const &) = delete; + + ~StoryPipeline(); + + + StoryChunkIngestionHandle*getActiveIngestionHandle(); + + void collectIngestedEvents(); + + // void mergeEvents(std::deque &); + + void mergeEvents(StoryChunk &); + + void extractDecayedStoryChunks(uint64_t); + + StoryId const &getStoryId() const + { return storyId; } + + uint16_t getAcceptanceWindow() const + { return acceptanceWindow; } + +private: + + StoryChunkExtractionQueue &theExtractionQueue; + StoryId storyId; + ChronicleName chronicleName; + StoryName storyName; + uint64_t timelineStart; + uint64_t timelineEnd; + uint64_t chunkGranularity; + uint64_t acceptanceWindow; + uint64_t revisionTime; //time of the most recent merge + + // mutex used to protect the IngestionQueue from concurrent access + // by RecordingService threads + std::mutex ingestionMutex; + // two ingestion queues so that they can take turns playing + // active/passive ingestion duty + // + std::deque chunkQueue1; + std::deque chunkQueue2; + + StoryChunkIngestionHandle*activeIngestionHandle; + + // mutex used to protect Story sequencing operations + // from concurrent access by the DataStore Sequencing threads + std::mutex sequencingMutex; + + // map of storyChunks ordered by StoryChunck.startTime + std::map storyTimelineMap; + + std::map ::iterator prependStoryChunk(); + + std::map ::iterator appendStoryChunk(); + + void finalize(); +}; + +} +#endif diff --git a/ChronoKeeper/StorytellerRecord.cpp b/ChronoKeeper/StorytellerRecord.cpp deleted file mode 100644 index 399b0407..00000000 --- a/ChronoKeeper/StorytellerRecord.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include - -#include -#include -#include - From 8b417471dff1bdf22b4b458c8a9a5322af606670 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 20 Mar 2024 16:45:10 -0500 Subject: 
[PATCH 05/40] GrapherRecordingService --- ChronoGrapher/ChronoGrapher.cpp | 11 +- ChronoGrapher/ChronoKeeperInstance.cpp | 293 ------------------ ...ingService.h => GrapherRecordingService.h} | 28 +- ChronoGrapher/KeeperDataStore.cpp | 9 - 4 files changed, 18 insertions(+), 323 deletions(-) delete mode 100644 ChronoGrapher/ChronoKeeperInstance.cpp rename ChronoGrapher/{KeeperRecordingService.h => GrapherRecordingService.h} (54%) diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp index 29ea465d..4e768129 100644 --- a/ChronoGrapher/ChronoGrapher.cpp +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -4,9 +4,7 @@ #include -//#include "chrono_common/KeeperIdCard.h" -//#include "chrono_common/KeeperStatsMsg.h" -#include "KeeperRecordingService.h" +#include "GrapherRecordingService.h" #include "KeeperRegClient.h" #include "ChunkIngestionQueue.h" #include "StoryChunkExtractionQueue.h" @@ -107,8 +105,7 @@ int main(int argc, char**argv) } LOG_INFO("[ChronoKeeperInstance] DataStoreAdminService started successfully."); - /// KeeperRecordingService setup ___________________________________________________________________________________ - // Instantiate KeeperRecordingService + // Instantiate GrapherRecordingService std::string KEEPER_RECORDING_SERVICE_PROTOCOL = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF; std::string KEEPER_RECORDING_SERVICE_IP = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP; uint16_t KEEPER_RECORDING_SERVICE_PORT = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.BASE_PORT; @@ -179,7 +176,7 @@ int main(int argc, char**argv) // Instantiate KeeperRecordingService tl::engine*recordingEngine = nullptr; - chronolog::KeeperRecordingService*keeperRecordingService = nullptr; + chronolog::GrapherRecordingService* keeperRecordingService = nullptr; try { @@ -190,7 +187,7 @@ int main(int argc, char**argv) s1 << recordingEngine->self(); LOG_INFO("[ChronoKeeperInstance] GroupID={} starting KeeperRecordingService at {} with provider_id {}" , keeper_group_id, s1.str(), datastore_service_provider_id); - keeperRecordingService = chronolog::KeeperRecordingService::CreateKeeperRecordingService(*recordingEngine + keeperRecordingService = chronolog::GrapherRecordingService::CreateRecordingService(*recordingEngine , recording_service_provider_id , ingestionQueue); } diff --git a/ChronoGrapher/ChronoKeeperInstance.cpp b/ChronoGrapher/ChronoKeeperInstance.cpp deleted file mode 100644 index bed32f3e..00000000 --- a/ChronoGrapher/ChronoKeeperInstance.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include -#include -#include - -#include - -//#include "chrono_common/KeeperIdCard.h" -//#include "chrono_common/KeeperStatsMsg.h" -#include "KeeperRecordingService.h" -#include "KeeperRegClient.h" -#include "IngestionQueue.h" -#include "StoryChunkExtractionQueue.h" -#include "StoryChunkExtractor.h" -#include "KeeperDataStore.h" -#include "DataStoreAdminService.h" -#include "ConfigurationManager.h" -#include "StoryChunkExtractor.h" -#include "CSVFileChunkExtractor.h" -#include "cmd_arg_parse.h" - -#define KEEPER_GROUP_ID 7 - -// we will be using a combination of the uint32_t representation of the service IP address -// and uint16_t representation of the port number -int -service_endpoint_from_dotted_string(std::string const &ip_string, int port, std::pair &endpoint) -{ - // we will be using a combination of the uint32_t representation of the service IP address - // and uint16_t representation of the port number - // NOTE: both IP and port values 
in the KeeperCard are in the host byte order, not the network order) - // to identfy the ChronoKeeper process - - struct sockaddr_in sa; - // translate the recording service dotted IP string into 32bit network byte order representation - int inet_pton_return = inet_pton(AF_INET, ip_string.c_str(), &sa.sin_addr.s_addr); //returns 1 on success - if(1 != inet_pton_return) - { - LOG_ERROR("[ChronoKeeperInstance] Invalid IP address provided: {}", ip_string); - return (-1); - } - - // translate 32bit ip from network into the host byte order - uint32_t ntoh_ip_addr = ntohl(sa.sin_addr.s_addr); - uint16_t ntoh_port = port; - endpoint = std::pair (ntoh_ip_addr, ntoh_port); - - LOG_DEBUG("[ChronoKeeperInstance] Service endpoint created: IP={}, Port={}", ip_string, port); - return 1; -} - -volatile sig_atomic_t keep_running = true; - -void sigterm_handler(int) -{ - LOG_INFO("[ChronoKeeperInstance] Received SIGTERM signal. Initiating shutdown procedure."); - keep_running = false; - return; -} - -/////////////////////////////////////////////// - -int main(int argc, char**argv) -{ - int exit_code = 0; - signal(SIGTERM, sigterm_handler); - - /// Configure SetUp ________________________________________________________________________________________________ - std::string conf_file_path; - conf_file_path = parse_conf_path_arg(argc, argv); - if(conf_file_path.empty()) - { - std::exit(EXIT_FAILURE); - } - ChronoLog::ConfigurationManager confManager(conf_file_path); - int result = Logger::initialize(confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGTYPE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGLEVEL - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGNAME - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILESIZE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILENUM - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.FLUSHLEVEL); - if(result == 1) - { - exit(EXIT_FAILURE); - } - LOG_INFO("Running Chronokeeper Server."); - - // Instantiate ChronoKeeper MemoryDataStore - // instantiate DataStoreAdminService - uint64_t keeper_group_id = KEEPER_GROUP_ID; - - /// DataStoreAdminService setup ____________________________________________________________________________________ - std::string datastore_service_ip = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.IP; - int datastore_service_port = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.BASE_PORT; - std::string KEEPER_DATASTORE_SERVICE_NA_STRING = - confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + - datastore_service_ip + ":" + std::to_string(datastore_service_port); - - uint16_t datastore_service_provider_id = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; - - chronolog::service_endpoint datastore_endpoint; - // validate ip address, instantiate DataAdminService and create ServiceId to be included in KeeperRegistrationMsg - - if(-1 == service_endpoint_from_dotted_string(datastore_service_ip, datastore_service_port, datastore_endpoint)) - { - LOG_CRITICAL("[ChronoKeeperInstance] Failed to start DataStoreAdminService. 
Invalid endpoint provided."); - return (-1); - } - LOG_INFO("[ChronoKeeperInstance] DataStoreAdminService started successfully."); - - /// KeeperRecordingService setup ___________________________________________________________________________________ - // Instantiate KeeperRecordingService - std::string KEEPER_RECORDING_SERVICE_PROTOCOL = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF; - std::string KEEPER_RECORDING_SERVICE_IP = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP; - uint16_t KEEPER_RECORDING_SERVICE_PORT = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.BASE_PORT; - uint16_t recording_service_provider_id = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; - - std::string KEEPER_RECORDING_SERVICE_NA_STRING = - std::string(KEEPER_RECORDING_SERVICE_PROTOCOL) + "://" + std::string(KEEPER_RECORDING_SERVICE_IP) + ":" + - std::to_string(KEEPER_RECORDING_SERVICE_PORT); - - // validate ip address, instantiate Recording Service and create KeeperIdCard - - chronolog::service_endpoint recording_endpoint; - if(-1 == service_endpoint_from_dotted_string(KEEPER_RECORDING_SERVICE_IP, KEEPER_RECORDING_SERVICE_PORT - , recording_endpoint)) - { - LOG_CRITICAL("[ChronoKeeperInstance] Failed to start KeeperRecordingService. Invalid endpoint provided."); - return (-1); - } - LOG_INFO("[ChronoKeeperInstance] KeeperRecordingService started successfully."); - - // create KeeperIdCard to identify this Keeper process in ChronoVisor's KeeperRegistry - chronolog::KeeperIdCard keeperIdCard(keeper_group_id, recording_endpoint.first, recording_endpoint.second - , recording_service_provider_id); - - std::stringstream ss; - ss << keeperIdCard; - LOG_INFO("[ChronoKeeperInstance] KeeperIdCard: {}", ss.str()); - - // Instantiate ChronoKeeper MemoryDataStore & ExtractorModule - chronolog::IngestionQueue ingestionQueue; - std::string keeper_csv_files_directory = confManager.KEEPER_CONF.STORY_FILES_DIR; - chronolog::CSVFileStoryChunkExtractor storyExtractor(keeperIdCard, keeper_csv_files_directory); - chronolog::KeeperDataStore theDataStore(ingestionQueue, storyExtractor.getExtractionQueue()); - - chronolog::ServiceId collectionServiceId(datastore_endpoint.first, datastore_endpoint.second - , datastore_service_provider_id); - tl::engine*dataAdminEngine = nullptr; - - chronolog::DataStoreAdminService*keeperDataAdminService = nullptr; - - try - { - margo_instance_id collection_margo_id = margo_init(KEEPER_DATASTORE_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE - , 1, 1); - - dataAdminEngine = new tl::engine(collection_margo_id); - - std::stringstream s3; - s3 << dataAdminEngine->self(); - LOG_DEBUG("[ChronoKeeperInstance] GroupID={} starting DataStoreAdminService at address {} with ProviderID={}" - , keeper_group_id, s3.str(), datastore_service_provider_id); - keeperDataAdminService = chronolog::DataStoreAdminService::CreateDataStoreAdminService(*dataAdminEngine - , datastore_service_provider_id - , theDataStore); - } - catch(tl::exception const &) - { - LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService"); - } - - if(nullptr == keeperDataAdminService) - { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService exiting"); - if(dataAdminEngine) - { delete dataAdminEngine; } - return (-1); - } - - // Instantiate KeeperRecordingService - tl::engine*recordingEngine = nullptr; - chronolog::KeeperRecordingService*keeperRecordingService = nullptr; - - try - { - 
margo_instance_id margo_id = margo_init(KEEPER_RECORDING_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE, 1, 1); - recordingEngine = new tl::engine(margo_id); - - std::stringstream s1; - s1 << recordingEngine->self(); - LOG_INFO("[ChronoKeeperInstance] GroupID={} starting KeeperRecordingService at {} with provider_id {}" - , keeper_group_id, s1.str(), datastore_service_provider_id); - keeperRecordingService = chronolog::KeeperRecordingService::CreateKeeperRecordingService(*recordingEngine - , recording_service_provider_id - , ingestionQueue); - } - catch(tl::exception const &) - { - LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService"); - } - - if(nullptr == keeperRecordingService) - { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService exiting"); - delete keeperDataAdminService; - return (-1); - } - - /// KeeperRegistryClient SetUp _____________________________________________________________________________________ - // create KeeperRegistryClient and register the new KeeperRecording service with the KeeperRegistry - std::string KEEPER_REGISTRY_SERVICE_NA_STRING = - confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + - confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.IP + ":" + - std::to_string(confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.BASE_PORT); - - uint16_t KEEPER_REGISTRY_SERVICE_PROVIDER_ID = confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; - - chronolog::KeeperRegistryClient*keeperRegistryClient = chronolog::KeeperRegistryClient::CreateKeeperRegistryClient( - *dataAdminEngine, KEEPER_REGISTRY_SERVICE_NA_STRING, KEEPER_REGISTRY_SERVICE_PROVIDER_ID); - - if(nullptr == keeperRegistryClient) - { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRegistryClient; exiting"); - delete keeperRecordingService; - delete keeperDataAdminService; - return (-1); - } - - /// Registration with ChronoVisor __________________________________________________________________________________ - // try to register with chronoVisor a few times than log ERROR and exit... - int registration_status = chronolog::CL_ERR_UNKNOWN; - int retries = 5; - while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) - { - registration_status = keeperRegistryClient->send_register_msg( - chronolog::KeeperRegistrationMsg(keeperIdCard, collectionServiceId)); - retries--; - } - - if(chronolog::CL_SUCCESS != registration_status) - { - LOG_CRITICAL("[ChronoKeeperInstance] Failed to register with ChronoVisor after multiple attempts. Exiting."); - delete keeperRegistryClient; - delete keeperRecordingService; - delete keeperDataAdminService; - return (-1); - } - LOG_INFO("[ChronoKeeperInstance] Successfully registered with ChronoVisor."); - - /// Start data collection and extraction threads ___________________________________________________________________ - // services are successfulley created and keeper process had registered with ChronoVisor - // start all dataColelction and Extraction threads... - tl::abt scope; - theDataStore.startDataCollection(3); - // start extraction streams & threads - storyExtractor.startExtractionThreads(2); - - - /// Main loop for sending stats message until receiving SIGTERM ____________________________________________________ - // now we are ready to ingest records coming from the storyteller clients .... 
- // main thread would be sending stats message until keeper process receives - // sigterm signal - chronolog::KeeperStatsMsg keeperStatsMsg(keeperIdCard); - while(keep_running) - { - keeperRegistryClient->send_stats_msg(keeperStatsMsg); - sleep(30); - } - - /// Unregister from ChronoVisor ____________________________________________________________________________________ - // Unregister from the chronoVisor so that no new story requests would be coming - keeperRegistryClient->send_unregister_msg(keeperIdCard); - delete keeperRegistryClient; - - /// Stop services and shut down ____________________________________________________________________________________ - LOG_INFO("[ChronoKeeperInstance] Initiating shutdown procedures."); - // Stop recording events - delete keeperRecordingService; - delete keeperDataAdminService; - // Shutdown the Data Collection - theDataStore.shutdownDataCollection(); - // Shutdown extraction module - // drain extractionQueue and stop extraction xStreams - storyExtractor.shutdownExtractionThreads(); - // these are not probably needed as thalium handles the engine finalization... - // recordingEngine.finalize(); - // collectionEngine.finalize(); - delete recordingEngine; - delete dataAdminEngine; - LOG_INFO("[ChronoKeeperInstance] Shutdown completed. Exiting."); - return exit_code; -} diff --git a/ChronoGrapher/KeeperRecordingService.h b/ChronoGrapher/GrapherRecordingService.h similarity index 54% rename from ChronoGrapher/KeeperRecordingService.h rename to ChronoGrapher/GrapherRecordingService.h index f02bac58..de3bd63d 100644 --- a/ChronoGrapher/KeeperRecordingService.h +++ b/ChronoGrapher/GrapherRecordingService.h @@ -1,5 +1,5 @@ -#ifndef KEEPER_RECORDING_SERVICE_H -#define KEEPER_RECORDING_SERVICE_H +#ifndef GRAPHER_RECORDING_SERVICE_H +#define GRAPHER_RECORDING_SERVICE_H #include #include @@ -15,19 +15,19 @@ namespace tl = thallium; namespace chronolog { -class KeeperRecordingService: public tl::provider +class GrapherRecordingService: public tl::provider { public: - // KeeperRecordingService should be created on the heap not the stack thus the constructor is private... - static KeeperRecordingService* - CreateKeeperRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) + // RecordingService should be created on the heap not the stack thus the constructor is private... + static GrapherRecordingService* + CreateRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) { - return new KeeperRecordingService(tl_engine, service_provider_id, ingestion_queue); + return new GrapherRecordingService(tl_engine, service_provider_id, ingestion_queue); } - ~KeeperRecordingService() + ~GrapherRecordingService() { - LOG_DEBUG("[KeeperRecordingService] Destructor called. Cleaning up..."); + LOG_DEBUG("[GrapherRecordingService] Destructor called. 
Cleaning up..."); get_engine().pop_finalize_callback(this); } /* @@ -44,18 +44,18 @@ class KeeperRecordingService: public tl::provider } */ private: - KeeperRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) - : tl::provider (tl_engine, service_provider_id), theIngestionQueue(ingestion_queue) + GrapherRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) + : tl::provider (tl_engine, service_provider_id), theIngestionQueue(ingestion_queue) { - //define("", &KeeperRecordingService::record_event, tl::ignore_return_value()); + //define("", &RecordingService::record_event, tl::ignore_return_value()); //set up callback for the case when the engine is being finalized while this provider is still alive get_engine().push_finalize_callback(this, [p = this]() { delete p; }); } - KeeperRecordingService(KeeperRecordingService const &) = delete; + GrapherRecordingService(GrapherRecordingService const &) = delete; - KeeperRecordingService &operator=(KeeperRecordingService const &) = delete; + GrapherRecordingService &operator=(GrapherRecordingService const &) = delete; ChunkIngestionQueue &theIngestionQueue; }; diff --git a/ChronoGrapher/KeeperDataStore.cpp b/ChronoGrapher/KeeperDataStore.cpp index 298d88c9..5b647817 100644 --- a/ChronoGrapher/KeeperDataStore.cpp +++ b/ChronoGrapher/KeeperDataStore.cpp @@ -13,15 +13,6 @@ namespace chl = chronolog; namespace tl = thallium; -/////////////////////// -class ClocksourceCPPStyle -{ -public: - uint64_t getTimestamp() - { - return std::chrono::steady_clock::now().time_since_epoch().count(); - } -}; //////////////////////// From 1fdef9b066557a11b83ba1d3495540306ce9f397 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 21 Mar 2024 19:57:28 -0500 Subject: [PATCH 06/40] StoryChunk serialization --- ChronoGrapher/CMakeLists.txt | 2 +- ChronoGrapher/GrapherRecordingService.h | 18 ++-- ChronoKeeper/CMakeLists.txt | 2 +- ChronoKeeper/StoryChunk.cpp | 99 ------------------- ChronoKeeper/StoryChunk.h | 89 ----------------- .../StoryChunk.cpp | 3 - {ChronoGrapher => chrono_common}/StoryChunk.h | 19 +++- 7 files changed, 25 insertions(+), 207 deletions(-) delete mode 100644 ChronoKeeper/StoryChunk.cpp delete mode 100644 ChronoKeeper/StoryChunk.h rename {ChronoGrapher => chrono_common}/StoryChunk.cpp (96%) rename {ChronoGrapher => chrono_common}/StoryChunk.h (85%) diff --git a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt index 941c2da4..901ea032 100644 --- a/ChronoGrapher/CMakeLists.txt +++ b/ChronoGrapher/CMakeLists.txt @@ -14,8 +14,8 @@ target_include_directories(chrono_grapher PRIVATE include target_sources(chrono_grapher PRIVATE ChronoGrapher.cpp StoryPipeline.cpp - StoryChunk.cpp KeeperDataStore.cpp + ../chrono_common/StoryChunk.cpp StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) diff --git a/ChronoGrapher/GrapherRecordingService.h b/ChronoGrapher/GrapherRecordingService.h index de3bd63d..d6ca99cf 100644 --- a/ChronoGrapher/GrapherRecordingService.h +++ b/ChronoGrapher/GrapherRecordingService.h @@ -30,24 +30,20 @@ class GrapherRecordingService: public tl::provider LOG_DEBUG("[GrapherRecordingService] Destructor called. Cleaning up..."); get_engine().pop_finalize_callback(this); } -/* - INN: replace this method with chunk receptor method - void on_chunk_received(tl::request const &request, LogEvent const &log_event) + + //TODO: replace or augment this with RDMA transfer later on... 
+ void record_story_chunk(tl::request const &request, StoryChunk & chunk) { - // ClientId teller_id, StoryId story_id, - // ChronoTick const& chrono_tick, std::string const& record) - std::stringstream ss; - ss << log_event; - LOG_DEBUG("[KeeperRecordingService] Recording event: {}", ss.str()); - theIngestionQueue.ingestLogEvent(log_event); + LOG_DEBUG("[KeeperRecordingService] Recording chunk: {}", chunk.getStoryId()); + theIngestionQueue.ingestStoryChunk(&chunk); request.respond(chronolog::CL_SUCCESS); } -*/ + private: GrapherRecordingService(tl::engine &tl_engine, uint16_t service_provider_id, ChunkIngestionQueue &ingestion_queue) : tl::provider (tl_engine, service_provider_id), theIngestionQueue(ingestion_queue) { - //define("", &RecordingService::record_event, tl::ignore_return_value()); + define("record_story_chunk", &GrapherRecordingService::record_story_chunk, tl::ignore_return_value()); //set up callback for the case when the engine is being finalized while this provider is still alive get_engine().push_finalize_callback(this, [p = this]() { delete p; }); diff --git a/ChronoKeeper/CMakeLists.txt b/ChronoKeeper/CMakeLists.txt index b12c7c96..81622fe1 100644 --- a/ChronoKeeper/CMakeLists.txt +++ b/ChronoKeeper/CMakeLists.txt @@ -15,8 +15,8 @@ target_include_directories(chrono_keeper PRIVATE include target_sources(chrono_keeper PRIVATE ChronoKeeperInstance.cpp StoryPipeline.cpp - StoryChunk.cpp KeeperDataStore.cpp + ../chrono_common/StoryChunk.cpp StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) diff --git a/ChronoKeeper/StoryChunk.cpp b/ChronoKeeper/StoryChunk.cpp deleted file mode 100644 index eca28461..00000000 --- a/ChronoKeeper/StoryChunk.cpp +++ /dev/null @@ -1,99 +0,0 @@ - - -#include "StoryChunk.h" - - -namespace chl = chronolog; - -///////////////////////// - -chl::StoryChunk::StoryChunk(chl::StoryId const &story_id , uint64_t start_time , uint64_t end_time , uint32_t chunk_size ) - : storyId(story_id) - , startTime(start_time) - , endTime(end_time) - , revisionTime(start_time) - , chunkSize(chunk_size) - { - dataBlob = new char[chunk_size]; - } - -chl::StoryChunk::~StoryChunk() - { - delete [] dataBlob; - } - -int chl::StoryChunk::insertEvent(chl::LogEvent const &event) - { - if((event.time() >= startTime) && (event.time() < endTime)) - { - logEvents.insert(std::pair ({event.time(), event.clientId, event.index()}, event)); - return 1; - } - else - { return 0; } - } - -/////////// - -uint32_t chl::StoryChunk::mergeEvents(std::map &events - , std::map ::iterator &merge_start) -{ - uint32_t merged_event_count = 0; - std::map ::iterator first_merged, last_merged; - - if((*merge_start).second.time() < startTime) - { - merge_start = events.lower_bound(chl::EventSequence{startTime, 0, 0}); - LOG_DEBUG("[StoryChunk] Adjusted merge start time to align with StoryChunk's start time: {}", startTime); - } - - for(auto iter = merge_start; iter != events.end(); ++iter) - { - if(insertEvent((*iter).second) > 0) - { - if(merged_event_count == 0) - { first_merged = iter; } - last_merged = iter; - merged_event_count++; - } - else - { - LOG_DEBUG("[StoryChunk] Stopped merging due to a record that couldn't be inserted."); - break; - } //stop at the first record that can't be merged - } - - if(merged_event_count > 0) - { - //remove the merged records from the original map - events.erase(first_merged, last_merged); - LOG_DEBUG("[StoryChunk] Removed {} merged records from the original event map.", merged_event_count); - } - else - { - LOG_DEBUG("[StoryChunk] No 
events merged during the operation."); - } - - return merged_event_count; -} - -uint32_t chl::StoryChunk::mergeEvents(chl::StoryChunk &other_chunk, uint64_t start_time, uint64_t end_time) - { return 0; } - -uint32_t chl::StoryChunk::extractEvents(std::map &target_map, std::map ::iterator first_pos - , std::map ::iterator last_pos) - { return 0; } - -uint32_t chl::StoryChunk::extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time) - { return 0; } - -uint32_t chl::StoryChunk::extractEvents( chl::StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time) - { return 0; } - -uint32_t chl::StoryChunk::split(chl::StoryChunk & split_chunk, uint64_t time_boundary) - { return 0; } - - -uint32_t chl::StoryChunk::eraseEvents(uint64_t start_time, uint64_t end_time) - { return 0; } - diff --git a/ChronoKeeper/StoryChunk.h b/ChronoKeeper/StoryChunk.h deleted file mode 100644 index 902e65ec..00000000 --- a/ChronoKeeper/StoryChunk.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef STORY_CHUNK_H -#define STORY_CHUNK_H - -#include -#include -#include "chronolog_types.h" -#include "log.h" - -namespace chronolog -{ - -typedef uint64_t chrono_time; -typedef uint32_t chrono_index; - -// StoryChunk contains all the events for the single story -// for the duration [startTime, endTime[ -// startTime included, endTime excluded -// startTime/endTime are invariant - -typedef std::tuple ArrivalSequence; - -typedef std::tuple EventSequence; - -class StoryChunk -{ -public: - - StoryChunk(StoryId const &story_id = 0, uint64_t start_time = 0, uint64_t end_time = 0, uint32_t chunk_size = 1024); - - ~StoryChunk(); - - StoryId const &getStoryId() const - { return storyId; } - - uint64_t getStartTime() const - { return startTime; } - - uint64_t getEndTime() const - { return endTime; } - - bool empty() const - { return (logEvents.empty() ? 
true : false); } - - std::map ::const_iterator begin() const - { return logEvents.begin(); } - - std::map ::const_iterator end() const - { return logEvents.end(); } - - std::map ::const_iterator lower_bound(uint64_t chrono_time) const - { return logEvents.lower_bound(EventSequence{chrono_time, 0, 0}); } - - uint64_t firstEventTime() const - { return (*logEvents.begin()).second.time(); } - - uint64_t lastEventTime() const - { return (*logEvents.begin()).second.time(); } - - int insertEvent(LogEvent const &); - - uint32_t - mergeEvents(std::map &events, std::map ::iterator &merge_start); - - uint32_t mergeEvents(StoryChunk &other_chunk, uint64_t start_time =0, uint64_t end_time=0); - - uint32_t - extractEvents(std::map &target_map, std::map ::iterator first_pos - , std::map ::iterator last_pos); - - uint32_t extractEvents(std::map &target_map, uint64_t start_time, uint64_t end_time); - - uint32_t extractEvents( StoryChunk & target_chunk, uint64_t start_time, uint64_t end_time); - - uint32_t split(StoryChunk & split_chunk, uint64_t time_boundary); - - uint32_t eraseEvents(uint64_t start_time, uint64_t end_time); - -private: - StoryId storyId; - uint64_t startTime; - uint64_t endTime; - uint64_t revisionTime; - uint32_t chunkSize; - char * dataBlob; - std::map eventOffsetMap; - std::map logEvents; -}; -} -#endif diff --git a/ChronoGrapher/StoryChunk.cpp b/chrono_common/StoryChunk.cpp similarity index 96% rename from ChronoGrapher/StoryChunk.cpp rename to chrono_common/StoryChunk.cpp index eca28461..767f9336 100644 --- a/ChronoGrapher/StoryChunk.cpp +++ b/chrono_common/StoryChunk.cpp @@ -12,14 +12,11 @@ chl::StoryChunk::StoryChunk(chl::StoryId const &story_id , uint64_t start_time , , startTime(start_time) , endTime(end_time) , revisionTime(start_time) - , chunkSize(chunk_size) { - dataBlob = new char[chunk_size]; } chl::StoryChunk::~StoryChunk() { - delete [] dataBlob; } int chl::StoryChunk::insertEvent(chl::LogEvent const &event) diff --git a/ChronoGrapher/StoryChunk.h b/chrono_common/StoryChunk.h similarity index 85% rename from ChronoGrapher/StoryChunk.h rename to chrono_common/StoryChunk.h index 902e65ec..0a60e13d 100644 --- a/ChronoGrapher/StoryChunk.h +++ b/chrono_common/StoryChunk.h @@ -75,14 +75,27 @@ class StoryChunk uint32_t eraseEvents(uint64_t start_time, uint64_t end_time); + // serialization function used by thallium RPC providers + // to serialize/deserialize KeeperIdCard + template + void serialize( SerArchiveT & serT) + { + serT & storyId; + serT & startTime; + serT & endTime; + serT & revisionTime; + for(auto iter=logEvents.begin(); iter!= logEvents.end(); ++iter) + { + serT((*iter).second); + } + } + + private: StoryId storyId; uint64_t startTime; uint64_t endTime; uint64_t revisionTime; - uint32_t chunkSize; - char * dataBlob; - std::map eventOffsetMap; std::map logEvents; }; } From 72f40e1609b5abf55a6ac3f3580247b49addcb6a Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 27 Mar 2024 13:54:41 -0500 Subject: [PATCH 07/40] Grapher Configuration and GrapherRegClient --- ChronoGrapher/CMakeLists.txt | 2 + ChronoGrapher/CSVFileChunkExtractor.cpp | 8 +- ChronoGrapher/CSVFileChunkExtractor.h | 5 +- ChronoGrapher/ChronoGrapher.cpp | 165 +++++++++--------- .../{KeeperRegClient.h => GrapherRegClient.h} | 66 +++---- chrono_common/ConfigurationManager.cpp | 117 +++++++++++++ chrono_common/ConfigurationManager.h | 59 +++++++ chrono_common/GrapherIdCard.h | 83 +++++++++ chrono_common/GrapherRegistrationMsg.h | 53 ++++++ default_conf.json.in | 50 +++++- 10 files changed, 482 
insertions(+), 126 deletions(-) rename ChronoGrapher/{KeeperRegClient.h => GrapherRegClient.h} (50%) create mode 100644 chrono_common/ConfigurationManager.cpp create mode 100644 chrono_common/GrapherIdCard.h create mode 100644 chrono_common/GrapherRegistrationMsg.h diff --git a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt index 901ea032..77aa0c0b 100644 --- a/ChronoGrapher/CMakeLists.txt +++ b/ChronoGrapher/CMakeLists.txt @@ -16,10 +16,12 @@ target_sources(chrono_grapher PRIVATE StoryPipeline.cpp KeeperDataStore.cpp ../chrono_common/StoryChunk.cpp + ../chrono_common/ConfigurationManager.cpp StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) target_link_libraries(chrono_grapher thallium) + #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in # ${CMAKE_CURRENT_BINARY_DIR}/default_conf.json COPYONLY) diff --git a/ChronoGrapher/CSVFileChunkExtractor.cpp b/ChronoGrapher/CSVFileChunkExtractor.cpp index 8050cdb6..f3f4c20c 100644 --- a/ChronoGrapher/CSVFileChunkExtractor.cpp +++ b/ChronoGrapher/CSVFileChunkExtractor.cpp @@ -3,14 +3,13 @@ #include #include "chronolog_types.h" -#include "KeeperIdCard.h" #include "CSVFileChunkExtractor.h" namespace tl = thallium; -chronolog::CSVFileStoryChunkExtractor::CSVFileStoryChunkExtractor(chronolog::KeeperIdCard const &keeper_id_card +chronolog::CSVFileStoryChunkExtractor::CSVFileStoryChunkExtractor(std::string const & process_id_card , std::string const &csv_files_root_dir) - : keeperIdCard(keeper_id_card), rootDirectory(csv_files_root_dir) + : chrono_process_id(process_id_card), rootDirectory(csv_files_root_dir) {} ///////////// @@ -24,8 +23,7 @@ void chronolog::CSVFileStoryChunkExtractor::processStoryChunk(chronolog::StoryCh { std::ofstream chunk_fstream; std::string chunk_filename(rootDirectory); - keeperIdCard.getIPasDottedString(chunk_filename); - chunk_filename += "." + std::to_string(story_chunk->getStoryId()) + "." + + chunk_filename += chrono_process_id + "." + std::to_string(story_chunk->getStoryId()) + "." 
+ std::to_string(story_chunk->getStartTime() / 1000000000) + ".csv"; tl::xstream es = tl::xstream::self(); diff --git a/ChronoGrapher/CSVFileChunkExtractor.h b/ChronoGrapher/CSVFileChunkExtractor.h index 8a709737..c26e8f41 100644 --- a/ChronoGrapher/CSVFileChunkExtractor.h +++ b/ChronoGrapher/CSVFileChunkExtractor.h @@ -2,7 +2,6 @@ #define CSV_FILE_CHUNK_EXTRACTOR_H #include "chronolog_types.h" -#include "KeeperIdCard.h" #include "StoryChunkExtractor.h" @@ -13,14 +12,14 @@ class CSVFileStoryChunkExtractor: public StoryChunkExtractorBase { public: - CSVFileStoryChunkExtractor(KeeperIdCard const &keeper_id_card, std::string const &csv_files_root_dir); + CSVFileStoryChunkExtractor(std::string const & chrono_process_id_card, std::string const &csv_files_root_dir); ~CSVFileStoryChunkExtractor(); virtual void processStoryChunk(StoryChunk*); private: - KeeperIdCard keeperIdCard; + std::string chrono_process_id; std::string rootDirectory; diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp index 4e768129..7d8deb51 100644 --- a/ChronoGrapher/ChronoGrapher.cpp +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -4,8 +4,9 @@ #include +#include "GrapherIdCard.h" #include "GrapherRecordingService.h" -#include "KeeperRegClient.h" +#include "GrapherRegClient.h" #include "ChunkIngestionQueue.h" #include "StoryChunkExtractionQueue.h" #include "StoryChunkExtractor.h" @@ -15,8 +16,6 @@ #include "CSVFileChunkExtractor.h" #include "cmd_arg_parse.h" -#define KEEPER_GROUP_ID 7 - // we will be using a combination of the uint32_t representation of the service IP address // and uint16_t representation of the port number int @@ -24,8 +23,8 @@ service_endpoint_from_dotted_string(std::string const &ip_string, int port, std: { // we will be using a combination of the uint32_t representation of the service IP address // and uint16_t representation of the port number - // NOTE: both IP and port values in the KeeperCard are in the host byte order, not the network order) - // to identfy the ChronoKeeper process + // NOTE: both IP and port values in the IdCard are in the host byte order, not the network order) + // to identfy the Chrono process struct sockaddr_in sa; // translate the recording service dotted IP string into 32bit network byte order representation @@ -41,7 +40,7 @@ service_endpoint_from_dotted_string(std::string const &ip_string, int port, std: uint16_t ntoh_port = port; endpoint = std::pair (ntoh_ip_addr, ntoh_port); - LOG_DEBUG("[ChronoKeeperInstance] Service endpoint created: IP={}, Port={}", ip_string, port); + LOG_DEBUG("[ChronoGrapher] Service endpoint created: IP={}, Port={}", ip_string, port); return 1; } @@ -49,7 +48,7 @@ volatile sig_atomic_t keep_running = true; void sigterm_handler(int) { - LOG_INFO("[ChronoKeeperInstance] Received SIGTERM signal. Initiating shutdown procedure."); + LOG_INFO("[ChronoGrapher] Received SIGTERM signal. 
Initiating shutdown procedure."); keep_running = false; return; } @@ -69,75 +68,75 @@ int main(int argc, char**argv) std::exit(EXIT_FAILURE); } ChronoLog::ConfigurationManager confManager(conf_file_path); - int result = Logger::initialize(confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGTYPE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGLEVEL - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGNAME - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILESIZE - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.LOGFILENUM - , confManager.KEEPER_CONF.KEEPER_LOG_CONF.FLUSHLEVEL); + int result = Logger::initialize(confManager.GRAPHER_CONF.LOG_CONF.LOGTYPE + , confManager.GRAPHER_CONF.LOG_CONF.LOGFILE + , confManager.GRAPHER_CONF.LOG_CONF.LOGLEVEL + , confManager.GRAPHER_CONF.LOG_CONF.LOGNAME + , confManager.GRAPHER_CONF.LOG_CONF.LOGFILESIZE + , confManager.GRAPHER_CONF.LOG_CONF.LOGFILENUM + , confManager.GRAPHER_CONF.LOG_CONF.FLUSHLEVEL); if(result == 1) { exit(EXIT_FAILURE); } - LOG_INFO("Running Chronokeeper Server."); + LOG_INFO("Running ChronoGrapher "); - // Instantiate ChronoKeeper MemoryDataStore + // Instantiate MemoryDataStore // instantiate DataStoreAdminService - uint64_t keeper_group_id = KEEPER_GROUP_ID; /// DataStoreAdminService setup ____________________________________________________________________________________ - std::string datastore_service_ip = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.IP; - int datastore_service_port = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.BASE_PORT; - std::string KEEPER_DATASTORE_SERVICE_NA_STRING = - confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + + std::string datastore_service_ip = confManager.GRAPHER_CONF.DATA_STORE_ADMIN_SERVICE_CONF.IP; + int datastore_service_port = confManager.GRAPHER_CONF.DATA_STORE_ADMIN_SERVICE_CONF.BASE_PORT; + std::string DATASTORE_SERVICE_NA_STRING = + confManager.GRAPHER_CONF.DATA_STORE_ADMIN_SERVICE_CONF.PROTO_CONF + "://" + datastore_service_ip + ":" + std::to_string(datastore_service_port); - uint16_t datastore_service_provider_id = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + uint16_t datastore_service_provider_id = confManager.GRAPHER_CONF.DATA_STORE_ADMIN_SERVICE_CONF.SERVICE_PROVIDER_ID; chronolog::service_endpoint datastore_endpoint; - // validate ip address, instantiate DataAdminService and create ServiceId to be included in KeeperRegistrationMsg + // validate ip address, instantiate DataAdminService and create ServiceId to be included in RegistrationMsg if(-1 == service_endpoint_from_dotted_string(datastore_service_ip, datastore_service_port, datastore_endpoint)) { - LOG_CRITICAL("[ChronoKeeperInstance] Failed to start DataStoreAdminService. Invalid endpoint provided."); + LOG_CRITICAL("[ChronoGrapher] Failed to start DataStoreAdminService. 
Invalid endpoint provided."); return (-1); } - LOG_INFO("[ChronoKeeperInstance] DataStoreAdminService started successfully."); + LOG_INFO("[ChronoGrapher] DataStoreAdminService started successfully."); // Instantiate GrapherRecordingService - std::string KEEPER_RECORDING_SERVICE_PROTOCOL = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF; - std::string KEEPER_RECORDING_SERVICE_IP = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP; - uint16_t KEEPER_RECORDING_SERVICE_PORT = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.BASE_PORT; - uint16_t recording_service_provider_id = confManager.KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + std::string RECORDING_SERVICE_PROTOCOL = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.PROTO_CONF; + std::string RECORDING_SERVICE_IP = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.IP; + uint16_t RECORDING_SERVICE_PORT = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.BASE_PORT; + uint16_t recording_service_provider_id = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.SERVICE_PROVIDER_ID; - std::string KEEPER_RECORDING_SERVICE_NA_STRING = - std::string(KEEPER_RECORDING_SERVICE_PROTOCOL) + "://" + std::string(KEEPER_RECORDING_SERVICE_IP) + ":" + - std::to_string(KEEPER_RECORDING_SERVICE_PORT); + std::string RECORDING_SERVICE_NA_STRING = + std::string(RECORDING_SERVICE_PROTOCOL) + "://" + std::string(RECORDING_SERVICE_IP) + ":" + + std::to_string(RECORDING_SERVICE_PORT); - // validate ip address, instantiate Recording Service and create KeeperIdCard + // validate ip address, instantiate Recording Service and create IdCard chronolog::service_endpoint recording_endpoint; - if(-1 == service_endpoint_from_dotted_string(KEEPER_RECORDING_SERVICE_IP, KEEPER_RECORDING_SERVICE_PORT + if(-1 == service_endpoint_from_dotted_string(RECORDING_SERVICE_IP, RECORDING_SERVICE_PORT , recording_endpoint)) { - LOG_CRITICAL("[ChronoKeeperInstance] Failed to start KeeperRecordingService. Invalid endpoint provided."); + LOG_CRITICAL("[ChronoGrapher] Failed to start RecordingService. 
Invalid endpoint provided."); return (-1); } - LOG_INFO("[ChronoKeeperInstance] KeeperRecordingService started successfully."); + LOG_INFO("[ChronoGrapher] RecordingService started successfully."); - // create KeeperIdCard to identify this Keeper process in ChronoVisor's KeeperRegistry - chronolog::KeeperIdCard keeperIdCard(keeper_group_id, recording_endpoint.first, recording_endpoint.second + // create GrapherIdCard to identify this Grapher process in ChronoVisor's Registry + chronolog::GrapherIdCard processIdCard(recording_endpoint.first, recording_endpoint.second , recording_service_provider_id); - std::stringstream ss; - ss << keeperIdCard; - LOG_INFO("[ChronoKeeperInstance] KeeperIdCard: {}", ss.str()); + std::stringstream process_id_string; + process_id_string << processIdCard; + LOG_INFO("[ChronoGrapher] GrapherIdCard: {}", process_id_string.str()); - // Instantiate ChronoKeeper MemoryDataStore & ExtractorModule + // Instantiate MemoryDataStore & ExtractorModule chronolog::ChunkIngestionQueue ingestionQueue; - std::string keeper_csv_files_directory = confManager.KEEPER_CONF.STORY_FILES_DIR; - chronolog::CSVFileStoryChunkExtractor storyExtractor(keeperIdCard, keeper_csv_files_directory); + std::string csv_files_directory = confManager.GRAPHER_CONF.EXTRACTOR_CONF.story_files_dir; + + chronolog::CSVFileStoryChunkExtractor storyExtractor(process_id_string.str(), csv_files_directory); chronolog::KeeperDataStore theDataStore(ingestionQueue, storyExtractor.getExtractionQueue()); chronolog::ServiceId collectionServiceId(datastore_endpoint.first, datastore_endpoint.second @@ -148,77 +147,77 @@ int main(int argc, char**argv) try { - margo_instance_id collection_margo_id = margo_init(KEEPER_DATASTORE_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE + margo_instance_id collection_margo_id = margo_init(DATASTORE_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE , 1, 1); dataAdminEngine = new tl::engine(collection_margo_id); std::stringstream s3; s3 << dataAdminEngine->self(); - LOG_DEBUG("[ChronoKeeperInstance] GroupID={} starting DataStoreAdminService at address {} with ProviderID={}" - , keeper_group_id, s3.str(), datastore_service_provider_id); + LOG_DEBUG("[ChronoGrapher] starting DataStoreAdminService at address {} with ProviderID={}" + , s3.str(), datastore_service_provider_id); keeperDataAdminService = chronolog::DataStoreAdminService::CreateDataStoreAdminService(*dataAdminEngine , datastore_service_provider_id , theDataStore); } catch(tl::exception const &) { - LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService"); + LOG_ERROR("[ChronoGrapher] failed to create DataStoreAdminService"); } if(nullptr == keeperDataAdminService) { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create DataStoreAdminService exiting"); + LOG_CRITICAL("[ChronoGrapher] failed to create DataStoreAdminService exiting"); if(dataAdminEngine) { delete dataAdminEngine; } return (-1); } - // Instantiate KeeperRecordingService + // Instantiate RecordingService tl::engine*recordingEngine = nullptr; - chronolog::GrapherRecordingService* keeperRecordingService = nullptr; + chronolog::GrapherRecordingService* grapherRecordingService = nullptr; try { - margo_instance_id margo_id = margo_init(KEEPER_RECORDING_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE, 1, 1); + margo_instance_id margo_id = margo_init(RECORDING_SERVICE_NA_STRING.c_str(), MARGO_SERVER_MODE, 1, 1); recordingEngine = new tl::engine(margo_id); std::stringstream s1; s1 << recordingEngine->self(); - LOG_INFO("[ChronoKeeperInstance] 
GroupID={} starting KeeperRecordingService at {} with provider_id {}" - , keeper_group_id, s1.str(), datastore_service_provider_id); - keeperRecordingService = chronolog::GrapherRecordingService::CreateRecordingService(*recordingEngine + LOG_INFO("[ChronoGrapher] starting RecordingService at {} with provider_id {}" + , s1.str(), datastore_service_provider_id); + grapherRecordingService = chronolog::GrapherRecordingService::CreateRecordingService(*recordingEngine , recording_service_provider_id , ingestionQueue); } catch(tl::exception const &) { - LOG_ERROR("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService"); + LOG_ERROR("[ChronoGrapher] failed to create RecordingService"); } - if(nullptr == keeperRecordingService) + if(nullptr == grapherRecordingService) { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRecordingService exiting"); + LOG_CRITICAL("[ChronoGrapher] failed to create RecordingService exiting"); delete keeperDataAdminService; return (-1); } - /// KeeperRegistryClient SetUp _____________________________________________________________________________________ - // create KeeperRegistryClient and register the new KeeperRecording service with the KeeperRegistry - std::string KEEPER_REGISTRY_SERVICE_NA_STRING = - confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.PROTO_CONF + "://" + - confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.IP + ":" + - std::to_string(confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.BASE_PORT); + /// RegistryClient SetUp _____________________________________________________________________________________ + // create RegistryClient and register the new Recording service with the Registry + std::string REGISTRY_SERVICE_NA_STRING = + confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.PROTO_CONF + "://" + + confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.IP + ":" + + std::to_string(confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.BASE_PORT); - uint16_t KEEPER_REGISTRY_SERVICE_PROVIDER_ID = confManager.KEEPER_CONF.VISOR_KEEPER_REGISTRY_SERVICE_CONF.RPC_CONF.SERVICE_PROVIDER_ID; + uint16_t REGISTRY_SERVICE_PROVIDER_ID = confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.SERVICE_PROVIDER_ID; - chronolog::KeeperRegistryClient*keeperRegistryClient = chronolog::KeeperRegistryClient::CreateKeeperRegistryClient( - *dataAdminEngine, KEEPER_REGISTRY_SERVICE_NA_STRING, KEEPER_REGISTRY_SERVICE_PROVIDER_ID); + chronolog::GrapherRegistryClient* grapherRegistryClient = chronolog::GrapherRegistryClient::CreateRegistryClient( + *dataAdminEngine, REGISTRY_SERVICE_NA_STRING, REGISTRY_SERVICE_PROVIDER_ID); - if(nullptr == keeperRegistryClient) + if(nullptr == grapherRegistryClient) { - LOG_CRITICAL("[ChronoKeeperInstance] Keeper failed to create KeeperRegistryClient; exiting"); - delete keeperRecordingService; + LOG_CRITICAL("[ChronoGrapher] failed to create RegistryClient; exiting"); + delete grapherRecordingService; delete keeperDataAdminService; return (-1); } @@ -229,20 +228,20 @@ int main(int argc, char**argv) int retries = 5; while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) { - registration_status = keeperRegistryClient->send_register_msg( - chronolog::KeeperRegistrationMsg(keeperIdCard, collectionServiceId)); + registration_status = grapherRegistryClient->send_register_msg( + chronolog::GrapherRegistrationMsg(processIdCard, collectionServiceId)); retries--; } if(chronolog::CL_SUCCESS != registration_status) { - 
LOG_CRITICAL("[ChronoKeeperInstance] Failed to register with ChronoVisor after multiple attempts. Exiting."); - delete keeperRegistryClient; - delete keeperRecordingService; + LOG_CRITICAL("[ChronoGrapher] Failed to register with ChronoVisor after multiple attempts. Exiting."); + delete grapherRegistryClient; + delete grapherRecordingService; delete keeperDataAdminService; return (-1); } - LOG_INFO("[ChronoKeeperInstance] Successfully registered with ChronoVisor."); + LOG_INFO("[ChronoGrapher] Successfully registered with ChronoVisor."); /// Start data collection and extraction threads ___________________________________________________________________ // services are successfulley created and keeper process had registered with ChronoVisor @@ -257,22 +256,22 @@ int main(int argc, char**argv) // now we are ready to ingest records coming from the storyteller clients .... // main thread would be sending stats message until keeper process receives // sigterm signal - chronolog::KeeperStatsMsg keeperStatsMsg(keeperIdCard); + //chronolog::StatsMsg keeperStatsMsg(grapherIdCard); while(keep_running) { - keeperRegistryClient->send_stats_msg(keeperStatsMsg); + // grapherRegistryClient->send_stats_msg(keeperStatsMsg); sleep(30); } /// Unregister from ChronoVisor ____________________________________________________________________________________ // Unregister from the chronoVisor so that no new story requests would be coming - keeperRegistryClient->send_unregister_msg(keeperIdCard); - delete keeperRegistryClient; + grapherRegistryClient->send_unregister_msg(processIdCard); + delete grapherRegistryClient; /// Stop services and shut down ____________________________________________________________________________________ - LOG_INFO("[ChronoKeeperInstance] Initiating shutdown procedures."); + LOG_INFO("[ChronoGrapher] Initiating shutdown procedures."); // Stop recording events - delete keeperRecordingService; + delete grapherRecordingService; delete keeperDataAdminService; // Shutdown the Data Collection theDataStore.shutdownDataCollection(); @@ -284,6 +283,6 @@ int main(int argc, char**argv) // collectionEngine.finalize(); delete recordingEngine; delete dataAdminEngine; - LOG_INFO("[ChronoKeeperInstance] Shutdown completed. Exiting."); + LOG_INFO("[ChronoGrapher] Shutdown completed. 
Exiting."); return exit_code; } diff --git a/ChronoGrapher/KeeperRegClient.h b/ChronoGrapher/GrapherRegClient.h similarity index 50% rename from ChronoGrapher/KeeperRegClient.h rename to ChronoGrapher/GrapherRegClient.h index bf3b04ce..fb0b5fc4 100644 --- a/ChronoGrapher/KeeperRegClient.h +++ b/ChronoGrapher/GrapherRegClient.h @@ -1,13 +1,13 @@ -#ifndef KEEPER_REG_CLIENT_H -#define KEEPER_REG_CLIENT_H +#ifndef GRAPHER_REG_CLIENT_H +#define GRAPHER_REG_CLIENT_H #include #include #include -#include "KeeperIdCard.h" -#include "KeeperRegistrationMsg.h" -#include "KeeperStatsMsg.h" +#include "GrapherIdCard.h" +#include "GrapherRegistrationMsg.h" +//#include "GrapherStatsMsg.h" #include "chronolog_errcode.h" namespace tl = thallium; @@ -16,49 +16,49 @@ namespace tl = thallium; namespace chronolog { -class KeeperRegistryClient +class GrapherRegistryClient { public: - static KeeperRegistryClient* - CreateKeeperRegistryClient(tl::engine &tl_engine, std::string const ®istry_service_addr + static GrapherRegistryClient* + CreateRegistryClient(tl::engine &tl_engine, std::string const ®istry_service_addr , uint16_t registry_provider_id) { try { - return new KeeperRegistryClient(tl_engine, registry_service_addr, registry_provider_id); + return new GrapherRegistryClient(tl_engine, registry_service_addr, registry_provider_id); } catch(tl::exception const &) { - LOG_ERROR("[KeeperRegistryClient] Failed to create KeeperRegistryClient"); + LOG_ERROR("[GrapherRegistryClient] Failed to create GrapherRegistryClient"); return nullptr; } } - int send_register_msg(KeeperRegistrationMsg const &keeperMsg) + int send_register_msg(GrapherRegistrationMsg const & grapherMsg) { try { std::stringstream ss; - ss << keeperMsg; - LOG_DEBUG("[KeeperRegisterClient] Sending Register Message: {}", ss.str()); - return register_keeper.on(reg_service_ph)(keeperMsg); + ss << grapherMsg; + LOG_DEBUG("[GrapherRegistryClient] Sending Registration Message: {}", ss.str()); + return register_grapher.on(reg_service_ph)(grapherMsg); } catch(tl::exception const &) { - LOG_ERROR("[KeeperRegisterClient] Failed Sending Register Message."); + LOG_ERROR("[GrapherRegistryClient] Failed Sending Registration Message."); return CL_ERR_UNKNOWN; } } - int send_unregister_msg(KeeperIdCard const &keeperIdCard) + int send_unregister_msg(GrapherIdCard const &grapherIdCard) { try { std::stringstream ss; - ss << keeperIdCard; - LOG_DEBUG("[KeeperRegisterClient] Sending Unregister Message: {}", ss.str()); - return unregister_keeper.on(reg_service_ph)(keeperIdCard); + ss << grapherIdCard; + LOG_DEBUG("[GrapherRegistryClient] Sending Unregister Message: {}", ss.str()); + return unregister_grapher.on(reg_service_ph)(grapherIdCard); } catch(tl::exception const &) { @@ -67,7 +67,7 @@ class KeeperRegistryClient } } - void send_stats_msg(KeeperStatsMsg const &keeperStatsMsg) +/* void send_stats_msg(KeeperStatsMsg const &keeperStatsMsg) { try { @@ -81,33 +81,33 @@ class KeeperRegistryClient LOG_ERROR("[KeeperRegisterClient] Failed Sending Stats Message."); } } - - ~KeeperRegistryClient() +*/ + ~GrapherRegistryClient() { LOG_DEBUG("[KeeperRegistryClient] Destructor called. 
Cleaning up resources..."); - register_keeper.deregister(); - unregister_keeper.deregister(); - handle_stats_msg.deregister(); + register_grapher.deregister(); + unregister_grapher.deregister(); + //handle_stats_msg.deregister(); } private: std::string reg_service_addr; // na address of Keeper Registry Service uint16_t reg_service_provider_id; // KeeperRegistryService provider id tl::provider_handle reg_service_ph; //provider_handle for remote registry service - tl::remote_procedure register_keeper; - tl::remote_procedure unregister_keeper; - tl::remote_procedure handle_stats_msg; + tl::remote_procedure register_grapher; + tl::remote_procedure unregister_grapher; + //tl::remote_procedure handle_stats_msg; // constructor is private to make sure thalium rpc objects are created on the heap, not stack - KeeperRegistryClient(tl::engine &tl_engine, std::string const ®istry_addr, uint16_t registry_provider_id) + GrapherRegistryClient(tl::engine &tl_engine, std::string const ®istry_addr, uint16_t registry_provider_id) : reg_service_addr(registry_addr), reg_service_provider_id(registry_provider_id), reg_service_ph( tl_engine.lookup(registry_addr), registry_provider_id) { - LOG_DEBUG("[KeeperRegistryClient] Initialized for RegistryService at {} with ProviderID={}", registry_addr + LOG_DEBUG("[GrapherRegistryClient] Initialized for RegistryService at {} with ProviderID={}", registry_addr , registry_provider_id); - register_keeper = tl_engine.define("register_keeper"); - unregister_keeper = tl_engine.define("unregister_keeper"); - handle_stats_msg = tl_engine.define("handle_stats_msg").disable_response(); + register_grapher = tl_engine.define("register_grapher"); + unregister_grapher = tl_engine.define("unregister_grapher"); + // handle_stats_msg = tl_engine.define("handle_stats_msg").disable_response(); } }; } diff --git a/chrono_common/ConfigurationManager.cpp b/chrono_common/ConfigurationManager.cpp new file mode 100644 index 00000000..02696455 --- /dev/null +++ b/chrono_common/ConfigurationManager.cpp @@ -0,0 +1,117 @@ +#include "ConfigurationManager.h" + + +void ChronoLog::ConfigurationManager::parseGrapherConf(json_object*json_conf) +{ + json_object_object_foreach(json_conf, key, val) + { + if(strcmp(key, "RecordingService") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object* recording_service_conf = json_object_object_get(json_conf, "RecordingService"); + json_object_object_foreach(recording_service_conf, key, val) + { + if(strcmp(key, "rpc") == 0) + { + parseRPCProviderConf(val, GRAPHER_CONF.RECORDING_SERVICE_CONF); + } + else + { + std::cerr << "[ConfigurationManager] [chrono_grapher] Unknown RecordingService configuration: " << key + << std::endl; + } + } + } + else if(strcmp(key, "DataStoreAdminService") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object*keeper_data_store_admin_service_conf = json_object_object_get(json_conf + , "DataStoreAdminService"); + json_object_object_foreach(keeper_data_store_admin_service_conf, key, val) + { + if(strcmp(key, "rpc") == 0) + { + parseRPCProviderConf(val, GRAPHER_CONF.DATA_STORE_ADMIN_SERVICE_CONF); + } + else + { + std::cerr << "[ConfigurationManager] [chrono_grapher] Unknown DataStoreAdminService configuration: " << key + << std::endl; + } + } + } + else if(strcmp(key, "VisorRegistryService") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object*visor_keeper_registry_service_conf = json_object_object_get(json_conf + , "VisorRegistryService"); + 
json_object_object_foreach(visor_keeper_registry_service_conf, key, val) + { + if(strcmp(key, "rpc") == 0) + { + parseRPCProviderConf(val, GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF); + } + else + { + std::cerr << "[ConfigurationManager] [chrono_grapher] Unknown VisorRegistryService configuration: " << key + << std::endl; + } + } + } + else if(strcmp(key, "Logging") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object*chrono_logging = json_object_object_get(json_conf, "Logging"); + json_object_object_foreach(chrono_logging, key, val) + { + if(strcmp(key, "log") == 0) + { + parseLogConf(val, GRAPHER_CONF.LOG_CONF); + } + else + { + std::cerr << "[ConfigurationManager] [chrono_grapher] Unknown Logging configuration: " << key << std::endl; + } + } + } + else if(strcmp(key, "DataStoreInternals") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object* data_store_conf = json_object_object_get(json_conf, "DataStoreInternals"); + json_object_object_foreach(data_store_conf, key, val) + { + if(strcmp(key, "max_story_chunk_size") == 0) + { + assert(json_object_is_type(val, json_type_int)); + GRAPHER_CONF.DATA_STORE_CONF.max_story_chunk_size = json_object_get_int(val); + } + else + { + std::cerr << "[ConfigurationManager] [chrono_grapher] Unknown configuration: " << key << std::endl; + } + } + } + else if(strcmp(key, "Extractors") == 0) + { + assert(json_object_is_type(val, json_type_object)); + json_object* extractors = json_object_object_get(json_conf, "Extractors"); + json_object_object_foreach(extractors, key, val) + { + if(strcmp(key, "story_files_dir") == 0) + { + std::cerr << "reading stroy_files_dir"< + +#include + +// this class wrapps ChronoGrapher Process identification +// that will be used by all the ChronoLog Processes +// to both identofy the process and create RPC client channels +// to send the data to the RecordingService it contains + +namespace chronolog +{ + +class GrapherIdCard +{ + + uint32_t ip_addr; //IP address as uint32_t in host byte order + uint16_t port; //port number as uint16_t in host byte order + uint16_t tl_provider_id; // id of thallium service provider + +public: + + + GrapherIdCard( uint32_t addr = 0, uint16_t a_port=0, uint16_t provider_id=0) + : ip_addr(addr), port(a_port),tl_provider_id(provider_id) + {} + + GrapherIdCard( GrapherIdCard const& other) + : ip_addr(other.getIPaddr()), port(other.getPort()),tl_provider_id(other.getProviderId()) + {} + + ~GrapherIdCard()=default; + + uint32_t getIPaddr() const {return ip_addr; } + uint16_t getPort() const { return port;} + uint16_t getProviderId () const { return tl_provider_id; } + + + // serialization function used by thallium RPC providers + // to serialize/deserialize + template + void serialize( SerArchiveT & serT) + { + serT & ip_addr; + serT & port; + serT & tl_provider_id; + } + + std::string & getIPasDottedString ( std::string & a_string ) const + { + + char buffer[INET_ADDRSTRLEN]; + // convert ip from host to network byte order uint32_t + uint32_t ip_net_order = htonl(ip_addr); + // convert network byte order uint32_t to a dotted string + if (NULL != inet_ntop(AF_INET, &ip_net_order, buffer, INET_ADDRSTRLEN)) + { a_string += std::string(buffer); } + return a_string; + } + +}; + +} //namespace chronolog + +inline bool operator==(chronolog::GrapherIdCard const& card1, chronolog::GrapherIdCard const& card2) +{ + return ( (card1.getIPaddr()==card2.getIPaddr() && card1.getPort() == card2.getPort() + && card1.getProviderId() == card2.getProviderId()) ? 
true : false ); + +} +inline std::ostream & operator<< (std::ostream & out , chronolog::GrapherIdCard const & id_card) +{ + std::string a_string; + out << "GrapherIdCard{" + <<":"< +#include +#include "KeeperIdCard.h" +#include "GrapherIdCard.h" +#include "KeeperRegistrationMsg.h" + +namespace chronolog +{ + +class GrapherRegistrationMsg +{ + + GrapherIdCard grapherIdCard; + ServiceId adminServiceId; + +public: + + + GrapherRegistrationMsg(GrapherIdCard const &id_card = GrapherIdCard{0, 0, 0} + , ServiceId const &admin_service_id = ServiceId{0, 0, 0}) + : grapherIdCard(id_card) + , adminServiceId(admin_service_id) + {} + + ~GrapherRegistrationMsg() = default; + + GrapherIdCard const &getGrapherIdCard() const + { return grapherIdCard; } + + ServiceId const &getAdminServiceId() const + { return adminServiceId; } + + template + void serialize(SerArchiveT &serT) + { + serT&grapherIdCard; + serT&adminServiceId; + } + +}; + +}//namespace + +inline std::ostream &operator<<(std::ostream &out, chronolog::GrapherRegistrationMsg const &msg) +{ + out << "GrapherRegistrationMsg{" << msg.getGrapherIdCard() << "}{admin:" << msg.getAdminServiceId() << "}"; + return out; +} + +#endif diff --git a/default_conf.json.in b/default_conf.json.in index 244abd12..10cb4bd1 100644 --- a/default_conf.json.in +++ b/default_conf.json.in @@ -30,7 +30,7 @@ "Logging": { "log": { "type": "file", - "file": "chronovisor_logfile.txt", + "file": "chrono_visor.log", "level": "debug", "name": "ChronoVisor", "filesize": 102400, @@ -71,7 +71,7 @@ "Logging": { "log": { "type": "file", - "file": "chronokeeper_logfile.txt", + "file": "chrono_keeper.log", "level": "debug", "name": "ChronoKeeper", "filesize": 102400, @@ -81,6 +81,52 @@ }, "story_files_dir": "/tmp/" }, + "chrono_grapher": { + "RecordingService": { + "rpc": { + "rpc_implementation": "Thallium_sockets", + "protocol_conf": "ofi+sockets", + "service_ip": "127.0.0.1", + "service_base_port": 3333, + "service_provider_id": 33 + } + }, + "DataStoreAdminService": { + "rpc": { + "rpc_implementation": "Thallium_sockets", + "protocol_conf": "ofi+sockets", + "service_ip": "127.0.0.1", + "service_base_port": 4444, + "service_provider_id": 44 + } + }, + "VisorRegistryService": { + "rpc": { + "rpc_implementation": "Thallium_sockets", + "protocol_conf": "ofi+sockets", + "service_ip": "127.0.0.1", + "service_base_port": 8888, + "service_provider_id": 88 + } + }, + "Logging": { + "log": { + "type": "file", + "file": "chrono_grapher.log", + "level": "debug", + "name": "ChronoGrapher", + "filesize": 102400, + "filenum": 3, + "flushlevel": "debug" + } + }, + "DataStoreInternals": { + "max_story_chunk_size": 4096 + }, + "Extractors": { + "story_files_dir" : "/tmp" + } + }, "chrono_client": { "VisorClientPortalService": { "rpc": { From d9baa37dca0a7beea1912ea9e9fd90472c0b990a Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 3 Apr 2024 14:14:02 -0500 Subject: [PATCH 08/40] grapher registration checkpoint --- ChronoVisor/CMakeLists.txt | 1 + ChronoVisor/include/KeeperRegistry.h | 45 +++++++++++++++++++++ ChronoVisor/include/KeeperRegistryService.h | 22 +++++++++- ChronoVisor/src/KeeperRegistry.cpp | 12 ++++++ chrono_common/KeeperIdCard.h | 6 +-- 5 files changed, 82 insertions(+), 4 deletions(-) diff --git a/ChronoVisor/CMakeLists.txt b/ChronoVisor/CMakeLists.txt index 922c1d0a..fadae6c2 100644 --- a/ChronoVisor/CMakeLists.txt +++ b/ChronoVisor/CMakeLists.txt @@ -15,6 +15,7 @@ target_sources(chronovisor_server PRIVATE ./src/ClientRegistryRecord.cpp ./src/ChronicleMetaDirectory.cpp 
./src/KeeperRegistry.cpp + ../chrono_common/ConfigurationManager.cpp ../ChronoAPI/ChronoLog/src/city.cpp ../ChronoAPI/ChronoLog/src/log.cpp) diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index cc199cc5..185546ec 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -11,6 +11,8 @@ #include "KeeperIdCard.h" #include "KeeperStatsMsg.h" #include "KeeperRegistrationMsg.h" +#include "GrapherIdCard.h" +#include "GrapherRegistrationMsg.h" #include "ConfigurationManager.h" namespace chronolog @@ -64,6 +66,46 @@ class KeeperRegistry std::list> delayedExitClients; }; + struct GrapherProcessEntry + { + public: + GrapherProcessEntry(GrapherIdCard const& id_card, ServiceId const& admin_service_id) + : idCard(id_card) + , adminServiceId(admin_service_id) + , adminClient(nullptr) + , active(false) + , lastStatsTime(0) + , activeStoryCount(0) + {} + + GrapherProcessEntry(GrapherProcessEntry const &other) = default; + ~GrapherProcessEntry() = default; // Registry is reponsible for creating & deleting keeperAdminClient + + GrapherIdCard idCard; + ServiceId adminServiceId; + DataStoreAdminClient* adminClient; + bool active; + uint64_t lastStatsTime; + uint32_t activeStoryCount; + std::list> delayedExitClients; + }; + + + struct KeeperGroupEntry + { + KeeperGroupEntry( KeeperGroupId group_id, GrapherProcessEntry * grapher_ptr=nullptr) + : groupId(group_id) + , grapher_process(grapher_ptr) + { } + + KeeperGroupEntry(KeeperGroupEntry const &other) = default; + ~KeeperGroupEntry() = default; + + KeeperGroupId groupId; + GrapherProcessEntry* grapher_process; + std::map, KeeperProcessEntry*> keepers; + }; + enum RegistryState { UNKNOWN = 0, INITIALIZED = 1, // RegistryService is initialized, no active keepers @@ -103,6 +145,8 @@ class KeeperRegistry int notifyKeepersOfStoryRecordingStop(std::vector const &, StoryId const &); + int registerGrapherProcess(GrapherRegistrationMsg const & reg_msg); + int unregisterGrapherProcess(GrapherIdCard const & id_card); private: @@ -112,6 +156,7 @@ class KeeperRegistry RegistryState registryState; std::mutex registryLock; std::map , KeeperProcessEntry> keeperProcessRegistry; + std::map keeperGroups; thallium::engine*registryEngine; KeeperRegistryService*keeperRegistryService; size_t delayedDataAdminExitSeconds; diff --git a/ChronoVisor/include/KeeperRegistryService.h b/ChronoVisor/include/KeeperRegistryService.h index 32a054c2..12bc59a8 100644 --- a/ChronoVisor/include/KeeperRegistryService.h +++ b/ChronoVisor/include/KeeperRegistryService.h @@ -2,13 +2,14 @@ #define KEEPER_REGISTRY_SERVICE_H #include -//#include #include #include #include "KeeperIdCard.h" #include "KeeperRegistrationMsg.h" #include "KeeperStatsMsg.h" +#include "GrapherIdCard.h" +#include "GrapherRegistrationMsg.h" #include "log.h" #include "KeeperRegistry.h" @@ -53,6 +54,23 @@ class KeeperRegistryService: public tl::provider theKeeperProcessRegistry.updateKeeperProcessStats(keeper_stats_msg); } + void register_grapher(tl::request const &request, chronolog::GrapherRegistrationMsg const & registrationMsg) + { + int return_code = 0; + std::stringstream ss; + ss << registrationMsg; + LOG_INFO("[KeeperRegistryService] register_grapher: {}", ss.str()); + return_code = theKeeperProcessRegistry.registerGrapherProcess(registrationMsg); + request.respond(return_code); + } + + void unregister_grapher(tl::request const &request, chronolog::GrapherIdCard const &id_card) + { + int return_code = 0; + return_code = 
theKeeperProcessRegistry.unregisterGrapherProcess(id_card); + request.respond(return_code); + } + KeeperRegistryService(tl::engine &tl_engine, uint16_t service_provider_id, KeeperRegistry &keeperRegistry) : tl::provider (tl_engine, service_provider_id), theKeeperProcessRegistry( keeperRegistry) @@ -60,6 +78,8 @@ class KeeperRegistryService: public tl::provider define("register_keeper", &KeeperRegistryService::register_keeper); define("unregister_keeper", &KeeperRegistryService::unregister_keeper); define("handle_stats_msg", &KeeperRegistryService::handle_stats_msg, tl::ignore_return_value()); + define("register_grapher", &KeeperRegistryService::register_grapher); + define("unregister_grapher", &KeeperRegistryService::unregister_grapher); //setup finalization callback in case this ser vice provider is still alive when the engine is finalized get_engine().push_finalize_callback(this, [p = this]() { delete p; }); diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index 502fd7c6..ec69bf32 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -504,6 +504,18 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector return chronolog::CL_SUCCESS; } +int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_msg) +{ + + return chronolog::CL_SUCCESS; +} + +int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const & id_card) +{ + + + return chronolog::CL_SUCCESS; +} }//namespace chronolog diff --git a/chrono_common/KeeperIdCard.h b/chrono_common/KeeperIdCard.h index c8d01fc3..5daa1574 100644 --- a/chrono_common/KeeperIdCard.h +++ b/chrono_common/KeeperIdCard.h @@ -21,7 +21,7 @@ typedef uint16_t in_port_t; typedef std::pair service_endpoint; // KeeperGroup is the logical grouping of KeeperProcesses -typedef uint64_t KeeperGroupId; +typedef uint32_t KeeperGroupId; class KeeperIdCard @@ -35,7 +35,7 @@ class KeeperIdCard public: - KeeperIdCard( uint64_t group_id = 0, uint32_t addr = 0, uint16_t a_port=0, uint16_t provider_id=0) + KeeperIdCard( uint32_t group_id = 0, uint32_t addr = 0, uint16_t a_port=0, uint16_t provider_id=0) : keeper_group_id(group_id), ip_addr(addr), port(a_port),tl_provider_id(provider_id) {} @@ -45,7 +45,7 @@ class KeeperIdCard ~KeeperIdCard()=default; - uint64_t getGroupId() const { return keeper_group_id; } + uint32_t getGroupId() const { return keeper_group_id; } uint32_t getIPaddr() const {return ip_addr; } uint16_t getPort() const { return port;} uint16_t getProviderId () const { return tl_provider_id; } From 1405bb4aa4aa02fade07da9d93c0ea1faf2f59cb Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Fri, 5 Apr 2024 13:41:28 -0500 Subject: [PATCH 09/40] Grapher registration checkpoint --- ChronoVisor/src/KeeperRegistry.cpp | 25 ++++++++++++++++++++----- chrono_common/KeeperIdCard.h | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index ec69bf32..4803ef9c 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -157,6 +157,23 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re KeeperIdCard keeper_id_card = keeper_reg_msg.getKeeperIdCard(); ServiceId admin_service_id = keeper_reg_msg.getAdminServiceId(); + + //find the group that keepr belongs to in the registry + auto keeper_group_iter = keeperGroups.find(keeper_id_card.getGroupId()); + if(keeper_group_iter == keeperGroups.end()) + { + auto insert_return = 
keeperGroups.insert(std::pair( + keeper_id_card.getGroupId(), KeeperGroupEntry(keeper_id_card.getGroupId()))); + if(false == insert_return.second) + { + LOG_ERROR("[KeeperRegistry] registration failed for KeeperGroup {}", keeper_id_card.getGroupId()); + return chronolog::CL_ERR_UNKNOWN; + } + else { keeper_group_iter = insert_return.first; } + } + + KeeperGroupEntry* keeper_group = &((*keeper_group_iter).second); + // unlikely but possible that the Registry still retains the record of the previous re-incarnation of hte Keeper process // running on the same host... check for this case and clean up the leftover record... auto keeper_process_iter = keeperProcessRegistry.find( @@ -259,6 +276,9 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } + auto keeper_group_iter = keeperGroups.find(keeper_id_card.getGroupId()); + if(keeper_group_iter == keeperGroups.end()) { return chronolog::CL_SUCCESS; } + auto keeper_process_iter = keeperProcessRegistry.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); if(keeper_process_iter != keeperProcessRegistry.end()) @@ -284,11 +304,6 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) delayedExitTime, (*keeper_process_iter).second.keeperAdminClient)); (*keeper_process_iter).second.keeperAdminClient = nullptr; } - - /*if( (*keeper_process_iter).second.keeperAdminClient != nullptr) - { delete (*keeper_process_iter).second.keeperAdminClient; } - keeperProcessRegistry.erase(keeper_process_iter); - */ } // now that we are still holding registryLock // update registryState if needed diff --git a/chrono_common/KeeperIdCard.h b/chrono_common/KeeperIdCard.h index 5daa1574..98f72c0e 100644 --- a/chrono_common/KeeperIdCard.h +++ b/chrono_common/KeeperIdCard.h @@ -27,7 +27,7 @@ typedef uint32_t KeeperGroupId; class KeeperIdCard { - uint64_t keeper_group_id; + KeeperGroupId keeper_group_id; uint32_t ip_addr; //IP address as uint32_t in host byte order uint16_t port; //port number as uint16_t in host byte order uint16_t tl_provider_id; // id of thallium service provider From 4edff43b7b0227570579a77919aec1fb7997dcf3 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Tue, 12 Mar 2024 16:21:45 -0500 Subject: [PATCH 10/40] Use getent to get Visor IP Always use high-speed network IP (172.25.x.x for Ares) --- deploy/single_user_deploy.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index d5508e12..28f2b766 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -17,6 +17,7 @@ CLIENT_ARGS="--config ${CONF_FILE}" VISOR_HOSTS="${CONF_DIR}/hosts_visor" KEEPER_HOSTS="${CONF_DIR}/hosts_keeper" CLIENT_HOSTS="${CONF_DIR}/hosts_client" +HOSTNAME_HS_NET_POSTFIX="-40g" JOB_ID="" install=false deploy=false @@ -133,7 +134,18 @@ copy_shared_libs() { update_visor_ip() { visor_host=$(cat ${VISOR_HOSTS}) - visor_ip=$(dig +short ${visor_host}) + if [[ ${visor_host} == *${HOSTNAME_HS_NET_POSTFIX} ]] + then + visor_ip=$(getent hosts ${visor_host} | awk '{print $1}') + else + visor_ip=$(getent hosts ${visor_host}${HOSTNAME_HS_NET_POSTFIX} | awk '{print $1}') + fi + if [[ -z "${visor_ip}" ]] + then + echo "Cannot get ChronoVisor IP, exiting ..." + exit 1 + fi + echo "Replacing ChronoVisor IP with ${visor_ip} ..." 
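    # Illustrative note (hypothetical host and IP shown): on Ares the "-40g" hostnames
    # resolve to the high-speed 172.25.x.x network, so the lookup above behaves roughly like
    #   getent hosts ares-comp-05-40g   ->   "172.25.1.5   ares-comp-05-40g"
    # and `awk '{print $1}'` keeps only the address that the jq edits below write into
    # the service_ip fields.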
jq ".chrono_visor.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_client.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_visor.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} From 2c5af08b37859aab2ee88d8b294834d834dcffa8 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Tue, 12 Mar 2024 17:11:45 -0500 Subject: [PATCH 11/40] Make sure there is only one Visor Move getting IP from hostname to a separate function --- deploy/single_user_deploy.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 28f2b766..291d59ae 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -132,14 +132,21 @@ copy_shared_libs() { done } -update_visor_ip() { - visor_host=$(cat ${VISOR_HOSTS}) - if [[ ${visor_host} == *${HOSTNAME_HS_NET_POSTFIX} ]] +get_host_ip() { + local hostname=$1 + local host_ip="" + if [[ ${hostname} == *${HOSTNAME_HS_NET_POSTFIX} ]] then - visor_ip=$(getent hosts ${visor_host} | awk '{print $1}') + host_ip=$(getent hosts ${hostname} | awk '{print $1}' | head -1) else - visor_ip=$(getent hosts ${visor_host}${HOSTNAME_HS_NET_POSTFIX} | awk '{print $1}') + host_ip=$(getent hosts ${hostname}${HOSTNAME_HS_NET_POSTFIX} | awk '{print $1}' | head -1) fi + echo ${host_ip} +} + +update_visor_ip() { + visor_host=$(head -1 ${VISOR_HOSTS}) + visor_ip=$(get_host_ip ${visor_host}) if [[ -z "${visor_ip}" ]] then echo "Cannot get ChronoVisor IP, exiting ..." From aae479d72a1588266709f12ef76f231b1939f13d Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Wed, 13 Mar 2024 15:06:40 -0500 Subject: [PATCH 12/40] Keepers need to have service_ip for KeeperRecordingService and KeeperDataStoreAdminService set to their own IP --- deploy/single_user_deploy.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 291d59ae..0cb9a123 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -157,8 +157,17 @@ update_visor_ip() { jq ".chrono_client.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_visor.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_keeper.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} - jq ".chrono_keeper.KeeperRecordingService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} - jq ".chrono_keeper.KeeperDataStoreAdminService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} +} + +generate_conf_for_each_keeper() { + for keeper_host in $(cat ${KEEPER_HOSTS} | awk '{print $1}') + do + keeper_ip=$(get_host_ip ${keeper_host}) + echo "Generating conf file for Keeper ${keeper_host} ..." 
+ jq ".chrono_keeper.KeeperDataStoreAdminService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} + jq ".chrono_keeper.KeeperRecordingService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} + jq ".chrono_keeper.Logging.log.file = \"chronokeeper_logfile.txt.${keeper_host}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE}.${keeper_host} + done } install() { @@ -173,6 +182,8 @@ install() { check_conf_files update_visor_ip + + generate_conf_for_each_keeper } deploy() { @@ -184,10 +195,10 @@ deploy() { mpssh -f ${VISOR_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # launch Keeper - mpssh -f ${KEEPER_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS} > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" + mpssh -f ${KEEPER_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS}.\$(hostname) > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # launch Client - mpssh -f ${CLIENT_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS} > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" + mpssh -f ${CLIENT_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS}.\$(hostname) > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # check Visor mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" From 6cece7afc7af4f00064081c98bcb02f13e066254 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Thu, 21 Mar 2024 17:36:39 -0500 Subject: [PATCH 13/40] Add conf_file command line argument --- deploy/single_user_deploy.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 0cb9a123..02a7342a 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -69,7 +69,7 @@ check_conf_files() { echo "Checking configuration files ..." if [[ ! -f ${CONF_FILE} ]] then - echo "${CONF_FILE} configuration file does not exist, exiting ..." + echo "configuration file ${CONF_FILE} does not exist, exiting ..." exit 1 fi @@ -192,12 +192,15 @@ deploy() { echo "Deploying ..." # launch Visor + VISOR_ARGS="--config ${CONF_FILE}" mpssh -f ${VISOR_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # launch Keeper + KEEPER_ARGS="--config ${CONF_FILE}" mpssh -f ${KEEPER_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS}.\$(hostname) > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # launch Client + CLIENT_ARGS="--config ${CONF_FILE}" mpssh -f ${CLIENT_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS}.\$(hostname) > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" # check Visor @@ -236,7 +239,7 @@ reset() { } parse_args() { - TEMP=$(getopt -o v:k:c:s:p:t:j:idr --long visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,job_id:,install,deploy,reset -- "$@") + TEMP=$(getopt -o v:k:c:s:p:t:f:j:idr --long visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,conf_file:,job_id:,install,deploy,reset -- "$@") if [ $? != 0 ] ; then echo "Terminating ..." 
>&2 ; exit 1 ; fi @@ -265,6 +268,9 @@ parse_args() { -t|--client_hosts) CLIENT_HOSTS=$(realpath "$2") shift 2 ;; + -f|--conf_file) + CONF_FILE=$(realpath "$2") + shift 2 ;; -j|--job_id) JOB_ID="$2" shift 2 ;; @@ -339,6 +345,7 @@ usage() { -s|--visor_hosts VISOR_HOSTS -p|--keeper_hosts KEEPER_HOSTS -r|--client_hosts CLIENT_HOSTS + -f|--conf_file CONF_FILE -j|--job_id JOB_ID" exit 1 } From b74d952ef79896e4b84e3c968e37ddc1c92ce702 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Thu, 21 Mar 2024 17:37:56 -0500 Subject: [PATCH 14/40] Change default log file size to 1MB --- default_conf.json.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/default_conf.json.in b/default_conf.json.in index 10cb4bd1..ff905c2f 100644 --- a/default_conf.json.in +++ b/default_conf.json.in @@ -33,7 +33,7 @@ "file": "chrono_visor.log", "level": "debug", "name": "ChronoVisor", - "filesize": 102400, + "filesize": 1048576, "filenum": 3, "flushlevel": "warning" } @@ -74,7 +74,7 @@ "file": "chrono_keeper.log", "level": "debug", "name": "ChronoKeeper", - "filesize": 102400, + "filesize": 1048576, "filenum": 3, "flushlevel": "warning" } @@ -143,7 +143,7 @@ "file": "chronoclient_logfile.txt", "level": "debug", "name": "ChronoClient", - "filesize": 102400, + "filesize": 1048576, "filenum": 3, "flushlevel": "warning" } From bfadff437067dd474a3538ecd9ed6e66e28a45bb Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Tue, 26 Mar 2024 12:30:28 -0500 Subject: [PATCH 15/40] Update variables on non-default paths from command line Add work_dir, conf_file and help command line argument Add color formatting --- deploy/single_user_deploy.sh | 185 ++++++++++++++++++++++++----------- 1 file changed, 127 insertions(+), 58 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 02a7342a..bcd1daa4 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -1,15 +1,23 @@ #!/bin/bash -CHRONOLOG_ROOT_DIR="/home/${USER}/chronolog" -BIN_DIR="${CHRONOLOG_ROOT_DIR}/bin" -LIB_DIR="${CHRONOLOG_ROOT_DIR}/lib" -CONF_DIR="${CHRONOLOG_ROOT_DIR}/conf" +# Define some colors +ERR='\033[1;31m\033[41m' +INFO='\033[1,36m\033[42m' +DEBUG='\033[0;33m' +NC='\033[0m' # No Color + +WORK_DIR="/home/${USER}/chronolog" +LIB_DIR="${WORK_DIR}/lib" +CONF_DIR="${WORK_DIR}/conf" VISOR_BIN_FILE_NAME="chronovisor_server" KEEPER_BIN_FILE_NAME="chrono_keeper" CLIENT_BIN_FILE_NAME="client_lib_multi_storytellers" -VISOR_BIN="${BIN_DIR}/${VISOR_BIN_FILE_NAME}" -KEEPER_BIN="${BIN_DIR}/${KEEPER_BIN_FILE_NAME}" -CLIENT_BIN="${BIN_DIR}/${CLIENT_BIN_FILE_NAME}" +VISOR_BIN="${WORK_DIR}/bin/${VISOR_BIN_FILE_NAME}" +KEEPER_BIN="${WORK_DIR}/bin/${KEEPER_BIN_FILE_NAME}" +CLIENT_BIN="${WORK_DIR}/bin/${CLIENT_BIN_FILE_NAME}" +VISOR_BIN_DIR=$(dirname ${VISOR_BIN}) +KEEPER_BIN_DIR=$(dirname ${KEEPER_BIN}) +CLIENT_BIN_DIR=$(dirname ${CLIENT_BIN}) CONF_FILE="${CONF_DIR}/default_conf.json" VISOR_ARGS="--config ${CONF_FILE}" KEEPER_ARGS="--config ${CONF_FILE}" @@ -24,52 +32,52 @@ deploy=false reset=false check_hosts_files() { - echo "Checking hosts files..." + echo -e "${INFO}Checking hosts files...${NC}" if [[ ! -f ${VISOR_HOSTS} ]] then - echo "${VISOR_HOSTS} host file does not exist, exiting ..." + echo -e "${ERR}${VISOR_HOSTS} host file does not exist, exiting ...${NC}" exit 1 fi if [[ ! -f ${KEEPER_HOSTS} ]] then - echo "${KEEPER_HOSTS} host file does not exist, exiting ..." + echo -e "${ERR}${KEEPER_HOSTS} host file does not exist, exiting ...${NC}" exit 1 fi if [[ ! 
-f ${CLIENT_HOSTS} ]] then - echo "${CLIENT_HOSTS} host file does not exist, exiting ..." + echo -e "${ERR}${CLIENT_HOSTS} host file does not exist, exiting ...${NC}" exit 1 fi } check_bin_files() { - echo "Checking binary files..." + echo -e "${INFO}Checking binary files...${NC}" if [[ ! -f ${VISOR_BIN} ]] then - echo "${VISOR_BIN} executable file does not exist, exiting ..." + echo -e "${ERR}${VISOR_BIN} executable file does not exist, exiting ...${NC}" exit 1 fi if [[ ! -f ${KEEPER_BIN} ]] then - echo "${KEEPER_BIN} executable file does not exist, exiting ..." + echo -e "${ERR}${KEEPER_BIN} executable file does not exist, exiting ...${NC}" exit 1 fi if [[ ! -f ${CLIENT_BIN} ]] then - echo "${CLIENT_BIN} executable file does not exist, exiting ..." + echo -e "${ERR}${CLIENT_BIN} executable file does not exist, exiting ...${NC}" exit 1 fi } check_conf_files() { - echo "Checking configuration files ..." + echo -e "${INFO}Checking configuration files ...${NC}" if [[ ! -f ${CONF_FILE} ]] then - echo "configuration file ${CONF_FILE} does not exist, exiting ..." + echo -e "${ERR}configuration file ${CONF_FILE} does not exist, exiting ...${NC}" exit 1 fi @@ -77,7 +85,7 @@ check_conf_files() { visor_client_portal_rpc_in_client=$(jq '.chrono_client.VisorClientPortalService.rpc' "${CONF_FILE}") if [[ "${visor_client_portal_rpc_in_visor}" != "${visor_client_portal_rpc_in_client}" ]] then - echo "mismatched VisorClientPortalService conf in ${CONF_FILE}, exiting ..." + echo -e "${ERR}mismatched VisorClientPortalService conf in ${CONF_FILE}, exiting ...${NC}" exit 1 fi @@ -85,14 +93,14 @@ check_conf_files() { visor_keeper_registry_rpc_in_keeper=$(jq '.chrono_keeper.VisorKeeperRegistryService.rpc' "${CONF_FILE}") if [[ "${visor_keeper_registry_rpc_in_visor}" != "${visor_keeper_registry_rpc_in_keeper}" ]] then - echo "mismatched VisorKeeperRegistryService conf in ${CONF_FILE}, exiting ..." + echo -e "${ERR}mismatched VisorKeeperRegistryService conf in ${CONF_FILE}, exiting ...${NC}" exit 1 fi } extract_shared_libraries() { local executable="$1" - ldd_output=$(ldd ${executable} 2>/dev/null | awk '{print $3}' | grep -v 'not' | grep -v '^/lib') + ldd_output=$(ldd ${executable} 2>/dev/null | grep '=>' | awk '{print $3}' | grep -v 'not' | grep -v '^/lib') echo "${ldd_output}" } @@ -103,9 +111,9 @@ copy_shared_libs_recursive() { # Copy the library and maintain symbolic links recursively final_dest_lib_copies=false + echo -e "${DEBUG}Copying ${lib_path} recursively ...${NC}" while [ "$final_dest_lib_copies" != true ] do - echo "Copying ${lib_path}, linked to ${linked_to_lib_path} ..." cp -P "$lib_path" "$dest_path/" if [ "$lib_path" == "$linked_to_lib_path" ] then @@ -124,11 +132,14 @@ copy_shared_libs() { # Combine shared libraries from all executables and remove duplicates all_shared_libs=$(echo -e "${libs_visor}\n${libs_keeper}\n${libs_client}" | sort | uniq) - # Copy shared libraries to the bin directory + # Copy shared libraries to the lib directory + echo -e "${DEBUG}Copying shared library ...${NC}" mkdir -p ${LIB_DIR} for lib in ${all_shared_libs}; do - echo "Copying shared library ${lib} ..." - copy_shared_libs_recursive ${lib} ${LIB_DIR} + if [[ ! 
-z ${lib} ]] + then + copy_shared_libs_recursive ${lib} ${LIB_DIR} + fi done } @@ -141,7 +152,7 @@ get_host_ip() { else host_ip=$(getent hosts ${hostname}${HOSTNAME_HS_NET_POSTFIX} | awk '{print $1}' | head -1) fi - echo ${host_ip} + echo "${host_ip}" } update_visor_ip() { @@ -149,10 +160,10 @@ update_visor_ip() { visor_ip=$(get_host_ip ${visor_host}) if [[ -z "${visor_ip}" ]] then - echo "Cannot get ChronoVisor IP, exiting ..." + echo -e "${ERR}Cannot get ChronoVisor IP, exiting ...${NC}" exit 1 fi - echo "Replacing ChronoVisor IP with ${visor_ip} ..." + echo -e "${INFO}Replacing ChronoVisor IP with ${visor_ip} ...${NC}" jq ".chrono_visor.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_client.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_visor.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} @@ -163,7 +174,7 @@ generate_conf_for_each_keeper() { for keeper_host in $(cat ${KEEPER_HOSTS} | awk '{print $1}') do keeper_ip=$(get_host_ip ${keeper_host}) - echo "Generating conf file for Keeper ${keeper_host} ..." + echo -e "${INFO}Generating conf file for Keeper ${keeper_host} ...${NC}" jq ".chrono_keeper.KeeperDataStoreAdminService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_keeper.KeeperRecordingService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_keeper.Logging.log.file = \"chronokeeper_logfile.txt.${keeper_host}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE}.${keeper_host} @@ -171,7 +182,7 @@ generate_conf_for_each_keeper() { } install() { - echo "Installing ..." + echo -e "${INFO}Installing ...${NC}" copy_shared_libs @@ -189,75 +200,121 @@ install() { deploy() { install - echo "Deploying ..." 
+ echo -e "${INFO}Deploying ...${NC}" # launch Visor + echo -e "${DEBUG}Lauching ChronoVisor ...${NC}" + VISOR_BIN="${VISOR_BIN_DIR}/${VISOR_BIN_FILE_NAME}" VISOR_ARGS="--config ${CONF_FILE}" - mpssh -f ${VISOR_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}.\$(hostname) 2>&1 &" + mpssh -f ${VISOR_HOSTS} "cd ${VISOR_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 # launch Keeper + echo -e "${DEBUG}Lauching ChronoKeeper ...${NC}" + KEEPER_BIN="${KEEPER_BIN_DIR}/${KEEPER_BIN_FILE_NAME}" KEEPER_ARGS="--config ${CONF_FILE}" - mpssh -f ${KEEPER_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS}.\$(hostname) > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" + mpssh -f ${KEEPER_HOSTS} "cd ${KEEPER_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS}.\$(hostname) > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 # launch Client + echo -e "${DEBUG}Lauching Client ...${NC}" + CLIENT_BIN="${CLIENT_BIN_DIR}/${CLIENT_BIN_FILE_NAME}" CLIENT_ARGS="--config ${CONF_FILE}" - mpssh -f ${CLIENT_HOSTS} "cd ${BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS}.\$(hostname) > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" + mpssh -f ${CLIENT_HOSTS} "cd ${CLIENT_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS}.\$(hostname) > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 # check Visor - mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking ChronoVisor ...${NC}" + mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 # check Keeper - mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking ChronoKeeper ...${NC}" + mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 # check Client - mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking Client ...${NC}" + mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 } reset() { - prepare_hosts + if [[ -z ${JOB_ID} ]] + then + echo -e "${INFO}No JOB_ID provided, use hosts files in ${CONF_DIR}${NC}" + check_hosts_files + else + echo -e "${INFO}JOB_ID is provided, prepare hosts file first${NC}" + prepare_hosts + fi - echo "Resetting ..." 
+ echo -e "${INFO}Resetting ...${NC}" # kill Visor - mpssh -f ${VISOR_HOSTS} "pkill --signal 9 -f ${VISOR_BIN_FILE_NAME}" + echo -e "${DEBUG}Killing ChronoVisor ...${NC}" + mpssh -f ${VISOR_HOSTS} "pkill --signal 9 -f ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 # kill Keeper - mpssh -f ${KEEPER_HOSTS} "pkill --signal 9 -f ${KEEPER_BIN_FILE_NAME}" + echo -e "${DEBUG}Killing ChronoKeeper ...${NC}" + mpssh -f ${KEEPER_HOSTS} "pkill --signal 9 -f ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 # kill Client - mpssh -f ${CLIENT_HOSTS} "pkill --signal 9 -f ${CLIENT_BIN_FILE_NAME}" + echo -e "${DEBUG}Killing Client ...${NC}" + mpssh -f ${CLIENT_HOSTS} "pkill --signal 9 -f ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 # check Visor - mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking ChronoVisor ...${NC}" + mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 # check Keeper - mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking ChronoKeeper ...${NC}" + mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 # check Client - mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" + echo -e "${DEBUG}Checking Client ...${NC}" + mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 } parse_args() { - TEMP=$(getopt -o v:k:c:s:p:t:f:j:idr --long visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,conf_file:,job_id:,install,deploy,reset -- "$@") + TEMP=$(getopt -o w:v:k:c:s:p:t:f:j:hidr --long work_dir:visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,conf_file:,job_id:,help,install,deploy,reset -- "$@") - if [ $? != 0 ] ; then echo "Terminating ..." >&2 ; exit 1 ; fi + if [ $? != 0 ] ; then echo -e "${ERR}Terminating ...${NC}" >&2 ; exit 1 ; fi # Note the quotes around '$TEMP': they are essential! 
eval set -- "$TEMP" while [[ $# -gt 0 ]]; do case "$1" in + -w|--work_dir) + WORK_DIR=$(realpath "$2") + LIB_DIR="${WORK_DIR}/lib" + CONF_DIR="${WORK_DIR}/conf" + VISOR_BIN_FILE_NAME="chronovisor_server" + KEEPER_BIN_FILE_NAME="chrono_keeper" + CLIENT_BIN_FILE_NAME="client_lib_multi_storytellers" + VISOR_BIN="${WORK_DIR}/bin/${VISOR_BIN_FILE_NAME}" + KEEPER_BIN="${WORK_DIR}/bin/${KEEPER_BIN_FILE_NAME}" + CLIENT_BIN="${WORK_DIR}/bin/${CLIENT_BIN_FILE_NAME}" + VISOR_BIN_DIR=$(dirname ${VISOR_BIN}) + KEEPER_BIN_DIR=$(dirname ${KEEPER_BIN}) + CLIENT_BIN_DIR=$(dirname ${CLIENT_BIN}) + CONF_FILE="${CONF_DIR}/default_conf.json" + VISOR_ARGS="--config ${CONF_FILE}" + KEEPER_ARGS="--config ${CONF_FILE}" + CLIENT_ARGS="--config ${CONF_FILE}" + VISOR_HOSTS="${CONF_DIR}/hosts_visor" + KEEPER_HOSTS="${CONF_DIR}/hosts_keeper" + CLIENT_HOSTS="${CONF_DIR}/hosts_client" + shift 2 ;; -v|--visor) VISOR_BIN=$(realpath "$2") VISOR_BIN_FILE_NAME=$(basename ${VISOR_BIN}) + VISOR_BIN_DIR=$(dirname ${VISOR_BIN}) shift 2 ;; -k|--keeper) KEEPER_BIN=$(realpath "$2") KEEPER_BIN_FILE_NAME=$(basename ${KEEPER_BIN}) + KEEPER_BIN_DIR=$(dirname ${KEEPER_BIN}) shift 2 ;; -c|--client) CLIENT_BIN=$(realpath "$2") CLIENT_BIN_FILE_NAME=$(basename ${CLIENT_BIN}) + CLIENT_BIN_DIR=$(dirname ${CLIENT_BIN}) shift 2 ;; -s|--visor_hosts) VISOR_HOSTS=$(realpath "$2") @@ -270,10 +327,14 @@ parse_args() { shift 2 ;; -f|--conf_file) CONF_FILE=$(realpath "$2") + CONF_DIR=$(dirname ${CONF_FILE}) shift 2 ;; -j|--job_id) JOB_ID="$2" shift 2 ;; + -h|--help) + usage + shift 2 ;; -i|--install) install=true shift ;; @@ -286,7 +347,7 @@ parse_args() { --) shift; break ;; *) - echo "Unknown option: $1" + echo -e "${ERR}Unknown option: $1${NC}" exit 1 ;; esac @@ -294,7 +355,7 @@ parse_args() { if [[ "$deploy" == true && "$reset" == true ]] then - echo "Error: You must choose between deploy (-d) or reset (-r)." + echo -e "${ERR}Error: You must choose between deploy (-d) or reset (-r).${NC}" usage exit 1 fi @@ -305,7 +366,7 @@ parse_args() { # Check if required options are provided if [[ -z ${VISOR_BIN} || -z ${KEEPER_BIN} || -z ${CLIENT_BIN} || -z ${VISOR_HOSTS} || -z ${KEEPER_HOSTS} || -z ${CLIENT_HOSTS} ]]; then - echo "Missing required options." + echo -e "${ERR}Missing required options.${NC}" exit 1 fi } @@ -313,32 +374,38 @@ parse_args() { prepare_hosts() { if [ -n "$SLURM_JOB_ID" ] then - echo "Launched as a SLURM job, getting hosts from job ${SLURM_JOB_ID} ..." + echo -e "${DEBUG}Launched as a SLURM job, getting hosts from job ${SLURM_JOB_ID} ...${NC}" hosts="$(echo \"$SLURM_JOB_NODELIST\" | tr ',' '\n')" echo "${hosts}" | head -1 > ${VISOR_HOSTS} echo "${hosts}" > ${KEEPER_HOSTS} echo "${hosts}" > ${CLIENT_HOSTS} else - echo "Launched from a shell, getting hosts from command line or presets ..." 
+ echo -e "${DEBUG}Launched from a shell, getting hosts from command line or presets ...${NC}" if [ -n "${JOB_ID}" ] then - echo "JOB_ID is set to be ${JOB_ID} via command line, use it" + echo -e "${INFO}JOB_ID is set to be ${JOB_ID} via command line, use it${NC}" hosts_regex="$(squeue | grep ${JOB_ID} | awk '{print $NF}')" + if [[ -z ${hosts_regex} ]] + then + echo -e "${ERR}Cannot find job ${JOB_ID}, exiting ...${NC}" + exit 1 + fi hosts="$(scontrol show hostnames ${hosts_regex})" echo "${hosts}" | head -1 > ${VISOR_HOSTS} echo "${hosts}" > ${KEEPER_HOSTS} echo "${hosts}" > ${CLIENT_HOSTS} else - echo "JOB_ID is not set, use presets" + echo -e "${DEBUG}JOB_ID is not set, use presets${NC}" fi check_hosts_files fi } usage() { - echo "Usage: $0 -i|--install - -d|--deploy - -r|--reset + echo "Usage: $0 -i|--install Prepare ChronoLog deployment + -d|--deploy Start ChronoLog deployment + -r|--reset Reset ChronoLog deployment + -w|--work_dir WORK_DIR -v|--visor VISOR_BIN -k|--keeper KEEPER_BIN -c|--client CLIENT_BIN @@ -346,7 +413,8 @@ usage() { -p|--keeper_hosts KEEPER_HOSTS -r|--client_hosts CLIENT_HOSTS -f|--conf_file CONF_FILE - -j|--job_id JOB_ID" + -j|--job_id JOB_ID + -h|--help Print this page" exit 1 } @@ -364,6 +432,7 @@ elif ${reset} then reset else - echo "Please select deploy or reset mode" + echo -e "${ERR}Please select deploy or reset mode${NC}" usage fi +echo -e "${INFO}Done${NC}" From 45bdc41eb9d565e02caab8fb02204a8af42e0556 Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Wed, 27 Mar 2024 19:51:00 -0500 Subject: [PATCH 16/40] FEAT: Updated README.md --- README.md | 113 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 572b6cc1..c59bb60d 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ -# ChronoLog +# ChronoLog + +--- ChronoLog: A High-Performance Storage Infrastructure for Activity and Log Workloads (NSF CSSI 2104013) ## ChronoLog Project Synopsis +--- + This project will design and implement ChronoLog, a distributed and tiered shared log storage ecosystem. ChronoLog uses physical time to distribute log entries while providing total log ordering. It also utilizes multiple storage tiers to elastically scale the log capacity (i.e., auto-tiering). ChronoLog will serve as a foundation for developing scalable @@ -12,6 +16,8 @@ distribution, a log-based key-value store, and a log-based TensorFlow module. ## Workloads and Applications +--- + Modern applications spanning from Edge to High Performance Computing (HPC) systems, produce and process log data and create a plethora of workload characteristics that rely on a common storage model: **the distributed shared log**. @@ -19,34 +25,57 @@ create a plethora of workload characteristics that rely on a common storage mode ## Features +--- + ![Feature matrix](/doc/images/feature-matrix.png) ## Checkout ChronoLog -ChronoLog uses HCL internally. It is added to this repository as a submodule. Thus, you need to clone the submodules as -well. You can do it using `git clone --recursive git@github.com:scs-lab/ChronoLog.git` to clone ChronoLog. Or you can -run `git submodule update --init --recursive` once in `ChronoLog` directory after you clone the repository -without `--recursive`. For following pulls, you can update the submodule using command `git pull --recurse-submodules`. +--- -## Building +To get started with ChronoLog, the first step involves cloning the repository to your system. 
ChronoLog utilizes the HCL language, which is integrated as a submodule within the repository. This setup requires specific steps to ensure both the main project and its submodule are correctly cloned and initialized. Below, you will find two options to achieve this: -ChronoLog has a list of dependencies which can be solved by Spack packages. Thus, Spack needs to be installed and -configured as the first step to build ChronoLog. +**Option 1: Clone directly with Submodules (Clone and automatically initializes and updates the submodules)** -### Install Spack +``` +git clone --recursive https://github.com/grc-iit/ChronoLog.git +``` + +**Option 2: Initialize Submodules After Cloning** -Spack can be checked out with `git clone https://github.com/spack/spack.git`. It is assumed that Spack is stored -at `~/Spack` for the following step. Spack needs to activated by -running `source ~/Spack/spack/share/spack/setup-env.sh`. +If you prefer or need to separate the cloning and submodule initialization into two steps, follow these instructions: -### Install ChronoLog dependencies +``` +git clone https://github.com/grc-iit/ChronoLog.git +git submodule update --init --recursive +``` -Currently, most of the dependencies are listed in `spack.yaml` and can be installed via Spack. `gcc` and `g++` will be -needed to build ChronoLog. +**Future Updates with Submodules** -A Spack environment needs to be created and activated using the following commands. When the environment is activated, a -shell prompt `[ChronoLog]` will pop up. +For subsequent updates, ensure you also update the submodules alongside the main repository by using: +``` +git pull --recurse-submodules +``` + +## Installation and configuration + +--- + +### Prerequisites: Spack + +ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack version **`v0.21.2 (2024-03-01)`**. Follow the steps below to install and configure Spack: + +``` +git clone --branch v0.21.2 https://github.com/spack/spack.git +source /path-to-where-spack-was-cloned/spack/share/spack/setup-env.sh +``` + +### Installing Dependencies + +Currently, most of the dependencies are listed in `spack.yaml` and can be installed via Spack.`gcc` and `g++` will be needed to build ChronoLog. + +A Spack environment needs to be created and activated using the following commands. When the environment is activated, a shell prompt `[ChronoLog]` will pop up. ``` cd ChronoLog git switch develop @@ -54,44 +83,50 @@ spack env create -d . spack env activate -p . spack install -v ``` + :information_source: Installation can take > 30 minutes. -The installation may take some time (> 30 minutes) to finish. +### Building ChronoLog -### Build ChronoLog - -**Please make sure all the building is carried out in the activated Spack environment.** Otherwise, CMake will not able -to find the dependencies. - -Two executables, `chronovisor_server` and `chrono_keeper`, will be built for ChronoVisor and ChronoKeeper, respectively. -Multiple client test cases will be built in the `test/integration/Client/` directory. An additional command line client -admin tool `client_admin` will be built in the `Client/ChronoAdmin` directory as a workload generator. + :exclamation: Ensure all build steps are performed within the activated Spack environment to allow CMake to locate necessary dependencies: ``` +// Build the environment cd ChronoLog git switch develop -mkdir build -cd build -cmake .. +mkdir build && cd build + +// Build the project +cmkae .. 
make all ``` -### Install ChronoLog +Building ChronoLog generates the following executables: + +- **Servers:** `chronovisor_server` for ChronoVisor and `chrono_keeper` for ChronoKeeper. +- **Client Test Cases:** Located in `test/integration/Client/`. +- **Admin Tool:** `client_admin` in `Client/ChronoAdmin` serves as a workload generator. -You can run `make install` in the build directory to install all generated executables along with their dependencies to -the install directory (`~/chronolog` by default). +### Installing ChronoLog + +From the build directory, execute + +``` +make install +``` + +to install executables and dependencies into the default install directory (**`~/chronolog`**). ### Configuration files -All ChronoLog executables need a configuration file to run properly. The template file can be found -in `default_conf.json.in`. You can modify it for your own preferences. The default installing process will copy and -rename `default_conf.json.in` into `conf` under the install directory. You can pass it to the executables via command -line argument `--config default_conf.json`. +--- -ChronoLog will support sockets/TCP/verbs protocols using ofi transport. You can run command `margo-info` to check which -transports and protocols are supported on your system. +- **Default Configuration:** ChronoLog executables require a configuration file (`default_conf.json.in`). Modify this template according to your preferences. +- **Installation Process:** The default installation copies and renames `default_conf.json.in` to `conf` in the install directory. +- **Using Configuration:** Pass the configuration file to executables with `-config default_conf.json`. ------- + +------ # Coming soon ... For more details about the ChronoLog project, please visit our website http://chronolog.dev. From aeee569e789788f6e18e77300b64ef2a1481de4e Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Wed, 27 Mar 2024 19:53:28 -0500 Subject: [PATCH 17/40] Delete unnecessary horizontal lines at README.md --- README.md | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/README.md b/README.md index c59bb60d..2d362443 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,9 @@ # ChronoLog ---- - ChronoLog: A High-Performance Storage Infrastructure for Activity and Log Workloads (NSF CSSI 2104013) ## ChronoLog Project Synopsis ---- - This project will design and implement ChronoLog, a distributed and tiered shared log storage ecosystem. ChronoLog uses physical time to distribute log entries while providing total log ordering. It also utilizes multiple storage tiers to elastically scale the log capacity (i.e., auto-tiering). ChronoLog will serve as a foundation for developing scalable @@ -16,8 +12,6 @@ distribution, a log-based key-value store, and a log-based TensorFlow module. ## Workloads and Applications ---- - Modern applications spanning from Edge to High Performance Computing (HPC) systems, produce and process log data and create a plethora of workload characteristics that rely on a common storage model: **the distributed shared log**. @@ -25,14 +19,9 @@ create a plethora of workload characteristics that rely on a common storage mode ## Features ---- - ![Feature matrix](/doc/images/feature-matrix.png) ## Checkout ChronoLog - ---- - To get started with ChronoLog, the first step involves cloning the repository to your system. ChronoLog utilizes the HCL language, which is integrated as a submodule within the repository. 
This setup requires specific steps to ensure both the main project and its submodule are correctly cloned and initialized. Below, you will find two options to achieve this: **Option 1: Clone directly with Submodules (Clone and automatically initializes and updates the submodules)** @@ -60,8 +49,6 @@ git pull --recurse-submodules ## Installation and configuration ---- - ### Prerequisites: Spack ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack version **`v0.21.2 (2024-03-01)`**. Follow the steps below to install and configure Spack: @@ -116,10 +103,7 @@ make install to install executables and dependencies into the default install directory (**`~/chronolog`**). -### Configuration files - ---- - +## Configuration files - **Default Configuration:** ChronoLog executables require a configuration file (`default_conf.json.in`). Modify this template according to your preferences. - **Installation Process:** The default installation copies and renames `default_conf.json.in` to `conf` in the install directory. - **Using Configuration:** Pass the configuration file to executables with `-config default_conf.json`. From b1fda0ec1bf2e949b34af0500b7888b47a1f5a20 Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Wed, 27 Mar 2024 19:59:10 -0500 Subject: [PATCH 18/40] Update spack and website links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2d362443..5d845b3c 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ git pull --recurse-submodules ### Prerequisites: Spack -ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack version **`v0.21.2 (2024-03-01)`**. Follow the steps below to install and configure Spack: +ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack version [`v0.21.2 (2024-03-01)`](https://github.com/spack/spack/releases/tag/v0.21.2). Follow the steps below to install and configure Spack: ``` git clone --branch v0.21.2 https://github.com/spack/spack.git @@ -113,5 +113,5 @@ to install executables and dependencies into the default install directory (**`~ ------ # Coming soon ... -For more details about the ChronoLog project, please visit our website http://chronolog.dev. +For more details about the ChronoLog project, please visit our [website](https://grc.iit.edu/research/projects/chronolog) From 6358cfbcfb71fe7d479d40f1e17f777c6dfce9a4 Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Tue, 2 Apr 2024 11:48:33 -0500 Subject: [PATCH 19/40] Updated readme with requested changes --- README.md | 54 ++++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 5d845b3c..43411da8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ChronoLog +# ChronoLog ChronoLog: A High-Performance Storage Infrastructure for Activity and Log Workloads (NSF CSSI 2104013) @@ -22,36 +22,20 @@ create a plethora of workload characteristics that rely on a common storage mode ![Feature matrix](/doc/images/feature-matrix.png) ## Checkout ChronoLog -To get started with ChronoLog, the first step involves cloning the repository to your system. ChronoLog utilizes the HCL language, which is integrated as a submodule within the repository. This setup requires specific steps to ensure both the main project and its submodule are correctly cloned and initialized. 
Below, you will find two options to achieve this: -**Option 1: Clone directly with Submodules (Clone and automatically initializes and updates the submodules)** - -``` -git clone --recursive https://github.com/grc-iit/ChronoLog.git -``` - -**Option 2: Initialize Submodules After Cloning** - -If you prefer or need to separate the cloning and submodule initialization into two steps, follow these instructions: +To get started with ChronoLog, the first step involves cloning the repository to your system. To do so: ``` git clone https://github.com/grc-iit/ChronoLog.git -git submodule update --init --recursive -``` - -**Future Updates with Submodules** - -For subsequent updates, ensure you also update the submodules alongside the main repository by using: - -``` -git pull --recurse-submodules ``` ## Installation and configuration ### Prerequisites: Spack -ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack version [`v0.21.2 (2024-03-01)`](https://github.com/spack/spack/releases/tag/v0.21.2). Follow the steps below to install and configure Spack: +ChronoLog requires various packages managed by Spack. To ensure compatibility and stability, we recommend using Spack +version [`v0.21.2 (2024-03-01)`](https://github.com/spack/spack/releases/tag/v0.21.2). Follow the steps below to install +and configure Spack: ``` git clone --branch v0.21.2 https://github.com/spack/spack.git @@ -60,21 +44,25 @@ source /path-to-where-spack-was-cloned/spack/share/spack/setup-env.sh ### Installing Dependencies -Currently, most of the dependencies are listed in `spack.yaml` and can be installed via Spack.`gcc` and `g++` will be needed to build ChronoLog. +Currently, most of the dependencies are listed in `spack.yaml` and can be installed via Spack. `gcc` and `g++` will be +needed to build ChronoLog. + +A Spack environment needs to be created and activated using the following commands. When the environment is activated, a +shell prompt `[ChronoLog]` will pop up. -A Spack environment needs to be created and activated using the following commands. When the environment is activated, a shell prompt `[ChronoLog]` will pop up. ``` cd ChronoLog git switch develop -spack env create -d . spack env activate -p . spack install -v ``` - :information_source: Installation can take > 30 minutes. + +:information_source: Installation can take > 30 minutes. ### Building ChronoLog - :exclamation: Ensure all build steps are performed within the activated Spack environment to allow CMake to locate necessary dependencies: +:exclamation: Ensure all build steps are performed within the activated Spack environment to allow CMake to locate +necessary dependencies: ``` // Build the environment @@ -83,7 +71,7 @@ git switch develop mkdir build && cd build // Build the project -cmkae .. +cmake .. make all ``` @@ -104,14 +92,16 @@ make install to install executables and dependencies into the default install directory (**`~/chronolog`**). ## Configuration files -- **Default Configuration:** ChronoLog executables require a configuration file (`default_conf.json.in`). Modify this template according to your preferences. -- **Installation Process:** The default installation copies and renames `default_conf.json.in` to `conf` in the install directory. -- **Using Configuration:** Pass the configuration file to executables with `-config default_conf.json`. - +- **Default Configuration:** ChronoLog executables require a configuration file. 
Modify this template (`default_conf.json.in`) according to your preferences. +- **Installation Process:** The installation copies and renames default_conf.json.in to conf/default_conf.json in the + installation directory by default. You can pass -DCMAKE_INSTALL_PREFIX=/new/installation/directory to CMake to change it. +- **Using Configuration:** Pass the configuration file to executables with `--config default_conf.json`. ------ + # Coming soon ... -For more details about the ChronoLog project, please visit our [website](https://grc.iit.edu/research/projects/chronolog) +For more details about the ChronoLog project, please visit +our [website](https://www.chronolog.dev) From 28b3e086192319ca86ff91db714dd065c554c63a Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Thu, 4 Apr 2024 02:03:33 -0500 Subject: [PATCH 20/40] Updated readme with requested changes --- README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 43411da8..9b649f30 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,7 @@ source /path-to-where-spack-was-cloned/spack/share/spack/setup-env.sh Currently, most of the dependencies are listed in `spack.yaml` and can be installed via Spack. `gcc` and `g++` will be needed to build ChronoLog. -A Spack environment needs to be created and activated using the following commands. When the environment is activated, a -shell prompt `[ChronoLog]` will pop up. +A Spack environment needs to be created and activated using the following commands. ``` cd ChronoLog @@ -56,14 +55,17 @@ git switch develop spack env activate -p . spack install -v ``` - +To check if the environment is activated the following can be executed: +``` +spack env status +``` :information_source: Installation can take > 30 minutes. ### Building ChronoLog +:exclamation: Ensure (by using `spack env status`) all building steps are performed within the activated Spack environment to allow CMake to locate +necessary dependencies. To do so: -:exclamation: Ensure all build steps are performed within the activated Spack environment to allow CMake to locate -necessary dependencies: - +For building ChronoLog the following commands must be executed. ``` // Build the environment cd ChronoLog @@ -94,8 +96,8 @@ to install executables and dependencies into the default install directory (**`~ ## Configuration files - **Default Configuration:** ChronoLog executables require a configuration file. Modify this template (`default_conf.json.in`) according to your preferences. -- **Installation Process:** The installation copies and renames default_conf.json.in to conf/default_conf.json in the - installation directory by default. You can pass -DCMAKE_INSTALL_PREFIX=/new/installation/directory to CMake to change it. +- **Installation Process:** The installation copies and renames `default_conf.json.in` to `conf/default_conf.json` in the + installation directory by default. You can pass `-DCMAKE_INSTALL_PREFIX=/new/installation/directory` to CMake to change it. - **Using Configuration:** Pass the configuration file to executables with `--config default_conf.json`. 
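For example, assuming the default install prefix `~/chronolog` (adjust the paths to your own installation directory), ChronoVisor could be started as:

```
~/chronolog/bin/chronovisor_server --config ~/chronolog/conf/default_conf.json
```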
------ From c6b6cefcc4e81c9c9766af276bf59c089af4d10f Mon Sep 17 00:00:00 2001 From: Eneko Gonzalez Date: Fri, 5 Apr 2024 14:09:32 -0500 Subject: [PATCH 21/40] Move environment installation after checking the status --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b649f30..87f25a9b 100644 --- a/README.md +++ b/README.md @@ -53,12 +53,15 @@ A Spack environment needs to be created and activated using the following comman cd ChronoLog git switch develop spack env activate -p . -spack install -v ``` To check if the environment is activated the following can be executed: ``` spack env status ``` +If the environment is properly activated, it can be installed +``` +spack install -v +``` :information_source: Installation can take > 30 minutes. ### Building ChronoLog From 655b174b1ecdd54b8314f8cf8f7e2a7cd9c2435d Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Fri, 5 Apr 2024 15:48:48 -0500 Subject: [PATCH 22/40] StoryChunk serialization fixed --- chrono_common/StoryChunk.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/chrono_common/StoryChunk.h b/chrono_common/StoryChunk.h index 0a60e13d..2d9132da 100644 --- a/chrono_common/StoryChunk.h +++ b/chrono_common/StoryChunk.h @@ -6,6 +6,9 @@ #include "chronolog_types.h" #include "log.h" +#include +#include + namespace chronolog { @@ -76,7 +79,6 @@ class StoryChunk uint32_t eraseEvents(uint64_t start_time, uint64_t end_time); // serialization function used by thallium RPC providers - // to serialize/deserialize KeeperIdCard template void serialize( SerArchiveT & serT) { @@ -84,10 +86,7 @@ class StoryChunk serT & startTime; serT & endTime; serT & revisionTime; - for(auto iter=logEvents.begin(); iter!= logEvents.end(); ++iter) - { - serT((*iter).second); - } + serT& logEvents; } From 209969dda6fa98e6e3282f13520ae815bc970f76 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Fri, 5 Apr 2024 15:51:29 -0500 Subject: [PATCH 23/40] CMakeLists.txt changes --- ChronoGrapher/CMakeLists.txt | 3 +-- ChronoKeeper/CMakeLists.txt | 4 +++- ChronoStore/test/CMakeLists.txt | 3 ++- Client/CMakeLists.txt | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt index 77aa0c0b..d646c0b3 100644 --- a/ChronoGrapher/CMakeLists.txt +++ b/ChronoGrapher/CMakeLists.txt @@ -16,11 +16,10 @@ target_sources(chrono_grapher PRIVATE StoryPipeline.cpp KeeperDataStore.cpp ../chrono_common/StoryChunk.cpp - ../chrono_common/ConfigurationManager.cpp StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) -target_link_libraries(chrono_grapher thallium) +target_link_libraries(chrono_grapher chronolog_client thallium) #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in # ${CMAKE_CURRENT_BINARY_DIR}/default_conf.json COPYONLY) diff --git a/ChronoKeeper/CMakeLists.txt b/ChronoKeeper/CMakeLists.txt index 81622fe1..379098ed 100644 --- a/ChronoKeeper/CMakeLists.txt +++ b/ChronoKeeper/CMakeLists.txt @@ -20,7 +20,9 @@ target_sources(chrono_keeper PRIVATE StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) -target_link_libraries(chrono_keeper thallium) + +target_link_libraries(chrono_keeper chronolog_client thallium) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in ${CMAKE_CURRENT_BINARY_DIR}/default_conf.json COPYONLY) message("build target : chrono_keeper") diff --git a/ChronoStore/test/CMakeLists.txt b/ChronoStore/test/CMakeLists.txt index 
1bcef0d9..519354b2 100644 --- a/ChronoStore/test/CMakeLists.txt +++ b/ChronoStore/test/CMakeLists.txt @@ -11,7 +11,8 @@ add_executable(hdf5_archiver_test ../include/StoryReader.h ../src/StoryReader.cpp story_chunk_test_utils.h - ../../ChronoAPI/ChronoLog/src/log.cpp) + ../../ChronoAPI/ChronoLog/src/log.cpp + ../../chrono_common/StoryChunk.cpp) target_include_directories(hdf5_archiver_test PRIVATE ../../ChronoKeeper ../../chrono_common diff --git a/Client/CMakeLists.txt b/Client/CMakeLists.txt index 07be50c4..62f44461 100644 --- a/Client/CMakeLists.txt +++ b/Client/CMakeLists.txt @@ -30,6 +30,7 @@ add_library(chronolog_client src/ChronologClient.cpp src/ChronologClientImpl.cpp ../ChronoAPI/ChronoLog/src/log.cpp + ../chrono_common/ConfigurationManager.cpp ) # Include directories for the library @@ -62,4 +63,4 @@ install( #../ChronoAPI/ChronoLog/include/RPCFactory.h #../ChronoAPI/ChronoLog/include/singleton.h #../ChronoAPI/ChronoLog/src/city.cpp -#../ChronoAPI/ChronoLog/include/city.h) \ No newline at end of file +#../ChronoAPI/ChronoLog/include/city.h) From b5d615bf13ee29729836b76afc61972575eb0356 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 27 Mar 2024 13:54:41 -0500 Subject: [PATCH 24/40] Grapher Configuration and GrapherRegClient --- ChronoGrapher/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt index d646c0b3..2e8b2bb6 100644 --- a/ChronoGrapher/CMakeLists.txt +++ b/ChronoGrapher/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(chrono_grapher PRIVATE StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) + target_link_libraries(chrono_grapher chronolog_client thallium) #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../default_conf.json.in From 4250ac1635db1c84b38269a92db1825cdfe1139e Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 11 Apr 2024 10:50:28 -0500 Subject: [PATCH 25/40] RecordingGroupId, ServiceID.h and registration message reorg --- chrono_common/GrapherIdCard.h | 38 ++++++++++------- chrono_common/GrapherRegistrationMsg.h | 13 +++--- chrono_common/KeeperIdCard.h | 10 ++--- chrono_common/KeeperRegistrationMsg.h | 54 +++-------------------- chrono_common/ServiceId.h | 59 ++++++++++++++++++++++++++ 5 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 chrono_common/ServiceId.h diff --git a/chrono_common/GrapherIdCard.h b/chrono_common/GrapherIdCard.h index fccef30a..101e793d 100644 --- a/chrono_common/GrapherIdCard.h +++ b/chrono_common/GrapherIdCard.h @@ -5,6 +5,8 @@ #include +#include "ServiceId.h" + // this class wrapps ChronoGrapher Process identification // that will be used by all the ChronoLog Processes // to both identofy the process and create RPC client channels @@ -15,25 +17,30 @@ namespace chronolog class GrapherIdCard { - + RecordingGroupId groupId; uint32_t ip_addr; //IP address as uint32_t in host byte order uint16_t port; //port number as uint16_t in host byte order uint16_t tl_provider_id; // id of thallium service provider public: - - - GrapherIdCard( uint32_t addr = 0, uint16_t a_port=0, uint16_t provider_id=0) - : ip_addr(addr), port(a_port),tl_provider_id(provider_id) + GrapherIdCard(RecordingGroupId group_id = 0, uint32_t addr = 0, uint16_t a_port = 0, uint16_t provider_id = 0) + : groupId(group_id) + , ip_addr(addr) + , port(a_port) + , tl_provider_id(provider_id) {} - GrapherIdCard( GrapherIdCard const& other) - : ip_addr(other.getIPaddr()), port(other.getPort()),tl_provider_id(other.getProviderId()) - {} + 
GrapherIdCard(GrapherIdCard const& other) + : groupId(other.getGroupId()) + , ip_addr(other.getIPaddr()) + , port(other.getPort()) + , tl_provider_id(other.getProviderId()) + {} ~GrapherIdCard()=default; - uint32_t getIPaddr() const {return ip_addr; } + RecordingGroupId getGroupId() const { return groupId; } + uint32_t getIPaddr() const { return ip_addr; } uint16_t getPort() const { return port;} uint16_t getProviderId () const { return tl_provider_id; } @@ -43,6 +50,7 @@ class GrapherIdCard template void serialize( SerArchiveT & serT) { + serT& groupId; serT & ip_addr; serT & port; serT & tl_provider_id; @@ -66,16 +74,16 @@ class GrapherIdCard inline bool operator==(chronolog::GrapherIdCard const& card1, chronolog::GrapherIdCard const& card2) { - return ( (card1.getIPaddr()==card2.getIPaddr() && card1.getPort() == card2.getPort() - && card1.getProviderId() == card2.getProviderId()) ? true : false ); - + return ((card1.getGroupId() == card2.getGroupId() && card1.getIPaddr() == card2.getIPaddr() && + card1.getPort() == card2.getPort() && card1.getProviderId() == card2.getProviderId()) + ? true + : false); } inline std::ostream & operator<< (std::ostream & out , chronolog::GrapherIdCard const & id_card) { std::string a_string; - out << "GrapherIdCard{" - <<":"< #include -#include "KeeperIdCard.h" + #include "GrapherIdCard.h" -#include "KeeperRegistrationMsg.h" namespace chronolog { @@ -17,12 +16,10 @@ class GrapherRegistrationMsg ServiceId adminServiceId; public: - - - GrapherRegistrationMsg(GrapherIdCard const &id_card = GrapherIdCard{0, 0, 0} - , ServiceId const &admin_service_id = ServiceId{0, 0, 0}) - : grapherIdCard(id_card) - , adminServiceId(admin_service_id) + GrapherRegistrationMsg(GrapherIdCard const& id_card = GrapherIdCard{0, 0, 0}, + ServiceId const& admin_service_id = ServiceId{0, 0, 0}) + : grapherIdCard(id_card) + , adminServiceId(admin_service_id) {} ~GrapherRegistrationMsg() = default; diff --git a/chrono_common/KeeperIdCard.h b/chrono_common/KeeperIdCard.h index 98f72c0e..bce1a8ce 100644 --- a/chrono_common/KeeperIdCard.h +++ b/chrono_common/KeeperIdCard.h @@ -5,6 +5,8 @@ #include +#include "ServiceId.h" + // this class wrapps ChronoKeeper Process identification // that will be used by all the ChronoLog Processes // to both identofy the Keepr process and create RPC client channels @@ -20,14 +22,12 @@ typedef uint32_t in_addr_t; typedef uint16_t in_port_t; typedef std::pair service_endpoint; -// KeeperGroup is the logical grouping of KeeperProcesses -typedef uint32_t KeeperGroupId; - +typedef uint32_t KeeperGroupId; class KeeperIdCard { - KeeperGroupId keeper_group_id; + RecordingGroupId keeper_group_id; uint32_t ip_addr; //IP address as uint32_t in host byte order uint16_t port; //port number as uint16_t in host byte order uint16_t tl_provider_id; // id of thallium service provider @@ -45,7 +45,7 @@ class KeeperIdCard ~KeeperIdCard()=default; - uint32_t getGroupId() const { return keeper_group_id; } + RecordingGroupId getGroupId() const { return keeper_group_id; } uint32_t getIPaddr() const {return ip_addr; } uint16_t getPort() const { return port;} uint16_t getProviderId () const { return tl_provider_id; } diff --git a/chrono_common/KeeperRegistrationMsg.h b/chrono_common/KeeperRegistrationMsg.h index cb0f5a27..c6d7fb1f 100644 --- a/chrono_common/KeeperRegistrationMsg.h +++ b/chrono_common/KeeperRegistrationMsg.h @@ -3,46 +3,13 @@ #include #include -#include "KeeperIdCard.h" +#include "KeeperIdCard.h" +#include "ServiceId.h" namespace chronolog { -class ServiceId -{ 
-public: - ServiceId(uint32_t addr, uint16_t a_port, uint16_t a_provider_id): ip_addr(addr), port(a_port), provider_id( - a_provider_id) - {} - - ~ServiceId() = default; - - uint32_t ip_addr; //32int IP representation in host notation - uint16_t port; //16int port representation in host notation - uint16_t provider_id; //thalium provider id - - template - void serialize(SerArchiveT &serT) - { - serT&ip_addr; - serT&port; - serT&provider_id; - } - - std::string &getIPasDottedString(std::string &a_string) const - { - - char buffer[INET_ADDRSTRLEN]; - // convert ip from host to network byte order uint32_t - uint32_t ip_net_order = htonl(ip_addr); - // convert network byte order uint32_t to a dotted string - if(NULL != inet_ntop(AF_INET, &ip_net_order, buffer, INET_ADDRSTRLEN)) - { a_string += std::string(buffer); } - return a_string; - } -}; - class KeeperRegistrationMsg { @@ -50,11 +17,10 @@ class KeeperRegistrationMsg ServiceId adminServiceId; public: - - - KeeperRegistrationMsg(KeeperIdCard const &keeper_card = KeeperIdCard{0, 0, 0} - , ServiceId const &admin_service_id = ServiceId{0, 0, 0}): keeperIdCard(keeper_card) - , adminServiceId(admin_service_id) + KeeperRegistrationMsg(KeeperIdCard const& keeper_card = KeeperIdCard{0, 0, 0}, + ServiceId const& admin_service_id = ServiceId{0, 0, 0}) + : keeperIdCard(keeper_card) + , adminServiceId(admin_service_id) {} ~KeeperRegistrationMsg() = default; @@ -76,14 +42,6 @@ class KeeperRegistrationMsg }//namespace -inline std::ostream &operator<<(std::ostream &out, chronolog::ServiceId const serviceId) -{ - std::string a_string; - out << "{" << serviceId.getIPasDottedString(a_string) << ":" << serviceId.port << ":" << serviceId.provider_id - << "}"; - return out; -} - inline std::ostream &operator<<(std::ostream &out, chronolog::KeeperRegistrationMsg const &msg) { out << "KeeperRegistrationMsg{" << msg.getKeeperIdCard() << "}{admin:" << msg.getAdminServiceId() << "}"; diff --git a/chrono_common/ServiceId.h b/chrono_common/ServiceId.h new file mode 100644 index 00000000..da548234 --- /dev/null +++ b/chrono_common/ServiceId.h @@ -0,0 +1,59 @@ +#ifndef CHRONO_SERVICE_ID_H +#define CHRONO_SERVICE_ID_H + +#include +#include + + +namespace chronolog +{ + +typedef uint32_t RecordingGroupId; + +class ServiceId +{ +public: + ServiceId(uint32_t addr, uint16_t a_port, uint16_t a_provider_id) + : ip_addr(addr) + , port(a_port) + , provider_id(a_provider_id) + {} + + ~ServiceId() = default; + + uint32_t ip_addr; //32int IP representation in host notation + uint16_t port; //16int port representation in host notation + uint16_t provider_id;//thalium provider id + + template + void serialize(SerArchiveT& serT) + { + serT& ip_addr; + serT& port; + serT& provider_id; + } + + std::string& getIPasDottedString(std::string& a_string) const + { + + char buffer[INET_ADDRSTRLEN]; + // convert ip from host to network byte order uint32_t + uint32_t ip_net_order = htonl(ip_addr); + // convert network byte order uint32_t to a dotted string + if(NULL != inet_ntop(AF_INET, &ip_net_order, buffer, INET_ADDRSTRLEN)) { a_string += std::string(buffer); } + return a_string; + } +}; + +}//namespace chronolog + + +inline std::ostream& operator<<(std::ostream& out, chronolog::ServiceId const serviceId) +{ + std::string a_string; + out << "{" << serviceId.getIPasDottedString(a_string) << ":" << serviceId.port << ":" << serviceId.provider_id + << "}"; + return out; +} + +#endif From bd7c22635c0a36b305d4eafd5e7fd08e8698d8a1 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 11 Apr 2024 
16:37:16 -0500 Subject: [PATCH 26/40] grapher delayed exit --- ChronoGrapher/ChronoGrapher.cpp | 5 +- ChronoVisor/include/KeeperRegistry.h | 206 ++++++++++---------- ChronoVisor/src/KeeperRegistry.cpp | 278 +++++++++++++++++++++++++-- chrono_common/GrapherIdCard.h | 7 + chrono_common/KeeperIdCard.h | 8 + 5 files changed, 387 insertions(+), 117 deletions(-) diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp index 7d8deb51..92717750 100644 --- a/ChronoGrapher/ChronoGrapher.cpp +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -104,6 +104,7 @@ int main(int argc, char**argv) LOG_INFO("[ChronoGrapher] DataStoreAdminService started successfully."); // Instantiate GrapherRecordingService + chronolog::RecordingGroupId recording_group_id = 7; std::string RECORDING_SERVICE_PROTOCOL = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.PROTO_CONF; std::string RECORDING_SERVICE_IP = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.IP; uint16_t RECORDING_SERVICE_PORT = confManager.GRAPHER_CONF.RECORDING_SERVICE_CONF.BASE_PORT; @@ -125,8 +126,8 @@ int main(int argc, char**argv) LOG_INFO("[ChronoGrapher] RecordingService started successfully."); // create GrapherIdCard to identify this Grapher process in ChronoVisor's Registry - chronolog::GrapherIdCard processIdCard(recording_endpoint.first, recording_endpoint.second - , recording_service_provider_id); + chronolog::GrapherIdCard processIdCard(recording_group_id, recording_endpoint.first, recording_endpoint.second, + recording_service_provider_id); std::stringstream process_id_string; process_id_string << processIdCard; diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index 185546ec..adb39442 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -31,137 +31,143 @@ class DataStoreAdminClient; class KeeperRegistryService; -class KeeperRegistry +class KeeperProcessEntry { +public: + KeeperProcessEntry(KeeperIdCard const& keeper_id_card, ServiceId const& admin_service_id) + : idCard(keeper_id_card) + , adminServiceId(admin_service_id) + , keeperAdminClient(nullptr) + , active(false) + , lastStatsTime(0) + , activeStoryCount(0) + {} - struct KeeperProcessEntry { - public: - KeeperProcessEntry(KeeperIdCard const& keeper_id_card, ServiceId const& admin_service_id) - : idCard(keeper_id_card) - , adminServiceId(admin_service_id) - , keeperAdminClient(nullptr) - , active(false) - , lastStatsTime(0) - , activeStoryCount(0) - {} + KeeperProcessEntry(KeeperProcessEntry const& other) = default; - KeeperProcessEntry(KeeperProcessEntry const &other) = default; + void reset() + { + keeperAdminClient = nullptr; + active = false; + lastStatsTime = 0; + activeStoryCount = 0; + } + + ~KeeperProcessEntry() = default;// Registry is reponsible for creating & deleting keeperAdminClient + + KeeperIdCard idCard; + ServiceId adminServiceId; + std::string idCardString; + DataStoreAdminClient* keeperAdminClient; + bool active; + uint64_t lastStatsTime; + uint32_t activeStoryCount; + std::list> delayedExitClients; +}; - void reset() - { - keeperAdminClient = nullptr; - active = false; - lastStatsTime = 0; - activeStoryCount = 0; - } - - ~KeeperProcessEntry() = default; // Registry is reponsible for creating & deleting keeperAdminClient - - KeeperIdCard idCard; - ServiceId adminServiceId; - DataStoreAdminClient* keeperAdminClient; - bool active; - uint64_t lastStatsTime; - uint32_t activeStoryCount; - std::list> delayedExitClients; +class GrapherProcessEntry +{ +public: + 
GrapherProcessEntry(GrapherIdCard const& id_card, ServiceId const& admin_service_id) + : idCard(id_card) + , adminServiceId(admin_service_id) + , adminClient(nullptr) + , active(false) + , lastStatsTime(0) + , activeStoryCount(0) + {} + + GrapherProcessEntry(GrapherProcessEntry const& other) = default; + ~GrapherProcessEntry() = default;// Registry is reponsible for creating & deleting keeperAdminClient + + GrapherIdCard idCard; + ServiceId adminServiceId; + std::string idCardString; + DataStoreAdminClient* adminClient; + bool active; + uint64_t lastStatsTime; + uint32_t activeStoryCount; + std::list> delayedExitGrapherClients; }; - struct GrapherProcessEntry + + class RecordingGroup { public: - GrapherProcessEntry(GrapherIdCard const& id_card, ServiceId const& admin_service_id) - : idCard(id_card) - , adminServiceId(admin_service_id) - , adminClient(nullptr) - , active(false) - , lastStatsTime(0) - , activeStoryCount(0) + RecordingGroup(RecordingGroupId group_id, GrapherProcessEntry* grapher_ptr = nullptr) + : groupId(group_id) + , grapherProcess(grapher_ptr) {} - GrapherProcessEntry(GrapherProcessEntry const &other) = default; - ~GrapherProcessEntry() = default; // Registry is reponsible for creating & deleting keeperAdminClient + RecordingGroup(RecordingGroup const& other) = default; + ~RecordingGroup() = default; - GrapherIdCard idCard; - ServiceId adminServiceId; - DataStoreAdminClient* adminClient; - bool active; - uint64_t lastStatsTime; - uint32_t activeStoryCount; - std::list> delayedExitClients; - }; + void startDelayedGrapherExit(GrapherProcessEntry&, std::time_t); + void clearDelayedExitGrapher(GrapherProcessEntry&, std::time_t); + void startDelayedKeeperExit(KeeperProcessEntry&, std::time_t); - - struct KeeperGroupEntry - { - KeeperGroupEntry( KeeperGroupId group_id, GrapherProcessEntry * grapher_ptr=nullptr) - : groupId(group_id) - , grapher_process(grapher_ptr) - { } - - KeeperGroupEntry(KeeperGroupEntry const &other) = default; - ~KeeperGroupEntry() = default; - - KeeperGroupId groupId; - GrapherProcessEntry* grapher_process; - std::map, KeeperProcessEntry*> keepers; + RecordingGroupId groupId; + GrapherProcessEntry* grapherProcess; + std::map, KeeperProcessEntry*> keeperProcesses; }; - enum RegistryState + class KeeperRegistry { - UNKNOWN = 0, INITIALIZED = 1, // RegistryService is initialized, no active keepers - RUNNING = 2, // RegistryService and active Keepers - SHUTTING_DOWN = 3 // Shutting down services - }; - -public: - KeeperRegistry(): registryState(UNKNOWN), registryEngine(nullptr), keeperRegistryService(nullptr) - {} - - ~KeeperRegistry(); + enum RegistryState + { + UNKNOWN = 0, + INITIALIZED = 1, // RegistryService is initialized, no active keepers + RUNNING = 2, // RegistryService and active Keepers + SHUTTING_DOWN = 3// Shutting down services + }; - bool is_initialized() const - { return (INITIALIZED == registryState); } + public: + KeeperRegistry() + : registryState(UNKNOWN) + , registryEngine(nullptr) + , keeperRegistryService(nullptr) + {} - bool is_running() const - { return (RUNNING == registryState); } + ~KeeperRegistry(); - bool is_shutting_down() const - { return (SHUTTING_DOWN == registryState); } + bool is_initialized() const { return (INITIALIZED == registryState); } - int InitializeRegistryService(ChronoLog::ConfigurationManager const &); + bool is_running() const { return (RUNNING == registryState); } - int ShutdownRegistryService(); + bool is_shutting_down() const { return (SHUTTING_DOWN == registryState); } - int 
registerKeeperProcess(KeeperRegistrationMsg const &keeper_reg_msg); + int InitializeRegistryService(ChronoLog::ConfigurationManager const&); - int unregisterKeeperProcess(KeeperIdCard const &keeper_id_card); + int ShutdownRegistryService(); - void updateKeeperProcessStats(KeeperStatsMsg const &keeperStatsMsg); + int registerKeeperProcess(KeeperRegistrationMsg const& keeper_reg_msg); - std::vector &getActiveKeepers(std::vector &keeper_id_cards); + int unregisterKeeperProcess(KeeperIdCard const& keeper_id_card); - int notifyKeepersOfStoryRecordingStart(std::vector&, ChronicleName const&, StoryName const&, - StoryId const&); + void updateKeeperProcessStats(KeeperStatsMsg const& keeperStatsMsg); - int notifyKeepersOfStoryRecordingStop(std::vector const &, StoryId const &); + std::vector& getActiveKeepers(std::vector& keeper_id_cards); - int registerGrapherProcess(GrapherRegistrationMsg const & reg_msg); - int unregisterGrapherProcess(GrapherIdCard const & id_card); + int notifyKeepersOfStoryRecordingStart(std::vector&, ChronicleName const&, StoryName const&, + StoryId const&); -private: + int notifyKeepersOfStoryRecordingStop(std::vector const&, StoryId const&); - KeeperRegistry(KeeperRegistry const &) = delete; //disable copying - KeeperRegistry &operator=(KeeperRegistry const &) = delete; + int registerGrapherProcess(GrapherRegistrationMsg const& reg_msg); + int unregisterGrapherProcess(GrapherIdCard const& id_card); - RegistryState registryState; - std::mutex registryLock; - std::map , KeeperProcessEntry> keeperProcessRegistry; - std::map keeperGroups; - thallium::engine*registryEngine; - KeeperRegistryService*keeperRegistryService; - size_t delayedDataAdminExitSeconds; -}; + private: + KeeperRegistry(KeeperRegistry const&) = delete;//disable copying + KeeperRegistry& operator=(KeeperRegistry const&) = delete; + RegistryState registryState; + std::mutex registryLock; + std::map, KeeperProcessEntry> keeperProcessRegistry; + std::map recordingGroups; + thallium::engine* registryEngine; + KeeperRegistryService* keeperRegistryService; + size_t delayedDataAdminExitSeconds; + }; } #endif diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index 4803ef9c..8b76a7ab 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -78,6 +78,59 @@ int KeeperRegistry::ShutdownRegistryService() registryState = SHUTTING_DOWN; LOG_INFO("[KeeperRegistry] Shutting down..."); + + while(!recordingGroups.empty()) + { + std::time_t current_time = + std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( + std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); + + for(auto group_iter = recordingGroups.begin(); group_iter != recordingGroups.end();) + { + RecordingGroup& recording_group = ((*group_iter).second); + + if(recording_group.grapherProcess != nullptr) + { + std::stringstream id_string; + id_string << recording_group.grapherProcess->idCard; + // start delayed destruction for the lingering Adminclient to be safe... + + chl::GrapherProcessEntry* grapher_process = recording_group.grapherProcess; + if(grapher_process->active) + { + // start delayed destruction for the lingering Adminclient to be safe... + recording_group.startDelayedGrapherExit(*grapher_process, delayedExitTime); + } + else + { + //check if any existing delayed exit grapher processes can be cleared... 
+ recording_group.clearDelayedExitGrapher(*grapher_process, current_time); + } + + if(grapher_process->delayedExitGrapherClients.empty()) + { + LOG_INFO("[KeeperRegistry] registerGrapherProcess has destroyed old entry for grapher {}", + id_string.str()); + delete grapher_process; + recording_group.grapherProcess = nullptr; + } + } + if(recording_group.grapherProcess == nullptr && recording_group.keeperProcesses.empty()) + { + LOG_INFO("[KeeperRegistry] recordingGroup {} is destroyed", recording_group.groupId); + group_iter = recordingGroups.erase(group_iter); + } + else + { + LOG_INFO("[KeeperRegistry] recordingGroup {} can't yet be destroyed", recording_group.groupId); + ++group_iter; + } + } + + if(!recordingGroups.empty()) { sleep(1); } + } + // send out shutdown instructions to // all active keeper processes // then drain the registry @@ -95,20 +148,27 @@ int KeeperRegistry::ShutdownRegistryService() LOG_INFO("[KeeperRegistry] Sending shutdown to keeper {}", id_string.str()); (*process_iter).second.keeperAdminClient->shutdown_collection(); + std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( + std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); + LOG_INFO("[KeeperRegistry] shutdown: starting delayedExit for keeperProcess {} current_time={} " + "delayedExitTime={}", + id_string.str(), ctime(¤t_time), std::ctime(&delayedExitTime)); + ; + + //StartKeeperDelayedExit.. (*process_iter).second.active = false; if((*process_iter).second.keeperAdminClient != nullptr) { - std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( - std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_INFO("[KeeperRegistry] shutdown: starting delayedExit for keeperProcess {} current_time={} delayedExitTime={}",id_string.str(), ctime(¤t_time), std::ctime(&delayedExitTime));; (*process_iter) .second.delayedExitClients.push_back(std::pair( delayedExitTime, (*process_iter).second.keeperAdminClient)); (*process_iter).second.keeperAdminClient = nullptr; } + //StartKeeperDelayedExit.. 
} + //ExpireKeeperDelayedExitClients while(!(*process_iter).second.delayedExitClients.empty() && (current_time >= (*process_iter).second.delayedExitClients.front().first)) { @@ -118,6 +178,8 @@ int KeeperRegistry::ShutdownRegistryService() (*process_iter).second.delayedExitClients.pop_front(); } + //ExpireKeeperDelayedExitClients + if((*process_iter).second.delayedExitClients.empty()) { LOG_INFO("[KeeperRegistry] registerKeeperProcess() destroys old keeperProcessEntry for {}",id_string.str()); @@ -158,21 +220,22 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re KeeperIdCard keeper_id_card = keeper_reg_msg.getKeeperIdCard(); ServiceId admin_service_id = keeper_reg_msg.getAdminServiceId(); - //find the group that keepr belongs to in the registry - auto keeper_group_iter = keeperGroups.find(keeper_id_card.getGroupId()); - if(keeper_group_iter == keeperGroups.end()) + //find the group that keeper belongs to in the registry + auto group_iter = recordingGroups.find(keeper_id_card.getGroupId()); + if(group_iter == recordingGroups.end()) { - auto insert_return = keeperGroups.insert(std::pair( - keeper_id_card.getGroupId(), KeeperGroupEntry(keeper_id_card.getGroupId()))); + auto insert_return = recordingGroups.insert(std::pair( + keeper_id_card.getGroupId(), RecordingGroup(keeper_id_card.getGroupId()))); if(false == insert_return.second) { - LOG_ERROR("[KeeperRegistry] registration failed for KeeperGroup {}", keeper_id_card.getGroupId()); + LOG_ERROR("[KeeperRegistry] keeper registration failed to find RecordingGroup {}", + keeper_id_card.getGroupId()); return chronolog::CL_ERR_UNKNOWN; } - else { keeper_group_iter = insert_return.first; } + else { group_iter = insert_return.first; } } - KeeperGroupEntry* keeper_group = &((*keeper_group_iter).second); + RecordingGroup* keeper_group = &((*group_iter).second); // unlikely but possible that the Registry still retains the record of the previous re-incarnation of hte Keeper process // running on the same host... check for this case and clean up the leftover record... 
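The registerKeeperProcess() hunk above looks a keeper's group up in the recordingGroups map and creates the group on first contact. For reference, the following is a minimal, self-contained sketch of that find-or-create idiom; the simplified RecordingGroup stand-in and the hard-coded group id are illustrative only, not the real ChronoVisor types or values.

#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

using RecordingGroupId = uint32_t;

// simplified stand-in for the ChronoVisor RecordingGroup class (illustrative only)
struct RecordingGroup
{
    explicit RecordingGroup(RecordingGroupId id): groupId(id) {}
    RecordingGroupId groupId;
};

int main()
{
    std::map<RecordingGroupId, RecordingGroup> recordingGroups;
    RecordingGroupId group_id = 7;   // e.g. the group id carried in a KeeperIdCard

    // find the keeper's group; create it on the first registration from that group
    auto group_iter = recordingGroups.find(group_id);
    if(group_iter == recordingGroups.end())
    {
        auto insert_return = recordingGroups.insert(
            std::pair<RecordingGroupId, RecordingGroup>(group_id, RecordingGroup(group_id)));
        if(!insert_return.second) { return 1; }   // insertion failed
        group_iter = insert_return.first;
    }

    RecordingGroup& recording_group = (*group_iter).second;
    std::cout << "keeper registered into RecordingGroup " << recording_group.groupId << std::endl;
    return 0;
}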
@@ -276,8 +339,8 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } - auto keeper_group_iter = keeperGroups.find(keeper_id_card.getGroupId()); - if(keeper_group_iter == keeperGroups.end()) { return chronolog::CL_SUCCESS; } + auto group_iter = recordingGroups.find(keeper_id_card.getGroupId()); + if(group_iter == recordingGroups.end()) { return chronolog::CL_SUCCESS; } auto keeper_process_iter = keeperProcessRegistry.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); @@ -519,20 +582,205 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector return chronolog::CL_SUCCESS; } + +//////////////////////////// + int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_msg) { + if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } + + GrapherIdCard grapher_id_card = reg_msg.getGrapherIdCard(); + RecordingGroupId group_id = grapher_id_card.getGroupId(); + ServiceId admin_service_id = reg_msg.getAdminServiceId(); + + std::lock_guard lock(registryLock); + //re-check state after ther lock is aquired + if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } + + //find the group that keeper belongs to in the registry + auto group_iter = recordingGroups.find(group_id); + if(group_iter == recordingGroups.end()) + { + auto insert_return = + recordingGroups.insert(std::pair(group_id, RecordingGroup(group_id))); + if(false == insert_return.second) + { + LOG_ERROR("[KeeperRegistry] keeper registration failed to find RecordingGroup {}", group_id); + return chronolog::CL_ERR_UNKNOWN; + } + else { group_iter = insert_return.first; } + } + + RecordingGroup& recording_group = ((*group_iter).second); + + // it is possible that the Registry still retains the record of the previous re-incarnation of the grapher process + // check for this case and clean up the leftover record... + + std::stringstream id_string; + id_string << grapher_id_card; + if(recording_group.grapherProcess != nullptr) + { + // start delayed destruction for the lingering Adminclient to be safe... + + chl::GrapherProcessEntry* grapher_process = recording_group.grapherProcess; + if(grapher_process->active) + { + // start delayed destruction for the lingering Adminclient to be safe... + + std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( + std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); + + recording_group.startDelayedGrapherExit(*grapher_process, delayedExitTime); + } + else + { + //check if any existing delayed exit grapher processes can be cleared... 
+ std::time_t current_time = + std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + recording_group.clearDelayedExitGrapher(*grapher_process, current_time); + } + + if(grapher_process->delayedExitGrapherClients.empty()) + { + LOG_INFO("[KeeperRegistry] registerGrapherProcess has destroyed old entry for grapher {}", id_string.str()); + delete grapher_process; + recording_group.grapherProcess = nullptr; + } + else + { + LOG_INFO("[KeeperRegistry] registration for Grapher{} cant's proceed as previous grapherClient isn't yet " + "dismantled", + id_string.str()); + return CL_ERR_UNKNOWN; + } + } + + //create a client of the new grapher's DataStoreAdminService listenning at adminServiceId + std::string service_na_string("ofi+sockets://"); + service_na_string = + admin_service_id.getIPasDottedString(service_na_string) + ":" + std::to_string(admin_service_id.port); + + DataStoreAdminClient* collectionClient = DataStoreAdminClient::CreateDataStoreAdminClient( + *registryEngine, service_na_string, admin_service_id.provider_id); + if(nullptr == collectionClient) + { + LOG_ERROR("[KeeperRegistry] Register Grapher {} failed to create DataStoreAdminClient for {}: provider_id={}", + id_string.str(), service_na_string, admin_service_id.provider_id); + return chronolog::CL_ERR_UNKNOWN; + } + + //now create a new GrapherProcessEntry with the new DataAdminclient + recording_group.grapherProcess = new GrapherProcessEntry(grapher_id_card, admin_service_id); + recording_group.grapherProcess->adminClient = collectionClient; + recording_group.grapherProcess->active = true; + + LOG_INFO("[KeeperRegistry] Register grapher {} created DataStoreAdminClient for {}: provider_id={}", + id_string.str(), service_na_string, admin_service_id.provider_id); + + // now that communnication with the Keeper is established and we still holding registryLock + // update registryState in case this is the first KeeperProcess registration + if(keeperProcessRegistry.size() > 0) + { + registryState = RUNNING; + LOG_INFO("[KeeperRegistry] RUNNING with {} KeeperProcesses", keeperProcessRegistry.size()); + } return chronolog::CL_SUCCESS; } +///////////////// -int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const & id_card) +int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_card) { + std::lock_guard lock(registryLock); + //check again after the lock is acquired + if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } + + auto group_iter = recordingGroups.find(grapher_id_card.getGroupId()); + if(group_iter == recordingGroups.end()) { return chronolog::CL_SUCCESS; } + + RecordingGroup& recording_group = ((*group_iter).second); + + std::stringstream id_string; + id_string << grapher_id_card; + if(recording_group.grapherProcess != nullptr && recording_group.grapherProcess->active) + { + // start delayed destruction for the lingering Adminclient to be safe... 
+ // to prevent the case of deleting the keeperAdminClient while it might be waiting for rpc response on the + // other thread + + std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( + std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); + LOG_INFO("[KeeperRegistry] grapher {} starting delayedExit for grapher {} delayedExitTime={}", id_string.str(), + std::ctime(&delayedExitTime)); + + recording_group.startDelayedGrapherExit(*(recording_group.grapherProcess), delayedExitTime); + } + // now that we are still holding registryLock + // update registryState if needed + if(!is_shutting_down() && (1 == keeperProcessRegistry.size())) + { + registryState = INITIALIZED; + LOG_INFO("[KeeperRegistry] INITIALIZED with {} KeeperProcesses", keeperProcessRegistry.size()); + } return chronolog::CL_SUCCESS; } - }//namespace chronolog +/////////////// + +void chl::RecordingGroup::startDelayedGrapherExit(chl::GrapherProcessEntry& grapher_process, + std::time_t delayedExitTime) +{ + grapher_process.active = false; + + if(grapher_process.adminClient != nullptr) + { + grapher_process.delayedExitGrapherClients.push_back( + std::pair(delayedExitTime, grapher_process.adminClient)); + grapher_process.adminClient = nullptr; + } +} + +void chl::RecordingGroup::clearDelayedExitGrapher(chl::GrapherProcessEntry& grapher_process, std::time_t current_time) +{ + while(!grapher_process.delayedExitGrapherClients.empty() && + (current_time >= grapher_process.delayedExitGrapherClients.front().first)) + { + auto dataStoreClientPair = grapher_process.delayedExitGrapherClients.front(); + LOG_DEBUG("[KeeperRegistry] recording_Group {}, destroys delayed dataAdmindClient for {}", id_string.str()); + if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } + grapher_process.delayedExitGrapherClients.pop_front(); + } + /* + + if(grapher_process->delayedExitGrapherClients.empty()) + { + LOG_DEBUG("[KeeperRegistry] recording_group {} has destroyed old entry for grapher {}", groupId, id_string.str()); + delete grapher_process; + group_entry.grapherProcess = nullptr; + } + +*/ +} + +void chl::RecordingGroup::startDelayedKeeperExit(chl::KeeperProcessEntry& keeper_process, std::time_t delayedExitTime) +{ + // we mark the keeperProcessEntry as inactive and set the time it would be safe to delete. 
+ // we delay the destruction of the keeperEntry & keeperAdminClient by 5 secs + // to prevent the case of deleting the keeperAdminClient while it might be waiting for rpc response on the + // other thread + + keeper_process.active = false; + + if(keeper_process.keeperAdminClient != nullptr) + { + + keeper_process.delayedExitClients.push_back( + std::pair(delayedExitTime, keeper_process.keeperAdminClient)); + keeper_process.keeperAdminClient = nullptr; + } +} diff --git a/chrono_common/GrapherIdCard.h b/chrono_common/GrapherIdCard.h index 101e793d..fe98922d 100644 --- a/chrono_common/GrapherIdCard.h +++ b/chrono_common/GrapherIdCard.h @@ -87,5 +87,12 @@ inline std::ostream & operator<< (std::ostream & out , chronolog::GrapherIdCard return out; } +inline std::string& operator+(std::string& a_string, chronolog::GrapherIdCard const& id_card) +{ + a_string += std::string("GrapherIdCard{") + std::to_string(id_card.getGroupId()) + ":" + + id_card.getIPasDottedString(a_string) + ":" + std::to_string(id_card.getPort()) + ":" + + std::to_string(id_card.getProviderId()) + "}"; + return a_string; +} #endif diff --git a/chrono_common/KeeperIdCard.h b/chrono_common/KeeperIdCard.h index bce1a8ce..259c81d0 100644 --- a/chrono_common/KeeperIdCard.h +++ b/chrono_common/KeeperIdCard.h @@ -84,6 +84,7 @@ inline bool operator==(chronolog::KeeperIdCard const& card1, chronolog::KeeperId && card1.getProviderId() == card2.getProviderId()) ? true : false ); } + inline std::ostream & operator<< (std::ostream & out , chronolog::KeeperIdCard const & keeper_id_card) { std::string a_string; @@ -93,5 +94,12 @@ inline std::ostream & operator<< (std::ostream & out , chronolog::KeeperIdCard c return out; } +inline std::string& operator+(std::string& a_string, chronolog::KeeperIdCard const& keeper_id_card) +{ + a_string += std::string("KeeperIdCard{") + std::to_string(keeper_id_card.getGroupId()) + ":" + + keeper_id_card.getIPasDottedString(a_string) + ":" + std::to_string(keeper_id_card.getPort()) + ":" + + std::to_string(keeper_id_card.getProviderId()) + "}"; + return a_string; +} #endif From b1ba901b380222a34066df4fefe6cabb509c57e0 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 11 Apr 2024 18:58:55 -0500 Subject: [PATCH 27/40] moved KeeperProcess map to RecordingGroup level --- ChronoVisor/include/KeeperRegistry.h | 4 +- ChronoVisor/src/KeeperRegistry.cpp | 281 ++++++++++++++------------- 2 files changed, 148 insertions(+), 137 deletions(-) diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index adb39442..63f6f336 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -105,10 +105,11 @@ class GrapherProcessEntry void startDelayedGrapherExit(GrapherProcessEntry&, std::time_t); void clearDelayedExitGrapher(GrapherProcessEntry&, std::time_t); void startDelayedKeeperExit(KeeperProcessEntry&, std::time_t); + void clearDelayedExitKeeper(KeeperProcessEntry&, std::time_t); RecordingGroupId groupId; GrapherProcessEntry* grapherProcess; - std::map, KeeperProcessEntry*> keeperProcesses; + std::map, KeeperProcessEntry> keeperProcesses; }; class KeeperRegistry @@ -162,7 +163,6 @@ class GrapherProcessEntry RegistryState registryState; std::mutex registryLock; - std::map, KeeperProcessEntry> keeperProcessRegistry; std::map recordingGroups; thallium::engine* registryEngine; KeeperRegistryService* keeperRegistryService; diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index 8b76a7ab..3fa11b12 100644 --- 
a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -94,11 +94,13 @@ int KeeperRegistry::ShutdownRegistryService() { std::stringstream id_string; id_string << recording_group.grapherProcess->idCard; - // start delayed destruction for the lingering Adminclient to be safe... chl::GrapherProcessEntry* grapher_process = recording_group.grapherProcess; if(grapher_process->active) { + LOG_INFO("[KeeperRegistry] Sending shutdown to grapher {}", id_string.str()); + if(grapher_process->adminClient != nullptr) { grapher_process->adminClient->shutdown_collection(); } + // start delayed destruction for the lingering Adminclient to be safe... recording_group.startDelayedGrapherExit(*grapher_process, delayedExitTime); } @@ -116,6 +118,44 @@ int KeeperRegistry::ShutdownRegistryService() recording_group.grapherProcess = nullptr; } } + + // send out shutdown instructions to all active keeper processes + // then start delayedExit procedure for them + for(auto process_iter = recording_group.keeperProcesses.begin(); + process_iter != recording_group.keeperProcesses.end();) + { + std::stringstream id_string; + id_string << (*process_iter).second.idCard; + + if((*process_iter).second.active) + { + LOG_INFO("[KeeperRegistry] Sending shutdown to keeper {}", id_string.str()); + (*process_iter).second.keeperAdminClient->shutdown_collection(); + + LOG_INFO("[KeeperRegistry] shutdown: starting delayedExit for keeper {} delayedExitTime={}", + id_string.str(), std::ctime(&delayedExitTime)); + recording_group.startDelayedKeeperExit((*process_iter).second, delayedExitTime); + } + else + { + LOG_INFO("[KeeperRegistry] shutdown: clear delayedAdminClient for keeper {}", id_string.str()); + recording_group.clearDelayedExitKeeper((*process_iter).second, current_time); + } + + if((*process_iter).second.delayedExitClients.empty()) + { + LOG_INFO("[KeeperRegistry] registerKeeperProcess() destroys old keeperProcessEntry for {}", + id_string.str()); + process_iter = recording_group.keeperProcesses.erase(process_iter); + } + else + { + LOG_INFO("[KeeperRegistry] registerKeeperProcess() old dataAdminClient for {} can't yet be " + "destroyed", + id_string.str()); + ++process_iter; + } + } if(recording_group.grapherProcess == nullptr && recording_group.keeperProcesses.empty()) { LOG_INFO("[KeeperRegistry] recordingGroup {} is destroyed", recording_group.groupId); @@ -131,67 +171,7 @@ int KeeperRegistry::ShutdownRegistryService() if(!recordingGroups.empty()) { sleep(1); } } - // send out shutdown instructions to - // all active keeper processes - // then drain the registry - while(!keeperProcessRegistry.empty()) - { - for(auto process_iter = keeperProcessRegistry.begin(); process_iter != keeperProcessRegistry.end();) - { - std::time_t current_time = - std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - std::stringstream id_string; - id_string << (*process_iter).second.idCard; - - if((*process_iter).second.active) - { - LOG_INFO("[KeeperRegistry] Sending shutdown to keeper {}", id_string.str()); - (*process_iter).second.keeperAdminClient->shutdown_collection(); - - std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( - std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_INFO("[KeeperRegistry] shutdown: starting delayedExit for keeperProcess {} current_time={} " - "delayedExitTime={}", - id_string.str(), ctime(¤t_time), std::ctime(&delayedExitTime)); - ; - - //StartKeeperDelayedExit.. 
- (*process_iter).second.active = false; - - if((*process_iter).second.keeperAdminClient != nullptr) - { - (*process_iter) - .second.delayedExitClients.push_back(std::pair( - delayedExitTime, (*process_iter).second.keeperAdminClient)); - (*process_iter).second.keeperAdminClient = nullptr; - } - //StartKeeperDelayedExit.. - } - - //ExpireKeeperDelayedExitClients - while(!(*process_iter).second.delayedExitClients.empty() && - (current_time >= (*process_iter).second.delayedExitClients.front().first)) - { - auto dataStoreClientPair = (*process_iter).second.delayedExitClients.front(); - LOG_INFO("[KeeperRegistry] shutdown() destroys old dataAdminClient for keeper {} delayedTime={}", id_string.str(), ctime(&(dataStoreClientPair.first))); - if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } - (*process_iter).second.delayedExitClients.pop_front(); - } - - //ExpireKeeperDelayedExitClients - if((*process_iter).second.delayedExitClients.empty()) - { - LOG_INFO("[KeeperRegistry] registerKeeperProcess() destroys old keeperProcessEntry for {}",id_string.str()); - process_iter = keeperProcessRegistry.erase(process_iter); - } - else - { - LOG_INFO("[KeeperRegistry] registerKeeperProcess() old dataAdminClient for {} can't yet be destroyed", id_string.str()); - ++process_iter; - } - } - } if(nullptr != keeperRegistryService) { delete keeperRegistryService; } return chronolog::CL_SUCCESS; @@ -235,49 +215,42 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re else { group_iter = insert_return.first; } } - RecordingGroup* keeper_group = &((*group_iter).second); + RecordingGroup& recording_group = (*group_iter).second; // unlikely but possible that the Registry still retains the record of the previous re-incarnation of hte Keeper process // running on the same host... check for this case and clean up the leftover record... - auto keeper_process_iter = keeperProcessRegistry.find( + auto keeper_process_iter = recording_group.keeperProcesses.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); std::stringstream id_string; id_string << keeper_id_card; - if(keeper_process_iter != keeperProcessRegistry.end()) + if(keeper_process_iter != recording_group.keeperProcesses.end()) { // must be a case of the KeeperProcess exiting without unregistering or some unexpected break in communication... // start delayed destruction process for hte lingering keeperAdminclient to be safe... 
- std::time_t current_time = - std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - (*keeper_process_iter).second.active = false; - - if((*keeper_process_iter).second.keeperAdminClient != nullptr) + if((*keeper_process_iter).second.active) { - std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_WARNING("[KeeperRegistry] registerKeeperProcess for keeper {} found old instance of dataAdminclient; starting delayedExit current_time={} delayedExitTime={}",id_string.str(), ctime(¤t_time), std::ctime(&delayedExitTime));; + LOG_WARNING("[KeeperRegistry] registerKeeperProcess: found old instance of dataAdminclient for {} " + "delayedExitTime={}", + id_string.str(), std::ctime(&delayedExitTime)); + ; - (*keeper_process_iter) - .second.delayedExitClients.push_back(std::pair( - delayedExitTime, (*keeper_process_iter).second.keeperAdminClient)); - (*keeper_process_iter).second.keeperAdminClient = nullptr; + recording_group.startDelayedKeeperExit((*keeper_process_iter).second, delayedExitTime); } - - while(!(*keeper_process_iter).second.delayedExitClients.empty() && - (current_time >= (*keeper_process_iter).second.delayedExitClients.front().first)) + else { - auto dataStoreClientPair = (*keeper_process_iter).second.delayedExitClients.front(); - LOG_INFO("[KeeperRegistry] registerKeeperProcess destroys delayed dataAdmindClient for {}",id_string.str()); - if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } - (*keeper_process_iter).second.delayedExitClients.pop_front(); + std::time_t current_time = + std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + LOG_INFO("[KeeperRegistry] registerKeeperProcess tries to clear dataAdmindClient for {}", id_string.str()); + recording_group.clearDelayedExitKeeper((*keeper_process_iter).second, current_time); } if((*keeper_process_iter).second.delayedExitClients.empty()) { LOG_INFO("[KeeperRegistry] registerKeeperProcess has destroyed old entry for keeper {}",id_string.str()); - keeperProcessRegistry.erase(keeper_process_iter); + recording_group.keeperProcesses.erase(keeper_process_iter); } else { @@ -302,9 +275,10 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re } //now create a new KeeperRecord with the new DataAdminclient - auto insert_return = keeperProcessRegistry.insert(std::pair , KeeperProcessEntry>( - std::pair (keeper_id_card.getIPaddr(), keeper_id_card.getPort()), KeeperProcessEntry( - keeper_id_card, admin_service_id))); + auto insert_return = + recording_group.keeperProcesses.insert(std::pair, KeeperProcessEntry>( + std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort()), + KeeperProcessEntry(keeper_id_card, admin_service_id))); if(false == insert_return.second) { LOG_ERROR("[KeeperRegistry] registration failed for Keeper {}", id_string.str()); @@ -318,12 +292,15 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re LOG_INFO("[KeeperRegistry] Register Keeper: KeeperIdCard: {} created DataStoreAdminClient for {}: provider_id={}" , id_string.str(), service_na_string, admin_service_id.provider_id); + LOG_INFO("[KeeperRegistry] RecordingGroup {} has {} keepers", recording_group.groupId, + recording_group.keeperProcesses.size()); + // now that communnication with the Keeper is established and we still holding registryLock // update registryState in case this is 
the first KeeperProcess registration - if(keeperProcessRegistry.size() > 0) - { registryState = RUNNING; - - LOG_INFO("[KeeperRegistry] RUNNING with {} KeeperProcesses", keeperProcessRegistry.size()); + if(recordingGroups.size() > 0) + { + registryState = RUNNING; + LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); } return chronolog::CL_SUCCESS; } @@ -342,9 +319,11 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) auto group_iter = recordingGroups.find(keeper_id_card.getGroupId()); if(group_iter == recordingGroups.end()) { return chronolog::CL_SUCCESS; } - auto keeper_process_iter = keeperProcessRegistry.find( + RecordingGroup& recording_group = (*group_iter).second; + + auto keeper_process_iter = recording_group.keeperProcesses.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); - if(keeper_process_iter != keeperProcessRegistry.end()) + if(keeper_process_iter != recording_group.keeperProcesses.end()) { // we mark the keeperProcessEntry as inactive and set the time it would be safe to delete. // we delay the destruction of the keeperEntry & keeperAdminClient by 5 secs @@ -353,27 +332,23 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) std::stringstream id_string; id_string << keeper_id_card; - (*keeper_process_iter).second.active = false; - - if((*keeper_process_iter).second.keeperAdminClient != nullptr) - { + std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( + std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); + LOG_INFO("[KeeperRegistry] unregisterKeeperProcess() starting delayedExit for keeper {} delayedExitTime={}", + id_string.str(), std::ctime(&delayedExitTime)); + ; + recording_group.startDelayedKeeperExit((*keeper_process_iter).second, delayedExitTime); + } - std::time_t current_time = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_INFO("[KeeperRegistry] unregisterKeeperProcess() starting delayedExit for keeper {} current_time={} delayedExitTime={}",id_string.str(), std::ctime(¤t_time), std::ctime(&delayedExitTime));; + LOG_INFO("[KeeperRegistry] RecordingGroup {} has {} keepers", recording_group.groupId, + recording_group.keeperProcesses.size()); - (*keeper_process_iter) - .second.delayedExitClients.push_back(std::pair( - delayedExitTime, (*keeper_process_iter).second.keeperAdminClient)); - (*keeper_process_iter).second.keeperAdminClient = nullptr; - } - } // now that we are still holding registryLock // update registryState if needed - if(!is_shutting_down() && (1 == keeperProcessRegistry.size())) + if(!is_shutting_down() && recordingGroups.size() == 0) { registryState = INITIALIZED; - LOG_INFO("[KeeperRegistry] INITIALIZED with {} KeeperProcesses", keeperProcessRegistry.size()); + LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); } return chronolog::CL_SUCCESS; @@ -390,9 +365,13 @@ void KeeperRegistry::updateKeeperProcessStats(KeeperStatsMsg const &keeperStatsM { return; } KeeperIdCard keeper_id_card = keeperStatsMsg.getKeeperIdCard(); - auto keeper_process_iter = keeperProcessRegistry.find( - std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); - if(keeper_process_iter == keeperProcessRegistry.end() || 
!((*keeper_process_iter).second.active)) + auto group_iter = recordingGroups.find(keeper_id_card.getGroupId()); + if(group_iter == recordingGroups.end()) { return; } + + auto keeper_process_iter = (*group_iter) + .second.keeperProcesses.find(std::pair( + keeper_id_card.getIPaddr(), keeper_id_card.getPort())); + if(keeper_process_iter == (*group_iter).second.keeperProcesses.end() || !((*keeper_process_iter).second.active)) {// however unlikely it is that the stats msg would be delivered for the keeper that's already unregistered // we should probably log a warning here... return; @@ -414,9 +393,14 @@ std::vector &KeeperRegistry::getActiveKeepers(std::vector &KeeperRegistry::getActiveKeepers(std::vector ChronicleName const& chronicle, StoryName const& story, StoryId const& storyId) { + //INNA: rework keeper& grapher assignment ... + if(!is_running()) { LOG_ERROR("[KeeperRegistry] Registry has no Keeper processes to start recording story {}", storyId); @@ -474,6 +460,9 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStart(std::vector std::vector vectorOfKeepersToNotify = vectorOfKeepers; vectorOfKeepers.clear(); + + auto keeper_processes = (*recordingGroups.begin()).second.keeperProcesses; + for(KeeperIdCard keeper_id_card: vectorOfKeepersToNotify) { DataStoreAdminClient* dataAdminClient = nullptr; @@ -485,9 +474,9 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStart(std::vector // (see unregisterKeeperProcess()) to protect us from the unfortunate case of keeperProcessEntry.dataAdminClient object being deleted // while this thread is waiting for rpc response std::lock_guard lock(registryLock); - auto keeper_process_iter = keeperProcessRegistry.find( + auto keeper_process_iter = keeper_processes.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); - if((keeper_process_iter != keeperProcessRegistry.end() && (*keeper_process_iter).second.active) && + if((keeper_process_iter != keeper_processes.end() && (*keeper_process_iter).second.active) && ((*keeper_process_iter).second.keeperAdminClient != nullptr)) { dataAdminClient = (*keeper_process_iter).second.keeperAdminClient; @@ -537,6 +526,10 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector return chronolog::CL_ERR_NO_KEEPERS; } + //INNA: rework keeper & grapher assignments.. 
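// Aside (sketch, not part of this patch): the (*recordingGroups.begin()) lookup
// used just below is the temporary assignment flagged by the INNA note above.
// With the std::map<StoryId, RecordingGroup*> activeStories member declared later
// in this series (KeeperRegistry.h, PATCH 29/40), the story's own recording group
// could be resolved instead, roughly:
//
//     auto story_iter = activeStories.find(storyId);
//     if(story_iter == activeStories.end()) { return chronolog::CL_ERR_NO_KEEPERS; }
//     auto& keeper_processes = (*story_iter).second->keeperProcesses;
//
// Only the member and error-code names are taken from this series; the control
// flow shown here is an assumption for illustration.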
+ + auto keeper_processes = (*recordingGroups.begin()).second.keeperProcesses; + size_t keepers_left_to_notify = vectorOfKeepers.size(); for(KeeperIdCard keeper_id_card: vectorOfKeepers) { @@ -549,9 +542,9 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector // (see unregisterKeeperProcess()) to protect us from the unfortunate case of keeperProcessEntry.dataAdminClient object being deleted // while this thread is waiting for rpc response std::lock_guard lock(registryLock); - auto keeper_process_iter = keeperProcessRegistry.find( + auto keeper_process_iter = keeper_processes.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); - if((keeper_process_iter != keeperProcessRegistry.end() && (*keeper_process_iter).second.active) && + if((keeper_process_iter != keeper_processes.end() && (*keeper_process_iter).second.active) && ((*keeper_process_iter).second.keeperAdminClient != nullptr)) { dataAdminClient = (*keeper_process_iter).second.keeperAdminClient; @@ -677,13 +670,15 @@ int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_ms LOG_INFO("[KeeperRegistry] Register grapher {} created DataStoreAdminClient for {}: provider_id={}", id_string.str(), service_na_string, admin_service_id.provider_id); - // now that communnication with the Keeper is established and we still holding registryLock - // update registryState in case this is the first KeeperProcess registration - if(keeperProcessRegistry.size() > 0) + LOG_INFO("[KeeperRegistry] RecordingGroup {} has a grappher and {} keepers", recording_group.groupId, + recording_group.keeperProcesses.size()); + + // now that communnication with the Grapher is established and we still holding registryLock + // update registryState in case this is the first GrapherProcess registration + if(recordingGroups.size() > 0) { registryState = RUNNING; - - LOG_INFO("[KeeperRegistry] RUNNING with {} KeeperProcesses", keeperProcessRegistry.size()); + LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); } return chronolog::CL_SUCCESS; } @@ -718,10 +713,19 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car // now that we are still holding registryLock // update registryState if needed - if(!is_shutting_down() && (1 == keeperProcessRegistry.size())) + LOG_INFO("[KeeperRegistry] RecordingGroup {} has no grappher and {} keepers", recording_group.groupId, + recording_group.keeperProcesses.size()); + + // now that communnication with the Grapher is established and we still holding registryLock + // update registryState in case this is the first GrapherProcess registration + if(recordingGroups.size() > 0) + { + LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); + } + else if(!is_shutting_down()) { registryState = INITIALIZED; - LOG_INFO("[KeeperRegistry] INITIALIZED with {} KeeperProcesses", keeperProcessRegistry.size()); + LOG_INFO("[KeeperRegistry] reverted to INITIALIZED state with {} RecordingGroups", recordingGroups.size()); } return chronolog::CL_SUCCESS; @@ -737,6 +741,8 @@ void chl::RecordingGroup::startDelayedGrapherExit(chl::GrapherProcessEntry& grap { grapher_process.active = false; + LOG_DEBUG("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, + grapher_process.idCardString.str()); if(grapher_process.adminClient != nullptr) { grapher_process.delayedExitGrapherClients.push_back( @@ -750,21 +756,12 @@ void chl::RecordingGroup::clearDelayedExitGrapher(chl::GrapherProcessEntry& grap 
while(!grapher_process.delayedExitGrapherClients.empty() && (current_time >= grapher_process.delayedExitGrapherClients.front().first)) { + LOG_DEBUG("[KeeperRegistry] recording_Group {}, destroys delayed dataAdmindClient for {}", groupId, + grapher_process.idCardString.str()); auto dataStoreClientPair = grapher_process.delayedExitGrapherClients.front(); - LOG_DEBUG("[KeeperRegistry] recording_Group {}, destroys delayed dataAdmindClient for {}", id_string.str()); if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } grapher_process.delayedExitGrapherClients.pop_front(); } - /* - - if(grapher_process->delayedExitGrapherClients.empty()) - { - LOG_DEBUG("[KeeperRegistry] recording_group {} has destroyed old entry for grapher {}", groupId, id_string.str()); - delete grapher_process; - group_entry.grapherProcess = nullptr; - } - -*/ } void chl::RecordingGroup::startDelayedKeeperExit(chl::KeeperProcessEntry& keeper_process, std::time_t delayedExitTime) @@ -776,11 +773,25 @@ void chl::RecordingGroup::startDelayedKeeperExit(chl::KeeperProcessEntry& keeper keeper_process.active = false; + LOG_DEBUG("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, + keeper_process.idCardString.str()); if(keeper_process.keeperAdminClient != nullptr) { - keeper_process.delayedExitClients.push_back( std::pair(delayedExitTime, keeper_process.keeperAdminClient)); keeper_process.keeperAdminClient = nullptr; } } + +void chl::RecordingGroup::clearDelayedExitKeeper(chl::KeeperProcessEntry& keeper_process, std::time_t current_time) +{ + while(!keeper_process.delayedExitClients.empty() && + (current_time >= keeper_process.delayedExitClients.front().first)) + { + LOG_DEBUG("[KeeperRegistry] recording_group {} destroys delayed dataAdminClient for {}", groupId, + keeper_process.idCardString.str()); + auto dataStoreClientPair = keeper_process.delayedExitClients.front(); + if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } + keeper_process.delayedExitClients.pop_front(); + } +} From e8c103f23f19776befb119fda7314c9728bf4245 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 17 Apr 2024 14:44:30 -0500 Subject: [PATCH 28/40] concept of activeGroups, uniform_group_distribution, mersene twister random generator --- ChronoVisor/include/KeeperRegistry.h | 18 +++-- ChronoVisor/src/KeeperRegistry.cpp | 109 +++++++++++++++++++-------- 2 files changed, 88 insertions(+), 39 deletions(-) diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index 63f6f336..45d4d3b0 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -1,10 +1,12 @@ #ifndef KEEPER_REGISTRY_H #define KEEPER_REGISTRY_H +#include +#include #include +#include #include -#include -#include + #include #include "chronolog_types.h" @@ -123,11 +125,7 @@ class GrapherProcessEntry }; public: - KeeperRegistry() - : registryState(UNKNOWN) - , registryEngine(nullptr) - , keeperRegistryService(nullptr) - {} + KeeperRegistry(); ~KeeperRegistry(); @@ -163,10 +161,14 @@ class GrapherProcessEntry RegistryState registryState; std::mutex registryLock; - std::map recordingGroups; thallium::engine* registryEngine; KeeperRegistryService* keeperRegistryService; size_t delayedDataAdminExitSeconds; + + std::map recordingGroups; + std::vector activeGroups; + std::mt19937 mt_random;//mersene twister random int generator + std::uniform_int_distribution group_id_distribution; }; } diff --git a/ChronoVisor/src/KeeperRegistry.cpp 
b/ChronoVisor/src/KeeperRegistry.cpp index 3fa11b12..95ffd0b7 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -50,6 +50,7 @@ int KeeperRegistry::InitializeRegistryService(ChronoLog::ConfigurationManager co keeperRegistryService = KeeperRegistryService::CreateKeeperRegistryService(*registryEngine, provider_id, *this); delayedDataAdminExitSeconds = confManager.VISOR_CONF.DELAYED_DATA_ADMIN_EXIT_IN_SECS; + registryState = INITIALIZED; status = chronolog::CL_SUCCESS; } @@ -61,6 +62,21 @@ int KeeperRegistry::InitializeRegistryService(ChronoLog::ConfigurationManager co return status; } + +KeeperRegistry::KeeperRegistry() + : registryState(UNKNOWN) + , registryEngine(nullptr) + , keeperRegistryService(nullptr) + , delayedDataAdminExitSeconds(3) +{ + // INNA: I'm using current time for seeding Mersene Twister number generator + // there are different opinions on the use of std::random_device for seeding of Mersene Twister.. + // TODO: reseach the seeding of Mersense Twister int number generator + + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//initial seed for the 32 int Mersene Twister generator +} + ///////////////// int KeeperRegistry::ShutdownRegistryService() @@ -295,13 +311,19 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re LOG_INFO("[KeeperRegistry] RecordingGroup {} has {} keepers", recording_group.groupId, recording_group.keeperProcesses.size()); - // now that communnication with the Keeper is established and we still holding registryLock - // update registryState in case this is the first KeeperProcess registration - if(recordingGroups.size() > 0) + // check if this is the first keeper for the recording group and the group is ready to be part of + // the activeGroups rotation + if(recording_group.keeperProcesses.size() == 1 && recording_group.grapherProcess != nullptr) { - registryState = RUNNING; - LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); + activeGroups.push_back(&recording_group); + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range } + + LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + if(activeGroups.size() > 0) { registryState = RUNNING; } return chronolog::CL_SUCCESS; } ///////////////// @@ -344,13 +366,24 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) recording_group.keeperProcesses.size()); // now that we are still holding registryLock - // update registryState if needed - if(!is_shutting_down() && recordingGroups.size() == 0) + // check if the keeper we've just unregistered was the only one for the recordingGroup + // and the group can't perform recording duties any longer + if(recording_group.keeperProcesses.size() == 1) { - registryState = INITIALIZED; - LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); + activeGroups.erase(std::remove(activeGroups.begin(), activeGroups.end(), &recording_group)); + if(activeGroups.size() > 0) + {//reset the group distribution + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + 
mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range + } } + LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + // update registryState if needed + if(!is_shutting_down() && activeGroups.size() == 0) { registryState = INITIALIZED; } + return chronolog::CL_SUCCESS; } ///////////////// @@ -382,7 +415,7 @@ void KeeperRegistry::updateKeeperProcessStats(KeeperStatsMsg const &keeperStatsM } ///////////////// -std::vector &KeeperRegistry::getActiveKeepers(std::vector &keeper_id_cards) +std::vector& KeeperRegistry::getActiveKeepers(std::vector& keeper_id_cards) { //the process of keeper selection will probably get more nuanced; //for now just return all the keepers registered if(is_shutting_down()) @@ -394,7 +427,10 @@ std::vector &KeeperRegistry::getActiveKeepers(std::vector &KeeperRegistry::getActiveKeepers(std::vector 0) + // now that communnication with the Grapher is established and we are still holding registryLock + // add the group to the activeGroups rotation if it's ready + if(recording_group.keeperProcesses.size() > 0 && recording_group.grapherProcess != nullptr) { - registryState = RUNNING; - LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); + activeGroups.push_back(&recording_group); + if(activeGroups.size() > 0) + {//reset the group distribution + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range + } } + + LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + // now that communnication with the Grapher is established and we still holding registryLock + // update registryState in case this is the first group registration + if(activeGroups.size() > 0) { registryState = RUNNING; } return chronolog::CL_SUCCESS; } ///////////////// @@ -716,18 +759,22 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car LOG_INFO("[KeeperRegistry] RecordingGroup {} has no grappher and {} keepers", recording_group.groupId, recording_group.keeperProcesses.size()); - // now that communnication with the Grapher is established and we still holding registryLock - // update registryState in case this is the first GrapherProcess registration - if(recordingGroups.size() > 0) - { - LOG_INFO("[KeeperRegistry] RUNNING with {} RecordingGroups ", recordingGroups.size()); - } - else if(!is_shutting_down()) + // we've just unregistered the grapher so the group can't perform recording duties any longer { - registryState = INITIALIZED; - LOG_INFO("[KeeperRegistry] reverted to INITIALIZED state with {} RecordingGroups", recordingGroups.size()); + activeGroups.erase(std::remove(activeGroups.begin(), activeGroups.end(), &recording_group)); + if(activeGroups.size() > 0) + {//reset the group distribution + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range + } } + LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", 
recordingGroups.size(), activeGroups.size()); + // update registryState in case this was the last active recordingGroup + if(activeGroups.size() == 0) { registryState = INITIALIZED; } + return chronolog::CL_SUCCESS; } From d8d3488c583347760c814991a8112b8329feb594 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 17 Apr 2024 20:42:58 -0500 Subject: [PATCH 29/40] reworked recordingGroup notifications about story Start/stop --- ChronoVisor/include/KeeperRegistry.h | 19 +- ChronoVisor/src/KeeperRegistry.cpp | 287 ++++++++++++++++++++++---- ChronoVisor/src/VisorClientPortal.cpp | 33 ++- 3 files changed, 278 insertions(+), 61 deletions(-) diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index 45d4d3b0..7b56f650 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -109,6 +109,8 @@ class GrapherProcessEntry void startDelayedKeeperExit(KeeperProcessEntry&, std::time_t); void clearDelayedExitKeeper(KeeperProcessEntry&, std::time_t); + std::vector& getActiveKeepers(std::vector& keeper_id_cards); + RecordingGroupId groupId; GrapherProcessEntry* grapherProcess; std::map, KeeperProcessEntry> keeperProcesses; @@ -145,12 +147,9 @@ class GrapherProcessEntry void updateKeeperProcessStats(KeeperStatsMsg const& keeperStatsMsg); - std::vector& getActiveKeepers(std::vector& keeper_id_cards); - - int notifyKeepersOfStoryRecordingStart(std::vector&, ChronicleName const&, StoryName const&, - StoryId const&); - - int notifyKeepersOfStoryRecordingStop(std::vector const&, StoryId const&); + int notifyRecordingGroupOfStoryRecordingStart(ChronicleName const&, StoryName const&, StoryId const&, + std::vector&); + int notifyRecordingGroupOfStoryRecordingStop(StoryId const&); int registerGrapherProcess(GrapherRegistrationMsg const& reg_msg); int unregisterGrapherProcess(GrapherIdCard const& id_card); @@ -159,6 +158,13 @@ class GrapherProcessEntry KeeperRegistry(KeeperRegistry const&) = delete;//disable copying KeeperRegistry& operator=(KeeperRegistry const&) = delete; + int notifyGrapherOfStoryRecordingStart(RecordingGroup&, ChronicleName const&, StoryName const&, StoryId const&, + uint64_t); + int notifyGrapherOfStoryRecordingStop(RecordingGroup&, StoryId const&); + int notifyKeepersOfStoryRecordingStart(RecordingGroup&, std::vector&, ChronicleName const&, + StoryName const&, StoryId const&, uint64_t); + int notifyKeepersOfStoryRecordingStop(RecordingGroup&, std::vector const&, StoryId const&); + RegistryState registryState; std::mutex registryLock; thallium::engine* registryEngine; @@ -169,6 +175,7 @@ class GrapherProcessEntry std::vector activeGroups; std::mt19937 mt_random;//mersene twister random int generator std::uniform_int_distribution group_id_distribution; + std::map activeStories; }; } diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index 95ffd0b7..7cb4ad1f 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -128,8 +128,7 @@ int KeeperRegistry::ShutdownRegistryService() if(grapher_process->delayedExitGrapherClients.empty()) { - LOG_INFO("[KeeperRegistry] registerGrapherProcess has destroyed old entry for grapher {}", - id_string.str()); + LOG_INFO("[KeeperRegistry] shudown: destroyed old entry for grapher {}", id_string.str()); delete grapher_process; recording_group.grapherProcess = nullptr; } @@ -160,26 +159,25 @@ int KeeperRegistry::ShutdownRegistryService() if((*process_iter).second.delayedExitClients.empty()) { - LOG_INFO("[KeeperRegistry] 
registerKeeperProcess() destroys old keeperProcessEntry for {}", - id_string.str()); + LOG_INFO("[KeeperRegistry] shutdown : destroys old keeperProcessEntry for {}", id_string.str()); process_iter = recording_group.keeperProcesses.erase(process_iter); } else { - LOG_INFO("[KeeperRegistry] registerKeeperProcess() old dataAdminClient for {} can't yet be " - "destroyed", + LOG_INFO("[KeeperRegistry] shutdown: old dataAdminClient for {} can't yet be destroyed", id_string.str()); ++process_iter; } } if(recording_group.grapherProcess == nullptr && recording_group.keeperProcesses.empty()) { - LOG_INFO("[KeeperRegistry] recordingGroup {} is destroyed", recording_group.groupId); + LOG_INFO("[KeeperRegistry] shutdown: recordingGroup {} is destroyed", recording_group.groupId); group_iter = recordingGroups.erase(group_iter); } else { - LOG_INFO("[KeeperRegistry] recordingGroup {} can't yet be destroyed", recording_group.groupId); + LOG_INFO("[KeeperRegistry] shutdown: recordingGroup {} can't yet be destroyed", + recording_group.groupId); ++group_iter; } } @@ -415,28 +413,22 @@ void KeeperRegistry::updateKeeperProcessStats(KeeperStatsMsg const &keeperStatsM } ///////////////// -std::vector& KeeperRegistry::getActiveKeepers(std::vector& keeper_id_cards) -{ //the process of keeper selection will probably get more nuanced; - //for now just return all the keepers registered - if(is_shutting_down()) - { return keeper_id_cards; } - - std::lock_guard lock(registryLock); - if(is_shutting_down()) - { return keeper_id_cards; } +std::vector& RecordingGroup::getActiveKeepers(std::vector& keeper_id_cards) +{ + // NOTE: RecordingGroup methods are not currently protected by lock + // the assumptions is that the caller would use RegistryLock before calling the RecordingGroup + // method + // we may decide to revisit this and introduce RecordingGroup level locks later on.. keeper_id_cards.clear(); // pick recording_group from uniform group id distribution using a random int value // generated by Mirsene Twister generator - RecordingGroup* recording_group_to_use = activeGroups[group_id_distribution(mt_random)]; - - RecordingGroup& recording_group = (*recordingGroups.begin()).second;//FIX this !!! std::time_t current_time = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - for(auto iter = recording_group.keeperProcesses.begin(); iter != recording_group.keeperProcesses.end();) + for(auto iter = keeperProcesses.begin(); iter != keeperProcesses.end();) { std::stringstream id_string; id_string << (*iter).second.idCard; @@ -461,7 +453,7 @@ std::vector& KeeperRegistry::getActiveKeepers(std::vector& KeeperRegistry::getActiveKeepers(std::vector& vectorOfKeepers) +{ + vectorOfKeepers.clear(); -int KeeperRegistry::notifyKeepersOfStoryRecordingStart(std::vector& vectorOfKeepers, - ChronicleName const& chronicle, StoryName const& story, - StoryId const& storyId) + RecordingGroup* recording_group = nullptr; + + { + //lock KeeperRegistry and choose the recording group for this story + //NOTE we only keep the lock within this paragraph... 
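As an aside, the locking discipline described in the NOTE above (take registryLock in a small scope to choose the group, release it before any RPC work) can be sketched in isolation. This is a minimal, self-contained illustration; Group, slowRpcNotify() and the globals are made-up stand-ins, not ChronoLog types:

#include <iostream>
#include <mutex>
#include <string>
#include <vector>

struct Group { std::string name; };          // stand-in for RecordingGroup

std::mutex registryLock;                     // stand-in for the registry mutex
std::vector<Group> activeGroups{{"group-0"}, {"group-1"}};

void slowRpcNotify(Group const& g)           // placeholder for the real RPC notification
{ std::cout << "notifying " << g.name << "\n"; }

int main()
{
    Group* chosen = nullptr;
    {
        // hold the lock only while inspecting the shared containers
        std::lock_guard<std::mutex> lock(registryLock);
        if(!activeGroups.empty()) { chosen = &activeGroups.front(); }
    }   // lock released here, before the potentially slow RPC call

    if(chosen != nullptr) { slowRpcNotify(*chosen); }
    return 0;
}

The real code keeps a RecordingGroup pointer across the unlock in the same way, which is why the delayed-exit queue on each process entry is needed to keep admin clients alive while an RPC is still in flight.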
+ std::lock_guard lock(registryLock); + if(!is_running()) + { + LOG_ERROR("[KeeperRegistry] Registry has no active RecordingGroups to start recording story {}", story_id); + return chronolog::CL_ERR_NO_KEEPERS; + } + + + //first check if we are already recording this story for another client and have a recording group assigned to it + + auto story_iter = activeStories.find(story_id); + + if(story_iter != activeStories.end() && (*story_iter).second != nullptr) + { + //INNA: we should probably check if the group's active status hasn't changed + //and implement group re-assignment procedure when we have recording processes dynamically coming and going.. + + recording_group = (*story_iter).second; + + if(recording_group != nullptr) { recording_group->getActiveKeepers(vectorOfKeepers); } + + return chronolog::CL_SUCCESS; + } + + // pick recording_group from the group id distribution using a random int value + // generated by Mirsene Twister generator + // NOTE: using uniform_distribution for now, we might add discrete distribution with weights later... + + recording_group = activeGroups[group_id_distribution(mt_random)]; + activeStories[story_id] = recording_group; + } + + std::time_t story_start_time = + std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + + + // the registryLock is released by this point.. + // notify Grapher and notifyKeepers functions use delayedExit logic to protect + // the rpc code from DataAdminClients being destroyed while notification is in progress.. + int rpc_return = notifyGrapherOfStoryRecordingStart(*recording_group, chronicle, story, story_id, story_start_time); + + if(rpc_return == CL_SUCCESS) + { + recording_group->getActiveKeepers(vectorOfKeepers); + rpc_return = notifyKeepersOfStoryRecordingStart(*recording_group, vectorOfKeepers, chronicle, story, story_id, + story_start_time); + } + + return rpc_return; +} + +//////////////// +int KeeperRegistry::notifyGrapherOfStoryRecordingStart(RecordingGroup& recordingGroup, ChronicleName const& chronicle, + StoryName const& story, StoryId const& storyId, + uint64_t story_start_time) +{ + int return_code = chronolog::CL_ERR_NO_KEEPERS; + + if(!is_running()) + { + LOG_ERROR("[KeeperRegistry] Registry has no active RecordingGroups to start recording story {}", storyId); + return chronolog::CL_ERR_NO_KEEPERS; + } + + DataStoreAdminClient* dataAdminClient = nullptr; + + std::stringstream id_string; + + { + // NOTE: we release the registryLock before sending rpc request so that we do not hold it for the duration of rpc communication. 
+ // We delay the destruction of unactive adminClients that might be triggered by the unregister call from a different thread + // to protect us from the unfortunate case of aminClient object being deleted + // while this thread is waiting for rpc response + std::lock_guard lock(registryLock); + + if(recordingGroup.grapherProcess != nullptr && recordingGroup.grapherProcess->active && + recordingGroup.grapherProcess->adminClient != nullptr) + { + id_string << recordingGroup.grapherProcess->idCard; + dataAdminClient = recordingGroup.grapherProcess->adminClient; + } + else + { + LOG_WARNING("[KeeperRegistry] grapher for recordingGroup {} is not available for notification", + recordingGroup.groupId); + } + } + + if(dataAdminClient == nullptr) { return return_code; } + + try + { + return_code = dataAdminClient->send_start_story_recording(chronicle, story, storyId, story_start_time); + if(return_code != CL_SUCCESS) + { + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to {}", id_string.str()); + } + else + { + LOG_INFO("[KeeperRegistry] Registry notified {} to start recording StoryID={} with StartTime={}", + id_string.str(), storyId, story_start_time); + } + } + catch(thallium::exception const& ex) + { + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to grapher {}", id_string.str()); + } + + return return_code; +} + +/////////////// +int KeeperRegistry::notifyGrapherOfStoryRecordingStop(RecordingGroup& recordingGroup, StoryId const& storyId) { - //INNA: rework keeper& grapher assignment ... + int return_code = chronolog::CL_ERR_NO_KEEPERS; if(!is_running()) { - LOG_ERROR("[KeeperRegistry] Registry has no Keeper processes to start recording story {}", storyId); + LOG_ERROR("[KeeperRegistry] Registry has no active RecordingGroups to start recording story {}", storyId); return chronolog::CL_ERR_NO_KEEPERS; } - std::chrono::time_point time_now = std::chrono::system_clock::now(); - uint64_t story_start_time = time_now.time_since_epoch().count(); + DataStoreAdminClient* dataAdminClient = nullptr; + + std::stringstream id_string; + + { + // NOTE: we release the registryLock before sending rpc request so that we do not hold it for the duration of rpc communication. 
+ // We delay the destruction of unactive adminClients that might be triggered by the unregister call from a different thread + // to protect us from the unfortunate case of aminClient object being deleted + // while this thread is waiting for rpc response + std::lock_guard lock(registryLock); + + if(recordingGroup.grapherProcess != nullptr && recordingGroup.grapherProcess->active && + recordingGroup.grapherProcess->adminClient != nullptr) + { + id_string << recordingGroup.grapherProcess->idCard; + dataAdminClient = recordingGroup.grapherProcess->adminClient; + } + else + { + LOG_WARNING("[KeeperRegistry] grapher for recordingGroup {} is not available for notification", + recordingGroup.groupId); + } + } + + if(dataAdminClient == nullptr) { return return_code; } + + try + { + return_code = dataAdminClient->send_stop_story_recording(storyId); + if(return_code != CL_SUCCESS) + { + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to {}", id_string.str()); + } + else + { + LOG_INFO("[KeeperRegistry] Registry notified grapher {} to stop recording StoryID={} ", id_string.str(), + storyId); + } + } + catch(thallium::exception const& ex) + { + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to grapher {}", id_string.str()); + } + + return return_code; +} +///////////////////// + +int KeeperRegistry::notifyKeepersOfStoryRecordingStart(RecordingGroup& recordingGroup, + std::vector& vectorOfKeepers, + ChronicleName const& chronicle, StoryName const& story, + StoryId const& storyId, uint64_t story_start_time) +{ + + // if there are no activeGroups ready for recording + // we are out of luck... + if(!is_running()) + { + LOG_ERROR("[KeeperRegistry] Registry has no active RecordingGroups to start recording story {}", storyId); + return chronolog::CL_ERR_NO_KEEPERS; + } std::vector vectorOfKeepersToNotify = vectorOfKeepers; vectorOfKeepers.clear(); - auto keeper_processes = (*recordingGroups.begin()).second.keeperProcesses; + auto keeper_processes = recordingGroup.keeperProcesses; for(KeeperIdCard keeper_id_card: vectorOfKeepersToNotify) { @@ -529,7 +700,8 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStart(std::vector } else { - LOG_INFO("[KeeperRegistry] Registry notified keeper {} to start recording StoryID={} with StartTime={}", id_string.str(), storyId, story_start_time); + LOG_INFO("[KeeperRegistry] Registry notified {} to start recording StoryID={} with StartTime={}", + id_string.str(), storyId, story_start_time); vectorOfKeepers.push_back(keeper_id_card); } } @@ -548,8 +720,53 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStart(std::vector return chronolog::CL_SUCCESS; } ///////////////// +int KeeperRegistry::notifyRecordingGroupOfStoryRecordingStop(StoryId const& story_id) +{ + RecordingGroup* recording_group = nullptr; + + std::vector vectorOfKeepers; + + { + //lock KeeperRegistry and choose the recording group for this story + //NOTE we only keep the lock within this paragraph... 
+ std::lock_guard lock(registryLock); + if(!is_running()) + { + LOG_ERROR("[KeeperRegistry] Registry has no active RecordingGroups to start recording story {}", story_id); + return chronolog::CL_ERR_NO_KEEPERS; + } -int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector const& vectorOfKeepers, + + auto story_iter = activeStories.find(story_id); + + if(story_iter == activeStories.end()) + { + //we don't know of this story + return CL_SUCCESS; + } + + recording_group = (*story_iter).second; + + if(recording_group != nullptr) { recording_group->getActiveKeepers(vectorOfKeepers); } + + activeStories.erase(story_id); + } + + if(recording_group != nullptr) + { + // the registryLock is released by this point.. + // notify Grapher and notifyKeepers functions use delayedExit logic to protect + // the rpc code from DataAdminClients being destroyed while notification is in progress.. + notifyGrapherOfStoryRecordingStop(*recording_group, story_id); + + notifyKeepersOfStoryRecordingStop(*recording_group, vectorOfKeepers, story_id); + } + + return CL_SUCCESS; +} +////////////// +int KeeperRegistry::notifyKeepersOfStoryRecordingStop(RecordingGroup& recordingGroup, + std::vector const& vectorOfKeepers, StoryId const& storyId) { if(!is_running()) @@ -558,9 +775,7 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector return chronolog::CL_ERR_NO_KEEPERS; } - //INNA: rework keeper & grapher assignments.. - - auto keeper_processes = (*recordingGroups.begin()).second.keeperProcesses; + auto keeper_processes = recordingGroup.keeperProcesses; size_t keepers_left_to_notify = vectorOfKeepers.size(); for(KeeperIdCard keeper_id_card: vectorOfKeepers) @@ -596,7 +811,7 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(std::vector } else { - LOG_INFO("[KeeperRegistry] Registry notified keeper {} to stop recording story {}", id_string.str(), storyId); + LOG_INFO("[KeeperRegistry] Registry notified {} to stop recording story {}", id_string.str(), storyId); } } catch(thallium::exception const& ex) diff --git a/ChronoVisor/src/VisorClientPortal.cpp b/ChronoVisor/src/VisorClientPortal.cpp index fab6dd68..090e4ef2 100644 --- a/ChronoVisor/src/VisorClientPortal.cpp +++ b/ChronoVisor/src/VisorClientPortal.cpp @@ -206,16 +206,14 @@ chronolog::VisorClientPortal::AcquireStory(chl::ClientId const &client_id, std:: { return chronolog::AcquireStoryResponseMsg(CL_ERR_NOT_AUTHORIZED, story_id, recording_keepers); } int ret = CL_ERR_UNKNOWN; - //ret = chronicleMetaDirectory.create_story(chronicle_name, story_name, attrs); - - //if (ret != CL_SUCCESS && ret != CL_ERR_STORY_EXISTS) - //{ return chronolog::AcquireStoryResponseMsg(ret, story_id, recording_keepers); } bool notify_keepers = false; + ret = chronicleMetaDirectory.acquire_story(client_id, chronicle_name, story_name, attrs, flags, story_id , notify_keepers); if(ret != chronolog::CL_SUCCESS) { + // return the error with the empty recording_keepers vector return chronolog::AcquireStoryResponseMsg(ret, story_id, recording_keepers); } else @@ -224,21 +222,20 @@ chronolog::VisorClientPortal::AcquireStory(chl::ClientId const &client_id, std:: , getpid(), client_id, chronicle_name.c_str(), story_name.c_str(), flags); } - recording_keepers = theKeeperRegistry->getActiveKeepers(recording_keepers); - // if this is the first client to acquire this story we need to notify the recording Keepers + // if this is the first client to acquire this story we need to choose an active recording group + // for the new story and notify the recording Keepers & Graphers // 
so that they are ready to start recording this story - if(notify_keepers) + + if(chronolog::CL_SUCCESS != theKeeperRegistry->notifyRecordingGroupOfStoryRecordingStart( + chronicle_name, story_name, story_id, recording_keepers)) { - if(chronolog::CL_SUCCESS != - theKeeperRegistry->notifyKeepersOfStoryRecordingStart(recording_keepers, chronicle_name, story_name - , story_id)) - { // RPC notification to the keepers might have failed, release the newly acquired story - chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id, notify_keepers); - //we do know that there's no need notify keepers of the story ending in this case as it hasn't started... - //return CL_ERR_NO_KEEPERS; - return chronolog::AcquireStoryResponseMsg(chronolog::CL_ERR_NO_KEEPERS, story_id, recording_keepers); - } + // RPC notification to the keepers might have failed, release the newly acquired story + chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id, notify_keepers); + //we do know that there's no need notify keepers of the story ending in this case as it hasn't started... + recording_keepers.clear(); + return chronolog::AcquireStoryResponseMsg(chronolog::CL_ERR_NO_KEEPERS, story_id, recording_keepers); } + return chronolog::AcquireStoryResponseMsg(chronolog::CL_SUCCESS, story_id, recording_keepers); } @@ -261,9 +258,7 @@ int chronolog::VisorClientPortal::ReleaseStory(chl::ClientId const &client_id, s if(notify_keepers && theKeeperRegistry->is_running()) { - std::vector recording_keepers; - theKeeperRegistry->notifyKeepersOfStoryRecordingStop(theKeeperRegistry->getActiveKeepers(recording_keepers) - , story_id); + theKeeperRegistry->notifyRecordingGroupOfStoryRecordingStop(story_id); } return chronolog::CL_SUCCESS; } From 25994b9756703bdeda41343bca487ace0e7ccce7 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Wed, 10 Apr 2024 10:32:58 -0500 Subject: [PATCH 30/40] Fix typo in help page --- deploy/single_user_deploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index bcd1daa4..4a9c9a88 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -411,7 +411,7 @@ usage() { -c|--client CLIENT_BIN -s|--visor_hosts VISOR_HOSTS -p|--keeper_hosts KEEPER_HOSTS - -r|--client_hosts CLIENT_HOSTS + -t|--client_hosts CLIENT_HOSTS -f|--conf_file CONF_FILE -j|--job_id JOB_ID -h|--help Print this page" From cb8303872bd4620afa96b3e206cbd1cd19d58bc0 Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Thu, 11 Apr 2024 15:06:16 -0500 Subject: [PATCH 31/40] Add -l for local deployment, and -e for verbose output Change from getent to dig to get IP from hostname Rename HOSTNAME_HS_NET_POSTFIX to HOSTNAME_HS_NET_SUFFIX' --- deploy/single_user_deploy.sh | 156 +++++++++++++++++++++++++++-------- 1 file changed, 121 insertions(+), 35 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 4a9c9a88..7074380d 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -25,11 +25,13 @@ CLIENT_ARGS="--config ${CONF_FILE}" VISOR_HOSTS="${CONF_DIR}/hosts_visor" KEEPER_HOSTS="${CONF_DIR}/hosts_keeper" CLIENT_HOSTS="${CONF_DIR}/hosts_client" -HOSTNAME_HS_NET_POSTFIX="-40g" +HOSTNAME_HS_NET_SUFFIX="-40g" JOB_ID="" install=false deploy=false +local=true reset=false +verbose=false check_hosts_files() { echo -e "${INFO}Checking hosts files...${NC}" @@ -50,6 +52,11 @@ check_hosts_files() { echo -e "${ERR}${CLIENT_HOSTS} host file does not exist, exiting 
...${NC}" exit 1 fi + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Check hosts files done${NC}" + fi } check_bin_files() { @@ -71,6 +78,11 @@ check_bin_files() { echo -e "${ERR}${CLIENT_BIN} executable file does not exist, exiting ...${NC}" exit 1 fi + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Check binary files done${NC}" + fi } check_conf_files() { @@ -96,6 +108,11 @@ check_conf_files() { echo -e "${ERR}mismatched VisorKeeperRegistryService conf in ${CONF_FILE}, exiting ...${NC}" exit 1 fi + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Check conf files done${NC}" + fi } extract_shared_libraries() { @@ -141,16 +158,21 @@ copy_shared_libs() { copy_shared_libs_recursive ${lib} ${LIB_DIR} fi done + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Copy shared library done${NC}" + fi } get_host_ip() { local hostname=$1 local host_ip="" - if [[ ${hostname} == *${HOSTNAME_HS_NET_POSTFIX} ]] + if [[ ${hostname} == *${HOSTNAME_HS_NET_SUFFIX} ]] then - host_ip=$(getent hosts ${hostname} | awk '{print $1}' | head -1) + host_ip=$(dig -4 ${hostname} | grep "^${hostname}" | awk '{print $5}') else - host_ip=$(getent hosts ${hostname}${HOSTNAME_HS_NET_POSTFIX} | awk '{print $1}' | head -1) + host_ip=$(dig -4 ${hostname}${HOSTNAME_HS_NET_SUFFIX} | grep "^${hostname}${HOSTNAME_HS_NET_SUFFIX}" | awk '{print $5}') fi echo "${host_ip}" } @@ -168,17 +190,28 @@ update_visor_ip() { jq ".chrono_client.VisorClientPortalService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_visor.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_keeper.VisorKeeperRegistryService.rpc.service_ip = \"${visor_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Update ChronoVisor IP done${NC}" + fi } generate_conf_for_each_keeper() { for keeper_host in $(cat ${KEEPER_HOSTS} | awk '{print $1}') do + remote_keeper_host_name=$(ssh ${keeper_host} hostname) keeper_ip=$(get_host_ip ${keeper_host}) - echo -e "${INFO}Generating conf file for Keeper ${keeper_host} ...${NC}" + echo -e "${INFO}Generating conf file for ChronoKeeper ${remote_keeper_hostname} ...${NC}" jq ".chrono_keeper.KeeperDataStoreAdminService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} jq ".chrono_keeper.KeeperRecordingService.rpc.service_ip = \"${keeper_ip}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE} - jq ".chrono_keeper.Logging.log.file = \"chronokeeper_logfile.txt.${keeper_host}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE}.${keeper_host} + jq ".chrono_keeper.Logging.log.file = \"chronokeeper_logfile.txt.${remote_keeper_hostname}\"" ${CONF_FILE} > tmp.json && mv tmp.json ${CONF_FILE}.${remote_keeper_hostname} done + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Generate conf file for ChronoKeepers done${NC}" + fi } install() { @@ -194,7 +227,15 @@ install() { update_visor_ip - generate_conf_for_each_keeper + if [[ "${local}" == "false" ]] + then + generate_conf_for_each_keeper + fi + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Install done${NC}" + fi } deploy() { @@ -202,77 +243,106 @@ deploy() { echo -e "${INFO}Deploying ...${NC}" + local hostname_suffix="" + local simple_output_grep_keyword="" + + if [[ "${local}" == "false" ]] + then + # use hostname suffix conf file only on Ares + hostname_suffix=".\$(hostname)" + if [[ 
"${verbose}" == "false" ]] + then + # grep only on Ares with simple output + simple_output_grep_keyword="ares-" + fi + fi + # launch Visor - echo -e "${DEBUG}Lauching ChronoVisor ...${NC}" + echo -e "${DEBUG}Launching ChronoVisor ...${NC}" VISOR_BIN="${VISOR_BIN_DIR}/${VISOR_BIN_FILE_NAME}" VISOR_ARGS="--config ${CONF_FILE}" - mpssh -f ${VISOR_HOSTS} "cd ${VISOR_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 + mpssh -f ${VISOR_HOSTS} "cd ${VISOR_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${VISOR_BIN} ${VISOR_ARGS} > ${VISOR_BIN_FILE_NAME}${hostname_suffix}.log 2>&1 &" | grep "${simple_output_grep_keyword}" 2>&1 # launch Keeper - echo -e "${DEBUG}Lauching ChronoKeeper ...${NC}" + echo -e "${DEBUG}Launching ChronoKeeper ...${NC}" KEEPER_BIN="${KEEPER_BIN_DIR}/${KEEPER_BIN_FILE_NAME}" - KEEPER_ARGS="--config ${CONF_FILE}" - mpssh -f ${KEEPER_HOSTS} "cd ${KEEPER_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS}.\$(hostname) > ${KEEPER_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 + KEEPER_ARGS="--config ${CONF_FILE}${hostname_suffix}" + mpssh -f ${KEEPER_HOSTS} "cd ${KEEPER_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${KEEPER_BIN} ${KEEPER_ARGS} > ${KEEPER_BIN_FILE_NAME}${hostname_suffix}.log 2>&1 &" | grep "${simple_output_grep_keyword}" 2>&1 # launch Client - echo -e "${DEBUG}Lauching Client ...${NC}" + echo -e "${DEBUG}Launching Client ...${NC}" CLIENT_BIN="${CLIENT_BIN_DIR}/${CLIENT_BIN_FILE_NAME}" - CLIENT_ARGS="--config ${CONF_FILE}" - mpssh -f ${CLIENT_HOSTS} "cd ${CLIENT_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS}.\$(hostname) > ${CLIENT_BIN_FILE_NAME}.\$(hostname) 2>&1 &" | grep ares- 2>&1 + CLIENT_ARGS="--config ${CONF_FILE}${hostname_suffix}" + mpssh -f ${CLIENT_HOSTS} "cd ${CLIENT_BIN_DIR}; LD_LIBRARY_PATH=${LIB_DIR} nohup ${CLIENT_BIN} ${CLIENT_ARGS} > ${CLIENT_BIN_FILE_NAME}${hostname_suffix}.log 2>&1 &" | grep "${simple_output_grep_keyword}" 2>&1 # check Visor - echo -e "${DEBUG}Checking ChronoVisor ...${NC}" - mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}Running ChronoVisor (only one is expected):${NC}" + mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # check Keeper - echo -e "${DEBUG}Checking ChronoKeeper ...${NC}" - mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}Running ChronoKeepers:${NC}" + mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # check Client - echo -e "${DEBUG}Checking Client ...${NC}" - mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}Running Client (may ended already):${NC}" + mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Deploy done${NC}" + fi } reset() { + echo -e "${INFO}Resetting ...${NC}" + if [[ -z ${JOB_ID} ]] then - echo -e "${INFO}No JOB_ID provided, use hosts files in ${CONF_DIR}${NC}" + echo -e "${DEBUG}No JOB_ID provided, use hosts files in ${CONF_DIR}${NC}" check_hosts_files else - echo -e "${INFO}JOB_ID is provided, prepare hosts file first${NC}" + echo -e "${DEBUG}JOB_ID is provided, prepare hosts file first${NC}" prepare_hosts fi - echo -e "${INFO}Resetting ...${NC}" + if [[ 
"${local}" == "false" && "${verbose}" == "false" ]] + then + # grep only on Ares with simple output + simple_output_grep_keyword="ares-" + fi # kill Visor echo -e "${DEBUG}Killing ChronoVisor ...${NC}" - mpssh -f ${VISOR_HOSTS} "pkill --signal 9 -f ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 + mpssh -f ${VISOR_HOSTS} "pkill --signal 9 -ef ${VISOR_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # kill Keeper echo -e "${DEBUG}Killing ChronoKeeper ...${NC}" - mpssh -f ${KEEPER_HOSTS} "pkill --signal 9 -f ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 + mpssh -f ${KEEPER_HOSTS} "pkill --signal 9 -ef ${KEEPER_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # kill Client echo -e "${DEBUG}Killing Client ...${NC}" - mpssh -f ${CLIENT_HOSTS} "pkill --signal 9 -f ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 + mpssh -f ${CLIENT_HOSTS} "pkill --signal 9 -ef ${CLIENT_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # check Visor - echo -e "${DEBUG}Checking ChronoVisor ...${NC}" - mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}ChronoVisor left behind:${NC}" + mpssh -f ${VISOR_HOSTS} "pgrep -fla ${VISOR_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # check Keeper - echo -e "${DEBUG}Checking ChronoKeeper ...${NC}" - mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}ChronoKeeper left behind:${NC}" + mpssh -f ${KEEPER_HOSTS} "pgrep -fla ${KEEPER_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 # check Client - echo -e "${DEBUG}Checking Client ...${NC}" - mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep ares- 2>&1 + echo -e "${DEBUG}Client left behind:${NC}" + mpssh -f ${CLIENT_HOSTS} "pgrep -fla ${CLIENT_BIN_FILE_NAME}" | grep -v ssh | grep "${simple_output_grep_keyword}" 2>&1 + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Reset done${NC}" + fi } parse_args() { - TEMP=$(getopt -o w:v:k:c:s:p:t:f:j:hidr --long work_dir:visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,conf_file:,job_id:,help,install,deploy,reset -- "$@") + TEMP=$(getopt -o w:v:k:c:s:p:t:f:j:hidlre --long work_dir:visor:,keeper:,client:,visor_hosts:,keeper_hosts:,client_hosts:,conf_file:,job_id:,help,install,deploy,local,reset,verbose -- "$@") if [ $? 
!= 0 ] ; then echo -e "${ERR}Terminating ...${NC}" >&2 ; exit 1 ; fi @@ -341,9 +411,16 @@ parse_args() { -d|--deploy) deploy=true shift ;; + -l|--local) + local=true + HOSTNAME_HS_NET_SUFFIX="" + shift ;; -r|--reset) reset=true shift ;; + -e|--verbose) + verbose=true + shift ;; --) shift; break ;; *) @@ -372,6 +449,8 @@ parse_args() { } prepare_hosts() { + echo -e "${INFO}Preparing hosts files ...${NC}" + if [ -n "$SLURM_JOB_ID" ] then echo -e "${DEBUG}Launched as a SLURM job, getting hosts from job ${SLURM_JOB_ID} ...${NC}" @@ -399,12 +478,18 @@ prepare_hosts() { fi check_hosts_files fi + + if [[ "${verbose}" == "true" ]] + then + echo -e "${DEBUG}Prepare hosts file done${NC}" + fi } usage() { - echo "Usage: $0 -i|--install Prepare ChronoLog deployment + echo "Usage: $0 -i|--install Re-prepare ChronoLog deployment -d|--deploy Start ChronoLog deployment -r|--reset Reset ChronoLog deployment + -l|--local Local install/deployment/reset -w|--work_dir WORK_DIR -v|--visor VISOR_BIN -k|--keeper KEEPER_BIN @@ -414,6 +499,7 @@ usage() { -t|--client_hosts CLIENT_HOSTS -f|--conf_file CONF_FILE -j|--job_id JOB_ID + -e|--verbose Enable verbose output -h|--help Print this page" exit 1 } From 6cec659ba6cf7c48997a9f36a63064d62582800c Mon Sep 17 00:00:00 2001 From: Kun Feng Date: Thu, 11 Apr 2024 15:13:23 -0500 Subject: [PATCH 32/40] Print default values in usage output --- deploy/single_user_deploy.sh | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/deploy/single_user_deploy.sh b/deploy/single_user_deploy.sh index 7074380d..5d02c062 100755 --- a/deploy/single_user_deploy.sh +++ b/deploy/single_user_deploy.sh @@ -29,7 +29,7 @@ HOSTNAME_HS_NET_SUFFIX="-40g" JOB_ID="" install=false deploy=false -local=true +local=false reset=false verbose=false @@ -486,20 +486,20 @@ prepare_hosts() { } usage() { - echo "Usage: $0 -i|--install Re-prepare ChronoLog deployment - -d|--deploy Start ChronoLog deployment - -r|--reset Reset ChronoLog deployment - -l|--local Local install/deployment/reset - -w|--work_dir WORK_DIR - -v|--visor VISOR_BIN - -k|--keeper KEEPER_BIN - -c|--client CLIENT_BIN - -s|--visor_hosts VISOR_HOSTS - -p|--keeper_hosts KEEPER_HOSTS - -t|--client_hosts CLIENT_HOSTS - -f|--conf_file CONF_FILE - -j|--job_id JOB_ID - -e|--verbose Enable verbose output + echo "Usage: $0 -i|--install Re-prepare ChronoLog deployment (default: false) + -d|--deploy Start ChronoLog deployment (default: false) + -r|--reset Reset/cleanup ChronoLog deployment (default: false) + -l|--local Local install/deployment/reset (default: false) + -w|--work_dir WORK_DIR (default: ~/chronolog) + -v|--visor VISOR_BIN (default: work_dir/bin/chronovisor_server) + -k|--keeper KEEPER_BIN (default: work_dir/bin/chrono_keeper) + -c|--client CLIENT_BIN (default: work_dir/bin/client_lib_multi_storytellers) + -s|--visor_hosts VISOR_HOSTS (default: work_dir/conf/hosts_visor) + -p|--keeper_hosts KEEPER_HOSTS (default: work_dir/conf/hosts_keeper) + -t|--client_hosts CLIENT_HOSTS (default: work_dir/conf/hosts_client) + -f|--conf_file CONF_FILE (default: work_dir/conf/default_conf.json) + -j|--job_id JOB_ID (default: "") + -e|--verbose Enable verbose output (default: false) -h|--help Print this page" exit 1 } From df8202780b8371d8747e6fd1b287110c7fa5bd05 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Wed, 27 Mar 2024 13:54:41 -0500 Subject: [PATCH 33/40] Grapher Configuration and GrapherRegClient --- ChronoGrapher/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git 
a/ChronoGrapher/CMakeLists.txt b/ChronoGrapher/CMakeLists.txt index 2e8b2bb6..3ab1fd66 100644 --- a/ChronoGrapher/CMakeLists.txt +++ b/ChronoGrapher/CMakeLists.txt @@ -16,6 +16,7 @@ target_sources(chrono_grapher PRIVATE StoryPipeline.cpp KeeperDataStore.cpp ../chrono_common/StoryChunk.cpp + ../chrono_common/ConfigurationManager.cpp StoryChunkExtractor.cpp CSVFileChunkExtractor.cpp ../ChronoAPI/ChronoLog/src/log.cpp) From dd4dd214f7edb46a897c661980b28e435927b3f5 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 18 Apr 2024 16:47:49 -0500 Subject: [PATCH 34/40] HOTFIX: keeper chunk_filename=storyId.chunkStartTime.keeperIP.port.csv --- ChronoKeeper/CSVFileChunkExtractor.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ChronoKeeper/CSVFileChunkExtractor.cpp b/ChronoKeeper/CSVFileChunkExtractor.cpp index 8050cdb6..3ef107cc 100644 --- a/ChronoKeeper/CSVFileChunkExtractor.cpp +++ b/ChronoKeeper/CSVFileChunkExtractor.cpp @@ -23,14 +23,18 @@ chronolog::CSVFileStoryChunkExtractor::~CSVFileStoryChunkExtractor() void chronolog::CSVFileStoryChunkExtractor::processStoryChunk(chronolog::StoryChunk*story_chunk) { std::ofstream chunk_fstream; + + // chunk_filename: /rootDirectory/storyId.chunkStartTime.keeperIP.port.csv + std::string chunk_filename(rootDirectory); + chunk_filename += "/" + std::to_string(story_chunk->getStoryId()) + "." + std::to_string(story_chunk->getStartTime() / 1000000000) + "."; keeperIdCard.getIPasDottedString(chunk_filename); - chunk_filename += "." + std::to_string(story_chunk->getStoryId()) + "." + - std::to_string(story_chunk->getStartTime() / 1000000000) + ".csv"; + chunk_filename += "." + std::to_string(keeperIdCard.getPort()) + ".csv"; tl::xstream es = tl::xstream::self(); LOG_INFO("[CSVFileStoryChunkExtractor] Processing StoryChunk: ES={}, ULT={}, StoryID={}, StartTime={}", es.get_rank() , tl::thread::self_id(), story_chunk->getStoryId(), story_chunk->getStartTime()); + // current thread if the only one that has this storyChunk and the only one that's writing to this chunk csv file // thus no additional locking is needed ... chunk_fstream.open(chunk_filename, std::ofstream::out|std::ofstream::app); From d72754760d86c484a3360ce152b4da88b37d2f5a Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Tue, 23 Apr 2024 15:18:51 -0500 Subject: [PATCH 35/40] added operator +=(string,..) 
for ServiceId, KeeperIdCard, GrapherIdCard --- chrono_common/ConfigurationManager.h | 1 - chrono_common/GrapherIdCard.h | 2 +- chrono_common/KeeperIdCard.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/chrono_common/ConfigurationManager.h b/chrono_common/ConfigurationManager.h index 41300d14..a488f0c0 100644 --- a/chrono_common/ConfigurationManager.h +++ b/chrono_common/ConfigurationManager.h @@ -867,7 +867,6 @@ class ConfigurationManager void parseClientConf(json_object*json_conf) { - const char*string_value = json_object_get_string(json_conf); json_object_object_foreach(json_conf, key, val) { if(strcmp(key, "VisorClientPortalService") == 0) diff --git a/chrono_common/GrapherIdCard.h b/chrono_common/GrapherIdCard.h index fe98922d..df67fd3a 100644 --- a/chrono_common/GrapherIdCard.h +++ b/chrono_common/GrapherIdCard.h @@ -87,7 +87,7 @@ inline std::ostream & operator<< (std::ostream & out , chronolog::GrapherIdCard return out; } -inline std::string& operator+(std::string& a_string, chronolog::GrapherIdCard const& id_card) +inline std::string& operator+= (std::string& a_string, chronolog::GrapherIdCard const& id_card) { a_string += std::string("GrapherIdCard{") + std::to_string(id_card.getGroupId()) + ":" + id_card.getIPasDottedString(a_string) + ":" + std::to_string(id_card.getPort()) + ":" + diff --git a/chrono_common/KeeperIdCard.h b/chrono_common/KeeperIdCard.h index 259c81d0..369bcc1e 100644 --- a/chrono_common/KeeperIdCard.h +++ b/chrono_common/KeeperIdCard.h @@ -94,7 +94,7 @@ inline std::ostream & operator<< (std::ostream & out , chronolog::KeeperIdCard c return out; } -inline std::string& operator+(std::string& a_string, chronolog::KeeperIdCard const& keeper_id_card) +inline std::string& operator+= (std::string& a_string, chronolog::KeeperIdCard const& keeper_id_card) { a_string += std::string("KeeperIdCard{") + std::to_string(keeper_id_card.getGroupId()) + ":" + keeper_id_card.getIPasDottedString(a_string) + ":" + std::to_string(keeper_id_card.getPort()) + ":" + From 4fd6c2b53893c063e4146b808a64e74280fd2bfc Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Tue, 23 Apr 2024 19:01:06 -0500 Subject: [PATCH 36/40] better tracking of recording group activity status --- ChronoVisor/include/ChronicleMetaDirectory.h | 6 +- ChronoVisor/include/KeeperRegistry.h | 18 +- ChronoVisor/src/ChronicleMetaDirectory.cpp | 55 +---- ChronoVisor/src/KeeperRegistry.cpp | 214 +++++++++++-------- ChronoVisor/src/VisorClientPortal.cpp | 18 +- chrono_common/ServiceId.h | 9 +- 6 files changed, 153 insertions(+), 167 deletions(-) diff --git a/ChronoVisor/include/ChronicleMetaDirectory.h b/ChronoVisor/include/ChronicleMetaDirectory.h index 3474573f..ae302711 100644 --- a/ChronoVisor/include/ChronicleMetaDirectory.h +++ b/ChronoVisor/include/ChronicleMetaDirectory.h @@ -32,17 +32,15 @@ class ChronicleMetaDirectory int destroy_chronicle(const std::string &name); - //int create_story(std::string const& chronicle_name, const std::string& story_name, - // const std::unordered_map& attrs); int destroy_story(std::string const &chronicle_name, const std::string &story_name); int acquire_story(chronolog::ClientId const &client_id, const std::string &chronicle_name, const std::string &story_name - , const std::unordered_map &attrs, int &flags, StoryId &, bool &); + , const std::unordered_map &attrs, int &flags, StoryId &); int release_story(chronolog::ClientId const &client_id, const std::string &chronicle_name, const std::string &story_name - , StoryId &, bool &); + , StoryId &); int 
get_chronicle_attr(std::string const &name, const std::string &key, std::string &value); diff --git a/ChronoVisor/include/KeeperRegistry.h b/ChronoVisor/include/KeeperRegistry.h index 7b56f650..1954bfee 100644 --- a/ChronoVisor/include/KeeperRegistry.h +++ b/ChronoVisor/include/KeeperRegistry.h @@ -43,18 +43,11 @@ class KeeperProcessEntry , active(false) , lastStatsTime(0) , activeStoryCount(0) - {} - - KeeperProcessEntry(KeeperProcessEntry const& other) = default; - - void reset() { - keeperAdminClient = nullptr; - active = false; - lastStatsTime = 0; - activeStoryCount = 0; + idCardString += idCard; } + KeeperProcessEntry(KeeperProcessEntry const& other) = default; ~KeeperProcessEntry() = default;// Registry is reponsible for creating & deleting keeperAdminClient KeeperIdCard idCard; @@ -77,7 +70,9 @@ class GrapherProcessEntry , active(false) , lastStatsTime(0) , activeStoryCount(0) - {} + { + idCardString += idCard; + } GrapherProcessEntry(GrapherProcessEntry const& other) = default; ~GrapherProcessEntry() = default;// Registry is reponsible for creating & deleting keeperAdminClient @@ -98,12 +93,14 @@ class GrapherProcessEntry public: RecordingGroup(RecordingGroupId group_id, GrapherProcessEntry* grapher_ptr = nullptr) : groupId(group_id) + , activeKeeperCount(0) , grapherProcess(grapher_ptr) {} RecordingGroup(RecordingGroup const& other) = default; ~RecordingGroup() = default; + bool isActive() const; void startDelayedGrapherExit(GrapherProcessEntry&, std::time_t); void clearDelayedExitGrapher(GrapherProcessEntry&, std::time_t); void startDelayedKeeperExit(KeeperProcessEntry&, std::time_t); @@ -112,6 +109,7 @@ class GrapherProcessEntry std::vector& getActiveKeepers(std::vector& keeper_id_cards); RecordingGroupId groupId; + size_t activeKeeperCount; GrapherProcessEntry* grapherProcess; std::map, KeeperProcessEntry> keeperProcesses; }; diff --git a/ChronoVisor/src/ChronicleMetaDirectory.cpp b/ChronoVisor/src/ChronicleMetaDirectory.cpp index 0b02f1ab..29146b34 100644 --- a/ChronoVisor/src/ChronicleMetaDirectory.cpp +++ b/ChronoVisor/src/ChronicleMetaDirectory.cpp @@ -171,53 +171,6 @@ int ChronicleMetaDirectory::destroy_chronicle(const std::string &name) } } -/** - * Create a Story - * @param chronicle_name: name of the Chronicle that the Story belongs to - * @param story_name: name of the Story - * @param attrs: attributes associated with the Story - * @return CL_SUCCESS if succeed to create the Story \n - * CL_ERR_NOT_EXIST if the Chronicle does not exist \n - * CL_ERR_STORY_EXISTS if a Story with the same name already exists \n - * CL_ERR_UNKNOWN otherwise - */ -/*int ChronicleMetaDirectory::create_story(std::string const& chronicle_name, - const std::string &story_name, - const std::unordered_map &attrs) { - LOG_DEBUG("creating Story name=%s in Chronicle name=%s", story_name.c_str(), chronicle_name.c_str()); - std::chrono::steady_clock::time_point t1, t2; - t1 = std::chrono::steady_clock::now(); - std::lock_guard chronicleMapLock(g_chronicleMetaDirectoryMutex_); - // First check if Chronicle exists, fail if false - uint64_t cid; -// auto name2IdRecord = chronicleName2IdMap_->find(chronicle_name); -// if (name2IdRecord != chronicleName2IdMap_->end()) { -// cid = name2IdRecord->second; - cid = CityHash64(chronicle_name.c_str(), chronicle_name.length()); - auto chronicleMapRecord = chronicleMap_->find(cid); - if (chronicleMapRecord != chronicleMap_->end()) { - Chronicle *pChronicle = chronicleMap_->find(cid)->second; - LOG_DEBUG("Chronicle@%p", &(*pChronicle)); - //TODO: check if the 
story exists and handle it gracefully - uint64_t sid = pChronicle->getStoryId(story_name); - if (sid> 0) { - LOG_DEBUG("StoryID=%lu name=%s exists", sid, story_name.c_str()); - return CL_ERR_STORY_EXISTS; - } - else - { - CL_Status res = pChronicle->addStory(chronicle_name, cid, story_name, attrs); - t2 = std::chrono::steady_clock::now(); - std::chrono::duration duration = (t2 - t1); - LOG_DEBUG("time in %s: %lf ns", __FUNCTION__, duration.count()); - return res; - } - } else { - LOG_DEBUG("Chronicle name=%s does not exist", chronicle_name.c_str()); - return CL_ERR_NOT_EXIST; - } -} -*/ /** * Destroy a Story @@ -285,7 +238,6 @@ int ChronicleMetaDirectory::destroy_story(std::string const &chronicle_name, con * @param story_name: name of the Story * @param flags: flags * @param story_id to populate with the story_id assigned to the story - * @param notify_keepers , bool value that would be set to true if this is the first client to acquire the story * @return CL_SUCCESS if succeed to destroy the Story \n * CL_ERR_NOT_EXIST if the Chronicle does not exist \n * CL_ERR_UNKNOWN otherwise @@ -293,7 +245,7 @@ int ChronicleMetaDirectory::destroy_story(std::string const &chronicle_name, con int ChronicleMetaDirectory::acquire_story(chl::ClientId const &client_id, const std::string &chronicle_name , const std::string &story_name , const std::unordered_map &attrs, int &flags - , StoryId &story_id, bool ¬ify_keepers) + , StoryId &story_id) { LOG_DEBUG("[ChronicleMetaDirectory] ClientID={} acquiring StoryName={} in ChronicleName={} with Flags={}", client_id , story_name.c_str(), chronicle_name.c_str(), flags); @@ -332,7 +284,6 @@ int ChronicleMetaDirectory::acquire_story(chl::ClientId const &client_id, const /* All checks passed, manipulate metadata */ story_id = pStory->getSid(); - notify_keepers = (pStory->getAcquisitionCount() == 0 ? true : false); /* Increment AcquisitionCount */ pStory->incrementAcquisitionCount(); @@ -350,14 +301,13 @@ int ChronicleMetaDirectory::acquire_story(chl::ClientId const &client_id, const * @param story_name: name of the Story * @param flags: flags * @param story_id to populate with the story_id assigned to the story - * @param notify_keepers , bool value that would be set to true if this is the last client to release the story * @return CL_SUCCESS if succeed to destroy the Story \n * CL_ERR_NOT_EXIST if the Chronicle does not exist \n * CL_ERR_UNKNOWN otherwise */ //TO_DO return acquisition_count after the story has been released int ChronicleMetaDirectory::release_story(chl::ClientId const &client_id, const std::string &chronicle_name - , const std::string &story_name, StoryId &story_id, bool ¬ify_keepers) + , const std::string &story_name, StoryId &story_id) { LOG_DEBUG("[ChronicleMetaDirectory] ClientID={} releasing StoryName={} in ChronicleName={}", client_id , story_name.c_str(), chronicle_name.c_str()); @@ -389,7 +339,6 @@ int ChronicleMetaDirectory::release_story(chl::ClientId const &client_id, const /* Decrement AcquisitionCount */ pStory->decrementAcquisitionCount(); story_id = pStory->getSid(); - notify_keepers = (pStory->getAcquisitionCount() == 0 ? 
true : false); /* Remove this client from acquirerClientList of the Story */ pStory->removeAcquirerClient(client_id); /* Remove this Story from acquiredStoryMap for this client */ diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index 7cb4ad1f..b2f10f7b 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -94,6 +94,8 @@ int KeeperRegistry::ShutdownRegistryService() registryState = SHUTTING_DOWN; LOG_INFO("[KeeperRegistry] Shutting down..."); + activeGroups.clear(); + activeStories.clear(); while(!recordingGroups.empty()) { @@ -274,7 +276,7 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re } //create a client of Keeper's DataStoreAdminService listenning at adminServiceId - std::string service_na_string("ofi+sockets://"); + std::string service_na_string("ofi+sockets://"); //TODO: add protocol to serviceId and keeperIdCard service_na_string = admin_service_id.getIPasDottedString(service_na_string) + ":" + std::to_string(admin_service_id.port); @@ -302,6 +304,7 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re (*insert_return.first).second.keeperAdminClient = collectionClient; (*insert_return.first).second.active = true; + recording_group.activeKeeperCount += 1; LOG_INFO("[KeeperRegistry] Register Keeper: KeeperIdCard: {} created DataStoreAdminClient for {}: provider_id={}" , id_string.str(), service_na_string, admin_service_id.provider_id); @@ -311,7 +314,8 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re // check if this is the first keeper for the recording group and the group is ready to be part of // the activeGroups rotation - if(recording_group.keeperProcesses.size() == 1 && recording_group.grapherProcess != nullptr) + + if(recording_group.isActive() && recording_group.activeKeeperCount == 1) { activeGroups.push_back(&recording_group); size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); @@ -320,7 +324,7 @@ int KeeperRegistry::registerKeeperProcess(KeeperRegistrationMsg const &keeper_re std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range } - LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + LOG_INFO("[KeeperRegistry] has {} activeGroups; {} RecordingGroups ", activeGroups.size(), recordingGroups.size()); if(activeGroups.size() > 0) { registryState = RUNNING; } return chronolog::CL_SUCCESS; } @@ -343,42 +347,42 @@ int KeeperRegistry::unregisterKeeperProcess(KeeperIdCard const &keeper_id_card) auto keeper_process_iter = recording_group.keeperProcesses.find( std::pair(keeper_id_card.getIPaddr(), keeper_id_card.getPort())); - if(keeper_process_iter != recording_group.keeperProcesses.end()) + if(keeper_process_iter == recording_group.keeperProcesses.end()) + { + //we don't have a record of this keeper, we have nothing to do + return CL_SUCCESS; + } + else { + // check if the group is active and the keeper we are about to unregister is the only one this group has + // and the group needs to be removed from the active group rotation + if(recording_group.isActive() && (*keeper_process_iter).second.active && recording_group.activeKeeperCount == 1) + { + activeGroups.erase(std::remove(activeGroups.begin(), activeGroups.end(), &recording_group)); + if(activeGroups.size() > 0) + {//reset the group distribution + size_t new_seed = 
std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range + } + } + // we mark the keeperProcessEntry as inactive and set the time it would be safe to delete. // we delay the destruction of the keeperEntry & keeperAdminClient by 5 secs // to prevent the case of deleting the keeperAdminClient while it might be waiting for rpc response on the // other thread - std::stringstream id_string; - id_string << keeper_id_card; std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_INFO("[KeeperRegistry] unregisterKeeperProcess() starting delayedExit for keeper {} delayedExitTime={}", - id_string.str(), std::ctime(&delayedExitTime)); - ; recording_group.startDelayedKeeperExit((*keeper_process_iter).second, delayedExitTime); } LOG_INFO("[KeeperRegistry] RecordingGroup {} has {} keepers", recording_group.groupId, recording_group.keeperProcesses.size()); - // now that we are still holding registryLock - // check if the keeper we've just unregistered was the only one for the recordingGroup - // and the group can't perform recording duties any longer - if(recording_group.keeperProcesses.size() == 1) - { - activeGroups.erase(std::remove(activeGroups.begin(), activeGroups.end(), &recording_group)); - if(activeGroups.size() > 0) - {//reset the group distribution - size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - mt_random.seed(new_seed);//re-seed the mersene_twister_generator - group_id_distribution = - std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range - } - } - - LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + LOG_INFO("[KeeperRegistry] has {} activeGroups; {} RecordingGroups ", activeGroups.size(), recordingGroups.size()); + // update registryState if needed if(!is_shutting_down() && activeGroups.size() == 0) { registryState = INITIALIZED; } @@ -413,6 +417,10 @@ void KeeperRegistry::updateKeeperProcessStats(KeeperStatsMsg const &keeperStatsM } ///////////////// +// NOTE: RecordingGroup methods are not currently protected by lock +// the assumptions is that the caller would use RegistryLock before calling the RecordingGroup method +// we may decide to revisit this and introduce RecordingGroup level locks later on.. 
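The activeGroups bookkeeping in the hunks above (push_back or erase on the vector, re-seed the Mersenne Twister, rebuild the uniform distribution over the new index range, then draw an index to pick a group) reduces to a small pattern. A minimal sketch under illustrative names (Group and the locals are not the ChronoLog types), assuming the caller already holds the registry lock:

#include <chrono>
#include <iostream>
#include <random>
#include <vector>

struct Group { int id; };                    // stand-in for RecordingGroup

int main()
{
    Group g0{0}, g1{1}, g2{2};
    std::vector<Group*> activeGroups{&g0, &g1, &g2};

    // seed the 32-bit Mersenne Twister from the clock, as the patch does
    std::mt19937 mt_random(static_cast<std::mt19937::result_type>(
        std::chrono::high_resolution_clock::now().time_since_epoch().count()));

    // the range must be rebuilt every time activeGroups grows or shrinks
    std::uniform_int_distribution<size_t> group_id_distribution(0, activeGroups.size() - 1);

    Group* chosen = activeGroups[group_id_distribution(mt_random)];
    std::cout << "story assigned to group " << chosen->id << "\n";

    activeGroups.pop_back();                 // a group dropped out of the rotation
    group_id_distribution =
        std::uniform_int_distribution<size_t>(0, activeGroups.size() - 1);  // reset the range
    std::cout << "next story goes to group "
              << activeGroups[group_id_distribution(mt_random)]->id << "\n";
    return 0;
}

Re-seeding on every membership change is optional as far as uniformity goes; resetting the distribution range is what keeps the drawn index in bounds.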
+ std::vector& RecordingGroup::getActiveKeepers(std::vector& keeper_id_cards) { // NOTE: RecordingGroup methods are not currently protected by lock @@ -430,14 +438,13 @@ std::vector& RecordingGroup::getActiveKeepers(std::vector= (*iter).second.delayedExitClients.front().first)) { auto dataStoreClientPair = (*iter).second.delayedExitClients.front(); - LOG_INFO("[KeeperRegistry] getActiveKeepers() destroys dataAdminClient for keeper {} current_time={} delayedExitTime={}",id_string.str(), ctime(¤t_time), ctime(&(dataStoreClientPair.first))); + LOG_INFO("[KeeperRegistry] getActiveKeepers() destroys dataAdminClient for keeper {} current_time={} delayedExitTime={}", + (*iter).second.idCardString, ctime(¤t_time)); if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } (*iter).second.delayedExitClients.pop_front(); } @@ -452,13 +459,13 @@ std::vector& RecordingGroup::getActiveKeepers(std::vectorgetActiveKeepers(vectorOfKeepers); } - + recording_group->getActiveKeepers(vectorOfKeepers); + + //no need for notification , group processes are already recording this story + LOG_INFO("[Registry] RecordingGroup {} is already recording story {}", recording_group->groupId,story_id); + return chronolog::CL_SUCCESS; } - // pick recording_group from the group id distribution using a random int value + // select recording_group from the group id distribution using a random int value // generated by Mirsene Twister generator // NOTE: using uniform_distribution for now, we might add discrete distribution with weights later... @@ -509,6 +518,8 @@ int KeeperRegistry::notifyRecordingGroupOfStoryRecordingStart(ChronicleName cons activeStories[story_id] = recording_group; } + LOG_INFO("[Registry] selected RecordingGroup {} for story {}", recording_group->groupId, story_id); + std::time_t story_start_time = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); @@ -543,8 +554,6 @@ int KeeperRegistry::notifyGrapherOfStoryRecordingStart(RecordingGroup& recording DataStoreAdminClient* dataAdminClient = nullptr; - std::stringstream id_string; - { // NOTE: we release the registryLock before sending rpc request so that we do not hold it for the duration of rpc communication. 
// We delay the destruction of unactive adminClients that might be triggered by the unregister call from a different thread @@ -555,7 +564,6 @@ int KeeperRegistry::notifyGrapherOfStoryRecordingStart(RecordingGroup& recording if(recordingGroup.grapherProcess != nullptr && recordingGroup.grapherProcess->active && recordingGroup.grapherProcess->adminClient != nullptr) { - id_string << recordingGroup.grapherProcess->idCard; dataAdminClient = recordingGroup.grapherProcess->adminClient; } else @@ -572,17 +580,17 @@ int KeeperRegistry::notifyGrapherOfStoryRecordingStart(RecordingGroup& recording return_code = dataAdminClient->send_start_story_recording(chronicle, story, storyId, story_start_time); if(return_code != CL_SUCCESS) { - LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to {}", id_string.str()); + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to {}", recordingGroup.grapherProcess->idCardString); } else { LOG_INFO("[KeeperRegistry] Registry notified {} to start recording StoryID={} with StartTime={}", - id_string.str(), storyId, story_start_time); + recordingGroup.grapherProcess->idCardString, storyId, story_start_time); } } catch(thallium::exception const& ex) { - LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to grapher {}", id_string.str()); + LOG_WARNING("[KeeperRegistry] Registry failed RPC notification to grapher {}", recordingGroup.grapherProcess->idCardString); } return return_code; @@ -746,10 +754,9 @@ int KeeperRegistry::notifyRecordingGroupOfStoryRecordingStop(StoryId const& stor } recording_group = (*story_iter).second; - if(recording_group != nullptr) { recording_group->getActiveKeepers(vectorOfKeepers); } - activeStories.erase(story_id); + activeStories.erase(story_iter); } if(recording_group != nullptr) @@ -757,9 +764,10 @@ int KeeperRegistry::notifyRecordingGroupOfStoryRecordingStop(StoryId const& stor // the registryLock is released by this point.. // notify Grapher and notifyKeepers functions use delayedExit logic to protect // the rpc code from DataAdminClients being destroyed while notification is in progress.. 
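The delayedExit logic referred to above amounts to queueing each retired admin client together with a deadline and deleting it only after that deadline has passed, so a client cannot be destroyed while another thread is still inside an RPC on it. A minimal sketch; AdminClient and the 5-second delay are assumed stand-ins for the real DataStoreAdminClient and the configured delayedDataAdminExitSeconds:

#include <chrono>
#include <ctime>
#include <iostream>
#include <list>
#include <utility>

struct AdminClient { int id; };              // stand-in for DataStoreAdminClient

int main()
{
    // (deadline, client) pairs queued for delayed destruction, oldest first
    std::list<std::pair<std::time_t, AdminClient*>> delayedExitClients;

    auto now = [] { return std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); };

    // unregister path: do not delete right away, another thread may still be using the client
    delayedExitClients.emplace_back(now() + 5, new AdminClient{42});

    // a later registry pass reaps only the clients whose deadline has expired
    std::time_t current_time = now();
    while(!delayedExitClients.empty() && current_time >= delayedExitClients.front().first)
    {
        delete delayedExitClients.front().second;
        delayedExitClients.pop_front();
    }
    std::cout << delayedExitClients.size() << " client(s) still pending delayed destruction\n";

    for(auto& entry: delayedExitClients) { delete entry.second; }   // cleanup for the sketch
    return 0;
}

In the registry the same queue lives inside each keeper and grapher process entry, and a later pass such as getActiveKeepers() performs the reaping.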
- notifyGrapherOfStoryRecordingStop(*recording_group, story_id); notifyKeepersOfStoryRecordingStop(*recording_group, vectorOfKeepers, story_id); + + notifyGrapherOfStoryRecordingStop(*recording_group, story_id); } return CL_SUCCESS; @@ -777,7 +785,6 @@ int KeeperRegistry::notifyKeepersOfStoryRecordingStop(RecordingGroup& recordingG auto keeper_processes = recordingGroup.keeperProcesses; - size_t keepers_left_to_notify = vectorOfKeepers.size(); for(KeeperIdCard keeper_id_card: vectorOfKeepers) { DataStoreAdminClient* dataAdminClient = nullptr; @@ -837,7 +844,7 @@ int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_ms //re-check state after ther lock is aquired if(is_shutting_down()) { return chronolog::CL_ERR_UNKNOWN; } - //find the group that keeper belongs to in the registry + //find the group that grapher belongs to in the registry auto group_iter = recordingGroups.find(group_id); if(group_iter == recordingGroups.end()) { @@ -896,7 +903,7 @@ int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_ms } //create a client of the new grapher's DataStoreAdminService listenning at adminServiceId - std::string service_na_string("ofi+sockets://"); + std::string service_na_string("ofi+sockets://"); //TODO: add protocol string to serviceIdCard service_na_string = admin_service_id.getIPasDottedString(service_na_string) + ":" + std::to_string(admin_service_id.port); @@ -921,23 +928,24 @@ int KeeperRegistry::registerGrapherProcess(GrapherRegistrationMsg const & reg_ms recording_group.keeperProcesses.size()); // now that communnication with the Grapher is established and we are still holding registryLock - // add the group to the activeGroups rotation if it's ready - if(recording_group.keeperProcesses.size() > 0 && recording_group.grapherProcess != nullptr) + // check if the group is ready for active group rotation + if(recording_group.isActive()) { activeGroups.push_back(&recording_group); - if(activeGroups.size() > 0) - {//reset the group distribution - size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - mt_random.seed(new_seed);//re-seed the mersene_twister_generator - group_id_distribution = - std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range - } + + //reset the group distribution + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = std::uniform_int_distribution(0, activeGroups.size() - 1); } - LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); - // now that communnication with the Grapher is established and we still holding registryLock - // update registryState in case this is the first group registration - if(activeGroups.size() > 0) { registryState = RUNNING; } + LOG_INFO("[KeeperRegistry] has {} activeGroups; {} RecordingGroups ", activeGroups.size(), recordingGroups.size()); + // we still holding registryLock + // update registryState if needed + if(activeGroups.size() > 0) + { + registryState = RUNNING; + } return chronolog::CL_SUCCESS; } ///////////////// @@ -953,8 +961,36 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car RecordingGroup& recording_group = ((*group_iter).second); - std::stringstream id_string; - id_string << grapher_id_card; + // we are about to unregister the grapher so the group can't 
perform recording duties + // if it were an active recordingGroup before remove it from rotation + if(recording_group.isActive()) + { + auto active_group_iter = activeGroups.begin(); + while (active_group_iter != activeGroups.end()) + { + if((*active_group_iter) != &recording_group) + { ++active_group_iter;} + else + { break; } + } + + if(active_group_iter != activeGroups.end()) + { + //INNA: what do we do with any active Stories that this group were recordng? force release them and notify clients? + // wait for the new grapher? + LOG_INFO("[KeeperRegistry] RecordingGroup {} is not active; activeGroups.size{}", recording_group.groupId,activeGroups.size()); + activeGroups.erase(active_group_iter); + if(activeGroups.size() > 0) + { + //reset the group distribution + size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); + mt_random.seed(new_seed);//re-seed the mersene_twister_generator + group_id_distribution = + std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range + } + } + } + if(recording_group.grapherProcess != nullptr && recording_group.grapherProcess->active) { // start delayed destruction for the lingering Adminclient to be safe... @@ -963,7 +999,7 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car std::time_t delayedExitTime = std::chrono::high_resolution_clock::to_time_t( std::chrono::high_resolution_clock::now() + std::chrono::seconds(delayedDataAdminExitSeconds)); - LOG_INFO("[KeeperRegistry] grapher {} starting delayedExit for grapher {} delayedExitTime={}", id_string.str(), + LOG_INFO("[KeeperRegistry] grapher {} starting delayedExit for grapher {} delayedExitTime={}", recording_group.grapherProcess->idCardString, std::ctime(&delayedExitTime)); recording_group.startDelayedGrapherExit(*(recording_group.grapherProcess), delayedExitTime); @@ -971,22 +1007,11 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car // now that we are still holding registryLock // update registryState if needed - LOG_INFO("[KeeperRegistry] RecordingGroup {} has no grappher and {} keepers", recording_group.groupId, + LOG_INFO("[KeeperRegistry] RecordingGroup {} has no grapher and {} keepers", recording_group.groupId, recording_group.keeperProcesses.size()); - - // we've just unregistered the grapher so the group can't perform recording duties any longer - { - activeGroups.erase(std::remove(activeGroups.begin(), activeGroups.end(), &recording_group)); - if(activeGroups.size() > 0) - {//reset the group distribution - size_t new_seed = std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - mt_random.seed(new_seed);//re-seed the mersene_twister_generator - group_id_distribution = - std::uniform_int_distribution(0, activeGroups.size() - 1);//reset the distribution range - } - } - - LOG_INFO("[KeeperRegistry] has {} RecordingGroups ; {} activeGroups", recordingGroups.size(), activeGroups.size()); + + LOG_INFO("[KeeperRegistry] has {} activeGroups; {} RecordingGroups ", activeGroups.size(), recordingGroups.size()); + // update registryState in case this was the last active recordingGroup if(activeGroups.size() == 0) { registryState = INITIALIZED; } @@ -998,13 +1023,29 @@ int KeeperRegistry::unregisterGrapherProcess(GrapherIdCard const& grapher_id_car /////////////// +bool chl::RecordingGroup::isActive() const +{ + //TODO: we might add a check for time since the last stats message received from + // the processes listed as 
active + + if(grapherProcess != nullptr && grapherProcess->active && activeKeeperCount >0) + { + LOG_DEBUG("[REGISTRY] RecordingGroup {} is active", groupId); + return true; + } + else + { + LOG_DEBUG("[REGISTRY] RecordingGroup {} is not active", groupId); + return false; + } +} + void chl::RecordingGroup::startDelayedGrapherExit(chl::GrapherProcessEntry& grapher_process, std::time_t delayedExitTime) { grapher_process.active = false; - LOG_DEBUG("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, - grapher_process.idCardString.str()); + LOG_INFO("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, grapher_process.idCardString); if(grapher_process.adminClient != nullptr) { grapher_process.delayedExitGrapherClients.push_back( @@ -1018,8 +1059,8 @@ void chl::RecordingGroup::clearDelayedExitGrapher(chl::GrapherProcessEntry& grap while(!grapher_process.delayedExitGrapherClients.empty() && (current_time >= grapher_process.delayedExitGrapherClients.front().first)) { - LOG_DEBUG("[KeeperRegistry] recording_Group {}, destroys delayed dataAdmindClient for {}", groupId, - grapher_process.idCardString.str()); + LOG_INFO("[KeeperRegistry] recording_Group {}, destroys delayed dataAdmindClient for {}", groupId, + grapher_process.idCardString); auto dataStoreClientPair = grapher_process.delayedExitGrapherClients.front(); if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } grapher_process.delayedExitGrapherClients.pop_front(); @@ -1034,9 +1075,9 @@ void chl::RecordingGroup::startDelayedKeeperExit(chl::KeeperProcessEntry& keeper // other thread keeper_process.active = false; + activeKeeperCount -= 1; - LOG_DEBUG("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, - keeper_process.idCardString.str()); + LOG_INFO("[KeeperRegistry] recording_group {} starts delayedExit for {}", groupId, keeper_process.idCardString); if(keeper_process.keeperAdminClient != nullptr) { keeper_process.delayedExitClients.push_back( @@ -1050,8 +1091,7 @@ void chl::RecordingGroup::clearDelayedExitKeeper(chl::KeeperProcessEntry& keeper while(!keeper_process.delayedExitClients.empty() && (current_time >= keeper_process.delayedExitClients.front().first)) { - LOG_DEBUG("[KeeperRegistry] recording_group {} destroys delayed dataAdminClient for {}", groupId, - keeper_process.idCardString.str()); + LOG_INFO("[KeeperRegistry] recording_group {} destroys delayed dataAdminClient for {}", groupId, keeper_process.idCardString); auto dataStoreClientPair = keeper_process.delayedExitClients.front(); if(dataStoreClientPair.second != nullptr) { delete dataStoreClientPair.second; } keeper_process.delayedExitClients.pop_front(); diff --git a/ChronoVisor/src/VisorClientPortal.cpp b/ChronoVisor/src/VisorClientPortal.cpp index 090e4ef2..0f52cd92 100644 --- a/ChronoVisor/src/VisorClientPortal.cpp +++ b/ChronoVisor/src/VisorClientPortal.cpp @@ -207,10 +207,8 @@ chronolog::VisorClientPortal::AcquireStory(chl::ClientId const &client_id, std:: int ret = CL_ERR_UNKNOWN; - bool notify_keepers = false; - - ret = chronicleMetaDirectory.acquire_story(client_id, chronicle_name, story_name, attrs, flags, story_id - , notify_keepers); + ret = chronicleMetaDirectory.acquire_story(client_id, chronicle_name, story_name, attrs, flags, story_id); + if(ret != chronolog::CL_SUCCESS) { // return the error with the empty recording_keepers vector @@ -230,7 +228,7 @@ chronolog::VisorClientPortal::AcquireStory(chl::ClientId const &client_id, std:: chronicle_name, story_name, story_id, 
recording_keepers)) { // RPC notification to the keepers might have failed, release the newly acquired story - chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id, notify_keepers); + chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id); //we do know that there's no need notify keepers of the story ending in this case as it hasn't started... recording_keepers.clear(); return chronolog::AcquireStoryResponseMsg(chronolog::CL_ERR_NO_KEEPERS, story_id, recording_keepers); @@ -250,16 +248,12 @@ int chronolog::VisorClientPortal::ReleaseStory(chl::ClientId const &client_id, s { return CL_ERR_NOT_AUTHORIZED; } StoryId story_id(0); - bool notify_keepers = false; - auto return_code = chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id - , notify_keepers); + auto return_code = chronicleMetaDirectory.release_story(client_id, chronicle_name, story_name, story_id); if(chronolog::CL_SUCCESS != return_code) { return return_code; } - if(notify_keepers && theKeeperRegistry->is_running()) - { - theKeeperRegistry->notifyRecordingGroupOfStoryRecordingStop(story_id); - } + theKeeperRegistry->notifyRecordingGroupOfStoryRecordingStop(story_id); + return chronolog::CL_SUCCESS; } diff --git a/chrono_common/ServiceId.h b/chrono_common/ServiceId.h index da548234..b10c894d 100644 --- a/chrono_common/ServiceId.h +++ b/chrono_common/ServiceId.h @@ -51,9 +51,16 @@ class ServiceId inline std::ostream& operator<<(std::ostream& out, chronolog::ServiceId const serviceId) { std::string a_string; - out << "{" << serviceId.getIPasDottedString(a_string) << ":" << serviceId.port << ":" << serviceId.provider_id + out << "ServiceId{" << serviceId.getIPasDottedString(a_string) << ":" << serviceId.port << ":" << serviceId.provider_id << "}"; return out; } +inline std::string& operator+= (std::string& a_string, chronolog::ServiceId const& serviceId) +{ + a_string += std::string("ServiceId{") + serviceId.getIPasDottedString(a_string) + ":" + std::to_string(serviceId.port) + ":" + + std::to_string(serviceId.provider_id) + "}"; + return a_string; +} + #endif From 1b4964794809498bbf3b74d6f7a64b8c666b6a0f Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 25 Apr 2024 12:14:12 -0500 Subject: [PATCH 37/40] fixed no-return in CSVFileExtractor --- ChronoKeeper/CSVFileChunkExtractor.cpp | 4 +++- ChronoKeeper/StoryChunkExtractor.h | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ChronoKeeper/CSVFileChunkExtractor.cpp b/ChronoKeeper/CSVFileChunkExtractor.cpp index cc6d6910..be5f3867 100644 --- a/ChronoKeeper/CSVFileChunkExtractor.cpp +++ b/ChronoKeeper/CSVFileChunkExtractor.cpp @@ -44,6 +44,8 @@ int chronolog::CSVFileStoryChunkExtractor::processStoryChunk(StoryChunk*story_ch chunk_fstream << event << std::endl; } chunk_fstream.close(); - LOG_INFO("[CSVFileStoryChunkExtractor] Finished processing StoryChunk. File={}", chunk_filename); + LOG_DEBUG("[CSVFileStoryChunkExtractor] Finished processing StoryChunk. File={}", chunk_filename); + + return chronolog::CL_SUCCESS; } diff --git a/ChronoKeeper/StoryChunkExtractor.h b/ChronoKeeper/StoryChunkExtractor.h index c1f4671b..6b170007 100644 --- a/ChronoKeeper/StoryChunkExtractor.h +++ b/ChronoKeeper/StoryChunkExtractor.h @@ -46,10 +46,7 @@ class StoryChunkExtractorBase void drainExtractionQueue(); - virtual int processStoryChunk(StoryChunk*) //=0 - { - LOG_WARNING("[StoryChunkExtraction] Base processStoryChunk method called. 
Derived class should implement specific logic."); - } + virtual int processStoryChunk(StoryChunk*) = 0; void startExtractionThreads(int); From f25955b3cfc9ce35a97d70e21653775fb077b362 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 25 Apr 2024 13:37:31 -0500 Subject: [PATCH 38/40] fixed merging issues --- ChronoGrapher/ChronoGrapher.cpp | 89 +++++++++++++------------- ChronoKeeper/CSVFileChunkExtractor.cpp | 1 + 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp index 7d63bf40..0e878b2f 100644 --- a/ChronoGrapher/ChronoGrapher.cpp +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -203,46 +203,47 @@ int main(int argc, char**argv) return (-1); } -// /// RegistryClient SetUp _____________________________________________________________________________________ -// // create RegistryClient and register the new Recording service with the Registry -// std::string REGISTRY_SERVICE_NA_STRING = -// confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.PROTO_CONF + "://" + -// confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.IP + ":" + -// std::to_string(confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.BASE_PORT); -// -// uint16_t REGISTRY_SERVICE_PROVIDER_ID = confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.SERVICE_PROVIDER_ID; -// -// chronolog::GrapherRegistryClient* grapherRegistryClient = chronolog::GrapherRegistryClient::CreateRegistryClient( -// *dataAdminEngine, REGISTRY_SERVICE_NA_STRING, REGISTRY_SERVICE_PROVIDER_ID); -// -// if(nullptr == grapherRegistryClient) -// { -// LOG_CRITICAL("[ChronoGrapher] failed to create RegistryClient; exiting"); -// delete grapherRecordingService; -// delete keeperDataAdminService; -// return (-1); -// } -// -// /// Registration with ChronoVisor __________________________________________________________________________________ -// // try to register with chronoVisor a few times than log ERROR and exit... -// int registration_status = chronolog::CL_ERR_UNKNOWN; -// int retries = 5; -// while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) -// { -// registration_status = grapherRegistryClient->send_register_msg( -// chronolog::GrapherRegistrationMsg(processIdCard, collectionServiceId)); -// retries--; -// } -// -// if(chronolog::CL_SUCCESS != registration_status) -// { -// LOG_CRITICAL("[ChronoGrapher] Failed to register with ChronoVisor after multiple attempts. 
Exiting."); -// delete grapherRegistryClient; -// delete grapherRecordingService; -// delete keeperDataAdminService; -// return (-1); -// } -// LOG_INFO("[ChronoGrapher] Successfully registered with ChronoVisor."); + /// RegistryClient SetUp _____________________________________________________________________________________ + // create RegistryClient and register the new Recording service with the Registry + std::string REGISTRY_SERVICE_NA_STRING = + confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.PROTO_CONF + "://" + + confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.IP + ":" + + std::to_string(confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.BASE_PORT); + + uint16_t REGISTRY_SERVICE_PROVIDER_ID = confManager.GRAPHER_CONF.VISOR_REGISTRY_SERVICE_CONF.SERVICE_PROVIDER_ID; + + chronolog::GrapherRegistryClient* grapherRegistryClient = chronolog::GrapherRegistryClient::CreateRegistryClient( + *dataAdminEngine, REGISTRY_SERVICE_NA_STRING, REGISTRY_SERVICE_PROVIDER_ID); + + if(nullptr == grapherRegistryClient) + { + LOG_CRITICAL("[ChronoGrapher] failed to create RegistryClient; exiting"); + delete grapherRecordingService; + delete keeperDataAdminService; + return (-1); + } + + /// Registration with ChronoVisor __________________________________________________________________________________ + // try to register with chronoVisor a few times than log ERROR and exit... + int registration_status = chronolog::CL_ERR_UNKNOWN; + int retries = 5; + while((chronolog::CL_SUCCESS != registration_status) && (retries > 0)) + { + registration_status = grapherRegistryClient->send_register_msg( + chronolog::GrapherRegistrationMsg(processIdCard, collectionServiceId)); + sleep(5); + retries--; + } + + if(chronolog::CL_SUCCESS != registration_status) + { + LOG_CRITICAL("[ChronoGrapher] Failed to register with ChronoVisor after multiple attempts. 
Exiting."); + delete grapherRegistryClient; + delete grapherRecordingService; + delete keeperDataAdminService; + return (-1); + } + LOG_INFO("[ChronoGrapher] Successfully registered with ChronoVisor."); /// Start data collection and extraction threads ___________________________________________________________________ // services are successfully created and keeper process had registered with ChronoVisor @@ -264,10 +265,10 @@ int main(int argc, char**argv) sleep(30); } -// /// Unregister from ChronoVisor ____________________________________________________________________________________ -// // Unregister from the chronoVisor so that no new story requests would be coming -// grapherRegistryClient->send_unregister_msg(processIdCard); -// delete grapherRegistryClient; + /// Unregister from ChronoVisor ____________________________________________________________________________________ + // Unregister from the chronoVisor so that no new story requests would be coming + grapherRegistryClient->send_unregister_msg(processIdCard); + delete grapherRegistryClient; /// Stop services and shut down ____________________________________________________________________________________ LOG_INFO("[ChronoGrapher] Initiating shutdown procedures."); diff --git a/ChronoKeeper/CSVFileChunkExtractor.cpp b/ChronoKeeper/CSVFileChunkExtractor.cpp index be5f3867..cabcab6b 100644 --- a/ChronoKeeper/CSVFileChunkExtractor.cpp +++ b/ChronoKeeper/CSVFileChunkExtractor.cpp @@ -3,6 +3,7 @@ #include #include "chronolog_types.h" +#include "chronolog_errcode.h" #include "KeeperIdCard.h" #include "CSVFileChunkExtractor.h" From e623b7b17aa5bef381e5606a0b336a9cbca93ebd Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Thu, 25 Apr 2024 18:22:34 -0500 Subject: [PATCH 39/40] uint64_t story_start_time --- ChronoVisor/src/KeeperRegistry.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ChronoVisor/src/KeeperRegistry.cpp b/ChronoVisor/src/KeeperRegistry.cpp index b2f10f7b..0822c1f3 100644 --- a/ChronoVisor/src/KeeperRegistry.cpp +++ b/ChronoVisor/src/KeeperRegistry.cpp @@ -519,10 +519,8 @@ int KeeperRegistry::notifyRecordingGroupOfStoryRecordingStart(ChronicleName cons } LOG_INFO("[Registry] selected RecordingGroup {} for story {}", recording_group->groupId, story_id); - - std::time_t story_start_time = - std::chrono::high_resolution_clock::to_time_t(std::chrono::high_resolution_clock::now()); - + + uint64_t story_start_time = std::chrono::high_resolution_clock::now().time_since_epoch().count(); // the registryLock is released by this point.. 
// notify Grapher and notifyKeepers functions use delayedExit logic to protect From 06295ec99fefe1bb9fc8a0e1bcd36ac85de205f7 Mon Sep 17 00:00:00 2001 From: Inna Brodkin Date: Fri, 26 Apr 2024 13:07:28 -0500 Subject: [PATCH 40/40] addeed RecordingGroup to configuration --- ChronoGrapher/ChronoGrapher.cpp | 2 +- ChronoKeeper/ChronoKeeperInstance.cpp | 4 +--- chrono_common/ConfigurationManager.cpp | 8 +++++++- chrono_common/ConfigurationManager.h | 18 +++++++++++++++--- default_conf.json.in | 2 ++ 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ChronoGrapher/ChronoGrapher.cpp b/ChronoGrapher/ChronoGrapher.cpp index 0e878b2f..5b257d29 100644 --- a/ChronoGrapher/ChronoGrapher.cpp +++ b/ChronoGrapher/ChronoGrapher.cpp @@ -104,7 +104,7 @@ int main(int argc, char**argv) LOG_INFO("[ChronoGrapher] DataStoreAdminService started successfully."); // Instantiate GrapherRecordingService - chronolog::RecordingGroupId recording_group_id = 7; + chronolog::RecordingGroupId recording_group_id = confManager.GRAPHER_CONF.RECORDING_GROUP; std::string RECORDING_SERVICE_PROTOCOL = confManager.GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.PROTO_CONF; std::string RECORDING_SERVICE_IP = confManager.GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.IP; uint16_t RECORDING_SERVICE_PORT = confManager.GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.BASE_PORT; diff --git a/ChronoKeeper/ChronoKeeperInstance.cpp b/ChronoKeeper/ChronoKeeperInstance.cpp index c08cce51..cd863e37 100644 --- a/ChronoKeeper/ChronoKeeperInstance.cpp +++ b/ChronoKeeper/ChronoKeeperInstance.cpp @@ -19,8 +19,6 @@ #include "cmd_arg_parse.h" #include "StoryChunkExtractorRDMA.h" -#define KEEPER_GROUP_ID 7 - // we will be using a combination of the uint32_t representation of the service IP address // and uint16_t representation of the port number int @@ -88,7 +86,6 @@ int main(int argc, char**argv) // Instantiate ChronoKeeper MemoryDataStore // instantiate DataStoreAdminService - uint64_t keeper_group_id = KEEPER_GROUP_ID; /// DataStoreAdminService setup ____________________________________________________________________________________ std::string datastore_service_ip = confManager.KEEPER_CONF.KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.RPC_CONF.IP; @@ -132,6 +129,7 @@ int main(int argc, char**argv) LOG_INFO("[ChronoKeeperInstance] KeeperRecordingService started successfully."); // create KeeperIdCard to identify this Keeper process in ChronoVisor's KeeperRegistry + chronolog::RecordingGroupId keeper_group_id = confManager.KEEPER_CONF.RECORDING_GROUP; chronolog::KeeperIdCard keeperIdCard(keeper_group_id, recording_endpoint.first, recording_endpoint.second , recording_service_provider_id); diff --git a/chrono_common/ConfigurationManager.cpp b/chrono_common/ConfigurationManager.cpp index c6f0787f..90ffc87d 100644 --- a/chrono_common/ConfigurationManager.cpp +++ b/chrono_common/ConfigurationManager.cpp @@ -5,7 +5,13 @@ void ChronoLog::ConfigurationManager::parseGrapherConf(json_object*json_conf) { json_object_object_foreach(json_conf, key, val) { - if(strcmp(key, "KeeperGrapherDrainService") == 0) + if(strcmp(key, "RecordingGroup") == 0) + { + assert(json_object_is_type(val, json_type_object)); + int value = json_object_get_int(val); + GRAPHER_CONF.RECORDING_GROUP = (value >= 0 ? 
value : 0); + } + else if(strcmp(key, "KeeperGrapherDrainService") == 0) { assert(json_object_is_type(val, json_type_object)); json_object*keeper_grapher_drain_service_conf = json_object_object_get(json_conf diff --git a/chrono_common/ConfigurationManager.h b/chrono_common/ConfigurationManager.h index ff13f319..5b861e62 100644 --- a/chrono_common/ConfigurationManager.h +++ b/chrono_common/ConfigurationManager.h @@ -166,6 +166,7 @@ typedef struct VisorConf_ typedef struct KeeperConf_ { + uint32_t RECORDING_GROUP; KeeperRecordingServiceConf KEEPER_RECORDING_SERVICE_CONF; KeeperDataStoreAdminServiceConf KEEPER_DATA_STORE_ADMIN_SERVICE_CONF; VisorKeeperRegistryServiceConf VISOR_KEEPER_REGISTRY_SERVICE_CONF; @@ -175,7 +176,8 @@ typedef struct KeeperConf_ [[nodiscard]] std::string to_String() const { - return "[KEEPER_RECORDING_SERVICE_CONF: " + KEEPER_RECORDING_SERVICE_CONF.to_String() + + return "[CHRONO_KEEPER_CONFIGURATION : RECORDING_GROUP: "+ std::to_string(RECORDING_GROUP) + + ", KEEPER_RECORDING_SERVICE_CONF: " + KEEPER_RECORDING_SERVICE_CONF.to_String() + ", KEEPER_DATA_STORE_ADMIN_SERVICE_CONF: " + KEEPER_DATA_STORE_ADMIN_SERVICE_CONF.to_String() + ", VISOR_KEEPER_REGISTRY_SERVICE_CONF: " + VISOR_KEEPER_REGISTRY_SERVICE_CONF.to_String() + ", STORY_FILES_DIR:" + STORY_FILES_DIR + ", KEEPER_LOG_CONF:" + KEEPER_LOG_CONF.to_String() + "]"; @@ -206,6 +208,7 @@ typedef struct ExtractorConf_ typedef struct GrapherConf_ { + uint32_t RECORDING_GROUP; RPCProviderConf KEEPER_GRAPHER_DRAIN_SERVICE_CONF; RPCProviderConf DATA_STORE_ADMIN_SERVICE_CONF; RPCProviderConf VISOR_REGISTRY_SERVICE_CONF; @@ -215,7 +218,8 @@ typedef struct GrapherConf_ [[nodiscard]] std::string to_String() const { - return "[CHRONO_GRAPHER_CONFIGURATION : KEEPER_GRAPHER_DRAIN_SERVICE_CONF: " + KEEPER_GRAPHER_DRAIN_SERVICE_CONF.to_String() + + return "[CHRONO_GRAPHER_CONFIGURATION : RECORDING_GROUP: "+ std::to_string(RECORDING_GROUP) + + ", KEEPER_GRAPHER_DRAIN_SERVICE_CONF: " + KEEPER_GRAPHER_DRAIN_SERVICE_CONF.to_String() + ", DATA_STORE_ADMIN_SERVICE_CONF: " + DATA_STORE_ADMIN_SERVICE_CONF.to_String() + ", VISOR_REGISTRY_SERVICE_CONF: " + VISOR_REGISTRY_SERVICE_CONF.to_String() + ", LOG_CONF:" + LOG_CONF.to_String() + @@ -280,6 +284,7 @@ class ConfigurationManager VISOR_CONF.DELAYED_DATA_ADMIN_EXIT_IN_SECS = 3; /* Keeper-related configurations */ + KEEPER_CONF.RECORDING_GROUP = 0; KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.RPC_IMPLEMENTATION = CHRONOLOG_THALLIUM_SOCKETS; KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.PROTO_CONF = "ofi+sockets"; KEEPER_CONF.KEEPER_RECORDING_SERVICE_CONF.RPC_CONF.IP = "127.0.0.1"; @@ -301,6 +306,7 @@ class ConfigurationManager KEEPER_CONF.STORY_FILES_DIR = "/tmp/"; /* Grapher-related configurations */ + GRAPHER_CONF.RECORDING_GROUP = 0; GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.RPC_IMPLEMENTATION = CHRONOLOG_THALLIUM_SOCKETS; GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.PROTO_CONF = "ofi+sockets"; GRAPHER_CONF.KEEPER_GRAPHER_DRAIN_SERVICE_CONF.IP = "127.0.0.1"; @@ -801,7 +807,13 @@ class ConfigurationManager { json_object_object_foreach(json_conf, key, val) { - if(strcmp(key, "KeeperRecordingService") == 0) + if(strcmp(key, "RecordingGroup") == 0) + { + assert(json_object_is_type(val, json_type_object)); + int value = json_object_get_int(val); + KEEPER_CONF.RECORDING_GROUP = (value >= 0 ? 
value : 0); + } + else if(strcmp(key, "KeeperRecordingService") == 0) { assert(json_object_is_type(val, json_type_object)); json_object*keeper_recording_service_conf = json_object_object_get(json_conf, "KeeperRecordingService"); diff --git a/default_conf.json.in b/default_conf.json.in index cce60df0..fc0fdba1 100644 --- a/default_conf.json.in +++ b/default_conf.json.in @@ -41,6 +41,7 @@ "delayed_data_admin_exit_in_secs": 3 }, "chrono_keeper": { + "RecordingGroup": 7, "KeeperRecordingService": { "rpc": { "rpc_implementation": "Thallium_sockets", @@ -91,6 +92,7 @@ "story_files_dir": "/tmp/" }, "chrono_grapher": { + "RecordingGroup": 7, "KeeperGrapherDrainService": { "rpc": { "rpc_implementation": "Thallium_sockets",