From 129443f9b960ff27f51cb369f6cf18a64048199a Mon Sep 17 00:00:00 2001 From: snafus Date: Tue, 5 Apr 2022 11:41:40 +0100 Subject: [PATCH 01/18] Master buffered ceph io (#18) * Buffer implementation for XrdCeph * Better error return code values * Add timing into BufferIO * Add timing into BufferSimple * Utils code area * Update raw data access and copy * Adding Extents * ReadV simple logic * Add to own files the readV implementations * Add to own files the readV implementations; cmake updated * Logging improvements and write buffer updates * Add IOadapter with blocking aio access * Use IOadapter with blocking aio access * Small logging update * Reduce logging information; fix timeing to ms * Reduce logging information; * Reduced logging, and better use of aggregated metrics * comment clean and typo fixes * Remove uncessary file close * Additional logging in case of problems * Additional logging in case of problems * allow option for buffering with IO or AIO buffer Co-authored-by: james Co-authored-by: root --- src/XrdCeph.cmake | 12 +- src/XrdCeph/XrdCephBuffers/BufferUtils.cc | 169 +++++++++ src/XrdCeph/XrdCephBuffers/BufferUtils.hh | 152 ++++++++ .../XrdCephBuffers/CephIOAdapterAIORaw.cc | 184 +++++++++ .../XrdCephBuffers/CephIOAdapterAIORaw.hh | 102 +++++ .../XrdCephBuffers/CephIOAdapterRaw.cc | 79 ++++ .../XrdCephBuffers/CephIOAdapterRaw.hh | 71 ++++ src/XrdCeph/XrdCephBuffers/ICephIOAdapter.hh | 35 ++ .../XrdCephBuffers/IXrdCephBufferAlg.hh | 47 +++ .../XrdCephBuffers/IXrdCephBufferData.hh | 46 +++ .../XrdCephBuffers/IXrdCephReadVAdapter.hh | 45 +++ .../XrdCephBuffers/XrdCephBufferAlgSimple.cc | 349 ++++++++++++++++++ .../XrdCephBuffers/XrdCephBufferAlgSimple.hh | 61 +++ .../XrdCephBuffers/XrdCephBufferDataSimple.cc | 171 +++++++++ .../XrdCephBuffers/XrdCephBufferDataSimple.hh | 66 ++++ .../XrdCephBuffers/XrdCephReadVBasic.cc | 56 +++ .../XrdCephBuffers/XrdCephReadVBasic.hh | 42 +++ .../XrdCephBuffers/XrdCephReadVNoOp.cc | 22 ++ 
.../XrdCephBuffers/XrdCephReadVNoOp.hh | 38 ++ src/XrdCeph/XrdCephOss.cc | 100 ++++- src/XrdCeph/XrdCephOss.hh | 7 + src/XrdCeph/XrdCephOssBufferedFile.cc | 224 +++++++++++ src/XrdCeph/XrdCephOssBufferedFile.hh | 84 +++++ src/XrdCeph/XrdCephOssFile.hh | 7 +- src/XrdCeph/XrdCephOssReadVFile.cc | 211 +++++++++++ src/XrdCeph/XrdCephOssReadVFile.hh | 90 +++++ src/XrdCeph/XrdCephPosix.cc | 2 + src/XrdCeph/XrdCephPosix.hh | 12 + 28 files changed, 2478 insertions(+), 6 deletions(-) create mode 100644 src/XrdCeph/XrdCephBuffers/BufferUtils.cc create mode 100644 src/XrdCeph/XrdCephBuffers/BufferUtils.hh create mode 100644 src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.cc create mode 100644 src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh create mode 100644 src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc create mode 100644 src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh create mode 100644 src/XrdCeph/XrdCephBuffers/ICephIOAdapter.hh create mode 100644 src/XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh create mode 100644 src/XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh create mode 100644 src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc create mode 100644 src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh create mode 100644 src/XrdCeph/XrdCephOssBufferedFile.cc create mode 100644 src/XrdCeph/XrdCephOssBufferedFile.hh create mode 100644 src/XrdCeph/XrdCephOssReadVFile.cc create mode 100644 src/XrdCeph/XrdCephOssReadVFile.hh diff --git a/src/XrdCeph.cmake b/src/XrdCeph.cmake index 1a68a7f82..33843544d 
100644 --- a/src/XrdCeph.cmake +++ b/src/XrdCeph.cmake @@ -45,7 +45,17 @@ add_library( MODULE XrdCeph/XrdCephOss.cc XrdCeph/XrdCephOss.hh XrdCeph/XrdCephOssFile.cc XrdCeph/XrdCephOssFile.hh - XrdCeph/XrdCephOssDir.cc XrdCeph/XrdCephOssDir.hh ) + XrdCeph/XrdCephOssDir.cc XrdCeph/XrdCephOssDir.hh + XrdCeph/XrdCephOssBufferedFile.cc XrdCeph/XrdCephOssBufferedFile.hh + XrdCeph/XrdCephOssReadVFile.cc XrdCeph/XrdCephOssReadVFile.hh + XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh + XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh + XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh + XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.cc XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh + XrdCeph/XrdCephBuffers/BufferUtils.cc XrdCeph/XrdCephBuffers/BufferUtils.hh + XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh + XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh +) target_link_libraries( ${LIB_XRD_CEPH} diff --git a/src/XrdCeph/XrdCephBuffers/BufferUtils.cc b/src/XrdCeph/XrdCephBuffers/BufferUtils.cc new file mode 100644 index 000000000..a30adfbe0 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/BufferUtils.cc @@ -0,0 +1,169 @@ + +#include "BufferUtils.hh" +#include // std::max + +using namespace XrdCephBuffer; + +#ifdef CEPHBUFDEBUG +// to synchronise logging statements + std::mutex cephbuf_iolock; +#endif + +// ------------------------------------------------------ // +// Extent // + +bool Extent::in_extent(off_t pos) const +{ + return ((pos >= begin()) && (pos < end())); // half-open range [begin, end): the first byte belongs to the extent +} + +bool Extent::isContiguous(const Extent &rhs) const +{ + // does the rhs connect directly to the end of the first + if (end() != rhs.begin()) + return false; + return true; +} + +bool Extent::allInExtent(off_t pos, size_t len) const +{ + // is all the range in this extent + if ((pos < begin()) || (pos 
>= end())) + return false; + + if (off_t(pos + len) > end()) + return false; + return true; +} +bool Extent::someInExtent(off_t pos, size_t len) const +{ // is some of the range in this extent + if ((off_t(pos + len) < begin()) || (pos >= end())) + return false; + return true; +} + +Extent Extent::containedExtent(off_t pos, size_t len) const +{ + // return the subset of input range that is in this extent + off_t subbeg = std::max(begin(), pos); + off_t subend = std::min(end(), off_t(pos + len)); + + return Extent(subbeg, subend - subbeg); +} +Extent Extent::containedExtent(const Extent &rhs) const +{ + return containedExtent(rhs.begin(), rhs.len()); +} + +bool Extent::operator<(const Extent &rhs) const +{ + // comparison primarily on begin values + // use end values if begin values are equal. + + if (begin() > rhs.begin()) return false; + if (begin() < rhs.begin()) return true; + if (end() < rhs.end() ) return true; + return false; +} +bool Extent::operator==(const Extent &rhs) const +{ + // equivalence based only on start and end + if (begin() != rhs.begin()) + return false; + if (end() != rhs.end()) + return false; + return true; +} + +// ------------------------------------------------------ // +// ExtentHolder // + +ExtentHolder::ExtentHolder() {} + +ExtentHolder::ExtentHolder(size_t elements) +{ + m_extents.reserve(elements); +} + +ExtentHolder::ExtentHolder(const ExtentContainer &extents) +{ + m_extents.reserve(extents.size()); + for (ExtentContainer::const_iterator vit = extents.cbegin(); vit != extents.cend(); ++vit) { // walk the argument: m_extents is still empty here + push_back(*vit); + } + +} +ExtentHolder::~ExtentHolder() +{ + m_extents.clear(); +} + +void ExtentHolder::push_back(const Extent & in) { + if (size()) { + m_begin = std::min(m_begin, in.begin()); + m_end = std::max(m_end, in.end()); + } else { + m_begin = in.begin(); + m_end = in.end(); + } + return m_extents.push_back(in); +} + + + +Extent ExtentHolder::asExtent() const { + // if (!size()) return Extent(0,0); + // ExtentContainer se = 
getSortedExtents(); + // off_t b = se.front().begin(); + // off_t e = se.back().end(); + + return Extent(m_begin, m_end-m_begin); + +} + +size_t ExtentHolder::bytesContained() const { + size_t nbytes{0}; + for (ExtentContainer::const_iterator vit = m_extents.cbegin(); vit != m_extents.cend(); ++vit) { + nbytes += vit->len(); + } + return nbytes; +} + +size_t ExtentHolder::bytesMissing() const { + size_t bytesUsed = bytesContained(); + size_t totalRange = asExtent().len(); //might be expensive to call + return totalRange - bytesUsed; +} + + +void ExtentHolder::sort() { + std::sort(m_extents.begin(), m_extents.end()); +} + + +ExtentContainer ExtentHolder::getSortedExtents() const { + ExtentContainer v; + v.assign(m_extents.begin(), m_extents.end() ); + std::sort(v.begin(), v.end()); + return v; +} + +ExtentContainer ExtentHolder::getExtents() const { + ExtentContainer v; + v.assign(m_extents.begin(), m_extents.end() ); + return v; +} + +// ------------------------------------------------------ // +// Timer ns // + +Timer_ns::Timer_ns(long &output) : m_output_val(output) +{ + m_start = std::chrono::steady_clock::now(); +} + +Timer_ns::~Timer_ns() +{ + auto end = std::chrono::steady_clock::now(); + m_output_val = std::chrono::duration_cast(end - m_start).count(); +} diff --git a/src/XrdCeph/XrdCephBuffers/BufferUtils.hh b/src/XrdCeph/XrdCephBuffers/BufferUtils.hh new file mode 100644 index 000000000..0b4caee26 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/BufferUtils.hh @@ -0,0 +1,152 @@ +#ifndef __CEPH_BUFFER_UTILS_HH__ +#define __CEPH_BUFFER_UTILS_HH__ + +// holder of various small utility classes for debugging, profiling, logging, and general stuff + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// basic logging +// #TODO; merge this into the xrootd logging, when xrootd is available +#define CEPHBUFDEBUG 1 +#ifdef CEPHBUFDEBUG +extern std::mutex cephbuf_iolock; +#define BUFLOG(x) 
{std::unique_lock(cephbuf_iolock); std::stringstream _bs; _bs << x; std::clog << _bs.str() << std::endl;} +#else +#define BUFLOG(x) +#endif + +namespace XrdCephBuffer +{ + + + class Timer_ns + { + /** + * @brief RAII based timer information outputing a long value of ns + * Almost trivial class to time something and to pass the duration as a long + * to an output variable (specified in the constructor) at destruction. + * Create the object to start the timer. The timer stops when its destructor is called. + * #TODO improve to template the output type and the time ratio + */ + public: + explicit Timer_ns(long &output_ns); + ~Timer_ns(); + + private: + std::chrono::steady_clock::time_point m_start; + long &m_output_val; //!< reference to the external variable to store the output. + + }; //Timer_ns + + + + class Extent + { + /** + * @brief Ecapsulates an offsets and length, with added functionaliyu + * Class that represents an offset possition and a length. + * Simplest usecase is to avoid passing two values around, however this class + * provides additional funcationality for manipulation of extends (e.g. merging, splitting) + * which may prove useful. + */ + + public: + Extent(off_t offset, size_t len) : m_offset(offset), m_len(len){} + inline off_t offset() const { return m_offset; } + inline size_t len() const { return m_len; } + inline off_t begin() const { return m_offset; } //!< Same as offset, but a bit more stl container like + inline off_t end() const { return m_offset + m_len; } //!< similar to stl vector end. 
+ inline bool empty() const {return m_len == 0;} + + /** + * Does the start of the rhs continue directly from the + * end of this Extent + */ + bool isContiguous(const Extent& rhs) const; + + inline off_t last_pos() const { return m_offset + m_len - 1; } //!< last real position + + bool in_extent(off_t pos) const; //!< is this position within the range of this extent + bool allInExtent(off_t pos, size_t len) const; //!< is all the range in this extent + bool someInExtent(off_t pos, size_t len) const; //!< is some of the range in this extent + + Extent containedExtent(off_t pos, size_t len) const; //!< return the subset of range that is in this extent + Extent containedExtent(const Extent &in) const; //!< + + bool operator<(const Extent &rhs) const; + bool operator==(const Extent &rhs) const; + + + private: + off_t m_offset; + size_t m_len; + }; + + /** + * @brief Container defintion for Extents + * Typedef to provide a container of extents as a simple stl vector container + */ + typedef std::vector ExtentContainer; + + /** + * @brief Designed to hold individual extents, but itself provide Extent-like capabilities + * Useful in cases of combining extends, or needing to hold a range of extends and extract + * information about (or aggregated from) the contained objects. + * Could be useful to inherit from Extent if improvements needed. + * + * + */ + class ExtentHolder { + // holder of a list of extent objects + public: + ExtentHolder(); + explicit ExtentHolder(size_t elements); //!< reserve memory only + explicit ExtentHolder(const ExtentContainer& extents); + ~ExtentHolder(); + + off_t begin() const {return m_begin;} + off_t end() const {return m_end;} + size_t len() const {return m_end - m_begin;} //! 
Total range in bytes of the extents + + bool empty() const {return m_extents.empty();} + size_t size() const {return m_extents.size();} //!< number of extent elements + + Extent asExtent() const; // return an extent covering the whole range + + + size_t bytesContained() const; // number of bytes across the extent not considering overlaps! + size_t bytesMissing() const; // number of bytes missing across the extent, not considering overlaps! + + void push_back(const Extent & in); + void sort(); //!< inplace sort by offset of contained extents + + const ExtentContainer & extents() const {return m_extents;} + //ExtentContainer & extents() {return m_extents;} + + ExtentContainer getSortedExtents() const; + ExtentContainer getExtents() const; + + + + protected: + ExtentContainer m_extents; + + off_t m_begin{0}; //lowest offset value + off_t m_end{0}; // one past end of last byte used. + + }; + + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.cc new file mode 100644 index 000000000..0ba9caeca --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.cc @@ -0,0 +1,184 @@ +#include "CephIOAdapterAIORaw.hh" +#include "../XrdCephPosix.hh" +#include "XrdOuc/XrdOucEnv.hh" + +#include +#include +#include +#include +#include +#include +#include + +using namespace XrdCephBuffer; + +using myclock = std::chrono::steady_clock; +//using myseconds = std::chrono::durationResult = rc; + aiop->doneRead(); + } + static void aioWriteCallback(XrdSfsAio *aiop, size_t rc) + { + aiop->Result = rc; + aiop->doneWrite(); + } + +} // anonymous namespace + +CephBufSfsAio::CephBufSfsAio() : m_lock(m_mutex) +{ +} + +void CephBufSfsAio::doneRead() +{ + //BUFLOG("DoneRead"); + m_dataOpDone = true; + m_lock.unlock(); + m_condVar.notify_all(); +} + +void CephBufSfsAio::doneWrite() +{ + //BUFLOG("DoneWrite"); + m_dataOpDone = true; + m_lock.unlock(); + m_condVar.notify_all(); +} + 
+CephIOAdapterAIORaw::CephIOAdapterAIORaw(IXrdCephBufferData *bufferdata, int fd) : m_bufferdata(bufferdata), m_fd(fd) +{ +} + +CephIOAdapterAIORaw::~CephIOAdapterAIORaw() +{ + // nothing specific to do; just print out some stats (timers are accumulated in whole ms) + float read_speed{0}, write_speed{0}; + if (m_stats_read_timer.load() > 0) { // guard the divisor: sub-ms operations leave the ms timer at 0 + read_speed = m_stats_read_bytes.load() * 1e-3 / m_stats_read_timer.load(); // floating-point bytes/ms * 1e-3 + } + if (m_stats_write_timer.load() > 0) { // guard the divisor: sub-ms operations leave the ms timer at 0 + write_speed = m_stats_write_bytes.load() * 1e-3 / m_stats_write_timer.load(); // floating-point bytes/ms * 1e-3 + } + BUFLOG("CephIOAdapterAIORaw::Summary fd:" << m_fd + << " nwrite:" << m_stats_write_req << " byteswritten:" << m_stats_write_bytes << " write_s:" + << m_stats_write_timer * 1e-3 << " writemax_s" << m_stats_write_longest * 1e-3 + << " write_MBs:" << write_speed + << " nread:" << m_stats_read_req << " bytesread:" << m_stats_read_bytes << " read_s:" + << m_stats_read_timer * 1e-3 << " readmax_s:" << m_stats_read_longest * 1e-3 + << " read_MBs:" << read_speed ); +} + +ssize_t CephIOAdapterAIORaw::write(off64_t offset, size_t count) +{ + void *buf = m_bufferdata->raw(); + if (!buf) { + BUFLOG("CephIOAdapterAIORaw::write null buffer was provided.") + return -EINVAL; + } + //BUFLOG("Make aio"); + std::unique_ptr aiop = std::unique_ptr(new CephBufSfsAio()); + aiocb &sfsAio = aiop->sfsAio; + // set the necessary parameters for the read, e.g. 
buffer pointer, offset and length + sfsAio.aio_buf = buf; + sfsAio.aio_nbytes = count; + sfsAio.aio_offset = offset; + // need the concrete object for the blocking / wait + CephBufSfsAio *ceph_aiop = dynamic_cast(aiop.get()); + + long dt_ns{0}; + ssize_t rc{0}; + { // brace is for timer RAII + XrdCephBuffer::Timer_ns timer(dt_ns); + rc = ceph_aio_write(m_fd, aiop.get(), aioWriteCallback); + + if (rc < 0) { + BUFLOG("CephIOAdapterAIORaw::write ceph_aio_write returned rc:" << rc) + return rc; + } + + while (!ceph_aiop->isDone()) + { + ceph_aiop->m_condVar.wait(ceph_aiop->m_lock, std::bind(&CephBufSfsAio::isDone, ceph_aiop)); + } + } // timer brace + + // cleanup + rc = ceph_aiop->Result; + if (rc < 0) { + BUFLOG("CephIOAdapterAIORaw::write ceph_aiop->Result returned rc:" << rc) + } + + // BUFLOG("CephIOAdapterAIORaw::write fd:" << m_fd << " off:" + // << offset << " len:" << count << " rc:" << rc << " ms:" << dt_ns / 1000000); + + m_stats_write_longest = std::max(m_stats_write_longest, dt_ns / 1000000); + m_stats_write_timer.fetch_add(dt_ns / 1000000); + m_stats_write_bytes.fetch_add(rc); + ++m_stats_write_req; + return rc; +} + +ssize_t CephIOAdapterAIORaw::read(off64_t offset, size_t count) +{ + void *buf = m_bufferdata->raw(); + if (!buf) + { + BUFLOG("CephIOAdapterAIORaw::read null buffer was provided.") + return -EINVAL; + } + + std::unique_ptr aiop = std::unique_ptr(new CephBufSfsAio()); + aiocb &sfsAio = aiop->sfsAio; + // set the necessary parameters for the read, e.g. 
buffer pointer, offset and length + sfsAio.aio_buf = buf; + sfsAio.aio_nbytes = count; + sfsAio.aio_offset = offset; + // need the concrete object for the blocking / wait + CephBufSfsAio *ceph_aiop = dynamic_cast(aiop.get()); + + long dt_ns{0}; + ssize_t rc{0}; + { // timer brace RAII + XrdCephBuffer::Timer_ns timer(dt_ns); + // no check is made whether the buffer has sufficient capacity + // rc = ceph_posix_pread(m_fd,buf,count,offset); + //BUFLOG("Submit aio read: "); + rc = ceph_aio_read(m_fd, aiop.get(), aioReadCallback); + + if (rc < 0) + return rc; + + // now block until the read is done + // take the lock on the aio object + // while(!ceph_aiop->isDone()) { ceph_aiop->m_condVar.wait(lock,std::bind(&CephBufSfsAio::isDone,ceph_aiop) ); } + while (!ceph_aiop->isDone()) + { + ceph_aiop->m_condVar.wait(ceph_aiop->m_lock, std::bind(&CephBufSfsAio::isDone, ceph_aiop)); + } + } // timer brace + + // cleanup + rc = ceph_aiop->Result; + + m_stats_read_longest = std::max(m_stats_read_longest, dt_ns / 1000000); + m_stats_read_timer.fetch_add(dt_ns * 1e-6); + m_stats_read_bytes.fetch_add(rc); + ++m_stats_read_req; + + // BUFLOG("CephIOAdapterAIORaw::read fd:" << m_fd << " " << offset + // << " " << count << " " << rc << " " << dt_ns * 1e-6); + + if (rc >= 0) + { + m_bufferdata->setLength(rc); + m_bufferdata->setStartingOffset(offset); + m_bufferdata->setValid(true); + } + return rc; +} diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh b/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh new file mode 100644 index 000000000..a4d0f0c8b --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh @@ -0,0 +1,102 @@ +#ifndef __CEPH_IO_ADAPTER_AIORAW_HH__ +#define __CEPH_IO_ADAPTER_AIORAW_HH__ +//------------------------------------------------------------------------------ +// Interface of the logic part of the buffering +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. 
for different complexities of control. +// Couples loosely to IXrdCepgBufferData and anticipated to be called by XrdCephOssBufferedFile. +// Should managage all of the IO and logic to give XrdCephOssBufferedFile only simple commands to call. +// implementations are likely to use (via callbacks?) CephPosix library code for actual reads and writes. +//------------------------------------------------------------------------------ + +#include +#include "IXrdCephBufferData.hh" +#include "ICephIOAdapter.hh" +#include "BufferUtils.hh" + +#include +#include +#include +#include +#include + +#include "XrdSfs/XrdSfsAio.hh" + + +namespace XrdCephBuffer { + + class CephBufSfsAio : virtual public XrdSfsAio + { + public: + CephBufSfsAio(); + // Method to handle completed reads + // + virtual void doneRead() override; + + // Method to hand completed writes + // + virtual void doneWrite() override; + + // Method to recycle free object + // + virtual void Recycle() override{}; + std::mutex m_mutex; + std::unique_lock m_lock; + std::condition_variable m_condVar; + bool isDone() {return m_dataOpDone;} + + protected: + bool m_dataOpDone {false}; + + }; + +/** + * @brief Implements a non-async read and write to ceph via aio ceph_posix calls + * Using the standard ceph_posix_aio calls do the actual read and write operations. + * No ownership is taken on the buffer that's passed via the constructor + * Although using aio calls, we block here until the data has been read/written + */ +class CephIOAdapterAIORaw: public virtual ICephIOAdapter { + public: + CephIOAdapterAIORaw(IXrdCephBufferData * bufferdata, int fd); + virtual ~CephIOAdapterAIORaw(); + + /** + * @brief Take the data in the buffer and write to ceph at given offset + * Issues a ceph_posix_pwrite for data in the buffer (from pos 0) into + * ceph at position offset with len count. + * Returns -ve on error, else the number of bytes writen. 
+ * + * @param offset + * @param count + * @return ssize_t + */ + virtual ssize_t write(off64_t offset,size_t count) override; + + /** + * @brief Issue a ceph_posix_pread to read to the buffer data from file offset and len count. + * No range checking is currently provided here. The caller must provide sufficient space for the + * max len read. + * Returns -ve errorcode on failure, else the number of bytes returned. + * + * @param offset + * @param count + * @return ssize_t + */ + virtual ssize_t read(off64_t offset,size_t count) override; + + private: + IXrdCephBufferData * m_bufferdata; //!< no ownership of pointer (consider shared ptrs, etc) + int m_fd; + + // timer and counter info + std::atomic< long> m_stats_read_timer{0}, m_stats_write_timer{0}; + std::atomic< long> m_stats_read_bytes{0}, m_stats_write_bytes{0}; + std::atomic< long> m_stats_read_req{0}, m_stats_write_req{0}; + long m_stats_read_longest{0}, m_stats_write_longest{0}; + +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc new file mode 100644 index 000000000..fae2a2669 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -0,0 +1,79 @@ +#include "CephIOAdapterRaw.hh" +#include "../XrdCephPosix.hh" +#include "XrdOuc/XrdOucEnv.hh" + +#include +#include +#include + +using namespace XrdCephBuffer; + +using myclock = std::chrono::steady_clock; +//using myseconds = std::chrono::durationraw(); + if (!buf) return -EINVAL; + + auto start = std::chrono::steady_clock::now(); + ssize_t rc = ceph_posix_pwrite(m_fd,buf,count,offset); + auto end = std::chrono::steady_clock::now(); + auto int_ms = std::chrono::duration_cast(end-start); + + BUFLOG("CephIOAdapterRaw::write fd:" << m_fd << " " << rc << " " + << offset << " " << count << " " << rc << " " << int_ms.count() ); + + if (rc < 0) return rc; + m_stats_write_longest = std::max(m_stats_write_longest,int_ms.count()); + m_stats_write_timer.fetch_add(int_ms.count()); + 
m_stats_write_bytes.fetch_add(rc); + ++m_stats_write_req; + return rc; +} + + +ssize_t CephIOAdapterRaw::read(off64_t offset, size_t count) { + void* buf = m_bufferdata->raw(); + if (!buf) { + return -EINVAL; + } + + // no check is made whether the buffer has sufficient capacity + auto start = std::chrono::steady_clock::now(); + ssize_t rc = ceph_posix_pread(m_fd,buf,count,offset); + auto end = std::chrono::steady_clock::now(); + //auto elapsed = end-start; + auto int_ms = std::chrono::duration_cast(end-start); + + if (rc < 0) return rc; + + m_stats_read_longest = std::max(m_stats_read_longest,int_ms.count()); + m_stats_read_timer.fetch_add(int_ms.count()); + m_stats_read_bytes.fetch_add(rc); + ++m_stats_read_req; + + BUFLOG("CephIOAdapterRaw::read fd:" << m_fd << " " << rc << " " << offset + << " " << count << " " << rc << " " << int_ms.count() ); + + if (rc>=0) { + m_bufferdata->setLength(rc); + m_bufferdata->setStartingOffset(offset); + m_bufferdata->setValid(true); + } + return rc; +} + diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh new file mode 100644 index 000000000..3c7011ef7 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh @@ -0,0 +1,71 @@ +#ifndef __CEPH_IO_ADAPTER_RAW_HH__ +#define __CEPH_IO_ADAPTER_RAW_HH__ +//------------------------------------------------------------------------------ +// Interface of the logic part of the buffering +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. for different complexities of control. +// Couples loosely to IXrdCepgBufferData and anticipated to be called by XrdCephOssBufferedFile. +// Should managage all of the IO and logic to give XrdCephOssBufferedFile only simple commands to call. +// implementations are likely to use (via callbacks?) CephPosix library code for actual reads and writes. 
+//------------------------------------------------------------------------------ + +#include +#include "IXrdCephBufferData.hh" +#include "ICephIOAdapter.hh" +#include "BufferUtils.hh" + +#include +#include +#include + +namespace XrdCephBuffer { + +/** + * @brief Implements a non-async read and write to ceph via ceph_posix calls + * Using the standard ceph_posix_ calls do the actual read and write operations. + * No ownership is taken on the buffer that's passed via the constructor + */ +class CephIOAdapterRaw: public virtual ICephIOAdapter { + public: + CephIOAdapterRaw(IXrdCephBufferData * bufferdata, int fd); + virtual ~CephIOAdapterRaw(); + + /** + * @brief Take the data in the buffer and write to ceph at given offset + * Issues a ceph_posix_pwrite for data in the buffer (from pos 0) into + * ceph at position offset with len count. + * Returns -ve on error, else the number of bytes writen. + * + * @param offset + * @param count + * @return ssize_t + */ + virtual ssize_t write(off64_t offset,size_t count) override; + + /** + * @brief Issue a ceph_posix_pread to read to the buffer data from file offset and len count. + * No range checking is currently provided here. The caller must provide sufficient space for the + * max len read. + * Returns -ve errorcode on failure, else the number of bytes returned. 
+ * + * @param offset + * @param count + * @return ssize_t + */ + virtual ssize_t read(off64_t offset,size_t count) override; + + private: + IXrdCephBufferData * m_bufferdata; //!< no ownership of pointer (consider shared ptrs, etc) + int m_fd; + + // timer and counter info + std::atomic< long> m_stats_read_timer{0}, m_stats_write_timer{0}; + std::atomic< long> m_stats_read_bytes{0}, m_stats_write_bytes{0}; + std::atomic< long> m_stats_read_req{0}, m_stats_write_req{0}; + long m_stats_read_longest{0}, m_stats_write_longest{0}; + +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/ICephIOAdapter.hh b/src/XrdCeph/XrdCephBuffers/ICephIOAdapter.hh new file mode 100644 index 000000000..1fb2c363f --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/ICephIOAdapter.hh @@ -0,0 +1,35 @@ +#ifndef __ICEPH_IO_ADAPTER_HH__ +#define __ICEPH_IO_ADAPTER_HH__ +//------------------------------------------------------------------------------ +// Interface of the logic part of the buffering +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. for different complexities of control. +// Couples loosely to IXrdCepgBufferData and anticipated to be called by XrdCephOssBufferedFile. +// Should managage all of the IO and logic to give XrdCephOssBufferedFile only simple commands to call. +// implementations are likely to use (via callbacks?) CephPosix library code for actual reads and writes. +//------------------------------------------------------------------------------ + +#include +#include "IXrdCephBufferData.hh" + +namespace XrdCephBuffer { + +/** + * @brief Manage the actual IO operations that read and write the data into Ceph via librados striper. + * Likely to be provided with a buffer in the concreate implementation's constructor. + * Attempt to decouple the low level IO operations from the buffer implementation. + * However, ight coupling might be strictly necessary, making this class a bit redundant. 
+ * Consider to refactor if this proves to be the case ... + * + */ +class ICephIOAdapter { + public: + virtual ~ICephIOAdapter() {} + virtual ssize_t write(off64_t offset,size_t count) = 0; //!< write from buffer into ceph + virtual ssize_t read(off64_t offset,size_t count) = 0; //!< read from ceph into the buffer + +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh b/src/XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh new file mode 100644 index 000000000..432273fa4 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh @@ -0,0 +1,47 @@ +#ifndef __IXRD_CEPH_BUFFER_ALG_HH__ +#define __IXRD_CEPH_BUFFER_ALG_HH__ +//------------------------------------------------------------------------------ +// Interface of the logic part of the buffering +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. for different complexities of control. +// Couples loosely to IXrdCepgBufferData and anticipated to be called by XrdCephOssBufferedFile. +// Should managage all of the IO and logic to give XrdCephOssBufferedFile only simple commands to call. +// implementations are likely to use (via callbacks?) CephPosix library code for actual reads and writes. +//------------------------------------------------------------------------------ + +#include +#include "IXrdCephBufferData.hh" +#include "ICephIOAdapter.hh" + +class XrdSfsAio; + +namespace XrdCephBuffer { + +/** + * @brief Interface to a holder of the main logic decisions of the buffering algortithm, decoupled from the buffer resource itself. + * Main work of the buffering is done in the classes that inherit from the interace, of how and when and why to buffer and flush the data + * The physical representation of the buffer is not written here to allow for some flexibility of changing the internals of the buffer if needed. + * Anticipate that a non-async and async will be the main distinct use cases. 
+ */ +class IXrdCephBufferAlg { + public: + virtual ~IXrdCephBufferAlg() {} + + virtual ssize_t read_aio (XrdSfsAio *aoip) = 0; //!< possible aio based code + virtual ssize_t write_aio(XrdSfsAio *aoip) = 0; //!< possible aio based code + + virtual ssize_t read (volatile void *buff, off_t offset, size_t blen) = 0; //!< read data through the buffer + virtual ssize_t write(const void *buff, off_t offset, size_t blen) = 0; //!< write data through the buffer + virtual ssize_t flushWriteCache() = 0; //!< remember to flush the cache on final writes + + + protected: + + + private: + +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh b/src/XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh new file mode 100644 index 000000000..2b242b0d5 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh @@ -0,0 +1,46 @@ +#ifndef __IXRD_CEPH_BUFFER_DATA_HH__ +#define __IXRD_CEPH_BUFFER_DATA_HH__ +//------------------------------------------------------------------------------ +// Interface to the actual buffer data object used to store the data +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. if choice of buffer data object +//------------------------------------------------------------------------------ + +#include + +namespace XrdCephBuffer { + +/** + * @brief Interface to the Buffer's physical representation. + * Allow an interface to encapsulate the requirements of a buffer's memory, without worrying about the details. + * Various options exist for the specific buffer implemented, and are left to the sub-classes. + */ +class IXrdCephBufferData { + public: + virtual ~IXrdCephBufferData(){} + virtual size_t capacity() const = 0;//! total available space + virtual size_t length() const = 0;//! Currently occupied and valid space, which may be less than capacity + virtual void setLength(size_t len) =0 ;//! 
Currently occupied and valid space, which may be less than capacity + virtual bool isValid() const =0; + virtual void setValid(bool isValid) =0; + + virtual off_t startingOffset() const = 0; + virtual off_t setStartingOffset(off_t offset) = 0; + + virtual ssize_t invalidate() = 0; //! set cache into an invalid state + + virtual ssize_t readBuffer(void* buf, off_t offset, size_t blen) const = 0; //! copy data from the internal buffer to buf + + virtual ssize_t writeBuffer(const void* buf, off_t offset, size_t blen,off_t externalOffset) = 0; //! write data into the buffer, store the external offset + + virtual const void* raw() const = 0; // const accessor to the 'raw' or underlying object + virtual void* raw() = 0; // accessor to the 'raw' or underlying object + + + protected: + +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh b/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh new file mode 100644 index 000000000..18d6c1ae5 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh @@ -0,0 +1,45 @@ +#ifndef __IXRD_CEPH_READV_ADAPTER_HH__ +#define __IXRD_CEPH_READV_ADAPTER_HH__ +//------------------------------------------------------------------------------ +// Interface to the actual buffer data object used to store the data +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. if choice of buffer data object +//------------------------------------------------------------------------------ + +#include +#include + +#include "BufferUtils.hh" + +#include // #FIXME remove + +namespace XrdCephBuffer +{ + + /** + * @brief Interface to the logic of dealing with readV requests + */ + class IXrdCephReadVAdapter + { + public: + virtual ~IXrdCephReadVAdapter() {} + + /** + * @brief Take in a set of extents representing the readV requests. return a vector of each combined read request. + * Caller translates the readV request into a set of Extents (passed to an ExtentHolder). 
+ * The logic of the specific concrete implementation combines the set of readV requests into merged requests. + * Output is a vector of those requests. Each ExtentHolder element holds the offset and len to be read, and also + * the contained extents of the readVs. + * The index of the readV element is not held, so the caller must ensure to match up appropriately. + * + * @param extentsIn + * @return std::vector + */ + virtual std::vector convert(const ExtentHolder &extentsIn) const =0; + + protected: + }; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc new file mode 100644 index 000000000..fa6de1ddd --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc @@ -0,0 +1,349 @@ +//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ + +#include +#include "XrdCephBufferAlgSimple.hh" + +#include "../XrdCephPosix.hh" +#include +#include +#include +#include + +#include "XrdSfs/XrdSfsAio.hh" + + +using namespace XrdCephBuffer; + + +XrdCephBufferAlgSimple::XrdCephBufferAlgSimple(std::unique_ptr buffer, std::unique_ptr cephio, int fd ): +m_bufferdata(std::move(buffer)), m_cephio(std::move(cephio)), m_fd(fd){ + +} + +XrdCephBufferAlgSimple::~XrdCephBufferAlgSimple() { + BUFLOG("XrdCephBufferAlgSimple::Destructor fd:" << m_fd); + m_fd = -1; +} + + +ssize_t XrdCephBufferAlgSimple::read_aio (XrdSfsAio *aoip) { + // Currently this is not supported, and callers using this should recieve the appropriate error code + //return -ENOSYS; + + ssize_t rc(-ENOSYS); + if (!aoip) { + return -EINVAL; + } + + volatile void * buf = aoip->sfsAio.aio_buf; + size_t blen = aoip->sfsAio.aio_nbytes; + off_t offset = aoip->sfsAio.aio_offset; + + // translate the aio read into a simple sync read. 
+ // hopefully don't get too many out of sequence reads to affect the caching + rc = read(buf, offset, blen); + + aoip->Result = rc; + aoip->doneRead(); + + return rc; + +} + +ssize_t XrdCephBufferAlgSimple::write_aio(XrdSfsAio *aoip) { + // True aio is not supported: the request is serviced by the blocking write() below and doneWrite() is signalled before returning + // return -ENOSYS; + + ssize_t rc(-ENOSYS); + if (!aoip) { + return -EINVAL; + } + + // volatile void * buf = aoip->sfsAio.aio_buf; + // size_t blen = aoip->sfsAio.aio_nbytes; + // off_t offset = aoip->sfsAio.aio_offset; + size_t blen = aoip->sfsAio.aio_nbytes; + off_t offset = aoip->sfsAio.aio_offset; + + rc = write(const_cast(aoip->sfsAio.aio_buf), offset, blen); + aoip->Result = rc; + aoip->doneWrite(); + return rc; + +} + + +ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t blen) { + // Set a lock for any attempt at a simultaneous operation + // Use recursive, as flushCache also calls the lock and don't want to deadlock + // No call to flushCache should happen in a read, but be consistent + const std::lock_guard lock(m_data_mutex); // + + //BUFLOG("XrdCephBufferAlgSimple::read: " << offset << " " << blen); + if (blen == 0) return 0; + + /** + * If the requested read is larger than the buffer size, just bypass the cache. + * Invalidate the cache in any case + */ + if (blen >= m_bufferdata->capacity()) { + //BUFLOG("XrdCephBufferAlgSimple::read: Readthrough cache: fd: " << m_fd + // << " " << offset << " " << blen); + // larger than cache, so read through, and invalidate the cache anyway + m_bufferdata->invalidate(); + // #FIXME JW: const_cast is probably a bit poor. 
+ return ceph_posix_pread(m_fd, const_cast(buf), blen, offset); + } + + ssize_t rc(-1); + size_t bytesRemaining = blen; // track how many bytes still need to be read + off_t offsetDelta = 0; + size_t bytesRead = 0; + /** + * In principle, only should ever have the first loop, however, in the case a read request + * passes over the boundary of the buffer, two reads will be needed; the first to read + * out the current buffer, and a second, to read the partial data from the refilled buffer + */ + while (bytesRemaining > 0) { + bool loadCache = false; + // run some checks to see if we need to fill the cache. + if (m_bufferLength == 0) { + // no data in buffer + loadCache = true; + } else if (offset < m_bufferStartingOffset) { + // offset before any cache data + loadCache = true; + } else if (offset >= (off_t) (m_bufferStartingOffset + m_bufferLength) ) { + // offset is beyond the stored data + loadCache = true; + } + + /** + * @brief If we need to load data in the cache, do it here. + * + */ + if (loadCache) { + m_bufferdata->invalidate(); + rc = m_cephio->read(offset + offsetDelta, m_bufferdata->capacity()); // fill the cache + //BUFLOG("LoadCache ReadToCache: " << rc << " " << offset + offsetDelta << " " << m_bufferdata->capacity() ); + if (rc < 0) { + BUFLOG("LoadCache Error: " << rc); + return rc;// TODO return correct errors + } + m_bufferStartingOffset = offset + offsetDelta; + m_bufferLength = rc; + if (rc == 0) { + // We should be at the end of file, with nothing more to read, and nothing that could be returned + // break out of the loop. 
+ break; + } + } + + //now read as much data as possible + off_t bufPosition = offset - m_bufferStartingOffset + offsetDelta; + rc = m_bufferdata->readBuffer( (void*) &(((char*)buf)[offsetDelta]) , bufPosition + offsetDelta , bytesRemaining); + if (rc < 0 ) { + BUFLOG("Reading from Cache Failed: " << rc << " " << offsetDelta << " " << bytesRemaining ); + return rc; // TODO return correct errors + } + if (rc == 0) { + // no bytes returned; much be at end of file + //BUFLOG("No bytes returned: " << rc << " " << offset << " + " << offsetDelta << "; " << blen << " : " << bytesRemaining); + break; // leave the loop even though bytesremaing is probably >=0. + //i.e. requested a full buffers worth, but only a fraction of the file is here. + } + + //BUFLOG("End of loop: " << rc << " " << offset << " + " << offsetDelta << "; " << blen << " : " << bytesRemaining); + offsetDelta += rc; + bytesRemaining -= rc; + bytesRead += rc; + + } // while bytesremaing + + return bytesRead; +} + +ssize_t XrdCephBufferAlgSimple::write (const void *buf, off_t offset, size_t blen) { + // Set a lock for any attempt at a simultaneous operation + // Use recursive, as flushCache also calls the lock and don't want to deadlock + const std::lock_guard lock(m_data_mutex); + + // take the data in buf and put it into the cache; when the cache is full, write to underlying storage + // remember to flush the cache at the end of operations ... + ssize_t rc(-1); + ssize_t bytesWrittenToStorage(0); + + if (blen == 0) { + return 0; // nothing to write; are we done? + } + + /** + * We expect the next write to be in order and well defined. + * Determine the expected offset, and compare against offset provided + * Expected offset is the end of the buffer. 
+ * m_bufferStartingOffset is the offset in ceph that buffer[0] maps to + */ + off_t expected_offset = (off_t)(m_bufferStartingOffset + m_bufferLength); + + if ((offset != expected_offset) && (m_bufferLength > 0) ) { + // for the moment we just log that there is some non expected offset value + // TODO, might be dangerous to flush the cache on non-aligned writes ... + BUFLOG("Non expected offset: " << rc << " " << offset << " " << expected_offset); + // rc = flushWriteCache(); + // if (rc < 0) { + // return rc; // TODO return correct errors + // } + } // mismatched offset + + //! We should be equally careful if the offset of the buffer start is not aligned sensibly. + //! Log this only for now, but #TODO, this should become an error condition, for over-cautious behaviour. + if ( (m_bufferStartingOffset % m_bufferdata->capacity()) != 0 ) { + BUFLOG(" Non aligned offset?" << m_bufferStartingOffset << " " + << m_bufferdata->capacity() << " " << m_bufferStartingOffset % m_bufferdata->capacity() ); + } + + // Commented out below. It would be good to pass writes, which are larger than the buffer size, + // straight-through. However if the ranges are not well aligned, this could be an issue. + // And, what then to do about a possible partially filled buffer? + + // if (blen >= m_bufferdata->capacity()) { + // // TODO, might be dangerous to flush the cache on non-aligned writes ... 
+ // // flush the cache now, if needed + // rc = flushWriteCache(); + // if (rc < 0) { + // return rc; // TODO return correct errors + // } + // bytesWrittenToStorage += rc; + + // // Size is larger than the buffer; send the write straight through + // std::clog << "XrdCephBufferAlgSimple::write: Readthrough cache: fd: " << m_fd + // << " " << offset << " " << blen << std::endl; + // // larger than cache, so read through, and invalidate the cache anyway + // m_bufferdata->invalidate(); + // m_bufferLength=0; + // m_bufferStartingOffset=0; + // rc = ceph_posix_pwrite(m_fd, buf, blen, offset); + // if (rc < 0) { + // return rc; // TODO return correct errors + // } + // bytesWrittenToStorage += rc; + // return rc; + // } + + /** + * @brief Provide some sanity checking for the write to the buffer. + * We call an error on this conditions as there is no immediate solution that is satisfactory. + */ + if ((offset != expected_offset) && (m_bufferLength > 0) ) { + BUFLOG("Error trying to write out of order: expeted at: " << expected_offset + << " got offset" << offset << " of len " << blen); + return -EINVAL; + } + if (offset < 0) { + BUFLOG("Got a negative offset: " << offset); + return -EINVAL; + } + + + size_t bytesRemaining = blen; //!< track how many bytes left to write + size_t bytesWritten = 0; + off_t bufferOffset = m_bufferLength; // position to append data in the buffer, i.e. the end of the buffer + + /** Typically would expect only one loop, i.e. the write request is smaller than the buffer. 
+ * If bigger, or the request straddles the end of the buffer, will need another loop + */ + while (bytesRemaining > 0) { + /** + * If the cache is already full, let's flush to disk now + */ + if (m_bufferLength == m_bufferdata->capacity()) { + rc = flushWriteCache(); + if (rc < 0) { + return rc; + } + bytesWrittenToStorage += rc; + } // at capacity; + + if (m_bufferLength == 0) { + // cache is currently empty, so set the 'reference' to the external offset now + m_bufferStartingOffset = offset + bytesWritten; + } + //add data to the cache from buf, from buf[bytesWritten] to the cache at position bufferOffset + // make sure to write only as many bytes as left in the cache. + size_t nBytesToWrite = std::min(bytesRemaining, m_bufferdata->capacity()-m_bufferLength); + const void* bufAtOffset = (void*)((char*)buf + bytesWritten); // nasty cast as void* doesn't do arithmetic + if (nBytesToWrite == 0) { + BUFLOG( "Wanting to write 0 bytes; why is that?"); + } + rc = m_bufferdata->writeBuffer(bufAtOffset, bufferOffset, nBytesToWrite, 0); + if (rc < 0) { + BUFLOG( "WriteBuffer step failed: " << rc << " " << bufferOffset << " " << blen << " " << offset ); + return rc; // pass the error condition upwards + } + if (rc != (ssize_t)nBytesToWrite) { + BUFLOG( "WriteBuffer returned unexpected number of bytes: " << rc << " Expected: " << nBytesToWrite << " " + << bufferOffset << " " << blen << " " << offset ); + return -EBADE; // is bad exchange error best errno here? 
+ } + + // lots of repetition here; #TODO try to reduce + m_bufferLength += rc; + bufferOffset += rc; + bytesWritten += rc; + bytesRemaining -= rc; + + } // while byteRemaining + + /** + * @brief Check again if we can write data into the storage + */ + if (m_bufferLength == m_bufferdata->capacity()){ + rc = flushWriteCache(); + if (rc < 0) + { + return rc; // TODO return correct errors + } + bytesWrittenToStorage += rc; + } // at capacity; + + //BUFLOG( "WriteBuffer " << bytesWritten << " " << bytesWrittenToStorage << " " << offset << " " << blen << " " ); + return bytesWritten; +} + + + +ssize_t XrdCephBufferAlgSimple::flushWriteCache() { + // Set a lock for any attempt at a simultaneous operation + // Use recursive, as write (and read) also calls the lock and don't want to deadlock + const std::lock_guard lock(m_data_mutex); // + // BUFLOG("flushWriteCache: " << m_bufferStartingOffset << " " << m_bufferLength); + ssize_t rc(-1); + if (m_bufferLength == 0) { + BUFLOG("Empty buffer to flush: "); + rc = 0; // not an issue + } + + if (m_bufferLength > 0) { + rc = m_cephio->write(m_bufferStartingOffset, m_bufferLength); + if (rc < 0) { + BUFLOG("WriteBuffer write step failed: " << rc); + } + } // some bytes to write + + // reset values + m_bufferLength=0; + m_bufferStartingOffset=0; + m_bufferdata->invalidate(); + // return bytes written, or errorcode if failure + return rc; +} + + +ssize_t XrdCephBufferAlgSimple::rawRead (void *buf, off_t offset, size_t blen) { + return -ENOSYS; +} + +ssize_t XrdCephBufferAlgSimple::rawWrite(void *buf, off_t offset, size_t blen) { + return -ENOSYS; +} diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh new file mode 100644 index 000000000..3f30aa38a --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh @@ -0,0 +1,61 @@ +#ifndef __XRD_CEPH_BUFFER_ALG_SIMPLE_HH__ +#define __XRD_CEPH_BUFFER_ALG_SIMPLE_HH__ 
+//------------------------------------------------------------------------------ +// Implementation of the logic section of buffer code +//------------------------------------------------------------------------------ + +#include +#include +#include + +#include "IXrdCephBufferAlg.hh" +#include "ICephIOAdapter.hh" +#include "BufferUtils.hh" + + +namespace XrdCephBuffer { + +/** Non-async buffering code for non-aio read operations. + * Create a single buffer of a given size. + * For reads, if data in the buffer read and return the available bytes; + * if no useful data in the buffer fill the full buffer and return the requested read. + * If the data is partially in the buffer for the range requested, return only that subset; + * client should check and make an additional call for the data not returned. + * if 0 bytes are returned, it should be assumed it is at the end of the file. + */ + +class XrdCephBufferAlgSimple : public virtual IXrdCephBufferAlg { + public: + XrdCephBufferAlgSimple(std::unique_ptr buffer, std::unique_ptr cephio, int fd ); + virtual ~XrdCephBufferAlgSimple(); + + virtual ssize_t read_aio (XrdSfsAio *aoip) override; + virtual ssize_t write_aio(XrdSfsAio *aoip) override; + + + virtual ssize_t read (volatile void *buff, off_t offset, size_t blen) override; + virtual ssize_t write(const void *buff, off_t offset, size_t blen) override; + virtual ssize_t flushWriteCache() override; + + // #REVIEW + virtual const IXrdCephBufferData *buffer() const {return m_bufferdata.get();} + virtual IXrdCephBufferData *buffer() {return m_bufferdata.get();} + + protected: + virtual ssize_t rawRead (void *buff, off_t offset, size_t blen) ; // read from the storage, at its offset + virtual ssize_t rawWrite(void *buff, off_t offset, size_t blen) ; // write to the storage, to its offset posiiton + + private: + std::unique_ptr m_bufferdata; //! 
this algorithm takes ownership of the buffer, and will delete it on destruction + std::unique_ptr m_cephio ; // no ownership is taken here + int m_fd = -1; + + off_t m_bufferStartingOffset = 0; + size_t m_bufferLength = 0; + + std::recursive_mutex m_data_mutex; // any data access method on the buffer will use this +}; + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc new file mode 100644 index 000000000..457334c2a --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc @@ -0,0 +1,171 @@ +//------------------------------------------------------------------------------ +//! is a simple implementation of IXrdCephBufferData using std::vector representation for the buffer +//------------------------------------------------------------------------------ + +#include "XrdCephBufferDataSimple.hh" +#include "BufferUtils.hh" +//#include "XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh" +#include +#include +#include +#include +#include +#include + + +using namespace XrdCephBuffer; + +std::atomic XrdCephBufferDataSimple::m_total_memory_used {0}; //!< total memory of all these buffers +std::atomic XrdCephBufferDataSimple::m_total_memory_nbuffers {0}; //!< total number of buffers actively open + + + +XrdCephBufferDataSimple::XrdCephBufferDataSimple(size_t bufCapacity): + m_buffer(bufCapacity,0), m_externalOffset(0),m_bufLength(0) { + m_valid = true; + + // update global statistics + m_total_memory_used.fetch_add(bufCapacity); + ++m_total_memory_nbuffers; + BUFLOG("XrdCephBufferDataSimple: Global: " << m_total_memory_nbuffers.load() << " " << m_total_memory_used.load()); +} + +XrdCephBufferDataSimple::~XrdCephBufferDataSimple() { + m_valid = false; + auto cap = m_buffer.capacity(); + m_buffer.clear(); + m_buffer.reserve(0); // just to be paranoid and realse memory immediately + + // update global statistics + m_total_memory_used.fetch_add(-cap); + --m_total_memory_nbuffers; + 
BUFLOG("XrdCephBufferDataSimple~: Global: " << m_total_memory_nbuffers.load() << " " << m_total_memory_used.load()); + +} + + +size_t XrdCephBufferDataSimple::capacity() const { + return m_buffer.capacity(); +} + +size_t XrdCephBufferDataSimple::length() const { + return m_bufLength; +} +void XrdCephBufferDataSimple::setLength(size_t len) { + m_bufLength = len; +} +bool XrdCephBufferDataSimple::isValid() const { + return m_valid; +} +void XrdCephBufferDataSimple::setValid(bool isValid) { + m_valid = isValid; +} + + +off_t XrdCephBufferDataSimple::startingOffset() const { + return m_externalOffset; +} +off_t XrdCephBufferDataSimple::setStartingOffset(off_t offset) { + m_externalOffset = offset; + return m_externalOffset; +} + +ssize_t XrdCephBufferDataSimple::invalidate() { + m_externalOffset = 0; + m_bufLength = 0; + m_valid = false; + //m_buffer.clear(); // do we really need to clear the elements ? + return 0; +} + + + +ssize_t XrdCephBufferDataSimple::readBuffer(void* buf, off_t offset, size_t blen) const { + // read from the internal buffer to buf (at pos 0), from offset for blen, or max length possible + // returns -ve value on error, else the actual number of bytes read + + if (!m_valid) { + return -EINVAL; + } + if (offset < 0) { + return -EINVAL; + } + if (offset > (ssize_t) m_bufLength) { + return 0; + } + ssize_t readlength = blen; + if (offset + blen > m_bufLength) { + readlength = m_bufLength - offset; + } + //std::cout << readlength << " " << blen << " " << m_bufLength << " " << offset << std::endl; + if (readlength <0) { + return -EINVAL; + } + + if (readlength == 0) { + return 0; + } + + const char* rawbufstart = m_buffer.data(); + + long int_ns{0}; + {auto t = Timer_ns(int_ns); + // std::copy(rawbufstart + offset, rawbufstart+offset+readlength, reinterpret_cast(buf) ); + memcpy(reinterpret_cast(buf), rawbufstart + offset, readlength); + } // end Timer + // BUFLOG("XrdCephBufferDataSimple::readBuffer: " << offset << " " << readlength << " " << int_ns 
); + + return readlength; +} + + +ssize_t XrdCephBufferDataSimple::writeBuffer(const void* buf, off_t offset, size_t blen, off_t externalOffset) { + // write data from buf (from pos 0), with length blen, into the buffer at position offset (local to the internal buffer) + + // #TODO Add test to see if it's in use + //invalidate(); + + if (offset < 0) { + BUFLOG("XrdCephBufferDataSimple::writeBuffer: offset <0"); + return -EINVAL; + } + + ssize_t cap = capacity(); + if ((ssize_t)blen > cap) { + BUFLOG("XrdCephBufferDataSimple::writeBuffer: blen > cap:" << blen << " > " << cap); + return -EINVAL; + } + if ((ssize_t)offset > cap) { + BUFLOG("XrdCephBufferDataSimple::writeBuffer: offset > cap:" << offset << " > " << cap); + return -EINVAL; + } + if (ssize_t(offset + blen) > cap) { + BUFLOG("XrdCephBufferDataSimple::writeBuffer: (offset + blen) > cap: (" << offset << " + " << blen << ") >" << cap); + return -EINVAL; + } + + // std::vector::iterator itstart = m_buffer.begin(); + size_t readBytes = blen; + char* rawbufstart = m_buffer.data(); + + + long int_ns{0}; + {auto t = Timer_ns(int_ns); // brace for timer start/stop scoping + //std::copy((char*)buf, (char*)buf +readBytes ,itstart + offset ); + memcpy(rawbufstart + offset, buf, readBytes); + + } // end Timer + + // BUFLOG("XrdCephBufferDataSimple::writeBuffer: " << offset << " " << readBytes << " " << int_ns); + + + + m_externalOffset = externalOffset; + // Decide to set the length of the maximum value that has be written + // note; unless invalidate is called, then this value may not be correctly set ... 
+ m_bufLength = std::max(offset+blen, m_bufLength); + m_valid = true; + + + return readBytes; +} diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh new file mode 100644 index 000000000..ac9b36d10 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh @@ -0,0 +1,66 @@ +#ifndef __XRD_CEPH_BUFFER_DATA_SIMPLE_HH__ +#define __XRD_CEPH_BUFFER_DATA_SIMPLE_HH__ +//------------------------------------------------------------------------------ +//! is a simple implementation of IXrdCephBufferData using std::vector representation for the buffer +//------------------------------------------------------------------------------ + +#include +#include "IXrdCephBufferData.hh" +#include "BufferUtils.hh" +#include +#include +#include + +namespace XrdCephBuffer { + +/** + * @brief Implementation of a buffer using a simple vector + * Simplest implementation of a buffer using vector for underlying memory. + * Capacity is reserved on construction and released back at destruction. + * Does very little itself, except to provide access methods + * + */ +class XrdCephBufferDataSimple : public virtual IXrdCephBufferData + { + public: + XrdCephBufferDataSimple(size_t bufCapacity); + virtual ~XrdCephBufferDataSimple(); + virtual size_t capacity() const override;//! total available space + virtual size_t length() const override;//! Currently occupied and valid space, which may be less than capacity + virtual void setLength(size_t len) override;//! Currently occupied and valid space, which may be less than capacity + virtual bool isValid() const override; + virtual void setValid(bool isValid) override; + + virtual off_t startingOffset() const override; + virtual off_t setStartingOffset(off_t offset) override; + + + virtual ssize_t readBuffer(void* buf, off_t offset, size_t blen) const override; //! copy data from the internal buffer to buf + + virtual ssize_t invalidate() override; //! 
set cache into an invalid state; do this before writes to be consistent + virtual ssize_t writeBuffer(const void* buf, off_t offset, size_t blen, off_t externalOffset=0) override; //! write data into the buffer, store the external offset if provided + + virtual const void* raw() const override {return capacity() > 0 ? &(m_buffer[0]) : nullptr;} + virtual void* raw() override {return capacity() > 0 ? &(m_buffer[0]) : nullptr;} + + + protected: + bool m_valid = false; + std::vector m_buffer; // actual physical buffer + off_t m_externalOffset = 0; //! what does the first byte of the buffer map to for external offsets + size_t m_bufLength = 0; //! length of valid stored data; might be less than the capacity + + // timer and counter info + std::atomic< long> m_stats_read_timer{0}, m_stats_write_timer{0}; + std::atomic< long> m_stats_read_bytes{0}, m_stats_write_bytes{0}; + std::atomic< long> m_stats_read_req{0}, m_stats_write_req{0}; + long m_stats_read_longest{0}, m_stats_write_longest{0}; + + // staric vars to store the total useage of memory across this class + static std::atomic m_total_memory_used; + static std::atomic m_total_memory_nbuffers; + +}; // XrdCephBufferDataSimple + +} // namespace +#endif diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc new file mode 100644 index 000000000..219d5cf75 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc @@ -0,0 +1,56 @@ + +#include "XrdCephReadVBasic.hh" +#include "BufferUtils.hh" + +using namespace XrdCephBuffer; + +std::vector XrdCephReadVBasic::convert(const ExtentHolder &extentsHolderInput) const +{ + std::vector outputs; + + const ExtentContainer &extentsIn = extentsHolderInput.extents(); + + ExtentContainer::const_iterator it = extentsIn.begin(); + while (it != extentsIn.end()) + { + ExtentHolder tmp; + int counter(0); + while (it != extentsIn.end()) { + tmp.push_back(*it); // just put it into an extent + ++it; + ++counter; + if (counter 
> 10 ) break; + } + // while (it != extentsIn.end()) + // { + // //std::clog << "XrdCephReadVBasic: Inner: " << it->begin() << " " << it->len() << std::endl; + // if (!tmp.size()) + // { + // tmp.push_back(*it); + // } + // else if (it->end() - tmp.begin() < (ssize_t)m_minSize) + // { + // tmp.push_back(*it); + // } + // else if (((tmp.bytesContained() + it->len()) / (tmp.len() + it->len())) > 0.6) + // { + // tmp.push_back(*it); + // } + // else if (it->end() - tmp.begin() >= (ssize_t)m_maxSize) + // { + // break; // don't make too big + // } + // else + // { + // break; // didn't fullful logic to include, so start a new extent in next loop + // } + // ++it; + // } + //BUFLOG("XrdCephReadVBasic: Done Inner: " << tmp.size()); + outputs.push_back(tmp); + } + BUFLOG("XrdCephReadVBasic: In size: " << extentsHolderInput.size() << " " + << extentsHolderInput.extents().size() << " " << outputs.size() ); + + return outputs; +} // convert diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh new file mode 100644 index 000000000..9ac912025 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh @@ -0,0 +1,42 @@ +#ifndef __IXRD_CEPH_READV_BASIC_HH__ +#define __IXRD_CEPH_READV_BASIC_HH__ +//------------------------------------------------------------------------------ +// Interface to the actual buffer data object used to store the data +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. if choice of buffer data object +//------------------------------------------------------------------------------ + +#include +#include + +#include "BufferUtils.hh" +#include "IXrdCephReadVAdapter.hh" + +namespace XrdCephBuffer +{ + + /** + * @brief Combine requests into single reads accoriding to some basic rules. + * Read a minimum amount of data (2MiB default), keep adding chunks until the used fraction is lower than some threshold, or 64MiB is reached. 
+ * Calling code unraveles the correct ranges for each + */ + + + class XrdCephReadVBasic : virtual public IXrdCephReadVAdapter { + // nothing more than readV in, and readV out + public: + XrdCephReadVBasic() {} + virtual ~XrdCephReadVBasic() {} + + virtual std::vector convert(const ExtentHolder &extentsHolderInput) const override; + + protected: + size_t m_minSize = 2*1024*1024; + size_t m_maxSize = 64*1024*1024; + }; + + + +} + +#endif diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc new file mode 100644 index 000000000..89bf323e7 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc @@ -0,0 +1,22 @@ + +#include "XrdCephReadVNoOp.hh" +#include "BufferUtils.hh" + +using namespace XrdCephBuffer; + +std::vector XrdCephReadVNoOp::convert(const ExtentHolder &extentsHolderInput) const +{ + std::vector outputs; + + const ExtentContainer &extentsIn = extentsHolderInput.extents(); + + for (ExtentContainer::const_iterator it = extentsIn.begin(); it != extentsIn.end(); ++it) + { + ExtentHolder tmp; + tmp.push_back(*it); + outputs.push_back(tmp); + } // for + // each element in the output contains one element, the + + return outputs; +} // convert diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh new file mode 100644 index 000000000..b19b29123 --- /dev/null +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh @@ -0,0 +1,38 @@ +#ifndef __IXRD_CEPH_READV_NOOP_HH__ +#define __IXRD_CEPH_READV_NOOP_HH__ +//------------------------------------------------------------------------------ +// Interface to the actual buffer data object used to store the data +// Intention to be able to abstract the underlying implementation and code against the inteface +// e.g. 
if choice of buffer data object +//------------------------------------------------------------------------------ + +#include +#include + +#include "BufferUtils.hh" +#include "IXrdCephReadVAdapter.hh" + +namespace XrdCephBuffer +{ + + /** + * @brief Passthrough implementation. Convertes the ReadV requests to extents and makes the request. + * Does not change how the readV implementation is done, just implements a version with Extents + * More for functionality testing, or to allow easier access to readV statistics. + */ + class XrdCephReadVNoOp : virtual public IXrdCephReadVAdapter { + // nothing more than readV in, and readV out + public: + XrdCephReadVNoOp() {} + virtual ~XrdCephReadVNoOp() {} + + virtual std::vector convert(const ExtentHolder &extentsHolderInput) const override; + + protected: + }; + + + +} + +#endif diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index 76bed8e74..e0c88c14c 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -41,6 +41,8 @@ #include "XrdCeph/XrdCephOss.hh" #include "XrdCeph/XrdCephOssDir.hh" #include "XrdCeph/XrdCephOssFile.hh" +#include "XrdCeph/XrdCephOssBufferedFile.hh" +#include "XrdCeph/XrdCephOssReadVFile.hh" XrdVERSIONINFO(XrdOssGetStorageSystem, XrdCephOss); @@ -144,7 +146,86 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { return 1; } } - } + if (!strncmp(var, "ceph.usebuffer", 14)) { // allowable values: 0, 1 + var = Config.GetWord(); + if (var) { + unsigned long value = strtoul(var, 0, 10); + if (value <= 1) { + m_configBufferEnable = value; + Eroute.Emsg("Config", "ceph.usebuffer",std::to_string(m_configBufferEnable).c_str()); + } else { + Eroute.Emsg("Config", "Invalid value for ceph.usebuffer in config file (must be 0 or 1)", configfn, var); + return 1; + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.usebuffer in config file", configfn); + return 1; + } + } // usebuffer + if (!strncmp(var, "ceph.buffersize", 15)) { // size in bytes + 
var = Config.GetWord(); + if (var) { + unsigned long value = strtoul(var, 0, 10); + if (value > 0 and value <= 1000000000L) { + m_configBufferSize = value; + Eroute.Emsg("Config", "ceph.buffersize", std::to_string(m_configBufferSize).c_str() ); + } else { + Eroute.Emsg("Config", "Invalid value for ceph.buffersize in config file; enter in bytes (no units)", configfn, var); + return 1; + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.buffersize in config file", configfn); + return 1; + } + } // buffersize + + if (!strncmp(var, "ceph.usereadv", 13)) { // allowable values: 0, 1 + var = Config.GetWord(); + if (var) { + unsigned long value = strtoul(var, 0, 10); + if (value <= 1) { + m_configReadVEnable = value; + Eroute.Emsg("Config", "ceph.usereadv",std::to_string(m_configReadVEnable).c_str()); + } else { + Eroute.Emsg("Config", "Invalid value for ceph.usereadv in config file (must be 0 or 1)", configfn, var); + return 1; + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.usereadv in config file", configfn); + return 1; + } + } // usereadv + if (!strncmp(var, "ceph.readvalgname", 17)) { // stored verbatim; not validated here + var = Config.GetWord(); + // Eroute.Emsg("Config", "readvalgname readvalgname readvalgname readvalgname", var); + if (var) { + // Warn in case parameters were given + char parms[1040]; + if (!Config.GetRest(parms, sizeof(parms)) || parms[0]) { + Eroute.Emsg("Config", "readvalgname parameters will be ignored"); + } + m_configReadVAlgName = var; + } else { + Eroute.Emsg("Config", "Missing value for ceph.readvalgname in config file", configfn); + return 1; + } + } // readvalgname + if (!strncmp(var, "ceph.bufferiomode", 17)) { // stored verbatim; not validated here + var = Config.GetWord(); + if (var) { + // Warn in case parameters were given + char parms[1040]; + if (!Config.GetRest(parms, sizeof(parms)) || parms[0]) { + Eroute.Emsg("Config", "bufferiomode parameters will be ignored"); + } + m_configBufferIOmode = var; // allowed values would be aio, io + } else { + Eroute.Emsg("Config", "Missing value for 
ceph.bufferiomode in config file", configfn); + return 1; + } + } + + } // while // Now check if any errors occured during file i/o int retc = Config.LastError(); @@ -254,6 +335,21 @@ XrdOssDF* XrdCephOss::newDir(const char *tident) { } XrdOssDF* XrdCephOss::newFile(const char *tident) { - return new XrdCephOssFile(this); + + // Depending on the configuration settings stack up the underlying + // XrdCephOssFile instance with decorator objects for readV and Buffering requests + + XrdCephOssFile* xrdCephOssDF = new XrdCephOssFile(this); + + if (m_configReadVEnable) { + xrdCephOssDF = new XrdCephOssReadVFile(this,xrdCephOssDF,m_configReadVAlgName); + } + + if (m_configBufferEnable) { + xrdCephOssDF = new XrdCephOssBufferedFile(this,xrdCephOssDF, m_configBufferSize, m_configBufferIOmode); + } + + + return xrdCephOssDF; } diff --git a/src/XrdCeph/XrdCephOss.hh b/src/XrdCeph/XrdCephOss.hh index 838030dc3..84af2b429 100644 --- a/src/XrdCeph/XrdCephOss.hh +++ b/src/XrdCeph/XrdCephOss.hh @@ -71,6 +71,13 @@ public: virtual XrdOssDF *newDir(const char *tident); virtual XrdOssDF *newFile(const char *tident); + private: + bool m_configBufferEnable=false; //! config option for buffering + size_t m_configBufferSize=16*1024*1024L; //! Buffer size + std::string m_configBufferIOmode = "aio"; + bool m_configReadVEnable=false; //! 
enable readV decorator + std::string m_configReadVAlgName="passthrough"; // readV algorithm type + }; #endif /* __CEPH_OSS_HH__ */ diff --git a/src/XrdCeph/XrdCephOssBufferedFile.cc b/src/XrdCeph/XrdCephOssBufferedFile.cc new file mode 100644 index 000000000..b23dbcddd --- /dev/null +++ b/src/XrdCeph/XrdCephOssBufferedFile.cc @@ -0,0 +1,224 @@ +//------------------------------------------------------------------------------ +// Copyright (c) 2014-2015 by European Organization for Nuclear Research (CERN) +// Author: Sebastien Ponce +//------------------------------------------------------------------------------ +// This file is part of the XRootD software suite. +// +// XRootD is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// XRootD is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with XRootD. If not, see . +// +// In applying this licence, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+//------------------------------------------------------------------------------ + +#include +#include +#include +#include +#include +#include +#include + +#include "XrdCeph/XrdCephPosix.hh" +#include "XrdOuc/XrdOucEnv.hh" +#include "XrdSys/XrdSysError.hh" +#include "XrdOuc/XrdOucTrace.hh" +#include "XrdSfs/XrdSfsAio.hh" +#include "XrdCeph/XrdCephOssFile.hh" + +#include "XrdCeph/XrdCephOssBufferedFile.hh" +#include "XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh" +#include "XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh" +#include "XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh" +#include "XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh" + +using namespace XrdCephBuffer; + +extern XrdSysError XrdCephEroute; +extern XrdOucTrace XrdCephTrace; + + +XrdCephOssBufferedFile::XrdCephOssBufferedFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF, + size_t buffersize,const std::string& bufferIOmode): + XrdCephOssFile(cephoss), m_cephoss(cephoss), m_xrdOssDF(cephossDF), m_bufsize(buffersize), + m_bufferIOmode(bufferIOmode) +{ + +} + +XrdCephOssBufferedFile::~XrdCephOssBufferedFile() { + // XrdCephEroute.Say("XrdCephOssBufferedFile::Destructor"); + + // remember to delete the inner XrdCephOssFile object + if (m_xrdOssDF) { + delete m_xrdOssDF; + m_xrdOssDF = nullptr; + } + +} + + +int XrdCephOssBufferedFile::Open(const char *path, int flags, mode_t mode, XrdOucEnv &env) { + + int rc = m_xrdOssDF->Open(path, flags, mode, env); + if (rc < 0) { + return rc; + } + m_fd = m_xrdOssDF->getFileDescriptor(); + BUFLOG("XrdCephOssBufferedFile::Open got fd: " << m_fd << " " << path); + m_flags = flags; // e.g. for write/read knowledge + m_path = path; // good to keep the path for final stats presentation + + // opened a file, so create the buffer here; note - this might be better delegated to the first read/write ... 
+ // need the file descriptor, so do it after we know the file is opened (and not just a stat for example) + std::unique_ptr<IXrdCephBufferData> cephbuffer = std::unique_ptr<IXrdCephBufferData>(new XrdCephBufferDataSimple(m_bufsize)); + // std::unique_ptr<ICephIOAdapter> cephio = std::unique_ptr<ICephIOAdapter>(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); + std::unique_ptr<ICephIOAdapter> cephio; + if (m_bufferIOmode == "aio") { + cephio = std::unique_ptr<ICephIOAdapter>(new CephIOAdapterAIORaw(cephbuffer.get(),m_fd)); + } else if (m_bufferIOmode == "io") { + cephio = std::unique_ptr<ICephIOAdapter>(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); + } else { + BUFLOG("XrdCephOssBufferedFile: buffer mode needs to be one of aio|io " ); + m_xrdOssDF->Close(); + return -EINVAL; + } + + LOGCEPH( "XrdCephOssBufferedFile::Open: fd: " << m_fd << " Buffer created: " << cephbuffer->capacity() ); + m_bufferAlg = std::unique_ptr<IXrdCephBufferAlg>(new XrdCephBufferAlgSimple(std::move(cephbuffer),std::move(cephio),m_fd) ); + + // start the timer + //m_timestart = std::chrono::steady_clock::now(); + m_timestart = std::chrono::system_clock::now(); + // return the result of the wrapped Open() + return rc; +} +// Close: flush buffered writes (write-mode opens only), log a per-file summary, then close the wrapped file. +int XrdCephOssBufferedFile::Close(long long *retsz) { + // flush only when opened for writing AND the buffer exists (Open() can fail after m_flags is set but before m_bufferAlg is created) + if (m_bufferAlg && (m_flags & (O_WRONLY|O_RDWR)) != 0) { + ssize_t rc = m_bufferAlg->flushWriteCache(); + if (rc < 0) { + LOGCEPH( "XrdCephOssBufferedFile::Close: flush Error fd: " << m_fd << " rc:" << rc ); + // still try to close the file + ssize_t rc2 = m_xrdOssDF->Close(retsz); + if (rc2 < 0) { + LOGCEPH( "XrdCephOssBufferedFile::Close: Close error after flush Error fd: " << m_fd << " rc:" << rc2 ); + } + return rc; // return the original flush error + } else { + LOGCEPH( "XrdCephOssBufferedFile::Close: Flushed data on close fd: " << m_fd << " rc:" << rc ); + } + } // check for write + const std::chrono::time_point<std::chrono::system_clock> now = + std::chrono::system_clock::now(); + const std::time_t t_s = std::chrono::system_clock::to_time_t(m_timestart); + const std::time_t t_c = std::chrono::system_clock::to_time_t(now); + struct tm tm_s, tm_c; localtime_r(&t_s, &tm_s); localtime_r(&t_c, &tm_c); // std::localtime returns one shared static buffer: not thread-safe, and the two timestamps below could alias + 
auto t_dur = std::chrono::duration_cast<std::chrono::milliseconds>(now - m_timestart).count(); + + LOGCEPH("XrdCephOssBufferedFile::Summary: {\"fd\":" << m_fd << ", \"Elapsed_time_ms\":" << t_dur + << ", \"path\":\"" << m_path + << "\", read_B:" << m_bytesRead.load() + << ", readV_B:" << m_bytesReadV.load() + << ", readAIO_B:" << m_bytesReadAIO.load() + << ", writeB:" << m_bytesWrite.load() + << ", writeAIO_B:" << m_bytesWriteAIO.load() + << ", startTime:\"" << std::put_time(&tm_s, "%F %T") << "\", endTime:\"" + << std::put_time(&tm_c, "%F %T") << "\"" + << "}"); + + return m_xrdOssDF->Close(retsz); +} + +// ReadV: forwarded unbuffered to the wrapped file; only the returned byte count is accumulated. +ssize_t XrdCephOssBufferedFile::ReadV(XrdOucIOVec *readV, int rnum) { + // don't touch readV in the buffering method + ssize_t rc = m_xrdOssDF->ReadV(readV,rnum); + if (rc > 0) m_bytesReadV.fetch_add(rc); + return rc; +} +// Read(offset, blen): forwarded unchanged to the wrapped file. +ssize_t XrdCephOssBufferedFile::Read(off_t offset, size_t blen) { + return m_xrdOssDF->Read(offset, blen); +} +// Read(buff, offset, blen): served through m_bufferAlg; a negative rc is logged and returned as-is. +ssize_t XrdCephOssBufferedFile::Read(void *buff, off_t offset, size_t blen) { + ssize_t rc = m_bufferAlg->read(buff, offset, blen); + if (rc >=0) { + m_bytesRead.fetch_add(rc); + } else { + LOGCEPH( "XrdCephOssBufferedFile::Read: Read error fd: " << m_fd << " rc:" << rc << " off:" << offset << " len:" << blen); + } + return rc; +} +// Read(aiop): AIO read through m_bufferAlg; only a negative rc is treated as an error (same rule as the synchronous Read). +int XrdCephOssBufferedFile::Read(XrdSfsAio *aiop) { + + // LOGCEPH("XrdCephOssBufferedFile::AIOREAD: fd: " << m_xrdOssDF->getFileDescriptor() << " " << time(nullptr) << " : " + // << aiop->sfsAio.aio_offset << " " + // << aiop->sfsAio.aio_nbytes << " " << aiop->sfsAio.aio_reqprio << " " + // << aiop->sfsAio.aio_fildes ); + ssize_t rc = m_bufferAlg->read_aio(aiop); + if (rc >= 0) { + m_bytesReadAIO.fetch_add(rc); + } else { + LOGCEPH( "XrdCephOssBufferedFile::Read: ReadAIO error fd: " << m_fd << " rc:" << rc + << " off:" << aiop->sfsAio.aio_offset << " len:" << aiop->sfsAio.aio_nbytes ); + } + return rc; +} +// ReadRaw: forwarded to the wrapped file, i.e. it currently bypasses the buffer. +ssize_t XrdCephOssBufferedFile::ReadRaw(void *buff, off_t offset, size_t blen) { + // #TODO; 
ReadRaw should bypass the buffer ? + return m_xrdOssDF->ReadRaw(buff, offset, blen); +} +// Fstat: forwarded unchanged to the wrapped file. +int XrdCephOssBufferedFile::Fstat(struct stat *buff) { + return m_xrdOssDF->Fstat(buff); +} +// Write(buff, offset, blen): goes through m_bufferAlg; a negative rc is logged and returned as-is. +ssize_t XrdCephOssBufferedFile::Write(const void *buff, off_t offset, size_t blen) { + ssize_t rc = m_bufferAlg->write(buff, offset, blen); + if (rc >=0) { + m_bytesWrite.fetch_add(rc); + } else { + LOGCEPH( "XrdCephOssBufferedFile::Write: Write error fd: " << m_fd << " rc:" << rc << " off:" << offset << " len:" << blen); + } + return rc; +} +// Write(aiop): AIO write through m_bufferAlg; only a negative rc is treated as an error (same rule as the synchronous Write). +int XrdCephOssBufferedFile::Write(XrdSfsAio *aiop) { + // LOGCEPH("XrdCephOssBufferedFile::AIOWRITE: fd: " << m_xrdOssDF->getFileDescriptor() << " " << time(nullptr) << " : " + // << aiop->sfsAio.aio_offset << " " + // << aiop->sfsAio.aio_nbytes << " " << aiop->sfsAio.aio_reqprio << " " + // << aiop->sfsAio.aio_fildes << " " ); + ssize_t rc = m_bufferAlg->write_aio(aiop); + if (rc >= 0) { + m_bytesWriteAIO.fetch_add(rc); + } else { + LOGCEPH( "XrdCephOssBufferedFile::Write: WriteAIO error fd: " << m_fd << " rc:" << rc + << " off:" << aiop->sfsAio.aio_offset << " len:" << aiop->sfsAio.aio_nbytes ); + } + return rc; + +} +// NOTE(review): Fsync is forwarded without flushing m_bufferAlg's write cache first -- confirm this is intended. +int XrdCephOssBufferedFile::Fsync() { + return m_xrdOssDF->Fsync(); +} + +int XrdCephOssBufferedFile::Ftruncate(unsigned long long len) { + return m_xrdOssDF->Ftruncate(len); +} diff --git a/src/XrdCeph/XrdCephOssBufferedFile.hh b/src/XrdCeph/XrdCephOssBufferedFile.hh new file mode 100644 index 000000000..b20ef10df --- /dev/null +++ b/src/XrdCeph/XrdCephOssBufferedFile.hh @@ -0,0 +1,84 @@ +//------------------------------------------------------------------------------ +// Copyright (c) 2014-2015 by European Organization for Nuclear Research (CERN) +// Author: Sebastien Ponce +//------------------------------------------------------------------------------ +// This file is part of the XRootD software suite. 
+// +// XRootD is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// XRootD is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with XRootD. If not, see . +// +// In applying this licence, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +//------------------------------------------------------------------------------ + +#ifndef __XRD_CEPH_OSS_BUFFERED_FILE_HH__ +#define __XRD_CEPH_OSS_BUFFERED_FILE_HH__ + +#include "XrdOss/XrdOss.hh" +#include "XrdCeph/XrdCephOss.hh" +#include "XrdCeph/XrdCephOssFile.hh" + +#include "XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh" +#include "XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh" +#include "XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh" + +#include +#include +#include + + +//------------------------------------------------------------------------------ +//! Decorator class XrdCephOssBufferedFile designed to wrap XrdCephOssFile +//! Functionality for buffered access to/from data in Ceph to avoid inefficient +//! 
small reads / writes from the client side +//------------------------------------------------------------------------------ + +class XrdCephOssBufferedFile : virtual public XrdCephOssFile { // XrdOssDF + +public: + XrdCephOssBufferedFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF, size_t buffersize, + const std::string& bufferIOmode); + //explicit XrdCephOssBufferedFile(size_t buffersize); + virtual ~XrdCephOssBufferedFile(); + virtual int Open(const char *path, int flags, mode_t mode, XrdOucEnv &env); + virtual int Close(long long *retsz=0); + virtual ssize_t Read(off_t offset, size_t blen); + virtual ssize_t Read(void *buff, off_t offset, size_t blen); + virtual int Read(XrdSfsAio *aoip); + virtual ssize_t ReadV(XrdOucIOVec *readV, int rdvcnt); + virtual ssize_t ReadRaw(void *, off_t, size_t); + virtual int Fstat(struct stat *buff); + virtual ssize_t Write(const void *buff, off_t offset, size_t blen); + virtual int Write(XrdSfsAio *aiop); + virtual int Fsync(void); + virtual int Ftruncate(unsigned long long); + +protected: + XrdCephOss *m_cephoss = nullptr; + XrdCephOssFile * m_xrdOssDF = nullptr; // wrapped XrdCephOssFile instance; deleted in the destructor + std::unique_ptr(m_bufferAlg); + + int m_flags = 0; // open flags as passed to Open(); Close() tests them for write mode + size_t m_bufsize = 16*1024*1024L; // default 16MiB size + std::string m_bufferIOmode; // "aio" or "io"; any other value makes Open() return -EINVAL + std::string m_path; // path passed to Open(); printed in the Close() summary + std::chrono::time_point m_timestart; // set at the end of a successful Open() + std::atomic m_bytesRead = {0}; /// sum of non-negative results of Read(buff, offset, blen) + std::atomic m_bytesReadV = {0}; /// sum of positive results of ReadV + std::atomic m_bytesReadAIO = {0}; /// sum of positive results of Read(XrdSfsAio*) + std::atomic m_bytesWrite = {0}; /// sum of non-negative results of Write(buff, offset, blen) + std::atomic m_bytesWriteAIO= {0}; /// sum of positive results of Write(XrdSfsAio*) +}; + +#endif /* __XRD_CEPH_OSS_BUFFERED_FILE_HH__ */ diff --git a/src/XrdCeph/XrdCephOssFile.hh b/src/XrdCeph/XrdCephOssFile.hh index 094ed84c6..ecdf668a2 100644 --- a/src/XrdCeph/XrdCephOssFile.hh +++ b/src/XrdCeph/XrdCephOssFile.hh @@ -49,11 +49,11 @@ //! 
In case one of the two only has a default, it will be applied for both plugins. //------------------------------------------------------------------------------ -class XrdCephOssFile : public XrdOssDF { +class XrdCephOssFile : virtual public XrdOssDF { public: - XrdCephOssFile(XrdCephOss *cephoss); + explicit XrdCephOssFile(XrdCephOss *cephoss); virtual ~XrdCephOssFile() {}; virtual int Open(const char *path, int flags, mode_t mode, XrdOucEnv &env); virtual int Close(long long *retsz=0); @@ -67,7 +67,8 @@ public: virtual int Fsync(void); virtual int Ftruncate(unsigned long long); -private: + inline virtual int getFileDescriptor() const {return m_fd;} +protected: int m_fd; XrdCephOss *m_cephOss; diff --git a/src/XrdCeph/XrdCephOssReadVFile.cc b/src/XrdCeph/XrdCephOssReadVFile.cc new file mode 100644 index 000000000..2aa31e2dc --- /dev/null +++ b/src/XrdCeph/XrdCephOssReadVFile.cc @@ -0,0 +1,211 @@ +//------------------------------------------------------------------------------ +// Copyright (c) 2014-2015 by European Organization for Nuclear Research (CERN) +// Author: Sebastien Ponce +//------------------------------------------------------------------------------ +// This file is part of the XRootD software suite. +// +// XRootD is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// XRootD is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with XRootD. If not, see . 
+// +// In applying this licence, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +//------------------------------------------------------------------------------ + +#include +#include +#include +#include +#include +#include +#include + +#include "XrdCeph/XrdCephPosix.hh" +#include "XrdOuc/XrdOucEnv.hh" +#include "XrdSys/XrdSysError.hh" +#include "XrdOuc/XrdOucTrace.hh" +#include "XrdSfs/XrdSfsAio.hh" +#include "XrdCeph/XrdCephOssFile.hh" + +#include "XrdCeph/XrdCephOssReadVFile.hh" +#include "XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh" +#include "XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh" + +using namespace XrdCephBuffer; + +extern XrdSysError XrdCephEroute; +extern XrdOucTrace XrdCephTrace; + +XrdCephOssReadVFile::XrdCephOssReadVFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF,const std::string& algname): +XrdCephOssFile(cephoss), m_cephoss(cephoss), m_xrdOssDF(cephossDF),m_algname(algname) +{ + if (!m_xrdOssDF) XrdCephEroute.Say("XrdCephOssReadVFile::Null m_xrdOssDF"); + + if (m_algname == "passthrough") { // #TODO consider to use a factory method. 
but this is simple enough for now + m_readVAdapter = std::unique_ptr(new XrdCephBuffer::XrdCephReadVNoOp()); + } else if (m_algname == "basic") { + m_readVAdapter = std::unique_ptr(new XrdCephBuffer::XrdCephReadVBasic()); + } else { + XrdCephEroute.Say("XrdCephOssReadVFile::ERROR Invalid ReadV algorthm passed; defaulting to passthrough"); + m_algname = "passthrough"; + m_readVAdapter = std::unique_ptr(new XrdCephBuffer::XrdCephReadVNoOp()); + } + LOGCEPH("XrdCephOssReadVFile Algorithm type: " << m_algname); +} + +XrdCephOssReadVFile::~XrdCephOssReadVFile() { + if (m_xrdOssDF) { + delete m_xrdOssDF; + m_xrdOssDF = nullptr; + } + +} + +int XrdCephOssReadVFile::Open(const char *path, int flags, mode_t mode, XrdOucEnv &env) { + int rc = m_xrdOssDF->Open(path, flags, mode, env); + if (rc < 0) { + return rc; + } + m_fd = m_xrdOssDF->getFileDescriptor(); + LOGCEPH("XrdCephOssReadVFile::Open: fd: " << m_fd << " " << path ); + return rc; +} + +int XrdCephOssReadVFile::Close(long long *retsz) { + LOGCEPH("XrdCephOssReadVFile::Close: retsz: " << retsz << " Time_ceph_s: " << m_timer_read_ns.load()*1e-9 << " count: " + << m_timer_count.load() << " size_B: " << m_timer_size.load() + << " longest_s:" << m_timer_longest.load()*1e-9); + + return m_xrdOssDF->Close(retsz); +} + + +ssize_t XrdCephOssReadVFile::ReadV(XrdOucIOVec *readV, int rnum) { + int fd = m_xrdOssDF->getFileDescriptor(); + LOGCEPH("XrdCephOssReadVFile::ReadV: fd: " << fd << " " << rnum << "\n" ); + + //std::stringstream msg_extents; + //msg_extents << "EXTENTS=["; + + ExtentHolder extents(rnum); + for (int i = 0; i < rnum; i++) { + extents.push_back(Extent(readV[i].offset, readV[i].size)); + //msg_extents << "(" << readV[i].offset << "," << readV[i].size << ")," ; + } + //msg_extents << "]"; + //XrdCephEroute.Say(msg_extents.str().c_str()); msg_extents.clear(); + //LOGCEPH(msg_extents.str()); + + LOGCEPH("Extents: fd: "<< fd << " " << extents.size() << " " << extents.len() << " " + << extents.begin() << " " << 
extents.end() << " " << extents.bytesContained() + << " " << extents.bytesMissing()); + + // take the input set of extents and return a vector of merged extents (covering the range to read) + std::vector<ExtentHolder> mappedExtents = m_readVAdapter->convert(extents); + + + // counter is the iterator to the original readV elements, and is incremented for each chunk that's returned + int nbytes = 0, curCount = 0, counter(0); + size_t totalBytesRead(0), totalBytesUseful(0); + + // extract the largest range of the extents, and create a buffer. + size_t buffersize{0}; + for (std::vector<ExtentHolder>::const_iterator ehit = mappedExtents.cbegin(); ehit!= mappedExtents.cend(); ++ehit ) { + buffersize = std::max(buffersize, ehit->len()); + } + std::vector<char> buffer; + buffer.resize(buffersize); // resize, not reserve: Read() below writes through data(), so the elements must exist + + + //LOGCEPH("mappedExtents: len: " << mappedExtents.size() ); + for (std::vector<ExtentHolder>::const_iterator ehit = mappedExtents.cbegin(); ehit!= mappedExtents.cend(); ++ehit ) { + off_t off = ehit->begin(); + size_t len = ehit->len(); + + //LOGCEPH("outerloop: " << off << " " << len << " " << ehit->end() << " " << " " << ehit->size() ); + + // read the full extent into the buffer + long timed_read_ns{0}; + {Timer_ns ts(timed_read_ns); + curCount = m_xrdOssDF->Read(buffer.data(), off, len); + } // timer scope + ++m_timer_count; + auto l = m_timer_longest.load(); + while (l < timed_read_ns && !m_timer_longest.compare_exchange_weak(l, timed_read_ns)) {} // atomic max: l is refreshed whenever the CAS fails + m_timer_read_ns.fetch_add(timed_read_ns); + if (curCount > 0) m_timer_size.fetch_add(curCount); // keep negative error codes out of the byte counter + + // check that the correct amount of data was read. + // std:: clog << "buf Read " << curCount << std::endl; + if (curCount != (ssize_t)len) { + return (curCount < 0 ? 
curCount : -ESPIPE); + } + totalBytesRead += curCount; + totalBytesUseful += ehit->bytesContained(); + + + // now read out into the original readV requests for each of the held inner extents + const char* data = buffer.data(); + const ExtentContainer& innerExtents = ehit->extents(); + for (ExtentContainer::const_iterator it = innerExtents.cbegin(); it != innerExtents.cend(); ++it) { + off_t innerBegin = it->begin() - off; + off_t innerEnd = it->end() - off; + //LOGCEPH( "innerloop: " << innerBegin << " " << innerEnd << " " << off << " " + // << it->begin() << " " << it-> end() << " " + // << readV[counter].offset << " " << readV[counter].size); + std::copy(data+innerBegin, data+innerEnd, readV[counter].data ); + nbytes += it->len(); + ++counter; // next element + } // inner extents + + } // outer extents + LOGCEPH( "readV returning " << nbytes << " bytes: " << "Read: " <Read(offset,blen); +} + +ssize_t XrdCephOssReadVFile::Read(void *buff, off_t offset, size_t blen) { + return m_xrdOssDF->Read(buff,offset,blen); +} + +int XrdCephOssReadVFile::Read(XrdSfsAio *aiop) { + return m_xrdOssDF->Read(aiop); +} + +ssize_t XrdCephOssReadVFile::ReadRaw(void *buff, off_t offset, size_t blen) { + return m_xrdOssDF->ReadRaw(buff, offset, blen); +} + +int XrdCephOssReadVFile::Fstat(struct stat *buff) { + return m_xrdOssDF->Fstat(buff); +} + +ssize_t XrdCephOssReadVFile::Write(const void *buff, off_t offset, size_t blen) { + return m_xrdOssDF->Write(buff,offset,blen); +} + +int XrdCephOssReadVFile::Write(XrdSfsAio *aiop) { + return m_xrdOssDF->Write(aiop); +} + +int XrdCephOssReadVFile::Fsync() { + return m_xrdOssDF->Fsync(); +} + +int XrdCephOssReadVFile::Ftruncate(unsigned long long len) { + return m_xrdOssDF->Ftruncate(len); +} diff --git a/src/XrdCeph/XrdCephOssReadVFile.hh b/src/XrdCeph/XrdCephOssReadVFile.hh new file mode 100644 index 000000000..22c0717a3 --- /dev/null +++ b/src/XrdCeph/XrdCephOssReadVFile.hh @@ -0,0 +1,90 @@ 
+//------------------------------------------------------------------------------ +// Copyright (c) 2014-2015 by European Organization for Nuclear Research (CERN) +// Author: Sebastien Ponce +//------------------------------------------------------------------------------ +// This file is part of the XRootD software suite. +// +// XRootD is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// XRootD is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with XRootD. If not, see . +// +// In applying this licence, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +//------------------------------------------------------------------------------ + +#ifndef __XRD_CEPH_OSS_READV_FILE_HH__ +#define __XRD_CEPH_OSS_READV_FILE_HH__ + +#include "XrdOss/XrdOss.hh" +#include "XrdCeph/XrdCephOss.hh" +#include "XrdCeph/XrdCephOssFile.hh" + +#include "XrdCeph/XrdCephBuffers/IXrdCephBufferData.hh" +#include "XrdCeph/XrdCephBuffers/IXrdCephBufferAlg.hh" +#include "XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh" + +#include + + +//------------------------------------------------------------------------------ +//! Decorator class XrdCephOssReadVFile designed to wrap XrdCephOssFile +//! Functionality for ReadV access to/from data in Ceph to avoid inefficient +//! small reads / writes from the client side. +//! 
Initially for monitoring purposes +//------------------------------------------------------------------------------ + +class XrdCephOssReadVFile : virtual public XrdCephOssFile { + +public: + + explicit XrdCephOssReadVFile(XrdCephOss *cephoss, XrdCephOssFile *cephossDF,const std::string& algname); + virtual ~XrdCephOssReadVFile(); + virtual int Open(const char *path, int flags, mode_t mode, XrdOucEnv &env); + virtual int Close(long long *retsz=0); + +//----------------------------------------------------------------------------- +//! Read file bytes as directed by the read vector. +//! +//! @param readV pointer to the array of read requests. +//! @param rdvcnt the number of elements in readV. +//! +//! @return >=0 The number of bytes read. +//! @return < 0 -errno or -osserr upon failure (see XrdOssError.hh). +//----------------------------------------------------------------------------- + virtual ssize_t ReadV(XrdOucIOVec *readV, int rdvcnt); + + virtual ssize_t Read(off_t offset, size_t blen); + virtual ssize_t Read(void *buff, off_t offset, size_t blen); + virtual int Read(XrdSfsAio *aoip); + virtual ssize_t ReadRaw(void *, off_t, size_t); + virtual int Fstat(struct stat *buff); + virtual ssize_t Write(const void *buff, off_t offset, size_t blen); + virtual int Write(XrdSfsAio *aiop); + virtual int Fsync(void); + virtual int Ftruncate(unsigned long long); + +protected: + XrdCephOss *m_cephoss = nullptr; + XrdCephOssFile * m_xrdOssDF = nullptr; // wrapped XrdCephOssFile instance; deleted in the destructor + std::string m_algname = "passthrough"; + std::unique_ptr m_readVAdapter; + + std::atomic m_timer_read_ns {0}; //! accumulated time (ns) of the reads issued by ReadV against the wrapped file + std::atomic m_timer_count {0}; //! number of reads + std::atomic m_timer_size {0}; //! accumulated return values of those reads (bytes) + std::atomic m_timer_longest {0}; //! longest single read in ns -- NOT the 
size read in bytes + + +}; + +#endif /* __XRD_CEPH_OSS_READV_FILE_HH__ */ diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 1a0b8f957..1c96f8420 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -50,6 +50,8 @@ #include "XrdCeph/XrdCephPosix.hh" +#include "XrdSfs/XrdSfsFlags.hh" // for the OFFLINE flag status + /// small structs to store file metadata struct CephFile { std::string name; diff --git a/src/XrdCeph/XrdCephPosix.hh b/src/XrdCeph/XrdCephPosix.hh index 77f9fbc23..e34596a6c 100644 --- a/src/XrdCeph/XrdCephPosix.hh +++ b/src/XrdCeph/XrdCephPosix.hh @@ -35,6 +35,18 @@ #include #include +// simple logging for XrdCeph buffering code +#define XRDCEPHLOGLEVEL 1 +#ifdef XRDCEPHLOGLEVEL + // ensure that + // extern XrdOucTrace XrdCephTrace; + // is in the cc file where you want to log // << std::endl + //#define LOGCEPH(x) {std::stringstream _s; _s << x; XrdCephTrace.Beg(); std::clog << _s.str() ; XrdCephTrace.End(); _s.clear();} + #define LOGCEPH(x) {std::stringstream _s; _s << x; std::clog << _s.str() << std::endl; _s.clear(); } +#else + #define LOGCEPH(x) +#endif + class XrdSfsAio; typedef void(AioCB)(XrdSfsAio*, size_t); From 6814241eac7e8ac43b51fc05ab40b9e2fc5323ea Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Tue, 5 Apr 2022 12:19:23 +0100 Subject: [PATCH 02/18] merge variable rpm name into bufferedIO (#19) * variable rpm name * Update xrootd-ceph.spec.in * Update makesrpm.sh * Update makesrpm.sh --- packaging/makesrpm.sh | 27 +++++++++++++++------- packaging/rhel/xrootd-ceph.spec.in | 36 ++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/packaging/makesrpm.sh b/packaging/makesrpm.sh index b74b90fd4..c82f85a8a 100755 --- a/packaging/makesrpm.sh +++ b/packaging/makesrpm.sh @@ -42,6 +42,7 @@ function printHelp() SOURCEPATH="../" OUTPUTPATH="." 
PRINTHELP=0 +RPM_NAME="xrootd-ceph" while test ${#} -ne 0; do if test x${1} = x--help; then @@ -74,6 +75,16 @@ while test ${#} -ne 0; do fi USER_DEFINE="$USER_DEFINE --define \""${2}"\"" shift + elif test x${1} = x--rename; then + if test ${#} -lt 2; then + echo "--rename parameter needs an argument" 1>&2 + exit 1 + fi + cp rhel/xrootd-ceph.spec.in rhel/${2}.spec.in + sed -i "s/xrootd-ceph/${2}/" rhel/${2}.spec.in + cp rhel/${2}.spec.in /root/rpmbuild/SPECS + RPM_NAME="${2}" + shift fi shift done @@ -177,7 +188,7 @@ echo "[i] RPM compliant version: $VERSION-$RELEASE" # exit on any error set -e -TEMPDIR=`mktemp -d /tmp/xrootd-ceph.srpm.XXXXXXXXXX` +TEMPDIR=`mktemp -d /tmp/${RPM_NAME}.srpm.XXXXXXXXXX` RPMSOURCES=$TEMPDIR/rpmbuild/SOURCES mkdir -p $RPMSOURCES mkdir -p $TEMPDIR/rpmbuild/SRPMS @@ -199,12 +210,12 @@ fi #------------------------------------------------------------------------------- # Generate the spec file #------------------------------------------------------------------------------- -if test ! -r rhel/xrootd-ceph.spec.in; then +if test ! -r rhel/${RPM_NAME}.spec.in; then echo "[!] The specfile template does not exist!" 1>&2 exit 7 fi -cat rhel/xrootd-ceph.spec.in | sed "s/__VERSION__/$VERSION/" | \ - sed "s/__RELEASE__/$RELEASE/" > $TEMPDIR/xrootd-ceph.spec +cat rhel/${RPM_NAME}.spec.in | sed "s/__VERSION__/$VERSION/" | \ + sed "s/__RELEASE__/$RELEASE/" > $TEMPDIR/${RPM_NAME}.spec #------------------------------------------------------------------------------- # Make a tarball of the latest commit on the branch @@ -221,8 +232,8 @@ if test $? -ne 0; then exit 5 fi -git archive --prefix=xrootd-ceph/ --format=tar $COMMIT | gzip -9fn > \ - $RPMSOURCES/xrootd-ceph.tar.gz +git archive --prefix=${RPM_NAME}/ --format=tar $COMMIT | gzip -9fn > \ + $RPMSOURCES/${RPM_NAME}.tar.gz if test $? -ne 0; then echo "[!] 
Unable to create the source tarball" 1>&2 @@ -244,13 +255,13 @@ eval "rpmbuild --define \"_topdir $TEMPDIR/rpmbuild\" \ --define \"_source_filedigest_algorithm md5\" \ --define \"_binary_filedigest_algorithm md5\" \ ${USER_DEFINE} \ - -bs $TEMPDIR/xrootd-ceph.spec > $TEMPDIR/log" + -bs $TEMPDIR/${RPM_NAME}.spec > $TEMPDIR/log" if test $? -ne 0; then echo "[!] RPM creation failed" 1>&2 exit 8 fi -cp $TEMPDIR/rpmbuild/SRPMS/xrootd-ceph*.src.rpm $OUTPUTPATH +cp $TEMPDIR/rpmbuild/SRPMS/${RPM_NAME}*.src.rpm $OUTPUTPATH rm -rf $TEMPDIR echo "[i] Done." diff --git a/packaging/rhel/xrootd-ceph.spec.in b/packaging/rhel/xrootd-ceph.spec.in index 03b1cacce..721be70f3 100644 --- a/packaging/rhel/xrootd-ceph.spec.in +++ b/packaging/rhel/xrootd-ceph.spec.in @@ -40,8 +40,8 @@ #------------------------------------------------------------------------------- Name: xrootd-ceph Epoch: 1 -Version: __VERSION__ -Release: __RELEASE__%{?dist}%{?_with_clang:.clang} +Version: 5.3.4 +Release: 1%{?dist}%{?_with_clang:.clang} Summary: CEPH plug-in for XRootD Group: System Environment/Daemons License: LGPLv3+ @@ -64,22 +64,32 @@ BuildRequires: cmake BuildRequires: cppunit-devel %endif -BuildRequires: librados-devel = 2:14.2.15 -BuildRequires: libradosstriper-devel = 2:14.2.15 +BuildRequires: librados-devel = 2:14.2.22 +BuildRequires: libradosstriper-devel = 2:14.2.22 %if %{?_with_clang:1}%{!?_with_clang:0} BuildRequires: clang %endif -BuildRequires: xrootd-server-devel%{?_isa} = %{epoch}:%{version}-%{release} -BuildRequires: xrootd-private-devel%{?_isa} = %{epoch}:%{version}-%{release} -BuildRequires: xrootd-libs%{?_isa} = %{epoch}:%{version}-%{release} -BuildRequires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} -BuildRequires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-server-devel%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-private-devel%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-libs%{?_isa} = 
%{epoch}:%{version}-%{release} +#BuildRequires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} -Requires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} -Requires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} -Requires: xrootd-libs%{?_isa} = %{epoch}:%{version}-%{release} +#Requires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} +#Requires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} +#Requires: xrootd-libs%{?_isa} = %{epoch}:%{version}-%{release} + +BuildRequires: xrootd-server-devel%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-private-devel%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-libs%{?_isa} >= 1:5.3.1 +BuildRequires: xrootd-server-libs%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-client-libs%{?_isa} >= 1:5.3.3 + +Requires: xrootd-server-libs%{?_isa} >= 1:5.3.3 +Requires: xrootd-client-libs%{?_isa} >= 1:5.3.3 +Requires: xrootd-libs%{?_isa} >= 1:5.3.3 %description The xrootd-ceph is an OSS layer plug-in for the XRootD server for interfacing @@ -158,6 +168,8 @@ rm -rf $RPM_BUILD_ROOT # Changelog #------------------------------------------------------------------------------- %changelog +* Mon Mar 14 2022 Jyothish Thomas +-offline file bug fix * Wed Dec 16 2020 George Patargias - updated version for librados-devel and libradosstriper-devel to 14.2.15 following the recent upgrade on external Echo gateways - fixed version in xrootd-ceph shared libraries From b85fe04e8d43014f42130ec9fea731157ba9d941 Mon Sep 17 00:00:00 2001 From: James Walder Date: Wed, 6 Apr 2022 12:26:23 +0100 Subject: [PATCH 03/18] Fixes to remove warnings from devtoolset-9 compilation --- src/XrdCeph/XrdCephBuffers/BufferUtils.hh | 2 +- src/XrdCeph/XrdCephOssBufferedFile.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/BufferUtils.hh b/src/XrdCeph/XrdCephBuffers/BufferUtils.hh index 0b4caee26..8a0c6f0f9 
100644 --- a/src/XrdCeph/XrdCephBuffers/BufferUtils.hh +++ b/src/XrdCeph/XrdCephBuffers/BufferUtils.hh @@ -20,7 +20,7 @@ #define CEPHBUFDEBUG 1 #ifdef CEPHBUFDEBUG extern std::mutex cephbuf_iolock; -#define BUFLOG(x) {std::unique_lock(cephbuf_iolock); std::stringstream _bs; _bs << x; std::clog << _bs.str() << std::endl;} +#define BUFLOG(x) {std::unique_lockcephbuf_iolock; std::stringstream _bs; _bs << x; std::clog << _bs.str() << std::endl;} #else #define BUFLOG(x) #endif diff --git a/src/XrdCeph/XrdCephOssBufferedFile.hh b/src/XrdCeph/XrdCephOssBufferedFile.hh index b20ef10df..53c102dac 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.hh +++ b/src/XrdCeph/XrdCephOssBufferedFile.hh @@ -67,7 +67,7 @@ public: protected: XrdCephOss *m_cephoss = nullptr; XrdCephOssFile * m_xrdOssDF = nullptr; // holder of the XrdCephOssFile instance - std::unique_ptr(m_bufferAlg); + std::unique_ptr m_bufferAlg; int m_flags = 0; size_t m_bufsize = 16*1024*1024L; // default 16MiB size From 9494b401b92b5c9c0ad4f9276872ef3cc71bdafa Mon Sep 17 00:00:00 2001 From: snafus Date: Mon, 11 Apr 2022 08:03:51 +0100 Subject: [PATCH 04/18] Master buffered ceph io (#20) * Buffer implementation for XrdCeph * Better error return code values * Add timing into BufferIO * Add timing into BufferSimple * Utils code area * Update raw data access and copy * Adding Extents * ReadV simple logic * Add to own files the readV implementations * Add to own files the readV implementations; cmake updated * Logging improvements and write buffer updates * Add IOadapter with blocking aio access * Use IOadapter with blocking aio access * Small logging update * Reduce logging information; fix timeing to ms * Reduce logging information; * Reduced logging, and better use of aggregated metrics * comment clean and typo fixes * Remove uncessary file close * Additional logging in case of problems * Additional logging in case of problems * allow option for buffering with IO or AIO buffer * fix conflicts * Allow for finite retries 
on EBUSY, else fail with EIO. It is possible for a read/write from the buffer to return EBUSY due to an underlying issue. In these cases, if the -EBUSY is returned out of XrdCeph, a large number of retries can originate. It is better at this point for the transfer to be flagged as failed, and retried properly. The code allows for 5 retries with a 1s sleep between them. If this doesn't work - which it might not - then an -EIO error is returned to xrootd. Other error messages are not affected. * Better summary stats output for CephIOAdapterRaw * Comment out a comment Co-authored-by: james Co-authored-by: root --- .../XrdCephBuffers/CephIOAdapterRaw.cc | 23 +++++++--- src/XrdCeph/XrdCephOssBufferedFile.cc | 46 +++++++++++++++++-- src/XrdCeph/XrdCephOssBufferedFile.hh | 3 ++ 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc index fae2a2669..6ac62ea13 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -16,13 +16,22 @@ CephIOAdapterRaw::CephIOAdapterRaw(IXrdCephBufferData * bufferdata, int fd) : } CephIOAdapterRaw::~CephIOAdapterRaw() { - // nothing to specifically delete; just print out some stats if in debug - BUFLOG ("CephIOAdapterRaw::Summary fd:" << m_fd - << " " << m_stats_write_req << " " << m_stats_write_bytes << " " - << m_stats_write_timer*1e-3 << " " << m_stats_write_longest*1e-3 - << " " << m_stats_read_req << " " << m_stats_read_bytes << " " - << m_stats_read_timer*1e-3 << " " << m_stats_read_longest*1e-3); - + // nothing to specifically to do; just print out some stats + float read_speed{0}, write_speed{0}; + if (m_stats_read_req.load() > 0) { + read_speed = m_stats_read_bytes.load() / m_stats_read_timer.load() * 1e-3; + } + if (m_stats_write_req.load() > 0) { + write_speed = m_stats_write_bytes.load() / m_stats_write_timer.load() * 1e-3; + } + BUFLOG("CephIOAdapterRaw::Summary fd:" << 
m_fd + << " nwrite:" << m_stats_write_req << " byteswritten:" << m_stats_write_bytes << " write_s:" + << m_stats_write_timer * 1e-3 << " writemax_s" << m_stats_write_longest * 1e-3 + << " write_MBs:" << write_speed + << " nread:" << m_stats_read_req << " bytesread:" << m_stats_read_bytes << " read_s:" + << m_stats_read_timer * 1e-3 << " readmax_s:" << m_stats_read_longest * 1e-3 + << " read_MBs:" << read_speed ); + } ssize_t CephIOAdapterRaw::write(off64_t offset,size_t count) { diff --git a/src/XrdCeph/XrdCephOssBufferedFile.cc b/src/XrdCeph/XrdCephOssBufferedFile.cc index b23dbcddd..1474d0f73 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.cc +++ b/src/XrdCeph/XrdCephOssBufferedFile.cc @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "XrdCeph/XrdCephPosix.hh" #include "XrdOuc/XrdOucEnv.hh" @@ -43,7 +45,10 @@ #include "XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh" #include "XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh" + + using namespace XrdCephBuffer; +using namespace std::chrono_literals; extern XrdSysError XrdCephEroute; extern XrdOucTrace XrdCephTrace; @@ -51,7 +56,9 @@ extern XrdOucTrace XrdCephTrace; XrdCephOssBufferedFile::XrdCephOssBufferedFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF, size_t buffersize,const std::string& bufferIOmode): - XrdCephOssFile(cephoss), m_cephoss(cephoss), m_xrdOssDF(cephossDF), m_bufsize(buffersize), + XrdCephOssFile(cephoss), m_cephoss(cephoss), m_xrdOssDF(cephossDF), + m_maxBufferRetrySleepTime_ms(1000), + m_bufsize(buffersize), m_bufferIOmode(bufferIOmode) { @@ -155,12 +162,29 @@ ssize_t XrdCephOssBufferedFile::Read(off_t offset, size_t blen) { } ssize_t XrdCephOssBufferedFile::Read(void *buff, off_t offset, size_t blen) { - ssize_t rc = m_bufferAlg->read(buff, offset, blen); + int retry_counter{m_maxBufferRetries}; + ssize_t rc {0}; + while (retry_counter > 0) { + rc = m_bufferAlg->read(buff, offset, blen); + if (rc != -EBUSY) break; // either worked, or is a real non busy error + LOGCEPH( 
"XrdCephOssBufferedFile::Read Recieved EBUSY for fd: " << m_fd << " on try: " << (m_maxBufferRetries-retry_counter) << ". Sleeping .. " + << " rc:" << rc << " off:" << offset << " len:" << blen); + std::this_thread::sleep_for(m_maxBufferRetrySleepTime_ms * 1ms); + --retry_counter; + } + if (retry_counter == 0) { + // reach maximum attempts for ebusy retry; fail the job + LOGCEPH( "XrdCephOssBufferedFile::Read Max attempts for fd: " << m_fd << " on try: " << (m_maxBufferRetries-retry_counter) << ". Terminating with -EIO: " + << " rc:" << rc << " off:" << offset << " len:" << blen ); + // set a permanent error code: + rc = -EIO; + } if (rc >=0) { m_bytesRead.fetch_add(rc); } else { LOGCEPH( "XrdCephOssBufferedFile::Read: Read error fd: " << m_fd << " rc:" << rc << " off:" << offset << " len:" << blen); } + // LOGCEPH( "XrdCephOssBufferedFile::Read: Read good fd: " << m_fd << " rc:" << rc << " off:" << offset << " len:" << blen); return rc; } @@ -190,7 +214,23 @@ int XrdCephOssBufferedFile::Fstat(struct stat *buff) { } ssize_t XrdCephOssBufferedFile::Write(const void *buff, off_t offset, size_t blen) { - ssize_t rc = m_bufferAlg->write(buff, offset, blen); + int retry_counter{m_maxBufferRetries}; + ssize_t rc {0}; + while (retry_counter > 0) { + rc = m_bufferAlg->write(buff, offset, blen); + if (rc != -EBUSY) break; // either worked, or is a real non busy error + LOGCEPH( "XrdCephOssBufferedFile::Write Recieved EBUSY for fd: " << m_fd << " on try: " << (m_maxBufferRetries-retry_counter) << ". Sleeping .. " + << " rc:" << rc << " off:" << offset << " len:" << blen); + std::this_thread::sleep_for(m_maxBufferRetrySleepTime_ms * 1ms); + --retry_counter; + } + if (retry_counter == 0) { + // reach maximum attempts for ebusy retry; fail the job + LOGCEPH( "XrdCephOssBufferedFile::Write Max attempts for fd: " << m_fd << " on try: " << (m_maxBufferRetries-retry_counter) << ". 
Terminating with -EIO: " + << " rc:" << rc << " off:" << offset << " len:" << blen ); + // set a permanent error code: + rc = -EIO; + } if (rc >=0) { m_bytesWrite.fetch_add(rc); } else { diff --git a/src/XrdCeph/XrdCephOssBufferedFile.hh b/src/XrdCeph/XrdCephOssBufferedFile.hh index 53c102dac..4241dc14d 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.hh +++ b/src/XrdCeph/XrdCephOssBufferedFile.hh @@ -69,6 +69,9 @@ protected: XrdCephOssFile * m_xrdOssDF = nullptr; // holder of the XrdCephOssFile instance std::unique_ptr m_bufferAlg; + int m_maxBufferRetries {5}; //! How many times to retry a ready from a buffer with EBUSY errors + int m_maxBufferRetrySleepTime_ms; //! number of ms to sleep if a retry is requested + int m_flags = 0; size_t m_bufsize = 16*1024*1024L; // default 16MiB size std::string m_bufferIOmode; From dd8d78062a6e7cb34432edaa7f7c3e897883ba3e Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Mon, 23 May 2022 14:43:06 +0100 Subject: [PATCH 05/18] variable version/release for template (#21) --- packaging/rhel/xrootd-ceph.spec.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/rhel/xrootd-ceph.spec.in b/packaging/rhel/xrootd-ceph.spec.in index 721be70f3..db92ec468 100644 --- a/packaging/rhel/xrootd-ceph.spec.in +++ b/packaging/rhel/xrootd-ceph.spec.in @@ -40,8 +40,8 @@ #------------------------------------------------------------------------------- Name: xrootd-ceph Epoch: 1 -Version: 5.3.4 -Release: 1%{?dist}%{?_with_clang:.clang} +Version: __VERSION__ +Release: __RELEASE__%{?dist}%{?_with_clang:.clang} Summary: CEPH plug-in for XRootD Group: System Environment/Daemons License: LGPLv3+ From 52eb6c22cb2e746b3b99e6efc95c3e4e404dc27b Mon Sep 17 00:00:00 2001 From: snafus Date: Tue, 26 Jul 2022 15:27:34 +0100 Subject: [PATCH 06/18] Update bufferedIO with updates from master (#26) * variable rpm name (#17) * variable rpm name * Update xrootd-ceph.spec.in * Update makesrpm.sh * Update 
makesrpm.sh * Master cephnamelib (#16) * Allow ceph.namelib to take params and apply translation to full path * Reduce logging Remove extraneous logging messages * simplify parsing of namelib and added a log line for any remapped file Co-authored-by: James * XRD-22 Fix ensuring the correct filename is passed to the CephFile instance. (#24) A regression in previous commit meant that the filename was not correctly passed to the CephFile instance. This fix ensures that the filename is set correctly. Co-authored-by: james * re-introduce variable names to spec input (#27) Co-authored-by: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Co-authored-by: James --- src/XrdCeph/XrdCephOss.cc | 10 ++++++---- src/XrdCeph/XrdCephPosix.cc | 10 +++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index e0c88c14c..85dc392c5 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -130,19 +130,21 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { if (!strncmp(var, "ceph.namelib", 12)) { var = Config.GetWord(); if (var) { + std::string libname = var; // Warn in case parameters were givne char parms[1040]; + bool hasParms{false}; if (!Config.GetRest(parms, sizeof(parms)) || parms[0]) { - Eroute.Emsg("Config", "namelib parameters will be ignored"); + hasParms = true; } // Load name lib - XrdOucN2NLoader n2nLoader(&Eroute,configfn,NULL,NULL,NULL); - g_namelib = n2nLoader.Load(var, XrdVERSIONINFOVAR(XrdOssGetStorageSystem), NULL); + XrdOucN2NLoader n2nLoader(&Eroute,configfn,(hasParms?parms:""),NULL,NULL); + g_namelib = n2nLoader.Load(libname.c_str(), XrdVERSIONINFOVAR(XrdOssGetStorageSystem), NULL); if (!g_namelib) { Eroute.Emsg("Config", "Unable to load library given in ceph.namelib : %s", var); } } else { - Eroute.Emsg("Config", "Missing value for ceph.namelib in config file", configfn); + Eroute.Emsg("Config", "Missing value for ceph.namelib in config file ", 
configfn); return 1; } } diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 1c96f8420..82b97ab04 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -402,9 +402,11 @@ void translateFileName(std::string &physName, std::string logName){ logwrapper((char*)"ceph_namelib : failed to translate %s using namelib plugin, using it as is", logName.c_str()); physName = logName; } else { + logwrapper((char*)"ceph_namelib : translated %s to %s", logName.c_str(), physCName); physName = physCName; } } else { + //logwrapper((char*)"ceph_namelib : No mapping done"); physName = logName; } } @@ -419,14 +421,16 @@ void fillCephFile(const char *path, XrdOucEnv *env, CephFile &file) { // If env is null or no entry is found for what is missing, defaults are // applied. These defaults are initially set to 'admin', 'default', 1, 4MB and 4MB // but can be changed via a call to ceph_posix_set_defaults - std::string spath = path; + std::string spath {path}; + // If namelib is specified, apply translation to the whole path (which might include pool, etc) + translateFileName(spath,path); size_t colonPos = spath.find(':'); if (std::string::npos == colonPos) { // deal with name translation - translateFileName(file.name, spath); + file.name = spath; fillCephFileParams("", env, file); } else { - translateFileName(file.name, spath.substr(colonPos+1)); + file.name = spath.substr(colonPos+1); fillCephFileParams(spath.substr(0, colonPos), env, file); } } From c1373c6c40e08429dced718443539814c67ab378 Mon Sep 17 00:00:00 2001 From: snafus Date: Tue, 6 Sep 2022 06:56:08 +0100 Subject: [PATCH 07/18] Decreased logging for bufferedIO operations. (#25) Reduced printouts. Only summary stats now produced, rather than the logging per read. 
Co-authored-by: James Walder --- src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc index 6ac62ea13..82c729a3b 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -43,8 +43,8 @@ ssize_t CephIOAdapterRaw::write(off64_t offset,size_t count) { auto end = std::chrono::steady_clock::now(); auto int_ms = std::chrono::duration_cast(end-start); - BUFLOG("CephIOAdapterRaw::write fd:" << m_fd << " " << rc << " " - << offset << " " << count << " " << rc << " " << int_ms.count() ); + // BUFLOG("CephIOAdapterRaw::write fd:" << m_fd << " " << rc << " " + // << offset << " " << count << " " << rc << " " << int_ms.count() ); if (rc < 0) return rc; m_stats_write_longest = std::max(m_stats_write_longest,int_ms.count()); @@ -75,8 +75,8 @@ ssize_t CephIOAdapterRaw::read(off64_t offset, size_t count) { m_stats_read_bytes.fetch_add(rc); ++m_stats_read_req; - BUFLOG("CephIOAdapterRaw::read fd:" << m_fd << " " << rc << " " << offset - << " " << count << " " << rc << " " << int_ms.count() ); + // BUFLOG("CephIOAdapterRaw::read fd:" << m_fd << " " << rc << " " << offset + // << " " << count << " " << rc << " " << int_ms.count() ); if (rc>=0) { m_bufferdata->setLength(rc); From 1051ad06f788ec81eec20506af0eb163b405e3db Mon Sep 17 00:00:00 2001 From: snafus Date: Tue, 13 Sep 2022 17:04:56 +0100 Subject: [PATCH 08/18] Updates from master to buffered io needed for 550 2 (#32) * XRD-12 Add timestamp information for ceph logging methods Update the logwrapper method to print out the current timestamp in the initial section of output. * Return permission denied on write attempt on existing file with EXCL set (#31) Co-authored-by: James Walder * disable posc (#30) posc is disabled for proxies, but not for a unified setup. 
XrdCeph does not support the posc flag as it misinterprets objects as folders Co-authored-by: James Walder Co-authored-by: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> --- src/XrdCeph/XrdCephOss.cc | 12 +++++++++++- src/XrdCeph/XrdCephPosix.cc | 10 +++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index 85dc392c5..40cec5ec6 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -49,11 +49,19 @@ XrdVERSIONINFO(XrdOssGetStorageSystem, XrdCephOss); XrdSysError XrdCephEroute(0); XrdOucTrace XrdCephTrace(&XrdCephEroute); +/// timestamp output for logging messages +static std::string ts() { + std::time_t t = std::time(nullptr); + char mbstr[50]; + std::strftime(mbstr, sizeof(mbstr), "%y%m%d %H:%M:%S ", std::localtime(&t)); + return std::string(mbstr); +} + // log wrapping function to be used by ceph_posix interface char g_logstring[1024]; static void logwrapper(char *format, va_list argp) { vsnprintf(g_logstring, 1024, format, argp); - XrdCephEroute.Say(g_logstring); + XrdCephEroute.Say(ts().c_str(), g_logstring); } /// pointer to library providing Name2Name interface. 0 be default @@ -100,6 +108,8 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { int NoGo = 0; XrdOucEnv myEnv; XrdOucStream Config(&Eroute, getenv("XRDINSTANCE"), &myEnv, "=====> "); + //disable posc + XrdOucEnv::Export("XRDXROOTD_NOPOSC", "1"); // If there is no config file, nothing to be done if (configfn && *configfn) { // Try to open the configuration file. 
diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 82b97ab04..e60a7ba20 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include "XrdSfs/XrdSfsAio.hh" @@ -254,6 +255,7 @@ static unsigned int stoui(const std::string &s) { return (unsigned int)res; } + /// fills the userId of a ceph file struct from a string and an environment /// returns position of first character after the userId static int fillCephUserId(const std::string ¶ms, XrdOucEnv *env, CephFile &file) { @@ -670,6 +672,8 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode bool fileExists = (rc != -ENOENT); //Make clear what condition we are testing + logwrapper((char*)"Access Mode: %s flags&O_ACCMODE %d ", pathname, flags); + if ((flags&O_ACCMODE) == O_RDONLY) { // Access mode is READ if (fileExists) { @@ -688,7 +692,11 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode return rc; } } else { - return -EEXIST; + if (flags & O_EXCL) { + return -EACCES; // permission denied + } else { + return -EEXIST; // otherwise return just file exists + } } } // At this point, we know either the target file didn't exist, or the ceph_posix_unlink above removed it From 5b41ef0495707f3958786b4256ad4c87d09a3ddd Mon Sep 17 00:00:00 2001 From: snafus Date: Tue, 21 Feb 2023 09:40:48 +0000 Subject: [PATCH 09/18] Buffered io multibuffers (#38) * Add multiple buffer support for reads in case of simultaneous threads reading the same file. 
* Further refinements to the simultaneous file reads code - Ensure all relevant read / write methods will create a buffer if needed - Validity check on close that a buffer was actually created (or bypass code if not) - Bugfix in case of odd read sizes combined with multi/split buffer reads (critical) - Clean-up of comments included for development * Enhanced logging for cluster metrics and readV layer improvements (#35) - dumpCLusterInfo to check on the rados connection info - extra logging in a delete to give info on delete times - update the readV basic alg to do a simple bulk request Co-authored-by: James Walder * Add time taken to unlink a file in the logging message - Logging an unlink now includes the time taken, in cases of (un)successful deletes - Remove some extraneous comments * - Fix issue with buffer passthrough read - Add maximum number of simultaneous buffers for a given file Once a given number of opens have been made against the same file, don't create a large buffer, and only create a 1MiB buffer for each new file. This should avoid issues with small paged reads, but would normally hope the passthrough mode would be triggered in each read. * Additional statistics on buffered reading added.
- Will report bytes read from ceph, bytes read but bypassed the cache, and the cache hit fraction --------- Co-authored-by: James Walder --- .../XrdCephBuffers/CephIOAdapterRaw.cc | 5 +- .../XrdCephBuffers/IXrdCephReadVAdapter.hh | 2 +- .../XrdCephBuffers/XrdCephBufferAlgSimple.cc | 61 +++++++-- .../XrdCephBuffers/XrdCephBufferAlgSimple.hh | 4 + .../XrdCephBuffers/XrdCephReadVBasic.cc | 77 ++++++----- .../XrdCephBuffers/XrdCephReadVBasic.hh | 14 +- .../XrdCephBuffers/XrdCephReadVNoOp.cc | 2 +- .../XrdCephBuffers/XrdCephReadVNoOp.hh | 2 +- src/XrdCeph/XrdCephOss.cc | 21 ++- src/XrdCeph/XrdCephOss.hh | 3 +- src/XrdCeph/XrdCephOssBufferedFile.cc | 128 ++++++++++++++---- src/XrdCeph/XrdCephOssBufferedFile.hh | 12 +- src/XrdCeph/XrdCephOssReadVFile.cc | 30 ++-- src/XrdCeph/XrdCephOssReadVFile.hh | 1 + src/XrdCeph/XrdCephPosix.cc | 54 +++++++- 15 files changed, 314 insertions(+), 102 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc index 82c729a3b..28815b779 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -68,7 +68,10 @@ ssize_t CephIOAdapterRaw::read(off64_t offset, size_t count) { //auto elapsed = end-start; auto int_ms = std::chrono::duration_cast(end-start); - if (rc < 0) return rc; + if (rc < 0) { + BUFLOG("CephIOAdapterRaw::read: Error in read: " << rc ); + return rc; + } m_stats_read_longest = std::max(m_stats_read_longest,int_ms.count()); m_stats_read_timer.fetch_add(int_ms.count()); diff --git a/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh b/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh index 18d6c1ae5..7e8361aa7 100644 --- a/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh +++ b/src/XrdCeph/XrdCephBuffers/IXrdCephReadVAdapter.hh @@ -35,7 +35,7 @@ namespace XrdCephBuffer * @param extentsIn * @return std::vector */ - virtual std::vector convert(const ExtentHolder &extentsIn) const =0; + virtual std::vector 
convert(const ExtentHolder &extentsIn) =0; protected: }; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc index fa6de1ddd..d14ccc347 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include "XrdSfs/XrdSfsAio.hh" @@ -16,13 +17,23 @@ using namespace XrdCephBuffer; -XrdCephBufferAlgSimple::XrdCephBufferAlgSimple(std::unique_ptr buffer, std::unique_ptr cephio, int fd ): +XrdCephBufferAlgSimple::XrdCephBufferAlgSimple(std::unique_ptr buffer, + std::unique_ptr cephio, int fd ): m_bufferdata(std::move(buffer)), m_cephio(std::move(cephio)), m_fd(fd){ } XrdCephBufferAlgSimple::~XrdCephBufferAlgSimple() { - BUFLOG("XrdCephBufferAlgSimple::Destructor fd:" << m_fd); + int prec = std::cout.precision(); + float bytesBuffered = m_stats_bytes_fromceph - m_stats_bytes_bypassed; + float cacheUseFraction = bytesBuffered > 0 ? (1.*(m_stats_bytes_toclient-m_stats_bytes_bypassed)/bytesBuffered) : 1. 
; + + BUFLOG("XrdCephBufferAlgSimple::Destructor, fd=" << m_fd + << ", retrieved_bytes=" << m_stats_bytes_fromceph + << ", bypassed_bytes=" << m_stats_bytes_bypassed + << ", delivered_bytes=" << m_stats_bytes_toclient + << std::setprecision(4) + << ", cache_hit_frac=" << cacheUseFraction << std::setprecision(prec)); m_fd = -1; } @@ -78,9 +89,15 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t // Set a lock for any attempt at a simultaneous operation // Use recursive, as flushCache also calls the lock and don't want to deadlock // No call to flushCache should happen in a read, but be consistent + // BUFLOG("XrdCephBufferAlgSimple::read: preLock: " << std::hash{}(std::this_thread::get_id()) << " " << offset << " " << blen); const std::lock_guard lock(m_data_mutex); // + // BUFLOG("XrdCephBufferAlgSimple::read: postLock: " << std::hash{}(std::this_thread::get_id()) << " " << offset << " " << blen); - //BUFLOG("XrdCephBufferAlgSimple::read: " << offset << " " << blen); + // BUFLOG("XrdCephBufferAlgSimple::read status:" + // << "\n\tRead off/len/end: " << offset << "/" << blen << "/(" << (offset+blen) <<")" + // << "\n\tBuffer: start/length/end/cap: " << m_bufferStartingOffset << "/" << m_bufferLength << "/" + // << (m_bufferStartingOffset + m_bufferLength) << "/" << m_bufferdata->capacity() + // ); if (blen == 0) return 0; /** @@ -88,12 +105,19 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t * Invalidate the cache in anycase */ if (blen >= m_bufferdata->capacity()) { - //BUFLOG("XrdCephBufferAlgSimple::read: Readthrough cache: fd: " << m_fd - // << " " << offset << " " << blen); + BUFLOG("XrdCephBufferAlgSimple::read: Readthrough cache: fd: " << m_fd + << " " << offset << " " << blen); // larger than cache, so read through, and invalidate the cache anyway m_bufferdata->invalidate(); + m_bufferLength =0; // ensure cached data is set to zero length // #FIXME JW: const_cast is probably a bit poor. 
- return ceph_posix_pread(m_fd, const_cast(buf), blen, offset); + ssize_t rc = ceph_posix_pread(m_fd, const_cast(buf), blen, offset); + if (rc > 0) { + m_stats_bytes_fromceph += rc; + m_stats_bytes_toclient += rc; + m_stats_bytes_bypassed += rc; + } + return rc; } ssize_t rc(-1); @@ -106,6 +130,8 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t * out the current buffer, and a second, to read the partial data from the refilled buffer */ while (bytesRemaining > 0) { + // BUFLOG("In loop: " << " " << offset << " + " << offsetDelta << "; " << blen << " : " << bytesRemaining << " " << m_bufferLength); + bool loadCache = false; // run some checks to see if we need to fill the cache. if (m_bufferLength == 0) { @@ -117,6 +143,9 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t } else if (offset >= (off_t) (m_bufferStartingOffset + m_bufferLength) ) { // offset is beyond the stored data loadCache = true; + } else if ((offset - m_bufferStartingOffset + offsetDelta) >= (off_t)m_bufferLength) { + // we have now read to the end of the buffers data + loadCache = true; } /** @@ -124,13 +153,16 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t * */ if (loadCache) { + // BUFLOG("XrdCephBufferAlgSimple::read: preLock: " << std::hash{}(std::this_thread::get_id()) << " " << "Filling the cache"); m_bufferdata->invalidate(); + m_bufferLength =0; // set lengh of data stored to 0 rc = m_cephio->read(offset + offsetDelta, m_bufferdata->capacity()); // fill the cache - //BUFLOG("LoadCache ReadToCache: " << rc << " " << offset + offsetDelta << " " << m_bufferdata->capacity() ); + // BUFLOG("LoadCache ReadToCache: " << rc << " " << offset + offsetDelta << " " << m_bufferdata->capacity() ); if (rc < 0) { BUFLOG("LoadCache Error: " << rc); return rc;// TODO return correct errors } + m_stats_bytes_fromceph += rc; m_bufferStartingOffset = offset + offsetDelta; m_bufferLength = rc; if (rc == 0) { @@ 
-140,11 +172,16 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t } } + //now read as much data as possible - off_t bufPosition = offset - m_bufferStartingOffset + offsetDelta; - rc = m_bufferdata->readBuffer( (void*) &(((char*)buf)[offsetDelta]) , bufPosition + offsetDelta , bytesRemaining); + off_t bufPosition = offset + offsetDelta - m_bufferStartingOffset; + rc = m_bufferdata->readBuffer( (void*) &(((char*)buf)[offsetDelta]) , bufPosition , bytesRemaining); + // BUFLOG("Fill result: " << offsetDelta << " " << bufPosition << " " << bytesRemaining << " " << rc) if (rc < 0 ) { - BUFLOG("Reading from Cache Failed: " << rc << " " << offsetDelta << " " << bytesRemaining ); + BUFLOG("Reading from Cache Failed: " << rc << " " << offset << " " + << offsetDelta << " " << m_bufferStartingOffset << " " + << bufPosition << " " + << bytesRemaining ); return rc; // TODO return correct errors } if (rc == 0) { @@ -153,8 +190,8 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t break; // leave the loop even though bytesremaing is probably >=0. //i.e. requested a full buffers worth, but only a fraction of the file is here. 
} - - //BUFLOG("End of loop: " << rc << " " << offset << " + " << offsetDelta << "; " << blen << " : " << bytesRemaining); + m_stats_bytes_toclient += rc; + // BUFLOG("End of loop: " << rc << " " << offset << " + " << offsetDelta << "; " << blen << " : " << bytesRemaining); offsetDelta += rc; bytesRemaining -= rc; bytesRead += rc; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh index 3f30aa38a..fdd0a2227 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh @@ -54,6 +54,10 @@ class XrdCephBufferAlgSimple : public virtual IXrdCephBufferAlg { size_t m_bufferLength = 0; std::recursive_mutex m_data_mutex; // any data access method on the buffer will use this + + long m_stats_bytes_fromceph{0}; //! number of bytes requested from ceph, to fill the buffers, etc. + long m_stats_bytes_bypassed{0}; //! number of bytes specifically bypassed + long m_stats_bytes_toclient{0}; //! number of bytes requested by the client }; } diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc index 219d5cf75..2cd578dbe 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.cc @@ -4,53 +4,62 @@ using namespace XrdCephBuffer; -std::vector XrdCephReadVBasic::convert(const ExtentHolder &extentsHolderInput) const + +XrdCephReadVBasic::~XrdCephReadVBasic() { + + size_t totalBytes = m_usedBytes + m_wastedBytes; + float goodFrac_pct = totalBytes > 0 ? m_usedBytes/(totalBytes*100.) 
: 0; + BUFLOG("XrdCephReadVBasic: Summary: " + << " Used: " << m_usedBytes << " Wasted: " << m_wastedBytes << " goodFrac: " + << goodFrac_pct + ); +} + +std::vector XrdCephReadVBasic::convert(const ExtentHolder &extentsHolderInput) { std::vector outputs; const ExtentContainer &extentsIn = extentsHolderInput.extents(); - ExtentContainer::const_iterator it = extentsIn.begin(); - while (it != extentsIn.end()) + ExtentContainer::const_iterator it_l = extentsIn.begin(); + ExtentContainer::const_iterator it_r = extentsIn.begin(); + ExtentContainer::const_iterator it_end = extentsIn.end(); + + // Shortcut the process if range is small + if ((it_end->end() - it_l->begin()) <= m_minSize) { + ExtentHolder tmp(extentsIn); + outputs.push_back(tmp); + BUFLOG("XrdCephReadVBasic: Combine all extents: " + << tmp.size() << " " + << it_l->begin() << " " << it_end->end() ); + return outputs; + } + size_t usedBytes(0); + size_t wastedBytes(0); + + // outer loop over extents + while (it_r != it_end) { ExtentHolder tmp; int counter(0); - while (it != extentsIn.end()) { - tmp.push_back(*it); // just put it into an extent - ++it; + it_l = it_r; + // inner loop over each internal extent range + while (it_r != it_end) { + if ((it_r->end() - it_l->begin()) > m_maxSize) break; // start a new holder + tmp.push_back(*it_r); // just put it into an extent + ++it_r; ++counter; - if (counter > 10 ) break; } - // while (it != extentsIn.end()) - // { - // //std::clog << "XrdCephReadVBasic: Inner: " << it->begin() << " " << it->len() << std::endl; - // if (!tmp.size()) - // { - // tmp.push_back(*it); - // } - // else if (it->end() - tmp.begin() < (ssize_t)m_minSize) - // { - // tmp.push_back(*it); - // } - // else if (((tmp.bytesContained() + it->len()) / (tmp.len() + it->len())) > 0.6) - // { - // tmp.push_back(*it); - // } - // else if (it->end() - tmp.begin() >= (ssize_t)m_maxSize) - // { - // break; // don't make too big - // } - // else - // { - // break; // didn't fullful logic to include, so 
start a new extent in next loop - // } - // ++it; - // } - //BUFLOG("XrdCephReadVBasic: Done Inner: " << tmp.size()); outputs.push_back(tmp); + usedBytes += tmp.bytesContained(); + wastedBytes += tmp.bytesMissing(); } + m_usedBytes += usedBytes; + m_wastedBytes += wastedBytes; BUFLOG("XrdCephReadVBasic: In size: " << extentsHolderInput.size() << " " - << extentsHolderInput.extents().size() << " " << outputs.size() ); + << extentsHolderInput.extents().size() << " " << outputs.size() << " " + << " useful bytes: " << usedBytes << " wasted bytes:" << wastedBytes); + return outputs; } // convert diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh index 9ac912025..662b99319 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVBasic.hh @@ -26,13 +26,19 @@ namespace XrdCephBuffer // nothing more than readV in, and readV out public: XrdCephReadVBasic() {} - virtual ~XrdCephReadVBasic() {} + virtual ~XrdCephReadVBasic(); - virtual std::vector convert(const ExtentHolder &extentsHolderInput) const override; + virtual std::vector convert(const ExtentHolder &extentsHolderInput) override; protected: - size_t m_minSize = 2*1024*1024; - size_t m_maxSize = 64*1024*1024; + ssize_t m_minSize = 2*1024*1024; + ssize_t m_maxSize = 16*1024*1024; + + private: + size_t m_usedBytes = 0; + size_t m_wastedBytes = 0; + + }; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc index 89bf323e7..8c5617f8d 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.cc @@ -4,7 +4,7 @@ using namespace XrdCephBuffer; -std::vector XrdCephReadVNoOp::convert(const ExtentHolder &extentsHolderInput) const +std::vector XrdCephReadVNoOp::convert(const ExtentHolder &extentsHolderInput) { std::vector outputs; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh 
b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh index b19b29123..9344d51c6 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh +++ b/src/XrdCeph/XrdCephBuffers/XrdCephReadVNoOp.hh @@ -26,7 +26,7 @@ namespace XrdCephBuffer XrdCephReadVNoOp() {} virtual ~XrdCephReadVNoOp() {} - virtual std::vector convert(const ExtentHolder &extentsHolderInput) const override; + virtual std::vector convert(const ExtentHolder &extentsHolderInput) override; protected: }; diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index 40cec5ec6..b2884d23b 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -189,7 +189,23 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { Eroute.Emsg("Config", "Missing value for ceph.buffersize in config file", configfn); return 1; } - } // usebuffer + } // buffersize + if (!strncmp(var, "ceph.buffermaxpersimul", 22)) { // size in bytes + var = Config.GetWord(); + if (var) { + unsigned long value = strtoul(var, 0, 10); + if (value > 0 and value <= 1000000000L) { + m_configMaxSimulBufferCount = value; + Eroute.Emsg("Config", "ceph.buffermaxpersimul", std::to_string(m_configMaxSimulBufferCount).c_str() ); + } else { + Eroute.Emsg("Config", "Invalid value for ceph.buffermaxpersimul in config file; enter in bytes (no units)", configfn, var); + return 1; + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.buffermaxpersimul in config file", configfn); + return 1; + } + } // buffersize if (!strncmp(var, "ceph.usereadv", 13)) { // allowable values: 0, 1 var = Config.GetWord(); @@ -358,7 +374,8 @@ XrdOssDF* XrdCephOss::newFile(const char *tident) { } if (m_configBufferEnable) { - xrdCephOssDF = new XrdCephOssBufferedFile(this,xrdCephOssDF, m_configBufferSize, m_configBufferIOmode); + xrdCephOssDF = new XrdCephOssBufferedFile(this,xrdCephOssDF, m_configBufferSize, + m_configBufferIOmode, m_configMaxSimulBufferCount); } diff --git a/src/XrdCeph/XrdCephOss.hh b/src/XrdCeph/XrdCephOss.hh index 
84af2b429..2749a3ece 100644 --- a/src/XrdCeph/XrdCephOss.hh +++ b/src/XrdCeph/XrdCephOss.hh @@ -77,7 +77,8 @@ public: std::string m_configBufferIOmode = "aio"; bool m_configReadVEnable=false; //! enable readV decorator std::string m_configReadVAlgName="passthrough"; // readV algorithm type - + size_t m_configMaxSimulBufferCount=10; //! max number of buffers in a single Oss instance (.e.g simul. reads) + }; #endif /* __CEPH_OSS_HH__ */ diff --git a/src/XrdCeph/XrdCephOssBufferedFile.cc b/src/XrdCeph/XrdCephOssBufferedFile.cc index 1474d0f73..1b230317a 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.cc +++ b/src/XrdCeph/XrdCephOssBufferedFile.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,7 @@ #include "XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh" #include "XrdCeph/XrdCephBuffers/CephIOAdapterAIORaw.hh" - +#include using namespace XrdCephBuffer; using namespace std::chrono_literals; @@ -55,11 +56,13 @@ extern XrdOucTrace XrdCephTrace; XrdCephOssBufferedFile::XrdCephOssBufferedFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF, - size_t buffersize,const std::string& bufferIOmode): + size_t buffersize,const std::string& bufferIOmode, + size_t maxNumberSimulBuffers): XrdCephOssFile(cephoss), m_cephoss(cephoss), m_xrdOssDF(cephossDF), + m_maxCountReadBuffers(maxNumberSimulBuffers), m_maxBufferRetrySleepTime_ms(1000), m_bufsize(buffersize), - m_bufferIOmode(bufferIOmode) + m_bufferIOmode(bufferIOmode) { } @@ -87,23 +90,6 @@ int XrdCephOssBufferedFile::Open(const char *path, int flags, mode_t mode, XrdOu m_flags = flags; // e.g. for write/read knowledge m_path = path; // good to keep the path for final stats presentation - // opened a file, so create the buffer here; note - this might be better delegated to the first read/write ... 
- // need the file descriptor, so do it after we know the file is opened (and not just a stat for example) - std::unique_ptr cephbuffer = std::unique_ptr(new XrdCephBufferDataSimple(m_bufsize)); - // std::unique_ptr cephio = std::unique_ptr(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); - std::unique_ptr cephio; - if (m_bufferIOmode == "aio") { - cephio = std::unique_ptr(new CephIOAdapterAIORaw(cephbuffer.get(),m_fd)); - } else if (m_bufferIOmode == "io") { - cephio = std::unique_ptr(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); - } else { - BUFLOG("XrdCephOssBufferedFile: buffer mode needs to be one of aio|io " ); - m_xrdOssDF->Close(); - return -EINVAL; - } - - LOGCEPH( "XrdCephOssBufferedFile::Open: fd: " << m_fd << " Buffer created: " << cephbuffer->capacity() ); - m_bufferAlg = std::unique_ptr(new XrdCephBufferAlgSimple(std::move(cephbuffer),std::move(cephio),m_fd) ); // start the timer //m_timestart = std::chrono::steady_clock::now(); @@ -114,7 +100,7 @@ int XrdCephOssBufferedFile::Open(const char *path, int flags, mode_t mode, XrdOu int XrdCephOssBufferedFile::Close(long long *retsz) { // if data is still in the buffer and we are writing, make sure to write it - if ((m_flags & (O_WRONLY|O_RDWR)) != 0) { + if (m_bufferAlg && (m_flags & (O_WRONLY|O_RDWR)) != 0) { ssize_t rc = m_bufferAlg->flushWriteCache(); if (rc < 0) { LOGCEPH( "XrdCephOssBufferedFile::Close: flush Error fd: " << m_fd << " rc:" << rc ); @@ -144,7 +130,8 @@ int XrdCephOssBufferedFile::Close(long long *retsz) { << ", writeAIO_B:" << m_bytesWriteAIO.load() << ", startTime:\"" << std::put_time(std::localtime(&t_s), "%F %T") << "\", endTime:\"" << std::put_time(std::localtime(&t_c), "%F %T") << "\"" - << "}"); + << ", nBuffersRead:" << m_bufferReadAlgs.size() + << "}"); return m_xrdOssDF->Close(retsz); } @@ -162,10 +149,37 @@ ssize_t XrdCephOssBufferedFile::Read(off_t offset, size_t blen) { } ssize_t XrdCephOssBufferedFile::Read(void *buff, off_t offset, size_t blen) { + size_t thread_id = 
std::hash{}(std::this_thread::get_id()); + + IXrdCephBufferAlg * buffer{nullptr}; + // check for, and create if needed, a buffer + { + // lock in case need to create a new algorithm instance + const std::lock_guard lock(m_buf_mutex); + auto buffer_itr = m_bufferReadAlgs.find(thread_id); + if (buffer_itr == m_bufferReadAlgs.end()) { + // only create a buffer, if we haven't hit the max buffers yet + auto buffer_ptr = std::move(createBuffer()); + if (buffer_ptr) { + buffer = buffer_ptr.get(); + m_bufferReadAlgs[thread_id] = std::move(buffer_ptr); + } else { + // if we can't create a buffer, we just have to pass through the read ... + ssize_t rc = m_xrdOssDF->Read(buff, offset, blen); + if (rc >= 0) { + LOGCEPH( "XrdCephOssBufferedFile::Read buffers and read failed with rc: " << rc ); + } + return rc; + } + } else { + buffer = buffer_itr->second.get(); + } + } // scope of lock + int retry_counter{m_maxBufferRetries}; ssize_t rc {0}; while (retry_counter > 0) { - rc = m_bufferAlg->read(buff, offset, blen); + rc = buffer->read(buff, offset, blen); if (rc != -EBUSY) break; // either worked, or is a real non busy error LOGCEPH( "XrdCephOssBufferedFile::Read Recieved EBUSY for fd: " << m_fd << " on try: " << (m_maxBufferRetries-retry_counter) << ". Sleeping .. 
" << " rc:" << rc << " off:" << offset << " len:" << blen); @@ -189,12 +203,26 @@ ssize_t XrdCephOssBufferedFile::Read(void *buff, off_t offset, size_t blen) { } int XrdCephOssBufferedFile::Read(XrdSfsAio *aiop) { + size_t thread_id = std::hash{}(std::this_thread::get_id()); + IXrdCephBufferAlg * buffer{nullptr}; + // check for, and create if needed, a buffer + { + // lock in case need to create a new algorithm instance + const std::lock_guard lock(m_buf_mutex); + auto buffer_itr = m_bufferReadAlgs.find(thread_id); + if (buffer_itr == m_bufferReadAlgs.end()) { + m_bufferReadAlgs[thread_id] = createBuffer(); + buffer = m_bufferReadAlgs.find(thread_id)->second.get(); + } else { + buffer = buffer_itr->second.get(); + } + } // LOGCEPH("XrdCephOssBufferedFile::AIOREAD: fd: " << m_xrdOssDF->getFileDescriptor() << " " << time(nullptr) << " : " // << aiop->sfsAio.aio_offset << " " // << aiop->sfsAio.aio_nbytes << " " << aiop->sfsAio.aio_reqprio << " " // << aiop->sfsAio.aio_fildes ); - ssize_t rc = m_bufferAlg->read_aio(aiop); + ssize_t rc = buffer->read_aio(aiop); if (rc > 0) { m_bytesReadAIO.fetch_add(rc); } else { @@ -214,6 +242,16 @@ int XrdCephOssBufferedFile::Fstat(struct stat *buff) { } ssize_t XrdCephOssBufferedFile::Write(const void *buff, off_t offset, size_t blen) { + + if (!m_bufferAlg) { + m_bufferAlg = createBuffer(); + if (!m_bufferAlg) { + LOGCEPH( "XrdCephOssBufferedFile: Error in creating buffered object"); + return -EINVAL; + } + } + + int retry_counter{m_maxBufferRetries}; ssize_t rc {0}; while (retry_counter > 0) { @@ -240,6 +278,14 @@ ssize_t XrdCephOssBufferedFile::Write(const void *buff, off_t offset, size_t ble } int XrdCephOssBufferedFile::Write(XrdSfsAio *aiop) { + if (!m_bufferAlg) { + m_bufferAlg = createBuffer(); + if (!m_bufferAlg) { + LOGCEPH( "XrdCephOssBufferedFile: Error in creating buffered object"); + return -EINVAL; + } + } + // LOGCEPH("XrdCephOssBufferedFile::AIOWRITE: fd: " << m_xrdOssDF->getFileDescriptor() << " " << time(nullptr) 
<< " : " // << aiop->sfsAio.aio_offset << " " // << aiop->sfsAio.aio_nbytes << " " << aiop->sfsAio.aio_reqprio << " " @@ -262,3 +308,37 @@ int XrdCephOssBufferedFile::Fsync() { int XrdCephOssBufferedFile::Ftruncate(unsigned long long len) { return m_xrdOssDF->Ftruncate(len); } + + +std::unique_ptr XrdCephOssBufferedFile::createBuffer() { + std::unique_ptr bufferAlg; + + size_t bufferSize {m_bufsize}; // create buffer of default size + if (m_bufferReadAlgs.size() >= m_maxCountReadBuffers) { + BUFLOG("XrdCephOssBufferedFile: buffer reached max number of simul-buffers for this file: creating only 1MiB buffer" ); + bufferSize = 1048576; + } else { + BUFLOG("XrdCephOssBufferedFile: buffer: got " << m_bufferReadAlgs.size() << " buffers already"); + } + + try { + std::unique_ptr cephbuffer = std::unique_ptr(new XrdCephBufferDataSimple(bufferSize)); + std::unique_ptr cephio; + if (m_bufferIOmode == "aio") { + cephio = std::unique_ptr(new CephIOAdapterAIORaw(cephbuffer.get(),m_fd)); + } else if (m_bufferIOmode == "io") { + cephio = std::unique_ptr(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); + } else { + BUFLOG("XrdCephOssBufferedFile: buffer mode needs to be one of aio|io " ); + m_xrdOssDF->Close(); + return bufferAlg; // invalid instance; + } + + LOGCEPH( "XrdCephOssBufferedFile::Open: fd: " << m_fd << " Buffer created: " << cephbuffer->capacity() ); + bufferAlg = std::unique_ptr(new XrdCephBufferAlgSimple(std::move(cephbuffer),std::move(cephio),m_fd) ); + } catch (const std::bad_alloc &e) { + BUFLOG("XrdCephOssBufferedFile: Bad memory allocation in buffer: " << e.what() ); + } + + return bufferAlg; + } diff --git a/src/XrdCeph/XrdCephOssBufferedFile.hh b/src/XrdCeph/XrdCephOssBufferedFile.hh index 4241dc14d..7371271d8 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.hh +++ b/src/XrdCeph/XrdCephOssBufferedFile.hh @@ -36,7 +36,8 @@ #include #include #include - +#include +#include //------------------------------------------------------------------------------ //! 
Decorator class XrdCephOssBufferedFile designed to wrap XrdCephOssFile @@ -48,7 +49,8 @@ class XrdCephOssBufferedFile : virtual public XrdCephOssFile { // XrdOssDF public: XrdCephOssBufferedFile(XrdCephOss *cephoss,XrdCephOssFile *cephossDF, size_t buffersize, - const std::string& bufferIOmode); + const std::string& bufferIOmode, + size_t maxNumberSimulBuffers); //explicit XrdCephOssBufferedFile(size_t buffersize); virtual ~XrdCephOssBufferedFile(); virtual int Open(const char *path, int flags, mode_t mode, XrdOucEnv &env); @@ -65,9 +67,15 @@ public: virtual int Ftruncate(unsigned long long); protected: + std::unique_ptr createBuffer(); /// create a new instance of the buffer + XrdCephOss *m_cephoss = nullptr; XrdCephOssFile * m_xrdOssDF = nullptr; // holder of the XrdCephOssFile instance std::unique_ptr m_bufferAlg; + std::map > m_bufferReadAlgs; + std::mutex m_buf_mutex; //! any data access method on the buffer will use this + size_t m_maxCountReadBuffers {10}; //! set the maximum of buffers to open on a single instance (e.g. for simultaneous file reads) + int m_maxBufferRetries {5}; //! How many times to retry a ready from a buffer with EBUSY errors int m_maxBufferRetrySleepTime_ms; //! 
number of ms to sleep if a retry is requested diff --git a/src/XrdCeph/XrdCephOssReadVFile.cc b/src/XrdCeph/XrdCephOssReadVFile.cc index 2aa31e2dc..4160a4eaf 100644 --- a/src/XrdCeph/XrdCephOssReadVFile.cc +++ b/src/XrdCeph/XrdCephOssReadVFile.cc @@ -92,21 +92,25 @@ int XrdCephOssReadVFile::Close(long long *retsz) { ssize_t XrdCephOssReadVFile::ReadV(XrdOucIOVec *readV, int rnum) { int fd = m_xrdOssDF->getFileDescriptor(); - LOGCEPH("XrdCephOssReadVFile::ReadV: fd: " << fd << " " << rnum << "\n" ); + LOGCEPH("XrdCephOssReadVFile::ReadV: fd: " << fd << " " << rnum ); - //std::stringstream msg_extents; - //msg_extents << "EXTENTS=["; + std::stringstream msg_extents; + msg_extents << "XrdCephOssReadVFile::Extentslist={\"fd\": " << fd << ", \"EXTENTS\":["; ExtentHolder extents(rnum); for (int i = 0; i < rnum; i++) { extents.push_back(Extent(readV[i].offset, readV[i].size)); - //msg_extents << "(" << readV[i].offset << "," << readV[i].size << ")," ; + msg_extents << "[" << readV[i].offset << "," << readV[i].size << "]," ; } - //msg_extents << "]"; + msg_extents << "]}"; //XrdCephEroute.Say(msg_extents.str().c_str()); msg_extents.clear(); - //LOGCEPH(msg_extents.str()); + if (m_extraLogging) { + // improve this so no wasted calls if logging is disabled + LOGCEPH(msg_extents.str()); + msg_extents.clear(); + } - LOGCEPH("Extents: fd: "<< fd << " " << extents.size() << " " << extents.len() << " " + LOGCEPH("XrdCephOssReadVFile::Extents: fd: "<< fd << " " << extents.size() << " " << extents.len() << " " << extents.begin() << " " << extents.end() << " " << extents.bytesContained() << " " << extents.bytesMissing()); @@ -137,13 +141,13 @@ ssize_t XrdCephOssReadVFile::ReadV(XrdOucIOVec *readV, int rnum) { // read the full extent into the buffer long timed_read_ns{0}; {Timer_ns ts(timed_read_ns); - curCount = m_xrdOssDF->Read(buffer.data(), off, len); + curCount = m_xrdOssDF->Read(buffer.data(), off, len); } // timer scope - ++m_timer_count; - auto l = m_timer_longest.load(); - 
m_timer_longest.store(max(l,timed_read_ns)); // doesn't quite prevent race conditions - m_timer_read_ns.fetch_add(timed_read_ns); - m_timer_size.fetch_add(curCount); + ++m_timer_count; + auto l = m_timer_longest.load(); + m_timer_longest.store(max(l,timed_read_ns)); // doesn't quite prevent race conditions + m_timer_read_ns.fetch_add(timed_read_ns); + m_timer_size.fetch_add(curCount); // check that the correct amount of data was read. // std:: clog << "buf Read " << curCount << std::endl; diff --git a/src/XrdCeph/XrdCephOssReadVFile.hh b/src/XrdCeph/XrdCephOssReadVFile.hh index 22c0717a3..a27a439ac 100644 --- a/src/XrdCeph/XrdCephOssReadVFile.hh +++ b/src/XrdCeph/XrdCephOssReadVFile.hh @@ -76,6 +76,7 @@ public: protected: XrdCephOss *m_cephoss = nullptr; XrdCephOssFile * m_xrdOssDF = nullptr; // holder of the XrdCephOssFile instance + bool m_extraLogging = true; // use verbose logging std::string m_algname = "passthrough"; std::unique_ptr m_readVAdapter; diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index e60a7ba20..a5ac6d100 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -132,6 +132,9 @@ XrdSysMutex g_fd_mutex; /// mutex protecting initialization of ceph clusters XrdSysMutex g_init_mutex; +//JW Counter for number of times a given cluster is resolved. 
+std::map g_idxCntr; + /// Accessor to next ceph pool index /// Note that this is not thread safe, but we do not care /// as we only want a rough load balancing @@ -155,6 +158,8 @@ unsigned int getCephPoolIdxAndIncrease() { nextValue = 0; } g_cephPoolIdx = nextValue; + // JW logging of accesses: + ++g_idxCntr[res]; return res; } @@ -255,6 +260,31 @@ static unsigned int stoui(const std::string &s) { return (unsigned int)res; } +void dumpClusterInfo() { + //JW + // log the current state of the cluster: + // don't want to lock here, so the numbers may not be 100% self-consistent + int n_cluster = g_cluster.size(); + int n_ioCtx = g_ioCtx.size(); + int n_filesOpenForWrite = g_filesOpenForWrite.size(); + int n_fds = g_fds.size(); + int n_stripers = g_radosStripers.size(); + int n_stripers_pool = 0; + for (size_t i = 0; i < g_radosStripers.size(); ++i) { + n_stripers_pool += g_radosStripers.at(i).size(); + } + std::stringstream ss; + ss << "Counts: " << n_cluster << " " << n_ioCtx << " " << n_filesOpenForWrite << " " + << n_fds << " " << n_stripers << " " << n_stripers_pool << " " << n_stripers_pool + << " CountsbyCluster: ["; + for (const auto& el : g_idxCntr) { + ss << el.first << ":" << el.second << ", " ; + } // it + ss<< "], "; + + logwrapper((char*)"dumpClusterInfo : %s", ss.str().c_str()); +} + /// fills the userId of a ceph file struct from a string and an environment /// returns position of first character after the userId @@ -661,19 +691,17 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode struct stat buf; libradosstriper::RadosStriper *striper = getRadosStriper(fr); //Get a handle to the RADOS striper API - if (NULL == striper) { logwrapper((char*)"Cannot create striper"); return -EINVAL; } - + dumpClusterInfo(); // JW enhanced logging + int rc = striper->stat(fr.name, (uint64_t*)&(buf.st_size), &(buf.st_atime)); //Get details about a file bool fileExists = (rc != -ENOENT); //Make clear what condition we are testing - 
logwrapper((char*)"Access Mode: %s flags&O_ACCMODE %d ", pathname, flags); - if ((flags&O_ACCMODE) == O_RDONLY) { // Access mode is READ if (fileExists) { @@ -1290,6 +1318,9 @@ int ceph_posix_truncate(XrdOucEnv* env, const char *pathname, unsigned long long int ceph_posix_unlink(XrdOucEnv* env, const char *pathname) { logwrapper((char*)"ceph_posix_unlink : %s", pathname); + // start the timer + auto timer_start = std::chrono::steady_clock::now(); + // minimal stat : only size and times are filled CephFile file = getCephFile(pathname, env); libradosstriper::RadosStriper *striper = getRadosStriper(file); @@ -1297,7 +1328,15 @@ int ceph_posix_unlink(XrdOucEnv* env, const char *pathname) { return -EINVAL; } int rc = striper->remove(file.name); + auto end = std::chrono::steady_clock::now(); + auto deltime_ms = std::chrono::duration_cast(end - timer_start).count(); + + if (rc == 0) { + logwrapper((char*)"ceph_posix_unlink : %s unlink successful: %d ms", pathname, deltime_ms); + return 0; + } if (rc != -EBUSY) { + logwrapper((char*)"ceph_posix_unlink : %s unlink failed: %d ms; return code %d", pathname, deltime_ms, rc); return rc; } // if EBUSY returned, assume the file is locked; so try to remove the lock @@ -1312,10 +1351,13 @@ int ceph_posix_unlink(XrdOucEnv* env, const char *pathname) { // now try to remove again rc = striper->remove(file.name); + end = std::chrono::steady_clock::now(); + deltime_ms = std::chrono::duration_cast(end - timer_start).count(); + if (rc != 0) { - logwrapper((char*)"ceph_posix_unlink : unlink failed after lock removal %s, %d", pathname, rc); + logwrapper((char*)"ceph_posix_unlink : unlink failed after lock removal %s, %d ms", pathname, deltime_ms); } else { - logwrapper((char*)"ceph_posix_unlink : unlink suceeded after lock removal %s, %d", pathname, rc); + logwrapper((char*)"ceph_posix_unlink : unlink suceeded after lock removal %s, %d ms", pathname, deltime_ms); } return rc; } From 18c6fe64a5784f6854c3e241a92802ce4915981d Mon Sep 17 
00:00:00 2001 From: snafus Date: Mon, 17 Apr 2023 14:42:14 +0100 Subject: [PATCH 10/18] Bug fix for writes with bufferedIO when extending over buffer range. (#40) * Bug fix for writes with bufferedIO when extending over buffer range. - Fix for case where multiple writes to the buffer are needed for a given xrd write request - Previously threw an error; now will correctly perform the multiple writes as required. - Set the Simple Data buffer capacity to the input size, rather than the capacity of the vector, which could be larger. --------- Co-authored-by: James Walder --- src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc | 10 ++++------ src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc | 7 +++++-- src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh | 1 + 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc index d14ccc347..1485ff8f1 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc @@ -285,7 +285,6 @@ ssize_t XrdCephBufferAlgSimple::write (const void *buf, off_t offset, size_t ble size_t bytesRemaining = blen; //!< track how many bytes left to write size_t bytesWritten = 0; - off_t bufferOffset = m_bufferLength; // position to append data in the buffer, i.e. the end of the buffer /** Typically would expect only one loop, i.e. the write request is smaller than the buffer. 
* If bigger, or the request stradles the end of the buffer, will need another loop @@ -306,27 +305,26 @@ ssize_t XrdCephBufferAlgSimple::write (const void *buf, off_t offset, size_t ble // cache is currently empty, so set the 'reference' to the external offset now m_bufferStartingOffset = offset + bytesWritten; } - //add data to the cache from buf, from buf[offsetDelta] to the cache at position bufferOffset + //add data to the cache from buf, from buf[offsetDelta] to the cache at position m_bufferLength // make sure to write only as many bytes as left in the cache. size_t nBytesToWrite = std::min(bytesRemaining, m_bufferdata->capacity()-m_bufferLength); const void* bufAtOffset = (void*)((char*)buf + bytesWritten); // nasty cast as void* doesn't do arithmetic if (nBytesToWrite == 0) { BUFLOG( "Wanting to write 0 bytes; why is that?"); } - rc = m_bufferdata->writeBuffer(bufAtOffset, bufferOffset, nBytesToWrite, 0); + rc = m_bufferdata->writeBuffer(bufAtOffset, m_bufferLength, nBytesToWrite, 0); if (rc < 0) { - BUFLOG( "WriteBuffer step failed: " << rc << " " << bufferOffset << " " << blen << " " << offset ); + BUFLOG( "WriteBuffer step failed: " << rc << " " << m_bufferLength << " " << blen << " " << offset ); return rc; // pass the error condidition upwards } if (rc != (ssize_t)nBytesToWrite) { BUFLOG( "WriteBuffer returned unexpected number of bytes: " << rc << " Expected: " << nBytesToWrite << " " - << bufferOffset << " " << blen << " " << offset ); + << m_bufferLength << " " << blen << " " << offset ); return -EBADE; // is bad exchange error best errno here? 
} // lots of repetition here; #TODO try to reduce m_bufferLength += rc; - bufferOffset += rc; bytesWritten += rc; bytesRemaining -= rc; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc index 457334c2a..fc9ed4431 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc @@ -21,7 +21,7 @@ std::atomic XrdCephBufferDataSimple::m_total_memory_nbuffers {0}; //!< tot XrdCephBufferDataSimple::XrdCephBufferDataSimple(size_t bufCapacity): - m_buffer(bufCapacity,0), m_externalOffset(0),m_bufLength(0) { + m_bufferSize(bufCapacity), m_buffer(bufCapacity,0), m_externalOffset(0),m_bufLength(0) { m_valid = true; // update global statistics @@ -32,6 +32,7 @@ XrdCephBufferDataSimple::XrdCephBufferDataSimple(size_t bufCapacity): XrdCephBufferDataSimple::~XrdCephBufferDataSimple() { m_valid = false; + // obtain the actual capacity here, as this is the real number of bytes to be released auto cap = m_buffer.capacity(); m_buffer.clear(); m_buffer.reserve(0); // just to be paranoid and realse memory immediately @@ -45,7 +46,9 @@ XrdCephBufferDataSimple::~XrdCephBufferDataSimple() { size_t XrdCephBufferDataSimple::capacity() const { - return m_buffer.capacity(); + // return defined buffered size, which might in principle be different + // to the actual size of the buffer allocated in memory + return m_bufferSize; } size_t XrdCephBufferDataSimple::length() const { diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh index ac9b36d10..ca3fe8cc3 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh @@ -45,6 +45,7 @@ class XrdCephBufferDataSimple : public virtual IXrdCephBufferData protected: + size_t m_bufferSize; //! 
the buffer size bool m_valid = false; std::vector m_buffer; // actual physical buffer off_t m_externalOffset = 0; //! what does the first byte of the buffer map to for external offsets From dd5ad6612fdb4d668f2fd372ddd1f584a5432db9 Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:18:38 +0100 Subject: [PATCH 11/18] Buffered io spaceinfo (#39) * test * fix merge conflict * extra bracket * misplaced bracket * StatLS only takes pool name from section of object path before first colon ':' * Tidy reporting of pool name to ignore some exraneous characters * Add XrdSys/XrdSysPlatform.h to get MAXPATHLEN * Bug fix for writes with bufferedIO when extending over buffer range. (#40) (#41) * Bug fix for writes with bufferedIO when extending over buffer range. - Fix for case where multiple writes to the buffer are needed for a given xrd write request - Previously threw an error; now will correctly perform the multiple writes as required. - Set the Simple Data buffer capacity to the input size, rather than the capacity of the vector, which could be larger. 
--------- Co-authored-by: snafus Co-authored-by: James Walder --------- Co-authored-by: Ian Johnson Co-authored-by: snafus Co-authored-by: James Walder --- packaging/make-src-tar.sh | 269 ++++++++++++++++++++++++++++++++++++ src/XrdCeph/XrdCephOss.cc | 255 +++++++++++++++++++++++++++++++--- src/XrdCeph/XrdCephOss.hh | 4 +- src/XrdCeph/XrdCephPosix.cc | 50 ++++++- src/XrdCeph/XrdCephPosix.hh | 1 + 5 files changed, 560 insertions(+), 19 deletions(-) create mode 100755 packaging/make-src-tar.sh diff --git a/packaging/make-src-tar.sh b/packaging/make-src-tar.sh new file mode 100755 index 000000000..71ea4ecf7 --- /dev/null +++ b/packaging/make-src-tar.sh @@ -0,0 +1,269 @@ +#!/bin/bash +#------------------------------------------------------------------------------- +# Create a source RPM package +# Author: Lukasz Janyst (10.03.2011) +#------------------------------------------------------------------------------- + +RCEXP='^[0-9]+\.[0-9]+\.[0-9]+\-rc.*$' +CERNEXP='^[0-9]+\.[0-9]+\.[0-9]+\-[0-9]+\.CERN.*$' + +#------------------------------------------------------------------------------- +# Find a program +#------------------------------------------------------------------------------- +function findProg() +{ + for prog in $@; do + if test -x "`which $prog 2>/dev/null`"; then + echo $prog + break + fi + done +} + +#------------------------------------------------------------------------------- +# Print help +#------------------------------------------------------------------------------- +function printHelp() +{ + echo "Usage:" 1>&2 + echo "${0} [--help] [--source PATH] [--output PATH]" 1>&2 + echo " --help prints this message" 1>&2 + echo " --source PATH specify the root of the source tree" 1>&2 + echo " defaults to ../" 1>&2 + echo " --output PATH the directory where the source rpm" 1>&2 + echo " should be stored, defaulting to ." 
1>&2 + echo " --version VERSION the version provided by user" 1>&2 + echo " --define 'MACRO EXPR'" 1>&2 +} + +#------------------------------------------------------------------------------- +# Parse the commandline, if only we could use getopt... :( +#------------------------------------------------------------------------------- +SOURCEPATH="../" +OUTPUTPATH="." +PRINTHELP=0 +RPM_NAME="xrootd-ceph" + +while test ${#} -ne 0; do + if test x${1} = x--help; then + PRINTHELP=1 + elif test x${1} = x--source; then + if test ${#} -lt 2; then + echo "--source parameter needs an argument" 1>&2 + exit 1 + fi + SOURCEPATH=${2} + shift + elif test x${1} = x--output; then + if test ${#} -lt 2; then + echo "--output parameter needs an argument" 1>&2 + exit 1 + fi + OUTPUTPATH=${2} + shift + elif test x${1} = x--version; then + if test ${#} -lt 2; then + echo "--version parameter needs an argument" 1>&2 + exit 1 + fi + USER_VERSION="--version ${2}" + shift + elif test x${1} = x--define; then + if test ${#} -lt 2; then + echo "--define parameter needs an argument" 1>&2 + exit 1 + fi + USER_DEFINE="$USER_DEFINE --define \""${2}"\"" + shift + elif test x${1} = x--rename; then + if test ${#} -lt 2; then + echo "--rename parameter needs an argument" 1>&2 + exit 1 + fi + cp rhel/xrootd-ceph.spec.in rhel/${2}.spec.in + sed -i "s/xrootd-ceph/${2}/" rhel/${2}.spec.in + cp rhel/${2}.spec.in /root/rpmbuild/SPECS + RPM_NAME="${2}" + shift + fi + shift +done + +if test $PRINTHELP -eq 1; then + printHelp + exit 0 +fi + +echo "[i] Working on: $SOURCEPATH" +echo "[i] Storing the output to: $OUTPUTPATH" + +#------------------------------------------------------------------------------- +# Check if the source and the output dirs +#------------------------------------------------------------------------------- +if test ! -d $SOURCEPATH -o ! -r $SOURCEPATH; then + echo "[!] Source path does not exist or is not readable" 1>&2 + exit 2 +fi + +if test ! -d $OUTPUTPATH -o ! 
-w $OUTPUTPATH; then + echo "[!] Output path does not exist or is not writeable" 1>&2 + exit 2 +fi + +#------------------------------------------------------------------------------- +# Check if we have all the necassary components +#------------------------------------------------------------------------------- +if test x`findProg rpmbuild` = x; then + echo "[!] Unable to find rpmbuild, aborting..." 1>&2 + exit 1 +fi + +if test x`findProg git` = x; then + echo "[!] Unable to find git, aborting..." 1>&2 + exit 1 +fi + +#------------------------------------------------------------------------------- +# Check if the source is a git repository +#------------------------------------------------------------------------------- +if test ! -d $SOURCEPATH/.git; then + echo "[!] I can only work with a git repository" 1>&2 + exit 2 +fi + +#------------------------------------------------------------------------------- +# Check the version number +#------------------------------------------------------------------------------- +if test ! -x $SOURCEPATH/genversion.sh; then + echo "[!] Unable to find the genversion script" 1>&2 + exit 3 +fi + +VERSION=`$SOURCEPATH/genversion.sh --print-only $USER_VERSION $SOURCEPATH 2>/dev/null` +if test $? -ne 0; then + echo "[!] 
Unable to figure out the version number" 1>&2 + exit 4 +fi + +echo "[i] Working with version: $VERSION" + +if test x${VERSION:0:1} = x"v"; then + VERSION=${VERSION:1} +fi + +#------------------------------------------------------------------------------- +# Deal with release candidates +#------------------------------------------------------------------------------- +RELEASE=1 +if test x`echo $VERSION | egrep $RCEXP` != x; then + RELEASE=0.`echo $VERSION | sed 's/.*-rc/rc/'` + VERSION=`echo $VERSION | sed 's/-rc.*//'` +fi + +#------------------------------------------------------------------------------- +# Deal with CERN releases +#------------------------------------------------------------------------------- +if test x`echo $VERSION | egrep $CERNEXP` != x; then + RELEASE=`echo $VERSION | sed 's/.*-//'` + VERSION=`echo $VERSION | sed 's/-.*\.CERN//'` +fi + +#------------------------------------------------------------------------------- +# In case of user version check if the release number has been provided +#------------------------------------------------------------------------------- +if test x"$USER_VERSION" != x; then + TMP=`echo $VERSION | sed 's#.*-##g'` + if test $TMP != $VERSION; then + RELEASE=$TMP + VERSION=`echo $VERSION | sed 's#-[^-]*$##'` + fi +fi + +VERSION=`echo $VERSION | sed 's/-/./g'` +echo "[i] RPM compliant version: $VERSION-$RELEASE" + +#------------------------------------------------------------------------------- +# Create a tempdir and copy the files there +#------------------------------------------------------------------------------- +# exit on any error +set -e + +TEMPDIR=`mktemp -d /tmp/${RPM_NAME}.srpm.XXXXXXXXXX` +RPMSOURCES=$TEMPDIR/rpmbuild/SOURCES +mkdir -p $RPMSOURCES +mkdir -p $TEMPDIR/rpmbuild/SRPMS + +echo "[i] Working in: $TEMPDIR" 1>&2 + +if test -d rhel -a -r rhel; then + for i in rhel/*; do + cp $i $RPMSOURCES + done +fi + +if test -d common -a -r common; then + for i in common/*; do + cp $i $RPMSOURCES + done +fi + 
+#------------------------------------------------------------------------------- +# Generate the spec file +#------------------------------------------------------------------------------- +if test ! -r rhel/${RPM_NAME}.spec.in; then + echo "[!] The specfile template does not exist!" 1>&2 + exit 7 +fi +cat rhel/${RPM_NAME}.spec.in | sed "s/__VERSION__/$VERSION/" | \ + sed "s/__RELEASE__/$RELEASE/" > $TEMPDIR/${RPM_NAME}.spec + +#------------------------------------------------------------------------------- +# Make a tarball of the latest commit on the branch +#------------------------------------------------------------------------------- +# no more exiting on error +set +e + +CWD=$PWD +cd $SOURCEPATH +COMMIT=`git log --pretty=format:"%H" -1` + +if test $? -ne 0; then + echo "[!] Unable to figure out the git commit hash" 1>&2 + exit 5 +fi + +git archive --prefix=${RPM_NAME}/ --format=tar $COMMIT | gzip -9fn > \ + $RPMSOURCES/${RPM_NAME}.tar.gz + +if test $? -ne 0; then + echo "[!] Unable to create the source tarball" 1>&2 + exit 6 +fi +echo "Copying $RPMSOURCES/${RPM_NAME}.tar.gz to ${OUTPUTPATH}" +cp -f $RPMSOURCES/${RPM_NAME}.tar.gz ${OUTPUTPATH} + +cd $CWD + +#------------------------------------------------------------------------------- +# Build the source RPM +#------------------------------------------------------------------------------- +echo "[i] Creating the source RPM..." + +# Dirty, dirty hack! +echo "%_sourcedir $RPMSOURCES" >> $TEMPDIR/rpmmacros +eval "rpmbuild --define \"_topdir $TEMPDIR/rpmbuild\" \ + --define \"%_sourcedir $RPMSOURCES\" \ + --define \"%_srcrpmdir %{_topdir}/SRPMS\" \ + --define \"_source_filedigest_algorithm md5\" \ + --define \"_binary_filedigest_algorithm md5\" \ + ${USER_DEFINE} \ + -bs $TEMPDIR/${RPM_NAME}.spec > $TEMPDIR/log" +if test $? -ne 0; then + echo "[!] RPM creation failed" 1>&2 + exit 8 +fi + +cp $TEMPDIR/rpmbuild/SRPMS/${RPM_NAME}*.src.rpm $OUTPUTPATH +rm -rf $TEMPDIR + +echo "[i] Done." 
diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index b2884d23b..08c36be38 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -25,10 +25,10 @@ #include #include #include - #include "XrdCeph/XrdCephPosix.hh" #include "XrdOuc/XrdOucEnv.hh" #include "XrdSys/XrdSysError.hh" +#include "XrdSys/XrdSysPlatform.hh" #include "XrdOuc/XrdOucTrace.hh" #include "XrdOuc/XrdOucStream.hh" #include "XrdOuc/XrdOucName2Name.hh" @@ -69,6 +69,64 @@ static void logwrapper(char *format, va_list argp) { /// used in XrdCephPosix extern XrdOucName2Name *g_namelib; +/// converts a logical filename to physical one if needed +void m_translateFileName(std::string &physName, std::string logName){ + if (0 != g_namelib) { + char physCName[MAXPATHLEN+1]; + int retc = g_namelib->lfn2pfn(logName.c_str(), physCName, sizeof(physCName)); + if (retc) { + XrdCephEroute.Say(__FUNCTION__, " - failed to translate '", logName.c_str(), "' using namelib plugin, using it as is"); + physName = logName; + } else { + XrdCephEroute.Say(__FUNCTION__, " - translated '", logName.c_str(), "' to '", physCName, "'"); + physName = physCName; + } + } else { + physName = logName; + } +} + +/** + * Get an integer numeric value from an extended attribute attached to an object + * + * @brief Retrieve an integer-value extended attribute. 
+ * @param path the object ID containing the attribute + * @param attrName the name of the attribute to retrieve + * @param maxAttrLen the largest number of characters to handle + * @return value of the attibute, -EINVAL if not valid integer, or -ENOMEM + * + * Implementation: + * Ian Johnson, ian.johnson@stfc.ac.uk, 2022 + * + */ + +ssize_t getNumericAttr(const char* const path, const char* attrName, const int maxAttrLen) +{ + + ssize_t retval; + char *attrValue = (char*)malloc(maxAttrLen+1); + if (NULL == attrValue) { + return -ENOMEM; + } + + ssize_t attrLen = ceph_posix_getxattr((XrdOucEnv*)NULL, path, attrName, attrValue, maxAttrLen); + + if (attrLen <= 0) { + retval = -EINVAL; + } else { + attrValue[attrLen] = (char)NULL; + char *endPointer = (char *)NULL; + retval = strtoll(attrValue, &endPointer, 10); + } + + if (NULL != attrValue) { + free(attrValue); + } + + return retval; + +} + extern "C" { XrdOss* @@ -253,8 +311,16 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { } } - } // while - + if (!strcmp(var, "ceph.reportingpools")) { + var = Config.GetWord(); + if (var) { + m_configPoolnames = var; + } else { + Eroute.Emsg("Config", "Missing value for ceph.reportingpools in config file", configfn); + return 1; + } + } + } // Now check if any errors occured during file i/o int retc = Config.LastError(); if (retc) { @@ -294,27 +360,100 @@ int XrdCephOss::Rename(const char *from, return -ENOTSUP; } +/** + * + * @brief Extract a pool name (string before the first colon ':') from an object ID. 
+ * @param (in) possPool the object ID + * @return pool name or unchanged object ID + * + * Implementation: + * Ian Johnson STFC RAL, ian.johnson@stfc.ac.uk, 2022 + * + */ + +std::string extractPool(std::string possPool) { + + std::string pool; + auto colonPos = possPool.find_first_of(':'); + + if (colonPos > 0) { + pool = possPool.substr(0, colonPos); + } else { + pool = possPool; + } + return pool; +} + + +/** + * + * Populate a struct stat* with information on an object ID. + * Determine whether the request relates to a pool name for disk space reporting via + * StatLS. If not, handle an object path or the notional root element "/" + * + * @brief Return status information for an object ID. + * @param (in) path the object ID + * @param (out) buff receive the status information + * @param (in) opts not used + * @param (in) env not used + * + * Implementation of enhancements: + * Jyothish Thomas STFC RAL, jyothish.thomas@stfc.ac.uk, 2022 + * Ian Johnson STFC RAL, ian.johnson@stfc.ac.uk, 2022, 2023 + * + */ + + int XrdCephOss::Stat(const char* path, struct stat* buff, int opts, XrdOucEnv* env) { - try { - if (!strcmp(path, "/")) { - // special case of a stat made by the locate interface - // we intend to then list all files - memset(buff, 0, sizeof(*buff)); - buff->st_mode = S_IFDIR | 0700; - return 0; - } else { - return ceph_posix_stat(env, path, buff); - } - } catch (std::exception &e) { - XrdCephEroute.Say("stat : invalid syntax in file parameters"); - return -EINVAL; + + XrdCephEroute.Say(__FUNCTION__, " path = ", path); + + std::string spath {path}; + m_translateFileName(spath,path); + + if (spath.back() == '/') { // Request to stat the root + + XrdCephEroute.Say(__FUNCTION__, " - fake a return for stat'ing root element '/'"); + + // special case of a stat made by the locate interface + // we intend to then list all files + + memset(buff, 0, sizeof(*buff)); + buff->st_mode = S_IFDIR|S_IRWXU; + buff->st_dev = 1; + buff->st_ino = 1; + + return XrdOssOK; + + } 
else if (ceph_posix_stat(env, path, buff) == 0) { // Found object ID + +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, " - found object ", spath.c_str(), " via ceph_posix_stat"); +#endif + return XrdOssOK; + + } else { + +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, " - cannot find object '", spath.c_str(), "'"); +#endif + return -ENOENT; + } + } + + + int XrdCephOss::StatFS(const char *path, char *buff, int &blen, XrdOucEnv *eP) { + +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, " path = ", path); +#endif XrdOssVSInfo sP; int rc = StatVS(&sP, 0, 0); if (rc) { @@ -327,6 +466,10 @@ int XrdCephOss::StatFS(const char *path, char *buff, int &blen, XrdOucEnv *eP) { } int XrdCephOss::StatVS(XrdOssVSInfo *sP, const char *sname, int updt) { + +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, " path = ", sname); +#endif int rc = ceph_posix_statfs(&(sP->Total), &(sP->Free)); if (rc) { return rc; @@ -338,6 +481,86 @@ int XrdCephOss::StatVS(XrdOssVSInfo *sP, const char *sname, int updt) { return XrdOssOK; } +int formatStatLSResponse(char *buff, int &blen, const char* cgroup, long long totalSpace, + long long usedSpace, long long freeSpace, long long quota, long long maxFreeChunk) +{ + return snprintf(buff, blen, "oss.cgroup=%s&oss.space=%lld&oss.free=%lld&oss.maxf=%lld&oss.used=%lld&oss.quota=%lld", + cgroup, totalSpace, freeSpace, maxFreeChunk, usedSpace, quota); +} + +/** + * + * Handle a request for the amount of space used in a Ceph pool + * + * @brief Report on disk space use in this pool. 
+ * @param (in) env not used + * @param (in) path name of the pool + * @param (out) buff location for string containing OSS key-value pairs for disk space used, free, etc + * @param (out) blen set to length of buff + * + * Implementation: + * Jyothish Thomas STFC RAL, jyothish.thomas@stfc.ac.uk, 2022 + * Ian Johnson STFC RAL, ian.johnson@stfc.ac.uk, 2022, 2023 + * + */ + + +int XrdCephOss::StatLS(XrdOucEnv &env, const char *charPath, char *buff, int &blen) +{ + XrdCephEroute.Say(__FUNCTION__, " incoming path = ", charPath); + + std::string path({charPath}); + path = extractPool(path); + std::string spath {path}; + + m_translateFileName(spath,path); + +// +// Following test is now redundant as we take the substring up to colonPos +// + if (spath.back() == ':') { + spath.pop_back(); + } + if (m_configPoolnames.find(spath) == std::string::npos) { + XrdCephEroute.Say("Can't report on ", spath.c_str()); + return -EINVAL; + } + + long long usedSpace, totalSpace, freeSpace; + + if (ceph_posix_stat_pool(spath.c_str(), &usedSpace) != 0) { + XrdCephEroute.Say("Failed to get used space in pool ", spath.c_str()); + return -EINVAL; + } + + // Construct the object path + std::string spaceInfoPath = spath + ":" + (const char *)"__spaceinfo__"; + totalSpace = getNumericAttr(spaceInfoPath.c_str(), "total_space", 24); + if (totalSpace < 0) { + XrdCephEroute.Say("Could not get 'total_space' attribute from ", spaceInfoPath.c_str()); + return -EINVAL; + } + +// +// Figure for 'usedSpace' already accounts for Erasure Coding overhead +// + + + freeSpace = totalSpace - usedSpace; + blen = formatStatLSResponse(buff, blen, + /* charPath */ spath.c_str(), /* "oss.cgroup" */ + totalSpace, /* "oss.space" */ + usedSpace, /* "oss.used" */ + freeSpace, /* "oss.free" */ + totalSpace, /* "oss.quota" */ + freeSpace /* "oss.maxf" */); +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, "space info = \n", buff); +#endif + return XrdOssOK; + +} + int XrdCephOss::Truncate (const char* path, unsigned 
long long size, XrdOucEnv* env) { diff --git a/src/XrdCeph/XrdCephOss.hh b/src/XrdCeph/XrdCephOss.hh index 2749a3ece..0e7312af5 100644 --- a/src/XrdCeph/XrdCephOss.hh +++ b/src/XrdCeph/XrdCephOss.hh @@ -65,6 +65,7 @@ public: virtual int Rename(const char *, const char *, XrdOucEnv *eP1=0, XrdOucEnv *eP2=0); virtual int Stat(const char *, struct stat *, int opts=0, XrdOucEnv *eP=0); virtual int StatFS(const char *path, char *buff, int &blen, XrdOucEnv *eP=0); + virtual int StatLS(XrdOucEnv &env, const char *path, char *buff, int &blen); virtual int StatVS(XrdOssVSInfo *sP, const char *sname=0, int updt=0); virtual int Truncate(const char *, unsigned long long, XrdOucEnv *eP=0); virtual int Unlink(const char *path, int Opts=0, XrdOucEnv *eP=0); @@ -72,13 +73,14 @@ public: virtual XrdOssDF *newFile(const char *tident); private: + bool m_configBufferEnable=false; //! config option for buffering size_t m_configBufferSize=16*1024*1024L; //! Buffer size std::string m_configBufferIOmode = "aio"; bool m_configReadVEnable=false; //! enable readV decorator std::string m_configReadVAlgName="passthrough"; // readV algorithm type size_t m_configMaxSimulBufferCount=10; //! max number of buffers in a single Oss instance (.e.g simul. 
reads) - + std::string m_configPoolnames; }; #endif /* __CEPH_OSS_HH__ */ diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index a5ac6d100..c70f338e8 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -44,10 +44,12 @@ #include #include #include + #include "XrdSfs/XrdSfsAio.hh" #include "XrdSys/XrdSysPthread.hh" #include "XrdOuc/XrdOucName2Name.hh" #include "XrdSys/XrdSysPlatform.hh" +#include #include "XrdCeph/XrdCephPosix.hh" @@ -1036,7 +1038,7 @@ ssize_t ceph_aio_read(int fd, XrdSfsAio *aiop, AioCB *cb) { int ceph_posix_fstat(int fd, struct stat *buf) { CephFileRef* fr = getFileRef(fd); if (fr) { - logwrapper((char*)"ceph_stat: fd %d", fd); + logwrapper((char*)__FUNCTION__,": fd %d", fd); // minimal stat : only size and times are filled // atime, mtime and ctime are set all to the same value // mode is set arbitrarily to 0666 | S_IFREG @@ -1062,7 +1064,7 @@ int ceph_posix_fstat(int fd, struct stat *buf) { } int ceph_posix_stat(XrdOucEnv* env, const char *pathname, struct stat *buf) { - logwrapper((char*)"ceph_stat: %s", pathname); + logwrapper((char*)__FUNCTION__, pathname); // minimal stat : only size and times are filled // atime, mtime and ctime are set all to the same value // mode is set arbitrarily to 0666 | S_IFREG @@ -1291,6 +1293,50 @@ int ceph_posix_statfs(long long *totalSpace, long long *freeSpace) { return rc; } +/** + * + * @brief Return the amount of space used in a pool. 
+ * @details This function - + * Obtains the statistics that librados holds on a pool + * Calculates the number of bytes allocated to the pool + * @params + * poolName: (in) the name of the pool to query + * usedSpace: (out) the number of bytes used in the pool + * @return + * success or failure status + * + * Implementation: + * Jyothish Thomas STFC RAL, jyothish.thomas@stfc.ac.uk, 2022 + * Ian Johnson STFC RAL, ian.johnson@stfc.ac.uk, 2022, 2023 + * + */ + +int ceph_posix_stat_pool(char const *poolName, long long *usedSpace) { + + logwrapper((char*)__FUNCTION__, poolName); + // get the poolIdx to use + int cephPoolIdx = getCephPoolIdxAndIncrease(); + librados::Rados* cluster = checkAndCreateCluster(cephPoolIdx); + if (0 == cluster) { + return -EINVAL; + } + + std::list poolNames({poolName}); + std::map stat; + + if (cluster->get_pool_stats(poolNames, stat) < 0) { + + logwrapper((char*)"Unable to get_pool_stats for pool ", poolName); + return -EINVAL; + + } else { + + *usedSpace = stat[poolName].num_kb * 1024; + return XrdOssOK; + + } +} + static int ceph_posix_internal_truncate(const CephFile &file, unsigned long long size) { libradosstriper::RadosStriper *striper = getRadosStriper(file); if (0 == striper) { diff --git a/src/XrdCeph/XrdCephPosix.hh b/src/XrdCeph/XrdCephPosix.hh index e34596a6c..d24efe6aa 100644 --- a/src/XrdCeph/XrdCephPosix.hh +++ b/src/XrdCeph/XrdCephPosix.hh @@ -79,6 +79,7 @@ int ceph_posix_listxattrs(XrdOucEnv* env, const char* path, XrdSysXAttr::AList * int ceph_posix_flistxattrs(int fd, XrdSysXAttr::AList **aPL, int getSz); void ceph_posix_freexattrlist(XrdSysXAttr::AList *aPL); int ceph_posix_statfs(long long *totalSpace, long long *freeSpace); +int ceph_posix_stat_pool(char const *poolName, long long *usedSpace); int ceph_posix_truncate(XrdOucEnv* env, const char *pathname, unsigned long long size); int ceph_posix_ftruncate(int fd, unsigned long long size); int ceph_posix_unlink(XrdOucEnv* env, const char *pathname); From 
6a168c6da36e98bf9f1c63bcacddca3e15643c4b Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Sat, 6 May 2023 21:07:21 +0100 Subject: [PATCH 12/18] Vector Read merge (#42) * variable rpm name (#17) * variable rpm name * Update xrootd-ceph.spec.in * Update makesrpm.sh * Update makesrpm.sh * Master cephnamelib (#16) * Allow ceph.namelib to take params and apply translation to full path * Reduce logging Remove extraneous logging messages * simplify parsing of namelib and added a log line for any remapped file Co-authored-by: James * XRD-22 Fix ensuring the correct filename is passed to the CephFile instance. (#24) A regression in previous commit meant that the filename was not correctly passed to the CephFile instance. This fix ensures that the filename is set correctly. Co-authored-by: james * XRD-12 Add timestamp information for ceph logging methods Update the logwrapper method to print out the current timestamp in the initial section of output. * re-introduce variable names to spec input (#27) * Return permission denied on write attempt on existing file with EXCL set (#31) Co-authored-by: James Walder * disable posc (#30) posc is disabled for proxies, but not for a unified setup. XrdCeph does not support the posc flag as it misinterprets objects as folders * Disk space reporting (#36) * Provide XrdCephOss::StatLS and ceph_posix_stat_pool to enable disk space reporting. Responds to the 'xrdfs query space' command as requested by ALICE VO * Remove ts() timestamp function and unnecessary #defines * Read ceph.poolnames setting from XRootD config to specify reportable pools. 
* Support 'xrdfs spaceinfo' via Stat() method returning XrdOssOK for stat'ing 'pool:' * Tidy up tracing of Stat* calls * Remove unwanted method isPathReportablePool * Add comments for need to support stat-ing '/' * Return -ENOMEM if malloc fails * Return -ENOMEM if malloc fails * Rename disk space reporting config item to ceph.reportingpools and log if the list of names is not present. Report if ceph_posix_stat_pool call to get the amount of used space fails * Sanitize incoming pool name and allow for MonALISA format * Optional tracing of Stat* incoming paths and response. Remove double logging of ceph.reportingpools. * Check that sanitized pool name is not marked invalid * Use ceph namelib translation at Oss level by copying translateFileName logic from Posix level. More error checking if stat can't find pool name. * Remove superfluous comments * Ensure tracing of path arguments to Stat() and StatLS(). Add Doxygen-style comments to changed methods * Make source tarball only as minimum output * Add make-src-tar.sh to additionally place required source tarball in '--output' destination * Change back usedSpace to totalSpace in ceph_posix_statfs * feat: improve (vector) read implementation (#37) Try to avoid usage of libradosstriper for readv operations since it may impact performance significantly. To do so we explicitly determine the objects that constitute a file and read from them using rados only. Reads are async. To do these async reads conveniently we introduce a class for handling multiple async read requests. * Initial implementation of ReadV at the XrdOss level * Correct the signature of ReadV to XrdCephOssFile * feat: do not use libradosstriper for readv operation * feat: use atomic operations for readv requests This should be the most efficient way of handling multiple read ops.
* feat: use nonstriper reads for pread requests * feat: use nonstriper reads for read operations also To do so we do complete refactoring: bulkAioRead class moved to a separate file, and its features extended. Namely, it can do reads from files, not only objects, now. * feat: print warning message if waiting for aio reads from ceph takes long This is useful for debugging the reasons of failures for read(v) requests. * Added some comments * fix: use size_t for start_block We can use "%zx" in sprintf, so let's unify the types of variables in the function. This will also allow us to extend limitations on the file size. * feat: refactor BulkAioRead::read method, suggested during review 1. Rename end_block to last_block 2. Move variable definitions closer to their usage 3. Use 'std::min' instead of 'if' for chunk_len determination 4. Use more efficient chunk_start calculation * feat: add options to allow one to switch to standard read mechanisms This may be useful for testing. * feat: rename block_size to object_size in BulkAioRead New name better describes reality, since we are talking about the size of ceph objects. * feat: rename wait_for_complete to submit_and_wait_for_complete New name describes this function better. * feat: use more meaningful names for variables that loop over operations map op_data should describe the contents of the variables better. * feat: move type definitions into the class * feat: added comments with method's description * feat: remove unnecessary semicolons * feat: convert wait_for_complete method from void to int This allows one to improve several things. Here we change key to the operations and use object number instead of its full name. * fix: fixed comment * fix: fixed comments * feat: refactor bulkAioRead class Pointers were dropped from objectReadOperation and ceph_bufferlist objects. The objects are moved to appropriate classes to simplify memory management and usage.
* feat: take into account completion's return value We can retrieve return code from completion and get meaningful status of the whole operation with this value. * feat: allow reading of sparse file Since we do not really expect sparse files, we use a fallback mechanism: if a read(v) failed with -ENOENT exit status, then just resubmit it using striper-based functions. * lint: remove trailing whitespaces * feat: use meaningful names for read(v) functions The name now indicates whether read(v)s are striper or non-striper ones. * feat: fallback to striper-based read if number of stripes > 1 Just in case, such files should not be present in our production setup * feat: allow zero-sized reads In principle, this is a correct request, so we should support it. * fix: make sure we do not delete completion objects until submitted operation is completed This is done to prevent some nasty side-effects, e.g. writing to a deleted buffer. * fix: remove move constructor from bulkAioRead We do not use it. * fix: handle failure to allocate completion Completion allocation can fail, we should take that into account. * feat: use file reference to construct readOp objects There is no need to extract (and then copy) file name and object size from file reference to construct read object, we can use file reference directly. * feat: replace conversion operator with explicit method Implicit conversion was making code less readable. * feat: remove call to is_complete() in completion wrapper destructor There is no need to check for completion, we can call wait_for_complete multiple times. * feat: put warning threshold to config file It is better to have this value as configurable instead of hardcoded. * fix: initialize return code variable in ReadOpData * Added comment * feat: add comment for future optimization. We should use `aio_cancel` to cancel all pending read operations in future.
* fix: remove vim's swp file Commited by accident * feat: improve logging Add file descriptor to sparse file's logging, fix typos. * fix: minor fixes Remove unnecessary include, move variable declaration closer to the usage, fix spelling in the comment. * feat: BulkAioRead::read method refactoring Refactoring was made to increase (hopefully) readability. * fix: better wording for comment * feat: BulkAioRead::read -- change loop exit condition We can exit when `to_read == 0`. This allow us to drop `end_block` variable. * fix: add call to `clear` after getting results This is to allow clients to use the same readOp object for future operations. --------- Co-authored-by: Ian Johnson Co-authored-by: Alexander Rogovskiy * duplicate struct definition * move struct definition to headers * use bufferedIO version of path * remove MAXPATHLEN redefinition --------- Co-authored-by: snafus Co-authored-by: James Co-authored-by: root Co-authored-by: Ian Johnson Co-authored-by: alex-rg Co-authored-by: Alexander Rogovskiy --- src/XrdCeph.cmake | 3 +- src/XrdCeph/XrdCephBulkAioRead.cc | 198 ++++++++++++++++++++++++++++++ src/XrdCeph/XrdCephBulkAioRead.hh | 93 ++++++++++++++ src/XrdCeph/XrdCephOss.cc | 72 ++++++++++- src/XrdCeph/XrdCephOss.hh | 6 +- src/XrdCeph/XrdCephOssFile.cc | 37 +++++- src/XrdCeph/XrdCephOssFile.hh | 1 + src/XrdCeph/XrdCephPosix.cc | 177 +++++++++++++++++++++----- src/XrdCeph/XrdCephPosix.hh | 37 ++++++ 9 files changed, 587 insertions(+), 37 deletions(-) create mode 100644 src/XrdCeph/XrdCephBulkAioRead.cc create mode 100644 src/XrdCeph/XrdCephBulkAioRead.hh diff --git a/src/XrdCeph.cmake b/src/XrdCeph.cmake index 33843544d..6a0c64979 100644 --- a/src/XrdCeph.cmake +++ b/src/XrdCeph.cmake @@ -45,7 +45,8 @@ add_library( MODULE XrdCeph/XrdCephOss.cc XrdCeph/XrdCephOss.hh XrdCeph/XrdCephOssFile.cc XrdCeph/XrdCephOssFile.hh - XrdCeph/XrdCephOssDir.cc XrdCeph/XrdCephOssDir.hh + XrdCeph/XrdCephOssDir.cc XrdCeph/XrdCephOssDir.hh + XrdCeph/XrdCephBulkAioRead.cc 
XrdCeph/XrdCephBulkAioRead.hh XrdCeph/XrdCephOssBufferedFile.cc XrdCeph/XrdCephOssBufferedFile.hh XrdCeph/XrdCephOssReadVFile.cc XrdCeph/XrdCephOssReadVFile.hh XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.cc XrdCeph/XrdCephBuffers/XrdCephBufferDataSimple.hh diff --git a/src/XrdCeph/XrdCephBulkAioRead.cc b/src/XrdCeph/XrdCephBulkAioRead.cc new file mode 100644 index 000000000..af064bfc2 --- /dev/null +++ b/src/XrdCeph/XrdCephBulkAioRead.cc @@ -0,0 +1,198 @@ +#include "XrdCephBulkAioRead.hh" + + +bulkAioRead::bulkAioRead(librados::IoCtx* ct, logfunc_pointer logwrapper, CephFileRef* fileref) { + /** + * Constructor. + * + * @param ct Rados IoContext object + * @param logfunc_pointer Pointer to the function that will be used for logging + * @param fileref Ceph file reference + * + */ + context = ct; + file_ref = fileref; + log_func = logwrapper; +} + +bulkAioRead::~bulkAioRead() { + /** + * Destructor. Just clears dynamically allocated memroy. + */ + clear(); +} + +void bulkAioRead::clear() { + /** + * Clear all dynamically alocated memory + */ + operations.clear(); + buffers.clear(); +} + +int bulkAioRead::addRequest(size_t obj_idx, char* out_buf, size_t size, off64_t offset) { + /** + * Prepare read request for a single ceph object. Private method. + * + * Method will allocate all (well, almost, except the string for the object name) + * necessary objects to submit read request to ceph. To submit the requests use + * `submit_and_wait_for_complete` method. + * + * @param obj_idx number of the object (starting from zero) to read + * @param out_buf output buffer, where read results should be stored + * @param size number of bytes to read + * @param offset offset in bytes where the read should start. Note that the offset is local to the + * ceph object. I.e. if offset is 0 and object number is 1, yo'll be reading from the + * start of the second object, not from the begining of the file. 
+ * + * @return zero on success, negative error code on failure + */ + + try{ + auto &op_data = operations[obj_idx]; + //When we start using C++17, the next two lines can be merged + buffers.emplace_back(out_buf); + auto &buf = buffers.back(); + op_data.ceph_read_op.read(offset, size, &buf.bl, &buf.rc); + } catch (std::bad_alloc&) { + log_func((char*)"Memory allocation failed while reading file %s", file_ref->name.c_str()); + return -ENOMEM; + } + return 0; +} + +int bulkAioRead::submit_and_wait_for_complete() { + /** + * Submit previously prepared read requests and wait for their completion + * + * To prepare read requests use `read` or `addRequest` methods. + * + * @return zero on success, negative error code on failure + * + */ + + for (auto &op_data: operations) { + size_t obj_idx = op_data.first; + //16 bytes for object hex number, 1 for dot and 1 for null-terminator + char object_suffix[18]; + int sp_bytes_written; + sp_bytes_written = snprintf(object_suffix, sizeof(object_suffix), ".%016zx", obj_idx); + if (sp_bytes_written >= (int) sizeof(object_suffix)) { + log_func((char*)"Can not fit object suffix into buffer for file %s -- too big\n", file_ref->name.c_str()); + return -EFBIG; + } + + std::string obj_name; + try { + obj_name = file_ref->name + std::string(object_suffix); + } catch (std::bad_alloc&) { + log_func((char*)"Can not create object string for file %s)", file_ref->name.c_str()); + return -ENOMEM; + } + context->aio_operate(obj_name, op_data.second.cmpl.use(), &op_data.second.ceph_read_op, 0); + } + + for (auto &op_data: operations) { + op_data.second.cmpl.wait_for_complete(); + int rval = op_data.second.cmpl.get_return_value(); + /* + * Optimization is possible here: cancel all remaining read operations after the failure. + * One way to do so is the following: add context as an argument to the `use` method of CmplPtr. 
+ * Then inside the class this pointer can be saved and used by the destructor to call + * `aio_cancel` (and probably `wait_for_complete`) before releasing the completion. + * Though one need to clarify whether it is necessary to cal `wait_for_complete` after + * `aio_cancel` (i.e. may the status variable/bufferlist still be written to or not). + */ + if (rval < 0) { + log_func((char*)"Read of the object %ld for file %s failed", op_data.first, file_ref->name.c_str()); + return rval; + } + } + return 0; +} + +ssize_t bulkAioRead::get_results() { + /** + * Copy the results of executed read requests from ceph's bufferlists to client's buffers + * + * Note that this method should be called only after the submission and completion of read + * requests, i.e. after `submit_and_wait_for_complete` method. + * + * @return cumulative number of bytes read (by all read operations) on success, negative + * error code on failure + * + */ + + ssize_t res = 0; + for (ReadOpData &op_data: buffers) { + if (op_data.rc < 0) { + //Is it possible to get here? + log_func((char*)"One of the reads failed with rc %d", op_data.rc); + return op_data.rc; + } + op_data.bl.begin().copy(op_data.bl.length(), op_data.out_buf); + res += op_data.bl.length(); + } + //We should clear used completions to allow new operations + clear(); + return res; +} + +int bulkAioRead::read(void* out_buf, size_t req_size, off64_t offset) { + /** + * Declare a read operation for file. + * + * Read coordinates are global, i.e. valid offsets are from 0 to the -1, valid request sizes + * are from 0 to INF. Method can be called multiple times to declare multiple read + * operations on the same file. + * + * @param out_buf output buffer, where read results should be stored + * @param req_size number of bytes to read + * @param offset offset in bytes where the read should start. Note that the offset is global, + * i.e. 
refers to the whole file, not individual ceph objects + * + * @return zero on success, negative error code on failure + * + */ + + if (req_size == 0) { + log_func((char*)"Zero-length read request for file %s, probably client error", file_ref->name.c_str()); + return 0; + } + + char* const buf_start_ptr = (char*) out_buf; + + size_t object_size = file_ref->objectSize; + //The amount of bytes that is yet to be read + size_t to_read = req_size; + //block means ceph object here + size_t start_block = offset / object_size; + size_t buf_pos = 0; + size_t chunk_start = offset % object_size; + + while (to_read > 0) { + size_t chunk_len = std::min(to_read, object_size - chunk_start); + + if (buf_pos >= req_size) { + log_func((char*)"Internal bug! Attempt to read %lu data for block (%lu, %lu) of file %s\n", buf_pos, offset, req_size, file_ref->name.c_str()); + return -EINVAL; + } + + int rc = addRequest(start_block, buf_start_ptr + buf_pos, chunk_len, chunk_start); + if (rc < 0) { + log_func((char*)"Unable to submit async read request, rc=%d\n", rc); + return rc; + } + + buf_pos += chunk_len; + + start_block++; + chunk_start = 0; + if (chunk_len > to_read) { + log_func((char*)"Internal bug! Read %lu bytes, more than expected %lu bytes for block (%lu, %lu) of file %s\n", chunk_len, to_read, offset, req_size, file_ref->name.c_str()); + return -EINVAL; + } + to_read = to_read - chunk_len; + } + return 0; +} diff --git a/src/XrdCeph/XrdCephBulkAioRead.hh b/src/XrdCeph/XrdCephBulkAioRead.hh new file mode 100644 index 000000000..2c5c9b183 --- /dev/null +++ b/src/XrdCeph/XrdCephBulkAioRead.hh @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include + +#include "XrdCephPosix.hh" + + +class bulkAioRead { + //typedef std::tuple ReadOpData; + typedef void (*logfunc_pointer) (char *, ...); + + /** + * Class is used to execute read operations against rados striper files *without* usage of rados striper. + * Reads are based on ceph read operations. 
+ * + * The interface is similar to the one that ceph's read operation objects has: + * 1. Instantiate the object. + * 2. Declare read operations using 'read' method, providing the output buffers, offset and length. + * 3. Submitn operation and wait for results using 'submit_and_wait_for_complete' method. + * 4. Copy results to buffers with 'get_results' method. + * + * WARNING: there is no copy/move constructor in the class, so do not use temporary objects for initialization + * (i.e. something like `bulkAioRead rop = bulkAioRead(...);` will not work, use `bulkAioRead rop(...);` instead). + */ + public: + bulkAioRead(librados::IoCtx* ct, logfunc_pointer ptr, CephFileRef* fileref); + ~bulkAioRead(); + + void clear(); + int submit_and_wait_for_complete(); + ssize_t get_results(); + int read(void *out_buf, size_t size, off64_t offset); + + private: + //Completion pointer + class CmplPtr { + librados::AioCompletion *ptr; + bool used = false; + public: + CmplPtr() { + ptr = librados::Rados::aio_create_completion(); + if (NULL == ptr) { + throw std::bad_alloc(); + } + } + ~CmplPtr() { + if (used) { + this->wait_for_complete(); + } + ptr->release(); + } + void wait_for_complete() { + ptr->wait_for_complete(); + } + int get_return_value() { + return ptr->get_return_value(); + } + librados::AioCompletion* use() { + //If the object was converted to AioCompletion, we suppose it was passed to + //the read operation, and therefore set the flag. 
+ used = true; + return ptr; + } + }; + + //Ceph read operation + completion + struct CephOpData { + librados::ObjectReadOperation ceph_read_op; + CmplPtr cmpl; + }; + + //Data for an individual read -- ceph's buffer, client's buffer and return code + struct ReadOpData { + ceph::bufferlist bl; + char* out_buf; + int rc; + ReadOpData(char* output_buf): out_buf(output_buf), rc(-1) {}; + }; + + + + int addRequest(size_t obj_idx, char *out_buf, size_t size, off64_t offset); + librados::IoCtx* context; + std::list buffers; + + //map { : } + std::map operations; + + logfunc_pointer log_func; + CephFileRef* file_ref; +}; diff --git a/src/XrdCeph/XrdCephOss.cc b/src/XrdCeph/XrdCephOss.cc index 08c36be38..429f911a0 100644 --- a/src/XrdCeph/XrdCephOss.cc +++ b/src/XrdCeph/XrdCephOss.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include "XrdCeph/XrdCephPosix.hh" #include "XrdOuc/XrdOucEnv.hh" #include "XrdSys/XrdSysError.hh" @@ -69,6 +70,7 @@ static void logwrapper(char *format, va_list argp) { /// used in XrdCephPosix extern XrdOucName2Name *g_namelib; + /// converts a logical filename to physical one if needed void m_translateFileName(std::string &physName, std::string logName){ if (0 != g_namelib) { @@ -162,6 +164,8 @@ XrdCephOss::~XrdCephOss() { // declared and used in XrdCephPosix.cc extern unsigned int g_maxCephPoolIdx; +extern unsigned int g_cephAioWaitThresh; + int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { int NoGo = 0; XrdOucEnv myEnv; @@ -216,6 +220,47 @@ int XrdCephOss::Configure(const char *configfn, XrdSysError &Eroute) { return 1; } } + + int pread_flag_set = !strncmp(var, "ceph.usedefaultpreadalg", 24); + int readv_flag_set = !strncmp(var, "ceph.usedefaultreadvalg", 24); + if (pread_flag_set or readv_flag_set) { + var = Config.GetWord(); + if (var) { + char* endptr; + long value = strtol(var, &endptr, 10); + if ((value == 0 || value == 1) && (var != endptr)) { + if (pread_flag_set) { + m_useDefaultPreadAlg = value; + } else 
if(readv_flag_set) { + m_useDefaultReadvAlg = value; + } else { + Eroute.Emsg("Config", "Bug encountered during parsing", var); + } + } else { + Eroute.Emsg("Config", "Invalid value for ceph.usedefault* in config file -- must be 0 or 1, got", var); + return 1; + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.usedefault* in config file"); + return 1; + } + } + + if (!strncmp(var, "ceph.aiowaitthresh", 19)) { + var = Config.GetWord(); + if (var) { + unsigned long value = strtoul(var, 0, 10); + if ((value > 0) && (value < INT_MAX)){ + g_cephAioWaitThresh = value; + } else { + Eroute.Emsg("Config", "Invalid value for ceph.aiowaitthresh:", var); + } + } else { + Eroute.Emsg("Config", "Missing value for ceph.aiowaitthresh in config file"); + return 1; + } + } + if (!strncmp(var, "ceph.usebuffer", 14)) { // allowable values: 0, 1 var = Config.GetWord(); if (var) { @@ -362,6 +407,7 @@ int XrdCephOss::Rename(const char *from, /** * + * @brief Extract a pool name (string before the first colon ':') from an object ID. 
* @param (in) possPool the object ID * @return pool name or unchanged object ID @@ -415,8 +461,10 @@ int XrdCephOss::Stat(const char* path, m_translateFileName(spath,path); if (spath.back() == '/') { // Request to stat the root - + +#ifdef STAT_TRACE XrdCephEroute.Say(__FUNCTION__, " - fake a return for stat'ing root element '/'"); +#endif // special case of a stat made by the locate interface // we intend to then list all files @@ -428,6 +476,24 @@ int XrdCephOss::Stat(const char* path, return XrdOssOK; + } + if (spath.find_first_of(":") == spath.length()-1) { // Request to stat just the pool name + +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, "Found request to stat pool name"); +#endif + + spath.pop_back(); // remove colon from pool name + if (m_configPoolnames.find(spath) != std::string::npos) { // Support 'locate' for spaceinfo +#ifdef STAT_TRACE + XrdCephEroute.Say(__FUNCTION__, " - preparing spaceinfo report for '", path, "'"); +#endif + return XrdOssOK; // Only requires a status code, do not need to fill contents in struct stat + } else { + XrdCephEroute.Say(__FUNCTION__, " - cannot find pool '", path, "' in ceph.reportingpools"); + return -EINVAL; + } + } else if (ceph_posix_stat(env, path, buff) == 0) { // Found object ID #ifdef STAT_TRACE @@ -448,7 +514,6 @@ int XrdCephOss::Stat(const char* path, - int XrdCephOss::StatFS(const char *path, char *buff, int &blen, XrdOucEnv *eP) { #ifdef STAT_TRACE @@ -505,6 +570,7 @@ int formatStatLSResponse(char *buff, int &blen, const char* cgroup, long long to */ + int XrdCephOss::StatLS(XrdOucEnv &env, const char *charPath, char *buff, int &blen) { XrdCephEroute.Say(__FUNCTION__, " incoming path = ", charPath); @@ -548,7 +614,7 @@ int XrdCephOss::StatLS(XrdOucEnv &env, const char *charPath, char *buff, int &bl freeSpace = totalSpace - usedSpace; blen = formatStatLSResponse(buff, blen, - /* charPath */ spath.c_str(), /* "oss.cgroup" */ + spath.c_str(), /* "oss.cgroup" */ totalSpace, /* "oss.space" */ usedSpace, 
/* "oss.used" */ freeSpace, /* "oss.free" */ diff --git a/src/XrdCeph/XrdCephOss.hh b/src/XrdCeph/XrdCephOss.hh index 0e7312af5..a54040cb2 100644 --- a/src/XrdCeph/XrdCephOss.hh +++ b/src/XrdCeph/XrdCephOss.hh @@ -72,8 +72,12 @@ public: virtual XrdOssDF *newDir(const char *tident); virtual XrdOssDF *newFile(const char *tident); - private: + //If set to 1, striper-based algorithm is used for pread + int m_useDefaultPreadAlg = 0; + //If set to 1, striper-based algorithm is used for readv + int m_useDefaultReadvAlg = 0; + private: bool m_configBufferEnable=false; //! config option for buffering size_t m_configBufferSize=16*1024*1024L; //! Buffer size std::string m_configBufferIOmode = "aio"; diff --git a/src/XrdCeph/XrdCephOssFile.cc b/src/XrdCeph/XrdCephOssFile.cc index 3a0a63f47..b1fa57136 100644 --- a/src/XrdCeph/XrdCephOssFile.cc +++ b/src/XrdCeph/XrdCephOssFile.cc @@ -59,7 +59,22 @@ ssize_t XrdCephOssFile::Read(off_t offset, size_t blen) { } ssize_t XrdCephOssFile::Read(void *buff, off_t offset, size_t blen) { - return ceph_posix_pread(m_fd, buff, blen, offset); + ssize_t retval; + if (m_cephOss->m_useDefaultPreadAlg) { + retval = ceph_posix_pread(m_fd, buff, blen, offset); + } else { + retval = ceph_posix_nonstriper_pread(m_fd, buff, blen, offset); + if (-ENOENT == retval || -ENOTSUP == retval) { + //This might be a sparse file or nbstripes > 1, so let's try striper read + retval = ceph_posix_pread(m_fd, buff, blen, offset); + if (retval >= 0) { + char err_str[100]; //99 symbols should be enough for the short message + snprintf(err_str, 100, "WARNING! 
The file (fd %d) seem to be sparse, this is not expected", m_fd); + XrdCephEroute.Say(err_str); + } + } + } + return retval; } static void aioReadCallback(XrdSfsAio *aiop, size_t rc) { @@ -75,6 +90,26 @@ ssize_t XrdCephOssFile::ReadRaw(void *buff, off_t offset, size_t blen) { return Read(buff, offset, blen); } +ssize_t XrdCephOssFile::ReadV(XrdOucIOVec *readV, int n) { + ssize_t retval; + if (m_cephOss->m_useDefaultReadvAlg) { + retval = ceph_striper_readv(m_fd, readV, n); + } else { + retval = ceph_nonstriper_readv(m_fd, readV, n); + if (-ENOENT == retval || -ENOTSUP == retval) { + //This might be a sparse file or nbstripes > 1, so let's try striper read + retval = ceph_striper_readv(m_fd, readV, n); + if (retval >= 0) { + char err_str[100]; //99 symbols should be enough for the short message + snprintf(err_str, 100, "WARNING! The file (fd %d) seem to be sparse, this is not expected", m_fd); + XrdCephEroute.Say(err_str); + } + } + } + return retval; +} + + int XrdCephOssFile::Fstat(struct stat *buff) { return ceph_posix_fstat(m_fd, buff); } diff --git a/src/XrdCeph/XrdCephOssFile.hh b/src/XrdCeph/XrdCephOssFile.hh index ecdf668a2..999cfcfd0 100644 --- a/src/XrdCeph/XrdCephOssFile.hh +++ b/src/XrdCeph/XrdCephOssFile.hh @@ -60,6 +60,7 @@ public: virtual ssize_t Read(off_t offset, size_t blen); virtual ssize_t Read(void *buff, off_t offset, size_t blen); virtual int Read(XrdSfsAio *aoip); + virtual ssize_t ReadV(XrdOucIOVec *readV, int n); virtual ssize_t ReadRaw(void *, off_t, size_t); virtual int Fstat(struct stat *buff); virtual ssize_t Write(const void *buff, off_t offset, size_t blen); diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index c70f338e8..01bd213a1 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -50,40 +50,11 @@ #include "XrdOuc/XrdOucName2Name.hh" #include "XrdSys/XrdSysPlatform.hh" #include - +#include "XrdOuc/XrdOucIOVec.hh" #include "XrdCeph/XrdCephPosix.hh" - +#include 
"XrdCeph/XrdCephBulkAioRead.hh" #include "XrdSfs/XrdSfsFlags.hh" // for the OFFLINE flag status -/// small structs to store file metadata -struct CephFile { - std::string name; - std::string pool; - std::string userId; - unsigned int nbStripes; - unsigned long long stripeUnit; - unsigned long long objectSize; -}; - -struct CephFileRef : CephFile { - int flags; - mode_t mode; - uint64_t offset; - // This mutex protects against parallel updates of the stats. - XrdSysMutex statsMutex; - uint64_t maxOffsetWritten; - uint64_t bytesAsyncWritePending; - uint64_t bytesWritten; - unsigned rdcount; - unsigned wrcount; - unsigned asyncRdStartCount; - unsigned asyncRdCompletionCount; - unsigned asyncWrStartCount; - unsigned asyncWrCompletionCount; - ::timeval lastAsyncSubmission; - double longestAsyncWriteTime; - double longestCallbackInvocation; -}; /// small struct for directory listing struct DirIterator { @@ -115,6 +86,9 @@ std::vector g_cluster; XrdSysMutex g_striper_mutex; /// index of current Striper/IoCtx to be used unsigned int g_cephPoolIdx = 0; +///If aio read operation takes longer than this value, a warning +///will be issued +unsigned int g_cephAioWaitThresh = 15; /// size of the Striper/IoCtx pool, defaults to 1 /// may be overwritten in the configuration file /// (See XrdCephOss::configure) @@ -262,6 +236,7 @@ static unsigned int stoui(const std::string &s) { return (unsigned int)res; } + void dumpClusterInfo() { //JW // log the current state of the cluster: @@ -704,6 +679,8 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode bool fileExists = (rc != -ENOENT); //Make clear what condition we are testing + logwrapper((char*)"Access Mode: %s flags&O_ACCMODE %d ", pathname, flags); + if ((flags&O_ACCMODE) == O_RDONLY) { // Access mode is READ if (fileExists) { @@ -925,6 +902,82 @@ ssize_t ceph_aio_write(int fd, XrdSfsAio *aiop, AioCB *cb) { } } +ssize_t ceph_nonstriper_readv(int fd, XrdOucIOVec *readV, int n) { + CephFileRef* fr = 
getFileRef(fd); + if (fr) { + // TODO implement proper logging level for this plugin - this should be only debug + //logwrapper((char*)"ceph_read: for fd %d, count=%d", fd, count); + if ((fr->flags & O_WRONLY) != 0) { + return -EBADF; + } + if (fr->nbStripes != 1) { + //Non-striper based read method works only with a single stripe + return -ENOTSUP; + } + + ssize_t read_bytes; + int rc; + + librados::IoCtx *ioctx = getIoCtx(*fr); + if (0 == ioctx) { + return -EINVAL; + } + + try { + //Constructor can throw bad alloc + bulkAioRead readOp(ioctx, logwrapper, fr); + + for (int i = 0; i < n; i++) { + rc = readOp.read(readV[i].data, readV[i].size, readV[i].offset); + if (rc < 0) { + logwrapper( (char*)"Can not declare read request\n"); + return rc; + } + } + + std::time_t wait_time = std::time(0); + rc = readOp.submit_and_wait_for_complete(); + wait_time = std::time(0) - wait_time; + if (wait_time > g_cephAioWaitThresh) { + logwrapper( + (char*)"Waiting for AIO results in readv for %s took %ld seconds, too long!\n", + fr->name.c_str(), + wait_time + ); + } + if (rc < 0) { + logwrapper( (char*)"Can not submit read requests\n"); + return rc; + } + read_bytes = readOp.get_results(); + XrdSysMutexHelper lock(fr->statsMutex); + //We consider readv as a single operation + fr->rdcount += 1; + return read_bytes; + } catch(std::bad_alloc&) { + return -ENOMEM; + } + } else { + return -EBADF; + } +} + +ssize_t ceph_striper_readv(int fd, XrdOucIOVec *readV, int n) { + /** + * Sequential, striper-based readv implementation. 
+ */ + ssize_t nbytes = 0, curCount = 0; + for (int i=0; iflags & O_WRONLY) != 0) { + return -EBADF; + } + if (fr->nbStripes != 1) { + //Non-striper based read method works only with a single stripe + return -ENOTSUP; + } + + int rc; + ssize_t bytes_read; + + librados::IoCtx *ioctx = getIoCtx(*fr); + if (0 == ioctx) { + return -EINVAL; + } + + try { + //Constructor can throw bad alloc + bulkAioRead readOp(ioctx, logwrapper, fr); + rc = readOp.read(buf, count, offset); + if (rc < 0) { + logwrapper( (char*)"Can not declare read request\n"); + return rc; + } + std::time_t wait_time = std::time(0); + rc = readOp.submit_and_wait_for_complete(); + wait_time = std::time(0) - wait_time; + if (wait_time > g_cephAioWaitThresh) { + logwrapper( + (char*)"Waiting for AIO results in pread for %s took %ld seconds, too long!\n", + fr->name.c_str(), + wait_time + ); + } + if (rc < 0) { + logwrapper( (char*)"Can not submit read request\n"); + return rc; + } + bytes_read = readOp.get_results(); + + if (bytes_read > 0) { + XrdSysMutexHelper lock(fr->statsMutex); + fr->rdcount++; + } else { + logwrapper( (char*)"Error while read\n"); + } + return bytes_read; + } catch (std::bad_alloc&) { + return -ENOMEM; + } + } else { + return -EBADF; + } +} + ssize_t ceph_posix_pread(int fd, void *buf, size_t count, off64_t offset) { CephFileRef* fr = getFileRef(fd); if (fr) { diff --git a/src/XrdCeph/XrdCephPosix.hh b/src/XrdCeph/XrdCephPosix.hh index d24efe6aa..9b9c2e0f0 100644 --- a/src/XrdCeph/XrdCephPosix.hh +++ b/src/XrdCeph/XrdCephPosix.hh @@ -31,10 +31,13 @@ #include #include +#include #include #include #include +#include "XrdSys/XrdSysPthread.hh" +#include "XrdOuc/XrdOucIOVec.hh" // simple logging for XrdCeph buffering code #define XRDCEPHLOGLEVEL 1 #ifdef XRDCEPHLOGLEVEL @@ -47,6 +50,7 @@ #define LOGCEPH(x) #endif + class XrdSfsAio; typedef void(AioCB)(XrdSfsAio*, size_t); @@ -60,7 +64,10 @@ off64_t ceph_posix_lseek64(int fd, off64_t offset, int whence); ssize_t ceph_posix_write(int fd, 
const void *buf, size_t count); ssize_t ceph_posix_pwrite(int fd, const void *buf, size_t count, off64_t offset); ssize_t ceph_aio_write(int fd, XrdSfsAio *aiop, AioCB *cb); +ssize_t ceph_nonstriper_readv(int fd, XrdOucIOVec *readV, int n); +ssize_t ceph_striper_readv(int fd, XrdOucIOVec *readV, int n); ssize_t ceph_posix_read(int fd, void *buf, size_t count); +ssize_t ceph_posix_nonstriper_pread(int fd, void *buf, size_t count, off64_t offset); ssize_t ceph_posix_pread(int fd, void *buf, size_t count, off64_t offset); ssize_t ceph_aio_read(int fd, XrdSfsAio *aiop, AioCB *cb); int ceph_posix_fstat(int fd, struct stat *buf); @@ -87,4 +94,34 @@ DIR* ceph_posix_opendir(XrdOucEnv* env, const char *pathname); int ceph_posix_readdir(DIR* dirp, char *buff, int blen); int ceph_posix_closedir(DIR *dirp); +/// small structs to store file metadata +struct CephFile { + std::string name; + std::string pool; + std::string userId; + unsigned int nbStripes; + unsigned long long stripeUnit; + unsigned long long objectSize; +}; + +struct CephFileRef : CephFile { + int flags; + mode_t mode; + uint64_t offset; + // This mutex protects against parallel updates of the stats. 
+ XrdSysMutex statsMutex; + uint64_t maxOffsetWritten; + uint64_t bytesAsyncWritePending; + uint64_t bytesWritten; + unsigned rdcount; + unsigned wrcount; + unsigned asyncRdStartCount; + unsigned asyncRdCompletionCount; + unsigned asyncWrStartCount; + unsigned asyncWrCompletionCount; + ::timeval lastAsyncSubmission; + double longestAsyncWriteTime; + double longestCallbackInvocation; +}; + #endif // __XRD_CEPH_POSIX__ From 14bb81a225f76b323521b66dee0698534be756e8 Mon Sep 17 00:00:00 2001 From: snafus Date: Wed, 10 May 2023 09:55:14 -0400 Subject: [PATCH 13/18] Buffered io nonstriperbuffer (#43) * Add capability for buffer io raw to use striperless reads * Add capability for buffer io raw to use striperless reads * Add a maybe striper for reading in ceph posix * Use striperless reads when bypassing the buffer --- .../XrdCephBuffers/CephIOAdapterRaw.cc | 13 ++++++++---- .../XrdCephBuffers/CephIOAdapterRaw.hh | 4 +++- .../XrdCephBuffers/XrdCephBufferAlgSimple.cc | 9 ++++++--- .../XrdCephBuffers/XrdCephBufferAlgSimple.hh | 4 +++- src/XrdCeph/XrdCephOssBufferedFile.cc | 3 ++- src/XrdCeph/XrdCephPosix.cc | 20 +++++++++++++++++++ src/XrdCeph/XrdCephPosix.hh | 2 ++ 7 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc index 28815b779..cb9b2d7fa 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -11,8 +11,10 @@ using namespace XrdCephBuffer; using myclock = std::chrono::steady_clock; //using myseconds = std::chrono::duration(end-start); diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh index 3c7011ef7..f893bb022 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh @@ -27,7 +27,8 @@ namespace XrdCephBuffer { */ class CephIOAdapterRaw: public virtual ICephIOAdapter { public: - 
CephIOAdapterRaw(IXrdCephBufferData * bufferdata, int fd); + CephIOAdapterRaw(IXrdCephBufferData * bufferdata, int fd, + bool useStriperlessReads); virtual ~CephIOAdapterRaw(); /** @@ -57,6 +58,7 @@ class CephIOAdapterRaw: public virtual ICephIOAdapter { private: IXrdCephBufferData * m_bufferdata; //!< no ownership of pointer (consider shared ptrs, etc) int m_fd; + bool m_useStriperlessReads {true}; //!< use the striperless read code // timer and counter info std::atomic< long> m_stats_read_timer{0}, m_stats_write_timer{0}; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc index 1485ff8f1..894034e6a 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc @@ -18,8 +18,10 @@ using namespace XrdCephBuffer; XrdCephBufferAlgSimple::XrdCephBufferAlgSimple(std::unique_ptr buffer, - std::unique_ptr cephio, int fd ): -m_bufferdata(std::move(buffer)), m_cephio(std::move(cephio)), m_fd(fd){ + std::unique_ptr cephio, int fd, + bool useStriperlessReads): +m_bufferdata(std::move(buffer)), m_cephio(std::move(cephio)), m_fd(fd), +m_useStriperlessReads(useStriperlessReads) { } @@ -111,7 +113,8 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t m_bufferdata->invalidate(); m_bufferLength =0; // ensure cached data is set to zero length // #FIXME JW: const_cast is probably a bit poor. 
- ssize_t rc = ceph_posix_pread(m_fd, const_cast(buf), blen, offset); + + ssize_t rc = ceph_posix_maybestriper_pread (m_fd, const_cast(buf), blen, offset, m_useStriperlessReads); if (rc > 0) { m_stats_bytes_fromceph += rc; m_stats_bytes_toclient += rc; diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh index fdd0a2227..e96bd4018 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.hh @@ -26,7 +26,8 @@ namespace XrdCephBuffer { class XrdCephBufferAlgSimple : public virtual IXrdCephBufferAlg { public: - XrdCephBufferAlgSimple(std::unique_ptr buffer, std::unique_ptr cephio, int fd ); + XrdCephBufferAlgSimple(std::unique_ptr buffer, std::unique_ptr cephio, int fd, + bool useStriperlessReads = true ); virtual ~XrdCephBufferAlgSimple(); virtual ssize_t read_aio (XrdSfsAio *aoip) override; @@ -49,6 +50,7 @@ class XrdCephBufferAlgSimple : public virtual IXrdCephBufferAlg { std::unique_ptr m_bufferdata; //! 
this algorithm takes ownership of the buffer, and will delete it on destruction std::unique_ptr m_cephio ; // no ownership is taken here int m_fd = -1; + bool m_useStriperlessReads {true}; off_t m_bufferStartingOffset = 0; size_t m_bufferLength = 0; diff --git a/src/XrdCeph/XrdCephOssBufferedFile.cc b/src/XrdCeph/XrdCephOssBufferedFile.cc index 1b230317a..83476cb3e 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.cc +++ b/src/XrdCeph/XrdCephOssBufferedFile.cc @@ -327,7 +327,8 @@ std::unique_ptr XrdCephOssBufferedFile::create if (m_bufferIOmode == "aio") { cephio = std::unique_ptr(new CephIOAdapterAIORaw(cephbuffer.get(),m_fd)); } else if (m_bufferIOmode == "io") { - cephio = std::unique_ptr(new CephIOAdapterRaw(cephbuffer.get(),m_fd)); + cephio = std::unique_ptr(new CephIOAdapterRaw(cephbuffer.get(),m_fd, + !m_cephoss->m_useDefaultPreadAlg)); } else { BUFLOG("XrdCephOssBufferedFile: buffer mode needs to be one of aio|io " ); m_xrdOssDF->Close(); diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 01bd213a1..5640b9f7c 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -1089,6 +1089,26 @@ ssize_t ceph_posix_pread(int fd, void *buf, size_t count, off64_t offset) { } } +ssize_t ceph_posix_maybestriper_pread(int fd, void *buf, size_t count, off64_t offset, bool allowStriper) { + ssize_t rc {0}; + if (!allowStriper) { + rc = ceph_posix_pread(fd,buf,count,offset); + return rc; + } + rc = ceph_posix_nonstriper_pread(fd, buf, count,offset); + if (-ENOENT == rc || -ENOTSUP == rc) { + //This might be a sparse file or nbstripes > 1, so let's try striper read + rc = ceph_posix_pread(fd, buf, count,offset); + if (rc >= 0) { + char err_str[100]; //99 symbols should be enough for the short message + snprintf(err_str, 100, "WARNING! 
The file (fd %d) seem to be sparse, this is not expected", fd); + logwrapper(err_str); + } + } + return rc; +} + + static void ceph_aio_read_complete(rados_completion_t c, void *arg) { AioArgs *awa = reinterpret_cast(arg); size_t rc = rados_aio_get_return_value(c); diff --git a/src/XrdCeph/XrdCephPosix.hh b/src/XrdCeph/XrdCephPosix.hh index 9b9c2e0f0..25a7ea013 100644 --- a/src/XrdCeph/XrdCephPosix.hh +++ b/src/XrdCeph/XrdCephPosix.hh @@ -69,6 +69,8 @@ ssize_t ceph_striper_readv(int fd, XrdOucIOVec *readV, int n); ssize_t ceph_posix_read(int fd, void *buf, size_t count); ssize_t ceph_posix_nonstriper_pread(int fd, void *buf, size_t count, off64_t offset); ssize_t ceph_posix_pread(int fd, void *buf, size_t count, off64_t offset); +ssize_t ceph_posix_maybestriper_pread(int fd, void *buf, size_t count, off64_t offset, bool allowStriper=true); + ssize_t ceph_aio_read(int fd, XrdSfsAio *aiop, AioCB *cb); int ceph_posix_fstat(int fd, struct stat *buf); int ceph_posix_stat(XrdOucEnv* env, const char *pathname, struct stat *buf); From d8e1d985b077d60e1c473452cf4ec512f109fcce Mon Sep 17 00:00:00 2001 From: snafus Date: Thu, 1 Jun 2023 09:43:49 +0200 Subject: [PATCH 14/18] Update XrdCephBufferAlgSimple.cc (#45) Remove verbose logging for case when cache is bypassed, as the read size is at least the size of the buffer. 
--- src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc index 894034e6a..43c355fbb 100644 --- a/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc +++ b/src/XrdCeph/XrdCephBuffers/XrdCephBufferAlgSimple.cc @@ -107,8 +107,8 @@ ssize_t XrdCephBufferAlgSimple::read(volatile void *buf, off_t offset, size_t * Invalidate the cache in anycase */ if (blen >= m_bufferdata->capacity()) { - BUFLOG("XrdCephBufferAlgSimple::read: Readthrough cache: fd: " << m_fd - << " " << offset << " " << blen); + //BUFLOG("XrdCephBufferAlgSimple::read: Readthrough cache: fd: " << m_fd + // << " " << offset << " " << blen); // larger than cache, so read through, and invalidate the cache anyway m_bufferdata->invalidate(); m_bufferLength =0; // ensure cached data is set to zero length From cfee4f0dbf890754847b51178bc7a7fb71d3cea9 Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Wed, 5 Jul 2023 15:18:43 +0100 Subject: [PATCH 15/18] Buffclosdiv01 (#47) * catch division by 0 in CephIOAdapterRaw.cc, increase granularity to nanoseconds * long to unsigned long long explicit typecasting --- .../XrdCephBuffers/CephIOAdapterRaw.cc | 24 +++++++++---------- .../XrdCephBuffers/CephIOAdapterRaw.hh | 8 +++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc index cb9b2d7fa..130fb71a8 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.cc @@ -20,18 +20,18 @@ CephIOAdapterRaw::CephIOAdapterRaw(IXrdCephBufferData * bufferdata, int fd, CephIOAdapterRaw::~CephIOAdapterRaw() { // nothing to specifically to do; just print out some stats float read_speed{0}, write_speed{0}; - if (m_stats_read_req.load() > 0) { - read_speed = 
m_stats_read_bytes.load() / m_stats_read_timer.load() * 1e-3; + if (m_stats_read_req.load() > 0 && m_stats_read_timer.load() > 0 ) { + read_speed = m_stats_read_bytes.load() / m_stats_read_timer.load() * 1e-6; } - if (m_stats_write_req.load() > 0) { - write_speed = m_stats_write_bytes.load() / m_stats_write_timer.load() * 1e-3; + if (m_stats_write_req.load() > 0 && m_stats_read_timer.load() > 0 ) { + write_speed = m_stats_write_bytes.load() / m_stats_write_timer.load() * 1e-6; } BUFLOG("CephIOAdapterRaw::Summary fd:" << m_fd << " nwrite:" << m_stats_write_req << " byteswritten:" << m_stats_write_bytes << " write_s:" - << m_stats_write_timer * 1e-3 << " writemax_s" << m_stats_write_longest * 1e-3 + << m_stats_write_timer * 1e-6 << " writemax_s" << m_stats_write_longest * 1e-6 << " write_MBs:" << write_speed << " nread:" << m_stats_read_req << " bytesread:" << m_stats_read_bytes << " read_s:" - << m_stats_read_timer * 1e-3 << " readmax_s:" << m_stats_read_longest * 1e-3 + << m_stats_read_timer * 1e-6 << " readmax_s:" << m_stats_read_longest * 1e-6 << " read_MBs:" << read_speed << " striperlessRead: " << m_useStriperlessReads ); @@ -45,14 +45,14 @@ ssize_t CephIOAdapterRaw::write(off64_t offset,size_t count) { auto start = std::chrono::steady_clock::now(); ssize_t rc = ceph_posix_pwrite(m_fd,buf,count,offset); auto end = std::chrono::steady_clock::now(); - auto int_ms = std::chrono::duration_cast(end-start); + auto int_ns = std::chrono::duration_cast(end-start); // BUFLOG("CephIOAdapterRaw::write fd:" << m_fd << " " << rc << " " // << offset << " " << count << " " << rc << " " << int_ms.count() ); if (rc < 0) return rc; - m_stats_write_longest = std::max(m_stats_write_longest,int_ms.count()); - m_stats_write_timer.fetch_add(int_ms.count()); + m_stats_write_longest = std::max(m_stats_write_longest,static_cast(int_ns.count())); + m_stats_write_timer.fetch_add(static_cast(int_ns.count())); m_stats_write_bytes.fetch_add(rc); ++m_stats_write_req; return rc; @@ -71,15 
+71,15 @@ ssize_t CephIOAdapterRaw::read(off64_t offset, size_t count) { rc = ceph_posix_maybestriper_pread(m_fd,buf,count,offset, m_useStriperlessReads); auto end = std::chrono::steady_clock::now(); //auto elapsed = end-start; - auto int_ms = std::chrono::duration_cast(end-start); + auto int_ns = std::chrono::duration_cast(end-start); if (rc < 0) { BUFLOG("CephIOAdapterRaw::read: Error in read: " << rc ); return rc; } - m_stats_read_longest = std::max(m_stats_read_longest,int_ms.count()); - m_stats_read_timer.fetch_add(int_ms.count()); + m_stats_read_longest = std::max(m_stats_read_longest,static_cast(int_ns.count())); + m_stats_read_timer.fetch_add(static_cast(int_ns.count())); m_stats_read_bytes.fetch_add(rc); ++m_stats_read_req; diff --git a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh index f893bb022..55d427f9e 100644 --- a/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh +++ b/src/XrdCeph/XrdCephBuffers/CephIOAdapterRaw.hh @@ -61,10 +61,10 @@ class CephIOAdapterRaw: public virtual ICephIOAdapter { bool m_useStriperlessReads {true}; //!< use the striperless read code // timer and counter info - std::atomic< long> m_stats_read_timer{0}, m_stats_write_timer{0}; - std::atomic< long> m_stats_read_bytes{0}, m_stats_write_bytes{0}; - std::atomic< long> m_stats_read_req{0}, m_stats_write_req{0}; - long m_stats_read_longest{0}, m_stats_write_longest{0}; + std::atomic< long long> m_stats_read_timer{0}, m_stats_write_timer{0}; + std::atomic< long long> m_stats_read_bytes{0}, m_stats_write_bytes{0}; + std::atomic< long long> m_stats_read_req{0}, m_stats_write_req{0}; + long long m_stats_read_longest{0}, m_stats_write_longest{0}; }; From 04fabbb877b2a274ba30b927bda7242098e95224 Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Fri, 22 Sep 2023 12:00:16 +0100 Subject: [PATCH 16/18] return error code on read error (#49) return read return value when triggering error while read --- 
src/XrdCeph/XrdCephPosix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 950e7f50c..4339a80d3 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -1054,7 +1054,7 @@ ssize_t ceph_posix_nonstriper_pread(int fd, void *buf, size_t count, off64_t off XrdSysMutexHelper lock(fr->statsMutex); fr->rdcount++; } else { - logwrapper( (char*)"Error while read\n"); + logwrapper( (char*)"Error while read: %d\n", bytes_read); } return bytes_read; } catch (std::bad_alloc&) { From 1af1e5d317c9d9ed08d58c25fbaa219abf610888 Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Wed, 10 Jan 2024 15:00:00 +0000 Subject: [PATCH 17/18] bugfix for calculating striper objects in direct reads (#50) * get stripeunit and object size from xattr of first stripe use striper.layout.object_size, not striper.size as that is the size of the whole object get the striper layout info on file open use min of return code of object striper layout metadata * use striper.layout.object_size, not striper.size as that is the size of the whole object * improvements from review --------- Co-authored-by: root --- src/XrdCeph/XrdCephPosix.cc | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 4339a80d3..62be37d7b 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ b/src/XrdCeph/XrdCephPosix.cc @@ -684,6 +684,31 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode if ((flags&O_ACCMODE) == O_RDONLY) { // Access mode is READ if (fileExists) { + librados::bufferlist d_stripeUnit; + librados::bufferlist d_objectSize; + std::string obj_name; + librados::IoCtx *context = getIoCtx(fr); + // read first stripe of the object for xattr stripe unit and object size + // this will fail if the object was not written in stripes e.g. 
s3 + // TBD: fallback to direct object (no stripe id appends to filename, + // replace striper metadata with corresponding metadata) + // + try { + obj_name = fr.name + std::string(".0000000000000000"); + } catch (std::bad_alloc&) { + logwrapper((char*)"Can not create object string for file %s)", fr.name.c_str()); + } + int ret = 0; + ret = context->getxattr(obj_name, "striper.layout.stripe_unit", d_stripeUnit); + ret = std::min(ret,context->getxattr(obj_name, "striper.layout.object_size", d_objectSize)); + //log_func((char*)"size xattr for %s , %llu ,%llu", file_ref->name.c_str(), file_ref->objectSize, file_ref->stripeUnit ); + if (ret<0){ + logwrapper((char*)"Could not find size or stripe_unit xattr for %s", fr.name.c_str()); + } + else{ + fr.stripeUnit = std::stoull(d_stripeUnit.c_str()); + fr.objectSize = std::stoull(d_objectSize.c_str()); + } int fd = insertFileRef(fr); logwrapper((char*)"File descriptor %d associated to file %s opened in read mode", fd, pathname); return fd; From 03e018132bfe1ccf16aec94c2b819480742286a2 Mon Sep 17 00:00:00 2001 From: Jo-stfc <71326101+Jo-stfc@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:21:15 +0100 Subject: [PATCH 18/18] clean garbage from rados read (#52) * clean garbage from rados read * static alloc * static alloc * static alloc needs manual null * comments and warning for nondefault params * add filename in log * add filename in log * code review changes * c++14 compatibility fixes --------- Co-authored-by: root Co-authored-by: root --- packaging/rhel/xrootd-ceph-buffered.spec.in | 179 ++++++++++++++++++++ src/XrdCeph/XrdCephOssBufferedFile.cc | 2 +- src/XrdCeph/XrdCephPosix.cc | 24 ++- src/XrdCeph/XrdCephPosix.hh | 2 + 4 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 packaging/rhel/xrootd-ceph-buffered.spec.in diff --git a/packaging/rhel/xrootd-ceph-buffered.spec.in b/packaging/rhel/xrootd-ceph-buffered.spec.in new file mode 100644 index 000000000..96264c87a --- /dev/null +++ 
b/packaging/rhel/xrootd-ceph-buffered.spec.in @@ -0,0 +1,179 @@ +#------------------------------------------------------------------------------- +# Helper macros +#------------------------------------------------------------------------------- +%if %{?rhel:1}%{!?rhel:0} + %if %{rhel} >= 7 + %define use_systemd 1 + %else + %define use_systemd 0 + %endif +%else + %if %{?fedora}%{!?fedora:0} >= 19 + %define use_systemd 1 + %else + %define use_systemd 0 + %endif +%endif + +%if %{?fedora}%{!?fedora:0} >= 22 + %define use_libc_semaphore 1 +%else + %define use_libc_semaphore 0 +%endif + +%if %{?_with_ceph11:1}%{!?_with_ceph11:0} + %define _with_ceph 1 +%endif + +%if %{?rhel:1}%{!?rhel:0} + %if %{rhel} > 7 + %define use_cmake3 0 + %else + %define use_cmake3 1 + %endif +%else + %define use_cmake3 0 +%endif + +#------------------------------------------------------------------------------- +# Package definitions +#------------------------------------------------------------------------------- +Name: xrootd-ceph-buffered +Epoch: 1 +Version: __VERSION__ +Release: __RELEASE__%{?dist}%{?_with_clang:.clang} +Summary: CEPH plug-in for XRootD +Group: System Environment/Daemons +License: LGPLv3+ +URL: http://xrootd.org/ + +# git clone http://xrootd.org/repo/xrootd.git xrootd +# cd xrootd +# git-archive master | gzip -9 > ~/rpmbuild/SOURCES/xrootd.tgz +Source0: xrootd-ceph-buffered.tar.gz + +BuildRoot: %{_tmppath}/%{name}-root + +%if %{use_cmake3} +BuildRequires: cmake3 +%else +BuildRequires: cmake +%endif + +%if %{?_with_tests:1}%{!?_with_tests:0} +BuildRequires: cppunit-devel +%endif + +BuildRequires: librados-devel = 2:14.2.22 +BuildRequires: libradosstriper-devel = 2:14.2.22 + +%if %{?_with_clang:1}%{!?_with_clang:0} +BuildRequires: clang +%endif + +#BuildRequires: xrootd-server-devel%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-private-devel%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-libs%{?_isa} = %{epoch}:%{version}-%{release} 
+#BuildRequires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} +#BuildRequires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} + +#Requires: xrootd-server-libs%{?_isa} = %{epoch}:%{version}-%{release} +#Requires: xrootd-client-libs%{?_isa} = %{epoch}:%{version}-%{release} +#Requires: xrootd-libs%{?_isa} = %{epoch}:%{version}-%{release} + +BuildRequires: xrootd-server-devel%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-private-devel%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-libs%{?_isa} >= 1:5.3.1 +BuildRequires: xrootd-server-libs%{?_isa} >= 1:5.3.3 +BuildRequires: xrootd-client-libs%{?_isa} >= 1:5.3.3 + +Requires: xrootd-server-libs%{?_isa} >= 1:5.3.3 +Requires: xrootd-client-libs%{?_isa} >= 1:5.3.3 +Requires: xrootd-libs%{?_isa} >= 1:5.3.3 + +%description +The xrootd-ceph-buffered is an OSS layer plug-in for the XRootD server for interfacing +with the Ceph storage platform. + +#------------------------------------------------------------------------------- +# Build instructions +#------------------------------------------------------------------------------- +%prep +%setup -c -n xrootd-ceph-buffered + +%build +cd xrootd-ceph-buffered + +%if %{?_with_clang:1}%{!?_with_clang:0} +export CC=clang +export CXX=clang++ +%endif + +mkdir build +pushd build + +%if %{use_cmake3} +cmake3 \ +%else +cmake \ +%endif + -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo \ +%if %{?_with_tests:1}%{!?_with_tests:0} + -DENABLE_TESTS=TRUE \ +%else + -DENABLE_TESTS=FALSE \ +%endif + ../ + +make -i VERBOSE=1 %{?_smp_mflags} +popd + +#------------------------------------------------------------------------------- +# Installation +#------------------------------------------------------------------------------- +%install +rm -rf $RPM_BUILD_ROOT + +#------------------------------------------------------------------------------- +# Install 4.x.y +#------------------------------------------------------------------------------- +pushd xrootd-ceph-buffered 
+pushd build +make install DESTDIR=$RPM_BUILD_ROOT +popd + +# ceph posix unversioned so +rm -f $RPM_BUILD_ROOT%{_libdir}/libXrdCephPosix.so + + +%clean +rm -rf $RPM_BUILD_ROOT + +#------------------------------------------------------------------------------- +# Files +#------------------------------------------------------------------------------- +%files +%defattr(-,root,root,-) +%{_libdir}/libXrdCeph-5.so +%{_libdir}/libXrdCephXattr-5.so +%{_libdir}/libXrdCephPosix.so* + +%if %{?_with_tests:1}%{!?_with_tests:0} +%files tests +%defattr(-,root,root,-) +%{_libdir}/libXrdCephTests*.so +%endif + +#------------------------------------------------------------------------------- +# Changelog +#------------------------------------------------------------------------------- +%changelog +* Mon Mar 14 2022 Jyothish Thomas +-offline file bug fix +* Wed Dec 16 2020 George Patargias +- updated version for librados-devel and libradosstriper-devel to 14.2.15 following the recent upgrade on external Echo gateways +- fixed version in xrootd-ceph-buffered shared libraries +* Mon Mar 02 2020 Michal Simon +- fixed RPM dependencies +* Thu Mar 08 2018 Michal Simon +- initial release diff --git a/src/XrdCeph/XrdCephOssBufferedFile.cc b/src/XrdCeph/XrdCephOssBufferedFile.cc index 83476cb3e..113cddcfa 100644 --- a/src/XrdCeph/XrdCephOssBufferedFile.cc +++ b/src/XrdCeph/XrdCephOssBufferedFile.cc @@ -159,7 +159,7 @@ ssize_t XrdCephOssBufferedFile::Read(void *buff, off_t offset, size_t blen) { auto buffer_itr = m_bufferReadAlgs.find(thread_id); if (buffer_itr == m_bufferReadAlgs.end()) { // only create a buffer, if we haven't hit the max buffers yet - auto buffer_ptr = std::move(createBuffer()); + auto buffer_ptr = createBuffer(); if (buffer_ptr) { buffer = buffer_ptr.get(); m_bufferReadAlgs[thread_id] = std::move(buffer_ptr); diff --git a/src/XrdCeph/XrdCephPosix.cc b/src/XrdCeph/XrdCephPosix.cc index 62be37d7b..910505a66 100644 --- a/src/XrdCeph/XrdCephPosix.cc +++ 
b/src/XrdCeph/XrdCephPosix.cc @@ -688,6 +688,7 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode librados::bufferlist d_objectSize; std::string obj_name; librados::IoCtx *context = getIoCtx(fr); + // read first stripe of the object for xattr stripe unit and object size // this will fail if the object was not written in stripes e.g. s3 // TBD: fallback to direct object (no stripe id appends to filename, @@ -697,17 +698,34 @@ int ceph_posix_open(XrdOucEnv* env, const char *pathname, int flags, mode_t mode obj_name = fr.name + std::string(".0000000000000000"); } catch (std::bad_alloc&) { logwrapper((char*)"Can not create object string for file %s)", fr.name.c_str()); + return -ENOMEM; } int ret = 0; ret = context->getxattr(obj_name, "striper.layout.stripe_unit", d_stripeUnit); ret = std::min(ret,context->getxattr(obj_name, "striper.layout.object_size", d_objectSize)); //log_func((char*)"size xattr for %s , %llu ,%llu", file_ref->name.c_str(), file_ref->objectSize, file_ref->stripeUnit ); - if (ret<0){ + if (ret<=0){ logwrapper((char*)"Could not find size or stripe_unit xattr for %s", fr.name.c_str()); } else{ - fr.stripeUnit = std::stoull(d_stripeUnit.c_str()); - fr.objectSize = std::stoull(d_objectSize.c_str()); + //librados's c_str() method does not return a NULL-terminated string, hence why we need to cleanup here + char cleanStripeUnit[MAXDIGITSIZE]; + char cleanObjectSize[MAXDIGITSIZE]; + unsigned int stripeUnitLength = std::min((unsigned int)MAXDIGITSIZE-1, d_stripeUnit.length()); + unsigned int objectSizeLength = std::min((unsigned int)MAXDIGITSIZE-1, d_objectSize.length()); + (void)strncpy( cleanStripeUnit, d_stripeUnit.c_str(), stripeUnitLength ); + (void)strncpy( cleanObjectSize, d_objectSize.c_str(), objectSizeLength ); + cleanStripeUnit[stripeUnitLength] = '\0'; + cleanObjectSize[objectSizeLength] = '\0'; + //only change defaults if different + if(fr.stripeUnit != std::stoull(cleanStripeUnit)){ + 
logwrapper((char*)"WARNING: stripe unit of %s does not match defaults. object size is %s", pathname, cleanStripeUnit); + fr.stripeUnit = std::stoull(cleanStripeUnit); + } + if(fr.objectSize != std::stoull(cleanObjectSize)){ + logwrapper((char*)"WARNING: object size of %s does not match defaults. object size is %s",pathname, cleanObjectSize); + fr.objectSize = std::stoull(cleanObjectSize); + } } int fd = insertFileRef(fr); logwrapper((char*)"File descriptor %d associated to file %s opened in read mode", fd, pathname); diff --git a/src/XrdCeph/XrdCephPosix.hh b/src/XrdCeph/XrdCephPosix.hh index 5740d4c62..5d01129b6 100644 --- a/src/XrdCeph/XrdCephPosix.hh +++ b/src/XrdCeph/XrdCephPosix.hh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ // simple logging for XrdCeph buffering code #define XRDCEPHLOGLEVEL 1 +#define MAXDIGITSIZE 32 #ifdef XRDCEPHLOGLEVEL // ensure that // extern XrdOucTrace XrdCephTrace;