// udo_kmeans.cpp

#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <new>
#include <optional>
#include <random>
#include <span>
#include <type_traits>
#include <utility>
#include <vector>
#ifdef UDO_STANDALONE
#include <charconv>
#include <chrono>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <udo/UDOStandalone.hpp>
#include <sched.h>
#endif
//---------------------------------------------------------------------------
#include <udo/UDOperator.hpp>
//---------------------------------------------------------------------------
using namespace std;
//---------------------------------------------------------------------------
/// A tuple this UDO takes as an input: a point in the plane together with a
/// payload value that is copied unchanged into the corresponding Output tuple.
struct Input {
   // The x-coordinate of the point
   double x;
   // The y-coordinate of the point
   double y;
   // The payload; never interpreted by the operator, only passed through
   uint64_t payload;
};
//---------------------------------------------------------------------------
/// A tuple generated by this UDO: the input point and payload plus the id of
/// the cluster the point was assigned to.
struct Output {
   // The x-coordinate of the point (copied from the input)
   double x;
   // The y-coordinate of the point (copied from the input)
   double y;
   // The payload (copied from the input)
   uint64_t payload;
   // The id of the cluster this point belongs to, i.e. an index into the
   // operator's list of cluster centers
   uint16_t clusterId;
};
//---------------------------------------------------------------------------
/// A container that has stable references and constant time insertion at the
/// end. The elements live in a singly linked list of malloc'ed chunks; a new
/// chunk holds max(size() / 8, minimumNumElements()) elements, so the chunk
/// size grows with the container. Only appending, merging and iterating are
/// supported; the container is move-only.
///
/// The chunks are allocated with std::malloc, which only guarantees the
/// alignment of fundamental types, so over-aligned element types are not
/// supported.
template <typename T>
class ChunkedStorage {
   private:
   /// The header of a chunk. Make sure that this is aligned by at least the
   /// alignment of T so that the very first address after this is a valid
   /// address for T.
   struct alignas(std::max({alignof(T), alignof(std::size_t), alignof(void*)})) ChunkHeader {
      /// The total size of this chunk in bytes (including this header)
      std::size_t size;
      /// The next chunk in the list
      ChunkHeader* next = nullptr;
      /// The number of elements that are stored in this chunk
      std::size_t numElements = 0;

      /// Constructor from a size
      explicit ChunkHeader(std::size_t size) : size(size) {}

      /// Get the pointer to the first element. The elements are stored
      /// directly behind the header.
      T* getElements() {
         return reinterpret_cast<T*>(this + 1);
      }

      /// The maximum number of elements this chunk can hold
      std::size_t maxNumElements() const {
         return (size - sizeof(ChunkHeader)) / sizeof(T);
      }
   };

   /// The forward iterator over all elements of all chunks. The end iterator
   /// is represented by a null chunk with index 0.
   template <bool isConst>
   class Iterator {
      public:
      using difference_type = std::ptrdiff_t;
      using value_type = std::conditional_t<isConst, const T, T>;
      using pointer = value_type*;
      using reference = value_type&;
      using iterator_category = std::forward_iterator_tag;

      private:
      friend class ChunkedStorage;

      /// The current chunk
      ChunkHeader* chunk = nullptr;
      /// The current index in the chunk
      std::size_t elementIndex = 0;

      /// Forward the iterator to the first non-empty chunk
      void forward() {
         while (chunk && chunk->numElements == 0)
            chunk = chunk->next;
      }

      /// Constructor
      Iterator(ChunkHeader* chunk, std::size_t elementIndex) : chunk(chunk), elementIndex(elementIndex) {
         forward();
      }

      public:
      /// Default constructor, creates an end iterator
      Iterator() = default;

      /// Dereference
      reference operator*() const {
         return chunk->getElements()[elementIndex];
      }
      /// Dereference
      pointer operator->() const {
         return &operator*();
      }

      /// Pre-increment
      Iterator& operator++() {
         ++elementIndex;
         if (elementIndex == chunk->numElements) {
            // The current chunk is exhausted, continue with the next chunk
            // that actually contains elements.
            chunk = chunk->next;
            elementIndex = 0;
            forward();
         }
         return *this;
      }
      /// Post-increment
      Iterator operator++(int) {
         Iterator it(*this);
         operator++();
         return it;
      }

      /// Equality comparison. Spelled out member-wise (instead of being
      /// defaulted) so that the iterator does not depend on C++20 comparisons.
      bool operator==(const Iterator& other) const {
         return chunk == other.chunk && elementIndex == other.elementIndex;
      }
      /// Inequality comparison
      bool operator!=(const Iterator& other) const {
         return !(*this == other);
      }
   };

   /// A helper to iterate over a ChunkedStorage in parallel. Every call to
   /// next() hands out one whole chunk, so the chunk is the unit of work.
   template <bool isConst>
   class ParallelIterator {
      public:
      /// A range of elements over which a thread iterates exclusively
      class Range {
         public:
         /// The iterator of a range. It never leaves its chunk.
         class Iterator {
            public:
            using difference_type = std::ptrdiff_t;
            using value_type = std::conditional_t<isConst, const T, T>;
            using pointer = value_type*;
            using reference = value_type&;
            using iterator_category = std::forward_iterator_tag;

            private:
            friend class ParallelIterator;
            friend class Range;

            /// The chunk
            ChunkHeader* chunk = nullptr;
            /// The current index in the chunk
            std::size_t elementIndex = 0;

            /// Constructor
            Iterator(ChunkHeader* chunk, std::size_t elementIndex) : chunk(chunk), elementIndex(elementIndex) {}

            public:
            /// Default constructor
            Iterator() = default;

            /// Dereference
            reference operator*() const {
               return chunk->getElements()[elementIndex];
            }
            /// Dereference
            pointer operator->() const {
               return &operator*();
            }

            /// Pre-increment
            Iterator& operator++() {
               ++elementIndex;
               return *this;
            }
            /// Post-increment
            Iterator operator++(int) {
               Iterator it(*this);
               operator++();
               return it;
            }

            /// Equality comparison
            bool operator==(const Iterator& other) const {
               return chunk == other.chunk && elementIndex == other.elementIndex;
            }
            /// Inequality comparison
            bool operator!=(const Iterator& other) const {
               return !(*this == other);
            }
         };

         private:
         friend class ParallelIterator;

         /// The chunk for this range
         ChunkHeader* chunk = nullptr;

         /// Constructor
         Range(ChunkHeader* chunk) : chunk(chunk) {}

         public:
         /// Constructor
         Range() = default;

         /// Get the begin iterator
         Iterator begin() const {
            return Iterator(chunk, 0);
         }

         /// Get the end iterator. A range without a chunk is empty.
         Iterator end() const {
            if (chunk)
               return Iterator(chunk, chunk->numElements);
            else
               return Iterator(chunk, 0);
         }
      };

      private:
      friend class ChunkedStorage;

      /// The next chunk that can be used
      ChunkHeader* chunk = nullptr;

      /// Constructor
      ParallelIterator(ChunkHeader* chunk) : chunk(chunk) {}

      public:
      /// Constructor
      ParallelIterator() = default;

      /// Get the next range concurrently. Returns an empty optional as soon
      /// as all chunks were handed out.
      std::optional<Range> next() {
         // TODO: This should be atomic_ref, but libc++ hasn't implemented that yet.
         auto& chunkAtomic = reinterpret_cast<std::atomic<ChunkHeader*>&>(chunk);
         auto* currentChunk = chunkAtomic.load();
         while (currentChunk) {
            // Try to claim currentChunk by advancing the shared pointer to its
            // successor. On failure currentChunk is reloaded with the value
            // another thread installed and we try again.
            if (chunkAtomic.compare_exchange_weak(currentChunk, currentChunk->next)) {
               return Range(currentChunk);
            }
         }
         return std::nullopt;
      }
   };

   public:
   using value_type = T;
   using reference = T&;
   using const_reference = const T&;
   using iterator = Iterator<false>;
   using const_iterator = Iterator<true>;
   using difference_type = std::ptrdiff_t;
   using size_type = std::size_t;

   using parallel_iterator = ParallelIterator<false>;
   using parallel_const_iterator = ParallelIterator<true>;

   private:
   /// Get the minimum number of elements in a chunk. The size of a chunk
   /// should be at least 1024 bytes.
   static constexpr std::size_t minimumNumElements() {
      if (sizeof(ChunkHeader) + sizeof(T) >= 1024)
         return 1;

      // Round up so that header + elements cover at least 1024 bytes
      return (1024 - sizeof(ChunkHeader) - 1) / sizeof(T) + 1;
   }

   /// The first chunk
   ChunkHeader* frontChunk = nullptr;
   /// The last chunk
   ChunkHeader* backChunk = nullptr;
   /// The total number of elements
   std::size_t numElements = 0;

   /// Remove all elements and chunks
   void freeChunks() {
      auto* chunk = frontChunk;
      while (chunk) {
         // Remember the successor before the chunk is released
         auto* next = chunk->next;
         std::destroy_n(chunk->getElements(), chunk->numElements);
         std::free(chunk);
         chunk = next;
      }
      frontChunk = nullptr;
      backChunk = nullptr;
      numElements = 0;
   }

   /// Create a new chunk and append it at the end. Throws std::bad_alloc if
   /// the memory for the chunk cannot be allocated.
   void addChunk() {
      std::size_t newChunkElements = std::max(numElements / 8, minimumNumElements());
      std::size_t newChunkSize = sizeof(ChunkHeader) + newChunkElements * sizeof(T);
      void* memory = std::malloc(newChunkSize);
      if (!memory)
         throw std::bad_alloc();
      auto* chunkPtr = new (memory) ChunkHeader(newChunkSize);

      if (backChunk)
         backChunk->next = chunkPtr;
      else
         frontChunk = chunkPtr;
      backChunk = chunkPtr;
   }

   public:
   /// Constructor
   ChunkedStorage() = default;

   /// Destructor
   ~ChunkedStorage() {
      freeChunks();
   }

   /// Move constructor, leaves other empty
   ChunkedStorage(ChunkedStorage&& other) noexcept : frontChunk(other.frontChunk), backChunk(other.backChunk), numElements(other.numElements) {
      other.frontChunk = nullptr;
      other.backChunk = nullptr;
      other.numElements = 0;
   }

   /// Move assignment, destroys the current elements and leaves other empty
   ChunkedStorage& operator=(ChunkedStorage&& other) noexcept {
      if (this == &other)
         return *this;

      freeChunks();

      frontChunk = other.frontChunk;
      backChunk = other.backChunk;
      numElements = other.numElements;
      other.frontChunk = nullptr;
      other.backChunk = nullptr;
      other.numElements = 0;

      return *this;
   }

   /// Get the number of elements stored in this ChunkedStorage
   size_type size() const { return numElements; }

   /// Emplace a value at the end. The returned reference stays valid until
   /// the storage is destroyed or assigned to.
   template <typename... Args>
   T& emplace_back(Args&&... args) {
      if (!backChunk || backChunk->numElements == backChunk->maxNumElements())
         addChunk();

      T* ptr = backChunk->getElements() + backChunk->numElements;
      new (ptr) T(std::forward<Args>(args)...);
      // Only count the element after it was constructed successfully
      ++(backChunk->numElements);
      ++numElements;
      return *ptr;
   }

   /// Merge another ChunkedStorage into this by appending its chunk list.
   /// No element is moved, so references into both storages stay valid.
   /// Afterwards other is empty.
   void merge(ChunkedStorage&& other) noexcept {
      if (!other.frontChunk)
         return;
      if (!backChunk) {
         *this = std::move(other);
         return;
      }
      backChunk->next = other.frontChunk;
      backChunk = other.backChunk;
      numElements += other.numElements;
      other.frontChunk = nullptr;
      other.backChunk = nullptr;
      other.numElements = 0;
   }

   /// Get the iterator to the first element
   iterator begin() {
      return iterator(frontChunk, 0);
   }
   /// Get the iterator to the first element
   const_iterator begin() const {
      return const_iterator(frontChunk, 0);
   }
   /// Get the end iterator
   iterator end() {
      return iterator(nullptr, 0);
   }
   /// Get the end iterator
   const_iterator end() const {
      return const_iterator(nullptr, 0);
   }

   /// Get a parallel iterator
   parallel_iterator parallelIter() {
      return parallel_iterator(frontChunk);
   }
   /// Get a parallel iterator
   parallel_const_iterator parallelIter() const {
      return parallel_const_iterator(frontChunk);
   }
};
//---------------------------------------------------------------------------
template <typename T1, typename T2>
double distance(const T1& a, const T2& b)
// Squared euclidean distance between two points. Both arguments only need to
// expose `x` and `y` members; no square root is taken.
{
   const double deltaX = b.x - a.x;
   const double deltaY = b.y - a.y;
   return deltaX * deltaX + deltaY * deltaY;
}
//---------------------------------------------------------------------------
/// A helper class to to reservoir sampling
template <typename T>
class ReservoirSample {
   private:
   /// The actual sample
   vector<T> sample;
   /// The sample size
   uint64_t limit;
   /// The number of tuples seen for sampling
   uint64_t elementsSeen;
   /// The random engine
   mt19937_64 mt;
   /// The distribution for random numbers
   uniform_real_distribution<double> doubleDist;
   /// The distribution for random slots
   uniform_int_distribution<uint64_t> slotDist;
   /// The number of elements to skip
   uint64_t skip;
   /// The W of Li's algorithm L
   double w;

   public:
   /// Constructor
   ReservoirSample(uint64_t sampleSize, uint64_t seed)
      : sample(sampleSize), limit(sampleSize), elementsSeen(0), mt(seed), doubleDist(0.0, 1.0), slotDist(0, sampleSize - 1) {
      // Calculate initial skip after algorithm l https://doi.org/10.1145/198429.198435
      w = exp(log(doubleDist(mt)) / limit);
      skip = static_cast<uint64_t>(floor(log(doubleDist(mt)) / log(1.0 - w)));
   }

   /// Set the number of tuples that were seen for this sample
   void setElementsSeen(uint64_t n) {
      elementsSeen = n;
   }

   /// Get the sample
   span<T> getSample() {
      return sample;
   }

   /// Get random index for reservoir slot
   uint64_t getRandomSlot() {
      // Calculate next step after algorithm l https://doi.org/10.1145/198429.198435
      if (skip == 0) {
         w *= exp(log(doubleDist(mt)) / limit);
         skip = static_cast<uint64_t>(floor(log(doubleDist(mt)) / log(1.0 - w)));
         return slotDist(mt);
      }
      skip--;
      return limit + skip;
   }

   /// Combine two reservoirs keeping uniformity
   void mergeInto(ReservoirSample& target) {
      if (elementsSeen == 0)
         return;

      if (target.elementsSeen < limit && elementsSeen < limit) {
         // We have two incomplete samples. We just complete the sample of the
         // target by using the samples of the source as individual tuples.
         uint64_t copySamples = min(limit - target.elementsSeen, elementsSeen);
         move(sample.begin(), sample.begin() + copySamples, target.sample.begin() + target.elementsSeen);
         target.elementsSeen += copySamples;
         elementsSeen -= copySamples;

         if (elementsSeen == 0)
            return;
      }

      // If either the source or the target does not have a full sample, we have
      // to special case this to make sure the merged sample is still uniform.
      if (target.elementsSeen < limit || elementsSeen < limit) {
         auto* mergeSource = this;
         auto* mergeTarget = &target;

         // When this operator already has a full sample but the target doesn't,
         // we instead merge the target into the source which makes it easier to
         // keep uniformity.
         if (target.elementsSeen < limit && elementsSeen >= limit) {
            mergeSource = &target;
            mergeTarget = this;
         }

         // Treat the source as individual new tuples and use the regular sampling
         // logic to add them to the target. At this point we know that the target
         // is definitely full.
         // Use algorithm R to merge the remaining tuples
         for (uint64_t i = 0; i < mergeSource->elementsSeen; ++i) {
            auto dist = uniform_int_distribution<uint64_t>(0, mergeTarget->elementsSeen + i);
            auto sampleIndex = dist(mt);
            if (sampleIndex < limit)
               mergeTarget->sample[sampleIndex] = move(mergeSource->sample[i]);
         }

         // If we swapped source and target, we need to copy the samples back to the target.
         if (target.elementsSeen < limit && elementsSeen >= limit)
            move(mergeTarget->sample.begin(), mergeTarget->sample.end(), mergeSource->sample.begin());
      } else {
         // Do a regular merge of two full samples.
         auto dist = uniform_int_distribution<uint64_t>(1, elementsSeen + target.elementsSeen);
         for (auto i = 0u; i < limit; i++)
            if (dist(mt) <= elementsSeen)
               target.sample[i] = move(sample[i]);
      }

      target.elementsSeen += elementsSeen;
   }
};
//---------------------------------------------------------------------------
// The k-means Operator
//
// consume() stores all input tuples in per-worker storages and samples the
// initial cluster centers. extraWork() then runs a state machine (see
// Operation) that alternates between associating the points with their
// nearest center and recalculating the centers as the mean of their points.
// Finally postProduce() emits all tuples together with their cluster id.
//
// NOTE(review): the prepare*/finish* steps use prepareMutex to let only one
// worker do the actual work and assume that the framework does not start the
// next step before all workers returned from the current one -- confirm
// against the UDOperator documentation.
class KMeans : public udo::UDOperator<Input, Output> {
   private:
   /// Possible operation types, i.e. the steps of the state machine that is
   /// driven by extraWork()
   enum Operation : uint32_t {
      PrepareInitializeClusters = 0,
      FinishInitializeClusters,
      PrepareAssociatePoints,
      AssociatePoints,
      FinishAssociatePoints,
      PrepareRecalculateMeans,
      RecalculateMeans,
      FinishRecalculateMeans,
      PrepareWriteOutput,
      WriteOutput = extraWorkDone,
   };

   /// The local state in consume()
   struct ConsumeLocalState {
      /// The tuple storage for this worker.
      ChunkedStorage<Output> tuples;
      /// The sample for this worker
      ReservoirSample<Output*> sample;
      /// The next local state
      ConsumeLocalState* next = nullptr;

      /// Constructor
      ConsumeLocalState(size_t sampleSize, uint64_t seed) : sample(sampleSize, seed) {}
   };

   /// A cluster center
   struct ClusterCenter {
      /// The x coordinate
      double x;
      /// The y coordinate
      double y;
   };

   /// A cluster center that also tracks the number of points per cluster.
   /// While the means are recalculated, x and y hold the sums of the
   /// coordinates of all points of the cluster.
   struct LocalClusterCenter {
      /// The x coordinate
      double x;
      /// The y coordinate
      double y;
      /// The number of points
      uint64_t numPoints;
   };

   /// One element of the linked list that contains all local cluster centers
   /// in recalculateMeans
   struct LocalClustersEntry {
      /// The cluster centers
      vector<LocalClusterCenter> centers;
      /// The next entry
      LocalClustersEntry* next = nullptr;
   };

   /// How many tuples should be passed to produceOutputTuple in every call of postProduce()
   static constexpr uint64_t morselSize = 10000;
   /// The number of recalculations of the means after which the clustering stops
   static constexpr unsigned maxIterations = 10;
   /// The number of clusters
   unsigned numClusters = 8;
   /// The storage for all tuples
   ChunkedStorage<Output> tuples;
   /// The local states in consume
   atomic<ConsumeLocalState*> consumeLocalStateList = nullptr;
   /// The cluster centers
   vector<ClusterCenter> centers;
   /// The linked list of local cluster centers used in recalculateMeans
   atomic<LocalClustersEntry*> localClusterCentersList = nullptr;
   /// The mutex flag for the prepare steps of the operations
   atomic_flag prepareMutex = false;
   /// The number of iterations
   unsigned numIterations = 0;
   /// The number of points that changed their cluster
   atomic<size_t> numChangedPoints;
   /// The parallel iterator that is used to iterate through the tuples.
   decltype(tuples.parallelIter()) tuplesIter;

   public:
   /// Constructor
   KMeans() {
      centers.resize(numClusters);
   }

   /// Destructor
   ~KMeans() {
      // Make sure that the local states are cleaned up in case the query was
      // aborted early.
      for (auto* localState = consumeLocalStateList.load(); localState;) {
         unique_ptr<ConsumeLocalState> localStatePtr(localState);
         localState = localStatePtr->next;
      }
      for (auto* localState = localClusterCentersList.load(); localState;) {
         unique_ptr<LocalClustersEntry> localStatePtr(localState);
         localState = localStatePtr->next;
      }
   }

   /// Consume an input tuple: store it in the storage of this worker and
   /// offer it to the worker's sample of initial cluster centers.
   void consume(LocalState& rawLocalState, const Input& input) {
      auto*& localState = reinterpret_cast<ConsumeLocalState*&>(rawLocalState.data);
      if (!localState) {
         // First tuple of this worker: create its local state and push it to
         // the front of the shared list.
         auto newLocalState = make_unique<ConsumeLocalState>(numClusters, udo::getRandom());
         newLocalState->next = consumeLocalStateList.load();
         while (!consumeLocalStateList.compare_exchange_weak(newLocalState->next, newLocalState.get()))
            ;

         localState = newLocalState.get();
         // This will be deallocated in PrepareInitializeClusters
         newLocalState.release();
      }

      Output tuple;
      tuple.x = input.x;
      tuple.y = input.y;
      tuple.payload = input.payload;
      tuple.clusterId = 0;
      auto& insertedTuple = localState->tuples.emplace_back(tuple);

      // The first numClusters tuples fill the sample directly, afterwards the
      // reservoir decides whether a tuple replaces a sampled one. Storing
      // pointers is fine as the tuple storage has stable references.
      if (auto numTuples = localState->tuples.size(); numTuples <= numClusters)
         localState->sample.getSample()[numTuples - 1] = &insertedTuple;
      else if (auto slot = localState->sample.getRandomSlot(); slot < numClusters)
         localState->sample.getSample()[slot] = &insertedTuple;
   }

   private:
   /// Prepare the initialization of clusters after all input points were seen.
   /// Aborts the process if there are fewer points than clusters.
   Operation prepareInitializeClusters() {
      if (!prepareMutex.test_and_set()) {
         // Merge the tuples and samples of all workers
         ReservoirSample<Output*> mergedSample(numClusters, 0);
         for (auto* consumeLocalState = consumeLocalStateList.exchange(nullptr); consumeLocalState;) {
            unique_ptr<ConsumeLocalState> localStatePtr(consumeLocalState);

            // The number of seen tuples must be set before the tuples are
            // moved out of the local state.
            localStatePtr->sample.setElementsSeen(localStatePtr->tuples.size());
            tuples.merge(move(localStatePtr->tuples));
            localStatePtr->sample.mergeInto(mergedSample);

            consumeLocalState = consumeLocalState->next;
         }

         if (tuples.size() < numClusters) {
            udo::printDebug("less points than clusters, aborting\n");
            abort();
         }

         // Write the sampled points into the cluster centers
         auto sample = mergedSample.getSample();
         for (unsigned i = 0; i < numClusters; ++i) {
            centers[i].x = sample[i]->x;
            centers[i].y = sample[i]->y;
         }
      }
      return FinishInitializeClusters;
   }

   /// Determine the next operation after cluster centers were initialized
   Operation finishInitializeClusters() {
      prepareMutex.clear();
      if (tuples.size() < numClusters)
         return WriteOutput;
      else
         return PrepareAssociatePoints;
   }

   /// Prepare the associate points operation
   Operation prepareAssociatePoints() {
      if (!prepareMutex.test_and_set()) {
         numChangedPoints.store(0);
         tuplesIter = tuples.parallelIter();
      }
      return AssociatePoints;
   }

   /// Associate the points to the cluster centers. Every call handles one
   /// chunk of tuples.
   Operation associatePoints() {
      auto tuples = tuplesIter.next();
      if (!tuples)
         return FinishAssociatePoints;

      size_t localNumChangedPoints = 0;
      for (auto& tuple : *tuples) {
         // Find the nearest cluster center, the first one wins on ties
         uint16_t bestClusterId = 0;
         double currentDistance = distance(tuple, centers[0]);
         for (uint16_t i = 1; i < numClusters; ++i) {
            double newDistance = distance(tuple, centers[i]);
            if (newDistance < currentDistance) {
               bestClusterId = i;
               currentDistance = newDistance;
            }
         }
         if (bestClusterId != tuple.clusterId) {
            tuple.clusterId = bestClusterId;
            ++localNumChangedPoints;
         }
      }
      // Update the shared counter only once per chunk
      numChangedPoints.fetch_add(localNumChangedPoints);
      return AssociatePoints;
   }

   /// Decide whether to continue or not after associating points
   Operation finishAssociatePoints() {
      prepareMutex.clear();
      //XXX if (numChangedPoints.load() <= tuples.size() / 1000) {
      if (numIterations == maxIterations) {
         return PrepareWriteOutput;
      } else {
         return PrepareRecalculateMeans;
      }
   }

   /// Prepare the recalculate means operation
   Operation prepareRecalculateMeans() {
      if (!prepareMutex.test_and_set()) {
         tuplesIter = tuples.parallelIter();
         ++numIterations;
      }
      return RecalculateMeans;
   }

   /// Calculate the means of the clusters. Every call adds the points of one
   /// chunk of tuples to the local sums of this worker.
   Operation recalculateMeans(LocalState& localState) {
      auto*& localClusters = reinterpret_cast<LocalClustersEntry*&>(localState.data);
      // NOTE(review): the entry is freed in finishRecalculateMeans while
      // localState.data still points to it. This presumably relies on the
      // framework resetting the local state between steps -- confirm.
      if (!localClusters) {
         auto newLocalClusters = make_unique<LocalClustersEntry>();
         newLocalClusters->centers.resize(numClusters);
         newLocalClusters->next = localClusterCentersList.load();
         while (!localClusterCentersList.compare_exchange_weak(newLocalClusters->next, newLocalClusters.get()))
            ;

         localClusters = newLocalClusters.get();
         // This will be deallocated in FinishRecalculateMeans
         newLocalClusters.release();
      }

      auto tuples = tuplesIter.next();
      if (!tuples)
         return FinishRecalculateMeans;

      for (auto& tuple : *tuples) {
         auto& cluster = localClusters->centers[tuple.clusterId];
         cluster.x += tuple.x;
         cluster.y += tuple.y;
         ++cluster.numPoints;
      }

      return RecalculateMeans;
   }

   /// Switch to associate points after recalculating means. The worker that
   /// gets hold of the list of local sums merges them into the new centers.
   Operation finishRecalculateMeans() {
      auto* localEntry = localClusterCentersList.exchange(nullptr);
      if (!localEntry)
         return PrepareAssociatePoints;

      prepareMutex.clear();

      // Loop over the local cluster centers and sum them up
      vector<LocalClusterCenter> mergedClusters(numClusters);
      while (localEntry) {
         unique_ptr<LocalClustersEntry> entryPtr(localEntry);
         for (unsigned i = 0; i < numClusters; ++i) {
            auto& mergedCenter = mergedClusters[i];
            auto& localCenter = entryPtr->centers[i];
            mergedCenter.x += localCenter.x;
            mergedCenter.y += localCenter.y;
            mergedCenter.numPoints += localCenter.numPoints;
         }
         localEntry = entryPtr->next;
      }

      // Write out the new cluster centers
      for (unsigned i = 0; i < numClusters; ++i) {
         auto& mergedCenter = mergedClusters[i];
         // A cluster without points keeps its previous center. Dividing by
         // zero would make the center NaN, and as every comparison with NaN is
         // false such a cluster could never get a point again.
         if (mergedCenter.numPoints == 0)
            continue;
         centers[i].x = mergedCenter.x / mergedCenter.numPoints;
         centers[i].y = mergedCenter.y / mergedCenter.numPoints;
      }

      return PrepareAssociatePoints;
   }

   /// Prepare to output the tuples
   Operation prepareWriteOutput() {
      if (!prepareMutex.test_and_set()) {
         ++numIterations;
         tuplesIter = tuples.parallelIter();
      }
      return WriteOutput;
   }

   public:
   /// Do extra work: execute the given step of the state machine and return
   /// the step that should be executed next.
   uint32_t extraWork(LocalState& localState, uint32_t step) {
      switch (static_cast<Operation>(step)) {
         case PrepareInitializeClusters:
            return static_cast<uint32_t>(prepareInitializeClusters());
         case FinishInitializeClusters:
            return static_cast<uint32_t>(finishInitializeClusters());
         case PrepareAssociatePoints:
            return static_cast<uint32_t>(prepareAssociatePoints());
         case AssociatePoints:
            return static_cast<uint32_t>(associatePoints());
         case FinishAssociatePoints:
            return static_cast<uint32_t>(finishAssociatePoints());
         case PrepareRecalculateMeans:
            return static_cast<uint32_t>(prepareRecalculateMeans());
         case RecalculateMeans:
            return static_cast<uint32_t>(recalculateMeans(localState));
         case FinishRecalculateMeans:
            return static_cast<uint32_t>(finishRecalculateMeans());
         case PrepareWriteOutput:
            return static_cast<uint32_t>(prepareWriteOutput());
         case WriteOutput:
            return static_cast<uint32_t>(WriteOutput);
      }
      __builtin_unreachable();
   }

   /// Produce the output. Every call emits the tuples of one chunk; returns
   /// true as soon as there are no more chunks left.
   bool postProduce(LocalState& /*localState*/) {
      auto tuples = tuplesIter.next();
      if (tuples) {
         for (auto& tuple : *tuples)
            produceOutputTuple(tuple);
         return false;
      } else {
         return true;
      }
   }
};
//---------------------------------------------------------------------------
#ifdef UDO_STANDALONE
//---------------------------------------------------------------------------
static size_t getNumThreads()
/// Count the CPUs in the affinity mask that sched_getaffinity reports for
/// pid 0. Yields ~0ull if the mask cannot be queried.
{
   ::cpu_set_t affinityMask = {};
   const bool maskAvailable = ::sched_getaffinity(0, sizeof(affinityMask), &affinityMask) == 0;
   return maskAvailable ? static_cast<size_t>(CPU_COUNT(&affinityMask)) : static_cast<size_t>(~0ull);
}
//---------------------------------------------------------------------------
/// Entry point of the standalone k-means runner.
///
/// Usage: <program> [--full-output] [--benchmark] <input file>
///
/// The input file is expected to have one header line followed by lines of
/// the form `x,y,payload`. Fields that cannot be parsed are read as zero.
/// Without options the number of points per cluster is printed; with
/// --full-output every point is printed with its cluster id; with --benchmark
/// the operator is run 11 times and the duration of all but the first run is
/// printed in nanoseconds.
///
/// Returns 0 on success, 1 if the input file cannot be opened and 2 on
/// invalid arguments.
int main(int argc, const char** argv) {
   bool argError = false;
   bool fullOutput = false;
   bool benchmark = false;
   string_view inputFileName;

   // Start at 1 to skip the program name. Using an index also makes sure that
   // we never walk past the end of argv when argc is 0.
   for (int argIndex = 1; argIndex < argc; ++argIndex) {
      string_view arg(argv[argIndex]);
      if (arg.empty())
         continue;
      if (arg == "--full-output") {
         fullOutput = true;
      } else if (arg == "--benchmark") {
         benchmark = true;
      } else {
         if (inputFileName.empty()) {
            inputFileName = arg;
         } else {
            // Only one input file is supported
            argError = true;
            break;
         }
      }
   }

   if (!argError && inputFileName.empty())
      argError = true;

   if (argError) {
      cerr << "Usage: " << (argc > 0 ? argv[0] : "udo_kmeans") << " [--full-output] [--benchmark] <input file>" << std::endl;
      return 2;
   }

   // The file streams cannot be opened with a string_view, so copy the name
   const string inputPath(inputFileName);
   ifstream inputFile(inputPath);
   if (!inputFile) {
      cerr << "Unable to open input file: " << inputPath << std::endl;
      return 1;
   }

   // Discard the header line
   {
      string header;
      getline(inputFile, header);
   }

   vector<Input> inputs;

   while (inputFile) {
      // Value-initialize the tuple so that fields that cannot be parsed have
      // a defined value.
      Input i{};
      string field;

      getline(inputFile, field, ',');
      if (!inputFile)
         break;
      i.x = strtod(field.c_str(), nullptr);
      getline(inputFile, field, ',');
      if (!inputFile)
         break;
      i.y = strtod(field.c_str(), nullptr);
      getline(inputFile, field);
      if (!inputFile)
         break;
      from_chars(field.data(), field.data() + field.size(), i.payload);

      inputs.push_back(i);
   }

   vector<Output> outputs(inputs.size());

   if (benchmark) {
      for (unsigned i = 0; i < 11; ++i) {
         udo::UDOStandalone<KMeans> standalone(getNumThreads(), 10000);
         KMeans kMeans;

         auto start = chrono::steady_clock::now();
         standalone.run(kMeans, inputs, outputs);
         auto end = chrono::steady_clock::now();
         auto durationNs = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
         // Don't measure the first run
         if (i > 0)
            cout << durationNs << '\n';
      }
   } else {
      udo::UDOStandalone<KMeans> standalone(getNumThreads(), 10000);
      KMeans kMeans;
      standalone.run(kMeans, inputs, outputs);

      if (fullOutput) {
         for (auto& output : standalone.getOutput())
            cout << output.x << ',' << output.y << ',' << output.payload << ',' << output.clusterId << '\n';
      } else {
         // Always report at least 8 clusters, but grow the table instead of
         // writing out of bounds if a larger cluster id shows up.
         vector<size_t> clusterCounts(8);
         for (auto& output : standalone.getOutput()) {
            if (output.clusterId >= clusterCounts.size())
               clusterCounts.resize(static_cast<size_t>(output.clusterId) + 1);
            ++clusterCounts[output.clusterId];
         }

         for (size_t i = 0; i < clusterCounts.size(); ++i)
            cout << i << ": " << clusterCounts[i] << '\n';
      }
   }

   return 0;
}
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------