From 57da0d4e88c2aae3e060a5f36cf92bd6dfc14eb4 Mon Sep 17 00:00:00 2001
From: Ziheng Wang
Date: Tue, 4 Oct 2022 09:36:37 -0700
Subject: [PATCH] 0.0.8

---
 apps/tpc-h/tpch.py | 2 +-
 docs/docs/simple.md | 38 ++++++++++++++++++++---------
 docs/docs/started.md | 17 ++++++++-----
 docs/site/index.html | 2 +-
 docs/site/search/search_index.json | 2 +-
 docs/site/simple/index.html | 27 +++++++++++---------
 docs/site/sitemap.xml | 18 +++++++-------
 docs/site/sitemap.xml.gz | Bin 202 -> 202 bytes
 docs/site/started/index.html | 15 +++++++-----
 pyquokka/dataset.py | 10 ++------
 pyquokka/datastream.py | 2 +-
 pyquokka/logical.py | 5 +++-
 pyquokka/nodes.py | 2 +-
 pyquokka/quokka_runtime.py | 19 +++++++------
 pyquokka/redis.conf | 2 +-
 pyquokka/utils.py | 10 ++----
 setup.py | 12 ++++-----
 17 files changed, 106 insertions(+), 77 deletions(-)

diff --git a/apps/tpc-h/tpch.py b/apps/tpc-h/tpch.py
index 92414a1..d8b7657 100644
--- a/apps/tpc-h/tpch.py
+++ b/apps/tpc-h/tpch.py
@@ -1,7 +1,7 @@
 from pyquokka.df import *
 from pyquokka.utils import LocalCluster, QuokkaClusterManager
 from schema import *
-mode = "DISK"
+mode = "S3"
 format = "parquet"
 disk_path = "/home/ziheng/tpc-h/"
 #disk_path = "s3://yugan/tpc-h-out/"
diff --git a/docs/docs/simple.md b/docs/docs/simple.md
index 6fe984c..0af1511 100644
--- a/docs/docs/simple.md
+++ b/docs/docs/simple.md
@@ -2,7 +2,7 @@ This section is for learning how to use Quokka's DataStream API.
**Quokka's DataStream API is basically a dataframe API.** It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to "collect" the result, just like Spark.
-For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data [here](https://drive.google.com/file/d/19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO/view?usp=sharing). It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. The SQL queries themselves can be found on this awesome [interface](https://umbra-db.com/interface/).
+For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data [here](https://drive.google.com/file/d/19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO/view?usp=sharing). It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. If you are testing this on a VM where you can't click the link, try this command after pip installing gdown: `~/.local/bin/gdown https://drive.google.com/uc?id=19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO`. The SQL queries themselves can be found on this awesome [interface](https://umbra-db.com/interface/).
These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- **if you find a bug in this tutorial I will buy you a cup of coffee!**
@@ -89,13 +89,10 @@ This is how you would write it in Quokka.
This is very similar to how you'd writ
~~~python
def do_1():
-
    d = lineitem.filter("l_shipdate <= date '1998-12-01' - interval '90' day")
    d = d.with_column("disc_price", lambda x: x["l_extendedprice"] * (1 - x["l_discount"]), required_columns ={"l_extendedprice", "l_discount"})
    d = d.with_column("charge", lambda x: x["l_extendedprice"] * (1 - x["l_discount"]) * (1 + x["l_tax"]), required_columns={"l_extendedprice", "l_discount", "l_tax"})
-
    f = d.groupby(["l_returnflag", "l_linestatus"], orderby=["l_returnflag","l_linestatus"]).agg({"l_quantity":["sum","avg"], "l_extendedprice":["sum","avg"], "disc_price":"sum", "charge":"sum", "l_discount":"avg","*":"count"})
-
    return f.collect()
~~~
@@ -122,15 +119,11 @@ When you call `.collect()`, the logical plan you have built is actually optimize
Joins work very intuitively. For example, this is how to do [TPC-H query 12](https://github.com/dragansah/tpch-dbgen/blob/master/tpch-queries/12.sql).
~~~python
def do_12():
-
    d = lineitem.join(orders,left_on="l_orderkey", right_on="o_orderkey")
-
    d = d.filter("l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \
l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'")
-
    d = d.with_column("high", lambda x: (x["o_orderpriority"] == "1-URGENT") | (x["o_orderpriority"] == "2-HIGH"), required_columns={"o_orderpriority"})
    d = d.with_column("low", lambda x: (x["o_orderpriority"] != "1-URGENT") & (x["o_orderpriority"] != "2-HIGH"), required_columns={"o_orderpriority"})
-
    f = d.groupby("l_shipmode").aggregate(aggregations={'high':['sum'], 'low':['sum']})
    return f.collect()
~~~
@@ -143,7 +136,6 @@ def do_3():
    d = customer.join(d,left_on="c_custkey", right_on="o_custkey")
    d = d.filter("c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'")
    d = d.with_column("revenue", lambda x: x["l_extendedprice"] * ( 1 - x["l_discount"]) , required_columns={"l_extendedprice", "l_discount"})
-
    f = d.groupby(["l_orderkey","o_orderdate","o_shippriority"]).agg({"revenue":["sum"]})
    return f.collect()
~~~
@@ -155,10 +147,34 @@ An important thing to note is that Quokka currently only supports inner joins. Ot
Feel free to look at some other queries in the Quokka [github](https://github.com/marsupialtail/quokka/tree/master/apps), or browse the [API reference](datastream.md). While you are there, please give Quokka a star!
##Lesson 2: Writing Things
-So far, we have just learned about
+So far, we have just learned how to read things into DataStreams and do things to DataStreams. You can also write out DataStreams to persistent storage like disk or S3 to record all the amazing things you did with them.
+
+Quokka currently operates like Spark and by default writes out a directory of files, with a default maximum file size for each file format. This makes it easy to perform the writes in parallel.
+
+To write out a DataStream as CSV or Parquet files to a local directory (you must specify a valid absolute path), simply do:
+
+~~~python
+d.write_csv("/home/ubuntu/test-path/")
+d.write_parquet("/home/ubuntu/test-path/")
+~~~
+
+To write out a DataStream to S3, specify an S3 bucket and prefix like this:
+
+~~~python
+d.write_csv("s3://bucket/prefix/")
+d.write_parquet("s3://bucket/prefix/")
+~~~
+
+Writing out a DataStream is a blocking API that automatically calls `collect()` for you. The Polars DataFrame you get back is just a single column containing the filenames that were produced.
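For example, continuing with a filtered DataStream `d` like the ones above, you can capture the returned manifest like this (a minimal sketch; the output directory is hypothetical and should exist beforehand):

~~~python
# d is any DataStream, e.g. d = lineitem.filter("l_orderkey < 10 and l_partkey > 5")
# write_parquet is blocking: it executes the plan (an implicit collect()) and
# returns a Polars DataFrame with a single column listing the output files.
files = d.write_parquet("/home/ubuntu/test-path/")
print(files)   # one row per Parquet file written

# the same pattern works for CSVs and for S3 destinations
files = d.write_csv("s3://bucket/prefix/")
~~~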
##Lesson 3: Things you can't do.
Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact that it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped.
-Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation.
\ No newline at end of file
+Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation. You cannot safely emit the sum of a column until you have seen every row! The main examples of this in data processing are groupby-aggregations and distributed sorts.
+
+Currently, calling `groupby().agg()` or just `agg()` on a DataStream will produce another DataStream. However, that DataStream will consist of exactly one batch, which holds the final result and is emitted only once it has been computed. It is recommended to just call `collect()` or `compute()` on that result.
+
+Quokka currently does not support distributed sort -- indeed, a sort-heavy workload is one where Spark really shines. Distributed sorting is not strictly needed for many analytical SQL workloads, since you typically do the aggregation before the order by, which greatly reduces the number of rows you have to sort. You can then sort after you have done `collect()`. However, for many other workloads distributed sorting is critical, and Quokka aims to support it as soon as possible.
+
+Things that Quokka could do but doesn't do yet: fine-grained placement of UDFs or UDAFs on GPUs or CPUs, core-count control, Docker support, reading JSON, etc. Most of these can be easily implemented (and some already are) in the graph-level API; however, it takes effort to figure out the best abstractions to expose in the DataStream API. If you want to make this list shorter, I welcome contributions: zihengw@stanford.edu.
\ No newline at end of file
diff --git a/docs/docs/started.md b/docs/docs/started.md
index d4b9bc4..df0c6b3 100644
--- a/docs/docs/started.md
+++ b/docs/docs/started.md
@@ -33,17 +33,22 @@ Quokka can be installed as a pip package:
~~~bash
pip3 install pyquokka
~~~
-However it needs the latest version of Redis (at least 6.0), which you can get by running the following in sudo:
+
+**However, it needs the latest version of Redis (at least 7.0)**, which you can get by running the following:
~~~bash
-curl https://packages.redis.io/gpg | apt-key add -
-echo "deb https://packages.redis.io/deb $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/redis.list
-apt-get update
-apt-get install redis
+curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg
+
+echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/redis.list
+
+sudo apt-get update
+sudo apt-get install redis
~~~
If you only plan on running Quokka locally, you are done. Here is a [10 min lesson](simple.md) on how it works.
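For a quick local sanity check after installing, the following sketch (assuming a local `lineitem.csv` from the tutorial data; the path is hypothetical) runs an aggregation and, per the advice in the tutorial above, sorts the small collected result in Polars rather than in Quokka:

~~~python
from pyquokka.df import *   # the docs use a star import; this provides QuokkaContext

qc = QuokkaContext()        # local execution, no cluster required

lineitem = qc.read_csv("lineitem.csv")   # hypothetical local copy of the TPC-H lineitem table
d = lineitem.filter("l_shipdate <= date '1998-12-01' - interval '90' day")
f = d.groupby(["l_returnflag", "l_linestatus"]).agg({"l_quantity": ["sum", "avg"], "*": "count"})

result = f.collect()                 # the aggregation emits a single batch; collect() returns a Polars DataFrame
print(result.sort("l_returnflag"))   # sort the small result locally after collect()
~~~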
-If you plan on using Quokka for cloud, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under `pyquokka.utils` which allows you to manager clusters and connect to them. It assumes that awscli is configured locally and you have a keypair and a security group with the proper configurations. To set these things up, you can follow the [AWS guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html).
+If you are planning on reading files from S3, you need to install the awscli and have your credentials set up.
+
+If you plan on using Quokka for cloud by launching EC2 clusters, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under `pyquokka.utils` which allows you to manage clusters and connect to them. It assumes that awscli is configured locally and that you have a keypair and a security group with the proper configurations. To set these things up, you can follow the [AWS guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html). More detailed instructions can be found in [Setting Up Cloud Cluster](cloud.md).
diff --git a/docs/site/index.html b/docs/site/index.html
index 865b71c..d121029 100644
--- a/docs/site/index.html
+++ b/docs/site/index.html
@@ -172,5 +172,5 @@

Contact
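To make the EC2 workflow described in started.md above concrete, here is a sketch that mirrors the calls documented in the Setting Up Cloud Cluster guide; the key name, pem path, security group ID, and AWS credentials are placeholders you must replace:

~~~python
from pyquokka.utils import QuokkaClusterManager

aws_access_key = "YOUR_AWS_ACCESS_KEY"   # placeholder credentials, named as in the Quokka docs
aws_access_id = "YOUR_AWS_ACCESS_ID"     # placeholder

# placeholders: substitute your own key pair, pem path, and security group
manager = QuokkaClusterManager(key_name="my-key", key_location="/home/ubuntu/my-key.pem", security_group="sg-0123456789abcdef0")

# first run: launch four on-demand i3.2xlarge instances, install extra packages, and save the config
cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances=4, instance_type="i3.2xlarge", requirements=["pytorch"])
cluster.to_json("config.json")

# later runs: reconnect to the same fully-stopped or fully-running cluster from the saved config
cluster = manager.get_cluster_from_json("config.json")
~~~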

diff --git a/docs/site/search/search_index.json b/docs/site/search/search_index.json index 546b1ba..ccae684 100644 --- a/docs/site/search/search_index.json +++ b/docs/site/search/search_index.json @@ -1 +1 @@ -{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"If you like, please: Introduction Quokka is a lightweight distributed dataflow engine written completely in Python targeting modern data science use cases involving 100GBs to TBs of data. At its core, Quokka manipulates streams of data with stateful actors. Quokka offers a stream-centric, Python-native perspective to tasks commonly done today by Spark. Please see the Getting Started for further details. This streaming paradigm inspired by high performance databases such as DuckDB and Snowflake allows Quokka to greatly outperform Apache Spark performance on SQL type workloads reading from cloud blob storage like S3 for formats like CSV and Parquet. Fineprint: benchmark done using four c5.4xlarge instances for Quokka and EMR 6.5.0 with five c5.4xlarge instances for Spark where one instance is used as a coordinator. Ignores initialization costs which are generally comparable between Quokka and Spark. What's even better than being cheap and fast is the fact that since Quokka is Python native, you can easily use your favorite machine learning libraries like Scikit-Learn and Pytorch with Quokka inside of arbitrary Python functions to transform your DataStreams. Another great advantage is that a streaming data paradigm is more in line with how data arrives in the real world, making it easy to bridge your data application to production, or conduct time-series backfilling on your historical data. You develop with Quokka locally, and deploy to cloud (currently AWS) with a single line of code change. Quokka is specifically designed for the following workloads. SQLish data engineering workloads on data lake. You can try Quokka if you want to speed up some Spark data jobs, or if you want to implement \"stateful Python UDFs\" in your SQL pipeline, which is kind of a nightmare in Spark. (e.g. forward computing some feature based on historical data) Quokka can also typically achieve much better performance than Spark on pure SQL workloads when input data comes from cloud storage, especially if the data is in CSV format. The drawback is Quokka currently does not support SQL interface, so you are stuck with a dataframe-like DataStream API. However SQL optimizations such as predicate pushdown and early projection are implemented. (support forthcoming) ML engineering pipelines on large unstructured data datasets. Since Quokka is Python-native, it interfaces perfectly with the Python machine learning ecosystem. No more JVM troubles. Unlike Spark, Quokka also will let you precisely control the placement of your stateful operators on machines, preventing GPU out-of-memory and improving performance by reducing contention. Support for these workloads are still in the works. If you are interested, please drop me a note: zihengw@stanford.edu or Discord . Roadmap Streaming support. Although Quokka follows a streaming model, it currently does not support \"streaming\" computations from Kafka, Kinesis etc. They will soon be supported. This will allow batch data pipelines to be deployed to production with one line code change. Target Q4 2022. Fault tolerance. Currently Quokka's fault tolerance mechanism is experimental. 
Improvements are being made in this direction transparent to the API. Please use on-demand instances for important workloads. (Well if you are planning on using Quokka for important workloads or any workload, please contact me: zihengw@stanford.edu.) The goal is to support Spark-like fault recovery stability by Q1 2023. Full SQL support. I want to be able to do qc.sql(SQL_QUERY). I am working with SQLGlot to make this happen. Target pass TPC-H and say 75% of TPC-DS Q1 2023. Time Series Package. Quokka will support point-in-time joins and asof joins natively by Q4 2022. This will be useful for feature backtesting, etc. Contact If you are interested in trying out Quokka or hit any problems (any problems at all), please contact me at zihengw@stanford.edu or Discord . I will try my best to make Quokka work for you.","title":"Home"},{"location":"#if-you-like-please","text":"","title":"If you like, please: "},{"location":"#introduction","text":"Quokka is a lightweight distributed dataflow engine written completely in Python targeting modern data science use cases involving 100GBs to TBs of data. At its core, Quokka manipulates streams of data with stateful actors. Quokka offers a stream-centric, Python-native perspective to tasks commonly done today by Spark. Please see the Getting Started for further details. This streaming paradigm inspired by high performance databases such as DuckDB and Snowflake allows Quokka to greatly outperform Apache Spark performance on SQL type workloads reading from cloud blob storage like S3 for formats like CSV and Parquet. Fineprint: benchmark done using four c5.4xlarge instances for Quokka and EMR 6.5.0 with five c5.4xlarge instances for Spark where one instance is used as a coordinator. Ignores initialization costs which are generally comparable between Quokka and Spark. What's even better than being cheap and fast is the fact that since Quokka is Python native, you can easily use your favorite machine learning libraries like Scikit-Learn and Pytorch with Quokka inside of arbitrary Python functions to transform your DataStreams. Another great advantage is that a streaming data paradigm is more in line with how data arrives in the real world, making it easy to bridge your data application to production, or conduct time-series backfilling on your historical data. You develop with Quokka locally, and deploy to cloud (currently AWS) with a single line of code change. Quokka is specifically designed for the following workloads. SQLish data engineering workloads on data lake. You can try Quokka if you want to speed up some Spark data jobs, or if you want to implement \"stateful Python UDFs\" in your SQL pipeline, which is kind of a nightmare in Spark. (e.g. forward computing some feature based on historical data) Quokka can also typically achieve much better performance than Spark on pure SQL workloads when input data comes from cloud storage, especially if the data is in CSV format. The drawback is Quokka currently does not support SQL interface, so you are stuck with a dataframe-like DataStream API. However SQL optimizations such as predicate pushdown and early projection are implemented. (support forthcoming) ML engineering pipelines on large unstructured data datasets. Since Quokka is Python-native, it interfaces perfectly with the Python machine learning ecosystem. No more JVM troubles. 
Unlike Spark, Quokka also will let you precisely control the placement of your stateful operators on machines, preventing GPU out-of-memory and improving performance by reducing contention. Support for these workloads are still in the works. If you are interested, please drop me a note: zihengw@stanford.edu or Discord .","title":"Introduction"},{"location":"#roadmap","text":"Streaming support. Although Quokka follows a streaming model, it currently does not support \"streaming\" computations from Kafka, Kinesis etc. They will soon be supported. This will allow batch data pipelines to be deployed to production with one line code change. Target Q4 2022. Fault tolerance. Currently Quokka's fault tolerance mechanism is experimental. Improvements are being made in this direction transparent to the API. Please use on-demand instances for important workloads. (Well if you are planning on using Quokka for important workloads or any workload, please contact me: zihengw@stanford.edu.) The goal is to support Spark-like fault recovery stability by Q1 2023. Full SQL support. I want to be able to do qc.sql(SQL_QUERY). I am working with SQLGlot to make this happen. Target pass TPC-H and say 75% of TPC-DS Q1 2023. Time Series Package. Quokka will support point-in-time joins and asof joins natively by Q4 2022. This will be useful for feature backtesting, etc.","title":"Roadmap"},{"location":"#contact","text":"If you are interested in trying out Quokka or hit any problems (any problems at all), please contact me at zihengw@stanford.edu or Discord . I will try my best to make Quokka work for you.","title":"Contact"},{"location":"api/","text":"API reference First do: from pyquokka.df import * qc = QuokkaContext() If working with S3, do: from pyquokka.df import * manager = QuokkaClusterManager() cluster = manager.get_cluster_from_json(\"config.json\") This assumes you have a cluster saved in config.json. Please refer to the guide here to do this. qc.read_csv","title":"API reference"},{"location":"api/#api-reference","text":"First do: from pyquokka.df import * qc = QuokkaContext() If working with S3, do: from pyquokka.df import * manager = QuokkaClusterManager() cluster = manager.get_cluster_from_json(\"config.json\") This assumes you have a cluster saved in config.json. Please refer to the guide here to do this.","title":"API reference"},{"location":"api/#qcread_csv","text":"","title":"qc.read_csv"},{"location":"cloud/","text":"Setting up Quokka for EC2 To use Quokka for EC2, you need to (at minimum) have an AWS account with permissions to launch instances and create new security groups. You will probably run into issues since everybody's AWS setup is a little bit different, so please email: zihengw@stanford.edu or Discord . Quokka requires a security group that allows inbound and outbound connections to ports 5005 (Flight), 6379 (Ray) and 6800 (Redis) from IP addresses within the cluster. For simplicity, you can just enable all inbound and outbound connections from all IP addresses. The easiest way to make this is to manually create an instance on EC2 through the dashboard, e.g. t2.micro, and manually add rules to the security group EC2 assigns that instance. Then you can either copy that security group to a new group, or keep using that modified security group for Quokka. There must be an automated way to do this in the AWS CLI, but I am too lazy to figure it out. If you want to tell me how to do it, I'll post the steps here and buy you a coffee. You also need to generate a pem key pair. 
The easiest way to do this, again, is to start a t2.micro in the console and using the dashboard. Save the pem key somewhere and write down the absolute path. After you have the security group and you can use the QuokkaClusterManager in pyquokka.utils to spin up a cluster. The code to do this: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 4, instance_type = \"i3.2xlarge\", requirements = [\"pytorch\"]) cluster.to_json(\"config.json\") This would spin up four i3.2xlarge instances and install pytorch on each of them. The QuokkaClusterManager also has other utilities such as launch_all , terminate_cluster and get_cluster_from_json . Importantly, currently only on-demand instances are supported. This will change in the near future. The most interesting utility is probably manager.launch_all(command) , which basically runs a custom command on each machine. You can use this command to massage your cluster into your desired state. In general, all of the machines in your cluster must have all the Python packages you need installed with pip . Importantly, if you are using on demand instances, creating a cluster only needs to happen once. Once you have saved the cluster configs to a json, the next time you want to run a job and use this cluster, you can just do: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.get_cluster_from_json(\"config.json\") This will work if the cluster is either fully stopped or fully running, i.e. every machine must be in either stopped or running state. If the cluster is running, this assumes it was started by running the get_cluster_from_json command! Please do not manually start the instances and try to use get_cluster_from_json to connect to a cluster. Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Of course, there are plans to support GCP and Azure. The best way to make sure that happens is by sending me a message on email or Discord .","title":"Setting Up Cloud Cluster"},{"location":"cloud/#setting-up-quokka-for-ec2","text":"To use Quokka for EC2, you need to (at minimum) have an AWS account with permissions to launch instances and create new security groups. You will probably run into issues since everybody's AWS setup is a little bit different, so please email: zihengw@stanford.edu or Discord . Quokka requires a security group that allows inbound and outbound connections to ports 5005 (Flight), 6379 (Ray) and 6800 (Redis) from IP addresses within the cluster. For simplicity, you can just enable all inbound and outbound connections from all IP addresses. The easiest way to make this is to manually create an instance on EC2 through the dashboard, e.g. t2.micro, and manually add rules to the security group EC2 assigns that instance. Then you can either copy that security group to a new group, or keep using that modified security group for Quokka. There must be an automated way to do this in the AWS CLI, but I am too lazy to figure it out. If you want to tell me how to do it, I'll post the steps here and buy you a coffee. You also need to generate a pem key pair. The easiest way to do this, again, is to start a t2.micro in the console and using the dashboard. 
Save the pem key somewhere and write down the absolute path. After you have the security group and you can use the QuokkaClusterManager in pyquokka.utils to spin up a cluster. The code to do this: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 4, instance_type = \"i3.2xlarge\", requirements = [\"pytorch\"]) cluster.to_json(\"config.json\") This would spin up four i3.2xlarge instances and install pytorch on each of them. The QuokkaClusterManager also has other utilities such as launch_all , terminate_cluster and get_cluster_from_json . Importantly, currently only on-demand instances are supported. This will change in the near future. The most interesting utility is probably manager.launch_all(command) , which basically runs a custom command on each machine. You can use this command to massage your cluster into your desired state. In general, all of the machines in your cluster must have all the Python packages you need installed with pip . Importantly, if you are using on demand instances, creating a cluster only needs to happen once. Once you have saved the cluster configs to a json, the next time you want to run a job and use this cluster, you can just do: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.get_cluster_from_json(\"config.json\") This will work if the cluster is either fully stopped or fully running, i.e. every machine must be in either stopped or running state. If the cluster is running, this assumes it was started by running the get_cluster_from_json command! Please do not manually start the instances and try to use get_cluster_from_json to connect to a cluster. Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Of course, there are plans to support GCP and Azure. The best way to make sure that happens is by sending me a message on email or Discord .","title":"Setting up Quokka for EC2"},{"location":"datastream/","text":"DataStream source DataStream( quokka_context, schema: list, source_node_id: int ) Quokka DataStream class is how most users are expected to interact with Quokka. However users are not expected to create a DataStream directly by calling its constructor. Note that constructor takes an argument called source_node_id , which would confuse most data scientists -- even me! Args quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Attributes quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Methods: .collect source .collect() This will trigger the execution of computational graph, similar to Spark collect(). The result will be a Polars DataFrame on the master Return: Polars DataFrame. 
Examples >>> f = qc.read_csv(\"my_csv.csv\") >>> result = f.collect() # result will be a Polars dataframe, as if you did polars.read_csv(\"my_csv.csv\") .compute source .compute() This will trigger the execution of computational graph, similar to Spark collect The result will be a Quokka DataSet, which you can then call to_df() or call to_stream() to initiate another computation. Return: Quokka Quokka DataSet. Currently this is going to be just a list of objects distributed across the Redis servers on the workers. .explain source .explain( mode = 'graph' ) This will not trigger the execution of your computation graph but will produce a graph of the execution plan. Args mode (str) : 'graph' will show a graph, 'text' will print a textual description. Return: None. .write_csv source .write_csv( table_location, output_line_limit = 1000000 ) This will write out the entire contents of the DataStream to a list of CSVs. This is a blocking operation, and will call collect() under the hood. Args table_lcation (str) : the root directory to write the output CSVs to. Similar to Spark, Quokka by default writes out a directory of CSVs instead of dumping all the results to a single CSV so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : how many rows each CSV in the output should have. The current implementation simply buffers this many rows in memory instead of using file appends, so you should have enough memory! Return: Polars DataFrame containing the filenames of the CSVs that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_csv(\"/home/user/test-out\") # you should create the directory before hand. .write_parquet source .write_parquet( table_location, output_line_limit = 10000000 ) This will write out the entire contents of the DataStream to a list of Parquets. This is a blocking operation, and will call collect() under the hood. By default, each output Parquet file will contain one row group. Args table_lcation (str) : the root directory to write the output Parquets to. Similar to Spark, Quokka by default writes out a directory of Parquets instead of dumping all the results to a single Parquet so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : the row group size in each output file. Return: Polars DataFrame containing the filenames of the Parquets that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_parquet(\"/home/user/test-out\") # you should create the directory before hand. .filter source .filter( predicate: str ) This will filter the DataStream to contain only rows that match a certain predicate. Currently this predicate must be specified in SQL syntax. You can write any SQL clause you would generally put in a WHERE statement containing arbitrary conjunctions and disjunctions. The identifiers however, must be in the schema of this DataStream! We aim to soon support a more Pythonic interface that better resembles Pandas which allows you to do things like d = d[d.a > 10]. Please look at the examples below. 
Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from a filter being applied to a batch in the source DataStream. While this certainly may be the case, filters are aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a filter node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args predicate (str) : a SQL WHERE clause, look at the examples. Return: A DataStream consisting of rows from the source DataStream that match the predicate. Examples >>> f = qc.read_csv(\"lineitem.csv\") # filter for all the rows where l_orderkey smaller than 10 and l_partkey greater than 5 >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") # nested conditions are supported >>> f = f.filter(\"l_orderkey < 10 and (l_partkey > 5 or l_partkey < 1)\") # most SQL features such as IN and date are supported. >>> f = f.filter(\"l_shipmode IN ('MAIL','SHIP') and l_receiptdate < date '1995-01-01'\") # you can do arithmetic in the predicate just like in SQL. >>> f = f.filter(\"l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01\") # this will fail! Assuming c_custkey is not in f.schema >>> f = f.filter(\"c_custkey > 10\") .select source .select( columns: list ) This will create a new DataStream that contains only selected columns from the source DataStream. Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from selecting columns from a batch in the source DataStream. While this certainly may be the case, select() is aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a select node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args columns (list) : a list of columns to select from the source DataStream Return: A DataStream consisting of only the columns selected. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.select([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since f's schema now consists of only two columns. >>> f = f.select([\"l_linenumber\"]) .drop source .drop( cols_to_drop: list ) Think of this as the anti-opereator to select. Instead of selecting columns, this will drop columns. This is implemented in Quokka as selecting the columns in the DataStream's schema that are not dropped. Args cols_to_drop (list) : a list of columns to drop from the source DataStream Return: A DataStream consisting of all columns in the source DataStream that are not in cols_to_drop . 
Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.drop([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since you dropped l_orderdate >>> f = f.select([\"l_orderdate\"]) .rename source .rename( rename_dict ) Renames columns in the DataStream according to rename_dict. This is similar to polars.rename . The keys you supply in rename_dict must be present in the schema, and the rename operation must not lead to duplicate column names. Note this will lead to a physical operation at runtime. Args rename_dict (dict) : key is old column name, value is new column name. Return: A DataStream with new schema according to rename. .transform source .transform( f, new_schema: list, required_columns: set, foldable = True ) This is a rather Quokka-specific API that allows arbitrary transformations on a DataStream, similar to Spark RDD.map. Each batch in the DataStream is going to be transformed according to a user defined function, which can produce a new batch. The new batch can have completely different schema or even length as the original batch, and the original data is considered lost, or consumed by this transformation function. This could be used to implement user-defined-aggregation-functions (UDAFs). Note in cases where you are simply generating a new column from other columns for each row, i.e. UDF, you probably want to use the with_column method instead. A DataStream is implemented as a stream of batches. In the runtime, your transformation function will be applied to each of those batches. However, there are no guarantees whatsoever on the sizes of these batches! You should probably make sure your logic is correct regardless of the sizes of the batches. For example, if your DataStream consists of a column of numbers, and you wish to compute the sum of those numbers, you could first transform the DataStream to return just the sum of each batch, and then hook this DataStream up to a stateful operator that adds up all the sums. You can use whatever libraries you have installed in your Python environment in this transformation function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. This is very similar to Spark's seldom used combineByKey feature. Note a transformation in the logical plan basically precludes any predicate pushdown or early projection past it, since the original columns are assumed to be lost, and we cannot directly establish correspendences between the input columns to a transformation and its output columns for the purposes of predicate pushdown or early projection. The user is required to supply a set or list of required columns, and we will select for those columns (which can be pushed down) before we apply the transformation. Args f (function) : The transformation function. This transformation function must take as input a Polars DataFrame and output a Polars DataFrame. The transformation function must not have expectations on the length of its input. Similarly, the transformation function does not have to emit outputs of a specific size. The transformation function must produce the same output columns for every possible input. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. 
This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. foldable (bool) : Whether or not the transformation can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new transformed DataStream with the supplied schema. Examples # a user defined function that takes in a Polars DataFrame with a single column \"text\", converts it to a Pyarrow table, # and uses nice Pyarrow compute functions to perform the word count on this Polars DataFrame. Note 1) we have to convert it # back to a Polars DataFrame afterwards, 2) the function works regardless of input length and 3) the output columns are the # same regardless of the input. def udf2(x): x = x.to_arrow() da = compute.list_flatten(compute.ascii_split_whitespace(x[\"text\"])) c = da.value_counts().flatten() return polars.from_arrow(pa.Table.from_arrays([c[0], c[1]], names=[\"word\",\"count\"])) # this is a trick to read in text files, just use read_csv with a separator you know won't appear. # the result will just be DataStream with one column. >>> words = qc.read_csv(\"random_words.txt\", [\"text\"], sep = \"|\") # transform words to counts >>> counted = words.transform( udf2, new_schema = [\"word\", \"count\"], required_columns = {\"text\"}, foldable=True) .with_column source .with_column( new_column, f, required_columns = None, foldable = True ) This will create new columns from certain columns in the dataframe. This is similar to pandas df.apply() that makes new columns. This is similar to Spark UDF or Pandas UDF, Polars with_column , Spark with_column , etc. Note that this function, like most Quokka DataStream functions, are not in-place, and will return a new DataStream, with the new column. This is a separate API from transform because the semantics allow for projection and predicate pushdown through this node, since the original columns are all preserved. Use this instead of transform if possible. A DataStream is implemented as a stream of batches. In the runtime, your function will be applied to each of those batches. The function must take as input a Polars DataFrame and produce a Polars DataFrame. This is a different mental model from say Pandas df.apply , where the function is written for each row. There are two restrictions. First, your result must only have one column, and it should have the same name as your new_column argument. Second, your result must have the same length as the input Polars DataFrame. You can use whatever libraries you have installed in your Python environment in this function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. Importantly, your function can take full advantage of Polars' columnar APIs to make use of SIMD and other forms of speedy goodness. You can even use Polars LazyFrame abstractions inside of this function. Of course, for ultimate flexbility, you are more than welcome to convert the Polars DataFrame to a Pandas DataFrame and use df.apply . Just remember to convert it back to a Polars DataFrame with only the result column in the end! 
Args new_column (str) : The name of the new column. f (function) : The apply function. This apply function must take as input a Polars DataFrame and output a Polars DataFrame. The apply function must not have expectations on the length of its input. The output must have the same length as the input. The apply function must produce the same output columns for every possible input. required_columns (list or set) : The names of the columns that are required for your function. If this is not specified then Quokka assumes all the columns are required for your function. Early projection past this function becomes impossible. Long story short, if you can specify this argument, do it. foldable (bool) : Whether or not the function can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new DataStream with a new column made by the user defined function. Examples >>> f = qc.read_csv(\"lineitem.csv\") # people who care about speed of execution make full use of Polars columnar APIs. >>> d = d.with_column(\"high\", lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns = {\"o_orderpriority\"}) # people who care about speed of development can do something that hurts my eyes. def f(x): y = x.to_pandas() y[\"high\"] = y.apply(lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), axis = 1) return polars.from_pandas(y[\"high\"]) >>> d = d.with_column(\"high\", f, required_columns={\"o_orderpriority\"}) .stateful_transform source .stateful_transform( executor: Executor, new_schema: list, required_columns: set, partitioner = PassThroughPartitioner(), placement = 'cpu' ) EXPERIMENTAL API This is like transform , except you can use a stateful object as your transformation function. This is useful for example, if you want to run a heavy Pytorch model on each batch coming in, and you don't want to reload this model for each function call. Remember the transform API only supports stateless transformations. You could also implement much more complicated stateful transformations, like implementing your own aggregation function if you are not satisfied with Quokka's default operator's performance. This API is still being finalized. A version of it that takes multiple input streams is also going to be added. This is the part of the DataStream level api that is closest to the underlying execution engine. Quokka's underlying execution engine basically executes a series of stateful transformations on batches of data. The difficulty here is how much of that underlying API to expose here so it's still useful without the user having to understand how the Quokka runtime works. To that end, we have to come up with suitable partitioner and placement strategy abstraction classes and interfaces. If you are interested in helping us hammer out this API, please talke to me: zihengw@stanford.edu. Args executor (pyquokka.executors.Executor) : The stateful executor. It must be a subclass of pyquokka.executors.Executor , and expose the execute and done functions. More details forthcoming. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. 
required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. Return: A transformed DataStream. Examples Forthcoming. .distinct source .distinct( keys: list ) Return a new DataStream with specified columns and unique rows. This is like SELECT DISTINCT(KEYS) FROM ... in SQL. Note all the other columns will be dropped, since their behavior is unspecified. If you want to do deduplication, you can use this operator with keys set to all the columns. This could be accomplished by using groupby().agg() but using distinct is generally faster because it is nonblocking, compared to a groupby. Quokka really likes nonblocking operations because it can then pipeline it with other operators. Args keys (list) : a list of columns to select distinct on. Return: A transformed DataStream whose columns are in keys and whose rows are unique. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns, return only unique rows. >>> f = f.distinct([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since l_comment is no longer in f's schema. >>> f = f.select([\"l_comment\"]) .join source .join( right, on = None, left_on = None, right_on = None, suffix = '_2', how = 'inner' ) Join a DataStream with another DataStream or a small Polars DataFrame (<10MB). If you have a Polars DataFrame bigger than this, the best solution right now is to write it out to a file and have Quokka read it back in as a DataStream. I realize this is perhaps suboptimal, and this will be improved. A streaming two-sided distributed join will be executed for two DataStream joins and a streaming broadcast join will be executed for DataStream joined with Polars DataFrame. Joins are obviously very important, and we are constantly improving how we do joins. Eventually we will support out of core joins, when @savebuffer merges his PR into Arrow 10.0. Args right (DataStream or Polars DataFrame) : the DataStream or Polars DataFrame to join to. on (str) : You could either specify this, if the join column has the same name in this DataStream and right , or left_on and right_on if the join columns don't have the same name. left_on (str) : the name of the join column in this DataStream. right_on (str) : the name of the join column in right . suffix (str) : if right has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result. how (str) : only supports \"inner\" for now. Return: A new DataStream that's the joined result of this DataStream and \"right\". By default, columns from both side will be retained, except for right_on from the right side. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> orders = qc.read_csv(\"orders.csv\") >>> result = lineitem.join(orders, left_on = \"l_orderkey\", right_on = \"o_orderkey\") # this will now fail, since o_orderkey is not in the joined DataStream. >>> result = result.select([\"o_orderkey\"]) .groupby source .groupby( groupby: list, orderby = None ) Group a DataStream on a list of columns, optionally specifying an ordering requirement. This returns a GroupedDataStream object, which currently only expose the aggregate method. This is similar to Pandas df.groupby().agg() syntax. Eventually the GroupedDataStream object will also support different kinds of window functions. Args groupby (list or str) : a column or a list of columns to group on. 
orderby (list) : a list of ordering requirements of the groupby columns, specified in a list like this: [(col1, \"asc\"), (col2, \"desc\")]. Return: A GroupedDataStream object with the specified grouping and the current DataStream. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> result = lineitem.groupby([\"l_orderkey\",\"l_orderdate\"], orderby = [(\"l_orderkey\", \"asc\"), (\"l_orderdate\", \"desc\")]) .agg source .agg( aggregations ) Aggregate this DataStream according to the defined aggregations without any pre-grouping. This is similar to Pandas df.agg() . The result will be one row. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well. >>> f = d.agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"}) .aggregate source .aggregate( aggregations ) Alias of agg . .count source .count() Return total row count. .sum source .sum( columns ) Return the sums of the specified columns. .max source .max( columns ) Return the maximum values of the specified columns. .min source .min( columns ) Return the minimum values of the specified columns. .mean source .mean( columns ) Return the mean values of the specified columns. GroupedDataStream source GroupedDataStream( source_data_stream: DataStream, groupby, orderby ) Methods: .agg source .agg( aggregations: dict ) Aggregate this GroupedDataStream according to the defined aggregations. This is similar to Pandas df.groupby().agg() . The result's length will be however number of rows as there are unique group keys combinations. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. 
The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well, of each unique combination of l_returnflag and l_linestatus >>> f = d.groupby([\"l_returnflag\", \"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"}) .aggregate source .aggregate( aggregations: dict ) Alias for agg.","title":"DataStream"},{"location":"datastream/#_1","text":"","title":""},{"location":"datastream/#datastream","text":"source DataStream( quokka_context, schema: list, source_node_id: int ) Quokka DataStream class is how most users are expected to interact with Quokka. However users are not expected to create a DataStream directly by calling its constructor. Note that constructor takes an argument called source_node_id , which would confuse most data scientists -- even me! Args quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Attributes quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Methods:","title":"DataStream"},{"location":"datastream/#collect","text":"source .collect() This will trigger the execution of computational graph, similar to Spark collect(). The result will be a Polars DataFrame on the master Return: Polars DataFrame. Examples >>> f = qc.read_csv(\"my_csv.csv\") >>> result = f.collect() # result will be a Polars dataframe, as if you did polars.read_csv(\"my_csv.csv\")","title":".collect"},{"location":"datastream/#compute","text":"source .compute() This will trigger the execution of computational graph, similar to Spark collect The result will be a Quokka DataSet, which you can then call to_df() or call to_stream() to initiate another computation. Return: Quokka Quokka DataSet. 
Currently a DataSet is going to be just a list of objects distributed across the Redis servers on the workers.","title":".compute"},{"location":"datastream/#explain","text":"source .explain( mode = 'graph' ) This will not trigger the execution of your computation graph but will produce a graph of the execution plan. Args mode (str) : 'graph' will show a graph, 'text' will print a textual description. Return: None.","title":".explain"},{"location":"datastream/#write_csv","text":"source .write_csv( table_location, output_line_limit = 1000000 ) This will write out the entire contents of the DataStream to a list of CSVs. This is a blocking operation, and will call collect() under the hood. Args table_location (str) : the root directory to write the output CSVs to. Similar to Spark, Quokka by default writes out a directory of CSVs instead of dumping all the results to a single CSV so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : how many rows each CSV in the output should have. The current implementation simply buffers this many rows in memory instead of using file appends, so you should have enough memory! Return: Polars DataFrame containing the filenames of the CSVs that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_csv(\"/home/user/test-out\") # you should create the directory beforehand.","title":".write_csv"},{"location":"datastream/#write_parquet","text":"source .write_parquet( table_location, output_line_limit = 10000000 ) This will write out the entire contents of the DataStream to a list of Parquets. This is a blocking operation, and will call collect() under the hood. By default, each output Parquet file will contain one row group. Args table_location (str) : the root directory to write the output Parquets to. Similar to Spark, Quokka by default writes out a directory of Parquets instead of dumping all the results to a single Parquet so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : the row group size in each output file. Return: Polars DataFrame containing the filenames of the Parquets that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_parquet(\"/home/user/test-out\") # you should create the directory beforehand.","title":".write_parquet"},{"location":"datastream/#filter","text":"source .filter( predicate: str ) This will filter the DataStream to contain only rows that match a certain predicate. Currently this predicate must be specified in SQL syntax. You can write any SQL clause you would generally put in a WHERE statement containing arbitrary conjunctions and disjunctions. The identifiers, however, must be in the schema of this DataStream! We aim to soon support a more Pythonic interface that better resembles Pandas which allows you to do things like d = d[d.a > 10]. Please look at the examples below.
Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from a filter being applied to a batch in the source DataStream. While this certainly may be the case, filters are aggressively optimized by Quokka and are most likely pushed all the way down to the input readers. As a result, you typically should not see a filter node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, which may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which are determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args predicate (str) : a SQL WHERE clause, look at the examples. Return: A DataStream consisting of rows from the source DataStream that match the predicate. Examples >>> f = qc.read_csv(\"lineitem.csv\") # filter for all the rows where l_orderkey smaller than 10 and l_partkey greater than 5 >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") # nested conditions are supported >>> f = f.filter(\"l_orderkey < 10 and (l_partkey > 5 or l_partkey < 1)\") # most SQL features such as IN and date are supported. >>> f = f.filter(\"l_shipmode IN ('MAIL','SHIP') and l_receiptdate < date '1995-01-01'\") # you can do arithmetic in the predicate just like in SQL. >>> f = f.filter(\"l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01\") # this will fail! Assuming c_custkey is not in f.schema >>> f = f.filter(\"c_custkey > 10\")","title":".filter"},{"location":"datastream/#select","text":"source .select( columns: list ) This will create a new DataStream that contains only selected columns from the source DataStream. Since a DataStream is implemented as a stream of batches, you might be tempted to think of the resulting DataStream as a stream of batches where each batch directly results from selecting columns from a batch in the source DataStream. While this certainly may be the case, select() is aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a select node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, which may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which are determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args columns (list) : a list of columns to select from the source DataStream Return: A DataStream consisting of only the columns selected. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.select([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since f's schema now consists of only two columns. >>> f = f.select([\"l_linenumber\"])","title":".select"},{"location":"datastream/#drop","text":"source .drop( cols_to_drop: list ) Think of this as the anti-operator to select. Instead of selecting columns, this will drop columns. This is implemented in Quokka as selecting the columns in the DataStream's schema that are not dropped.
Args cols_to_drop (list) : a list of columns to drop from the source DataStream Return: A DataStream consisting of all columns in the source DataStream that are not in cols_to_drop . Examples >>> f = qc.read_csv(\"lineitem.csv\") # drop the l_orderdate and l_orderkey columns >>> f = f.drop([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since you dropped l_orderdate >>> f = f.select([\"l_orderdate\"])","title":".drop"},{"location":"datastream/#rename","text":"source .rename( rename_dict ) Renames columns in the DataStream according to rename_dict. This is similar to polars.rename . The keys you supply in rename_dict must be present in the schema, and the rename operation must not lead to duplicate column names. Note this will lead to a physical operation at runtime. Args rename_dict (dict) : key is old column name, value is new column name. Return: A DataStream with new schema according to rename.","title":".rename"},{"location":"datastream/#transform","text":"source .transform( f, new_schema: list, required_columns: set, foldable = True ) This is a rather Quokka-specific API that allows arbitrary transformations on a DataStream, similar to Spark RDD.map. Each batch in the DataStream is going to be transformed according to a user defined function, which can produce a new batch. The new batch can have a completely different schema or even length from the original batch, and the original data is considered lost, or consumed by this transformation function. This could be used to implement user-defined-aggregation-functions (UDAFs). Note in cases where you are simply generating a new column from other columns for each row, i.e. UDF, you probably want to use the with_column method instead. A DataStream is implemented as a stream of batches. In the runtime, your transformation function will be applied to each of those batches. However, there are no guarantees whatsoever on the sizes of these batches! You should probably make sure your logic is correct regardless of the sizes of the batches. For example, if your DataStream consists of a column of numbers, and you wish to compute the sum of those numbers, you could first transform the DataStream to return just the sum of each batch, and then hook this DataStream up to a stateful operator that adds up all the sums. You can use whatever libraries you have installed in your Python environment in this transformation function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. This is very similar to Spark's seldom-used combineByKey feature. Note a transformation in the logical plan basically precludes any predicate pushdown or early projection past it, since the original columns are assumed to be lost, and we cannot directly establish correspondences between the input columns to a transformation and its output columns for the purposes of predicate pushdown or early projection. The user is required to supply a set or list of required columns, and we will select for those columns (which can be pushed down) before we apply the transformation. Args f (function) : The transformation function. This transformation function must take as input a Polars DataFrame and output a Polars DataFrame. The transformation function must not have expectations on the length of its input. Similarly, the transformation function does not have to emit outputs of a specific size.
The transformation function must produce the same output columns for every possible input. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. foldable (bool) : Whether or not the transformation can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improves with large input sizes might benefit from not being folded. Return: A new transformed DataStream with the supplied schema. Examples # a user defined function that takes in a Polars DataFrame with a single column \"text\", converts it to a Pyarrow table, # and uses nice Pyarrow compute functions to perform the word count on this Polars DataFrame. Note 1) we have to convert it # back to a Polars DataFrame afterwards, 2) the function works regardless of input length and 3) the output columns are the # same regardless of the input. def udf2(x): x = x.to_arrow() da = compute.list_flatten(compute.ascii_split_whitespace(x[\"text\"])) c = da.value_counts().flatten() return polars.from_arrow(pa.Table.from_arrays([c[0], c[1]], names=[\"word\",\"count\"])) # this is a trick to read in text files, just use read_csv with a separator you know won't appear. # the result will just be a DataStream with one column. >>> words = qc.read_csv(\"random_words.txt\", [\"text\"], sep = \"|\") # transform words to counts >>> counted = words.transform( udf2, new_schema = [\"word\", \"count\"], required_columns = {\"text\"}, foldable=True)","title":".transform"},{"location":"datastream/#with_column","text":"source .with_column( new_column, f, required_columns = None, foldable = True ) This will create new columns from certain columns in the dataframe. This is similar to pandas df.apply() that makes new columns. This is similar to Spark UDF or Pandas UDF, Polars with_column , Spark with_column , etc. Note that this function, like most Quokka DataStream functions, is not in-place, and will return a new DataStream, with the new column. This is a separate API from transform because the semantics allow for projection and predicate pushdown through this node, since the original columns are all preserved. Use this instead of transform if possible. A DataStream is implemented as a stream of batches. In the runtime, your function will be applied to each of those batches. The function must take as input a Polars DataFrame and produce a Polars DataFrame. This is a different mental model from, say, Pandas df.apply , where the function is written for each row. There are two restrictions. First, your result must only have one column, and it should have the same name as your new_column argument. Second, your result must have the same length as the input Polars DataFrame. You can use whatever libraries you have installed in your Python environment in this function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this.
Importantly, your function can take full advantage of Polars' columnar APIs to make use of SIMD and other forms of speedy goodness. You can even use Polars LazyFrame abstractions inside of this function. Of course, for ultimate flexibility, you are more than welcome to convert the Polars DataFrame to a Pandas DataFrame and use df.apply . Just remember to convert it back to a Polars DataFrame with only the result column in the end! Args new_column (str) : The name of the new column. f (function) : The apply function. This apply function must take as input a Polars DataFrame and output a Polars DataFrame. The apply function must not have expectations on the length of its input. The output must have the same length as the input. The apply function must produce the same output columns for every possible input. required_columns (list or set) : The names of the columns that are required for your function. If this is not specified then Quokka assumes all the columns are required for your function. Early projection past this function becomes impossible. Long story short, if you can specify this argument, do it. foldable (bool) : Whether or not the function can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improves with large input sizes might benefit from not being folded. Return: A new DataStream with a new column made by the user defined function. Examples >>> d = qc.read_csv(\"orders.csv\") # people who care about speed of execution make full use of Polars columnar APIs. >>> d = d.with_column(\"high\", lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns = {\"o_orderpriority\"}) # people who care about speed of development can do something that hurts my eyes. def f(x): y = x.to_pandas() y[\"high\"] = y.apply(lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), axis = 1) return polars.from_pandas(y[\"high\"]) >>> d = d.with_column(\"high\", f, required_columns={\"o_orderpriority\"})","title":".with_column"},{"location":"datastream/#stateful_transform","text":"source .stateful_transform( executor: Executor, new_schema: list, required_columns: set, partitioner = PassThroughPartitioner(), placement = 'cpu' ) EXPERIMENTAL API This is like transform , except you can use a stateful object as your transformation function. This is useful, for example, if you want to run a heavy Pytorch model on each batch coming in, and you don't want to reload this model for each function call. Remember the transform API only supports stateless transformations. You could also implement much more complicated stateful transformations, like implementing your own aggregation function if you are not satisfied with Quokka's default operator's performance. This API is still being finalized. A version of it that takes multiple input streams is also going to be added. This is the part of the DataStream level API that is closest to the underlying execution engine. Quokka's underlying execution engine basically executes a series of stateful transformations on batches of data. The difficulty here is how much of that underlying API to expose here so it's still useful without the user having to understand how the Quokka runtime works.
To that end, we have to come up with suitable partitioner and placement strategy abstraction classes and interfaces. If you are interested in helping us hammer out this API, please talk to me: zihengw@stanford.edu. Args executor (pyquokka.executors.Executor) : The stateful executor. It must be a subclass of pyquokka.executors.Executor , and expose the execute and done functions. More details forthcoming. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. Return: A transformed DataStream. Examples Forthcoming.","title":".stateful_transform"},{"location":"datastream/#distinct","text":"source .distinct( keys: list ) Return a new DataStream with specified columns and unique rows. This is like SELECT DISTINCT(KEYS) FROM ... in SQL. Note all the other columns will be dropped, since their behavior is unspecified. If you want to do deduplication, you can use this operator with keys set to all the columns. This could be accomplished by using groupby().agg() but using distinct is generally faster because it is nonblocking, compared to a groupby. Quokka really likes nonblocking operations because it can then pipeline them with other operators. Args keys (list) : a list of columns to select distinct on. Return: A transformed DataStream whose columns are in keys and whose rows are unique. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns, return only unique rows. >>> f = f.distinct([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since l_comment is no longer in f's schema. >>> f = f.select([\"l_comment\"])","title":".distinct"},{"location":"datastream/#join","text":"source .join( right, on = None, left_on = None, right_on = None, suffix = '_2', how = 'inner' ) Join a DataStream with another DataStream or a small Polars DataFrame (<10MB). If you have a Polars DataFrame bigger than this, the best solution right now is to write it out to a file and have Quokka read it back in as a DataStream. I realize this is perhaps suboptimal, and this will be improved. A streaming two-sided distributed join will be executed for two DataStream joins and a streaming broadcast join will be executed for a DataStream joined with a Polars DataFrame. Joins are obviously very important, and we are constantly improving how we do joins. Eventually we will support out-of-core joins, when @savebuffer merges his PR into Arrow 10.0. Args right (DataStream or Polars DataFrame) : the DataStream or Polars DataFrame to join to. on (str) : You could either specify this, if the join column has the same name in this DataStream and right , or left_on and right_on if the join columns don't have the same name. left_on (str) : the name of the join column in this DataStream. right_on (str) : the name of the join column in right . suffix (str) : if right has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result. how (str) : only supports \"inner\" for now. Return: A new DataStream that's the joined result of this DataStream and \"right\". By default, columns from both sides will be retained, except for right_on from the right side.
Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> orders = qc.read_csv(\"orders.csv\") >>> result = lineitem.join(orders, left_on = \"l_orderkey\", right_on = \"o_orderkey\") # this will now fail, since o_orderkey is not in the joined DataStream. >>> result = result.select([\"o_orderkey\"])","title":".join"},{"location":"datastream/#groupby","text":"source .groupby( groupby: list, orderby = None ) Group a DataStream on a list of columns, optionally specifying an ordering requirement. This returns a GroupedDataStream object, which currently only exposes the aggregate method. This is similar to Pandas df.groupby().agg() syntax. Eventually the GroupedDataStream object will also support different kinds of window functions. Args groupby (list or str) : a column or a list of columns to group on. orderby (list) : a list of ordering requirements of the groupby columns, specified in a list like this: [(col1, \"asc\"), (col2, \"desc\")]. Return: A GroupedDataStream object with the specified grouping and the current DataStream. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> result = lineitem.groupby([\"l_orderkey\",\"l_orderdate\"], orderby = [(\"l_orderkey\", \"asc\"), (\"l_orderdate\", \"desc\")])","title":".groupby"},{"location":"datastream/#agg","text":"source .agg( aggregations ) Aggregate this DataStream according to the defined aggregations without any pre-grouping. This is similar to Pandas df.agg() . The result will be one row. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so you might as well materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice column, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well.
>>> f = d.agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"})","title":".agg"},{"location":"datastream/#aggregate","text":"source .aggregate( aggregations ) Alias of agg .","title":".aggregate"},{"location":"datastream/#count","text":"source .count() Return total row count.","title":".count"},{"location":"datastream/#sum","text":"source .sum( columns ) Return the sums of the specified columns.","title":".sum"},{"location":"datastream/#max","text":"source .max( columns ) Return the maximum values of the specified columns.","title":".max"},{"location":"datastream/#min","text":"source .min( columns ) Return the minimum values of the specified columns.","title":".min"},{"location":"datastream/#mean","text":"source .mean( columns ) Return the mean values of the specified columns.","title":".mean"},{"location":"datastream/#groupeddatastream","text":"source GroupedDataStream( source_data_stream: DataStream, groupby, orderby ) Methods:","title":"GroupedDataStream"},{"location":"datastream/#agg_1","text":"source .agg( aggregations: dict ) Aggregate this GroupedDataStream according to the defined aggregations. This is similar to Pandas df.groupby().agg() . The result will have as many rows as there are unique combinations of group keys. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so you might as well materialize it right now.
Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well, of each unique combination of l_returnflag and l_linestatus >>> f = d.groupby([\"l_returnflag\", \"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"})","title":".agg"},{"location":"datastream/#aggregate_1","text":"source .aggregate( aggregations: dict ) Alias for agg.","title":".aggregate"},{"location":"quokka_context/","text":"QuokkaContext source QuokkaContext( cluster = None ) Methods: .read_files source .read_files( table_location: str ) .read_csv source .read_csv( table_location: str, schema = None, has_header = False, sep = ', ' ) .read_parquet source .read_parquet( table_location: str, schema = None ) .new_stream source .new_stream( sources: dict, partitioners: dict, node: Node, schema: list, ordering = None ) .new_dataset source .new_dataset( source, schema: list ) .optimize source .optimize( node_id ) .lower source .lower( end_node_id, collect = True ) .execute_node source .execute_node( node_id, explain = False, mode = None, collect = True ) .explain source .explain( node_id, mode = 'graph' ) DataSet source DataSet( quokka_context: QuokkaContext, schema: dict, source_node_id: int )","title":"QuokkaContext"},{"location":"quokka_context/#_1","text":"","title":""},{"location":"quokka_context/#quokkacontext","text":"source QuokkaContext( cluster = None ) Methods:","title":"QuokkaContext"},{"location":"quokka_context/#read_files","text":"source .read_files( table_location: str )","title":".read_files"},{"location":"quokka_context/#read_csv","text":"source .read_csv( table_location: str, schema = None, has_header = False, sep = ', ' )","title":".read_csv"},{"location":"quokka_context/#read_parquet","text":"source .read_parquet( table_location: str, schema = None )","title":".read_parquet"},{"location":"quokka_context/#new_stream","text":"source .new_stream( sources: dict, partitioners: dict, node: Node, schema: list, ordering = None )","title":".new_stream"},{"location":"quokka_context/#new_dataset","text":"source .new_dataset( source, schema: list )","title":".new_dataset"},{"location":"quokka_context/#optimize","text":"source .optimize( node_id )","title":".optimize"},{"location":"quokka_context/#lower","text":"source .lower( end_node_id, collect = True )","title":".lower"},{"location":"quokka_context/#execute_node","text":"source .execute_node( node_id, explain = False, mode = None, collect = True )","title":".execute_node"},{"location":"quokka_context/#explain","text":"source .explain( node_id, mode = 'graph' )","title":".explain"},{"location":"quokka_context/#dataset","text":"source DataSet( quokka_context: QuokkaContext, schema: dict, source_node_id: int )","title":"DataSet"},{"location":"runtime/","text":"Quokka Runtime API documentation Programming Model A note about the name: the name is inspired by the Apache Flink icon, which is a chipmunk. A quokka is a marsupial that resembles a chipmunk. 
Motivation Popular big data processing frameworks such as Spark and Dask rely on bulk-synchronous execution on distributed datasets. Often, a map-reduce style model is adopted, where mappers perform functions on partitions of the input, the mapper outputs are shuffled into groups, and after the shuffle has fully/mostly completed , reducers start working on each group. Typically this is implemented as a pull-based model where reducers pull required data from the mappers, who persist their output in some kind of external storage (disk or network) when fault tolerance is desired. There are a couple problems with this approach. The first, as recent works such as LinkedIn Magnet and Uber Zeus have identified, is that when each mapper doesn't have too much data for each reducer, the pull operation amounts to a bunch of random disk/network reads. This is horrible. The solution is push-based shuffles, where mappers push data to the reducers. Data can now be persisted on the reducer side for fault tolerance. However, this only addresses part of the problem. In a synchronous shuffle, even when mapper output is pushed to the reducers as soon as they are generated, the reducers can't start operating on said data until they have received near everything. This is because the current Map-Reduce paradigm stipulates that the reduction function is a function on all the data assigned to it from the mappers. This forces the reducers to start only after most of the mappers have completely executed, making any kind of pipelined parallel execution between the two impossible. This is unfortunate, because mappers and reducers often use very different resources (network I/O bound mappers + compute bound reducers), and can often be scheduled for parallel execution on the same physical instances without compromising too much the performance of either. Quokka's solution is to support two different kinds of reducer functions. Blocking reducers are similar to classic Map-Reduce reducers and block until they receive all mapper outputs. However, non-blocking reducers can start executing on mapper outputs as soon as they arrive, producing some output of its own and updating some local state. For example, sort, count and aggregation are blocking reducer functions because their output depend on all the data. However, join, filter and projection can be implemented in a non-blocking fashion with streaming operators. Non-blocking reducers can be pipelined with other non-blocking reducers, while a blocking reducer breaks the pipeline. Mappers are treated as non-blocking reducers where the output already exists in network/disk storage. We impose some limitations on the kinds of non-blocking operators we support, which are described in detail later. Logically, one can view Quokka execution as a series of stages, where each stage start with the output produced by a blocking operator, ends with another blocking operator, and executes non-blocking operators in between. The entire stage is executed in a pipeline-parallel fashion, and can be viewed as a pure streaming system. The stage inputs/outputs use Spark's lineage tracking based fault-tolerance and persistence mechanism. Since each Quokka stage now corresponds to a few Spark stages, Quokka also implements intra-stage fault tolerance based on checkpointing. The checkpointing recovery mechanism in Quokka conveniently avoids global asynchronous rollbacks, the bane of streaming systems, thanks to the restrictions we impose on the non-blocking operators. 
Quokka also aims to support autoscaling. (I have a plan to do this, but likely will not get to this until after the rotation.) Execution Model The Quokka runtime API allows you to construct a task graph of nodes , which corresponds to a Quokka stage. This is very similar to other DAG-based processing frameworks such as Apache Spark or Tensorflow . For example, you can write the following code in the runtime API to execute TPC-H query 6: task_graph = TaskGraph() lineitem = task_graph.new_input_csv(bucket,key,lineitem_scheme,8,batch_func=lineitem_filter, sep=\"|\") agg_executor = AggExecutor() agged = task_graph.new_blocking_node({0:lineitem}, agg_executor, 1, {0:None}) task_graph.initialize() task_graph.run() There are perhaps a couple of things to note here. Firstly, there are two types of nodes in the runtime API. There are input nodes , declared with APIs such as new_input_csv or new_input_parquet , which interface with the external world (you can define where they will read their data), and task nodes , declared with new_non_blocking_node or new_blocking_node , which take as input the outputs generated from another node in the task graph, either an input node or another task node. Secondly, we see that the task node agged depends on the outputs from the input node lineitem . We will describe exactly what the types of lineitem and agged are later (the former is a stream and the latter is a dataset). Finally, note that the task graph ends with a blocking node. This is currently required if you want to be able to interact with the results of the task graph execution. Multiple stages are implemented with multiple task graphs, with the first node of stage 2 reading from the output of stage 1, like the following: task_graph = TaskGraph() a = task_graph.new_input_csv(\"bump\",\"a-big.csv\",[\"key\"] + [\"avalue\" + str(i) for i in range(100)],{'localhost':2}) b = task_graph.new_input_csv(\"bump\",\"b-big.csv\",[\"key\"] + [\"bvalue\" + str(i) for i in range(100)],{'localhost':2}) join_executor = OOCJoinExecutor(on=\"key\") output = task_graph.new_blocking_node({0:a,1:b},None, join_executor,{'localhost':4},{0:\"key\", 1:\"key\"}) task_graph.initialize() task_graph.run() del task_graph task_graph2 = TaskGraph() count_executor = CountExecutor() joined_stream = task_graph2.new_input_from_dataset(output,{'localhost':4}) final = task_graph2.new_blocking_node({0:joined_stream}, None, count_executor, {'localhost':4}, {0:'key'}) task_graph2.initialize() task_graph2.run() Note that since the output of a stage is persisted as in Spark, one can delete the first task graph and still access its outputs. Since a task graph represents one Quokka stage, it strictly follows push-based execution. This means that a node does not wait for its downstream dependencies to ask for data, but instead actively pushes data to its downstream dependencies whenever some intermediate results become available. In short, execution proceeds as follows : input nodes read batches of data from a specified source, which might be an external data source or the outputs of a previous stage, and push those batches to downstream task nodes. A task node exposes a handler to process incoming batches as they arrive, possibly updating some internal state, and for each input batch possibly produces an output batch for its own downstream children. The programmer is expected to supply this handler function as an executor object (e.g. OOCJoinExecutor , AggExecutor ).
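To make the executor-object idea concrete, here is a rough, hypothetical sketch (not one of Quokka's stock executors) of a non-blocking handler that keeps only rows with a positive value in a given column. It is written against the StatelessExecutor interface shown at the end of this page, and it assumes that batches arrive as Pandas DataFrames (as new_input_csv currently produces) and that returning a batch from execute is how output is emitted to downstream children:
class PositiveFilterExecutor:
    # hypothetical executor following the StatelessExecutor interface (execute/done)
    def __init__(self, column):
        self.column = column  # the column to filter on
    def execute(self, batch, stream_id, executor_id):
        # batch is assumed to be a Pandas DataFrame pushed from an upstream node
        filtered = batch[batch[self.column] > 0]
        # assumption: returning a non-empty batch pushes it to downstream children
        return filtered if len(filtered) > 0 else None
    def done(self, executor_id):
        # a stateless filter has nothing to flush when its inputs are exhausted
        return None
It could then be wired into a task graph with something like task_graph.new_task_node({0: some_input_node}, PositiveFilterExecutor(\"key\"), 4, {0: \"key\"}), following the TaskGraph API described below (the node id, parallelism and partition key here are illustrative).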
Quokka provides a library of pre-implemented executor objects that the programmer can use for SQL, ML and graph analytics. Each task node can have multiple physical executors, referred to as channels . This is a form of intra-operator data parallelism, as opposed to the inter-operator pipeline parallelism that results from all task nodes executing at the same time. These physical executors all execute the same handler function, but on different portions of the input batch, partitioned by a user-specified partition function. A Map-Reduce job with M mappers and R reducers would be implemented in Quokka as a single mapper task node and a single reducer task node, where the mapper task node has M channels and the reducer task node has R channels. In the example above, we specified that the input node lineitem has 8 channels, and the task node agged has only 1 channel. The partition key was not specified ( {0:None} ) since there is no parallelism, thus no need for partitioning. The situation looks something like the following picture: Quokka keeps track of all the channels and schedules them onto physical computing resources. For the engine, two channels from different task nodes are on more or less equal footing -- they can be scheduled on the same hardware or different hardware. A channel from an input node completes execution when there's no more inputs to be read or if all of its downstream dependencies have completed execution. A channel from a task node completes execution when: all of its upstream sources have completed execution if its execution handler decides to terminate early based on the input batch and its state (e.g. for a task node that executes the limit operator in a limit query, it might keep as local state the buffered output, and decide to terminate when that output size surpasses the limit number) if all its downstream dependencies have completed execution. By default, all channels start execution at once. This does not necessarily mean that they will start processing data, this means that they will all start waiting for input batches from their upstream sources to arrive. One could specify that an input node delay execution until another input node has finished. For example to implement a hash join one might want to stream in one table to build the hash table, then stream in the other table for probing. The runtime API is meant to be very flexible and support all manners of batch and stream processing. For example, one could specify an input node that listens to a Kafka stream, some task nodes which processes batches of data from that stream, and an output node that writes to another Kafka stream. In this case, since the input node will never terminate, and assuming the other nodes do not trigger early termination, the task graph will always be running. As a result of this flexibility, it requires quite a lot of knowledge for efficient utilization. As a result, we aim to provide higher level APIs to support common batch and streaming tasks in SQL, machine learning and graph analytics. Most programmers are not expected to program at the runtime API level, but rather make use of the pre-packaged higher-level APIs. Stateful Actors Let's talk more about task nodes in Quokka. Channels in task nodes can be treated as stateful operators in an actor programming model. Quokka adopts the notion of channels in a task node to specify that a group of actors all execute the same code, for fault tolerance and autoscaling purposes. 
One could override default Quokka behavior by simply specifying different task nodes with one channel each, all executing the same code. The key property of stateful operators in Quokka is confluence : in the context of nondeterministic message delivery, an operation on a single machine is confluent if it produces the same set of outputs for any nondeterministic ordering and batching of a set of inputs. (Hellerstein, CALM) Note that the output itself can also be produced in any order. It\u2019s easy to see that any composition of confluent operators is still confluent. We relax the confluent definition somewhat here to accept potentially different output sets, assuming they are all semantically correct. For example, an operator that implements the LIMIT N clause in SQL can admit any N of the input records it sees. More importantly, for Quokka we allow operators to depend on intra-stream ordering, just not inter-stream ordering. This means that an operator might still expect the inputs produced by a certain stream to observe some order, while there are no restrictions on the relative orderings between different input streams. Quokka as a system enforces intra-stream message order, but makes zero guarantees about inter-stream message orders. Henceforth, confluence will refer to this narrow definition, not the one defined in the CALM paper. Confluence is a very nice property to have in general, more so for streaming systems. Let\u2019s imagine a stateful operator with two different upstream operators producing messages. It is very nice if the system\u2019s correctness does not depend on the order in which the two upstream operators produce the messages, which could depend on network delay, task scheduling, etc. This is critical for performance in a push-based framework since a node should never wait on any one of its input streams. In addition, it also greatly facilitates fault tolerance, as messages from different sources can be replayed in any order with regard to one another, as we will describe later. Confluence is perhaps the key difference between Quokka and streaming-centric systems like Flink. In Flink you can totally write pipelines where the outputs depend very strongly on the order in which the inputs are supplied. In Quokka this is not allowed. (Really at this point, it's only \"not recommended\" -- there are no checks in place to see if your actor is confluent or not. What's guaranteed is that all the operators in the libraries supplied follow this model. Enforcing this is future work.) What are some examples of confluent stateful operators? First let's categorize the world of stateful operators we'd like to implement in data analytics. As mentioned previously, there are two important categories: nonblocking and blocking . Blocking operators cannot emit any outputs to their downstream children until all of their inputs have been processed. Examples are any kind of aggregation and sort. For (naive) aggregation, the stateful operator does not know it has the final result for any of its aggregation keys until it has seen all of its inputs. For sorting, the stateful operator cannot guarantee that it would emit results in sorted order until it has received all its inputs. We call any operator that is not blocking non-blocking. Example non-blocking operators are map, filter, projection and join. Blocking operators are pipeline breakers, and negate the benefits of using a streaming framework like Quokka. Confluence is easy to reason about for blocking operators.
The blocking operator emits only one output, at the very end. We just have to make sure that this output is the same regardless of the order in which we supply the operator's inputs. Since this output is typically a function of the final state, we just have to ensure that the final state is the same. If we imagine that each incoming message changes the state of the operator by function f , then it's easy to see that as long as f is commutative this is true. For example, any kind of aggregation is commutative, the merge step in merge-sort is commutative, etc. Confluence is harder to reason about for nonblocking operators. We must guarantee that regardless of the order in which the input batches are supplied, the set of output batches does not change. Let\u2019s say we only have two incoming messages, m and n, to a node with starting state S. Supplying m first changes the state S to f(m, S) and produces output o(m, S); supplying n next changes the state to f(n, f(m, S)) and produces output o(n, f(m, S)). The resulting output set {o(m, S), o(n, f(m, S))} must be the same as the output set we would get by supplying the inputs in the reverse order. Note that this assumes that m and n are all the messages the node will see. Confluence is about eventual consistency. While in general there are many ways to achieve this kind of behavior as long as only monotonic operations are applied to the state at each input batch (Bloom), in Quokka all the stock non-blocking operators take the approach of setting the state as sets of immutable batches of data, which can only be added to. This is clearly monotonic. If you are writing a stateful operator for Quokka, this is the recommended approach. What this means is that it is impossible to perform operations that require a specific batch amongst the set of batches, such as list indexing, since ordering of the batches in a set in the state is undefined. Most meaningful operations take the incoming message and produce an output that depends on the entire set, or not at all. An example of a confluent stateful operator in Quokka is a join. The code can be roughly summarized as follows: state0 = set() state1 = set() for each input: if input from stream0: state0.add(input) emit set(input.join(i) for i in state1) else: state1.add(input) emit set(i.join(input) for i in state0) Note that there is in fact a non-monotonic domain-specific optimization we can make that will preserve confluence in the case of a primary key join. Any input streamed in from stream0 can guarantee that any future records from that table will not have the same key value. Thus all state1 related to the record\u2019s key can be safely deleted. Quokka currently does not implement this optimization. Datasets and Streams Let's talk more about how non-blocking and blocking operators work in Quokka. Blocking operators could be introduced by operations like aggregations and sort, or simply by user command when the user wishes to materialize data with .materialize() (similar to .cache() semantics in Spark or .compute() semantics in Dask). Such blocking operators will produce a Dataset in Quokka, while non-blocking operators will produce a Stream . Downstream operators could depend on both upstream datasets and streams. The difference is that the upstream dataset needs to be completely materialized when an operator starts executing, while a stream is just a promise that batches of data will be produced at some point in the future in any order.
In other words, from the perspective of the operator, it can pull data from an upstream dataset and expects data to be pushed to it from the stream. In the very first code listing for TPC-H query 6, agged is a dataset whereas lineitem is a stream. In practice, a Quokka DAG can consist of many blocking operators and non-blocking operators organized in complicated ways. For example, here is the DAG for a PageRank application: As previously described, Quokka decomposes the computation into stages, with each stage ending in the creation of a Dataset. In this case the computation will be broken into two stages, the first of which consists of the nonblocking input sparse matrix read and caching (the upper row). The second will be the bottom row. The second stage depends on the first one, so it will be launched after the first one has completed. This is very similar to how stages in Spark work. (Note that strictly speaking, every stage has to start from a Dataset too. In this case the input nodes depend on Datasets that are pre-created in S3 or Disk, and are abbreviated in this graph.) Similarly to an RDD, Quokka represents a Dataset as a collection of immutable objects, and some associated metadata on those objects, which is itself an immutable object. The objects are all stored on a shared-memory object store with persistence (currently RocksDB). When you use task_graph.add_blocking_node in Quokka, a Dataset object will be returned. You can use this Dataset object in downstream operators. Quokka guarantees that by the time the downstream operators execute, all the Datasets that they depend on would have been materialized in this object store. The stock Dataset class in Quokka exposes some convenience methods such as an iterator to iterate through the objects. The user could also interact directly with the object store after looking up metadata from the Dataset object. There are more specialized Dataset class implementations in Quokka like KVDataset or RangeDataset which corresponds to hash-based partitioning or range-based partitioning of objects that expose more methods. The user could also implement a custom Dataset class that descends from Dataset with even more methods. It is important to ensure that when using a Dataset in a downstream operator that also takes streaming inputs, the confluence property is respected. Unfortunately, Quokka currently does not enforce this and it's possible for you to mess this up when writing your code. Although it's not that easy to mess up, since you cannot change the objects you read from the Dataset. A downstream operator could treat the Dataset as a stream by simply invoking the iterator to iterate through the objects in the Dataset. However, for many downstream operations, it might be desirable to explicitly convert a Dataset into a Stream again (e.g. to use stock operators that only have stream-based implementations). You can do that by using the specialized task node add_input_dataset . Internally, this task node just calls the iterator repeatedly and produce a stream of batches corresponding to the objects in the Dataset. Fault tolerance (future work) The current theory is a bit complicated. I am still thinking through how this should work exactly, but hopefully the gist gets through. Given our group of confluent stateful operators, how do we achieve fault tolerance? A Quokka application can be thought of as a DAG, where each node corresponds to a channel, from one of the task nodes. Each node is assigned to a physical hardware instance. 
Quokka is designed to expect many nodes to be assigned to one physical instance. For example, let's imagine the following case, where the nodes circled belongs to machine A and the rest belong to machine B, and nodes 1 and 2 are channels of the input node. 3, 4 and 5 are non-blocking operators, 6 and 7 are blocking operators. Quokka follows a checkpoint-based system where each channel periodically asynchronously checkpoints its local state to persistent storage (AWS S3). Note that this is quite efficient given the types of states we typically have, such as (typically) small intermediate aggregation results and sets of batches that are monotonically added to. (This is definitely an area of future work) The problem is easy to spot: \"yes checkpoints are great, but you must turn off the entire system when a machine fails to sync it back to the latest good state, and then reapply all the inputs.\" Yes that is true for a general-purpose streaming system like Flink or Naiad. Coordinated global rollbacks really suck. But in Quokka where all the stateful operators are confluent, this need not happen. What happens when machine A dies? TLDR: machine B can keep doing work as if nothing is wrong, while machine A's workload eventually gets rescheduled. The gory details: nodes 1, 3, 6 and 7 carry on with life (they won't even know machine A just died). 1 will notice that it can no longer send messages to 4 and 5. That's ok, it will just buffer those messages. 3 and 6 will realize that they have fewer incoming messages now. 7 will notice that they have no more incoming messages. That's ok, they can work on their backlog. The system then goes about recovering 2, 4 and 5. It will request a new machine to schedule 2, 4 and 5, or simply schedule them to machine B. 2 is a channel of an input node, which has no state. In Quokka, all message sent between channels are tagged with a sequence number. The number starts from 0 and monotonically increases. This way, the task node discards messages with a wrong sequence number. The state of a stateful operator is also tagged with a number. The state number starts from 0 and monotonically increases every time. When an operator checkpoints, it writes its state, its state number, and the latest sequence number it expects from its inputs. A consistent checkpoint contains all this information. Quokka will look at the last consistent checkpoint of nodes 4 and 5, and find the minimum of all the latest sequence numbers across both consistent checkpoints. This is the batch that 2 will now have to start to produce. Let's say that node 4 had the smaller latest sequence number. Then node 4 will immediately start catching up. Node 5 will look at the incoming batches, find that their sequence numbers are smaller than expected, and proceed to ignore all of them. Eventually, node 5 will start recovering state as well. After both nodes catch up to the point where they died, node 6 and 7 will start accepting messages from node 4 and node 5 since now their messages have valid sequence numbers. What if in this example, node 2 was not an input node but a task node? Then the dead subgraph has no way of re-reading the input. Long story short, each node needs to buffer outgoing messages, until its children notify it that the state change affected by that outgoing message has been persisted to a checkpoint. This way, messages can be replayed when needed. All this while, machine B has been carrying on with its life. 
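To make the sequence-number bookkeeping above concrete, here is a hypothetical sketch (not Quokka's actual implementation) of how a receiving channel could discard replayed or stale messages during recovery:
class SequenceTracker:
    def __init__(self, upstream_channels):
        # the next sequence number expected from each upstream channel
        self.expected = {channel: 0 for channel in upstream_channels}
    def accept(self, channel, seq, batch, handler):
        if seq != self.expected[channel]:
            # a replayed or stale message: discard it, as described above
            return None
        self.expected[channel] += 1
        # only the message with the expected sequence number reaches the handler
        return handler(batch)
A recovering channel would initialize these expected sequence numbers from its last consistent checkpoint rather than from zero.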
Because machine B keeps working while machine A's channels are recovered, if we started out in a load balanced way, then this fault recovery has introduced stragglers -- nodes 4 and 5 will now finish after node 3. This is actually okay from a resource-usage point of view. Note that nowhere in this process are we wasting resources across the cluster, as seen in global synchronous rollbacks. Only the lost states need to be recomputed, in a similar vein to the fault-tolerance mechanism in Spark. In addition, fault recovery requires minimal communication with workers that did not experience a fault, minimizing fault recovery overhead. Stragglers are okay for Quokka; we will mitigate them through the dynamic scheduling mechanism described in the next section. Scheduling and Autoscaling (future work) There are two auto-scaling strategies in Quokka. The first is automatic, while the second might require some user input. Recall that Quokka is designed to expect many channels to be assigned to the same physical hardware. But first, let's talk about how Quokka schedules channels to hardware, assuming that the graph is static, and the number and type of machines are fixed. Firstly, in the current runtime API, when instantiating a task node or input node, the user manually specifies how many channels there are and where those channels go. Dynamic channel scheduling is done when programming in higher-level APIs. We observe that each channel is in effect an independent stateful operator that can be scheduled independently. However, different scheduling strategies entail different communication costs. If channel A sends a large volume of messages to channel B, then we should schedule them on the same machine. Note that contrary to intuition, there is no benefit at all in scheduling multiple channels from the same input node or task node on the same machine apart from parallelism, since they never talk to each other. Channel scheduling can be dynamic, in the sense that a channel can be moved from one physical machine to another in a very straightforward way. The self-contained nature of an actor is an oft-quoted strength of the actor model. All that needs to happen is for Quokka to transfer the state of the actor to another node (which could be done asynchronously after the transfer decision is made), and change the partition function for the channel's parents so that the appropriate physical machine receives the incoming messages. The data transfer cost is the only cost in moving an actor. Different criteria can be used to decide if a channel should be moved to another physical machine. These could include machine-specific characteristics, such as limited memory available or high CPU usage on the current machine, or the lack thereof on the other machine. Quokka can also use channel-specific information, for example if the system observes the channel transferring large amounts of data to another channel on another machine and determines that the cost of moving this channel can be overcome by the benefit in data locality achieved after the move. The stragglers introduced by fault recovery can be mitigated in this fashion. Nodes 1 and 3 will finish before nodes 2 and 4/5, creating less resource usage on machine B. The system will then try to move one of nodes 4/5 onto machine B. Manual autoscaling using combiner functions To be written. Example Applications TPC-H query 12 Pagerank Let's talk about how PageRank works in the Quokka programming model.
TaskGraph API new_input_csv (bucket, key, names, parallelism, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024) Currently, new_input_csv only supports reading a CSV in batches from an AWS S3 bucket. Required arguments in order: bucket : str. AWS S3 bucket key : str. AWS S3 key names : list of str. Column names. Note that if your rows end with a delimiter value, such as in TPC-H, you will have to end this list with a placeholder such as \"null\". Look at the TPC-H code examples under apps. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. 8 is typically a good number. Keyword arguments: ip : str. the IP address of the physical machine the input node should be placed on. Defaults to local execution. batch_func : function. the user can optionally pass in a function to execute on the input CSV chunk before it's passed off to downstream dependents. Currently the input CSV is parsed into a Pandas Dataframe, so batch_func can be any Python function that can take a Pandas Dataframe as input and produce a Pandas Dataframe. This can be used to perform predicate pushdown for SQL, for example. sep : str. delimiter dependents : list of int. an input node can depend on other input nodes, i.e. only start once another input node is done. For example, to implement a hash join where one input might depend on another, one could do the following: a = new_input_csv(...) b = new_input_csv(...,dependents=[a]) stride : int. how many bytes to read from the input S3 file at a time, defaults to 64 MB. Returns : a node id which is a handle to this input node, that can be used as the sources argument for task nodes or the dependents argument for other input nodes. new_input_parquet(bucket, key, names, parallelism, columns, skip_conditions, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024) Not yet implemented. new_task_node(sources, functionObject, parallelism, partition_key, ip='localhost') Instantiate a new task node with an executor object that defines the handler function which runs on each incoming batch. Required arguments in order: sources : dict of int -> int. the upstream sources that feed batches to this task node. Expects a dictionary, where the keys are integers and values are node ids (also stored as integers). This in effect names the source nodes, i.e. if you specify {0: source_node_id_x, 1:source_node_id_y} , from the perspective of this task node you are calling the batches coming from source_node_id_x source 0 and the batches coming from source_node_id_y source 1. You will make use of these identifiers when writing the executor class's handler function for incoming batches. functionObject : an executor object which defines the input batch handler function. More details on this in the next section. You can write your own or use a pre-supplied one from the sql, ml or graph packages. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. Think carefully about this choice. Computationally intensive tasks might benefit from parallelism, while simple tasks such as aggregation might not. partition_key : dict of int -> str. This argument expects a dictionary with a key for each key in the sources dict. It describes how the input batches should be partitioned amongst the channels. If the value is None, then the input batch is copied and broadcast to all channels.
Otherwise, currently each channel receives the sub-batch input_batch[input_batch.partition_key % parallelism == channel_id]. If this partition key is not in the input batch's columns from the specified source node, a runtime error would ensue. Keyword arguments: ip : str. the IP address of the physical machine the task node should be placed on. Defaults to local execution. Writing Your Own (Stateless) Executor Object The best place to learn how to write your own executor object classes is by looking at the available executor object classes in the SQL library. In short, an executor class is simply a child class of this base class: class StatelessExecutor: def __init__(self) -> None: raise NotImplementedError def early_termination(self): self.early_termination = True def execute(self,batch,stream_id, executor_id): raise NotImplementedError def done(self,executor_id): raise NotImplementedError The Stateless","title":"Quokka Runtime API documentation"},{"location":"runtime/#quokka-runtime-api-documentation","text":"","title":"Quokka Runtime API documentation"},{"location":"runtime/#programming-model","text":"A note about the name: the name is inspired by the Apache Flink icon, which is a chipmunk. A quokka is a marsupial that resembles a chipmunk.","title":"Programming Model"},{"location":"runtime/#motivation","text":"Popular big data processing frameworks such as Spark and Dask rely on bulk-synchronous execution on distributed datasets. Often, a map-reduce style model is adopted, where mappers perform functions on partitions of the input, the mapper outputs are shuffled into groups, and after the shuffle has fully/mostly completed , reducers start working on each group. Typically this is implemented as a pull-based model where reducers pull required data from the mappers, which persist their output in some kind of external storage (disk or network) when fault tolerance is desired. There are a couple of problems with this approach. The first, as recent works such as LinkedIn Magnet and Uber Zeus have identified, is that when each mapper doesn't have too much data for each reducer, the pull operation amounts to a bunch of random disk/network reads. This is horrible. The solution is push-based shuffles, where mappers push data to the reducers. Data can now be persisted on the reducer side for fault tolerance. However, this only addresses part of the problem. In a synchronous shuffle, even when mapper output is pushed to the reducers as soon as it is generated, the reducers can't start operating on said data until they have received nearly everything. This is because the current Map-Reduce paradigm stipulates that the reduction function is a function on all the data assigned to it from the mappers. This forces the reducers to start only after most of the mappers have completely executed, making any kind of pipelined parallel execution between the two impossible. This is unfortunate, because mappers and reducers often use very different resources (network I/O bound mappers + compute bound reducers), and can often be scheduled for parallel execution on the same physical instances without compromising the performance of either too much. Quokka's solution is to support two different kinds of reducer functions. Blocking reducers are similar to classic Map-Reduce reducers and block until they receive all mapper outputs. However, non-blocking reducers can start executing on mapper outputs as soon as they arrive, producing some output of their own and updating some local state.
For example, sort, count and aggregation are blocking reducer functions because their outputs depend on all the data. However, join, filter and projection can be implemented in a non-blocking fashion with streaming operators. Non-blocking reducers can be pipelined with other non-blocking reducers, while a blocking reducer breaks the pipeline. Mappers are treated as non-blocking reducers where the output already exists in network/disk storage. We impose some limitations on the kinds of non-blocking operators we support, which are described in detail later. Logically, one can view Quokka execution as a series of stages, where each stage starts with the output produced by a blocking operator, ends with another blocking operator, and executes non-blocking operators in between. The entire stage is executed in a pipeline-parallel fashion, and can be viewed as a pure streaming system. The stage inputs/outputs use Spark's lineage tracking based fault-tolerance and persistence mechanism. Since each Quokka stage now corresponds to a few Spark stages, Quokka also implements intra-stage fault tolerance based on checkpointing. The checkpointing recovery mechanism in Quokka conveniently avoids global asynchronous rollbacks, the bane of streaming systems, thanks to the restrictions we impose on the non-blocking operators. Quokka also aims to support autoscaling. (I have a plan to do this, but likely will not get to this until after the rotation.)","title":"Motivation"},{"location":"runtime/#execution-model","text":"The Quokka runtime API allows you to construct a task graph of nodes , which corresponds to a Quokka stage. This is very similar to other DAG-based processing frameworks such as Apache Spark or Tensorflow . For example, you can write the following code in the runtime API to execute TPC-H query 6: task_graph = TaskGraph() lineitem = task_graph.new_input_csv(bucket,key,lineitem_scheme,8,batch_func=lineitem_filter, sep=\"|\") agg_executor = AggExecutor() agged = task_graph.new_blocking_node({0:lineitem}, agg_executor, 1, {0:None}) task_graph.initialize() task_graph.run() There are perhaps a couple of things to note here. Firstly, there are two types of nodes in the runtime API. There are input nodes , declared with APIs such as new_input_csv or new_input_parquet , which interface with the external world (you can define where they will read their data), and task nodes , declared with new_non_blocking_node or new_blocking_node , which take as input the outputs generated from another node in the task graph, either an input node or another task node. Secondly, we see that the task node agged depends on the outputs from the input node lineitem . We will describe exactly what the types of lineitem and agged are later (the former is a stream and the latter is a dataset). Finally, note that the task graph ends with a blocking node. This is currently required if you want to be able to interact with the results of the task graph execution.
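The query 6 example above passes batch_func=lineitem_filter without showing that function, and uses an AggExecutor from the library. Purely as an illustration of the division of labor (predicate pushdown in the batch_func, a commutative blocking reduction in the executor), here is a hypothetical version of both; the real lineitem_filter and AggExecutor in the repository may look different.

~~~python
import pandas as pd

def lineitem_filter(df):
    # Hypothetical batch_func for TPC-H query 6: runs on each parsed CSV chunk
    # (a Pandas DataFrame) before it is pushed downstream. Assumes l_shipdate
    # arrives as an ISO-formatted string.
    df = df[(df.l_shipdate >= "1994-01-01") & (df.l_shipdate < "1995-01-01") &
            (df.l_discount >= 0.05) & (df.l_discount <= 0.07) & (df.l_quantity < 24)]
    return pd.DataFrame({"revenue": [(df.l_extendedprice * df.l_discount).sum()]})

class SumExecutor:
    # Hypothetical blocking executor: the per-batch update is commutative addition,
    # so the final result does not depend on the order in which batches arrive.
    def __init__(self):
        self.total = 0.0
    def execute(self, batch, stream_id, executor_id):
        self.total += batch["revenue"].sum()   # update state, emit nothing yet
    def done(self, executor_id):
        return pd.DataFrame({"revenue": [self.total]})   # single output at the very end
~~~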
Multiple stages are implemented with multiple task graphs, with the first node of stage 2 reading from the output of stage 1, like the following: task_graph = TaskGraph() a = task_graph.new_input_csv(\"bump\",\"a-big.csv\",[\"key\"] + [\"avalue\" + str(i) for i in range(100)],{'localhost':2}) b = task_graph.new_input_csv(\"bump\",\"b-big.csv\",[\"key\"] + [\"bvalue\" + str(i) for i in range(100)],{'localhost':2}) join_executor = OOCJoinExecutor(on=\"key\") output = task_graph.new_blocking_node({0:a,1:b},None, join_executor,{'localhost':4},{0:\"key\", 1:\"key\"}) task_graph.initialize() task_graph.run() del task_graph task_graph2 = TaskGraph() count_executor = CountExecutor() joined_stream = task_graph2.new_input_from_dataset(output,{'localhost':4}) final = task_graph2.new_blocking_node({0:joined_stream}, None, count_executor, {'localhost':4}, {0:'key'}) task_graph2.initialize() task_graph2.run() Note that since the output of a stage is persisted, as in Spark, one can delete the first task graph and still access its outputs. Since a task graph represents one Quokka stage, it strictly follows push-based execution. This means that a node does not wait for its downstream dependencies to ask for data, but instead actively pushes data to its downstream dependencies whenever some intermediate results become available. In short, execution proceeds as follows: input nodes read batches of data from a specified source, which might be an external data source or the outputs of a previous stage, and push those batches to downstream task nodes. A task node exposes a handler to process incoming batches as they arrive, possibly updating some internal state, and for each input batch possibly produces an output batch for its own downstream children. The programmer is expected to supply this handler function as an executor object (e.g. OOCJoinExecutor , AggExecutor ). Quokka provides a library of pre-implemented executor objects that the programmer can use for SQL, ML and graph analytics. Each task node can have multiple physical executors, referred to as channels . This is a form of intra-operator data parallelism, as opposed to the inter-operator pipeline parallelism that results from all task nodes executing at the same time. These physical executors all execute the same handler function, but on different portions of the input batch, partitioned by a user-specified partition function. A Map-Reduce job with M mappers and R reducers would be implemented in Quokka as a single mapper task node and a single reducer task node, where the mapper task node has M channels and the reducer task node has R channels. In the example above, we specified that the input node lineitem has 8 channels, and the task node agged has only 1 channel. The partition key was not specified ( {0:None} ) since there is no parallelism, thus no need for partitioning. The situation looks something like the following picture: Quokka keeps track of all the channels and schedules them onto physical computing resources. For the engine, two channels from different task nodes are on more or less equal footing -- they can be scheduled on the same hardware or different hardware. A channel from an input node completes execution when there are no more inputs to be read or if all of its downstream dependencies have completed execution. A channel from a task node completes execution when: all of its upstream sources have completed execution; when its execution handler decides to terminate early based on the input batch and its state (e.g.
for a task node that executes the limit operator in a limit query, it might keep as local state the buffered output, and decide to terminate when that output size surpasses the limit number); or when all its downstream dependencies have completed execution. By default, all channels start execution at once. This does not necessarily mean that they will start processing data; it means that they will all start waiting for input batches from their upstream sources to arrive. One could specify that an input node delay execution until another input node has finished. For example, to implement a hash join one might want to stream in one table to build the hash table, then stream in the other table for probing. The runtime API is meant to be very flexible and support all manner of batch and stream processing. For example, one could specify an input node that listens to a Kafka stream, some task nodes which process batches of data from that stream, and an output node that writes to another Kafka stream. In this case, since the input node will never terminate, and assuming the other nodes do not trigger early termination, the task graph will always be running. As a result of this flexibility, the runtime API requires quite a lot of knowledge to use efficiently. Therefore, we aim to provide higher level APIs to support common batch and streaming tasks in SQL, machine learning and graph analytics. Most programmers are not expected to program at the runtime API level, but rather make use of the pre-packaged higher-level APIs.","title":"Execution Model"},{"location":"runtime/#stateful-actors","text":"Let's talk more about task nodes in Quokka. Channels in task nodes can be treated as stateful operators in an actor programming model. Quokka adopts the notion of channels in a task node to specify that a group of actors all execute the same code, for fault tolerance and autoscaling purposes. One could override default Quokka behavior by simply specifying different task nodes with one channel each, all executing the same code. The key property of stateful operators in Quokka is confluence : in the context of nondeterministic message delivery, an operation on a single machine is confluent if it produces the same set of outputs for any nondeterministic ordering and batching of a set of inputs. (Hellerstein, CALM) Note that the output itself can also be produced in any order. It\u2019s easy to see that any composition of confluent operators is still confluent. We relax the confluent definition somewhat here to accept potentially different output sets, assuming they are all semantically correct. For example, an operator that implements the LIMIT N clause in SQL can admit any N of the input records it sees. More importantly, for Quokka we allow operators to depend on intra-stream ordering, just not inter-stream ordering. This means that an operator might still expect the inputs produced by a certain stream to observe some order, while there are no restrictions on the relative orderings between different input streams. Quokka as a system enforces intra-stream message order, but makes zero guarantees about inter-stream message orders. Henceforth, confluence will refer to this narrow definition, not the one defined in the CALM paper. Confluence is a very nice property to have in general, more so for streaming systems. Let\u2019s imagine a stateful operator with two different upstream operators producing messages.
It is very nice if the system\u2019s correctness does not depend on the order in which the two upstream operators produce the messages, which could depend on network delay, task scheduling, etc. This is critical for performance in a push-based framework since a node should never wait on any one of its input streams. In addition, it also greatly facilitates fault tolerance, as messages from different sources can be replayed in any order with regard to one another, as we will describe later. Confluence is perhaps the key difference between Quokka and streaming-centric systems like Flink. In Flink you can totally write pipelines where the outputs depend very strongly on the order in which the inputs are supplied. In Quokka this is not allowed. (Really at this point, it's only \"not recommended\" -- there are no checks in place to see if your actor is confluent or not. What's guaranteed is that all the operators in the libraries supplied follow this model. Enforcing this is future work.) What are some examples of confluent stateful operators? First let's categorize the world of stateful operators we'd like to implement in data analytics. As mentioned previously, there are two important categories: nonblocking and blocking . Blocking operators cannot emit any outputs to their downstream children until all of their inputs have been processed. Examples are any kind of aggregation and sort. For (naive) aggregation, the stateful operator does not know it has the final result for any of its aggregation keys until it has seen all of its inputs. For sorting, the stateful operator cannot guarantee that it would emit results in sorted order until it has received all its inputs. We call any operator that is not blocking non-blocking. Example non-blocking operators are map, filter, projection and join. Blocking operators are pipeline breakers, and negate the benefits of using a streaming framework like Quokka. Confluence is easy to reason about for blocking operators. The blocking operator emits only one output, at the very end. We just have to make sure that this output is the same regardless of the order in which we supply the operator's inputs. Since this output is typically a function of the final state, we just have to ensure that the final state is the same. If we imagine that each incoming message changes the state of the operator by a function f , then it's easy to see that as long as f is commutative this is true. For example, any kind of aggregation is commutative, the merge step in merge-sort is commutative, etc. Confluence is harder to reason about for nonblocking operators. We must guarantee that regardless of the order in which the input batches are supplied, the set of output batches does not change. Let\u2019s say we only have two incoming messages, m and n, to a node with starting state S. Giving m to the node first changes the state to f(m, S) and produces output o(m, S); then giving n changes the state to f(n, f(m, S)) and produces output o(n, f(m, S)), so the output set is {o(m, S), o(n, f(m, S))}. Confluence requires that this set be the same as the one we would get if we supplied the inputs in the reverse order. Note that this assumes that m and n are all the messages the node will see. Confluence is about eventual consistency. While in general there are many ways to achieve this kind of behavior as long as only monotonic operations are applied to the state at each input batch (Bloom), in Quokka all the stock non-blocking operators take the approach of setting the state as sets of immutable batches of data, which can only be added to.
This is clearly monotonic. If you are writing a stateful operator for Quokka, this is the recommended approach. What this means is that it is impossible to perform operations that require a specific batch amongst the set of batches, such as list indexing, since ordering of the batches in a set in the state is undefined. Most meaningful operations take the incoming message and produce an output that depends on the entire set, or not at all. An example of a confluent stateful operator in Quokka is a join. The code can be roughly summarized as follows: state0 = set() state1 = set() for each input: if input from stream0: state0.add(input) emit set(input.join(i) for i in state1) else: state1.add(input) emit set(i.join(input) for i in state0) Note that there is in fact a non-monotonic domain-specific optimization we can make that will preserve confluence in the case of a primary key join. Any input streamed in from stream0 can guarantee that any future records from that table will not have the same key value. Thus all state1 related to the record\u2019s key can be safely deleted. Quokka currently does not implement this optimization.","title":"Stateful Actors"},{"location":"runtime/#datasets-and-streams","text":"Let's talk more about how non-blocking and blocking operators work in Quokka. Blocking operators could be introduced by operations like aggregations and sort, or simply by user command when they wish to materialize data with .materialize() (similar to .cache() semantics in Spark or .compute() semantics in Dask). Such blocking operators will produce a Dataset in Quokka, while non-blocking operators will produce a Stream . Downstream operators could depend on both upstream datasets and streams. The difference is that the upstream dataset need to be completely materialized when an operator starts executing, while a stream is just a promise that batches of data will be produced at some point in the future in any order. In other words, from the perspective of the operator, it can pull data from an upstream dataset and expects data to be pushed to it from the stream. In the very first code listing for TPC-H query 6, agged is a dataset whereas lineitem is a stream. In practice, a Quokka DAG can consist of many blocking operators and non-blocking operators organized in complicated ways. For example, here is the DAG for a PageRank application: As previously described, Quokka decomposes the computation into stages, with each stage ending in the creation of a Dataset. In this case the computation will be broken into two stages, the first of which consists of the nonblocking input sparse matrix read and caching (the upper row). The second will be the bottom row. The second stage depends on the first one, so it will be launched after the first one has completed. This is very similar to how stages in Spark work. (Note that strictly speaking, every stage has to start from a Dataset too. In this case the input nodes depend on Datasets that are pre-created in S3 or Disk, and are abbreviated in this graph.) Similarly to an RDD, Quokka represents a Dataset as a collection of immutable objects, and some associated metadata on those objects, which is itself an immutable object. The objects are all stored on a shared-memory object store with persistence (currently RocksDB). When you use task_graph.add_blocking_node in Quokka, a Dataset object will be returned. You can use this Dataset object in downstream operators. 
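To make the join sketch from the Stateful Actors section above concrete, here is a minimal, hypothetical rendering in Python with Pandas DataFrames standing in for batches. It follows the append-only, confluent pattern described there and the same execute/done handler shape as the StatelessExecutor base class; it is not the library's actual join executor.

~~~python
import pandas as pd

class StreamingJoinExecutor:
    # Hypothetical confluent, non-blocking inner join on a key column.
    # State is two append-only lists of batches, so every update is monotonic and
    # the set of emitted outputs does not depend on inter-stream arrival order.
    def __init__(self, on="key"):
        self.on = on
        self.state0 = []   # batches seen so far from stream 0
        self.state1 = []   # batches seen so far from stream 1

    def execute(self, batch, stream_id, executor_id):
        if stream_id == 0:
            self.state0.append(batch)
            outputs = [batch.merge(other, on=self.on) for other in self.state1]
        else:
            self.state1.append(batch)
            outputs = [other.merge(batch, on=self.on) for other in self.state0]
        outputs = [o for o in outputs if len(o) > 0]
        return pd.concat(outputs) if outputs else None   # emit matches for this batch

    def done(self, executor_id):
        return None   # non-blocking: all outputs were already emitted in execute()
~~~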
Quokka guarantees that by the time the downstream operators execute, all the Datasets that they depend on would have been materialized in this object store. The stock Dataset class in Quokka exposes some convenience methods such as an iterator to iterate through the objects. The user could also interact directly with the object store after looking up metadata from the Dataset object. There are more specialized Dataset class implementations in Quokka like KVDataset or RangeDataset which corresponds to hash-based partitioning or range-based partitioning of objects that expose more methods. The user could also implement a custom Dataset class that descends from Dataset with even more methods. It is important to ensure that when using a Dataset in a downstream operator that also takes streaming inputs, the confluence property is respected. Unfortunately, Quokka currently does not enforce this and it's possible for you to mess this up when writing your code. Although it's not that easy to mess up, since you cannot change the objects you read from the Dataset. A downstream operator could treat the Dataset as a stream by simply invoking the iterator to iterate through the objects in the Dataset. However, for many downstream operations, it might be desirable to explicitly convert a Dataset into a Stream again (e.g. to use stock operators that only have stream-based implementations). You can do that by using the specialized task node add_input_dataset . Internally, this task node just calls the iterator repeatedly and produce a stream of batches corresponding to the objects in the Dataset.","title":"Datasets and Streams"},{"location":"runtime/#fault-tolerance-future-work","text":"The current theory is a bit complicated. I am still thinking through how this should work exactly, but hopefully the gist gets through. Given our group of confluent stateful operators, how do we achieve fault tolerance? A Quokka application can be thought of as a DAG, where each node corresponds to a channel, from one of the task nodes. Each node is assigned to a physical hardware instance. Quokka is designed to expect many nodes to be assigned to one physical instance. For example, let's imagine the following case, where the nodes circled belongs to machine A and the rest belong to machine B, and nodes 1 and 2 are channels of the input node. 3, 4 and 5 are non-blocking operators, 6 and 7 are blocking operators. Quokka follows a checkpoint-based system where each channel periodically asynchronously checkpoints its local state to persistent storage (AWS S3). Note that this is quite efficient given the types of states we typically have, such as (typically) small intermediate aggregation results and sets of batches that are monotonically added to. (This is definitely an area of future work) The problem is easy to spot: \"yes checkpoints are great, but you must turn off the entire system when a machine fails to sync it back to the latest good state, and then reapply all the inputs.\" Yes that is true for a general-purpose streaming system like Flink or Naiad. Coordinated global rollbacks really suck. But in Quokka where all the stateful operators are confluent, this need not happen. What happens when machine A dies? TLDR: machine B can keep doing work as if nothing is wrong, while machine A's workload eventually gets rescheduled. The gory details: nodes 1, 3, 6 and 7 carry on with life (they won't even know machine A just died). 1 will notice that it can no longer send messages to 4 and 5. 
That's ok, it will just buffer those messages. 3 and 6 will realize that they have fewer incoming messages now. 7 will notice that they have no more incoming messages. That's ok, they can work on their backlog. The system then goes about recovering 2, 4 and 5. It will request a new machine to schedule 2, 4 and 5, or simply schedule them to machine B. 2 is a channel of an input node, which has no state. In Quokka, all message sent between channels are tagged with a sequence number. The number starts from 0 and monotonically increases. This way, the task node discards messages with a wrong sequence number. The state of a stateful operator is also tagged with a number. The state number starts from 0 and monotonically increases every time. When an operator checkpoints, it writes its state, its state number, and the latest sequence number it expects from its inputs. A consistent checkpoint contains all this information. Quokka will look at the last consistent checkpoint of nodes 4 and 5, and find the minimum of all the latest sequence numbers across both consistent checkpoints. This is the batch that 2 will now have to start to produce. Let's say that node 4 had the smaller latest sequence number. Then node 4 will immediately start catching up. Node 5 will look at the incoming batches, find that their sequence numbers are smaller than expected, and proceed to ignore all of them. Eventually, node 5 will start recovering state as well. After both nodes catch up to the point where they died, node 6 and 7 will start accepting messages from node 4 and node 5 since now their messages have valid sequence numbers. What if in this example, node 2 was not an input node but a task node? Then the dead subgraph has no way of re-reading the input. Long story short, each node needs to buffer outgoing messages, until its children notify it that the state change affected by that outgoing message has been persisted to a checkpoint. This way, messages can be replayed when needed. All this while, machine B has been carrying on with its life. This means that if we started out in a load balanced way, then this fault recovery has introduced stragglers -- node 4 and 5 will now finish after node 3. This is actually okay from a resource-usage point of view. Note that nowhere in this process are we wasting resources across the cluster, as seen in global synchronous rollbacks. Only the lost states need to be recomputed, similar in vein to the fault-tolerance mechanism in Spark. In addition, fault recovery required minimal communication with workers that did not experience a fault, minimizing fault recovery overhead. Stragglers are okay for Quokka, we will mediate them through the dynamic scheduling mechanism described in the next section.","title":"Fault tolerance (future work)"},{"location":"runtime/#scheduling-and-autoscaling-future-work","text":"There are two auto-scaling strategies in Quokka. The first is automatic, while the second might require some user input. Recall that Quokka is designed to expect many channels to be assigned to the same physical hardware. But first, let's talk about how Quokka schedules channels to hardware, assuming that the graph is static, and the number and type of machines are fixed. Firstly, in the current runtime API, when instantiating a task node or input node, the user manually specifies how many channels are there and where those channels go. Dynamic channel scheduling is done when programming in higher-level APIs. 
We observe that each channel is in effect an independent stateful oeprator that can be scheduled independently. However, different scheduling strategies entail different communication costs. If channel A sends a large volume of messages to channel B, then we should schedule them on the same machine. Note that contrary to intuition, there is no benefit at all in scheduling multiple channels from the same input node or task node on the same machine apart from parallelism, since they never talk to each other. Channel scheduling can be dynamic, in the sense that a channel can be moved from one physical machine to another in a very straight-forward way. The self-contained nature of an actor is an oft-quoted strength of the actor model. All that needs to happen is for Quokka to transfer the state of the actor to another node (which could be done asynchronously after the transfer decision is made), and change the partition function for the channel's parents so that the appropriate physical machine receives the incoming messages. The data transfer cost is the only cost in moving an actor. Different criteria can be used to decide if a channel should be moved to another physical machine. These could include machine specific characteristics, such as limited memory available or high CPU usage on the current machine, or the lack thereof on the other machine. Quokka can also use channel-specific information, for example if the system observes the channel transfering large amounts of data to another channel on another machine and determines that the cost in moving this channel can be overcame by the benefit in data locality achieved after the move. The stragglers introduced by fault recovery can be mediated in this fashion. Node 1 and 3 will finish before node 2 and 4/5, creating less resource usage on machine B. The system will then try to move one of node 4/5 onto machine B.","title":"Scheduling and Autoscaling (future work)"},{"location":"runtime/#manual-autoscaling-using-combiner-functions","text":"To be written.","title":"Manual autoscaling using combiner functions"},{"location":"runtime/#example-applications","text":"","title":"Example Applications"},{"location":"runtime/#tpc-h-query-12","text":"","title":"TPC-H query 12"},{"location":"runtime/#pagerank","text":"Let's talk about how PageRank works in the Quokka programming model.","title":"Pagerank"},{"location":"runtime/#taskgraph-api","text":"","title":"TaskGraph API"},{"location":"runtime/#new_input_csv-bucket-key-names-parallelism-iplocalhostbatch_funcnone-sep-dependents-stride-64-1024-1024","text":"Currently, new_input_csv only supports reading a CSV in batches from an AWS S3 bucket. Required arguments in order: bucket : str. AWS S3 bucket key : str. AWS S3 key names : list of str. Column names. Note that if your rows ends with a delimiter value, such as in TPC-H, you will have to end this list with a placeholder such as \"null\". Look at the TPC-H code examples under apps. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. 8 is typically a good number. Keyword arguments: ip : str. the IP address of the physical machine the input node should be placed. Defaults to local execution. batch_func : function. the user can optionally pass in a function to execute on the input CSV chunk before it's passed off to downstream dependents. 
Currently the input CSV is parsed into a Pandas Dataframe, so batch_func can be any Python function that can take a Pandas Dataframe as input and produces a Pandas Dataframe. This can be done to perform predicate pushdown for SQL for example. sep : str. delimiter dependents : list of int. an input node can depend on other input nodes, i.e. only start once another input node is done. For example to implement as hash join where one input might depend on another, one could do the following: a = new_input_csv(...) b = new_input_csv(...,dependents=[a]) stide : int. how many bytes to read from the input S3 file to read at a time, default to 64 MB. Returns : a node id which is a handle to this input node, that can be used as the sources argument for task nodes or dependents arguments for other input nodes.","title":"new_input_csv (bucket, key, names, parallelism, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024)"},{"location":"runtime/#new_input_parquetbucket-key-names-parallelism-columns-skip_conditions-iplocalhostbatch_funcnone-sep-dependents-stride-64-1024-1024","text":"Not yet implemented.","title":"new_input_parquet(bucket, key, names, parallelism, columns, skip_conditions, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024)"},{"location":"runtime/#new_task_nodesources-functionobject-parallelism-partition_key-iplocalhost","text":"Instantiate a new task node with an executor object that defines the handler function which runs on each incoming batch. Required arguments in order: sources : dict of int -> int. the upstream sources that feed batches to this task node. Expects a dictionary, where the keys are integers and values are node ids (also stored as integers). This in effect names the source nodes. i.e. if you specify {0: source_node_id_x, 1:source_node_id_y} , from the perspective of this task node you are calling the batches coming from source_node_id_x source 0 and the batches coming from node_id_y source 1. You will make use of these identifiers writing the executor class's handler function for incoming batches. functionObject : an executor object which defines the input batch handler function. More details on this in the next section. You can write your own or use a pre-supplied one from the sql, ml or graph packages. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. Think carefully about this choice. Computationally intensive tasks might benefit from parallelism, while simple tasks such as aggregation might not. partition_key : dict of int -> in. This argument expects a dictionary with a key for each key in the sources dict. It describes how the input batches should be partitioned amongst the channels. If the value is None, then the input batch is copied and broadcast to all channels. Otherwise, currently each channel receives the sub-batch input_batch[input_batch.partition_key % parallelism == channel_id]. If this partition key is not in the input batch's columns from the specified source node, a runtime error would ensue. Keyword arguments: ip : str. the IP address of the physical machine the input node should be placed. 
Defaults to local execution.","title":"new_task_node(sources, functionObject, parallelism, partition_key, ip='localhost')"},{"location":"runtime/#writing-your-own-stateless-executor-object","text":"The best place to learn how to write your own executor object classes is by looking at the available executor object classes in the SQL library. In short, an executor class is simply a child class of this base class: class StatelessExecutor: def __init__(self) -> None: raise NotImplementedError def early_termination(self): self.early_termination = True def execute(self,batch,stream_id, executor_id): raise NotImplementedError def done(self,executor_id): raise NotImplementedError The Stateless","title":"Writing Your Own (Stateless) Executor Object"},{"location":"simple/","text":"Tutorials This section is for learning how to use Quokka's DataStream API. Quokka's DataStream API is basically a dataframe API. It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to \"collect\" the result, just like Spark. For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here . It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. The SQL queries themselves can be found on this awesome interface . These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- if you find a bug in this tutorial I will buy you a cup of coffee! For an extensive API reference, please refer to here . Lesson -1: Things Please read the Getting Started section. I spent way too much time making the cartoons on that page. Lesson 0: Reading Things For every Quokka program, we need to set up a QuokkaContext object. This is similar to the Spark SQLContext . This can easily be done by running the following two lines of code in your Python terminal. from pyquokka.df import * qc = QuokkaContext() Once we have the QuokkaContext object, we can start reading data to obtain DataStreams. Quokka can read data on disk and on the cloud (currently S3). For the purposes of this tutorial we will be reading data from disk. Quokka currently reads CSV and Parquet, with plans to add JSON soon. Here is how you would read a CSV file if you know the schema: # the last column is called NULL, because the TPC-H data generator likes to put a | at the end of each row, making it appear as if there is a final column # with no values. Don't worry, we can drop this column. 
lineitem_scheme = [\"l_orderkey\",\"l_partkey\",\"l_suppkey\",\"l_linenumber\",\"l_quantity\",\"l_extendedprice\", \"l_discount\",\"l_tax\",\"l_returnflag\",\"l_linestatus\",\"l_shipdate\",\"l_commitdate\",\"l_receiptdate\",\"l_shipinstruct\",\"l_shipmode\",\"l_comment\", \"null\"] lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", lineitem_scheme, sep=\"|\") And if you don't know the schema but there is a header row where column names are separated with the same separator as the data : lineitem = qc.read_csv(disk_path + \"lineitem.tbl.named\", sep=\"|\", has_header=True) You can also read a directory of CSV files: lineitem = qc.read_csv(disk_path + \"lineitem/*\", lineitem_scheme, sep=\"|\", has_header = True) Now let's read all the tables of the TPC-H benchmark suite. Set disk_path to where you unzipped the files. lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", sep=\"|\", has_header=True) orders = qc.read_csv(disk_path + \"orders.tbl\", sep=\"|\", has_header=True) customer = qc.read_csv(disk_path + \"customer.tbl\",sep = \"|\", has_header=True) part = qc.read_csv(disk_path + \"part.tbl\", sep = \"|\", has_header=True) supplier = qc.read_csv(disk_path + \"supplier.tbl\", sep = \"|\", has_header=True) partsupp = qc.read_csv(disk_path + \"partsupp.tbl\", sep = \"|\", has_header=True) nation = qc.read_csv(disk_path + \"nation.tbl\", sep = \"|\", has_header=True) region = qc.read_csv(disk_path + \"region.tbl\", sep = \"|\", has_header=True) If you want to read the Parquet files, you should first run this script to generate the Parquet files: import polars as pl disk_path = \"/home/ubuntu/tpc-h/\" #replace files = [\"lineitem.tbl\",\"orders.tbl\",\"customer.tbl\",\"part.tbl\",\"supplier.tbl\",\"partsupp.tbl\",\"nation.tbl\",\"region.tbl\"] for file in files: df = pl.read_csv(disk_path + file,sep=\"|\",has_header = True, parse_dates = True).drop(\"null\") df.write_parquet(disk_path + file.replace(\"tbl\", \"parquet\"), row_group_size=100000) To read in a Parquet file, you don't have to worry about headers or schema, just do: lineitem = qc.read_parquet(disk_path + \"lineitem.parquet\") Currently, qc.read_csv and qc.read_parquet will either return a DataStream or just a Polars DataFrame directly if the data size is small (set at 10 MB). Lesson 1: Doing Things Now that we have read the data, let's do things with it. First, why don't we count how many rows there are in the lineitem table. >>> lineitem.aggregate({\"*\":\"count\"}).collect() If you don't see the number 6001215 after a while, something is very wrong. Please send me an email, I will help you fix things (and buy you a coffee): zihengw@stanford.edu. Feel free to type other random things and see if it's supported, but for those interested, let's follow a structured curriculum. Let's take a look at TPC-H query 1 . This is how you would write it in Quokka. This is very similar to how you'd write in another DataFrame library like Polars or Dask. 
def do_1(): d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") d = d.with_column(\"disc_price\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) d = d.with_column(\"charge\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]) * (1 + x[\"l_tax\"]), required_columns={\"l_extendedprice\", \"l_discount\", \"l_tax\"}) f = d.groupby([\"l_returnflag\", \"l_linestatus\"], orderby=[\"l_returnflag\",\"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"charge\":\"sum\", \"l_discount\":\"avg\",\"*\":\"count\"}) return f.collect() Quokka supports filtering DataStreams by DataStream.filter() . Filters can be specified in SQL syntax. The columns in the SQL expression must exist in the schema of the DataStream. A more Pythonic way of doing this like b = b[b.a < 5] isn't supported yet, mainly due to the finickiness surrounding date types etc. The result of a filter() is another DataStream whose Polars DataFrames will only contain rows that respect the predicate. On the plus side, Quokka uses the amazing SQLGlot library to support most ANSI-SQL compliant predicates, including dates, between, IN, even arithmetic in conditions. Try out some different predicates ! Please give SQLGlot a star when you're at it. For example, you can specify this super complicated predicate for TPC-H query 6 : def do_6(): d = lineitem.filter(\"l_shipdate >= date '1994-01-01' and l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * x[\"l_discount\"], required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.aggregate({\"revenue\":[\"sum\"]}) return f.collect() Quokka supports creating new columns in DataStreams with with_column . Read more about how this works here . This is in principle similar to Spark df.with_column and Pandas UDFs. The main thing to keep in mind is that the function you supply will be applied to each batch in the DataStream, instead of row by row. As a result, you can make use of fast vectorized execution with Polars. The mental model here is that we have a DataStream d of Polars DataFrames, each of which have rows from the lineitem table satisfying the filter predicate. Then, each Polars DataFrame is transformed by our functions to add the columns disk_price and charge . Like most Quokka operations, with_column will produce a new DataStream with an added column and is not inplace. This means that the command is lazy, and won't trigger the runtime to produce the actual data. It simply builds a logical plan of what to do in the background, which can be optimized when the user specifically ask for the result. Finally, we can group the DataStream and aggregate it to get the result. Read more about aggregation syntax here . The aggregation will produce another DataStream, which we call collect() on, to convert it to a Polars DataFrame in your Python terminal. When you call .collect() , the logical plan you have built is actually optimized and executed. This is exactly how Spark works. To view the optimized logical plan and learn more about what Quokka is doing, you can do f.explain() which will produce a graph, or f.explain(mode=\"text\") which will produce a textual explanation. Joins work very intuitively. For example, this is how to do TPC-H query 12 . 
def do_12(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = d.filter(\"l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \\ l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'\") d = d.with_column(\"high\", lambda x: (x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns={\"o_orderpriority\"}) d = d.with_column(\"low\", lambda x: (x[\"o_orderpriority\"] != \"1-URGENT\") & (x[\"o_orderpriority\"] != \"2-HIGH\"), required_columns={\"o_orderpriority\"}) f = d.groupby(\"l_shipmode\").aggregate(aggregations={'high':['sum'], 'low':['sum']}) return f.collect() Note it does not matter if you filter after the join or before the join, Quokka will automatically push them down during the logical plan optimization. The join operator on a DataStream takes in either another DataStream or a Polars DataFrame in your Python session. In the latter case, this Polars DataFrame will be broadcasted to different workers similar to Spark's broadcast join. Here is another example, TPC-H query 3 . def do_3(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = customer.join(d,left_on=\"c_custkey\", right_on=\"o_custkey\") d = d.filter(\"c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * ( 1 - x[\"l_discount\"]) , required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.groupby([\"l_orderkey\",\"o_orderdate\",\"o_shippriority\"]).agg({\"revenue\":[\"sum\"]}) return f.collect() Note unlike some SQL engines, Quokka currently will not try to figure out the optimal join ordering between the specified three-way join between lineitem, orders and customer tables. You are responsible for figuring that out at the moment -- try to join smaller tables first and then join them against larger tables, or try to minimize the intermeidate result size from those joins. An important thing to note is that Quokka currently only support inner joins. Other kinds of joins are coming soon. Feel free to look at some other queries in the Quokka github , or browse the API reference . While you are there, please give Quokka a star! Lesson 2: Writing Things So far, we have just learned about Lesson 3: Things you can't do. Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped. Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation.","title":"DataStream API"},{"location":"simple/#tutorials","text":"This section is for learning how to use Quokka's DataStream API. Quokka's DataStream API is basically a dataframe API. It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to \"collect\" the result, just like Spark. 
For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here . It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. The SQL queries themselves can be found on this awesome interface . These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- if you find a bug in this tutorial I will buy you a cup of coffee! For an extensive API reference, please refer to here .","title":"Tutorials"},{"location":"simple/#lesson-1-things","text":"Please read the Getting Started section. I spent way too much time making the cartoons on that page.","title":"Lesson -1: Things"},{"location":"simple/#lesson-0-reading-things","text":"For every Quokka program, we need to set up a QuokkaContext object. This is similar to the Spark SQLContext . This can easily be done by running the following two lines of code in your Python terminal. from pyquokka.df import * qc = QuokkaContext() Once we have the QuokkaContext object, we can start reading data to obtain DataStreams. Quokka can read data on disk and on the cloud (currently S3). For the purposes of this tutorial we will be reading data from disk. Quokka currently reads CSV and Parquet, with plans to add JSON soon. Here is how you would read a CSV file if you know the schema: # the last column is called NULL, because the TPC-H data generator likes to put a | at the end of each row, making it appear as if there is a final column # with no values. Don't worry, we can drop this column. lineitem_scheme = [\"l_orderkey\",\"l_partkey\",\"l_suppkey\",\"l_linenumber\",\"l_quantity\",\"l_extendedprice\", \"l_discount\",\"l_tax\",\"l_returnflag\",\"l_linestatus\",\"l_shipdate\",\"l_commitdate\",\"l_receiptdate\",\"l_shipinstruct\",\"l_shipmode\",\"l_comment\", \"null\"] lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", lineitem_scheme, sep=\"|\") And if you don't know the schema but there is a header row where column names are separated with the same separator as the data : lineitem = qc.read_csv(disk_path + \"lineitem.tbl.named\", sep=\"|\", has_header=True) You can also read a directory of CSV files: lineitem = qc.read_csv(disk_path + \"lineitem/*\", lineitem_scheme, sep=\"|\", has_header = True) Now let's read all the tables of the TPC-H benchmark suite. Set disk_path to where you unzipped the files. 
lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", sep=\"|\", has_header=True) orders = qc.read_csv(disk_path + \"orders.tbl\", sep=\"|\", has_header=True) customer = qc.read_csv(disk_path + \"customer.tbl\",sep = \"|\", has_header=True) part = qc.read_csv(disk_path + \"part.tbl\", sep = \"|\", has_header=True) supplier = qc.read_csv(disk_path + \"supplier.tbl\", sep = \"|\", has_header=True) partsupp = qc.read_csv(disk_path + \"partsupp.tbl\", sep = \"|\", has_header=True) nation = qc.read_csv(disk_path + \"nation.tbl\", sep = \"|\", has_header=True) region = qc.read_csv(disk_path + \"region.tbl\", sep = \"|\", has_header=True) If you want to read the Parquet files, you should first run this script to generate the Parquet files: import polars as pl disk_path = \"/home/ubuntu/tpc-h/\" #replace files = [\"lineitem.tbl\",\"orders.tbl\",\"customer.tbl\",\"part.tbl\",\"supplier.tbl\",\"partsupp.tbl\",\"nation.tbl\",\"region.tbl\"] for file in files: df = pl.read_csv(disk_path + file,sep=\"|\",has_header = True, parse_dates = True).drop(\"null\") df.write_parquet(disk_path + file.replace(\"tbl\", \"parquet\"), row_group_size=100000) To read in a Parquet file, you don't have to worry about headers or schema, just do: lineitem = qc.read_parquet(disk_path + \"lineitem.parquet\") Currently, qc.read_csv and qc.read_parquet will either return a DataStream or just a Polars DataFrame directly if the data size is small (set at 10 MB).","title":"Lesson 0: Reading Things"},{"location":"simple/#lesson-1-doing-things","text":"Now that we have read the data, let's do things with it. First, why don't we count how many rows there are in the lineitem table. >>> lineitem.aggregate({\"*\":\"count\"}).collect() If you don't see the number 6001215 after a while, something is very wrong. Please send me an email, I will help you fix things (and buy you a coffee): zihengw@stanford.edu. Feel free to type other random things and see if it's supported, but for those interested, let's follow a structured curriculum. Let's take a look at TPC-H query 1 . This is how you would write it in Quokka. This is very similar to how you'd write in another DataFrame library like Polars or Dask. def do_1(): d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") d = d.with_column(\"disc_price\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) d = d.with_column(\"charge\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]) * (1 + x[\"l_tax\"]), required_columns={\"l_extendedprice\", \"l_discount\", \"l_tax\"}) f = d.groupby([\"l_returnflag\", \"l_linestatus\"], orderby=[\"l_returnflag\",\"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"charge\":\"sum\", \"l_discount\":\"avg\",\"*\":\"count\"}) return f.collect() Quokka supports filtering DataStreams by DataStream.filter() . Filters can be specified in SQL syntax. The columns in the SQL expression must exist in the schema of the DataStream. A more Pythonic way of doing this like b = b[b.a < 5] isn't supported yet, mainly due to the finickiness surrounding date types etc. The result of a filter() is another DataStream whose Polars DataFrames will only contain rows that respect the predicate. On the plus side, Quokka uses the amazing SQLGlot library to support most ANSI-SQL compliant predicates, including dates, between, IN, even arithmetic in conditions. Try out some different predicates ! 
Please give SQLGlot a star when you're at it. For example, you can specify this super complicated predicate for TPC-H query 6 : def do_6(): d = lineitem.filter(\"l_shipdate >= date '1994-01-01' and l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * x[\"l_discount\"], required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.aggregate({\"revenue\":[\"sum\"]}) return f.collect() Quokka supports creating new columns in DataStreams with with_column . Read more about how this works here . This is in principle similar to Spark df.with_column and Pandas UDFs. The main thing to keep in mind is that the function you supply will be applied to each batch in the DataStream, instead of row by row. As a result, you can make use of fast vectorized execution with Polars. The mental model here is that we have a DataStream d of Polars DataFrames, each of which have rows from the lineitem table satisfying the filter predicate. Then, each Polars DataFrame is transformed by our functions to add the columns disk_price and charge . Like most Quokka operations, with_column will produce a new DataStream with an added column and is not inplace. This means that the command is lazy, and won't trigger the runtime to produce the actual data. It simply builds a logical plan of what to do in the background, which can be optimized when the user specifically ask for the result. Finally, we can group the DataStream and aggregate it to get the result. Read more about aggregation syntax here . The aggregation will produce another DataStream, which we call collect() on, to convert it to a Polars DataFrame in your Python terminal. When you call .collect() , the logical plan you have built is actually optimized and executed. This is exactly how Spark works. To view the optimized logical plan and learn more about what Quokka is doing, you can do f.explain() which will produce a graph, or f.explain(mode=\"text\") which will produce a textual explanation. Joins work very intuitively. For example, this is how to do TPC-H query 12 . def do_12(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = d.filter(\"l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \\ l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'\") d = d.with_column(\"high\", lambda x: (x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns={\"o_orderpriority\"}) d = d.with_column(\"low\", lambda x: (x[\"o_orderpriority\"] != \"1-URGENT\") & (x[\"o_orderpriority\"] != \"2-HIGH\"), required_columns={\"o_orderpriority\"}) f = d.groupby(\"l_shipmode\").aggregate(aggregations={'high':['sum'], 'low':['sum']}) return f.collect() Note it does not matter if you filter after the join or before the join, Quokka will automatically push them down during the logical plan optimization. The join operator on a DataStream takes in either another DataStream or a Polars DataFrame in your Python session. In the latter case, this Polars DataFrame will be broadcasted to different workers similar to Spark's broadcast join. Here is another example, TPC-H query 3 . 
def do_3(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = customer.join(d,left_on=\"c_custkey\", right_on=\"o_custkey\") d = d.filter(\"c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * ( 1 - x[\"l_discount\"]) , required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.groupby([\"l_orderkey\",\"o_orderdate\",\"o_shippriority\"]).agg({\"revenue\":[\"sum\"]}) return f.collect() Note that unlike some SQL engines, Quokka currently will not try to figure out the optimal join ordering for the specified three-way join between the lineitem, orders and customer tables. You are responsible for figuring that out at the moment -- try to join smaller tables first and then join them against larger tables, or try to minimize the intermediate result size from those joins. An important thing to note is that Quokka currently only supports inner joins. Other kinds of joins are coming soon. Feel free to look at some other queries in the Quokka github , or browse the API reference . While you are there, please give Quokka a star!","title":"Lesson 1: Doing Things"},{"location":"simple/#lesson-2-writing-things","text":"So far, we have just learned about reading data into DataStreams and transforming them.","title":"Lesson 2: Writing Things"},{"location":"simple/#lesson-3-things-you-cant-do","text":"Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact that it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped. Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation.","title":"Lesson 3: Things you can't do."},{"location":"started/","text":"Getting Started Quokka in Three Cartoons The fundamental concept in Quokka is a stream of Polars DataFrames , which we call a DataStream . A Polars DataFrame is basically a Pandas DataFrame, except that it's backed by Apache Arrow and supports fast compute with Polars . Readers familiar with Spark RDDs can interpret a DataStream as an RDD where data partitions are materialized in sequence. In contrast to Spark, partitions can be consumed as soon as they are generated. This facilitates pipelining between multiple data processing stages and is the primary reason why Quokka is fast. The user defines input readers that generate a DataStream from a dataset. For example, Quokka's cloud CSV reader generates a DataStream from an S3 bucket of CSV files. The user can also define stateful operators that operate on one or more DataStreams to produce one more DataStream. Finally, a DataStream can be written to an output sink , which could be a distributed in-memory dataset that can be converted to Pandas or stable storage on disk or S3. In this illustration, the bush produces a DataStream of leaves and the forest produces a DataStream of acorns. The brown quokka consumes those two streams and magically turns them into a stream of strawberries. The grey quokka takes in this stream of strawberries, slices them up and puts them in a salad bowl. Unfortunately, people like us can't slice strawberries for a living and have to process tables of numbers.
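To make the join-ordering advice above concrete, here is one possible reordering of the query 3 joins that starts from the smaller customer and orders tables before bringing in lineitem. This is only a sketch; whether it is actually faster depends on your data.

~~~python
def do_3_reordered():
    # Join the two smaller tables first, then join the result against the large lineitem table.
    d = customer.join(orders, left_on="c_custkey", right_on="o_custkey")
    d = d.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
    d = d.filter("c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'")
    d = d.with_column("revenue", lambda x: x["l_extendedprice"] * (1 - x["l_discount"]),
                      required_columns={"l_extendedprice", "l_discount"})
    f = d.groupby(["l_orderkey", "o_orderdate", "o_shippriority"]).agg({"revenue": ["sum"]})
    return f.collect()
~~~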
Quokka exposes useful primitives that allow you to filter, aggregate and join DataStreams, similar to what you can do in Pandas or Spark. Please look at the tutorials to learn more. It would be a dismal world if there is only one quokka of each kind. Quokka supports parallelism for stateful operators with channels , which are parallel instantiations of a stateful operator to achieve data parallelism. Input sources can also have channels to parallelize the reading of a data source. For example, we can have two bushes and two forests, and four brown quokkas. While the user can manually specify the number of channels they want for operators, in most cases it's automagically decided for you based on what you are doing, similar to Spark. At its core, Quokka uses Ray actors. Each channel in an input source or stateful operator constitutes an actor that can be scheduled independently to a machine in a cluster. Actors on the same machine talk to each other through memory while actors on different machines communicate through the network. An example scheduling of our quokkas is shown below. The user also shouldn't have to worry about this scheduling in most cases if using the DataStream API. However I couldn't resist making this cartoon, and it might be cool to know how Quokka works under the hood. Installation If you plan on trying out Quokka for whatever reason, I'd love to hear from you. Please send an email to zihengw@stanford.edu or join the Discord . Quokka can be installed as a pip package: pip3 install pyquokka However it needs the latest version of Redis (at least 6.0), which you can get by running the following in sudo: curl https://packages.redis.io/gpg | apt-key add - echo \"deb https://packages.redis.io/deb $(lsb_release -cs) main\" | tee /etc/apt/sources.list.d/redis.list apt-get update apt-get install redis If you only plan on running Quokka locally, you are done. Here is a 10 min lesson on how it works. If you plan on using Quokka for cloud, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under pyquokka.utils which allows you to manager clusters and connect to them. It assumes that awscli is configured locally and you have a keypair and a security group with the proper configurations. To set these things up, you can follow the AWS guide . More detailed instructions can be found in Setting Up Cloud Cluster . Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Image credits: some icons taken from flaticon.com.","title":"Getting Started"},{"location":"started/#getting-started","text":"","title":"Getting Started"},{"location":"started/#quokka-in-three-cartoons","text":"The fundamental concept in Quokka is a stream of Polars DataFrames , which we call a DataStream . A Polars DataFrame is basically a Pandas DataFrame, except that it's backed by Apache Arrow and supports fast compute with Polars . Readers familiar with Spark RDDs can interpret a DataStream as an RDD where data partitions are materialized in sequence. In contrast to Spark, partitions can be consumed as soon as they are generated. This facilitates pipelining between multiple data processing stages and is the primary reason why Quokka is fast. The user defines input readers that generate a DataStream from a dataset. For example, Quokka's cloud CSV reader generates a DataStream from an S3 bucket of CSV files. 
The user can also define stateful operators that operate on one or more DataStreams to produce one more DataStream. Finally a DataStream could be written to an output sink , which could be a distributed in-memory dataset that can be converted to Pandas or stable storage on disk or S3. In this illustration, the bush produces a DataStream of leaves and the forest produces a DataStream of acorns. The brown quokka consumes those two streams and magically turn it into a stream of strawberries. The grey quokka takes in this stream of strawberries, slices them up and puts them in a salad bowl. Unfortunately, people like us can't slice strawberries for a living and have to process tables of numbers. Quokka exposes useful primitives that allow you to filter, aggregate and join DataStreams, similar to what you can do in Pandas or Spark. Please look at the tutorials to learn more. It would be a dismal world if there is only one quokka of each kind. Quokka supports parallelism for stateful operators with channels , which are parallel instantiations of a stateful operator to achieve data parallelism. Input sources can also have channels to parallelize the reading of a data source. For example, we can have two bushes and two forests, and four brown quokkas. While the user can manually specify the number of channels they want for operators, in most cases it's automagically decided for you based on what you are doing, similar to Spark. At its core, Quokka uses Ray actors. Each channel in an input source or stateful operator constitutes an actor that can be scheduled independently to a machine in a cluster. Actors on the same machine talk to each other through memory while actors on different machines communicate through the network. An example scheduling of our quokkas is shown below. The user also shouldn't have to worry about this scheduling in most cases if using the DataStream API. However I couldn't resist making this cartoon, and it might be cool to know how Quokka works under the hood.","title":"Quokka in Three Cartoons"},{"location":"started/#installation","text":"If you plan on trying out Quokka for whatever reason, I'd love to hear from you. Please send an email to zihengw@stanford.edu or join the Discord . Quokka can be installed as a pip package: pip3 install pyquokka However it needs the latest version of Redis (at least 6.0), which you can get by running the following in sudo: curl https://packages.redis.io/gpg | apt-key add - echo \"deb https://packages.redis.io/deb $(lsb_release -cs) main\" | tee /etc/apt/sources.list.d/redis.list apt-get update apt-get install redis If you only plan on running Quokka locally, you are done. Here is a 10 min lesson on how it works. If you plan on using Quokka for cloud, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under pyquokka.utils which allows you to manager clusters and connect to them. It assumes that awscli is configured locally and you have a keypair and a security group with the proper configurations. To set these things up, you can follow the AWS guide . More detailed instructions can be found in Setting Up Cloud Cluster . Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Image credits: some icons taken from flaticon.com.","title":"Installation"},{"location":"tutorial/","text":"Advanced Tutorials This section is for learning how to use Quokka's graph level API. 
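After the local install above, a minimal smoke test looks roughly like the following. The TPC-H path is a placeholder for wherever you extracted the tutorial data, and the import assumes QuokkaContext is exposed by pyquokka.df as in the API reference.

~~~python
from pyquokka.df import QuokkaContext

qc = QuokkaContext()  # local execution context

# Placeholder path -- point this at wherever you extracted the TPC-H files.
lineitem = qc.read_csv("/home/ubuntu/tpc-h/lineitem.tbl", sep="|", has_header=True)
print(lineitem.aggregate({"*": "count"}).collect())  # 6001215 for the scale factor 1 dataset
~~~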
This is expected for use cases where the dataframe API cannot satisfy your needs. Most users are not expected to program at this level. You should contact me: zihengw@stanford.edu if you want to do this. You should probably stop reading now, unless you are a Stanford undergrad or masters student (or somebody else) who somehow decided to work with me on Quokka. The code for the tutorials can be found under apps/tutorials . They might perform meaningless tasks or perform tasks which you shoudn't necessarily use Quokka for, but they will showcase how Quokka works. I wrote Quokka. As a result I might take some things for granted that you might not. If you spot a typo or find some sections too difficult to understand, I would appreciate your feedback! Better yet, the docs are also open source under quokka/docs, so you can also make a PR. Lesson 0: Addition Let's walk through our first Quokka program. This first example defines an input reader which produces a stream of numbers, and a stateful operator which adds them up. Please read the comments in the code. Let's first look at the import section. # we need to import Quokka specific objects. A TaskGraph is always needed in a program # that uses the DAG runtime API. We will define a TaskGraph by defining input readers # and stateful operators and adding them to the TaskGraph. Then we will execute the TaskGraph. from pyquokka.quokka_runtime import TaskGraph # Quokka also needs a notion of the compute substrate the TaskGraph is executing on. # LocalCluster is meant for single-machine execution. For distributed execution, # you would need to import QuokkaClusterManager and create a new cluster or initialize # one from a json config. from pyquokka.utils import LocalCluster # Executor is an abstract class which you should extend to implement your own executors. # Quokka also provides canned executors which you call import from pyquokka.executors such # as joins, sort and asof_join. from pyquokka.executors import Executor import time # define a LocalCluster execution context. This will make a cluster object with information # such as local core count etc. cluster = LocalCluster() Quokka provides many optimized input readers for different input data formats. However, in this tutorial we are going to define a custom input reader class to showcase how the input reader works. The mindset here is that there will be many channels of this input reader (by default equal to the number of cores in the cluster), and each channel will have its own copy of an object of this class. They will all be initialized in the same way, but when each channel calls the get_next_batch method of its own object, the channel argument supplied will be different. class SimpleDataset: # the object will be initialized once locally. You can define whatever attributes you want. # You can also set attributes to None if they will be supplied later by the framework # in set_num_channels method def __init__(self, limit) -> None: self.limit = limit self.num_channels = None # this is an optional method that will be called by the runtime on this object during # TaskGraph construction, if the method exists. This mainly updates the num_channel # attribute of the object. For some input readers what a channel produces is independent # of the total number of channels, and they don't have to implement this method. Other # input readers might need to perform additional computation upon learning the total # number of channels, such as byte ranges to read in a CSV file. 
# # This method can be used to set additional class attributes. The programmer could # do that in the __init__ method too, if she knows the total number of channels # and does not want to rely on Quokka's default behavior etc. def set_num_channels(self, num_channels): self.num_channels = num_channels # the get_next_batch method defines an iterator. Each channel will iterate through # its own copy of the object's get_next_batch method, with the channel argument # set to its own channel id. In this example, if there are N channels, channel k # will yield numbers k, k + N, k + 2N, all the way up to the limit. # Note that the get_next_batch method takes an optional parameter pos, and yields # two objects, with the first being None here. Let's not worry about these things # for the time being. They are used for Quokka's parallelized fault recovery. def get_next_batch(self, channel, pos=None): assert self.num_channels is not None curr_number = channel while curr_number < self.limit: yield None, curr_number curr_number += self.num_channels Now that we defined the input reader, we are going to define the stateful operator. Similar to the input reader, we define a Python class. All channels of the stateful operator will have a copy of an object of this class. The stateful operator exposes two important methods, execute and done , which might produce outputs for more downstream stateful operators. execute is called whenever upstream input reader channels have produced some input batches for the stateful operator channel to process. done is called when the stateful operator channel knows it will no longer receive any more inputs and has already processed all the inputs it has. Our stateful operator here adds up all the elements in an input stream and returns the sum. class AddExecutor(Executor): # initialize state. This will be done locally. This initial state will be copied # along with the object to all the channels. def __init__(self) -> None: self.sum = 0 # the execute method takes three arguments. The first argument batches, is a list of # batches from an input QStream, which could be the output of an input reader or another # stateful operator. The items in the batch could have come from one channel, several, # or all of them! it is best practice that the stateful operator doesn't make # any assumptions on where these batches originated, except that they belong # to the same QStream. # the second argument, stream_id, is used to identify the QStream the batches came from. # in this example we only have one input QStream so we can ignore this argument. # the third argument, channel, denotes the channel id of the channel executing the object # similar to the argument for the input reader. Here we also don't use this argument. def execute(self,batches,stream_id, channel): for batch in batches: assert type(batch) == int self.sum += batch # note that we can't return anything in our execute method. We don't know what the sum is # until we have seen all of the elements in the input QStream. # done only has one argument, which is the channel. It can return an element or an iterator # of elements. def done(self,channel): print(\"I am executor \", channel, \" my sum is \", self.sum) return self.sum Now that we have defined our input reader and stateful operator, we can hook them up together in a TaskGraph. Defining the TaskGraph requires a cluster object, which is LocalCluster here but can be an S3Cluster or AzureCluster for cloud deployments. 
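To see the channel semantics of this input reader concretely, you can drive the iterator by hand outside of Quokka. This is just a local illustration of the class defined above, not something the runtime asks you to do.

~~~python
# Manually exercise SimpleDataset to see what each channel would yield.
reader = SimpleDataset(limit=8)
reader.set_num_channels(2)

print([x for _, x in reader.get_next_batch(0)])  # channel 0 yields [0, 2, 4, 6]
print([x for _, x in reader.get_next_batch(1)])  # channel 1 yields [1, 3, 5, 7]
~~~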
We will then initialize the objects for the input reader and stateful operators. Again, we initialize one object, which will be copied to each channel. We can now add the input reader and stateful operator to our TaskGraph. task_graph = TaskGraph(cluster) reader = SimpleDataset(80) # define a new input reader in our TaskGraph. numbers is a QStream. numbers = task_graph.new_input_reader_node(reader) executor = AddExecutor() # define a new blocking node. A blocking node writes out its results in a materialized Dataset # object instead of producing a QStream. Note the first argument is a dictionary. This assigns # each input stream an internal name, which corresponds to the stream_id field in the execute # method. Since we called the numbers QStream 0, when execute is called on batches from this QStream, # the stream_id argument will be 0. sum = task_graph.new_blocking_node({0:numbers},executor) # create() must be called before run() task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) # we can call to_list() on a Dataset object to collect its elements, which will simply be all # the objects returned by the blocking node's execute and done methods. print(sum.to_list()) Here we used new_blocking_node to define the stateful operator in the TaskGraph. The TaskGraph exposes two different APIs: new_nonblocking_node and new_blocking node . The former will put their outputs in a QStream, which could be consumed by downstream operators immediately, while the latter will materialize the outputs into a Dataset object. Downstream operators cannot read a Dataset until it's complete. This is intimately related to the idea of nonblocking vs blocking stateful operators. Some operators such as streaming join can emit valid outputs as soon as they have seen partial inputs, while other operators like aggregation must wait until seeing all of the input before emitting any partial output. However, you could define a nonblocking operator as a new_blocking_node , if you want to materialize its outputs instead of streaming them forward, e.g. to limit pipeline depth. You could also define a blocking operator as a new_nonblocking_node , the QStream will just consist of the elements returned during the done method (which could return an iterator). The TaskGraph also exposes an API to define stateless operators: new_task . This defines a stateless transform on a QStream and is very similar to Spark's map . We will cover this in a later tutorial to showcase deep learning inference. Note that we covered most of the important concepts covered in the getting started cartoons. However the astute reader would notice that we didn't define a partition function here, nor did we specify how many channels of the input reader or the stateful operator to launch. The answer is that Quokka tries to provide suitable defaults for these things. Quokka currently launches one channel per core for input readers, and one channel per machine for stateful operators. These defaults are subject to change and you shouldn't rely on them. Quokka's default partition function is to send all the outputs generated by a channel to the channel of the target on the same machine. Lesson 1: Joins If you think the first lesson was too complicated, it proably was. This is because we had to define custom input readers and stateful operators. Hopefully in the process you learned a few things about how Quokka works. 
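One way to sanity-check this example: each executor channel returns its own partial sum from done(), so the Dataset holds one element per channel. Summing those elements locally should give the total of 0 through 79. This is a sketch; how many elements you see depends on how many channels Quokka launched.

~~~python
import builtins  # the built-in sum() is shadowed by the Dataset named `sum` above

partial_sums = sum.to_list()       # one partial sum per executor channel
print(partial_sums)
print(builtins.sum(partial_sums))  # 3160 == 0 + 1 + ... + 79 for SimpleDataset(80)
~~~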
In most scenarios, it is my hope that you don't have to define custom objects, and use canned implementations which you can just import. This is similar to how Tensorflow or Pytorch works. If you know how to import torch.nn.Conv2d , you get the idea. Here, we are going to take two CSVs on Disk, join them, and count the number of records in the result: select count(*) from a and b where a.key = b.key . You can use the a.csv and b.csv provided in the apps/tutorials folder, or you can supply your own and change the CSV input reader arguments appropriately. Without further ado, here's the code with comments: import time from pyquokka.quokka_runtime import TaskGraph from pyquokka.executors import PolarJoinExecutor, CountExecutor from pyquokka.dataset import InputDiskCSVDataset from pyquokka.utils import LocalCluster cluster = LocalCluster() task_graph = TaskGraph(cluster) # the arguments are: filename, column names, how many bytes to read in a batch a_reader = InputDiskCSVDataset(\"a.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) b_reader = InputDiskCSVDataset(\"b.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) a = task_graph.new_input_reader_node(a_reader) b = task_graph.new_input_reader_node(b_reader) # define a streaming join operator using the Polars library for internal join implementation. join_executor = PolarJoinExecutor(on=\"key\") # the default partition strategy will not work for join! We need to specify # an alternative partition function. Quokka has the notion of \"keyed\" QStreams, # which are QStreams where the batch elements are Pandas or Polars DataFrames # or Pyarrow tables. In this case, we can provide a column name as partition key. joined = task_graph.new_non_blocking_node({0:a,1:b},join_executor,partition_key_supplied={0:\"key\", 1:\"key\"}) count_executor = CountExecutor() count = task_graph.new_blocking_node({0:joined},count_executor) task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) print(count.to_list()) Note here we defined a new_nonblocking_node for the join operator and a new_blocking_node for the count operator. This means that Quokka will execute the join in a pipelined parallel fashion with the count. As a result, the input reader, join and count actors are all executing concurrently in the system. The count operator will return the count as a single number which will be stored in a Dataset object. About benchmarking Quokka programs. Quokka programs do a bit of processing locally. For example, when an input reader is added to the TaskGraph with an InputDiskCSVDataset object, Quokka performs set_num_channels on the object, and compute byte offsets for each channel to start reading from. This could be expensive for large CSV files, especially if we are using blob storage input sources. In pratice this completes in a few seconds for datasets TBs in size. This is quite similar to what Spark's dataframe API does. The TaskGraph also needs to be initialized by calling task_graph.create() . This actually spawns the Ray actors executing the channels, and could take a while when you have a lot of channels. However, the time of both the input reader initialization and the TaskGraph initialization do not strongly scale with the input data size, unlike the actual execution time of the TaskGraph! As a result, while on trivial input sizes one might find the initialization times to be longer than the actual execution time, on real programs it is best practice to just time the task_graph.run() call. 
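If you would rather generate your own a.csv and b.csv instead of using the ones in apps/tutorials, a small Polars helper along these lines works. The column names match the ["key", "val1", "val2"] schema the readers above expect; the row counts and key ranges are arbitrary, and whether the provided files carry a header row is not specified here, so adjust the reader arguments accordingly.

~~~python
import numpy as np
import polars as pl

def make_input(path, rows=10000, keys=1000, seed=0):
    # Write a small CSV with the schema ["key", "val1", "val2"] used by the join example.
    rng = np.random.default_rng(seed)
    pl.DataFrame({
        "key": rng.integers(0, keys, rows),
        "val1": rng.random(rows),
        "val2": rng.random(rows),
    }).write_csv(path)  # note: this writes a header row by default

make_input("a.csv", seed=0)
make_input("b.csv", seed=1)
~~~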
This example showed how to execute a simple SQL query by describing its physical plan. You can execute much more complex SQL queries with Quokka (check out the TPC-H implementations under quokka/apps). Quokka can currently typically achieve around a 3x speedup compared to SparkSQL (EMR 6.5.0). If you have an expensive query you have to periodically run and would like to try writing out its physical plan in the Quokka API, give it a shot! Again, contact me at zihengw@stanford.edu if you run into any problems. We are working very hard to add a dataframe and SQL API to Quokka, targeting release Sep/Oct 2022. Stay tuned for more information.","title":"TaskGraph API"},{"location":"tutorial/#advanced-tutorials","text":"This section is for learning how to use Quokka's graph level API. This is expected for use cases where the dataframe API cannot satisfy your needs. Most users are not expected to program at this level. You should contact me: zihengw@stanford.edu if you want to do this. You should probably stop reading now, unless you are a Stanford undergrad or master's student (or somebody else) who somehow decided to work with me on Quokka. The code for the tutorials can be found under apps/tutorials . They might perform meaningless tasks or perform tasks which you shouldn't necessarily use Quokka for, but they will showcase how Quokka works. I wrote Quokka. As a result I might take some things for granted that you might not. If you spot a typo or find some sections too difficult to understand, I would appreciate your feedback! Better yet, the docs are also open source under quokka/docs, so you can also make a PR.","title":"Advanced Tutorials"},{"location":"tutorial/#lesson-0-addition","text":"Let's walk through our first Quokka program. This first example defines an input reader which produces a stream of numbers, and a stateful operator which adds them up. Please read the comments in the code. Let's first look at the import section. # we need to import Quokka specific objects. A TaskGraph is always needed in a program # that uses the DAG runtime API. We will define a TaskGraph by defining input readers # and stateful operators and adding them to the TaskGraph. Then we will execute the TaskGraph. from pyquokka.quokka_runtime import TaskGraph # Quokka also needs a notion of the compute substrate the TaskGraph is executing on. # LocalCluster is meant for single-machine execution. For distributed execution, # you would need to import QuokkaClusterManager and create a new cluster or initialize # one from a json config. from pyquokka.utils import LocalCluster # Executor is an abstract class which you should extend to implement your own executors. # Quokka also provides canned executors which you can import from pyquokka.executors such # as joins, sort and asof_join. from pyquokka.executors import Executor import time # define a LocalCluster execution context. This will make a cluster object with information # such as local core count etc. cluster = LocalCluster() Quokka provides many optimized input readers for different input data formats. However, in this tutorial we are going to define a custom input reader class to showcase how the input reader works. The mindset here is that there will be many channels of this input reader (by default equal to the number of cores in the cluster), and each channel will have its own copy of an object of this class.
They will all be initialized in the same way, but when each channel calls the get_next_batch method of its own object, the channel argument supplied will be different. class SimpleDataset: # the object will be initialized once locally. You can define whatever attributes you want. # You can also set attributes to None if they will be supplied later by the framework # in set_num_channels method def __init__(self, limit) -> None: self.limit = limit self.num_channels = None # this is an optional method that will be called by the runtime on this object during # TaskGraph construction, if the method exists. This mainly updates the num_channel # attribute of the object. For some input readers what a channel produces is independent # of the total number of channels, and they don't have to implement this method. Other # input readers might need to perform additional computation upon learning the total # number of channels, such as byte ranges to read in a CSV file. # # This method can be used to set additional class attributes. The programmer could # do that in the __init__ method too, if she knows the total number of channels # and does not want to rely on Quokka's default behavior etc. def set_num_channels(self, num_channels): self.num_channels = num_channels # the get_next_batch method defines an iterator. Each channel will iterate through # its own copy of the object's get_next_batch method, with the channel argument # set to its own channel id. In this example, if there are N channels, channel k # will yield numbers k, k + N, k + 2N, all the way up to the limit. # Note that the get_next_batch method takes an optional parameter pos, and yields # two objects, with the first being None here. Let's not worry about these things # for the time being. They are used for Quokka's parallelized fault recovery. def get_next_batch(self, channel, pos=None): assert self.num_channels is not None curr_number = channel while curr_number < self.limit: yield None, curr_number curr_number += self.num_channels Now that we defined the input reader, we are going to define the stateful operator. Similar to the input reader, we define a Python class. All channels of the stateful operator will have a copy of an object of this class. The stateful operator exposes two important methods, execute and done , which might produce outputs for more downstream stateful operators. execute is called whenever upstream input reader channels have produced some input batches for the stateful operator channel to process. done is called when the stateful operator channel knows it will no longer receive any more inputs and has already processed all the inputs it has. Our stateful operator here adds up all the elements in an input stream and returns the sum. class AddExecutor(Executor): # initialize state. This will be done locally. This initial state will be copied # along with the object to all the channels. def __init__(self) -> None: self.sum = 0 # the execute method takes three arguments. The first argument batches, is a list of # batches from an input QStream, which could be the output of an input reader or another # stateful operator. The items in the batch could have come from one channel, several, # or all of them! it is best practice that the stateful operator doesn't make # any assumptions on where these batches originated, except that they belong # to the same QStream. # the second argument, stream_id, is used to identify the QStream the batches came from. 
# in this example we only have one input QStream so we can ignore this argument. # the third argument, channel, denotes the channel id of the channel executing the object # similar to the argument for the input reader. Here we also don't use this argument. def execute(self,batches,stream_id, channel): for batch in batches: assert type(batch) == int self.sum += batch # note that we can't return anything in our execute method. We don't know what the sum is # until we have seen all of the elements in the input QStream. # done only has one argument, which is the channel. It can return an element or an iterator # of elements. def done(self,channel): print(\"I am executor \", channel, \" my sum is \", self.sum) return self.sum Now that we have defined our input reader and stateful operator, we can hook them up together in a TaskGraph. Defining the TaskGraph requires a cluster object, which is LocalCluster here but can be an S3Cluster or AzureCluster for cloud deployments. We will then initialize the objects for the input reader and stateful operators. Again, we initialize one object, which will be copied to each channel. We can now add the input reader and stateful operator to our TaskGraph. task_graph = TaskGraph(cluster) reader = SimpleDataset(80) # define a new input reader in our TaskGraph. numbers is a QStream. numbers = task_graph.new_input_reader_node(reader) executor = AddExecutor() # define a new blocking node. A blocking node writes out its results in a materialized Dataset # object instead of producing a QStream. Note the first argument is a dictionary. This assigns # each input stream an internal name, which corresponds to the stream_id field in the execute # method. Since we called the numbers QStream 0, when execute is called on batches from this QStream, # the stream_id argument will be 0. sum = task_graph.new_blocking_node({0:numbers},executor) # create() must be called before run() task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) # we can call to_list() on a Dataset object to collect its elements, which will simply be all # the objects returned by the blocking node's execute and done methods. print(sum.to_list()) Here we used new_blocking_node to define the stateful operator in the TaskGraph. The TaskGraph exposes two different APIs: new_nonblocking_node and new_blocking node . The former will put their outputs in a QStream, which could be consumed by downstream operators immediately, while the latter will materialize the outputs into a Dataset object. Downstream operators cannot read a Dataset until it's complete. This is intimately related to the idea of nonblocking vs blocking stateful operators. Some operators such as streaming join can emit valid outputs as soon as they have seen partial inputs, while other operators like aggregation must wait until seeing all of the input before emitting any partial output. However, you could define a nonblocking operator as a new_blocking_node , if you want to materialize its outputs instead of streaming them forward, e.g. to limit pipeline depth. You could also define a blocking operator as a new_nonblocking_node , the QStream will just consist of the elements returned during the done method (which could return an iterator). The TaskGraph also exposes an API to define stateless operators: new_task . This defines a stateless transform on a QStream and is very similar to Spark's map . We will cover this in a later tutorial to showcase deep learning inference. 
Note that we covered most of the important concepts covered in the getting started cartoons. However the astute reader would notice that we didn't define a partition function here, nor did we specify how many channels of the input reader or the stateful operator to launch. The answer is that Quokka tries to provide suitable defaults for these things. Quokka currently launches one channel per core for input readers, and one channel per machine for stateful operators. These defaults are subject to change and you shouldn't rely on them. Quokka's default partition function is to send all the outputs generated by a channel to the channel of the target on the same machine.","title":"Lesson 0: Addition"},{"location":"tutorial/#lesson-1-joins","text":"If you think the first lesson was too complicated, it proably was. This is because we had to define custom input readers and stateful operators. Hopefully in the process you learned a few things about how Quokka works. In most scenarios, it is my hope that you don't have to define custom objects, and use canned implementations which you can just import. This is similar to how Tensorflow or Pytorch works. If you know how to import torch.nn.Conv2d , you get the idea. Here, we are going to take two CSVs on Disk, join them, and count the number of records in the result: select count(*) from a and b where a.key = b.key . You can use the a.csv and b.csv provided in the apps/tutorials folder, or you can supply your own and change the CSV input reader arguments appropriately. Without further ado, here's the code with comments: import time from pyquokka.quokka_runtime import TaskGraph from pyquokka.executors import PolarJoinExecutor, CountExecutor from pyquokka.dataset import InputDiskCSVDataset from pyquokka.utils import LocalCluster cluster = LocalCluster() task_graph = TaskGraph(cluster) # the arguments are: filename, column names, how many bytes to read in a batch a_reader = InputDiskCSVDataset(\"a.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) b_reader = InputDiskCSVDataset(\"b.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) a = task_graph.new_input_reader_node(a_reader) b = task_graph.new_input_reader_node(b_reader) # define a streaming join operator using the Polars library for internal join implementation. join_executor = PolarJoinExecutor(on=\"key\") # the default partition strategy will not work for join! We need to specify # an alternative partition function. Quokka has the notion of \"keyed\" QStreams, # which are QStreams where the batch elements are Pandas or Polars DataFrames # or Pyarrow tables. In this case, we can provide a column name as partition key. joined = task_graph.new_non_blocking_node({0:a,1:b},join_executor,partition_key_supplied={0:\"key\", 1:\"key\"}) count_executor = CountExecutor() count = task_graph.new_blocking_node({0:joined},count_executor) task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) print(count.to_list()) Note here we defined a new_nonblocking_node for the join operator and a new_blocking_node for the count operator. This means that Quokka will execute the join in a pipelined parallel fashion with the count. As a result, the input reader, join and count actors are all executing concurrently in the system. The count operator will return the count as a single number which will be stored in a Dataset object. About benchmarking Quokka programs. Quokka programs do a bit of processing locally. 
For example, when an input reader is added to the TaskGraph with an InputDiskCSVDataset object, Quokka performs set_num_channels on the object, and compute byte offsets for each channel to start reading from. This could be expensive for large CSV files, especially if we are using blob storage input sources. In pratice this completes in a few seconds for datasets TBs in size. This is quite similar to what Spark's dataframe API does. The TaskGraph also needs to be initialized by calling task_graph.create() . This actually spawns the Ray actors executing the channels, and could take a while when you have a lot of channels. However, the time of both the input reader initialization and the TaskGraph initialization do not strongly scale with the input data size, unlike the actual execution time of the TaskGraph! As a result, while on trivial input sizes one might find the initialization times to be longer than the actual execution time, on real programs it is best practice to just time the task_graph.run() call. This example showed how to execute a simple SQL query by describing its physical plan. You can execute much mroe complex SQL queries with Quokka (check out the TPC-H implementations under quokka/apps). Quokka can currently typically achieve around 3x speedup compared to SparkSQL (EMR 6.5.0). If you have an expensive query you have to periodically run and would like to try writing out its physical plan in Quokka API, give it a shot! Again, contact me at zihengw@stanford.edu if you run into any problems. We are working very hard to add a dataframe and SQL API to Quokka, targeting release Sep/Oct 2022. Keep tuned for more information.","title":"Lesson 1: Joins"}]} \ No newline at end of file +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"If you like, please: Introduction Quokka is a lightweight distributed dataflow engine written completely in Python targeting modern data science use cases involving 100GBs to TBs of data. At its core, Quokka manipulates streams of data with stateful actors. Quokka offers a stream-centric, Python-native perspective to tasks commonly done today by Spark. Please see the Getting Started for further details. This streaming paradigm inspired by high performance databases such as DuckDB and Snowflake allows Quokka to greatly outperform Apache Spark performance on SQL type workloads reading from cloud blob storage like S3 for formats like CSV and Parquet. Fineprint: benchmark done using four c5.4xlarge instances for Quokka and EMR 6.5.0 with five c5.4xlarge instances for Spark where one instance is used as a coordinator. Ignores initialization costs which are generally comparable between Quokka and Spark. What's even better than being cheap and fast is the fact that since Quokka is Python native, you can easily use your favorite machine learning libraries like Scikit-Learn and Pytorch with Quokka inside of arbitrary Python functions to transform your DataStreams. Another great advantage is that a streaming data paradigm is more in line with how data arrives in the real world, making it easy to bridge your data application to production, or conduct time-series backfilling on your historical data. You develop with Quokka locally, and deploy to cloud (currently AWS) with a single line of code change. Quokka is specifically designed for the following workloads. SQLish data engineering workloads on data lake. 
You can try Quokka if you want to speed up some Spark data jobs, or if you want to implement \"stateful Python UDFs\" in your SQL pipeline, which is kind of a nightmare in Spark. (e.g. forward computing some feature based on historical data) Quokka can also typically achieve much better performance than Spark on pure SQL workloads when input data comes from cloud storage, especially if the data is in CSV format. The drawback is Quokka currently does not support SQL interface, so you are stuck with a dataframe-like DataStream API. However SQL optimizations such as predicate pushdown and early projection are implemented. (support forthcoming) ML engineering pipelines on large unstructured data datasets. Since Quokka is Python-native, it interfaces perfectly with the Python machine learning ecosystem. No more JVM troubles. Unlike Spark, Quokka also will let you precisely control the placement of your stateful operators on machines, preventing GPU out-of-memory and improving performance by reducing contention. Support for these workloads are still in the works. If you are interested, please drop me a note: zihengw@stanford.edu or Discord . Roadmap Streaming support. Although Quokka follows a streaming model, it currently does not support \"streaming\" computations from Kafka, Kinesis etc. They will soon be supported. This will allow batch data pipelines to be deployed to production with one line code change. Target Q4 2022. Fault tolerance. Currently Quokka's fault tolerance mechanism is experimental. Improvements are being made in this direction transparent to the API. Please use on-demand instances for important workloads. (Well if you are planning on using Quokka for important workloads or any workload, please contact me: zihengw@stanford.edu.) The goal is to support Spark-like fault recovery stability by Q1 2023. Full SQL support. I want to be able to do qc.sql(SQL_QUERY). I am working with SQLGlot to make this happen. Target pass TPC-H and say 75% of TPC-DS Q1 2023. Time Series Package. Quokka will support point-in-time joins and asof joins natively by Q4 2022. This will be useful for feature backtesting, etc. Contact If you are interested in trying out Quokka or hit any problems (any problems at all), please contact me at zihengw@stanford.edu or Discord . I will try my best to make Quokka work for you.","title":"Home"},{"location":"#if-you-like-please","text":"","title":"If you like, please: "},{"location":"#introduction","text":"Quokka is a lightweight distributed dataflow engine written completely in Python targeting modern data science use cases involving 100GBs to TBs of data. At its core, Quokka manipulates streams of data with stateful actors. Quokka offers a stream-centric, Python-native perspective to tasks commonly done today by Spark. Please see the Getting Started for further details. This streaming paradigm inspired by high performance databases such as DuckDB and Snowflake allows Quokka to greatly outperform Apache Spark performance on SQL type workloads reading from cloud blob storage like S3 for formats like CSV and Parquet. Fineprint: benchmark done using four c5.4xlarge instances for Quokka and EMR 6.5.0 with five c5.4xlarge instances for Spark where one instance is used as a coordinator. Ignores initialization costs which are generally comparable between Quokka and Spark. 
What's even better than being cheap and fast is the fact that since Quokka is Python native, you can easily use your favorite machine learning libraries like Scikit-Learn and Pytorch with Quokka inside of arbitrary Python functions to transform your DataStreams. Another great advantage is that a streaming data paradigm is more in line with how data arrives in the real world, making it easy to bridge your data application to production, or conduct time-series backfilling on your historical data. You develop with Quokka locally, and deploy to cloud (currently AWS) with a single line of code change. Quokka is specifically designed for the following workloads. SQLish data engineering workloads on data lake. You can try Quokka if you want to speed up some Spark data jobs, or if you want to implement \"stateful Python UDFs\" in your SQL pipeline, which is kind of a nightmare in Spark. (e.g. forward computing some feature based on historical data) Quokka can also typically achieve much better performance than Spark on pure SQL workloads when input data comes from cloud storage, especially if the data is in CSV format. The drawback is Quokka currently does not support SQL interface, so you are stuck with a dataframe-like DataStream API. However SQL optimizations such as predicate pushdown and early projection are implemented. (support forthcoming) ML engineering pipelines on large unstructured data datasets. Since Quokka is Python-native, it interfaces perfectly with the Python machine learning ecosystem. No more JVM troubles. Unlike Spark, Quokka also will let you precisely control the placement of your stateful operators on machines, preventing GPU out-of-memory and improving performance by reducing contention. Support for these workloads are still in the works. If you are interested, please drop me a note: zihengw@stanford.edu or Discord .","title":"Introduction"},{"location":"#roadmap","text":"Streaming support. Although Quokka follows a streaming model, it currently does not support \"streaming\" computations from Kafka, Kinesis etc. They will soon be supported. This will allow batch data pipelines to be deployed to production with one line code change. Target Q4 2022. Fault tolerance. Currently Quokka's fault tolerance mechanism is experimental. Improvements are being made in this direction transparent to the API. Please use on-demand instances for important workloads. (Well if you are planning on using Quokka for important workloads or any workload, please contact me: zihengw@stanford.edu.) The goal is to support Spark-like fault recovery stability by Q1 2023. Full SQL support. I want to be able to do qc.sql(SQL_QUERY). I am working with SQLGlot to make this happen. Target pass TPC-H and say 75% of TPC-DS Q1 2023. Time Series Package. Quokka will support point-in-time joins and asof joins natively by Q4 2022. This will be useful for feature backtesting, etc.","title":"Roadmap"},{"location":"#contact","text":"If you are interested in trying out Quokka or hit any problems (any problems at all), please contact me at zihengw@stanford.edu or Discord . I will try my best to make Quokka work for you.","title":"Contact"},{"location":"api/","text":"API reference First do: from pyquokka.df import * qc = QuokkaContext() If working with S3, do: from pyquokka.df import * manager = QuokkaClusterManager() cluster = manager.get_cluster_from_json(\"config.json\") This assumes you have a cluster saved in config.json. Please refer to the guide here to do this. 
qc.read_csv","title":"API reference"},{"location":"api/#api-reference","text":"First do: from pyquokka.df import * qc = QuokkaContext() If working with S3, do: from pyquokka.df import * manager = QuokkaClusterManager() cluster = manager.get_cluster_from_json(\"config.json\") This assumes you have a cluster saved in config.json. Please refer to the guide here to do this.","title":"API reference"},{"location":"api/#qcread_csv","text":"","title":"qc.read_csv"},{"location":"cloud/","text":"Setting up Quokka for EC2 To use Quokka for EC2, you need to (at minimum) have an AWS account with permissions to launch instances and create new security groups. You will probably run into issues since everybody's AWS setup is a little bit different, so please email: zihengw@stanford.edu or Discord . Quokka requires a security group that allows inbound and outbound connections to ports 5005 (Flight), 6379 (Ray) and 6800 (Redis) from IP addresses within the cluster. For simplicity, you can just enable all inbound and outbound connections from all IP addresses. The easiest way to make this is to manually create an instance on EC2 through the dashboard, e.g. t2.micro, and manually add rules to the security group EC2 assigns that instance. Then you can either copy that security group to a new group, or keep using that modified security group for Quokka. There must be an automated way to do this in the AWS CLI, but I am too lazy to figure it out. If you want to tell me how to do it, I'll post the steps here and buy you a coffee. You also need to generate a pem key pair. The easiest way to do this, again, is to start a t2.micro in the console and using the dashboard. Save the pem key somewhere and write down the absolute path. After you have the security group and you can use the QuokkaClusterManager in pyquokka.utils to spin up a cluster. The code to do this: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 4, instance_type = \"i3.2xlarge\", requirements = [\"pytorch\"]) cluster.to_json(\"config.json\") This would spin up four i3.2xlarge instances and install pytorch on each of them. The QuokkaClusterManager also has other utilities such as launch_all , terminate_cluster and get_cluster_from_json . Importantly, currently only on-demand instances are supported. This will change in the near future. The most interesting utility is probably manager.launch_all(command) , which basically runs a custom command on each machine. You can use this command to massage your cluster into your desired state. In general, all of the machines in your cluster must have all the Python packages you need installed with pip . Importantly, if you are using on demand instances, creating a cluster only needs to happen once. Once you have saved the cluster configs to a json, the next time you want to run a job and use this cluster, you can just do: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.get_cluster_from_json(\"config.json\") This will work if the cluster is either fully stopped or fully running, i.e. every machine must be in either stopped or running state. If the cluster is running, this assumes it was started by running the get_cluster_from_json command! 
Please do not manually start the instances and try to use get_cluster_from_json to connect to a cluster. Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Of course, there are plans to support GCP and Azure. The best way to make sure that happens is by sending me a message on email or Discord .","title":"Setting Up Cloud Cluster"},{"location":"cloud/#setting-up-quokka-for-ec2","text":"To use Quokka for EC2, you need to (at minimum) have an AWS account with permissions to launch instances and create new security groups. You will probably run into issues since everybody's AWS setup is a little bit different, so please email: zihengw@stanford.edu or Discord . Quokka requires a security group that allows inbound and outbound connections to ports 5005 (Flight), 6379 (Ray) and 6800 (Redis) from IP addresses within the cluster. For simplicity, you can just enable all inbound and outbound connections from all IP addresses. The easiest way to make this is to manually create an instance on EC2 through the dashboard, e.g. t2.micro, and manually add rules to the security group EC2 assigns that instance. Then you can either copy that security group to a new group, or keep using that modified security group for Quokka. There must be an automated way to do this in the AWS CLI, but I am too lazy to figure it out. If you want to tell me how to do it, I'll post the steps here and buy you a coffee. You also need to generate a pem key pair. The easiest way to do this, again, is to start a t2.micro in the console and using the dashboard. Save the pem key somewhere and write down the absolute path. After you have the security group and you can use the QuokkaClusterManager in pyquokka.utils to spin up a cluster. The code to do this: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 4, instance_type = \"i3.2xlarge\", requirements = [\"pytorch\"]) cluster.to_json(\"config.json\") This would spin up four i3.2xlarge instances and install pytorch on each of them. The QuokkaClusterManager also has other utilities such as launch_all , terminate_cluster and get_cluster_from_json . Importantly, currently only on-demand instances are supported. This will change in the near future. The most interesting utility is probably manager.launch_all(command) , which basically runs a custom command on each machine. You can use this command to massage your cluster into your desired state. In general, all of the machines in your cluster must have all the Python packages you need installed with pip . Importantly, if you are using on demand instances, creating a cluster only needs to happen once. Once you have saved the cluster configs to a json, the next time you want to run a job and use this cluster, you can just do: from pyquokka.utils import QuokkaClusterManager manager = QuokkaClusterManager(key_name = YOUR_KEY, key_location = ABSOLUTE_PATH_TO_KEY, security_group= SECURITY_GROUP_ID) cluster = manager.get_cluster_from_json(\"config.json\") This will work if the cluster is either fully stopped or fully running, i.e. every machine must be in either stopped or running state. If the cluster is running, this assumes it was started by running the get_cluster_from_json command! 
Please do not manually start the instances and try to use get_cluster_from_json to connect to a cluster. Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Of course, there are plans to support GCP and Azure. The best way to make sure that happens is by sending me a message on email or Discord .","title":"Setting up Quokka for EC2"},{"location":"datastream/","text":"DataStream source DataStream( quokka_context, schema: list, source_node_id: int ) Quokka DataStream class is how most users are expected to interact with Quokka. However users are not expected to create a DataStream directly by calling its constructor. Note that constructor takes an argument called source_node_id , which would confuse most data scientists -- even me! Args quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Attributes quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Methods: .collect source .collect() This will trigger the execution of computational graph, similar to Spark collect(). The result will be a Polars DataFrame on the master Return: Polars DataFrame. Examples >>> f = qc.read_csv(\"my_csv.csv\") >>> result = f.collect() # result will be a Polars dataframe, as if you did polars.read_csv(\"my_csv.csv\") .compute source .compute() This will trigger the execution of computational graph, similar to Spark collect The result will be a Quokka DataSet, which you can then call to_df() or call to_stream() to initiate another computation. Return: Quokka Quokka DataSet. Currently this is going to be just a list of objects distributed across the Redis servers on the workers. .explain source .explain( mode = 'graph' ) This will not trigger the execution of your computation graph but will produce a graph of the execution plan. Args mode (str) : 'graph' will show a graph, 'text' will print a textual description. Return: None. .write_csv source .write_csv( table_location, output_line_limit = 1000000 ) This will write out the entire contents of the DataStream to a list of CSVs. This is a blocking operation, and will call collect() under the hood. Args table_lcation (str) : the root directory to write the output CSVs to. Similar to Spark, Quokka by default writes out a directory of CSVs instead of dumping all the results to a single CSV so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : how many rows each CSV in the output should have. The current implementation simply buffers this many rows in memory instead of using file appends, so you should have enough memory! Return: Polars DataFrame containing the filenames of the CSVs that were produced. 
Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_csv(\"/home/user/test-out\") # you should create the directory before hand. .write_parquet source .write_parquet( table_location, output_line_limit = 10000000 ) This will write out the entire contents of the DataStream to a list of Parquets. This is a blocking operation, and will call collect() under the hood. By default, each output Parquet file will contain one row group. Args table_lcation (str) : the root directory to write the output Parquets to. Similar to Spark, Quokka by default writes out a directory of Parquets instead of dumping all the results to a single Parquet so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : the row group size in each output file. Return: Polars DataFrame containing the filenames of the Parquets that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_parquet(\"/home/user/test-out\") # you should create the directory before hand. .filter source .filter( predicate: str ) This will filter the DataStream to contain only rows that match a certain predicate. Currently this predicate must be specified in SQL syntax. You can write any SQL clause you would generally put in a WHERE statement containing arbitrary conjunctions and disjunctions. The identifiers however, must be in the schema of this DataStream! We aim to soon support a more Pythonic interface that better resembles Pandas which allows you to do things like d = d[d.a > 10]. Please look at the examples below. Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from a filter being applied to a batch in the source DataStream. While this certainly may be the case, filters are aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a filter node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args predicate (str) : a SQL WHERE clause, look at the examples. Return: A DataStream consisting of rows from the source DataStream that match the predicate. Examples >>> f = qc.read_csv(\"lineitem.csv\") # filter for all the rows where l_orderkey smaller than 10 and l_partkey greater than 5 >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") # nested conditions are supported >>> f = f.filter(\"l_orderkey < 10 and (l_partkey > 5 or l_partkey < 1)\") # most SQL features such as IN and date are supported. >>> f = f.filter(\"l_shipmode IN ('MAIL','SHIP') and l_receiptdate < date '1995-01-01'\") # you can do arithmetic in the predicate just like in SQL. >>> f = f.filter(\"l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01\") # this will fail! 
Assuming c_custkey is not in f.schema >>> f = f.filter(\"c_custkey > 10\") .select source .select( columns: list ) This will create a new DataStream that contains only selected columns from the source DataStream. Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from selecting columns from a batch in the source DataStream. While this certainly may be the case, select() is aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a select node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args columns (list) : a list of columns to select from the source DataStream Return: A DataStream consisting of only the columns selected. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.select([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since f's schema now consists of only two columns. >>> f = f.select([\"l_linenumber\"]) .drop source .drop( cols_to_drop: list ) Think of this as the anti-opereator to select. Instead of selecting columns, this will drop columns. This is implemented in Quokka as selecting the columns in the DataStream's schema that are not dropped. Args cols_to_drop (list) : a list of columns to drop from the source DataStream Return: A DataStream consisting of all columns in the source DataStream that are not in cols_to_drop . Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.drop([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since you dropped l_orderdate >>> f = f.select([\"l_orderdate\"]) .rename source .rename( rename_dict ) Renames columns in the DataStream according to rename_dict. This is similar to polars.rename . The keys you supply in rename_dict must be present in the schema, and the rename operation must not lead to duplicate column names. Note this will lead to a physical operation at runtime. Args rename_dict (dict) : key is old column name, value is new column name. Return: A DataStream with new schema according to rename. .transform source .transform( f, new_schema: list, required_columns: set, foldable = True ) This is a rather Quokka-specific API that allows arbitrary transformations on a DataStream, similar to Spark RDD.map. Each batch in the DataStream is going to be transformed according to a user defined function, which can produce a new batch. The new batch can have completely different schema or even length as the original batch, and the original data is considered lost, or consumed by this transformation function. This could be used to implement user-defined-aggregation-functions (UDAFs). Note in cases where you are simply generating a new column from other columns for each row, i.e. UDF, you probably want to use the with_column method instead. A DataStream is implemented as a stream of batches. In the runtime, your transformation function will be applied to each of those batches. However, there are no guarantees whatsoever on the sizes of these batches! 
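As a concrete illustration of batch-size-independent logic (the word-count example further down makes the same point), here is a hedged sketch that reduces every incoming batch to a partial sum with transform and then totals the partials with the agg API documented later on this page, rather than with a hand-written stateful operator. The DataStream numbers and its column x are made up for this example.

~~~python
import polars

# Hypothetical setup: `numbers` is a DataStream with a single numeric column "x".
def partial_sum(batch: polars.DataFrame) -> polars.DataFrame:
    # Correct for any batch size: each input batch collapses to one row
    # holding the sum of "x" within that batch.
    return polars.DataFrame({"partial": [batch["x"].sum()]})

partials = numbers.transform(
    partial_sum,
    new_schema=["partial"],
    required_columns={"x"},
    foldable=True,
)

# Add up the per-batch partial sums; agg is blocking, so collect the single-row result.
total = partials.agg({"partial": "sum"}).collect()
~~~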
You should probably make sure your logic is correct regardless of the sizes of the batches. For example, if your DataStream consists of a column of numbers, and you wish to compute the sum of those numbers, you could first transform the DataStream to return just the sum of each batch, and then hook this DataStream up to a stateful operator that adds up all the sums. You can use whatever libraries you have installed in your Python environment in this transformation function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. This is very similar to Spark's seldom used combineByKey feature. Note a transformation in the logical plan basically precludes any predicate pushdown or early projection past it, since the original columns are assumed to be lost, and we cannot directly establish correspendences between the input columns to a transformation and its output columns for the purposes of predicate pushdown or early projection. The user is required to supply a set or list of required columns, and we will select for those columns (which can be pushed down) before we apply the transformation. Args f (function) : The transformation function. This transformation function must take as input a Polars DataFrame and output a Polars DataFrame. The transformation function must not have expectations on the length of its input. Similarly, the transformation function does not have to emit outputs of a specific size. The transformation function must produce the same output columns for every possible input. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. foldable (bool) : Whether or not the transformation can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new transformed DataStream with the supplied schema. Examples # a user defined function that takes in a Polars DataFrame with a single column \"text\", converts it to a Pyarrow table, # and uses nice Pyarrow compute functions to perform the word count on this Polars DataFrame. Note 1) we have to convert it # back to a Polars DataFrame afterwards, 2) the function works regardless of input length and 3) the output columns are the # same regardless of the input. def udf2(x): x = x.to_arrow() da = compute.list_flatten(compute.ascii_split_whitespace(x[\"text\"])) c = da.value_counts().flatten() return polars.from_arrow(pa.Table.from_arrays([c[0], c[1]], names=[\"word\",\"count\"])) # this is a trick to read in text files, just use read_csv with a separator you know won't appear. # the result will just be DataStream with one column. 
>>> words = qc.read_csv(\"random_words.txt\", [\"text\"], sep = \"|\") # transform words to counts >>> counted = words.transform( udf2, new_schema = [\"word\", \"count\"], required_columns = {\"text\"}, foldable=True) .with_column source .with_column( new_column, f, required_columns = None, foldable = True ) This will create new columns from certain columns in the dataframe. This is similar to pandas df.apply() that makes new columns. This is similar to Spark UDF or Pandas UDF, Polars with_column , Spark with_column , etc. Note that this function, like most Quokka DataStream functions, are not in-place, and will return a new DataStream, with the new column. This is a separate API from transform because the semantics allow for projection and predicate pushdown through this node, since the original columns are all preserved. Use this instead of transform if possible. A DataStream is implemented as a stream of batches. In the runtime, your function will be applied to each of those batches. The function must take as input a Polars DataFrame and produce a Polars DataFrame. This is a different mental model from say Pandas df.apply , where the function is written for each row. There are two restrictions. First, your result must only have one column, and it should have the same name as your new_column argument. Second, your result must have the same length as the input Polars DataFrame. You can use whatever libraries you have installed in your Python environment in this function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. Importantly, your function can take full advantage of Polars' columnar APIs to make use of SIMD and other forms of speedy goodness. You can even use Polars LazyFrame abstractions inside of this function. Of course, for ultimate flexbility, you are more than welcome to convert the Polars DataFrame to a Pandas DataFrame and use df.apply . Just remember to convert it back to a Polars DataFrame with only the result column in the end! Args new_column (str) : The name of the new column. f (function) : The apply function. This apply function must take as input a Polars DataFrame and output a Polars DataFrame. The apply function must not have expectations on the length of its input. The output must have the same length as the input. The apply function must produce the same output columns for every possible input. required_columns (list or set) : The names of the columns that are required for your function. If this is not specified then Quokka assumes all the columns are required for your function. Early projection past this function becomes impossible. Long story short, if you can specify this argument, do it. foldable (bool) : Whether or not the function can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new DataStream with a new column made by the user defined function. Examples >>> f = qc.read_csv(\"lineitem.csv\") # people who care about speed of execution make full use of Polars columnar APIs. 
>>> d = d.with_column(\"high\", lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns = {\"o_orderpriority\"}) # people who care about speed of development can do something that hurts my eyes. def f(x): y = x.to_pandas() y[\"high\"] = y.apply(lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), axis = 1) return polars.from_pandas(y[\"high\"]) >>> d = d.with_column(\"high\", f, required_columns={\"o_orderpriority\"}) .stateful_transform source .stateful_transform( executor: Executor, new_schema: list, required_columns: set, partitioner = PassThroughPartitioner(), placement = 'cpu' ) EXPERIMENTAL API This is like transform , except you can use a stateful object as your transformation function. This is useful for example, if you want to run a heavy Pytorch model on each batch coming in, and you don't want to reload this model for each function call. Remember the transform API only supports stateless transformations. You could also implement much more complicated stateful transformations, like implementing your own aggregation function if you are not satisfied with Quokka's default operator's performance. This API is still being finalized. A version of it that takes multiple input streams is also going to be added. This is the part of the DataStream level api that is closest to the underlying execution engine. Quokka's underlying execution engine basically executes a series of stateful transformations on batches of data. The difficulty here is how much of that underlying API to expose here so it's still useful without the user having to understand how the Quokka runtime works. To that end, we have to come up with suitable partitioner and placement strategy abstraction classes and interfaces. If you are interested in helping us hammer out this API, please talke to me: zihengw@stanford.edu. Args executor (pyquokka.executors.Executor) : The stateful executor. It must be a subclass of pyquokka.executors.Executor , and expose the execute and done functions. More details forthcoming. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. Return: A transformed DataStream. Examples Forthcoming. .distinct source .distinct( keys: list ) Return a new DataStream with specified columns and unique rows. This is like SELECT DISTINCT(KEYS) FROM ... in SQL. Note all the other columns will be dropped, since their behavior is unspecified. If you want to do deduplication, you can use this operator with keys set to all the columns. This could be accomplished by using groupby().agg() but using distinct is generally faster because it is nonblocking, compared to a groupby. Quokka really likes nonblocking operations because it can then pipeline it with other operators. Args keys (list) : a list of columns to select distinct on. Return: A transformed DataStream whose columns are in keys and whose rows are unique. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns, return only unique rows. >>> f = f.distinct([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since l_comment is no longer in f's schema. 
>>> f = f.select([\"l_comment\"]) .join source .join( right, on = None, left_on = None, right_on = None, suffix = '_2', how = 'inner' ) Join a DataStream with another DataStream or a small Polars DataFrame (<10MB). If you have a Polars DataFrame bigger than this, the best solution right now is to write it out to a file and have Quokka read it back in as a DataStream. I realize this is perhaps suboptimal, and this will be improved. A streaming two-sided distributed join will be executed for two DataStream joins and a streaming broadcast join will be executed for DataStream joined with Polars DataFrame. Joins are obviously very important, and we are constantly improving how we do joins. Eventually we will support out of core joins, when @savebuffer merges his PR into Arrow 10.0. Args right (DataStream or Polars DataFrame) : the DataStream or Polars DataFrame to join to. on (str) : You could either specify this, if the join column has the same name in this DataStream and right , or left_on and right_on if the join columns don't have the same name. left_on (str) : the name of the join column in this DataStream. right_on (str) : the name of the join column in right . suffix (str) : if right has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result. how (str) : only supports \"inner\" for now. Return: A new DataStream that's the joined result of this DataStream and \"right\". By default, columns from both side will be retained, except for right_on from the right side. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> orders = qc.read_csv(\"orders.csv\") >>> result = lineitem.join(orders, left_on = \"l_orderkey\", right_on = \"o_orderkey\") # this will now fail, since o_orderkey is not in the joined DataStream. >>> result = result.select([\"o_orderkey\"]) .groupby source .groupby( groupby: list, orderby = None ) Group a DataStream on a list of columns, optionally specifying an ordering requirement. This returns a GroupedDataStream object, which currently only expose the aggregate method. This is similar to Pandas df.groupby().agg() syntax. Eventually the GroupedDataStream object will also support different kinds of window functions. Args groupby (list or str) : a column or a list of columns to group on. orderby (list) : a list of ordering requirements of the groupby columns, specified in a list like this: [(col1, \"asc\"), (col2, \"desc\")]. Return: A GroupedDataStream object with the specified grouping and the current DataStream. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> result = lineitem.groupby([\"l_orderkey\",\"l_orderdate\"], orderby = [(\"l_orderkey\", \"asc\"), (\"l_orderdate\", \"desc\")]) .agg source .agg( aggregations ) Aggregate this DataStream according to the defined aggregations without any pre-grouping. This is similar to Pandas df.agg() . The result will be one row. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . 
The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well. >>> f = d.agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"}) .aggregate source .aggregate( aggregations ) Alias of agg . .count source .count() Return total row count. .sum source .sum( columns ) Return the sums of the specified columns. .max source .max( columns ) Return the maximum values of the specified columns. .min source .min( columns ) Return the minimum values of the specified columns. .mean source .mean( columns ) Return the mean values of the specified columns. GroupedDataStream source GroupedDataStream( source_data_stream: DataStream, groupby, orderby ) Methods: .agg source .agg( aggregations: dict ) Aggregate this GroupedDataStream according to the defined aggregations. This is similar to Pandas df.groupby().agg() . The result's length will be however number of rows as there are unique group keys combinations. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. 
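(The groupby().agg() example continues right below.) The count, sum, max, min and mean shortcuts documented above have no examples of their own, so here is a small sketch. Passing a list of column names is an assumption on my part, and whether the shortcuts hand back a materialized result or a one-batch DataStream follows the agg semantics described above.

~~~python
lineitem = qc.read_csv("lineitem.csv")

# Convenience shortcuts for the common aggregations, so you don't have to
# spell out a full agg() dictionary.
row_count = lineitem.count()                 # total row count
quantity = lineitem.sum(["l_quantity"])      # sums of the listed columns
highest = lineitem.max(["l_extendedprice"])  # per-column maxima
lowest = lineitem.min(["l_extendedprice"])   # per-column minima
averages = lineitem.mean(["l_discount"])     # per-column means
~~~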
Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well, of each unique combination of l_returnflag and l_linestatus >>> f = d.groupby([\"l_returnflag\", \"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"}) .aggregate source .aggregate( aggregations: dict ) Alias for agg.","title":"DataStream"},{"location":"datastream/#_1","text":"","title":""},{"location":"datastream/#datastream","text":"source DataStream( quokka_context, schema: list, source_node_id: int ) Quokka DataStream class is how most users are expected to interact with Quokka. However users are not expected to create a DataStream directly by calling its constructor. Note that constructor takes an argument called source_node_id , which would confuse most data scientists -- even me! Args quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Attributes quokka_context (pyquokka.df.QuokkaContext) : Similar to Spark SQLContext. schema (list) : The schema of this DataStream, i.e. a list of column names. We might change it to be a dictionary with type information in the future to do better static code checking. source_node_id (int) : the node in the logical plan that produces this DataStream. Methods:","title":"DataStream"},{"location":"datastream/#collect","text":"source .collect() This will trigger the execution of computational graph, similar to Spark collect(). The result will be a Polars DataFrame on the master Return: Polars DataFrame. Examples >>> f = qc.read_csv(\"my_csv.csv\") >>> result = f.collect() # result will be a Polars dataframe, as if you did polars.read_csv(\"my_csv.csv\")","title":".collect"},{"location":"datastream/#compute","text":"source .compute() This will trigger the execution of computational graph, similar to Spark collect The result will be a Quokka DataSet, which you can then call to_df() or call to_stream() to initiate another computation. Return: Quokka Quokka DataSet. Currently this is going to be just a list of objects distributed across the Redis servers on the workers.","title":".compute"},{"location":"datastream/#explain","text":"source .explain( mode = 'graph' ) This will not trigger the execution of your computation graph but will produce a graph of the execution plan. Args mode (str) : 'graph' will show a graph, 'text' will print a textual description. Return: None.","title":".explain"},{"location":"datastream/#write_csv","text":"source .write_csv( table_location, output_line_limit = 1000000 ) This will write out the entire contents of the DataStream to a list of CSVs. This is a blocking operation, and will call collect() under the hood. Args table_lcation (str) : the root directory to write the output CSVs to. 
Similar to Spark, Quokka by default writes out a directory of CSVs instead of dumping all the results to a single CSV so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : how many rows each CSV in the output should have. The current implementation simply buffers this many rows in memory instead of using file appends, so you should have enough memory! Return: Polars DataFrame containing the filenames of the CSVs that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_csv(\"/home/user/test-out\") # you should create the directory before hand.","title":".write_csv"},{"location":"datastream/#write_parquet","text":"source .write_parquet( table_location, output_line_limit = 10000000 ) This will write out the entire contents of the DataStream to a list of Parquets. This is a blocking operation, and will call collect() under the hood. By default, each output Parquet file will contain one row group. Args table_lcation (str) : the root directory to write the output Parquets to. Similar to Spark, Quokka by default writes out a directory of Parquets instead of dumping all the results to a single Parquet so the output can be done in parallel. If your dataset is small and you want a single file, you can adjust the output_line_limit parameter. Example table_locations: s3://bucket/prefix for cloud, absolute path /home/user/files for disk. output_line_limit (int) : the row group size in each output file. Return: Polars DataFrame containing the filenames of the Parquets that were produced. Examples >>> f = qc.read_csv(\"lineitem.csv\") >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") >>> f.write_parquet(\"/home/user/test-out\") # you should create the directory before hand.","title":".write_parquet"},{"location":"datastream/#filter","text":"source .filter( predicate: str ) This will filter the DataStream to contain only rows that match a certain predicate. Currently this predicate must be specified in SQL syntax. You can write any SQL clause you would generally put in a WHERE statement containing arbitrary conjunctions and disjunctions. The identifiers however, must be in the schema of this DataStream! We aim to soon support a more Pythonic interface that better resembles Pandas which allows you to do things like d = d[d.a > 10]. Please look at the examples below. Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from a filter being applied to a batch in the source DataStream. While this certainly may be the case, filters are aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a filter node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args predicate (str) : a SQL WHERE clause, look at the examples. 
Return: A DataStream consisting of rows from the source DataStream that match the predicate. Examples >>> f = qc.read_csv(\"lineitem.csv\") # filter for all the rows where l_orderkey smaller than 10 and l_partkey greater than 5 >>> f = f.filter(\"l_orderkey < 10 and l_partkey > 5\") # nested conditions are supported >>> f = f.filter(\"l_orderkey < 10 and (l_partkey > 5 or l_partkey < 1)\") # most SQL features such as IN and date are supported. >>> f = f.filter(\"l_shipmode IN ('MAIL','SHIP') and l_receiptdate < date '1995-01-01'\") # you can do arithmetic in the predicate just like in SQL. >>> f = f.filter(\"l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01\") # this will fail! Assuming c_custkey is not in f.schema >>> f = f.filter(\"c_custkey > 10\")","title":".filter"},{"location":"datastream/#select","text":"source .select( columns: list ) This will create a new DataStream that contains only selected columns from the source DataStream. Since a DataStream is implemented as a stream of batches, you might be tempted to think of a filtered DataStream as a stream of batches where each batch directly results from selecting columns from a batch in the source DataStream. While this certainly may be the case, select() is aggressively optimized by Quokka and is most likely pushed all the way down to the input readers. As a result, you typically should not see a select node in a Quokka execution plan shown by explain() . It is much better to think of a DataStream simply as a stream of rows that meet certain criteria, and who may be non-deterministically batched together by the Quokka runtime. Indeed, Quokka makes no guarantees on the sizes of these batches, which is determined at runtime. This flexibility is an important reason for Quokka's superior performance. Args columns (list) : a list of columns to select from the source DataStream Return: A DataStream consisting of only the columns selected. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.select([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since f's schema now consists of only two columns. >>> f = f.select([\"l_linenumber\"])","title":".select"},{"location":"datastream/#drop","text":"source .drop( cols_to_drop: list ) Think of this as the anti-opereator to select. Instead of selecting columns, this will drop columns. This is implemented in Quokka as selecting the columns in the DataStream's schema that are not dropped. Args cols_to_drop (list) : a list of columns to drop from the source DataStream Return: A DataStream consisting of all columns in the source DataStream that are not in cols_to_drop . Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns >>> f = f.drop([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since you dropped l_orderdate >>> f = f.select([\"l_orderdate\"])","title":".drop"},{"location":"datastream/#rename","text":"source .rename( rename_dict ) Renames columns in the DataStream according to rename_dict. This is similar to polars.rename . The keys you supply in rename_dict must be present in the schema, and the rename operation must not lead to duplicate column names. Note this will lead to a physical operation at runtime. Args rename_dict (dict) : key is old column name, value is new column name. 
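rename has no example on this page (its Return description follows below), so here is a one-liner sketch using the TPC-H lineitem columns used throughout these docs.

~~~python
f = qc.read_csv("lineitem.csv")

# Keys must already be in the schema, and the new names must not collide
# with existing column names.
f = f.rename({"l_orderkey": "orderkey", "l_shipdate": "shipdate"})
~~~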
Return: A DataStream with new schema according to rename.","title":".rename"},{"location":"datastream/#transform","text":"source .transform( f, new_schema: list, required_columns: set, foldable = True ) This is a rather Quokka-specific API that allows arbitrary transformations on a DataStream, similar to Spark RDD.map. Each batch in the DataStream is going to be transformed according to a user defined function, which can produce a new batch. The new batch can have completely different schema or even length as the original batch, and the original data is considered lost, or consumed by this transformation function. This could be used to implement user-defined-aggregation-functions (UDAFs). Note in cases where you are simply generating a new column from other columns for each row, i.e. UDF, you probably want to use the with_column method instead. A DataStream is implemented as a stream of batches. In the runtime, your transformation function will be applied to each of those batches. However, there are no guarantees whatsoever on the sizes of these batches! You should probably make sure your logic is correct regardless of the sizes of the batches. For example, if your DataStream consists of a column of numbers, and you wish to compute the sum of those numbers, you could first transform the DataStream to return just the sum of each batch, and then hook this DataStream up to a stateful operator that adds up all the sums. You can use whatever libraries you have installed in your Python environment in this transformation function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. This is very similar to Spark's seldom used combineByKey feature. Note a transformation in the logical plan basically precludes any predicate pushdown or early projection past it, since the original columns are assumed to be lost, and we cannot directly establish correspendences between the input columns to a transformation and its output columns for the purposes of predicate pushdown or early projection. The user is required to supply a set or list of required columns, and we will select for those columns (which can be pushed down) before we apply the transformation. Args f (function) : The transformation function. This transformation function must take as input a Polars DataFrame and output a Polars DataFrame. The transformation function must not have expectations on the length of its input. Similarly, the transformation function does not have to emit outputs of a specific size. The transformation function must produce the same output columns for every possible input. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. foldable (bool) : Whether or not the transformation can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new transformed DataStream with the supplied schema. 
Examples # a user defined function that takes in a Polars DataFrame with a single column \"text\", converts it to a Pyarrow table, # and uses nice Pyarrow compute functions to perform the word count on this Polars DataFrame. Note 1) we have to convert it # back to a Polars DataFrame afterwards, 2) the function works regardless of input length and 3) the output columns are the # same regardless of the input. def udf2(x): x = x.to_arrow() da = compute.list_flatten(compute.ascii_split_whitespace(x[\"text\"])) c = da.value_counts().flatten() return polars.from_arrow(pa.Table.from_arrays([c[0], c[1]], names=[\"word\",\"count\"])) # this is a trick to read in text files, just use read_csv with a separator you know won't appear. # the result will just be DataStream with one column. >>> words = qc.read_csv(\"random_words.txt\", [\"text\"], sep = \"|\") # transform words to counts >>> counted = words.transform( udf2, new_schema = [\"word\", \"count\"], required_columns = {\"text\"}, foldable=True)","title":".transform"},{"location":"datastream/#with_column","text":"source .with_column( new_column, f, required_columns = None, foldable = True ) This will create new columns from certain columns in the dataframe. This is similar to pandas df.apply() that makes new columns. This is similar to Spark UDF or Pandas UDF, Polars with_column , Spark with_column , etc. Note that this function, like most Quokka DataStream functions, are not in-place, and will return a new DataStream, with the new column. This is a separate API from transform because the semantics allow for projection and predicate pushdown through this node, since the original columns are all preserved. Use this instead of transform if possible. A DataStream is implemented as a stream of batches. In the runtime, your function will be applied to each of those batches. The function must take as input a Polars DataFrame and produce a Polars DataFrame. This is a different mental model from say Pandas df.apply , where the function is written for each row. There are two restrictions. First, your result must only have one column, and it should have the same name as your new_column argument. Second, your result must have the same length as the input Polars DataFrame. You can use whatever libraries you have installed in your Python environment in this function. If you are using this on a cloud cluster, you have to make sure the necessary libraries are installed on each machine. You can use the utils package in pyquokka to help you do this. Importantly, your function can take full advantage of Polars' columnar APIs to make use of SIMD and other forms of speedy goodness. You can even use Polars LazyFrame abstractions inside of this function. Of course, for ultimate flexbility, you are more than welcome to convert the Polars DataFrame to a Pandas DataFrame and use df.apply . Just remember to convert it back to a Polars DataFrame with only the result column in the end! Args new_column (str) : The name of the new column. f (function) : The apply function. This apply function must take as input a Polars DataFrame and output a Polars DataFrame. The apply function must not have expectations on the length of its input. The output must have the same length as the input. The apply function must produce the same output columns for every possible input. required_columns (list or set) : The names of the columns that are required for your function. If this is not specified then Quokka assumes all the columns are required for your function. 
Early projection past this function becomes impossible. Long story short, if you can specify this argument, do it. foldable (bool) : Whether or not the function can be executed as part of the batch post-processing of the previous operation in the execution graph. This is set to True by default. Correctly setting this flag requires some insight into how Quokka works. Lightweight functions generally benefit from being folded. Heavyweight functions or those whose efficiency improve with large input sizes might benefit from not being folded. Return: A new DataStream with a new column made by the user defined function. Examples >>> f = qc.read_csv(\"lineitem.csv\") # people who care about speed of execution make full use of Polars columnar APIs. >>> d = d.with_column(\"high\", lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns = {\"o_orderpriority\"}) # people who care about speed of development can do something that hurts my eyes. def f(x): y = x.to_pandas() y[\"high\"] = y.apply(lambda x:(x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), axis = 1) return polars.from_pandas(y[\"high\"]) >>> d = d.with_column(\"high\", f, required_columns={\"o_orderpriority\"})","title":".with_column"},{"location":"datastream/#stateful_transform","text":"source .stateful_transform( executor: Executor, new_schema: list, required_columns: set, partitioner = PassThroughPartitioner(), placement = 'cpu' ) EXPERIMENTAL API This is like transform , except you can use a stateful object as your transformation function. This is useful for example, if you want to run a heavy Pytorch model on each batch coming in, and you don't want to reload this model for each function call. Remember the transform API only supports stateless transformations. You could also implement much more complicated stateful transformations, like implementing your own aggregation function if you are not satisfied with Quokka's default operator's performance. This API is still being finalized. A version of it that takes multiple input streams is also going to be added. This is the part of the DataStream level api that is closest to the underlying execution engine. Quokka's underlying execution engine basically executes a series of stateful transformations on batches of data. The difficulty here is how much of that underlying API to expose here so it's still useful without the user having to understand how the Quokka runtime works. To that end, we have to come up with suitable partitioner and placement strategy abstraction classes and interfaces. If you are interested in helping us hammer out this API, please talke to me: zihengw@stanford.edu. Args executor (pyquokka.executors.Executor) : The stateful executor. It must be a subclass of pyquokka.executors.Executor , and expose the execute and done functions. More details forthcoming. new_schema (list) : The names of the columns of the Polars DataFrame that the transformation function produces. required_columns (list or set) : The names of the columns that are required for this transformation. This argument is made mandatory because it's often trivial to supply and can often greatly speed things up. Return: A transformed DataStream. Examples Forthcoming.","title":".stateful_transform"},{"location":"datastream/#distinct","text":"source .distinct( keys: list ) Return a new DataStream with specified columns and unique rows. This is like SELECT DISTINCT(KEYS) FROM ... in SQL. 
Note all the other columns will be dropped, since their behavior is unspecified. If you want to do deduplication, you can use this operator with keys set to all the columns. This could be accomplished by using groupby().agg() but using distinct is generally faster because it is nonblocking, compared to a groupby. Quokka really likes nonblocking operations because it can then pipeline it with other operators. Args keys (list) : a list of columns to select distinct on. Return: A transformed DataStream whose columns are in keys and whose rows are unique. Examples >>> f = qc.read_csv(\"lineitem.csv\") # select only the l_orderdate and l_orderkey columns, return only unique rows. >>> f = f.distinct([\"l_orderdate\", \"l_orderkey\"]) # this will now fail, since l_comment is no longer in f's schema. >>> f = f.select([\"l_comment\"])","title":".distinct"},{"location":"datastream/#join","text":"source .join( right, on = None, left_on = None, right_on = None, suffix = '_2', how = 'inner' ) Join a DataStream with another DataStream or a small Polars DataFrame (<10MB). If you have a Polars DataFrame bigger than this, the best solution right now is to write it out to a file and have Quokka read it back in as a DataStream. I realize this is perhaps suboptimal, and this will be improved. A streaming two-sided distributed join will be executed for two DataStream joins and a streaming broadcast join will be executed for DataStream joined with Polars DataFrame. Joins are obviously very important, and we are constantly improving how we do joins. Eventually we will support out of core joins, when @savebuffer merges his PR into Arrow 10.0. Args right (DataStream or Polars DataFrame) : the DataStream or Polars DataFrame to join to. on (str) : You could either specify this, if the join column has the same name in this DataStream and right , or left_on and right_on if the join columns don't have the same name. left_on (str) : the name of the join column in this DataStream. right_on (str) : the name of the join column in right . suffix (str) : if right has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result. how (str) : only supports \"inner\" for now. Return: A new DataStream that's the joined result of this DataStream and \"right\". By default, columns from both side will be retained, except for right_on from the right side. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> orders = qc.read_csv(\"orders.csv\") >>> result = lineitem.join(orders, left_on = \"l_orderkey\", right_on = \"o_orderkey\") # this will now fail, since o_orderkey is not in the joined DataStream. >>> result = result.select([\"o_orderkey\"])","title":".join"},{"location":"datastream/#groupby","text":"source .groupby( groupby: list, orderby = None ) Group a DataStream on a list of columns, optionally specifying an ordering requirement. This returns a GroupedDataStream object, which currently only expose the aggregate method. This is similar to Pandas df.groupby().agg() syntax. Eventually the GroupedDataStream object will also support different kinds of window functions. Args groupby (list or str) : a column or a list of columns to group on. orderby (list) : a list of ordering requirements of the groupby columns, specified in a list like this: [(col1, \"asc\"), (col2, \"desc\")]. Return: A GroupedDataStream object with the specified grouping and the current DataStream. 
Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> result = lineitem.groupby([\"l_orderkey\",\"l_orderdate\"], orderby = [(\"l_orderkey\", \"asc\"), (\"l_orderdate\", \"desc\")])","title":".groupby"},{"location":"datastream/#agg","text":"source .agg( aggregations ) Aggregate this DataStream according to the defined aggregations without any pre-grouping. This is similar to Pandas df.agg() . The result will be one row. The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well. >>> f = d.agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"})","title":".agg"},{"location":"datastream/#aggregate","text":"source .aggregate( aggregations ) Alias of agg .","title":".aggregate"},{"location":"datastream/#count","text":"source .count() Return total row count.","title":".count"},{"location":"datastream/#sum","text":"source .sum( columns ) Return the sums of the specified columns.","title":".sum"},{"location":"datastream/#max","text":"source .max( columns ) Return the maximum values of the specified columns.","title":".max"},{"location":"datastream/#min","text":"source .min( columns ) Return the minimum values of the specified columns.","title":".min"},{"location":"datastream/#mean","text":"source .mean( columns ) Return the mean values of the specified columns.","title":".mean"},{"location":"datastream/#groupeddatastream","text":"source GroupedDataStream( source_data_stream: DataStream, groupby, orderby ) Methods:","title":"GroupedDataStream"},{"location":"datastream/#agg_1","text":"source .agg( aggregations: dict ) Aggregate this GroupedDataStream according to the defined aggregations. This is similar to Pandas df.groupby().agg() . The result's length will be however number of rows as there are unique group keys combinations. 
The result is a DataStream that will return a batch when the entire aggregation is done, since it's impossible to return any aggregation results without seeing the entire dataset. As a result, you should call .compute() or .collect() on this DataStream instead of doing additional operations on it like .filter() since those won't be pipelined anyways. The only reason Quokka by default returns a DataStream instead of just returning a Polars DataFrame or a Quokka DataSet is so you can do .explain() on it. Args aggregations (dict) : similar to a dictionary argument to Pandas df.agg() . The key is the column name, where the value is a str that is \"min\", \"max\", \"mean\", \"sum\", \"avg\" or a list of such strings. If you desire to have the count column in your result, add a key \"*\" with value \"count\". Look at the examples. Return: A DataStream object that holds the aggregation result. It will only emit one batch, which is the result when it's done. You should call .collect() or .compute() on it as it is impossible to pipeline past an aggregation, so might as well as materialize it right now. Examples >>> lineitem = qc.read_csv(\"lineitem.csv\") >>> d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") >>> d = d.with_column(\"disc_price\", lambda x:x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) # I want the sum and average of the l_quantity column and the l_extendedprice colum, the sum of the disc_price column, the minimum of the l_discount # column, and oh give me the total row count as well, of each unique combination of l_returnflag and l_linestatus >>> f = d.groupby([\"l_returnflag\", \"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"l_discount\":\"min\",\"*\":\"count\"})","title":".agg"},{"location":"datastream/#aggregate_1","text":"source .aggregate( aggregations: dict ) Alias for agg.","title":".aggregate"},{"location":"quokka_context/","text":"QuokkaContext source QuokkaContext( cluster = None ) Methods: .read_files source .read_files( table_location: str ) .read_csv source .read_csv( table_location: str, schema = None, has_header = False, sep = ', ' ) .read_parquet source .read_parquet( table_location: str, schema = None ) .new_stream source .new_stream( sources: dict, partitioners: dict, node: Node, schema: list, ordering = None ) .new_dataset source .new_dataset( source, schema: list ) .optimize source .optimize( node_id ) .lower source .lower( end_node_id, collect = True ) .execute_node source .execute_node( node_id, explain = False, mode = None, collect = True ) .explain source .explain( node_id, mode = 'graph' ) DataSet source DataSet( quokka_context: QuokkaContext, schema: dict, source_node_id: int )","title":"QuokkaContext"},{"location":"quokka_context/#_1","text":"","title":""},{"location":"quokka_context/#quokkacontext","text":"source QuokkaContext( cluster = None ) Methods:","title":"QuokkaContext"},{"location":"quokka_context/#read_files","text":"source .read_files( table_location: str )","title":".read_files"},{"location":"quokka_context/#read_csv","text":"source .read_csv( table_location: str, schema = None, has_header = False, sep = ', ' )","title":".read_csv"},{"location":"quokka_context/#read_parquet","text":"source .read_parquet( table_location: str, schema = None )","title":".read_parquet"},{"location":"quokka_context/#new_stream","text":"source .new_stream( sources: dict, partitioners: dict, node: Node, 
schema: list, ordering = None )","title":".new_stream"},{"location":"quokka_context/#new_dataset","text":"source .new_dataset( source, schema: list )","title":".new_dataset"},{"location":"quokka_context/#optimize","text":"source .optimize( node_id )","title":".optimize"},{"location":"quokka_context/#lower","text":"source .lower( end_node_id, collect = True )","title":".lower"},{"location":"quokka_context/#execute_node","text":"source .execute_node( node_id, explain = False, mode = None, collect = True )","title":".execute_node"},{"location":"quokka_context/#explain","text":"source .explain( node_id, mode = 'graph' )","title":".explain"},{"location":"quokka_context/#dataset","text":"source DataSet( quokka_context: QuokkaContext, schema: dict, source_node_id: int )","title":"DataSet"},{"location":"runtime/","text":"Quokka Runtime API documentation Programming Model A note about the name: the name is inspired by the Apache Flink icon, which is a chipmunk. A quokka is a marsupial that resembles a chipmunk. Motivation Popular big data processing frameworks such as Spark and Dask rely on bulk-synchronous execution on distributed datasets. Often, a map-reduce style model is adopted, where mappers perform functions on partitions of the input, the mapper outputs are shuffled into groups, and after the shuffle has fully/mostly completed , reducers start working on each group. Typically this is implemented as a pull-based model where reducers pull required data from the mappers, who persist their output in some kind of external storage (disk or network) when fault tolerance is desired. There are a couple problems with this approach. The first, as recent works such as LinkedIn Magnet and Uber Zeus have identified, is that when each mapper doesn't have too much data for each reducer, the pull operation amounts to a bunch of random disk/network reads. This is horrible. The solution is push-based shuffles, where mappers push data to the reducers. Data can now be persisted on the reducer side for fault tolerance. However, this only addresses part of the problem. In a synchronous shuffle, even when mapper output is pushed to the reducers as soon as they are generated, the reducers can't start operating on said data until they have received near everything. This is because the current Map-Reduce paradigm stipulates that the reduction function is a function on all the data assigned to it from the mappers. This forces the reducers to start only after most of the mappers have completely executed, making any kind of pipelined parallel execution between the two impossible. This is unfortunate, because mappers and reducers often use very different resources (network I/O bound mappers + compute bound reducers), and can often be scheduled for parallel execution on the same physical instances without compromising too much the performance of either. Quokka's solution is to support two different kinds of reducer functions. Blocking reducers are similar to classic Map-Reduce reducers and block until they receive all mapper outputs. However, non-blocking reducers can start executing on mapper outputs as soon as they arrive, producing some output of its own and updating some local state. For example, sort, count and aggregation are blocking reducer functions because their output depend on all the data. However, join, filter and projection can be implemented in a non-blocking fashion with streaming operators. 
Non-blocking reducers can be pipelined with other non-blocking reducers, while a blocking reducer breaks the pipeline. Mappers are treated as non-blocking reducers where the output already exists in network/disk storage. We impose some limitations on the kinds of non-blocking operators we support, which are described in detail later. Logically, one can view Quokka execution as a series of stages, where each stage start with the output produced by a blocking operator, ends with another blocking operator, and executes non-blocking operators in between. The entire stage is executed in a pipeline-parallel fashion, and can be viewed as a pure streaming system. The stage inputs/outputs use Spark's lineage tracking based fault-tolerance and persistence mechanism. Since each Quokka stage now corresponds to a few Spark stages, Quokka also implements intra-stage fault tolerance based on checkpointing. The checkpointing recovery mechanism in Quokka conveniently avoids global asynchronous rollbacks, the bane of streaming systems, thanks to the restrictions we impose on the non-blocking operators. Quokka also aims to support autoscaling. (I have a plan to do this, but likely will not get to this until after the rotation.) Execution Model The Quokka runtime API allows you to construct a task graph of nodes , which corresponds to a Quokka stage. This is very similar to other DAG-based processing frameworks such as Apache Spark or Tensorflow . For example, you can write the following code in the runtime API to execute TPC-H query 6: task_graph = TaskGraph() lineitem = task_graph.new_input_csv(bucket,key,lineitem_scheme,8,batch_func=lineitem_filter, sep=\"|\") agg_executor = AggExecutor() agged = task_graph.new_blocking_node({0:lineitem}, agg_executor, 1, {0:None}) task_graph.initialize() task_graph.run() There are perhaps a couple of things to note here. Firstly, there are two types of nodes in the runtime API. There are input nodes , declared with APIs such as new_input_csv or new_input_parquet , which interface with the external world (you can define where they will read their data), and task nodes , declared with new_non_blocking_node or new_blocking_node , which take as input the outputs generated from another node in the task graph, either an input node or another task node. Secondly, we see that the task node agged depends on the outputs from the input node lineitem . We will describe what exactly are the types of lineitem and agged later (the former is a stream and the latter is a dataset). Finally, note that the task graph ends with a blocking node. This is currently required, if you want to be able to interact with the results of the task graph execution. 
Multiple stages are implemented with multiple task graphs, with the first node of stage 2 reading from the output of stage 1, like the following: task_graph = TaskGraph() a = task_graph.new_input_csv(\"bump\",\"a-big.csv\",[\"key\"] + [\"avalue\" + str(i) for i in range(100)],{'localhost':2}) b = task_graph.new_input_csv(\"bump\",\"b-big.csv\",[\"key\"] + [\"bvalue\" + str(i) for i in range(100)],{'localhost':2}) join_executor = OOCJoinExecutor(on=\"key\") output = task_graph.new_blocking_node({0:quotes,1:trades},None, join_executor,{'localhost':4},{0:\"key\", 1:\"key\"}) task_graph.initialize() task_graph.run() del task_graph task_graph2 = TaskGraph() count_executor = CountExecutor() joined_stream = task_graph2.new_input_from_dataset(output,{'localhost':4}) final = task_graph2.new_blocking_node({0:joined_stream}, None, count_executor, {'localhost':4}, {0:'key'}) task_graph2.initialize() task_graph2.run() Note that since the output of a stage is persisted as in Spark, one can delete the first task graph and still access its outputs. Since a task graph represents one Quokka stage, it strictly follows push-based execution. This means that a node does not wait for its downstream dependencies to ask for data, but instead actively pushes data to its downstream dependencies whenever some intermediate results become available. In short, execution proceeds as follows : input nodes read batches of data from a specified source, which might be an external data source or the outputs of a previous stage, and pushes those batches to downstream task nodes. A task node exposes a handler to process incoming batches as they arrive, possibly updating some internal state, and for each input batch possibly produces an output batch for its own downstream children. The programmer is expected to supply this handler function as an executor object (e.g. OOCJoinExecutor , AggExecutor ). Quokka provides a library of pre-implemented executor objects that the programmer can use for SQL, ML and graph analytics. Each task node can have multiple physical executors, referred to as channels . This is a form of intra-operator data parallelism, as opposed to the inter-operator pipeline parallelism that results from all task nodes executing at the same time. These physical executors all execute the same handler function, but on different portions of the input batch, partitioned by a user-specified partition function. A Map-Reduce job with M mappers and R reducers would be implemented in Quokka as a single mapper task node and a single reducer task node, where the mapper task node has M channels and the reducer task node has R channels. In the example above, we specified that the input node lineitem has 8 channels, and the task node agged has only 1 channel. The partition key was not specified ( {0:None} ) since there is no parallelism, thus no need for partitioning. The situation looks something like the following picture: Quokka keeps track of all the channels and schedules them onto physical computing resources. For the engine, two channels from different task nodes are on more or less equal footing -- they can be scheduled on the same hardware or different hardware. A channel from an input node completes execution when there's no more inputs to be read or if all of its downstream dependencies have completed execution. A channel from a task node completes execution when: all of its upstream sources have completed execution if its execution handler decides to terminate early based on the input batch and its state (e.g. 
for a task node that executes the limit operator in a limit query, it might keep as local state the buffered output, and decide to terminate when that output size surpasses the limit number) if all its downstream dependencies have completed execution. By default, all channels start execution at once. This does not necessarily mean that they will start processing data; it means that they will all start waiting for input batches from their upstream sources to arrive. One could specify that an input node delay execution until another input node has finished. For example, to implement a hash join one might want to stream in one table to build the hash table, then stream in the other table for probing. The runtime API is meant to be very flexible and support all manner of batch and stream processing. For example, one could specify an input node that listens to a Kafka stream, some task nodes which process batches of data from that stream, and an output node that writes to another Kafka stream. In this case, since the input node will never terminate, and assuming the other nodes do not trigger early termination, the task graph will always be running. As a result of this flexibility, the runtime API requires quite a lot of knowledge to use efficiently. We therefore aim to provide higher level APIs to support common batch and streaming tasks in SQL, machine learning and graph analytics. Most programmers are not expected to program at the runtime API level, but rather make use of the pre-packaged higher-level APIs. Stateful Actors Let's talk more about task nodes in Quokka. Channels in task nodes can be treated as stateful operators in an actor programming model. Quokka adopts the notion of channels in a task node to specify that a group of actors all execute the same code, for fault tolerance and autoscaling purposes. One could override default Quokka behavior by simply specifying different task nodes with one channel each, all executing the same code. The key property of stateful operators in Quokka is confluence : in the context of nondeterministic message delivery, an operation on a single machine is confluent if it produces the same set of outputs for any nondeterministic ordering and batching of a set of inputs. (Hellerstein, CALM) Note that the output itself can also be produced in any order. It\u2019s easy to see that any composition of confluent operators is still confluent. We relax the confluent definition somewhat here to accept potentially different output sets, assuming they are all semantically correct. For example, an operator that implements the LIMIT N clause in SQL can admit any N of the input records it sees. More importantly, for Quokka we allow operators to depend on intra-stream ordering, just not inter-stream ordering. This means that an operator might still expect the inputs produced by a certain stream to observe some order, while there are no restrictions on the relative orderings between different input streams. Quokka as a system enforces intra-stream message order, but makes zero guarantees about inter-stream message orders. Henceforth, confluence will refer to this narrow definition, not the one defined in the CALM paper. Confluence is a very nice property to have in general, more so for streaming systems. Let\u2019s imagine a stateful operator with two different upstream operators producing messages.
It is very nice if the system\u2019s correctness does not depend on the order in which the two upstream operators produce the messages, which could depend on network delay, task scheduling, etc. This is critical for performance in a push-based framework since a node should never wait on any one of its input streams. In addition, it also greatly facilitates fault tolerance, as messages from different sources can be replayed in any order with respect to one another, as we will describe later. Confluence is perhaps the key difference between Quokka and streaming-centric systems like Flink. In Flink you can totally write pipelines where the outputs depend very strongly on the order in which the inputs are supplied. In Quokka this is not allowed. (Really at this point, it's only \"not recommended\" -- there are no checks in place to see if your actor is confluent or not. What's guaranteed is that all the operators in the libraries supplied follow this model. Enforcing this is future work.) What are some examples of confluent stateful operators? First let's categorize the world of stateful operators we'd like to implement in data analytics. As mentioned previously, there are two important categories: nonblocking and blocking . Blocking operators cannot emit any outputs to their downstream children until all of their inputs have been processed. Examples are any kind of aggregation and sort. For (naive) aggregation, the stateful operator does not know it has the final result for any of its aggregation keys until it has seen all of its inputs. For sorting, the stateful operator cannot guarantee that it would emit results in sorted order until it has received all its inputs. We call any operator that is not blocking non-blocking. Example non-blocking operators are map, filter, projection and join. Blocking operators are pipeline breakers, and negate the benefits of using a streaming framework like Quokka. Confluence is easy to reason about for blocking operators. A blocking operator emits only one output, at the very end. We just have to make sure that this output is the same regardless of the order in which we supply the operator's inputs. Since this output is typically a function of the final state, we just have to ensure that the final state is the same. If we imagine that each incoming message changes the state of the operator by function f , then it's easy to see that as long as f is commutative this is true. For example, any kind of aggregation is commutative, the merge step in merge-sort is commutative, etc. Confluence is harder to reason about for nonblocking operators. We must guarantee that regardless of the order the input batches are supplied, the set of output batches does not change. Let\u2019s say we only have two incoming messages, m and n, arriving at a node with starting state S. Giving m first changes the state to f(m, S) and produces output o(m, S); giving n next changes the state to f(n, f(m, S)) and produces output o(n, f(m, S)). Confluence requires that the resulting output set {o(m, S), o(n, f(m, S))} is the same as the set we would get if the messages arrived in the reverse order. Note that this assumes that m and n are all the messages the node will see. Confluence is about eventual consistency. While in general there are many ways to achieve this kind of behavior as long as only monotonic operations are applied to the state at each input batch (Bloom), in Quokka all the stock non-blocking operators take the approach of representing the state as sets of immutable batches of data that can only be added to.
This is clearly monotonic. If you are writing a stateful operator for Quokka, this is the recommended approach. What this means is that it is impossible to perform operations that require a specific batch amongst the set of batches, such as list indexing, since ordering of the batches in a set in the state is undefined. Most meaningful operations take the incoming message and produce an output that depends on the entire set, or not at all. An example of a confluent stateful operator in Quokka is a join. The code can be roughly summarized as follows: state0 = set() state1 = set() for each input: if input from stream0: state0.add(input) emit set(input.join(i) for i in state1) else: state1.add(input) emit set(i.join(input) for i in state0) Note that there is in fact a non-monotonic domain-specific optimization we can make that will preserve confluence in the case of a primary key join. Any input streamed in from stream0 can guarantee that any future records from that table will not have the same key value. Thus all state1 related to the record\u2019s key can be safely deleted. Quokka currently does not implement this optimization. Datasets and Streams Let's talk more about how non-blocking and blocking operators work in Quokka. Blocking operators could be introduced by operations like aggregations and sort, or simply by user command when they wish to materialize data with .materialize() (similar to .cache() semantics in Spark or .compute() semantics in Dask). Such blocking operators will produce a Dataset in Quokka, while non-blocking operators will produce a Stream . Downstream operators could depend on both upstream datasets and streams. The difference is that the upstream dataset need to be completely materialized when an operator starts executing, while a stream is just a promise that batches of data will be produced at some point in the future in any order. In other words, from the perspective of the operator, it can pull data from an upstream dataset and expects data to be pushed to it from the stream. In the very first code listing for TPC-H query 6, agged is a dataset whereas lineitem is a stream. In practice, a Quokka DAG can consist of many blocking operators and non-blocking operators organized in complicated ways. For example, here is the DAG for a PageRank application: As previously described, Quokka decomposes the computation into stages, with each stage ending in the creation of a Dataset. In this case the computation will be broken into two stages, the first of which consists of the nonblocking input sparse matrix read and caching (the upper row). The second will be the bottom row. The second stage depends on the first one, so it will be launched after the first one has completed. This is very similar to how stages in Spark work. (Note that strictly speaking, every stage has to start from a Dataset too. In this case the input nodes depend on Datasets that are pre-created in S3 or Disk, and are abbreviated in this graph.) Similarly to an RDD, Quokka represents a Dataset as a collection of immutable objects, and some associated metadata on those objects, which is itself an immutable object. The objects are all stored on a shared-memory object store with persistence (currently RocksDB). When you use task_graph.add_blocking_node in Quokka, a Dataset object will be returned. You can use this Dataset object in downstream operators. 
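For concreteness, the tail end of the two-stage listing in the Execution Model section shows this hand-off; the snippet below is a condensed, commented version of it, where output stands for the Dataset returned by the blocking join node and all names should be read as illustrative rather than a fixed API.

~~~python
# `output` is the Dataset returned by task_graph.new_blocking_node(...) in the
# two-stage listing above. A second task graph can stream it back in and keep going.
task_graph2 = TaskGraph()
joined_stream = task_graph2.new_input_from_dataset(output, {'localhost': 4})
final = task_graph2.new_blocking_node({0: joined_stream}, None, CountExecutor(),
                                      {'localhost': 4}, {0: 'key'})
task_graph2.initialize()
task_graph2.run()
~~~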
Quokka guarantees that by the time the downstream operators execute, all the Datasets that they depend on would have been materialized in this object store. The stock Dataset class in Quokka exposes some convenience methods such as an iterator to iterate through the objects. The user could also interact directly with the object store after looking up metadata from the Dataset object. There are more specialized Dataset class implementations in Quokka like KVDataset or RangeDataset which corresponds to hash-based partitioning or range-based partitioning of objects that expose more methods. The user could also implement a custom Dataset class that descends from Dataset with even more methods. It is important to ensure that when using a Dataset in a downstream operator that also takes streaming inputs, the confluence property is respected. Unfortunately, Quokka currently does not enforce this and it's possible for you to mess this up when writing your code. Although it's not that easy to mess up, since you cannot change the objects you read from the Dataset. A downstream operator could treat the Dataset as a stream by simply invoking the iterator to iterate through the objects in the Dataset. However, for many downstream operations, it might be desirable to explicitly convert a Dataset into a Stream again (e.g. to use stock operators that only have stream-based implementations). You can do that by using the specialized task node add_input_dataset . Internally, this task node just calls the iterator repeatedly and produce a stream of batches corresponding to the objects in the Dataset. Fault tolerance (future work) The current theory is a bit complicated. I am still thinking through how this should work exactly, but hopefully the gist gets through. Given our group of confluent stateful operators, how do we achieve fault tolerance? A Quokka application can be thought of as a DAG, where each node corresponds to a channel, from one of the task nodes. Each node is assigned to a physical hardware instance. Quokka is designed to expect many nodes to be assigned to one physical instance. For example, let's imagine the following case, where the nodes circled belongs to machine A and the rest belong to machine B, and nodes 1 and 2 are channels of the input node. 3, 4 and 5 are non-blocking operators, 6 and 7 are blocking operators. Quokka follows a checkpoint-based system where each channel periodically asynchronously checkpoints its local state to persistent storage (AWS S3). Note that this is quite efficient given the types of states we typically have, such as (typically) small intermediate aggregation results and sets of batches that are monotonically added to. (This is definitely an area of future work) The problem is easy to spot: \"yes checkpoints are great, but you must turn off the entire system when a machine fails to sync it back to the latest good state, and then reapply all the inputs.\" Yes that is true for a general-purpose streaming system like Flink or Naiad. Coordinated global rollbacks really suck. But in Quokka where all the stateful operators are confluent, this need not happen. What happens when machine A dies? TLDR: machine B can keep doing work as if nothing is wrong, while machine A's workload eventually gets rescheduled. The gory details: nodes 1, 3, 6 and 7 carry on with life (they won't even know machine A just died). 1 will notice that it can no longer send messages to 4 and 5. That's ok, it will just buffer those messages. 
3 and 6 will realize that they have fewer incoming messages now. 7 will notice that it has no more incoming messages. That's ok, they can work on their backlogs. The system then goes about recovering 2, 4 and 5. It will request a new machine to schedule 2, 4 and 5, or simply schedule them to machine B. 2 is a channel of an input node, which has no state. In Quokka, all messages sent between channels are tagged with a sequence number. The number starts from 0 and monotonically increases. This way, a task node can discard messages that arrive with the wrong sequence number. The state of a stateful operator is also tagged with a number. The state number starts from 0 and monotonically increases every time the state changes. When an operator checkpoints, it writes its state, its state number, and the latest sequence number it expects from its inputs. A consistent checkpoint contains all this information. Quokka will look at the last consistent checkpoint of nodes 4 and 5, and find the minimum of all the latest sequence numbers across both consistent checkpoints. This is the batch that 2 will now have to start producing. Let's say that node 4 had the smaller latest sequence number. Then node 4 will immediately start catching up. Node 5 will look at the incoming batches, find that their sequence numbers are smaller than expected, and proceed to ignore all of them. Eventually, node 5 will start recovering state as well. After both nodes catch up to the point where they died, nodes 6 and 7 will start accepting messages from node 4 and node 5 since now their messages have valid sequence numbers. What if, in this example, node 2 was not an input node but a task node? Then the dead subgraph has no way of re-reading the input. Long story short, each node needs to buffer outgoing messages until its children notify it that the state change caused by that outgoing message has been persisted to a checkpoint. This way, messages can be replayed when needed. All this while, machine B has been carrying on with its life. This means that if we started out in a load balanced way, then this fault recovery has introduced stragglers -- nodes 4 and 5 will now finish after node 3. This is actually okay from a resource-usage point of view. Note that nowhere in this process are we wasting resources across the cluster, as seen in global synchronous rollbacks. Only the lost states need to be recomputed, similar in spirit to the fault-tolerance mechanism in Spark. In addition, fault recovery requires minimal communication with workers that did not experience a fault, minimizing fault recovery overhead. Stragglers are okay for Quokka; we will mitigate them through the dynamic scheduling mechanism described in the next section. Scheduling and Autoscaling (future work) There are two auto-scaling strategies in Quokka. The first is automatic, while the second might require some user input. Recall that Quokka is designed to expect many channels to be assigned to the same physical hardware. But first, let's talk about how Quokka schedules channels to hardware, assuming that the graph is static, and the number and type of machines are fixed. Firstly, in the current runtime API, when instantiating a task node or input node, the user manually specifies how many channels there are and where those channels go. Dynamic channel scheduling is done when programming in higher-level APIs. We observe that each channel is in effect an independent stateful operator that can be scheduled independently.
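As a reminder of what that manual placement looks like, the sketch below reuses the placeholders from the TPC-H query 6 listing (bucket, key, lineitem_scheme) and the argument order documented in the TaskGraph API section below; the instance IP addresses are made up for illustration, and the exact call should be treated as a sketch rather than a fixed API.

~~~python
# Illustrative only: choose the channel count (parallelism) per node and pin each
# node to a machine via the ip argument described in the TaskGraph API section.
lineitem = task_graph.new_input_csv(bucket, key, lineitem_scheme, 8, ip='172.31.0.5', sep="|")
agged = task_graph.new_task_node({0: lineitem}, AggExecutor(), 1, {0: None}, ip='172.31.0.6')
~~~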
However, different scheduling strategies entail different communication costs. If channel A sends a large volume of messages to channel B, then we should schedule them on the same machine. Note that contrary to intuition, there is no benefit at all in scheduling multiple channels from the same input node or task node on the same machine apart from parallelism, since they never talk to each other. Channel scheduling can be dynamic, in the sense that a channel can be moved from one physical machine to another in a very straightforward way. The self-contained nature of an actor is an oft-quoted strength of the actor model. All that needs to happen is for Quokka to transfer the state of the actor to another node (which could be done asynchronously after the transfer decision is made), and change the partition function for the channel's parents so that the appropriate physical machine receives the incoming messages. The data transfer cost is the only cost in moving an actor. Different criteria can be used to decide if a channel should be moved to another physical machine. These could include machine-specific characteristics, such as limited memory available or high CPU usage on the current machine, or the lack thereof on the other machine. Quokka can also use channel-specific information, for example if the system observes the channel transferring large amounts of data to another channel on another machine and determines that the cost of moving this channel can be overcome by the benefit in data locality achieved after the move. The stragglers introduced by fault recovery can be mitigated in this fashion. Nodes 1 and 3 will finish before nodes 2 and 4/5, leaving machine B underutilized. The system will then try to move one of nodes 4/5 onto machine B. Manual autoscaling using combiner functions To be written. Example Applications TPC-H query 12 Pagerank Let's talk about how PageRank works in the Quokka programming model. TaskGraph API new_input_csv (bucket, key, names, parallelism, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024) Currently, new_input_csv only supports reading a CSV in batches from an AWS S3 bucket. Required arguments in order: bucket : str. AWS S3 bucket key : str. AWS S3 key names : list of str. Column names. Note that if your rows end with a delimiter value, such as in TPC-H, you will have to end this list with a placeholder such as \"null\". Look at the TPC-H code examples under apps. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. 8 is typically a good number. Keyword arguments: ip : str. the IP address of the physical machine the input node should be placed on. Defaults to local execution. batch_func : function. the user can optionally pass in a function to execute on the input CSV chunk before it's passed off to downstream dependents. Currently the input CSV is parsed into a Pandas Dataframe, so batch_func can be any Python function that takes a Pandas Dataframe as input and produces a Pandas Dataframe. This can be done to perform predicate pushdown for SQL, for example. sep : str. delimiter dependents : list of int. an input node can depend on other input nodes, i.e. only start once another input node is done. For example, to implement a hash join where one input might depend on another, one could do the following: a = new_input_csv(...) b = new_input_csv(...,dependents=[a]) stride : int.
how many bytes to read from the input S3 file at a time; defaults to 64 MB. Returns : a node id which is a handle to this input node, which can be used in the sources argument for task nodes or the dependents argument for other input nodes. new_input_parquet(bucket, key, names, parallelism, columns, skip_conditions, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024) Not yet implemented. new_task_node(sources, functionObject, parallelism, partition_key, ip='localhost') Instantiate a new task node with an executor object that defines the handler function which runs on each incoming batch. Required arguments in order: sources : dict of int -> int. the upstream sources that feed batches to this task node. Expects a dictionary, where the keys are integers and the values are node ids (also stored as integers). This in effect names the source nodes. i.e. if you specify {0: source_node_id_x, 1:source_node_id_y} , from the perspective of this task node you are calling the batches coming from source_node_id_x source 0 and the batches coming from source_node_id_y source 1. You will make use of these identifiers when writing the executor class's handler function for incoming batches. functionObject : an executor object which defines the input batch handler function. More details on this in the next section. You can write your own or use a pre-supplied one from the sql, ml or graph packages. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. Think carefully about this choice. Computationally intensive tasks might benefit from parallelism, while simple tasks such as aggregation might not. partition_key : dict of int -> int. This argument expects a dictionary with a key for each key in the sources dict. It describes how the input batches should be partitioned amongst the channels. If the value is None, then the input batch is copied and broadcast to all channels. Otherwise, currently each channel receives the sub-batch input_batch[input_batch.partition_key % parallelism == channel_id]. If this partition key is not in the input batch's columns from the specified source node, a runtime error would ensue. Keyword arguments: ip : str. the IP address of the physical machine the task node should be placed on. Defaults to local execution. Writing Your Own (Stateless) Executor Object The best place to learn how to write your own executor object classes is by looking at the available executor object classes in the SQL library. In short, an executor class is simply a child class of this base class: class StatelessExecutor: def __init__(self) -> None: raise NotImplementedError def early_termination(self): self.early_termination = True def execute(self,batch,stream_id, executor_id): raise NotImplementedError def done(self,executor_id): raise NotImplementedError The Stateless","title":"Quokka Runtime API documentation"},{"location":"runtime/#quokka-runtime-api-documentation","text":"","title":"Quokka Runtime API documentation"},{"location":"runtime/#programming-model","text":"A note about the name: the name is inspired by the Apache Flink icon, which is a chipmunk. A quokka is a marsupial that resembles a chipmunk.","title":"Programming Model"},{"location":"runtime/#motivation","text":"Popular big data processing frameworks such as Spark and Dask rely on bulk-synchronous execution on distributed datasets.
Often, a map-reduce style model is adopted, where mappers perform functions on partitions of the input, the mapper outputs are shuffled into groups, and after the shuffle has fully/mostly completed , reducers start working on each group. Typically this is implemented as a pull-based model where reducers pull required data from the mappers, who persist their output in some kind of external storage (disk or network) when fault tolerance is desired. There are a couple problems with this approach. The first, as recent works such as LinkedIn Magnet and Uber Zeus have identified, is that when each mapper doesn't have too much data for each reducer, the pull operation amounts to a bunch of random disk/network reads. This is horrible. The solution is push-based shuffles, where mappers push data to the reducers. Data can now be persisted on the reducer side for fault tolerance. However, this only addresses part of the problem. In a synchronous shuffle, even when mapper output is pushed to the reducers as soon as they are generated, the reducers can't start operating on said data until they have received near everything. This is because the current Map-Reduce paradigm stipulates that the reduction function is a function on all the data assigned to it from the mappers. This forces the reducers to start only after most of the mappers have completely executed, making any kind of pipelined parallel execution between the two impossible. This is unfortunate, because mappers and reducers often use very different resources (network I/O bound mappers + compute bound reducers), and can often be scheduled for parallel execution on the same physical instances without compromising too much the performance of either. Quokka's solution is to support two different kinds of reducer functions. Blocking reducers are similar to classic Map-Reduce reducers and block until they receive all mapper outputs. However, non-blocking reducers can start executing on mapper outputs as soon as they arrive, producing some output of its own and updating some local state. For example, sort, count and aggregation are blocking reducer functions because their output depend on all the data. However, join, filter and projection can be implemented in a non-blocking fashion with streaming operators. Non-blocking reducers can be pipelined with other non-blocking reducers, while a blocking reducer breaks the pipeline. Mappers are treated as non-blocking reducers where the output already exists in network/disk storage. We impose some limitations on the kinds of non-blocking operators we support, which are described in detail later. Logically, one can view Quokka execution as a series of stages, where each stage start with the output produced by a blocking operator, ends with another blocking operator, and executes non-blocking operators in between. The entire stage is executed in a pipeline-parallel fashion, and can be viewed as a pure streaming system. The stage inputs/outputs use Spark's lineage tracking based fault-tolerance and persistence mechanism. Since each Quokka stage now corresponds to a few Spark stages, Quokka also implements intra-stage fault tolerance based on checkpointing. The checkpointing recovery mechanism in Quokka conveniently avoids global asynchronous rollbacks, the bane of streaming systems, thanks to the restrictions we impose on the non-blocking operators. Quokka also aims to support autoscaling. 
(I have a plan to do this, but likely will not get to this until after the rotation.)","title":"Motivation"},{"location":"runtime/#execution-model","text":"The Quokka runtime API allows you to construct a task graph of nodes , which corresponds to a Quokka stage. This is very similar to other DAG-based processing frameworks such as Apache Spark or Tensorflow . For example, you can write the following code in the runtime API to execute TPC-H query 6: task_graph = TaskGraph() lineitem = task_graph.new_input_csv(bucket,key,lineitem_scheme,8,batch_func=lineitem_filter, sep=\"|\") agg_executor = AggExecutor() agged = task_graph.new_blocking_node({0:lineitem}, agg_executor, 1, {0:None}) task_graph.initialize() task_graph.run() There are perhaps a couple of things to note here. Firstly, there are two types of nodes in the runtime API. There are input nodes , declared with APIs such as new_input_csv or new_input_parquet , which interface with the external world (you can define where they will read their data), and task nodes , declared with new_non_blocking_node or new_blocking_node , which take as input the outputs generated from another node in the task graph, either an input node or another task node. Secondly, we see that the task node agged depends on the outputs from the input node lineitem . We will describe what exactly are the types of lineitem and agged later (the former is a stream and the latter is a dataset). Finally, note that the task graph ends with a blocking node. This is currently required, if you want to be able to interact with the results of the task graph execution. Multiple stages are implemented with multiple task graphs, with the first node of stage 2 reading from the output of stage 1, like the following: task_graph = TaskGraph() a = task_graph.new_input_csv(\"bump\",\"a-big.csv\",[\"key\"] + [\"avalue\" + str(i) for i in range(100)],{'localhost':2}) b = task_graph.new_input_csv(\"bump\",\"b-big.csv\",[\"key\"] + [\"bvalue\" + str(i) for i in range(100)],{'localhost':2}) join_executor = OOCJoinExecutor(on=\"key\") output = task_graph.new_blocking_node({0:quotes,1:trades},None, join_executor,{'localhost':4},{0:\"key\", 1:\"key\"}) task_graph.initialize() task_graph.run() del task_graph task_graph2 = TaskGraph() count_executor = CountExecutor() joined_stream = task_graph2.new_input_from_dataset(output,{'localhost':4}) final = task_graph2.new_blocking_node({0:joined_stream}, None, count_executor, {'localhost':4}, {0:'key'}) task_graph2.initialize() task_graph2.run() Note that since the output of a stage is persisted as in Spark, one can delete the first task graph and still access its outputs. Since a task graph represents one Quokka stage, it strictly follows push-based execution. This means that a node does not wait for its downstream dependencies to ask for data, but instead actively pushes data to its downstream dependencies whenever some intermediate results become available. In short, execution proceeds as follows : input nodes read batches of data from a specified source, which might be an external data source or the outputs of a previous stage, and pushes those batches to downstream task nodes. A task node exposes a handler to process incoming batches as they arrive, possibly updating some internal state, and for each input batch possibly produces an output batch for its own downstream children. The programmer is expected to supply this handler function as an executor object (e.g. OOCJoinExecutor , AggExecutor ). 
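As a hedged illustration of what such an executor object can look like, the sketch below combines the StatelessExecutor base class from the TaskGraph API section of this page with a made-up non-blocking subclass; the subclass name and its return conventions are illustrative, not part of the library.

~~~python
# Base class as shown in the TaskGraph API section (reformatted for readability).
class StatelessExecutor:
    def __init__(self) -> None:
        raise NotImplementedError
    def early_termination(self):
        self.early_termination = True
    def execute(self, batch, stream_id, executor_id):
        raise NotImplementedError
    def done(self, executor_id):
        raise NotImplementedError

class TagSourceExecutor(StatelessExecutor):
    """Hypothetical handler: pass each batch through, recording which source sent it."""
    def __init__(self):
        self.batches_seen = 0
    def execute(self, batch, stream_id, executor_id):
        # batch is a Pandas DataFrame; stream_id is 0 or 1 as named in the sources dict
        out = batch.copy()
        out["source"] = stream_id
        self.batches_seen += 1
        return out          # emitted immediately, so this executor does not break the pipeline
    def done(self, executor_id):
        return None         # nothing buffered at the end
~~~

Because execute returns a batch for every input it receives, this handler behaves like a non-blocking reducer; a blocking one would instead return None from execute and only emit a result in done.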
Quokka provides a library of pre-implemented executor objects that the programmer can use for SQL, ML and graph analytics. Each task node can have multiple physical executors, referred to as channels . This is a form of intra-operator data parallelism, as opposed to the inter-operator pipeline parallelism that results from all task nodes executing at the same time. These physical executors all execute the same handler function, but on different portions of the input batch, partitioned by a user-specified partition function. A Map-Reduce job with M mappers and R reducers would be implemented in Quokka as a single mapper task node and a single reducer task node, where the mapper task node has M channels and the reducer task node has R channels. In the example above, we specified that the input node lineitem has 8 channels, and the task node agged has only 1 channel. The partition key was not specified ( {0:None} ) since there is no parallelism, thus no need for partitioning. The situation looks something like the following picture: Quokka keeps track of all the channels and schedules them onto physical computing resources. For the engine, two channels from different task nodes are on more or less equal footing -- they can be scheduled on the same hardware or different hardware. A channel from an input node completes execution when there's no more inputs to be read or if all of its downstream dependencies have completed execution. A channel from a task node completes execution when: all of its upstream sources have completed execution if its execution handler decides to terminate early based on the input batch and its state (e.g. for a task node that executes the limit operator in a limit query, it might keep as local state the buffered output, and decide to terminate when that output size surpasses the limit number) if all its downstream dependencies have completed execution. By default, all channels start execution at once. This does not necessarily mean that they will start processing data, this means that they will all start waiting for input batches from their upstream sources to arrive. One could specify that an input node delay execution until another input node has finished. For example to implement a hash join one might want to stream in one table to build the hash table, then stream in the other table for probing. The runtime API is meant to be very flexible and support all manners of batch and stream processing. For example, one could specify an input node that listens to a Kafka stream, some task nodes which processes batches of data from that stream, and an output node that writes to another Kafka stream. In this case, since the input node will never terminate, and assuming the other nodes do not trigger early termination, the task graph will always be running. As a result of this flexibility, it requires quite a lot of knowledge for efficient utilization. As a result, we aim to provide higher level APIs to support common batch and streaming tasks in SQL, machine learning and graph analytics. Most programmers are not expected to program at the runtime API level, but rather make use of the pre-packaged higher-level APIs.","title":"Execution Model"},{"location":"runtime/#stateful-actors","text":"Let's talk more about task nodes in Quokka. Channels in task nodes can be treated as stateful operators in an actor programming model. 
Quokka adopts the notion of channels in a task node to specify that a group of actors all execute the same code, for fault tolerance and autoscaling purposes. One could override default Quokka behavior by simply specifying different task nodes with one channel each, all executing the same code. The key property of stateful operators in Quokka is confluence : in the context of nondeterministic message delivery, an operation on a single machine is confluent if it produces the same set of outputs for any nondeterministic ordering and batching of a set of inputs. (Hellerstein, CALM) Note that the output itself can also be produced in any order. It\u2019s easy to see that any composition of confluent operators is still confluent. We relax the confluent definition somewhat here to accept potentially different output sets, assuming they are all semantically correct. For example an operator that implements the LIMIT N clause in SQL can admit any of N input records it sees. More importantly, for Quokka we allow operators to depend on intra-stream ordering, just not inter-stream ordering. This means that it might still expect the inputs produced by a certain stream to observe some order, while there are no restrictions on the relative orderings between different input streams. Quokka as a system enforces intra-stream message order, but makes zero gurantees about inter-stream message orders. Henceforth, confluence will refer to this narrow definition, not the one defined in the CALM paper. Confluence is a very nice property to have in general, more so for streaming systems. Let\u2019s imagine a stateful operator with two different upstream operators producing messages. It is very nice if the system\u2019s correctness does not depend on the order in which the two upstream operators produce the messages, which could depend on network delay, task scheduling, etc. This is critical for performance in a push-based framework since a node should never wait on any one of its input streams. In addition, it also greatly facilitates fault tolerance, as messages from different sources can be replayed in any order in regards to one another, as we will describe later. Confluence is perhaps the key difference between Quokka and streaming-centric systems like Flink. In Flink you can totally write pipelines where the outputs depend very strongly on the order the inputs are supplied. In Quokka it is not allowed. (Really at this point, it's only \"not recommended\" -- there are no checks in place to see if your actor is confluent or not. What's guaranteed is that all the operators in the libraries supplied follow this model. Enforcing this is future work.) What are some examples of confluent stateful operators? First let's categorize the world of stateful operators we'd like to implement in data analytics. As mentioned previosuly, there are two important cateogories: nonblocking and blocking . Blocking operators cannot emit any outputs to their downstream children until all of their inputs have been processed. Examples are any kind of aggregation and sort. For (naive) aggregation, the stateful operator does not know it has the final result for any of its aggregation keys until it has seen all of its inputs. For sorting, the stateful operator cannot guarantee that it would emit results in sorted order until it has received all its inputs. We call any operator that is not blocking non-blocking. Example non-blocking operators are map, filter, projection and join. 
Blocking operators are pipeline breakers, and negate the benefits of using a streaming framework like Quokka. Confluence is easy to reason about for blocking operators. The blocking operator emit only one output, at the very end. We just have to make sure that this output is the same regardless of the order in which we supply the operator's inputs. Since this operator is typically a function of the final state, we just have to ensure that the final state is the same. If we imagine that each incoming message changes the state of the operator by function f , then it's easy to see that as long as f is commutative this is true. For example, any kind of aggregation is commutative, the merge step in merge-sort is commutative, etc. Confluence is harder to reason about for nonblocking operators. We must guarantee that regardless of the order the input batches are supplied, the set of output batches do not change. Let\u2019s say we only have two incoming messages, m and n, to a node with starting state S. Then the outputs produced by giving m first to S, changing the state S to f(m, S), while producing output o(m, S) and then giving n to S, changing the state to f(n, f(m,S)) while producing output o(n, f(m,S)), which is {o(m,S), o(n,f(m,s))} is the same as if we gave the outputs in the reverse order. Note that this assumes that m and n are all the messages the node will see. Confluence is about eventual consistency. While in general there are many ways to achieve this kind of behavior as long as only monotonic operations are applied to the state at each input batch (Bloom), in Quokka all the stock non-blocking operators take the approach of setting the state as sets of immutable batches of data, that can only be added to. This is clearly monotonic. If you are writing a stateful operator for Quokka, this is the recommended approach. What this means is that it is impossible to perform operations that require a specific batch amongst the set of batches, such as list indexing, since ordering of the batches in a set in the state is undefined. Most meaningful operations take the incoming message and produce an output that depends on the entire set, or not at all. An example of a confluent stateful operator in Quokka is a join. The code can be roughly summarized as follows: state0 = set() state1 = set() for each input: if input from stream0: state0.add(input) emit set(input.join(i) for i in state1) else: state1.add(input) emit set(i.join(input) for i in state0) Note that there is in fact a non-monotonic domain-specific optimization we can make that will preserve confluence in the case of a primary key join. Any input streamed in from stream0 can guarantee that any future records from that table will not have the same key value. Thus all state1 related to the record\u2019s key can be safely deleted. Quokka currently does not implement this optimization.","title":"Stateful Actors"},{"location":"runtime/#datasets-and-streams","text":"Let's talk more about how non-blocking and blocking operators work in Quokka. Blocking operators could be introduced by operations like aggregations and sort, or simply by user command when they wish to materialize data with .materialize() (similar to .cache() semantics in Spark or .compute() semantics in Dask). Such blocking operators will produce a Dataset in Quokka, while non-blocking operators will produce a Stream . Downstream operators could depend on both upstream datasets and streams. 
The difference is that the upstream dataset need to be completely materialized when an operator starts executing, while a stream is just a promise that batches of data will be produced at some point in the future in any order. In other words, from the perspective of the operator, it can pull data from an upstream dataset and expects data to be pushed to it from the stream. In the very first code listing for TPC-H query 6, agged is a dataset whereas lineitem is a stream. In practice, a Quokka DAG can consist of many blocking operators and non-blocking operators organized in complicated ways. For example, here is the DAG for a PageRank application: As previously described, Quokka decomposes the computation into stages, with each stage ending in the creation of a Dataset. In this case the computation will be broken into two stages, the first of which consists of the nonblocking input sparse matrix read and caching (the upper row). The second will be the bottom row. The second stage depends on the first one, so it will be launched after the first one has completed. This is very similar to how stages in Spark work. (Note that strictly speaking, every stage has to start from a Dataset too. In this case the input nodes depend on Datasets that are pre-created in S3 or Disk, and are abbreviated in this graph.) Similarly to an RDD, Quokka represents a Dataset as a collection of immutable objects, and some associated metadata on those objects, which is itself an immutable object. The objects are all stored on a shared-memory object store with persistence (currently RocksDB). When you use task_graph.add_blocking_node in Quokka, a Dataset object will be returned. You can use this Dataset object in downstream operators. Quokka guarantees that by the time the downstream operators execute, all the Datasets that they depend on would have been materialized in this object store. The stock Dataset class in Quokka exposes some convenience methods such as an iterator to iterate through the objects. The user could also interact directly with the object store after looking up metadata from the Dataset object. There are more specialized Dataset class implementations in Quokka like KVDataset or RangeDataset which corresponds to hash-based partitioning or range-based partitioning of objects that expose more methods. The user could also implement a custom Dataset class that descends from Dataset with even more methods. It is important to ensure that when using a Dataset in a downstream operator that also takes streaming inputs, the confluence property is respected. Unfortunately, Quokka currently does not enforce this and it's possible for you to mess this up when writing your code. Although it's not that easy to mess up, since you cannot change the objects you read from the Dataset. A downstream operator could treat the Dataset as a stream by simply invoking the iterator to iterate through the objects in the Dataset. However, for many downstream operations, it might be desirable to explicitly convert a Dataset into a Stream again (e.g. to use stock operators that only have stream-based implementations). You can do that by using the specialized task node add_input_dataset . Internally, this task node just calls the iterator repeatedly and produce a stream of batches corresponding to the objects in the Dataset.","title":"Datasets and Streams"},{"location":"runtime/#fault-tolerance-future-work","text":"The current theory is a bit complicated. 
I am still thinking through how this should work exactly, but hopefully the gist gets through. Given our group of confluent stateful operators, how do we achieve fault tolerance? A Quokka application can be thought of as a DAG, where each node corresponds to a channel, from one of the task nodes. Each node is assigned to a physical hardware instance. Quokka is designed to expect many nodes to be assigned to one physical instance. For example, let's imagine the following case, where the nodes circled belongs to machine A and the rest belong to machine B, and nodes 1 and 2 are channels of the input node. 3, 4 and 5 are non-blocking operators, 6 and 7 are blocking operators. Quokka follows a checkpoint-based system where each channel periodically asynchronously checkpoints its local state to persistent storage (AWS S3). Note that this is quite efficient given the types of states we typically have, such as (typically) small intermediate aggregation results and sets of batches that are monotonically added to. (This is definitely an area of future work) The problem is easy to spot: \"yes checkpoints are great, but you must turn off the entire system when a machine fails to sync it back to the latest good state, and then reapply all the inputs.\" Yes that is true for a general-purpose streaming system like Flink or Naiad. Coordinated global rollbacks really suck. But in Quokka where all the stateful operators are confluent, this need not happen. What happens when machine A dies? TLDR: machine B can keep doing work as if nothing is wrong, while machine A's workload eventually gets rescheduled. The gory details: nodes 1, 3, 6 and 7 carry on with life (they won't even know machine A just died). 1 will notice that it can no longer send messages to 4 and 5. That's ok, it will just buffer those messages. 3 and 6 will realize that they have fewer incoming messages now. 7 will notice that they have no more incoming messages. That's ok, they can work on their backlog. The system then goes about recovering 2, 4 and 5. It will request a new machine to schedule 2, 4 and 5, or simply schedule them to machine B. 2 is a channel of an input node, which has no state. In Quokka, all message sent between channels are tagged with a sequence number. The number starts from 0 and monotonically increases. This way, the task node discards messages with a wrong sequence number. The state of a stateful operator is also tagged with a number. The state number starts from 0 and monotonically increases every time. When an operator checkpoints, it writes its state, its state number, and the latest sequence number it expects from its inputs. A consistent checkpoint contains all this information. Quokka will look at the last consistent checkpoint of nodes 4 and 5, and find the minimum of all the latest sequence numbers across both consistent checkpoints. This is the batch that 2 will now have to start to produce. Let's say that node 4 had the smaller latest sequence number. Then node 4 will immediately start catching up. Node 5 will look at the incoming batches, find that their sequence numbers are smaller than expected, and proceed to ignore all of them. Eventually, node 5 will start recovering state as well. After both nodes catch up to the point where they died, node 6 and 7 will start accepting messages from node 4 and node 5 since now their messages have valid sequence numbers. What if in this example, node 2 was not an input node but a task node? Then the dead subgraph has no way of re-reading the input. 
Long story short, each node needs to buffer outgoing messages, until its children notify it that the state change affected by that outgoing message has been persisted to a checkpoint. This way, messages can be replayed when needed. All this while, machine B has been carrying on with its life. This means that if we started out in a load balanced way, then this fault recovery has introduced stragglers -- node 4 and 5 will now finish after node 3. This is actually okay from a resource-usage point of view. Note that nowhere in this process are we wasting resources across the cluster, as seen in global synchronous rollbacks. Only the lost states need to be recomputed, similar in vein to the fault-tolerance mechanism in Spark. In addition, fault recovery required minimal communication with workers that did not experience a fault, minimizing fault recovery overhead. Stragglers are okay for Quokka, we will mediate them through the dynamic scheduling mechanism described in the next section.","title":"Fault tolerance (future work)"},{"location":"runtime/#scheduling-and-autoscaling-future-work","text":"There are two auto-scaling strategies in Quokka. The first is automatic, while the second might require some user input. Recall that Quokka is designed to expect many channels to be assigned to the same physical hardware. But first, let's talk about how Quokka schedules channels to hardware, assuming that the graph is static, and the number and type of machines are fixed. Firstly, in the current runtime API, when instantiating a task node or input node, the user manually specifies how many channels are there and where those channels go. Dynamic channel scheduling is done when programming in higher-level APIs. We observe that each channel is in effect an independent stateful oeprator that can be scheduled independently. However, different scheduling strategies entail different communication costs. If channel A sends a large volume of messages to channel B, then we should schedule them on the same machine. Note that contrary to intuition, there is no benefit at all in scheduling multiple channels from the same input node or task node on the same machine apart from parallelism, since they never talk to each other. Channel scheduling can be dynamic, in the sense that a channel can be moved from one physical machine to another in a very straight-forward way. The self-contained nature of an actor is an oft-quoted strength of the actor model. All that needs to happen is for Quokka to transfer the state of the actor to another node (which could be done asynchronously after the transfer decision is made), and change the partition function for the channel's parents so that the appropriate physical machine receives the incoming messages. The data transfer cost is the only cost in moving an actor. Different criteria can be used to decide if a channel should be moved to another physical machine. These could include machine specific characteristics, such as limited memory available or high CPU usage on the current machine, or the lack thereof on the other machine. Quokka can also use channel-specific information, for example if the system observes the channel transfering large amounts of data to another channel on another machine and determines that the cost in moving this channel can be overcame by the benefit in data locality achieved after the move. The stragglers introduced by fault recovery can be mediated in this fashion. 
Concretely, nodes 1 and 3 will finish before nodes 2, 4 and 5, leaving spare capacity on machine B. The system will then try to move node 4 or node 5 onto machine B.","title":"Scheduling and Autoscaling (future work)"},{"location":"runtime/#manual-autoscaling-using-combiner-functions","text":"To be written.","title":"Manual autoscaling using combiner functions"},{"location":"runtime/#example-applications","text":"","title":"Example Applications"},{"location":"runtime/#tpc-h-query-12","text":"","title":"TPC-H query 12"},{"location":"runtime/#pagerank","text":"Let's talk about how PageRank works in the Quokka programming model.","title":"Pagerank"},{"location":"runtime/#taskgraph-api","text":"","title":"TaskGraph API"},{"location":"runtime/#new_input_csv-bucket-key-names-parallelism-iplocalhostbatch_funcnone-sep-dependents-stride-64-1024-1024","text":"Currently, new_input_csv only supports reading a CSV in batches from an AWS S3 bucket. Required arguments in order: bucket : str. AWS S3 bucket. key : str. AWS S3 key. names : list of str. Column names. Note that if your rows end with a delimiter, such as in TPC-H, you will have to end this list with a placeholder such as \"null\". Look at the TPC-H code examples under apps. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. 8 is typically a good number. Keyword arguments: ip : str. the IP address of the physical machine where the input node should be placed. Defaults to local execution. batch_func : function. the user can optionally pass in a function to execute on each input CSV chunk before it's passed off to downstream dependents. Currently the input CSV is parsed into a Pandas DataFrame, so batch_func can be any Python function that takes a Pandas DataFrame as input and produces a Pandas DataFrame. This can be used to perform predicate pushdown for SQL, for example. sep : str. the delimiter. dependents : list of int. an input node can depend on other input nodes, i.e. only start once another input node is done. For example, to implement a hash join where one input might depend on another, one could do the following: a = new_input_csv(...) b = new_input_csv(...,dependents=[a]) stride : int. how many bytes to read from the input S3 file at a time, defaults to 64 MB. Returns : a node id, which is a handle to this input node that can be used as the sources argument for task nodes or the dependents argument for other input nodes.","title":"new_input_csv (bucket, key, names, parallelism, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024)"},{"location":"runtime/#new_input_parquetbucket-key-names-parallelism-columns-skip_conditions-iplocalhostbatch_funcnone-sep-dependents-stride-64-1024-1024","text":"Not yet implemented.","title":"new_input_parquet(bucket, key, names, parallelism, columns, skip_conditions, ip='localhost',batch_func=None, sep = \",\", dependents = [], stride = 64 * 1024 * 1024)"},{"location":"runtime/#new_task_nodesources-functionobject-parallelism-partition_key-iplocalhost","text":"Instantiate a new task node with an executor object that defines the handler function which runs on each incoming batch. Required arguments in order: sources : dict of int -> int. the upstream sources that feed batches to this task node. Expects a dictionary, where the keys are integers and the values are node ids (also stored as integers). This in effect names the source nodes. For example,
if you specify {0: source_node_id_x, 1: source_node_id_y} , then from the perspective of this task node, the batches coming from source_node_id_x are called source 0 and the batches coming from source_node_id_y are called source 1. You will make use of these identifiers when writing the executor class's handler function for incoming batches. functionObject : an executor object which defines the input batch handler function. More details on this in the next section. You can write your own or use a pre-supplied one from the sql, ml or graph packages. parallelism : int. the runtime API expects the programmer to explicitly state the amount of intra-op parallelism to expose. Think carefully about this choice. Computationally intensive tasks might benefit from parallelism, while simple tasks such as aggregation might not. partition_key : dict of int -> str. This argument expects a dictionary with a key for each key in the sources dict. It describes how the input batches should be partitioned amongst the channels. If the value is None, then the input batch is copied and broadcast to all channels. Otherwise, currently each channel receives the sub-batch input_batch[input_batch.partition_key % parallelism == channel_id]. If this partition key is not in the input batch's columns from the specified source node, a runtime error will ensue. Keyword arguments: ip : str. the IP address of the physical machine where the task node should be placed. Defaults to local execution.","title":"new_task_node(sources, functionObject, parallelism, partition_key, ip='localhost')"},{"location":"runtime/#writing-your-own-stateless-executor-object","text":"The best place to learn how to write your own executor object classes is by looking at the available executor object classes in the SQL library. In short, an executor class is simply a child class of this base class: class StatelessExecutor: def __init__(self) -> None: raise NotImplementedError def early_termination(self): self.early_termination = True def execute(self,batch,stream_id, executor_id): raise NotImplementedError def done(self,executor_id): raise NotImplementedError","title":"Writing Your Own (Stateless) Executor Object"},{"location":"simple/","text":"Tutorials This section is for learning how to use Quokka's DataStream API. Quokka's DataStream API is basically a dataframe API. It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to \"collect\" the result, just like Spark. For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here . It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. If you are testing this on a VM where clicking the link can't work, try this command after pip installing gdown: ~/.local/bin/gdown https://drive.google.com/uc?id=19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO . The SQL queries themselves can be found on this awesome interface . These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- if you find a bug in this tutorial I will buy you a cup of coffee! For an extensive API reference, please refer to here . Lesson -1: Things Please read the Getting Started section.
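As a concrete illustration of subclassing the StatelessExecutor base class from the runtime reference above, here is a hypothetical stateless executor that filters each incoming batch with a user-supplied predicate, plus one way it might be wired up with new_task_node. The class, predicate, column names and wiring are assumptions sketched for exposition, not a canned Quokka executor.

~~~python
# Hypothetical stateless executor: assumes each incoming batch is a DataFrame-like
# object (Pandas or Polars) that supports boolean-mask filtering.
class FilterExecutor(StatelessExecutor):
    def __init__(self, predicate) -> None:
        # predicate maps a batch to a boolean mask over its rows
        self.predicate = predicate

    def execute(self, batch, stream_id, executor_id):
        # Stateless: each batch is handled independently; whatever we return
        # is forwarded to downstream nodes.
        if batch is None or len(batch) == 0:
            return None
        return batch[self.predicate(batch)]

    def done(self, executor_id):
        # Nothing to flush for a stateless operator.
        return None

# Hypothetical wiring: consume batches from an upstream node `numbers_node` with
# 4 channels, partitioning incoming batches on a column named "key".
# filtered = task_graph.new_task_node({0: numbers_node}, FilterExecutor(lambda b: b["val"] < 24), 4, {0: "key"})
~~~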
I spent way too much time making the cartoons on that page. Lesson 0: Reading Things For every Quokka program, we need to set up a QuokkaContext object. This is similar to the Spark SQLContext . This can easily be done by running the following two lines of code in your Python terminal. from pyquokka.df import * qc = QuokkaContext() Once we have the QuokkaContext object, we can start reading data to obtain DataStreams. Quokka can read data on disk and on the cloud (currently S3). For the purposes of this tutorial we will be reading data from disk. Quokka currently reads CSV and Parquet, with plans to add JSON soon. Here is how you would read a CSV file if you know the schema: # the last column is called NULL, because the TPC-H data generator likes to put a | at the end of each row, making it appear as if there is a final column # with no values. Don't worry, we can drop this column. lineitem_scheme = [\"l_orderkey\",\"l_partkey\",\"l_suppkey\",\"l_linenumber\",\"l_quantity\",\"l_extendedprice\", \"l_discount\",\"l_tax\",\"l_returnflag\",\"l_linestatus\",\"l_shipdate\",\"l_commitdate\",\"l_receiptdate\",\"l_shipinstruct\",\"l_shipmode\",\"l_comment\", \"null\"] lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", lineitem_scheme, sep=\"|\") And if you don't know the schema but there is a header row where column names are separated with the same separator as the data : lineitem = qc.read_csv(disk_path + \"lineitem.tbl.named\", sep=\"|\", has_header=True) You can also read a directory of CSV files: lineitem = qc.read_csv(disk_path + \"lineitem/*\", lineitem_scheme, sep=\"|\", has_header = True) Now let's read all the tables of the TPC-H benchmark suite. Set disk_path to where you unzipped the files. lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", sep=\"|\", has_header=True) orders = qc.read_csv(disk_path + \"orders.tbl\", sep=\"|\", has_header=True) customer = qc.read_csv(disk_path + \"customer.tbl\",sep = \"|\", has_header=True) part = qc.read_csv(disk_path + \"part.tbl\", sep = \"|\", has_header=True) supplier = qc.read_csv(disk_path + \"supplier.tbl\", sep = \"|\", has_header=True) partsupp = qc.read_csv(disk_path + \"partsupp.tbl\", sep = \"|\", has_header=True) nation = qc.read_csv(disk_path + \"nation.tbl\", sep = \"|\", has_header=True) region = qc.read_csv(disk_path + \"region.tbl\", sep = \"|\", has_header=True) If you want to read the Parquet files, you should first run this script to generate the Parquet files: import polars as pl disk_path = \"/home/ubuntu/tpc-h/\" #replace files = [\"lineitem.tbl\",\"orders.tbl\",\"customer.tbl\",\"part.tbl\",\"supplier.tbl\",\"partsupp.tbl\",\"nation.tbl\",\"region.tbl\"] for file in files: df = pl.read_csv(disk_path + file,sep=\"|\",has_header = True, parse_dates = True).drop(\"null\") df.write_parquet(disk_path + file.replace(\"tbl\", \"parquet\"), row_group_size=100000) To read in a Parquet file, you don't have to worry about headers or schema, just do: lineitem = qc.read_parquet(disk_path + \"lineitem.parquet\") Currently, qc.read_csv and qc.read_parquet will either return a DataStream or just a Polars DataFrame directly if the data size is small (set at 10 MB). Lesson 1: Doing Things Now that we have read the data, let's do things with it. First, why don't we count how many rows there are in the lineitem table. >>> lineitem.aggregate({\"*\":\"count\"}).collect() If you don't see the number 6001215 after a while, something is very wrong. Please send me an email, I will help you fix things (and buy you a coffee): zihengw@stanford.edu. 
Feel free to type other random things and see if it's supported, but for those interested, let's follow a structured curriculum. Let's take a look at TPC-H query 1 . This is how you would write it in Quokka. This is very similar to how you'd write in another DataFrame library like Polars or Dask. def do_1(): d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") d = d.with_column(\"disc_price\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) d = d.with_column(\"charge\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]) * (1 + x[\"l_tax\"]), required_columns={\"l_extendedprice\", \"l_discount\", \"l_tax\"}) f = d.groupby([\"l_returnflag\", \"l_linestatus\"], orderby=[\"l_returnflag\",\"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"charge\":\"sum\", \"l_discount\":\"avg\",\"*\":\"count\"}) return f.collect() Quokka supports filtering DataStreams by DataStream.filter() . Filters can be specified in SQL syntax. The columns in the SQL expression must exist in the schema of the DataStream. A more Pythonic way of doing this like b = b[b.a < 5] isn't supported yet, mainly due to the finickiness surrounding date types etc. The result of a filter() is another DataStream whose Polars DataFrames will only contain rows that respect the predicate. On the plus side, Quokka uses the amazing SQLGlot library to support most ANSI-SQL compliant predicates, including dates, between, IN, even arithmetic in conditions. Try out some different predicates ! Please give SQLGlot a star when you're at it. For example, you can specify this super complicated predicate for TPC-H query 6 : def do_6(): d = lineitem.filter(\"l_shipdate >= date '1994-01-01' and l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * x[\"l_discount\"], required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.aggregate({\"revenue\":[\"sum\"]}) return f.collect() Quokka supports creating new columns in DataStreams with with_column . Read more about how this works here . This is in principle similar to Spark df.with_column and Pandas UDFs. The main thing to keep in mind is that the function you supply will be applied to each batch in the DataStream, instead of row by row. As a result, you can make use of fast vectorized execution with Polars. The mental model here is that we have a DataStream d of Polars DataFrames, each of which have rows from the lineitem table satisfying the filter predicate. Then, each Polars DataFrame is transformed by our functions to add the columns disk_price and charge . Like most Quokka operations, with_column will produce a new DataStream with an added column and is not inplace. This means that the command is lazy, and won't trigger the runtime to produce the actual data. It simply builds a logical plan of what to do in the background, which can be optimized when the user specifically ask for the result. Finally, we can group the DataStream and aggregate it to get the result. Read more about aggregation syntax here . The aggregation will produce another DataStream, which we call collect() on, to convert it to a Polars DataFrame in your Python terminal. When you call .collect() , the logical plan you have built is actually optimized and executed. This is exactly how Spark works. 
To view the optimized logical plan and learn more about what Quokka is doing, you can do f.explain() which will produce a graph, or f.explain(mode=\"text\") which will produce a textual explanation. Joins work very intuitively. For example, this is how to do TPC-H query 12 . def do_12(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = d.filter(\"l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \\ l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'\") d = d.with_column(\"high\", lambda x: (x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns={\"o_orderpriority\"}) d = d.with_column(\"low\", lambda x: (x[\"o_orderpriority\"] != \"1-URGENT\") & (x[\"o_orderpriority\"] != \"2-HIGH\"), required_columns={\"o_orderpriority\"}) f = d.groupby(\"l_shipmode\").aggregate(aggregations={'high':['sum'], 'low':['sum']}) return f.collect() Note it does not matter if you filter after the join or before the join, Quokka will automatically push them down during the logical plan optimization. The join operator on a DataStream takes in either another DataStream or a Polars DataFrame in your Python session. In the latter case, this Polars DataFrame will be broadcasted to different workers similar to Spark's broadcast join. Here is another example, TPC-H query 3 . def do_3(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = customer.join(d,left_on=\"c_custkey\", right_on=\"o_custkey\") d = d.filter(\"c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * ( 1 - x[\"l_discount\"]) , required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.groupby([\"l_orderkey\",\"o_orderdate\",\"o_shippriority\"]).agg({\"revenue\":[\"sum\"]}) return f.collect() Note unlike some SQL engines, Quokka currently will not try to figure out the optimal join ordering between the specified three-way join between lineitem, orders and customer tables. You are responsible for figuring that out at the moment -- try to join smaller tables first and then join them against larger tables, or try to minimize the intermeidate result size from those joins. An important thing to note is that Quokka currently only support inner joins. Other kinds of joins are coming soon. Feel free to look at some other queries in the Quokka github , or browse the API reference . While you are there, please give Quokka a star! Lesson 2: Writing Things So far, we have just learned about how to read things into DataStreams and do things to DataStreams. You can also write out DataStreams to persistent storage like disk or S3 to record all the amazing things we did with them. Quokka currently operates like Spark and by default writes a directory of files, with a default maximum file size for different file formats. This makes it easy to perform parallel writing. To write out a DataStream to CSV or Parquet to a local directory (you must specify a valid absolute path), simply do: d.write_csv(\"/home/ubuntu/test-path/\") d.write_parquet(\"/home/ubuntu/test-path/\") To write out a DataStream to S3, you should specify an S3 bucket and prefix like this: d.write_csv(\"s3://bucket/prefix/\") d.write_parquet(\"s3://bucket/prefix/\") Writing out a DataStream is a blocking API and will automatically call a collect() for you. 
The collected Polars DataFrame at the end is just a column of filenames produced. Lesson 3: Things you can't do. Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped. Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation. You cannot safely emit the result of a sum of a column of a table until you have seen every row! The main examples of this in data processing are groupby-aggregations and distributed sorts. Currently, calling groupby().agg() or just agg() on a DataStream will produce another DataStream. However that DataStream will consist of exactly one batch, which holds the final result, emitted when it's computed. It is recommended to just call collect() or compute() on that result. Quokka currently does not support distributed sort -- indeed a sort heavy workload is really great for Spark. Distributed sorting is not exactly needed for many analytical SQL workloads since you typically do the aggregation before the order by, which greatly reduce the number of rows you have to sort. You can then sort after you have done collect() . However for many other workloads distributed sorting is critical, and Quokka aims to support this as soon as possible. Things that Quokka can do and doesn't do yet: fine grained placement of UDFs or UDAFs on GPUs or CPUs, core-count-control, Docker support, reading JSON, etc. Most of these can be easily implemented (and some already are) in the graph level API, however it takes effort to figure out what's the best abstractions to expose in the DataStream API. If you want to make this list shorter, I welcome contributions: zihengw@stanford.edu.","title":"DataStream API"},{"location":"simple/#tutorials","text":"This section is for learning how to use Quokka's DataStream API. Quokka's DataStream API is basically a dataframe API. It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to \"collect\" the result, just like Spark. For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here . It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. If you are testing this on a VM where clicking the link can't work, try this command after pip installing gdown: ~/.local/bin/gdown https://drive.google.com/uc?id=19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO . The SQL queries themselves can be found on this awesome interface . These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- if you find a bug in this tutorial I will buy you a cup of coffee! For an extensive API reference, please refer to here .","title":"Tutorials"},{"location":"simple/#lesson-1-things","text":"Please read the Getting Started section. 
I spent way too much time making the cartoons on that page.","title":"Lesson -1: Things"},{"location":"simple/#lesson-0-reading-things","text":"For every Quokka program, we need to set up a QuokkaContext object. This is similar to the Spark SQLContext . This can easily be done by running the following two lines of code in your Python terminal. from pyquokka.df import * qc = QuokkaContext() Once we have the QuokkaContext object, we can start reading data to obtain DataStreams. Quokka can read data on disk and on the cloud (currently S3). For the purposes of this tutorial we will be reading data from disk. Quokka currently reads CSV and Parquet, with plans to add JSON soon. Here is how you would read a CSV file if you know the schema: # the last column is called NULL, because the TPC-H data generator likes to put a | at the end of each row, making it appear as if there is a final column # with no values. Don't worry, we can drop this column. lineitem_scheme = [\"l_orderkey\",\"l_partkey\",\"l_suppkey\",\"l_linenumber\",\"l_quantity\",\"l_extendedprice\", \"l_discount\",\"l_tax\",\"l_returnflag\",\"l_linestatus\",\"l_shipdate\",\"l_commitdate\",\"l_receiptdate\",\"l_shipinstruct\",\"l_shipmode\",\"l_comment\", \"null\"] lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", lineitem_scheme, sep=\"|\") And if you don't know the schema but there is a header row where column names are separated with the same separator as the data : lineitem = qc.read_csv(disk_path + \"lineitem.tbl.named\", sep=\"|\", has_header=True) You can also read a directory of CSV files: lineitem = qc.read_csv(disk_path + \"lineitem/*\", lineitem_scheme, sep=\"|\", has_header = True) Now let's read all the tables of the TPC-H benchmark suite. Set disk_path to where you unzipped the files. lineitem = qc.read_csv(disk_path + \"lineitem.tbl\", sep=\"|\", has_header=True) orders = qc.read_csv(disk_path + \"orders.tbl\", sep=\"|\", has_header=True) customer = qc.read_csv(disk_path + \"customer.tbl\",sep = \"|\", has_header=True) part = qc.read_csv(disk_path + \"part.tbl\", sep = \"|\", has_header=True) supplier = qc.read_csv(disk_path + \"supplier.tbl\", sep = \"|\", has_header=True) partsupp = qc.read_csv(disk_path + \"partsupp.tbl\", sep = \"|\", has_header=True) nation = qc.read_csv(disk_path + \"nation.tbl\", sep = \"|\", has_header=True) region = qc.read_csv(disk_path + \"region.tbl\", sep = \"|\", has_header=True) If you want to read the Parquet files, you should first run this script to generate the Parquet files: import polars as pl disk_path = \"/home/ubuntu/tpc-h/\" #replace files = [\"lineitem.tbl\",\"orders.tbl\",\"customer.tbl\",\"part.tbl\",\"supplier.tbl\",\"partsupp.tbl\",\"nation.tbl\",\"region.tbl\"] for file in files: df = pl.read_csv(disk_path + file,sep=\"|\",has_header = True, parse_dates = True).drop(\"null\") df.write_parquet(disk_path + file.replace(\"tbl\", \"parquet\"), row_group_size=100000) To read in a Parquet file, you don't have to worry about headers or schema, just do: lineitem = qc.read_parquet(disk_path + \"lineitem.parquet\") Currently, qc.read_csv and qc.read_parquet will either return a DataStream or just a Polars DataFrame directly if the data size is small (set at 10 MB).","title":"Lesson 0: Reading Things"},{"location":"simple/#lesson-1-doing-things","text":"Now that we have read the data, let's do things with it. First, why don't we count how many rows there are in the lineitem table. 
>>> lineitem.aggregate({\"*\":\"count\"}).collect() If you don't see the number 6001215 after a while, something is very wrong. Please send me an email, I will help you fix things (and buy you a coffee): zihengw@stanford.edu. Feel free to type other random things and see if it's supported, but for those interested, let's follow a structured curriculum. Let's take a look at TPC-H query 1 . This is how you would write it in Quokka. This is very similar to how you'd write in another DataFrame library like Polars or Dask. def do_1(): d = lineitem.filter(\"l_shipdate <= date '1998-12-01' - interval '90' day\") d = d.with_column(\"disc_price\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]), required_columns ={\"l_extendedprice\", \"l_discount\"}) d = d.with_column(\"charge\", lambda x: x[\"l_extendedprice\"] * (1 - x[\"l_discount\"]) * (1 + x[\"l_tax\"]), required_columns={\"l_extendedprice\", \"l_discount\", \"l_tax\"}) f = d.groupby([\"l_returnflag\", \"l_linestatus\"], orderby=[\"l_returnflag\",\"l_linestatus\"]).agg({\"l_quantity\":[\"sum\",\"avg\"], \"l_extendedprice\":[\"sum\",\"avg\"], \"disc_price\":\"sum\", \"charge\":\"sum\", \"l_discount\":\"avg\",\"*\":\"count\"}) return f.collect() Quokka supports filtering DataStreams by DataStream.filter() . Filters can be specified in SQL syntax. The columns in the SQL expression must exist in the schema of the DataStream. A more Pythonic way of doing this like b = b[b.a < 5] isn't supported yet, mainly due to the finickiness surrounding date types etc. The result of a filter() is another DataStream whose Polars DataFrames will only contain rows that respect the predicate. On the plus side, Quokka uses the amazing SQLGlot library to support most ANSI-SQL compliant predicates, including dates, between, IN, even arithmetic in conditions. Try out some different predicates ! Please give SQLGlot a star when you're at it. For example, you can specify this super complicated predicate for TPC-H query 6 : def do_6(): d = lineitem.filter(\"l_shipdate >= date '1994-01-01' and l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * x[\"l_discount\"], required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.aggregate({\"revenue\":[\"sum\"]}) return f.collect() Quokka supports creating new columns in DataStreams with with_column . Read more about how this works here . This is in principle similar to Spark df.with_column and Pandas UDFs. The main thing to keep in mind is that the function you supply will be applied to each batch in the DataStream, instead of row by row. As a result, you can make use of fast vectorized execution with Polars. The mental model here is that we have a DataStream d of Polars DataFrames, each of which have rows from the lineitem table satisfying the filter predicate. Then, each Polars DataFrame is transformed by our functions to add the columns disk_price and charge . Like most Quokka operations, with_column will produce a new DataStream with an added column and is not inplace. This means that the command is lazy, and won't trigger the runtime to produce the actual data. It simply builds a logical plan of what to do in the background, which can be optimized when the user specifically ask for the result. Finally, we can group the DataStream and aggregate it to get the result. Read more about aggregation syntax here . 
The aggregation will produce another DataStream, which we call collect() on, to convert it to a Polars DataFrame in your Python terminal. When you call .collect() , the logical plan you have built is actually optimized and executed. This is exactly how Spark works. To view the optimized logical plan and learn more about what Quokka is doing, you can do f.explain() which will produce a graph, or f.explain(mode=\"text\") which will produce a textual explanation. Joins work very intuitively. For example, this is how to do TPC-H query 12 . def do_12(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = d.filter(\"l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \\ l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'\") d = d.with_column(\"high\", lambda x: (x[\"o_orderpriority\"] == \"1-URGENT\") | (x[\"o_orderpriority\"] == \"2-HIGH\"), required_columns={\"o_orderpriority\"}) d = d.with_column(\"low\", lambda x: (x[\"o_orderpriority\"] != \"1-URGENT\") & (x[\"o_orderpriority\"] != \"2-HIGH\"), required_columns={\"o_orderpriority\"}) f = d.groupby(\"l_shipmode\").aggregate(aggregations={'high':['sum'], 'low':['sum']}) return f.collect() Note it does not matter if you filter after the join or before the join, Quokka will automatically push them down during the logical plan optimization. The join operator on a DataStream takes in either another DataStream or a Polars DataFrame in your Python session. In the latter case, this Polars DataFrame will be broadcasted to different workers similar to Spark's broadcast join. Here is another example, TPC-H query 3 . def do_3(): d = lineitem.join(orders,left_on=\"l_orderkey\", right_on=\"o_orderkey\") d = customer.join(d,left_on=\"c_custkey\", right_on=\"o_custkey\") d = d.filter(\"c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'\") d = d.with_column(\"revenue\", lambda x: x[\"l_extendedprice\"] * ( 1 - x[\"l_discount\"]) , required_columns={\"l_extendedprice\", \"l_discount\"}) f = d.groupby([\"l_orderkey\",\"o_orderdate\",\"o_shippriority\"]).agg({\"revenue\":[\"sum\"]}) return f.collect() Note unlike some SQL engines, Quokka currently will not try to figure out the optimal join ordering between the specified three-way join between lineitem, orders and customer tables. You are responsible for figuring that out at the moment -- try to join smaller tables first and then join them against larger tables, or try to minimize the intermeidate result size from those joins. An important thing to note is that Quokka currently only support inner joins. Other kinds of joins are coming soon. Feel free to look at some other queries in the Quokka github , or browse the API reference . While you are there, please give Quokka a star!","title":"Lesson 1: Doing Things"},{"location":"simple/#lesson-2-writing-things","text":"So far, we have just learned about how to read things into DataStreams and do things to DataStreams. You can also write out DataStreams to persistent storage like disk or S3 to record all the amazing things we did with them. Quokka currently operates like Spark and by default writes a directory of files, with a default maximum file size for different file formats. This makes it easy to perform parallel writing. 
To write out a DataStream to CSV or Parquet to a local directory (you must specify a valid absolute path), simply do: d.write_csv(\"/home/ubuntu/test-path/\") d.write_parquet(\"/home/ubuntu/test-path/\") To write out a DataStream to S3, you should specify an S3 bucket and prefix like this: d.write_csv(\"s3://bucket/prefix/\") d.write_parquet(\"s3://bucket/prefix/\") Writing out a DataStream is a blocking API and will automatically call a collect() for you. The collected Polars DataFrame at the end is just a column of filenames produced.","title":"Lesson 2: Writing Things"},{"location":"simple/#lesson-3-things-you-cant-do","text":"Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped. Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation. You cannot safely emit the result of a sum of a column of a table until you have seen every row! The main examples of this in data processing are groupby-aggregations and distributed sorts. Currently, calling groupby().agg() or just agg() on a DataStream will produce another DataStream. However that DataStream will consist of exactly one batch, which holds the final result, emitted when it's computed. It is recommended to just call collect() or compute() on that result. Quokka currently does not support distributed sort -- indeed a sort heavy workload is really great for Spark. Distributed sorting is not exactly needed for many analytical SQL workloads since you typically do the aggregation before the order by, which greatly reduce the number of rows you have to sort. You can then sort after you have done collect() . However for many other workloads distributed sorting is critical, and Quokka aims to support this as soon as possible. Things that Quokka can do and doesn't do yet: fine grained placement of UDFs or UDAFs on GPUs or CPUs, core-count-control, Docker support, reading JSON, etc. Most of these can be easily implemented (and some already are) in the graph level API, however it takes effort to figure out what's the best abstractions to expose in the DataStream API. If you want to make this list shorter, I welcome contributions: zihengw@stanford.edu.","title":"Lesson 3: Things you can't do."},{"location":"started/","text":"Getting Started Quokka in Three Cartoons The fundamental concept in Quokka is a stream of Polars DataFrames , which we call a DataStream . A Polars DataFrame is basically a Pandas DataFrame, except that it's backed by Apache Arrow and supports fast compute with Polars . Readers familiar with Spark RDDs can interpret a DataStream as an RDD where data partitions are materialized in sequence. In contrast to Spark, partitions can be consumed as soon as they are generated. This facilitates pipelining between multiple data processing stages and is the primary reason why Quokka is fast. The user defines input readers that generate a DataStream from a dataset. For example, Quokka's cloud CSV reader generates a DataStream from an S3 bucket of CSV files. 
The user can also define stateful operators that operate on one or more DataStreams to produce one more DataStream. Finally a DataStream could be written to an output sink , which could be a distributed in-memory dataset that can be converted to Pandas or stable storage on disk or S3. In this illustration, the bush produces a DataStream of leaves and the forest produces a DataStream of acorns. The brown quokka consumes those two streams and magically turn it into a stream of strawberries. The grey quokka takes in this stream of strawberries, slices them up and puts them in a salad bowl. Unfortunately, people like us can't slice strawberries for a living and have to process tables of numbers. Quokka exposes useful primitives that allow you to filter, aggregate and join DataStreams, similar to what you can do in Pandas or Spark. Please look at the tutorials to learn more. It would be a dismal world if there is only one quokka of each kind. Quokka supports parallelism for stateful operators with channels , which are parallel instantiations of a stateful operator to achieve data parallelism. Input sources can also have channels to parallelize the reading of a data source. For example, we can have two bushes and two forests, and four brown quokkas. While the user can manually specify the number of channels they want for operators, in most cases it's automagically decided for you based on what you are doing, similar to Spark. At its core, Quokka uses Ray actors. Each channel in an input source or stateful operator constitutes an actor that can be scheduled independently to a machine in a cluster. Actors on the same machine talk to each other through memory while actors on different machines communicate through the network. An example scheduling of our quokkas is shown below. The user also shouldn't have to worry about this scheduling in most cases if using the DataStream API. However I couldn't resist making this cartoon, and it might be cool to know how Quokka works under the hood. Installation If you plan on trying out Quokka for whatever reason, I'd love to hear from you. Please send an email to zihengw@stanford.edu or join the Discord . Quokka can be installed as a pip package: pip3 install pyquokka However it needs the latest version of Redis (at least 7.0) , which you can get by running the following: curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg echo \"deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main\" | sudo tee /etc/apt/sources.list.d/redis.list sudo apt-get update sudo apt-get install redis If you only plan on running Quokka locally, you are done. Here is a 10 min lesson on how it works. If you are planning on reading files from S3, you need to install the awscli and you have your credentials set up. If you plan on using Quokka for cloud by launching EC2 clusters, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under pyquokka.utils which allows you to manager clusters and connect to them. It assumes that awscli is configured locally and you have a keypair and a security group with the proper configurations. To set these things up, you can follow the AWS guide . More detailed instructions can be found in Setting Up Cloud Cluster . Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) 
Image credits: some icons taken from flaticon.com.","title":"Getting Started"},{"location":"started/#getting-started","text":"","title":"Getting Started"},{"location":"started/#quokka-in-three-cartoons","text":"The fundamental concept in Quokka is a stream of Polars DataFrames , which we call a DataStream . A Polars DataFrame is basically a Pandas DataFrame, except that it's backed by Apache Arrow and supports fast compute with Polars . Readers familiar with Spark RDDs can interpret a DataStream as an RDD where data partitions are materialized in sequence. In contrast to Spark, partitions can be consumed as soon as they are generated. This facilitates pipelining between multiple data processing stages and is the primary reason why Quokka is fast. The user defines input readers that generate a DataStream from a dataset. For example, Quokka's cloud CSV reader generates a DataStream from an S3 bucket of CSV files. The user can also define stateful operators that operate on one or more DataStreams to produce one more DataStream. Finally a DataStream could be written to an output sink , which could be a distributed in-memory dataset that can be converted to Pandas or stable storage on disk or S3. In this illustration, the bush produces a DataStream of leaves and the forest produces a DataStream of acorns. The brown quokka consumes those two streams and magically turn it into a stream of strawberries. The grey quokka takes in this stream of strawberries, slices them up and puts them in a salad bowl. Unfortunately, people like us can't slice strawberries for a living and have to process tables of numbers. Quokka exposes useful primitives that allow you to filter, aggregate and join DataStreams, similar to what you can do in Pandas or Spark. Please look at the tutorials to learn more. It would be a dismal world if there is only one quokka of each kind. Quokka supports parallelism for stateful operators with channels , which are parallel instantiations of a stateful operator to achieve data parallelism. Input sources can also have channels to parallelize the reading of a data source. For example, we can have two bushes and two forests, and four brown quokkas. While the user can manually specify the number of channels they want for operators, in most cases it's automagically decided for you based on what you are doing, similar to Spark. At its core, Quokka uses Ray actors. Each channel in an input source or stateful operator constitutes an actor that can be scheduled independently to a machine in a cluster. Actors on the same machine talk to each other through memory while actors on different machines communicate through the network. An example scheduling of our quokkas is shown below. The user also shouldn't have to worry about this scheduling in most cases if using the DataStream API. However I couldn't resist making this cartoon, and it might be cool to know how Quokka works under the hood.","title":"Quokka in Three Cartoons"},{"location":"started/#installation","text":"If you plan on trying out Quokka for whatever reason, I'd love to hear from you. Please send an email to zihengw@stanford.edu or join the Discord . 
Quokka can be installed as a pip package: pip3 install pyquokka However it needs the latest version of Redis (at least 7.0) , which you can get by running the following: curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg echo \"deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main\" | sudo tee /etc/apt/sources.list.d/redis.list sudo apt-get update sudo apt-get install redis If you only plan on running Quokka locally, you are done. Here is a 10 min lesson on how it works. If you are planning on reading files from S3, you need to install the awscli and you have your credentials set up. If you plan on using Quokka for cloud by launching EC2 clusters, there's a bit more setup that needs to be done. Currently Quokka only provides support for AWS. Quokka provides a utility library under pyquokka.utils which allows you to manager clusters and connect to them. It assumes that awscli is configured locally and you have a keypair and a security group with the proper configurations. To set these things up, you can follow the AWS guide . More detailed instructions can be found in Setting Up Cloud Cluster . Quokka also plans to extend support to Docker/Kubernetes based deployments based on KubeRay. (Contributions welcome!) Image credits: some icons taken from flaticon.com.","title":"Installation"},{"location":"tutorial/","text":"Advanced Tutorials This section is for learning how to use Quokka's graph level API. This is expected for use cases where the dataframe API cannot satisfy your needs. Most users are not expected to program at this level. You should contact me: zihengw@stanford.edu if you want to do this. You should probably stop reading now, unless you are a Stanford undergrad or masters student (or somebody else) who somehow decided to work with me on Quokka. The code for the tutorials can be found under apps/tutorials . They might perform meaningless tasks or perform tasks which you shoudn't necessarily use Quokka for, but they will showcase how Quokka works. I wrote Quokka. As a result I might take some things for granted that you might not. If you spot a typo or find some sections too difficult to understand, I would appreciate your feedback! Better yet, the docs are also open source under quokka/docs, so you can also make a PR. Lesson 0: Addition Let's walk through our first Quokka program. This first example defines an input reader which produces a stream of numbers, and a stateful operator which adds them up. Please read the comments in the code. Let's first look at the import section. # we need to import Quokka specific objects. A TaskGraph is always needed in a program # that uses the DAG runtime API. We will define a TaskGraph by defining input readers # and stateful operators and adding them to the TaskGraph. Then we will execute the TaskGraph. from pyquokka.quokka_runtime import TaskGraph # Quokka also needs a notion of the compute substrate the TaskGraph is executing on. # LocalCluster is meant for single-machine execution. For distributed execution, # you would need to import QuokkaClusterManager and create a new cluster or initialize # one from a json config. from pyquokka.utils import LocalCluster # Executor is an abstract class which you should extend to implement your own executors. # Quokka also provides canned executors which you call import from pyquokka.executors such # as joins, sort and asof_join. 
from pyquokka.executors import Executor import time # define a LocalCluster execution context. This will make a cluster object with information # such as local core count etc. cluster = LocalCluster() Quokka provides many optimized input readers for different input data formats. However, in this tutorial we are going to define a custom input reader class to showcase how the input reader works. The mindset here is that there will be many channels of this input reader (by default equal to the number of cores in the cluster), and each channel will have its own copy of an object of this class. They will all be initialized in the same way, but when each channel calls the get_next_batch method of its own object, the channel argument supplied will be different. class SimpleDataset: # the object will be initialized once locally. You can define whatever attributes you want. # You can also set attributes to None if they will be supplied later by the framework # in set_num_channels method def __init__(self, limit) -> None: self.limit = limit self.num_channels = None # this is an optional method that will be called by the runtime on this object during # TaskGraph construction, if the method exists. This mainly updates the num_channel # attribute of the object. For some input readers what a channel produces is independent # of the total number of channels, and they don't have to implement this method. Other # input readers might need to perform additional computation upon learning the total # number of channels, such as byte ranges to read in a CSV file. # # This method can be used to set additional class attributes. The programmer could # do that in the __init__ method too, if she knows the total number of channels # and does not want to rely on Quokka's default behavior etc. def set_num_channels(self, num_channels): self.num_channels = num_channels # the get_next_batch method defines an iterator. Each channel will iterate through # its own copy of the object's get_next_batch method, with the channel argument # set to its own channel id. In this example, if there are N channels, channel k # will yield numbers k, k + N, k + 2N, all the way up to the limit. # Note that the get_next_batch method takes an optional parameter pos, and yields # two objects, with the first being None here. Let's not worry about these things # for the time being. They are used for Quokka's parallelized fault recovery. def get_next_batch(self, channel, pos=None): assert self.num_channels is not None curr_number = channel while curr_number < self.limit: yield None, curr_number curr_number += self.num_channels Now that we defined the input reader, we are going to define the stateful operator. Similar to the input reader, we define a Python class. All channels of the stateful operator will have a copy of an object of this class. The stateful operator exposes two important methods, execute and done , which might produce outputs for more downstream stateful operators. execute is called whenever upstream input reader channels have produced some input batches for the stateful operator channel to process. done is called when the stateful operator channel knows it will no longer receive any more inputs and has already processed all the inputs it has. Our stateful operator here adds up all the elements in an input stream and returns the sum. class AddExecutor(Executor): # initialize state. This will be done locally. This initial state will be copied # along with the object to all the channels. 
def __init__(self) -> None: self.sum = 0 # the execute method takes three arguments. The first argument batches, is a list of # batches from an input QStream, which could be the output of an input reader or another # stateful operator. The items in the batch could have come from one channel, several, # or all of them! it is best practice that the stateful operator doesn't make # any assumptions on where these batches originated, except that they belong # to the same QStream. # the second argument, stream_id, is used to identify the QStream the batches came from. # in this example we only have one input QStream so we can ignore this argument. # the third argument, channel, denotes the channel id of the channel executing the object # similar to the argument for the input reader. Here we also don't use this argument. def execute(self,batches,stream_id, channel): for batch in batches: assert type(batch) == int self.sum += batch # note that we can't return anything in our execute method. We don't know what the sum is # until we have seen all of the elements in the input QStream. # done only has one argument, which is the channel. It can return an element or an iterator # of elements. def done(self,channel): print(\"I am executor \", channel, \" my sum is \", self.sum) return self.sum Now that we have defined our input reader and stateful operator, we can hook them up together in a TaskGraph. Defining the TaskGraph requires a cluster object, which is LocalCluster here but can be an S3Cluster or AzureCluster for cloud deployments. We will then initialize the objects for the input reader and stateful operators. Again, we initialize one object, which will be copied to each channel. We can now add the input reader and stateful operator to our TaskGraph. task_graph = TaskGraph(cluster) reader = SimpleDataset(80) # define a new input reader in our TaskGraph. numbers is a QStream. numbers = task_graph.new_input_reader_node(reader) executor = AddExecutor() # define a new blocking node. A blocking node writes out its results in a materialized Dataset # object instead of producing a QStream. Note the first argument is a dictionary. This assigns # each input stream an internal name, which corresponds to the stream_id field in the execute # method. Since we called the numbers QStream 0, when execute is called on batches from this QStream, # the stream_id argument will be 0. sum = task_graph.new_blocking_node({0:numbers},executor) # create() must be called before run() task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) # we can call to_list() on a Dataset object to collect its elements, which will simply be all # the objects returned by the blocking node's execute and done methods. print(sum.to_list()) Here we used new_blocking_node to define the stateful operator in the TaskGraph. The TaskGraph exposes two different APIs: new_nonblocking_node and new_blocking node . The former will put their outputs in a QStream, which could be consumed by downstream operators immediately, while the latter will materialize the outputs into a Dataset object. Downstream operators cannot read a Dataset until it's complete. This is intimately related to the idea of nonblocking vs blocking stateful operators. Some operators such as streaming join can emit valid outputs as soon as they have seen partial inputs, while other operators like aggregation must wait until seeing all of the input before emitting any partial output. 
However, you could define a nonblocking operator as a new_blocking_node , if you want to materialize its outputs instead of streaming them forward, e.g. to limit pipeline depth. You could also define a blocking operator as a new_nonblocking_node , the QStream will just consist of the elements returned during the done method (which could return an iterator). The TaskGraph also exposes an API to define stateless operators: new_task . This defines a stateless transform on a QStream and is very similar to Spark's map . We will cover this in a later tutorial to showcase deep learning inference. Note that we covered most of the important concepts covered in the getting started cartoons. However the astute reader would notice that we didn't define a partition function here, nor did we specify how many channels of the input reader or the stateful operator to launch. The answer is that Quokka tries to provide suitable defaults for these things. Quokka currently launches one channel per core for input readers, and one channel per machine for stateful operators. These defaults are subject to change and you shouldn't rely on them. Quokka's default partition function is to send all the outputs generated by a channel to the channel of the target on the same machine. Lesson 1: Joins If you think the first lesson was too complicated, it proably was. This is because we had to define custom input readers and stateful operators. Hopefully in the process you learned a few things about how Quokka works. In most scenarios, it is my hope that you don't have to define custom objects, and use canned implementations which you can just import. This is similar to how Tensorflow or Pytorch works. If you know how to import torch.nn.Conv2d , you get the idea. Here, we are going to take two CSVs on Disk, join them, and count the number of records in the result: select count(*) from a and b where a.key = b.key . You can use the a.csv and b.csv provided in the apps/tutorials folder, or you can supply your own and change the CSV input reader arguments appropriately. Without further ado, here's the code with comments: import time from pyquokka.quokka_runtime import TaskGraph from pyquokka.executors import PolarJoinExecutor, CountExecutor from pyquokka.dataset import InputDiskCSVDataset from pyquokka.utils import LocalCluster cluster = LocalCluster() task_graph = TaskGraph(cluster) # the arguments are: filename, column names, how many bytes to read in a batch a_reader = InputDiskCSVDataset(\"a.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) b_reader = InputDiskCSVDataset(\"b.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) a = task_graph.new_input_reader_node(a_reader) b = task_graph.new_input_reader_node(b_reader) # define a streaming join operator using the Polars library for internal join implementation. join_executor = PolarJoinExecutor(on=\"key\") # the default partition strategy will not work for join! We need to specify # an alternative partition function. Quokka has the notion of \"keyed\" QStreams, # which are QStreams where the batch elements are Pandas or Polars DataFrames # or Pyarrow tables. In this case, we can provide a column name as partition key. 
joined = task_graph.new_non_blocking_node({0:a,1:b},join_executor,partition_key_supplied={0:\"key\", 1:\"key\"}) count_executor = CountExecutor() count = task_graph.new_blocking_node({0:joined},count_executor) task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) print(count.to_list()) Note here we defined a new_non_blocking_node for the join operator and a new_blocking_node for the count operator. This means that Quokka will execute the join in a pipelined parallel fashion with the count. As a result, the input reader, join and count actors are all executing concurrently in the system. The count operator will return the count as a single number, which will be stored in a Dataset object. A note about benchmarking Quokka programs: Quokka programs do a bit of processing locally. For example, when an input reader is added to the TaskGraph with an InputDiskCSVDataset object, Quokka performs set_num_channels on the object, and computes byte offsets for each channel to start reading from. This could be expensive for large CSV files, especially if we are using blob storage input sources. In practice this completes in a few seconds for datasets TBs in size. This is quite similar to what Spark's dataframe API does. The TaskGraph also needs to be initialized by calling task_graph.create() . This actually spawns the Ray actors executing the channels, and could take a while when you have a lot of channels. However, the time taken by both the input reader initialization and the TaskGraph initialization does not strongly scale with the input data size, unlike the actual execution time of the TaskGraph! As a result, while on trivial input sizes one might find the initialization times to be longer than the actual execution time, on real programs it is best practice to just time the task_graph.run() call. This example showed how to execute a simple SQL query by describing its physical plan. You can execute much more complex SQL queries with Quokka (check out the TPC-H implementations under quokka/apps). Quokka can currently typically achieve around 3x speedup compared to SparkSQL (EMR 6.5.0). If you have an expensive query you have to run periodically and would like to try writing out its physical plan in the Quokka API, give it a shot! Again, contact me at zihengw@stanford.edu if you run into any problems. We are working very hard to add a dataframe and SQL API to Quokka, targeting release Sep/Oct 2022. Stay tuned for more information.","title":"TaskGraph API"},{"location":"tutorial/#advanced-tutorials","text":"This section is for learning how to use Quokka's graph-level API. This is expected for use cases where the dataframe API cannot satisfy your needs. Most users are not expected to program at this level. You should contact me: zihengw@stanford.edu if you want to do this. You should probably stop reading now, unless you are a Stanford undergrad or master's student (or somebody else) who somehow decided to work with me on Quokka. The code for the tutorials can be found under apps/tutorials . They might perform meaningless tasks or perform tasks which you shouldn't necessarily use Quokka for, but they will showcase how Quokka works. I wrote Quokka. As a result I might take some things for granted that you might not. If you spot a typo or find some sections too difficult to understand, I would appreciate your feedback!
Better yet, the docs are also open source under quokka/docs, so you can make a PR.","title":"Advanced Tutorials"},{"location":"tutorial/#lesson-0-addition","text":"Let's walk through our first Quokka program. This first example defines an input reader which produces a stream of numbers, and a stateful operator which adds them up. Please read the comments in the code. Let's first look at the import section. # we need to import Quokka-specific objects. A TaskGraph is always needed in a program # that uses the DAG runtime API. We will define a TaskGraph by defining input readers # and stateful operators and adding them to the TaskGraph. Then we will execute the TaskGraph. from pyquokka.quokka_runtime import TaskGraph # Quokka also needs a notion of the compute substrate the TaskGraph is executing on. # LocalCluster is meant for single-machine execution. For distributed execution, # you would need to import QuokkaClusterManager and create a new cluster or initialize # one from a JSON config. from pyquokka.utils import LocalCluster # Executor is an abstract class which you should extend to implement your own executors. # Quokka also provides canned executors which you can import from pyquokka.executors, such # as joins, sort and asof_join. from pyquokka.executors import Executor import time # define a LocalCluster execution context. This will make a cluster object with information # such as local core count etc. cluster = LocalCluster() Quokka provides many optimized input readers for different input data formats. However, in this tutorial we are going to define a custom input reader class to showcase how the input reader works. The mindset here is that there will be many channels of this input reader (by default equal to the number of cores in the cluster), and each channel will have its own copy of an object of this class. They will all be initialized in the same way, but when each channel calls the get_next_batch method of its own object, the channel argument supplied will be different. class SimpleDataset: # the object will be initialized once locally. You can define whatever attributes you want. # You can also set attributes to None if they will be supplied later by the framework # in the set_num_channels method def __init__(self, limit) -> None: self.limit = limit self.num_channels = None # this is an optional method that will be called by the runtime on this object during # TaskGraph construction, if the method exists. This mainly updates the num_channels # attribute of the object. For some input readers what a channel produces is independent # of the total number of channels, and they don't have to implement this method. Other # input readers might need to perform additional computation upon learning the total # number of channels, such as byte ranges to read in a CSV file. # # This method can be used to set additional class attributes. The programmer could # do that in the __init__ method too, if she knows the total number of channels # and does not want to rely on Quokka's default behavior etc. def set_num_channels(self, num_channels): self.num_channels = num_channels # the get_next_batch method defines an iterator. Each channel will iterate through # its own copy of the object's get_next_batch method, with the channel argument # set to its own channel id. In this example, if there are N channels, channel k # will yield numbers k, k + N, k + 2N, all the way up to the limit.
# Note that the get_next_batch method takes an optional parameter pos, and yields # two objects, with the first being None here. Let's not worry about these things # for the time being. They are used for Quokka's parallelized fault recovery. def get_next_batch(self, channel, pos=None): assert self.num_channels is not None curr_number = channel while curr_number < self.limit: yield None, curr_number curr_number += self.num_channels Now that we defined the input reader, we are going to define the stateful operator. Similar to the input reader, we define a Python class. All channels of the stateful operator will have a copy of an object of this class. The stateful operator exposes two important methods, execute and done , which might produce outputs for more downstream stateful operators. execute is called whenever upstream input reader channels have produced some input batches for the stateful operator channel to process. done is called when the stateful operator channel knows it will no longer receive any more inputs and has already processed all the inputs it has. Our stateful operator here adds up all the elements in an input stream and returns the sum. class AddExecutor(Executor): # initialize state. This will be done locally. This initial state will be copied # along with the object to all the channels. def __init__(self) -> None: self.sum = 0 # the execute method takes three arguments. The first argument batches, is a list of # batches from an input QStream, which could be the output of an input reader or another # stateful operator. The items in the batch could have come from one channel, several, # or all of them! it is best practice that the stateful operator doesn't make # any assumptions on where these batches originated, except that they belong # to the same QStream. # the second argument, stream_id, is used to identify the QStream the batches came from. # in this example we only have one input QStream so we can ignore this argument. # the third argument, channel, denotes the channel id of the channel executing the object # similar to the argument for the input reader. Here we also don't use this argument. def execute(self,batches,stream_id, channel): for batch in batches: assert type(batch) == int self.sum += batch # note that we can't return anything in our execute method. We don't know what the sum is # until we have seen all of the elements in the input QStream. # done only has one argument, which is the channel. It can return an element or an iterator # of elements. def done(self,channel): print(\"I am executor \", channel, \" my sum is \", self.sum) return self.sum Now that we have defined our input reader and stateful operator, we can hook them up together in a TaskGraph. Defining the TaskGraph requires a cluster object, which is LocalCluster here but can be an S3Cluster or AzureCluster for cloud deployments. We will then initialize the objects for the input reader and stateful operators. Again, we initialize one object, which will be copied to each channel. We can now add the input reader and stateful operator to our TaskGraph. task_graph = TaskGraph(cluster) reader = SimpleDataset(80) # define a new input reader in our TaskGraph. numbers is a QStream. numbers = task_graph.new_input_reader_node(reader) executor = AddExecutor() # define a new blocking node. A blocking node writes out its results in a materialized Dataset # object instead of producing a QStream. Note the first argument is a dictionary. 
This assigns # each input stream an internal name, which corresponds to the stream_id field in the execute # method. Since we called the numbers QStream 0, when execute is called on batches from this QStream, # the stream_id argument will be 0. sum = task_graph.new_blocking_node({0:numbers},executor) # create() must be called before run() task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) # we can call to_list() on a Dataset object to collect its elements, which will simply be all # the objects returned by the blocking node's execute and done methods. print(sum.to_list()) Here we used new_blocking_node to define the stateful operator in the TaskGraph. The TaskGraph exposes two different APIs: new_nonblocking_node and new_blocking_node . The former will put its outputs in a QStream, which could be consumed by downstream operators immediately, while the latter will materialize the outputs into a Dataset object. Downstream operators cannot read a Dataset until it's complete. This is intimately related to the idea of nonblocking vs blocking stateful operators. Some operators such as streaming join can emit valid outputs as soon as they have seen partial inputs, while other operators like aggregation must wait until seeing all of the input before emitting any partial output. However, you could define a nonblocking operator as a new_blocking_node , if you want to materialize its outputs instead of streaming them forward, e.g. to limit pipeline depth. You could also define a blocking operator as a new_nonblocking_node ; the QStream will just consist of the elements returned during the done method (which could return an iterator). The TaskGraph also exposes an API to define stateless operators: new_task . This defines a stateless transform on a QStream and is very similar to Spark's map . We will cover this in a later tutorial to showcase deep learning inference. Note that we have covered most of the important concepts from the getting started cartoons. However, the astute reader would notice that we didn't define a partition function here, nor did we specify how many channels of the input reader or the stateful operator to launch. The answer is that Quokka tries to provide suitable defaults for these things. Quokka currently launches one channel per core for input readers, and one channel per machine for stateful operators. These defaults are subject to change and you shouldn't rely on them. Quokka's default partition function is to send all the outputs generated by a channel to the channel of the target on the same machine.","title":"Lesson 0: Addition"},{"location":"tutorial/#lesson-1-joins","text":"If you think the first lesson was too complicated, it probably was. This is because we had to define custom input readers and stateful operators. Hopefully in the process you learned a few things about how Quokka works. In most scenarios, it is my hope that you don't have to define custom objects and can instead use canned implementations which you can just import. This is similar to how Tensorflow or Pytorch works. If you know how to import torch.nn.Conv2d , you get the idea. Here, we are going to take two CSVs on disk, join them, and count the number of records in the result: select count(*) from a, b where a.key = b.key . You can use the a.csv and b.csv provided in the apps/tutorials folder, or you can supply your own and change the CSV input reader arguments appropriately.
Without further ado, here's the code with comments: import time from pyquokka.quokka_runtime import TaskGraph from pyquokka.executors import PolarJoinExecutor, CountExecutor from pyquokka.dataset import InputDiskCSVDataset from pyquokka.utils import LocalCluster cluster = LocalCluster() task_graph = TaskGraph(cluster) # the arguments are: filename, column names, how many bytes to read in a batch a_reader = InputDiskCSVDataset(\"a.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) b_reader = InputDiskCSVDataset(\"b.csv\", [\"key\",\"val1\",\"val2\"] , stride = 1024) a = task_graph.new_input_reader_node(a_reader) b = task_graph.new_input_reader_node(b_reader) # define a streaming join operator using the Polars library for internal join implementation. join_executor = PolarJoinExecutor(on=\"key\") # the default partition strategy will not work for join! We need to specify # an alternative partition function. Quokka has the notion of \"keyed\" QStreams, # which are QStreams where the batch elements are Pandas or Polars DataFrames # or Pyarrow tables. In this case, we can provide a column name as partition key. joined = task_graph.new_non_blocking_node({0:a,1:b},join_executor,partition_key_supplied={0:\"key\", 1:\"key\"}) count_executor = CountExecutor() count = task_graph.new_blocking_node({0:joined},count_executor) task_graph.create() start = time.time() task_graph.run() print(\"total time \", time.time() - start) print(count.to_list()) Note here we defined a new_nonblocking_node for the join operator and a new_blocking_node for the count operator. This means that Quokka will execute the join in a pipelined parallel fashion with the count. As a result, the input reader, join and count actors are all executing concurrently in the system. The count operator will return the count as a single number, which will be stored in a Dataset object. About benchmarking Quokka programs. Quokka programs do a bit of processing locally. For example, when an input reader is added to the TaskGraph with an InputDiskCSVDataset object, Quokka performs set_num_channels on the object, and computes byte offsets for each channel to start reading from. This could be expensive for large CSV files, especially if we are using blob storage input sources. In practice this completes in a few seconds for datasets TBs in size. This is quite similar to what Spark's dataframe API does. The TaskGraph also needs to be initialized by calling task_graph.create() . This actually spawns the Ray actors executing the channels, and could take a while when you have a lot of channels. However, the time of both the input reader initialization and the TaskGraph initialization do not strongly scale with the input data size, unlike the actual execution time of the TaskGraph! As a result, while on trivial input sizes one might find the initialization times to be longer than the actual execution time, on real programs it is best practice to just time the task_graph.run() call. This example showed how to execute a simple SQL query by describing its physical plan. You can execute much more complex SQL queries with Quokka (check out the TPC-H implementations under quokka/apps). Quokka can currently achieve around a 3x speedup compared to SparkSQL (EMR 6.5.0). If you have an expensive query you have to periodically run and would like to try writing out its physical plan in the Quokka API, give it a shot! Again, contact me at zihengw@stanford.edu if you run into any problems.
We are working very hard to add a dataframe and SQL API to Quokka, targeting release Sep/Oct 2022. Stay tuned for more information.","title":"Lesson 1: Joins"}]} \ No newline at end of file diff --git a/docs/site/simple/index.html index 3cdacc4..e2c3c1f 100644 --- a/docs/site/simple/index.html +++ b/docs/site/simple/index.html @@ -105,7 +105,7 @@

Tutorials

This section is for learning how to use Quokka's DataStream API. Quokka's DataStream API is basically a dataframe API. It takes heavy inspiration from SparkSQL and Polars, and adopts a lazy execution model. This means that in contrast to Pandas, your operations are not executed immediately after you define them. Instead, Quokka builds a logical plan under the hood and executes it only when the user wants to "collect" the result, just like Spark.

-

For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here. It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. The SQL queries themselves can be found on this awesome interface.

+

For the first part of our tutorial, we are going to go through implementing a few SQL queries in the TPC-H benchmark suite. You can download the data here. It is about 1GB unzipped. Please download the data (should take 2 minutes) and extract it to some directory locally. If you are testing this on a VM where clicking the link can't work, try this command after pip installing gdown: ~/.local/bin/gdown https://drive.google.com/uc?id=19hgYxZ4u28Cxe0s616Q3yAfkuRdQlmvO. The SQL queries themselves can be found on this awesome interface.

These tutorials will use your local machine. They shouldn't take too long to run. It would be great if you can follow along, not just for fun -- if you find a bug in this tutorial I will buy you a cup of coffee!

For an extensive API reference, please see here.

Lesson -1: Things

@@ -158,13 +158,10 @@

Lesson 1: Doing Things

Feel free to type other random things and see if it's supported, but for those interested, let's follow a structured curriculum. Let's take a look at TPC-H query 1.

This is how you would write it in Quokka. This is very similar to how you'd write it in another DataFrame library like Polars or Dask.

def do_1():
-
     d = lineitem.filter("l_shipdate <= date '1998-12-01' - interval '90' day")
     d = d.with_column("disc_price", lambda x: x["l_extendedprice"] * (1 - x["l_discount"]), required_columns ={"l_extendedprice", "l_discount"})
     d = d.with_column("charge", lambda x: x["l_extendedprice"] * (1 - x["l_discount"]) * (1 + x["l_tax"]), required_columns={"l_extendedprice", "l_discount", "l_tax"})
-
     f = d.groupby(["l_returnflag", "l_linestatus"], orderby=["l_returnflag","l_linestatus"]).agg({"l_quantity":["sum","avg"], "l_extendedprice":["sum","avg"], "disc_price":"sum", "charge":"sum", "l_discount":"avg","*":"count"})
-
     return f.collect()
 

Quokka supports filtering DataStreams by DataStream.filter(). Filters can be specified in SQL syntax. The columns in the SQL expression must exist in the schema of the DataStream. A more Pythonic way of doing this like b = b[b.a < 5] isn't supported yet, mainly due to the finickiness surrounding date types etc. The result of a filter() is another DataStream whose Polars DataFrames will only contain rows that respect the predicate.
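As a quick sketch of the two styles (reusing the lineitem DataStream set up earlier; the column and constant are just illustrative), only the SQL-string form currently works:

d = lineitem.filter("l_quantity < 24")      # SQL syntax, supported
# d = lineitem[lineitem.l_quantity < 24]    # Pythonic filtering, not supported yet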

@@ -181,15 +178,11 @@

Lesson 1: Doing Things

When you call .collect(), the logical plan you have built is actually optimized and executed. This is exactly how Spark works. To view the optimized logical plan and learn more about what Quokka is doing, you can do f.explain() which will produce a graph, or f.explain(mode="text") which will produce a textual explanation.
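For example, a minimal sketch of inspecting a plan before running it (f here is any DataStream you have built up, as in the queries above):

f.explain()              # renders the optimized logical plan as a graph
f.explain(mode="text")   # prints a textual explanation instead
result = f.collect()     # optimization and execution actually happen here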

Joins work very intuitively. For example, this is how to do TPC-H query 12.

def do_12():
-
     d = lineitem.join(orders,left_on="l_orderkey", right_on="o_orderkey")
-
     d = d.filter("l_shipmode IN ('MAIL','SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and \
         l_receiptdate >= date '1994-01-01' and l_receiptdate < date '1995-01-01'")
-
     d = d.with_column("high", lambda x: (x["o_orderpriority"] == "1-URGENT") | (x["o_orderpriority"] == "2-HIGH"), required_columns={"o_orderpriority"})
     d = d.with_column("low", lambda x: (x["o_orderpriority"] != "1-URGENT") & (x["o_orderpriority"] != "2-HIGH"), required_columns={"o_orderpriority"})
-
     f = d.groupby("l_shipmode").aggregate(aggregations={'high':['sum'], 'low':['sum']})
     return f.collect()
 
@@ -199,7 +192,6 @@

Lesson 1: Doing Things

    d = customer.join(d,left_on="c_custkey", right_on="o_custkey")
    d = d.filter("c_mktsegment = 'BUILDING' and o_orderdate < date '1995-03-15' and l_shipdate > date '1995-03-15'")
    d = d.with_column("revenue", lambda x: x["l_extendedprice"] * ( 1 - x["l_discount"]) , required_columns={"l_extendedprice", "l_discount"})
-
    f = d.groupby(["l_orderkey","o_orderdate","o_shippriority"]).agg({"revenue":["sum"]})
    return f.collect()
@@ -207,10 +199,23 @@

Lesson 1: Doing Things

An important thing to note is that Quokka currently only supports inner joins. Other kinds of joins are coming soon.

Feel free to look at some other queries in the Quokka github, or browse the API reference. While you are there, please give Quokka a star!

Lesson 2: Writing Things

-

So far, we have just learned about

+

So far, we have just learned about how to read things into DataStreams and do things to DataStreams. You can also write out DataStreams to persistent storage like disk or S3 to record all the amazing things we did with them.

+

Quokka currently operates like Spark and by default writes a directory of files, with a default maximum file size for different file formats. This makes it easy to perform parallel writing.

+

To write out a DataStream to CSV or Parquet to a local directory (you must specify a valid absolute path), simply do:

+
d.write_csv("/home/ubuntu/test-path/")
+d.write_parquet("/home/ubuntu/test-path/")
+
+

To write out a DataStream to S3, you should specify an S3 bucket and prefix like this:

+
d.write_csv("s3://bucket/prefix/")
+d.write_parquet("s3://bucket/prefix/")
+
+

Writing out a DataStream is a blocking API and will automatically call a collect() for you. The collected Polars DataFrame at the end is just a column of filenames produced.
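For example, a minimal sketch of capturing that result (assuming the write call returns the collected DataFrame described above; the exact column name is not specified here):

written = d.write_parquet("s3://bucket/prefix/")   # blocking: runs the whole pipeline
print(written)                                     # a Polars DataFrame listing the files that were written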

Lesson 3: Things you can't do.

Here is a brief discussion of what Quokka is not great for. Quokka's main advantage stems from the fact it can pipeline the execution of DataStreams. Once a partition (typically a Polars DataFrame) in a DataStream has been generated, it can be immediately consumed by a downstream user. This means downstream processing of this partition and upstream generation of the next partition can be overlapped.

-

Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation.

+

Now, if an operator processing a DataStream cannot emit any partitions downstream until it has seen all of the partitions in its input DataStreams, the pipeline breaks. An example of this is an aggregation. You cannot safely emit the result of a sum of a column of a table until you have seen every row! The main examples of this in data processing are groupby-aggregations and distributed sorts.

+

Currently, calling groupby().agg() or just agg() on a DataStream will produce another DataStream. However, that DataStream will consist of exactly one batch, which holds the final result, emitted when it's computed. It is recommended to just call collect() or compute() on that result.
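A minimal sketch of the recommended pattern (the column names are just the TPC-H lineitem columns used earlier):

f = d.groupby("l_shipmode").agg({"l_extendedprice": ["sum"]})   # f is a DataStream with exactly one batch
result = f.collect()                                            # so just collect (or compute) it right away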

+

Quokka currently does not support distributed sort -- indeed a sort-heavy workload is really great for Spark. Distributed sorting is not exactly needed for many analytical SQL workloads, since you typically do the aggregation before the order by, which greatly reduces the number of rows you have to sort. You can then sort after you have done collect(). However, for many other workloads distributed sorting is critical, and Quokka aims to support this as soon as possible.
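A sketch of that workflow, assuming the aggregated result is small enough to sort locally as a Polars DataFrame after collect() (the "revenue" column name is just illustrative):

result = f.collect()              # aggregation has already shrunk the data
result = result.sort("revenue")   # sort the collected Polars DataFrame locally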

+

Things that Quokka could do but doesn't do yet: fine-grained placement of UDFs or UDAFs on GPUs or CPUs, core-count control, Docker support, reading JSON, etc. Most of these can be easily implemented (and some already are) in the graph-level API; however, it takes effort to figure out the best abstractions to expose in the DataStream API. If you want to make this list shorter, I welcome contributions: zihengw@stanford.edu.