diff --git a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
index 89ce988fd3..f3e9bb70d9 100644
--- a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
+++ b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
@@ -66,6 +66,17 @@ export default function QueryControlDropdown(props) {
Download as Excel File
+
+
+ Download as Parquet File
+
+
);
diff --git a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
index 9a021cc8bd..0807c4ef4a 100644
--- a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
+++ b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
@@ -58,6 +58,15 @@ function visualizationWidgetMenuOptions({ widget, canEditDashboard, onParameters
"Download as Excel File"
)}
,
+
+ {!isQueryResultEmpty ? (
+
+ Download as Parquet File
+
+ ) : (
+ "Download as Parquet File"
+ )}
+ ,
(canViewQuery || canEditParameters) && ,
canViewQuery && (
diff --git a/client/app/pages/queries/VisualizationEmbed.jsx b/client/app/pages/queries/VisualizationEmbed.jsx
index a4bcaf3177..a446f162f6 100644
--- a/client/app/pages/queries/VisualizationEmbed.jsx
+++ b/client/app/pages/queries/VisualizationEmbed.jsx
@@ -100,6 +100,17 @@ function VisualizationEmbedFooter({
Download as Excel File
+
+
+ Download as Parquet File
+
+
);
diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py
index cb8b8d14f2..9b426c8065 100644
--- a/redash/handlers/query_results.py
+++ b/redash/handlers/query_results.py
@@ -1,40 +1,42 @@
import logging
import time
-
import unicodedata
+
from flask import make_response, request
from flask_login import current_user
from flask_restful import abort
from werkzeug.urls import url_quote
+
from redash import models, settings
from redash.handlers.base import BaseResource, get_object_or_404, record_event
+from redash.models.parameterized_query import (
+ InvalidParameterError,
+ ParameterizedQuery,
+ QueryDetachedFromDataSourceError,
+ dropdown_values,
+)
from redash.permissions import (
has_access,
not_view_only,
require_access,
- require_permission,
require_any_of_permission,
+ require_permission,
view_only,
)
+from redash.serializers import (
+ serialize_job,
+ serialize_query_result,
+ serialize_query_result_to_dsv,
+ serialize_query_result_to_parquet,
+ serialize_query_result_to_xlsx,
+)
from redash.tasks import Job
from redash.tasks.queries import enqueue_query
from redash.utils import (
collect_parameters_from_request,
json_dumps,
- utcnow,
to_filename,
-)
-from redash.models.parameterized_query import (
- ParameterizedQuery,
- InvalidParameterError,
- QueryDetachedFromDataSourceError,
- dropdown_values,
-)
-from redash.serializers import (
- serialize_query_result,
- serialize_query_result_to_dsv,
- serialize_query_result_to_xlsx,
- serialize_job,
+ utcnow,
)
@@ -119,9 +121,11 @@ def run_query(
current_user.id,
current_user.is_api_user(),
metadata={
- "Username": repr(current_user)
- if current_user.is_api_user()
- else current_user.email,
+ "Username": (
+ repr(current_user)
+ if current_user.is_api_user()
+ else current_user.email
+ ),
"query_id": query_id,
},
)
@@ -260,14 +264,14 @@ def options(self, query_id=None, query_result_id=None, filetype="json"):
self.add_cors_headers(headers)
if settings.ACCESS_CONTROL_REQUEST_METHOD:
- headers[
- "Access-Control-Request-Method"
- ] = settings.ACCESS_CONTROL_REQUEST_METHOD
+ headers["Access-Control-Request-Method"] = (
+ settings.ACCESS_CONTROL_REQUEST_METHOD
+ )
if settings.ACCESS_CONTROL_ALLOW_HEADERS:
- headers[
- "Access-Control-Allow-Headers"
- ] = settings.ACCESS_CONTROL_ALLOW_HEADERS
+ headers["Access-Control-Allow-Headers"] = (
+ settings.ACCESS_CONTROL_ALLOW_HEADERS
+ )
return make_response("", 200, headers)
@@ -402,6 +406,7 @@ def get(self, query_id=None, query_result_id=None, filetype="json"):
"xlsx": self.make_excel_response,
"csv": self.make_csv_response,
"tsv": self.make_tsv_response,
+ "parquet": self.make_parquet_response,
}
response = response_builders[filetype](query_result)
@@ -450,6 +455,16 @@ def make_excel_response(query_result):
}
return make_response(serialize_query_result_to_xlsx(query_result), 200, headers)
+    @staticmethod
+    def make_parquet_response(query_result):
+        """Wrap a Parquet-serialized query result in a 200 response."""
+        # No Content-Type header is set: Parquet has no registered media
+        # type yet (https://issues.apache.org/jira/browse/PARQUET-1889).
+        headers = {}
+        body = serialize_query_result_to_parquet(query_result)
+        response = make_response(body, 200, headers)
+        return response
+
class JobResource(BaseResource):
def get(self, job_id, query_id=None):
diff --git a/redash/serializers/__init__.py b/redash/serializers/__init__.py
index 6105364c49..b2f1de5294 100644
--- a/redash/serializers/__init__.py
+++ b/redash/serializers/__init__.py
@@ -3,6 +3,7 @@
classes we have. This will ensure cleaner code and better
separation of concerns.
"""
+
from funcy import project
from flask_login import current_user
@@ -19,6 +20,7 @@
serialize_query_result,
serialize_query_result_to_dsv,
serialize_query_result_to_xlsx,
+ serialize_query_result_to_parquet,
)
@@ -55,7 +57,14 @@ def public_widget(widget):
def public_dashboard(dashboard):
dashboard_dict = project(
serialize_dashboard(dashboard, with_favorite_state=False),
- ("name", "layout", "dashboard_filters_enabled", "updated_at", "created_at", "options"),
+ (
+ "name",
+ "layout",
+ "dashboard_filters_enabled",
+ "updated_at",
+ "created_at",
+ "options",
+ ),
)
widget_list = (
diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index 9eab2a1a42..6e087e9d6f 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -1,11 +1,24 @@
-import io
import csv
+import io
+from typing import Optional
+
+import pyarrow
+import pyarrow.compute
+import pyarrow.parquet
import xlsxwriter
-from funcy import rpartial, project
from dateutil.parser import isoparse as parse_date
-from redash.utils import json_loads, UnicodeWriter
-from redash.query_runner import TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME
+from funcy import project, rpartial
+
from redash.authentication.org_resolving import current_org
+from redash.query_runner import (
+ TYPE_BOOLEAN,
+ TYPE_DATE,
+ TYPE_DATETIME,
+ TYPE_FLOAT,
+ TYPE_INTEGER,
+ TYPE_STRING,
+)
+from redash.utils import UnicodeWriter, json_loads
def _convert_format(fmt):
@@ -86,7 +99,9 @@ def serialize_query_result_to_dsv(query_result, delimiter):
fieldnames, special_columns = _get_column_lists(query_data["columns"] or [])
- writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter)
+ writer = csv.DictWriter(
+ s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter
+ )
writer.writeheader()
for row in query_data["rows"]:
@@ -121,3 +136,82 @@ def serialize_query_result_to_xlsx(query_result):
book.close()
return output.getvalue()
+
+
+def serialize_query_result_to_parquet(query_result):
+    """Serialize a Redash query result into Parquet file contents (bytes)."""
+    output = io.BytesIO()
+    query_data = query_result.data
+
+    def redash_datetime_to_pyarrow_timestamp(
+        table: "pyarrow.Table",
+        field: "pyarrow.Field",
+        conversion: Optional[dict] = None,
+    ) -> "pyarrow.Table":
+        # Date/datetime values arrive as strings; parse with the first format
+        # that matches. If none match, keep the strings for the final cast().
+        column_index: int = table.schema.get_field_index(field.name)
+        column_data = table.column(column_index)
+        for datetime_format in conversion["redash_formats"]:
+            try:
+                column_data = pyarrow.compute.strptime(
+                    column_data,
+                    format=datetime_format,
+                    unit="s",
+                )
+                break
+            except pyarrow.lib.ArrowInvalid:
+                continue
+        return table.set_column(column_index, field.name, column_data)
+
+    conversions = [
+        {"pyarrow_type": pyarrow.bool_(), "redash_type": TYPE_BOOLEAN},
+        {
+            "pyarrow_type": pyarrow.date32(),
+            "redash_type": TYPE_DATE,
+            "redash_formats": [r"%Y-%m-%d"],
+            "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
+        },
+        {
+            "pyarrow_type": pyarrow.timestamp("s"),
+            "redash_type": TYPE_DATETIME,
+            "redash_formats": [r"%Y-%m-%dT%H:%M:%S", r"%Y-%m-%d %H:%M:%S"],
+            "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
+        },
+        {"pyarrow_type": pyarrow.float64(), "redash_type": TYPE_FLOAT},
+        {"pyarrow_type": pyarrow.int64(), "redash_type": TYPE_INTEGER},
+        {"pyarrow_type": pyarrow.string(), "redash_type": TYPE_STRING},
+    ]
+
+    # Build the schema first so the declared column order is authoritative.
+    fields = []
+    converter_fields = []
+    for column in query_data["columns"]:
+        for conversion in conversions:
+            if column["type"] == conversion["redash_type"]:
+                field = pyarrow.field(
+                    name=column["name"],
+                    type=conversion["pyarrow_type"],
+                    metadata={"friendly_name": column["friendly_name"]},
+                )
+                fields.append(field)
+                if conversion.get("redash_to_pyarrow"):
+                    converter_fields.append((field, conversion))
+                break
+    target_schema = pyarrow.schema(fields)
+
+    if query_data["rows"]:
+        table = pyarrow.Table.from_pylist(query_data["rows"])
+        for field, conversion in converter_fields:
+            table = conversion["redash_to_pyarrow"](
+                table=table, field=field, conversion=conversion
+            )
+        # Align column order with the schema, then cast to the target types.
+        table = table.select([field.name for field in fields]).cast(target_schema)
+    else:
+        # from_pylist([]) infers a zero-column table that cannot be cast.
+        table = pyarrow.Table.from_pylist([], schema=target_schema)
+
+    with pyarrow.parquet.ParquetWriter(where=output, schema=target_schema) as writer:
+        writer.write_table(table)
+    return output.getvalue()
diff --git a/requirements_all_ds.txt b/requirements_all_ds.txt
index 3b1eb21a64..2c66031445 100644
--- a/requirements_all_ds.txt
+++ b/requirements_all_ds.txt
@@ -47,4 +47,4 @@ nzpy>=1.15
nzalchemy
python-arango==6.1.0
pinotdb>=0.4.5
-pyarrow==10.0.0
\ No newline at end of file
+pyarrow==10.0.0