diff --git a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx index 89ce988fd3..f3e9bb70d9 100644 --- a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx +++ b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx @@ -66,6 +66,17 @@ export default function QueryControlDropdown(props) { Download as Excel File + + + Download as Parquet File + + ); diff --git a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx index 9a021cc8bd..0807c4ef4a 100644 --- a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx +++ b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx @@ -58,6 +58,15 @@ function visualizationWidgetMenuOptions({ widget, canEditDashboard, onParameters "Download as Excel File" )} , + + {!isQueryResultEmpty ? ( + + Download as Parquet File + + ) : ( + "Download as Parquet File" + )} + , (canViewQuery || canEditParameters) && , canViewQuery && ( diff --git a/client/app/pages/queries/VisualizationEmbed.jsx b/client/app/pages/queries/VisualizationEmbed.jsx index a4bcaf3177..a446f162f6 100644 --- a/client/app/pages/queries/VisualizationEmbed.jsx +++ b/client/app/pages/queries/VisualizationEmbed.jsx @@ -100,6 +100,17 @@ function VisualizationEmbedFooter({ Download as Excel File + + + Download as Parquet File + + ); diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py index cb8b8d14f2..9b426c8065 100644 --- a/redash/handlers/query_results.py +++ b/redash/handlers/query_results.py @@ -1,40 +1,42 @@ import logging import time - import unicodedata + from flask import make_response, request from flask_login import current_user from flask_restful import abort from werkzeug.urls import url_quote + from redash import models, settings from redash.handlers.base import BaseResource, get_object_or_404, record_event +from redash.models.parameterized_query import ( + InvalidParameterError, + ParameterizedQuery, + QueryDetachedFromDataSourceError, + dropdown_values, +) from redash.permissions import ( has_access, not_view_only, require_access, - require_permission, require_any_of_permission, + require_permission, view_only, ) +from redash.serializers import ( + serialize_job, + serialize_query_result, + serialize_query_result_to_dsv, + serialize_query_result_to_parquet, + serialize_query_result_to_xlsx, +) from redash.tasks import Job from redash.tasks.queries import enqueue_query from redash.utils import ( collect_parameters_from_request, json_dumps, - utcnow, to_filename, -) -from redash.models.parameterized_query import ( - ParameterizedQuery, - InvalidParameterError, - QueryDetachedFromDataSourceError, - dropdown_values, -) -from redash.serializers import ( - serialize_query_result, - serialize_query_result_to_dsv, - serialize_query_result_to_xlsx, - serialize_job, + utcnow, ) @@ -119,9 +121,11 @@ def run_query( current_user.id, current_user.is_api_user(), metadata={ - "Username": repr(current_user) - if current_user.is_api_user() - else current_user.email, + "Username": ( + repr(current_user) + if current_user.is_api_user() + else current_user.email + ), "query_id": query_id, }, ) @@ -260,14 +264,14 @@ def options(self, query_id=None, query_result_id=None, filetype="json"): self.add_cors_headers(headers) if settings.ACCESS_CONTROL_REQUEST_METHOD: - headers[ - "Access-Control-Request-Method" - ] = settings.ACCESS_CONTROL_REQUEST_METHOD + headers["Access-Control-Request-Method"] = ( + settings.ACCESS_CONTROL_REQUEST_METHOD + ) if settings.ACCESS_CONTROL_ALLOW_HEADERS: - headers[ - "Access-Control-Allow-Headers" - ] = settings.ACCESS_CONTROL_ALLOW_HEADERS + headers["Access-Control-Allow-Headers"] = ( + settings.ACCESS_CONTROL_ALLOW_HEADERS + ) return make_response("", 200, headers) @@ -402,6 +406,7 @@ def get(self, query_id=None, query_result_id=None, filetype="json"): "xlsx": self.make_excel_response, "csv": self.make_csv_response, "tsv": self.make_tsv_response, + "parquet": self.make_parquet_response, } response = response_builders[filetype](query_result) @@ -450,6 +455,16 @@ def make_excel_response(query_result): } return make_response(serialize_query_result_to_xlsx(query_result), 200, headers) + @staticmethod + def make_parquet_response(query_result): + headers = { + # https://issues.apache.org/jira/browse/PARQUET-1889 + # "Content-Type": "application/parquet" + } + return make_response( + serialize_query_result_to_parquet(query_result), 200, headers + ) + class JobResource(BaseResource): def get(self, job_id, query_id=None): diff --git a/redash/serializers/__init__.py b/redash/serializers/__init__.py index 6105364c49..b2f1de5294 100644 --- a/redash/serializers/__init__.py +++ b/redash/serializers/__init__.py @@ -3,6 +3,7 @@ classes we have. This will ensure cleaner code and better separation of concerns. """ + from funcy import project from flask_login import current_user @@ -19,6 +20,7 @@ serialize_query_result, serialize_query_result_to_dsv, serialize_query_result_to_xlsx, + serialize_query_result_to_parquet, ) @@ -55,7 +57,14 @@ def public_widget(widget): def public_dashboard(dashboard): dashboard_dict = project( serialize_dashboard(dashboard, with_favorite_state=False), - ("name", "layout", "dashboard_filters_enabled", "updated_at", "created_at", "options"), + ( + "name", + "layout", + "dashboard_filters_enabled", + "updated_at", + "created_at", + "options", + ), ) widget_list = ( diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py index 9eab2a1a42..6e087e9d6f 100644 --- a/redash/serializers/query_result.py +++ b/redash/serializers/query_result.py @@ -1,11 +1,24 @@ -import io import csv +import io +from typing import Optional + +import pyarrow +import pyarrow.compute +import pyarrow.parquet import xlsxwriter -from funcy import rpartial, project from dateutil.parser import isoparse as parse_date -from redash.utils import json_loads, UnicodeWriter -from redash.query_runner import TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME +from funcy import project, rpartial + from redash.authentication.org_resolving import current_org +from redash.query_runner import ( + TYPE_BOOLEAN, + TYPE_DATE, + TYPE_DATETIME, + TYPE_FLOAT, + TYPE_INTEGER, + TYPE_STRING, +) +from redash.utils import UnicodeWriter, json_loads def _convert_format(fmt): @@ -86,7 +99,9 @@ def serialize_query_result_to_dsv(query_result, delimiter): fieldnames, special_columns = _get_column_lists(query_data["columns"] or []) - writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter) + writer = csv.DictWriter( + s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter + ) writer.writeheader() for row in query_data["rows"]: @@ -121,3 +136,82 @@ def serialize_query_result_to_xlsx(query_result): book.close() return output.getvalue() + + +def serialize_query_result_to_parquet(query_result): + output = io.BytesIO() + query_data = query_result.data + + def redash_datetime_to_pyarrow_timestamp( + table: "pyarrow.Table", + field: "pyarrow.Field", + conversion: Optional[dict] = None, + ) -> "pyarrow.Table": + column_index: int = table.schema.get_field_index(field.name) + column_data = table.column(column_index) + + formats = conversion["redash_formats"] + for datetime_format in formats: + try: + column_data = pyarrow.compute.strptime( + column_data, + format=datetime_format, + unit="s", + ) + break + except pyarrow.lib.ArrowInvalid: + continue + + new_table = table.set_column(column_index, field.name, column_data) + return new_table + + conversions = [ + {"pyarrow_type": pyarrow.bool_(), "redash_type": TYPE_BOOLEAN}, + { + "pyarrow_type": pyarrow.date32(), + "redash_type": TYPE_DATE, + "redash_formats": [r"%Y-%m-%d"], + "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp, + }, + { + "pyarrow_type": pyarrow.timestamp("s"), + "redash_type": TYPE_DATETIME, + "redash_formats": [r"%Y-%m-%dT%H:%M:%S", r"%Y-%m-%d %H:%M:%S"], + "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp, + }, + {"pyarrow_type": pyarrow.float64(), "redash_type": TYPE_FLOAT}, + {"pyarrow_type": pyarrow.int64(), "redash_type": TYPE_INTEGER}, + {"pyarrow_type": pyarrow.string(), "redash_type": TYPE_STRING}, + ] + + table = pyarrow.Table.from_pylist(query_data["rows"]) + fields = [] + + for column in query_data["columns"]: + for conversion in conversions: + if column["type"] == conversion["redash_type"]: + field = pyarrow.field( + name=column["name"], + type=conversion["pyarrow_type"], + metadata={"friendly_name": column["friendly_name"]}, + ) + fields.append(field) + converter = conversion.get("redash_to_pyarrow") + if converter: + table = converter( + table=table, + field=field, + conversion=conversion, + ) + break + + target_schema = pyarrow.schema(fields) + table = table.cast(target_schema=target_schema) + + with pyarrow.parquet.ParquetWriter( + where=output, + schema=target_schema, + ) as writer: + writer.write_table(table) + + return output.getvalue() diff --git a/requirements_all_ds.txt b/requirements_all_ds.txt index 3b1eb21a64..2c66031445 100644 --- a/requirements_all_ds.txt +++ b/requirements_all_ds.txt @@ -47,4 +47,4 @@ nzpy>=1.15 nzalchemy python-arango==6.1.0 pinotdb>=0.4.5 -pyarrow==10.0.0 \ No newline at end of file +pyarrow==10.0.0